一、每个部门工资第二高员工
MySQL8.0测试运行。
1.题目
有一张公司员工信息表 employee
,有4个字段:
employee_id varchar -- 员工ID
employee_name varchar -- 员工姓名
employee_salary int -- 员工薪酬
department varchar -- 部门ID
另外一张部门信息表 department
,有2个字段:
department_id varchar -- 部门ID
department_name varchar -- 部门名称
请查询每个部门工资第二高员工,输出员工ID、员工姓名、员工薪酬、员工部门名称4个字段。
2.建表
-- 建表
DROP TABLE IF EXISTS employee;
CREATE TABLE employee(
employee_id VARCHAR(8),
employee_name VARCHAR(8),
employee_salary INT,
department VARCHAR(8)
)
ENGINE = INNODB
DEFAULT CHARSET = utf8mb4;
-- 插入数据
INSERT INTO employee
(employee_id, employee_name, employee_salary, department)
VALUE ('a001','Bob',7000,'b1')
,('a002','Jack',9000,'b1')
,('a003','Alice',8000,'b2')
,('a004','Ben',5000,'b2')
,('a005','Candy',4000,'b2')
,('a006','Allen',5000,'b2')
,('a007','Linda',10000,'b3');
-- 建表
DROP TABLE IF EXISTS department;
CREATE TABLE department(
department_id VARCHAR(8),
department_name VARCHAR(8)
)
ENGINE = INNODB
DEFAULT CHARSET = utf8mb4;
-- 插入数据
INSERT INTO
department (department_id,department_name)
VALUE ('b1','Sales')
,('b2','IT')
,('b3','Product');
3.答案
使用窗口函数 rank() over(partition by xxx order by xxx)
进行分组排序。
窗口函数、子查询、多表连接
-- 答案
SELECT a.employee_id,
a.employee_name,
a.employee_salary,
b.department_name
FROM
(
SELECT *,
rank() over (PARTITION BY department ORDER BY employee_salary DESC) AS rn
FROM employee
) a
LEFT JOIN department b
ON a.department = b.department_id
WHERE a.rn = 2;
'''
employee_id employee_name employee_salary department_name
a001 Bob 7000 Sales
a004 Ben 5000 IT
a006 Allen 5000 IT
'''
当然,也可以先进行关联后,再分组排序。
二、网站登录时间间隔统计
1.题目
有一张网站登录情况表 login_info
,记录用户登录信息,有2个字段:
user_id varchar -- 用户ID
login_time date -- 用户登录日期 2021-1-15
计算每个用户登录日期间隔小于5天的次数,输出用户ID、次数2个字段。
2.建表
-- 建表
DROP TABLE IF EXISTS login_info;
CREATE TABLE login_info(
user_id VARCHAR(8),
login_time DATE
)
ENGINE = INNODB
DEFAULT CHARSET = utf8mb4;
-- 插入数据
INSERT INTO
login_info (user_id,login_time)
VALUE ('a001','2021-01-01')
,('b001','2021-01-01')
,('a001','2021-01-03')
,('a001','2021-01-06')
,('a001','2021-01-07')
,('b001','2021-01-07')
,('a001','2021-01-08')
,('a001','2021-01-09')
,('b001','2021-01-09')
,('b001','2021-01-10')
,('b001','2021-01-15')
,('a001','2021-01-16')
,('a001','2021-01-18')
,('a001','2021-01-19')
,('b001','2021-01-20')
,('a001','2021-01-23');
3.答案
利用偏移函数 lead()
处理时间间隔。
窗口函数、子查询、分组聚合、时间函数
-- 答案
SELECT a.user_id,
COUNT(1) AS cnt
FROM
(
SELECT user_id,
login_time,
lead(login_time) over (PARTITION BY user_id ORDER BY login_time) AS next_login_time
FROM login_info
) a
WHERE TIMESTAMPDIFF(DAY, login_time, next_login_time) < 5
GROUP BY user_id;
'''
user_id cnt
a001 8
b001 2
'''
注意:lag
、lead
的区别,一个向前移,一个向后移。
三、用户购买渠道分析
1.题目
有一张用户购买信息表 purchase_channel
,记录了用户的购物信息,有4个字段:
user_id varchar -- 用户ID
channel varchar -- 渠道
purchase_date date -- 购买日期
purchase_amount int -- 购买金额
请查询每天仅适用手机端、仅使用网页端的用户和同时使用两种渠道的不同用户人数,和总购物金额。
并且即使某天某渠道没有用户的购买信息,也需要展示。
输出:日期、购买渠道、总购买金额、不同用户人数4个字段。
2.建表
-- 建表
DROP TABLE IF EXISTS purchase_channel;
CREATE TABLE purchase_channel(
user_id VARCHAR(8),
channel VARCHAR(8),
purchase_date DATE,
purchase_amount INT
)
ENGINE = INNODB
DEFAULT CHARSET = utf8mb4;
-- 插入数据
INSERT INTO
purchase_channel (user_id,channel,purchase_date,purchase_amount)
VALUE ('a001','app','2021-03-14',200)
,('a001','web','2021-03-14',100)
,('a002','app','2021-03-14',400)
,('a001','web','2021-03-15',3000)
,('a002','app','2021-03-15',900)
,('a003','app','2021-03-15',1000);
3.答案
根据用户ID和日期进行分组,统计用户在各个渠道的购买个数来判断采用方式(web、app、both)。
分别统计单个渠道,多个渠道数据,进行 union all
合并。
union all、分组聚合、数据去重、笛卡尔积
-- 答案
SELECT purchase_date,
channel,
SUM(sum_amount) AS sum_amount,
SUM(user_cnt) AS total_users
FROM
(
SELECT purchase_date,
MIN(channel) AS channel,
COUNT(DISTINCT user_id) AS user_cnt,
SUM(purchase_amount) AS sum_amount
FROM purchase_channel
GROUP BY purchase_date, user_id
HAVING COUNT(DISTINCT channel) = 1
UNION ALL
SELECT purchase_date,
'both' AS channel,
COUNT(DISTINCT user_id) AS user_cnt,
SUM(purchase_amount) AS sum_amount
FROM purchase_channel
GROUP BY purchase_date, user_id
HAVING COUNT(DISTINCT channel) > 1
) aa
GROUP BY purchase_date, channel;
此种结果只是将存在的日期、渠道列出来,未包括所有的,还待优化。
所有日期与渠道的笛卡尔积,再进行 left join
关联操作即可。
-- 最终答案
SELECT t1.purchase_date,
t1.channel,
t2.sum_amount,
t2.total_users
FROM
(
SELECT DISTINCT a.purchase_date,
b.channel
FROM purchase_channel a,
(
SELECT 'app' AS channel
UNION ALL
SELECT 'web' AS channel
UNION ALL
SELECT 'both' AS channel
) b
) t1
LEFT JOIN
(
SELECT purchase_date,
channel,
SUM(sum_amount) AS sum_amount,
SUM(user_cnt) AS total_users
FROM
(
SELECT purchase_date,
MIN(channel) AS channel,
COUNT(DISTINCT user_id) AS user_cnt,
SUM(purchase_amount) AS sum_amount
FROM purchase_channel
GROUP BY purchase_date, user_id
HAVING COUNT(DISTINCT channel) = 1
UNION ALL
SELECT purchase_date,
'both' AS channel,
COUNT(DISTINCT user_id) AS user_cnt,
SUM(purchase_amount) AS sum_amount
FROM purchase_channel
GROUP BY purchase_date, user_id
HAVING COUNT(DISTINCT channel) > 1
) aa
GROUP BY purchase_date, channel
) t2
ON t1.purchase_date = t2.purchase_date
AND t1.channel = t2.channel
ORDER BY purchase_date, channel;
/*
purchase_date channel sum_amount total_users
2021-03-14 app 400 1
2021-03-14 both 300 1
2021-03-14 web \N \N
2021-03-15 app 1900 2
2021-03-15 both \N \N
2021-03-15 web 3000 1
*/
参考链接:数据分析笔试题06