1、使用regexp_split_to_table(text_industries, '#;#')将字符串切分为行
======================================
使用到的知识点:
1、使用with(公用表表达式,CTE)临时存储sql语句,格式【with xxx as (...), xxx2 as (...)】以减少重复代码;
2、使用round()取小数点后几位;
3、使用to_char()将时间格式的数据转换为text型;
4、使用split_part(xx,xx2,xx3)函数对文本型数据进行切分;
5、使用group by之后利用count()进行统计;
6、join 以及 left join之间的区别;
7、使用join连接多个表,基本格式:【a join b on a.id = b.id join c on a.id = c.id】;
8、嵌套查询(select * from (select * from ));
9、case xx when a then b else c end xx2:判断xx,如果满足a,赋值为b,否则赋值为c,最后取别名xx2;
10、使用current_date获取年月日:2021-01-28,使用now()获取当前时间戳,使用select to_char(now(),'YYYY')获取年;
11、使用【||】进行字符串的拼接;
12、使用to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' )将CURRENT_DATE 拼接时间后转时间戳;
13、使用【时间戳 + '-1 day'】进行时间戳的天数减一;
14、使用:【字段::类型】可以将字段转换为指定类型,或者使用【cast(字段 as 类型)】;
15、使用【insert into 表名(字段名1,字段名2) select 字段名1,字段名2 from 表名2 】将查询出来的值批量添加到另一个表中(select出的列需与insert的字段列表一一对应);
-- Appends the daily abstract to daily.eoias_daily_abstract: one row per
-- (user_id, case_id, company_name) that has news released in the 24 hours
-- before today 07:00, carrying the news count, the two most frequent source
-- websites and the most frequent sentiment, each with its share in percent.
WITH news AS (
    -- One row per analysed news item inside the reporting window. This is the
    -- single source for every count below, so the numerator and denominator of
    -- each ratio are always computed over the same rows (including the
    -- user_id match between the result row and its crawler parameters).
    SELECT
        r.user_id,
        r.case_id,
        -- company name = first '#;#'-separated entry of the monitor words
        split_part(p.monitor_words_company, '#;#', 1) AS company_name,
        -- NOTE(review): website_name is left unqualified on purpose; only one
        -- of the two joined tables can own it and which one is not visible
        -- here. Qualify it once confirmed.
        website_name,
        r.text_sentiment
    FROM service.eoias_sentiment_analysis_result AS r
    INNER JOIN service.eoias_crawler_key_param AS p
        ON r.case_id = CAST(p.id AS text)
        AND r.user_id = p.user_id
    WHERE r.release_time >= to_timestamp(CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss') - INTERVAL '1 day'
      -- Exclusive upper bound, so consecutive daily windows do not overlap and
      -- an item released at exactly 07:00:00 belongs to one window only.
      AND r.release_time < to_timestamp(CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss')
),
company_totals AS (
    -- Denominator of every ratio: number of news items per user and company.
    SELECT
        user_id,
        company_name,
        count(company_name) AS news_num
    FROM news
    GROUP BY user_id, company_name
),
source_counts AS (
    -- News items per user, company and source website.
    SELECT
        user_id,
        company_name,
        website_name,
        count(website_name) AS source_top
    FROM news
    GROUP BY user_id, company_name, website_name
),
source_ranked AS (
    -- Websites ranked by item count within each user/company.
    SELECT
        sc.user_id,
        sc.company_name,
        sc.website_name,
        sc.source_top,
        -- 100.0 forces numeric arithmetic; with the integer literal 100 the
        -- bigint division truncates the percentage before round() sees it.
        round(100.0 * sc.source_top / ct.news_num, 2) AS ratio,
        row_number() OVER (
            PARTITION BY sc.user_id, sc.company_name
            -- website_name breaks ties so the ranking is repeatable
            ORDER BY sc.source_top DESC, sc.website_name
        ) AS row_num
    FROM source_counts AS sc
    INNER JOIN company_totals AS ct
        ON sc.user_id = ct.user_id
        AND sc.company_name = ct.company_name
),
sentiment_counts AS (
    -- News items per user, company and sentiment label.
    SELECT
        user_id,
        company_name,
        text_sentiment,
        count(1) AS sentiment_top
    FROM news
    GROUP BY user_id, company_name, text_sentiment
),
sentiment_ranked AS (
    -- Sentiments ranked by item count within each user/company.
    SELECT
        sn.user_id,
        sn.company_name,
        sn.text_sentiment AS sentiment_top1,
        sn.sentiment_top AS sentiment_top1_num,
        round(100.0 * sn.sentiment_top / ct.news_num, 2) AS sentiment_top1_ratio,
        row_number() OVER (
            PARTITION BY sn.user_id, sn.company_name
            -- Without an ORDER BY the row numbered 1 is arbitrary; order by
            -- frequency so that row 1 really is the dominant sentiment.
            ORDER BY sn.sentiment_top DESC, sn.text_sentiment
        ) AS rown
    FROM sentiment_counts AS sn
    INNER JOIN company_totals AS ct
        ON sn.user_id = ct.user_id
        AND sn.company_name = ct.company_name
),
report_cases AS (
    -- news holds one row per item; the abstract needs each case only once,
    -- otherwise every news item would insert its own identical abstract row.
    SELECT DISTINCT
        user_id,
        case_id,
        company_name
    FROM news
)
INSERT INTO daily.eoias_daily_abstract (
    user_id,
    case_id,
    daily_date,
    company_name,
    news_num,
    source_top1,
    source_top1_num,
    source_top1_ratio,
    source_top2,
    source_top2_num,
    source_top2_ratio,
    sentiment_top1,
    sentiment_top1_num,
    sentiment_top1_ratio
)
SELECT
    rc.user_id,
    rc.case_id,
    to_char(now()::timestamp, 'YYYYmmdd') AS daily_date,
    rc.company_name,
    ct.news_num,
    top1.website_name AS source_top1,
    top1.source_top AS source_top1_num,
    top1.ratio AS source_top1_ratio,
    -- A company with a single source has no second-ranked row: store ''/0.
    COALESCE(top2.website_name, '') AS source_top2,
    COALESCE(top2.source_top, 0) AS source_top2_num,
    COALESCE(top2.ratio, 0) AS source_top2_ratio,
    st.sentiment_top1,
    st.sentiment_top1_num,
    st.sentiment_top1_ratio
FROM report_cases AS rc
INNER JOIN source_ranked AS top1
    ON rc.user_id = top1.user_id
    AND rc.company_name = top1.company_name
    AND top1.row_num = 1
LEFT JOIN source_ranked AS top2
    ON top1.user_id = top2.user_id
    AND top1.company_name = top2.company_name
    AND top2.row_num = 2
INNER JOIN company_totals AS ct
    ON rc.user_id = ct.user_id
    AND rc.company_name = ct.company_name
INNER JOIN sentiment_ranked AS st
    ON rc.user_id = st.user_id
    AND rc.company_name = st.company_name
    AND st.rown = 1;
补充:
判断公司名称中是否包含相关字段来进行统一命名:
-- Maps company-name variants to one canonical name: the first keyword found
-- inside company_name decides the result; a name containing none of the
-- keywords (or a NULL name) is returned unchanged.
SELECT
    CASE
        WHEN position('腾讯' IN company_name) > 0 THEN '腾讯'
        WHEN position('阿里' IN company_name) > 0 THEN '阿里巴巴'
        WHEN position('中新赛克' IN company_name) > 0 THEN '中新赛克'
        ELSE company_name
    END AS company_name
FROM daily.eoias_daily_website
从时间戳中提取月、日、时等
-- Expression fragment (use inside a SELECT list): the current month, day and
-- hour numbers, each followed by its unit character, concatenated into text.
EXTRACT(MONTH FROM now()) || '月'
    || EXTRACT(DAY FROM now()) || '日'
    || EXTRACT(HOUR FROM now()) || '时'
提取一段时间内的每小时:
-- One row per hour from 2021-02-24 07:00 to 2021-02-25 07:00; generate_series
-- includes both end points.
SELECT hourly.ts AS "hour"
FROM generate_series(
    TIMESTAMP '2021-02-24 07:00:00',
    TIMESTAMP '2021-02-25 07:00:00',
    INTERVAL '1 hour'
) AS hourly (ts)