• hive-分析函数


    bitmap
    --------------
        位图/位映射。
        5
        
    hive
    --------------
        分区表
        udtf函数
        wordcount
        lateral view            //和udtf配合使用。
        order by                //数据倾斜
        sort by                    //reduce内排序
        distribute by            //分区
        cluster by                //sort by + distribute by
    
    hive
    --------------
        --显式表头
        hive>set hive.cli.print.header=true ;
    
    hive分析函数
    --------------
        1.准备数据
            create table emp
            (
                id int ,
                empno string ,
                age int ,
                salary float ,
                deptno int 
            )
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            STORED AS TEXTFILE;
    
        2.加载数据
            1,tom1,28,3000,1
            2,tom2,29,5000,2
            3,tom3,30,5400,1
            4,tom4,28,6600,3
            5,tom5,27,8000,2
            6,tom6,35,10000,3
    
        3.加载数据
            load data local inpath '/home/centos/emp.txt' into table emp ;
    
    hive分区函数应用
    -----------------
        0.简介
            hive分析函数可以连接每条记录形成数据集,字段甚至可以不是分组字段,使用一次mr
            完成聚合查询。
            常规的sql下的分组聚合很诸多限制,其中select字段必须是分组字段,有时需要多次mr.
            select deptno , max(salary) from emp group by deptno ;
        1.分析函数
            -- over , 只是分区
            SELECT id, empno, salary ,deptno ,max(salary) OVER (PARTITION BY deptno) AS max from emp  ;
    
    
            -- OVER + ORDER BY , 分区并在分区内排序
            SELECT empno, deptno, salary ,SUM(salary) OVER(PARTITION BY deptno ORDER BY salary) AS t1 from emp;
    
            -- OVER ... rows unbounded preceding 基于前导所有行的汇总操作。
            SELECT empno, deptno, salary , SUM(salary) OVER(ORDER BY deptno, empno rows unbounded preceding) AS t3 from emp ;
    
            -- RANK, 排名操作 ,计算每个部门内按照工资的降序进行排名(有缝,并列情况存在缝隙)绝对排名。
            SELECT empno, deptno, salary,  RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;
            
            -- dense_rank()密度排名,无缝。绝对排名。
            SELECT empno, deptno, salary,  Dense_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;
    
            -- percent_rank()百分比排名,相对排名.
            SELECT empno, deptno, salary,  percent_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;
    
            --NTILE(n) ,分桶操纵,将数据均匀分散到各个桶中。
            SELECT empno, deptno, salary , NTILE(4) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
    
            -- lead()前导操作,查询从当前开始,后续第几行的操作。
            SELECT empno, deptno, salary, LEAD(salary, 2) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
    
            --lag,从当前行计数,访问之前的第几行salary,如果超过窗口范围返回null。
            SELECT empno, deptno, salary, lag(salary, 1) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
    
            --first_value()
            SELECT empno, deptno, salary, first_value(salary) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
    
            --last_value()
            SELECT empno, deptno, salary, last_value(salary) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
            
            --使用range开窗函数 RANGE BETWEEN ... AND ...,在分区内在划分记录范围。
            SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS t1 from emp ;
    
            SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS t1 from emp ;
            
            --RANGE : 对值的+/-.
            SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN UNBOUNDED PRECEDING AND current row) AS t1 from emp ;
            SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN UNBOUNDED PRECEDING AND current row) AS t1 from emp ;
    
            --range : 计算的是值
            SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN 2000 PRECEDING AND 2000 FOLLOWING) AS t1 from emp ;
            //rows计算的行
            SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN 2 PRECEDING AND 2 FOLLOWING) AS t1 from emp ;
    
            --窗口重用
            SELECT empno, deptno, salary , MAX(salary) OVER w1 AS mx,MIN(salary) OVER w1 AS mn,AVG(salary) OVER w1 AS ag from emp WINDOW w1 AS (PARTITION BY deptno ORDER BY salary desc ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) ;
    
            [开窗函数]
            range|rows between ... and ;
            range是值偏移,rows是行偏移。
    
        2.统计员工数量
            select max(salary) from emp group by deptno union select max(salary) from emp group by age union select max(salary) from emp group by deptno,age ;
    
    
    hive高级聚合
    ---------------
        1.grouping set
            作用等同于union.
            select deptno , age , count(1) from emp group by deptno,age grouping sets(deptno ,age ,(deptno,age) ) ;
        3.rollup
            select ... from ... GROUP BY a,b,c WITH ROLLUP ;
            select ... from ... GROUP BY a,b,c GROUPING SETS ((a,b,c),(a,b),(a),())
    
            select deptno , age , count(1) from emp group by deptno,age with rollup ;
        4.cube
            select ... from ... GROUP BY a,b,c WITH cube ;
            select ... from ... GROUP BY a,b,c GROUPING SETS ((a),(a,b),(a,c),(a,b,c) ,(b),(b,c),(c),()))
    
            select deptno , age , count(1) from emp group by deptno,age with cube ;
    
    hive优化
    -------------------
        1.创建索引
            本质上就是表,对于检索单条记录是有优势的。
            排序的。
            --创建索引
            CREATE INDEX idx_emp ON TABLE emp (empno) AS 'COMPACT' WITH DEFERRED REBUILD;
    
            --生成索引
            ALTER INDEX idx_emp ON emp REBUILD;
    
            --查询是通过索引列查询
            select * from emp where empno = 'tom1'
    
        2.文件格式
            [列存储格式]
            parquet , orcfile , orc    //
            投影查询时,发挥磁盘的线性读写。
            select id,name from emp ;
            //创建表,存储成parquet格式
            create table pq1(id int , empno string, age int ,salary float ,  deptno int ) stored as parquet ;
            
            //查看文件格式
            insert into pq1 select * from emp ;
            [行存储]
            txt 
    
        3.压缩
            减少数据传输量,降低网络IO的负载。
            --在多级job中,job之间的结果是否需要压缩。
            SET hive.exec.compress.intermediate=true ;
            SET hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec ;
            SET hive.intermediate.compression.codec=org.apache.hadoop.io.compress.GzipCodec ;
            SET hive.intermediate.compression.type=record|block|none ;
    
            --控制job的最终输出是否压缩.
            SET hive.exec.compress.output=true;
            SET mapred.output.compression.codec= org.apache.hadoop.io.compress.GzipCodec;
        
        4.大量小文件
            导致map过多。
            1.归档
                hadoop Archive and HAR进行归档。
    
            2.转换成序列文件
                将小文件合成SeqenceFile.
    
            3.org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat
                alter table emp set inputformat=org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat
                --修改默认输入格式
                set hive.input.format=org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat ;
    
                --建新表
    create table emp2
    (
    id int ,
    empno string ,
    age int ,
    salary float ,
    deptno int 
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    STORED AS
    INPUTFORMAT 'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
  • 相关阅读:
    移动端文本编辑器
    jquery移动端日期插件
    Spring 4集成 Quartz2(转)
    json 特殊字符 javascript 特殊字符处理(转载)
    解决使用JavaScriptConvert转换对象为Json时,中文和&符号被转码的问题
    RFID的winform程序心得2
    异步编程模型
    DataGridView获取或者设置当前单元格的内容
    DataGridView修改数据并传到数据库
    把存储过程结果集SELECT INTO到临时表
  • 原文地址:https://www.cnblogs.com/zyde/p/9225353.html
Copyright © 2020-2023  润新知