• SparkSQL DataFrames操作


    Hive中已经存在emp和dept表:

    select * from emp;
    +--------+---------+------------+-------+-------------+---------+---------+---------+
    | empno  |  ename  |    job     |  mgr  |  hiredate   |   sal   |  comm   | deptno  |
    +--------+---------+------------+-------+-------------+---------+---------+---------+
    | 7369   | SMITH   | CLERK      | 7902  | 1980-12-17  | 800.0   | NULL    | 20      |
    | 7499   | ALLEN   | SALESMAN   | 7698  | 1981-2-20   | 1600.0  | 300.0   | 30      |
    | 7521   | WARD    | SALESMAN   | 7698  | 1981-2-22   | 1250.0  | 500.0   | 30      |
    | 7566   | JONES   | MANAGER    | 7839  | 1981-4-2    | 2975.0  | NULL    | 20      |
    | 7654   | MARTIN  | SALESMAN   | 7698  | 1981-9-28   | 1250.0  | 1400.0  | 30      |
    | 7698   | BLAKE   | MANAGER    | 7839  | 1981-5-1    | 2850.0  | NULL    | 30      |
    | 7782   | CLARK   | MANAGER    | 7839  | 1981-6-9    | 2450.0  | NULL    | 10      |
    | 7788   | SCOTT   | ANALYST    | 7566  | 1987-4-19   | 3000.0  | NULL    | 20      |
    | 7839   | KING    | PRESIDENT  | NULL  | 1981-11-17  | 5000.0  | NULL    | 10      |
    | 7844   | TURNER  | SALESMAN   | 7698  | 1981-9-8    | 1500.0  | 0.0     | 30      |
    | 7876   | ADAMS   | CLERK      | 7788  | 1987-5-23   | 1100.0  | NULL    | 20      |
    | 7900   | JAMES   | CLERK      | 7698  | 1981-12-3   | 950.0   | NULL    | 30      |
    | 7902   | FORD    | ANALYST    | 7566  | 1981-12-3   | 3000.0  | NULL    | 20      |
    | 7934   | MILLER  | CLERK      | 7782  | 1982-1-23   | 1300.0  | NULL    | 10      |
    +--------+---------+------------+-------+-------------+---------+---------+---------+
    
    select * from dept;
    +---------+-------------+-----------+
    | deptno  |    dname    |    loc    |
    +---------+-------------+-----------+
    | 10      | ACCOUNTING  | NEW YORK  |
    | 20      | RESEARCH    | DALLAS    |
    | 30      | SALES       | CHICAGO   |
    | 40      | OPERATIONS  | BOSTON    |
    +---------+-------------+-----------+

    DataFrame常用功能测试

    val hc = new org.apache.spark.sql.hive.HiveContext(sc)
    val emp = hc.table("emp")    //根据hive表创建DataFrame
    
    emp.dtypes.foreach(println)   //查看所有字段名称和类型
        (empno,IntegerType)
        (ename,StringType)
        (job,StringType)
        (mgr,IntegerType)
        (hiredate,StringType)
        (sal,DoubleType)
        (comm,DoubleType)
        (deptno,IntegerType)
        
    emp.columns.foreach(println)  //查看所有字段名称
        empno
        ename
        job
        mgr
        hiredate
        sal
        comm
        deptno
    
    emp.printSchema    //打印schema信息
        root
            |-- empno: integer (nullable = true)
            |-- ename: string (nullable = true)
            |-- job: string (nullable = true)
            |-- mgr: integer (nullable = true)
            |-- hiredate: string (nullable = true)
            |-- sal: double (nullable = true)
            |-- comm: double (nullable = true)
            |-- deptno: integer (nullable = true)
    
    emp.explain  //查看物理执行计划
    == Physical Plan ==
    HiveTableScan [empno#0,ename#1,job#2,mgr#3,hiredate#4,sal#5,comm#6,deptno#7], (MetastoreRelation default, emp, None), None
    
    emp.show  #默认显示20行
        empno ename  job       mgr  hiredate   sal    comm   deptno
        7369  SMITH  CLERK     7902 1980-12-17 800.0  null   20    
        7499  ALLEN  SALESMAN  7698 1981-2-20  1600.0 300.0  30    
        7521  WARD   SALESMAN  7698 1981-2-22  1250.0 500.0  30    
        7566  JONES  MANAGER   7839 1981-4-2   2975.0 null   20    
        7654  MARTIN SALESMAN  7698 1981-9-28  1250.0 1400.0 30    
        7698  BLAKE  MANAGER   7839 1981-5-1   2850.0 null   30    
        7782  CLARK  MANAGER   7839 1981-6-9   2450.0 null   10    
        7788  SCOTT  ANALYST   7566 1987-4-19  3000.0 null   20    
        7839  KING   PRESIDENT null 1981-11-17 5000.0 null   10    
        7844  TURNER SALESMAN  7698 1981-9-8   1500.0 0.0    30    
        7876  ADAMS  CLERK     7788 1987-5-23  1100.0 null   20    
        7900  JAMES  CLERK     7698 1981-12-3  950.0  null   30    
        7902  FORD   ANALYST   7566 1981-12-3  3000.0 null   20    
        7934  MILLER CLERK     7782 1982-1-23  1300.0 null   10 
    
    emp.show(10) #显示指定行数
    
    emp.limit(5).show
    emp.head(3)
    emp.head   #等价于head(1)
    emp.first  #等价于head(1)
    val emp_as = emp.as("emp_as")   #别名
    emp_as.select("empno","ename","deptno").show
    
    #查看指定列:
    emp.select("empno","ename","deptno").show
    emp.select($"empno",$"ename",$"deptno").show
    emp.selectExpr("empno", "ename as name", "substr(ename,0,4)").show     #配合udf使用
    emp.select($"empno",$"sal"+100).show  #给sal加100
    
    
    #条件过滤:
    emp.filter("empno>7698").show
    emp.filter($"empno" > 7698).show
    emp.where($"empno" > 7698).show
    
    #排序:
    emp.sort("empno").show  #默认升序
    emp.sort($"empno").show
    emp.sort("empno").show
    emp.sort($"empno".desc).show
    emp.sort($"deptno", $"empno".desc).show #多字段排序
    
    emp.orderBy($"empno").show 
    emp.orderBy($"empno".desc).show 
    emp.orderBy($"deptno", $"empno".desc).show
        
    #分组:    
    emp.groupBy("deptno").count.show
    emp.groupBy($"deptno").avg().show   #所有的列求平均值
    emp.groupBy($"deptno").avg("sal").show   #sal列求平均值
    emp.groupBy($"deptno").agg("sal"->"max").show   #sal取最大
    emp.groupBy($"deptno").agg("sal"->"min").show   #sal取最小
    emp.groupBy($"deptno").agg("sal"->"sum").show   #sal求和
    emp.groupBy($"deptno").agg("sal"->"avg").show   #sal求平均值
    #agg中能有的方法有: avg/max/min/sum/count
    
    
    #join:    
    val dept = hc.table("dept")
    dept.show
    emp.join(dept,emp.col("deptno") === dept.col("deptno"),"left_outer").show
    emp.join(dept,emp.col("deptno") === dept.col("deptno"),"right_outer").show
    emp.join(dept,emp.col("deptno") === dept.col("deptno"),"inner").show
    emp.join(dept,$"emp.deptno"===$"dept.deptno" ,"inner").select("empno","ename","dname").show

    DataFrames结合SQL使用测试

    val emp_dept = emp.join(dept,emp.col("deptno") === dept.col("deptno"),"left_outer")
    emp_dept.registerTempTable("emp_dept_temp")
    hc.sql("select count(*) from emp_dept_temp").collect

    DataFrames结合hive和mysql jdbc external datasource使用测试:

    mysql中准备数据:

    DROP TABLE IF EXISTS `dept`;
    CREATE TABLE `dept` (
      `deptno` int(11) NOT NULL DEFAULT '0',
      `dname` varchar(30) DEFAULT NULL,
      `loc` varchar(30) DEFAULT NULL,
      PRIMARY KEY (`deptno`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    
    INSERT INTO `dept` VALUES ('10', 'ACCOUNTING', 'NEW YORK');
    INSERT INTO `dept` VALUES ('20', 'RESEARCH', 'DALLAS');
    INSERT INTO `dept` VALUES ('30', 'SALES', 'CHICAGO');
    INSERT INTO `dept` VALUES ('40', 'OPERATIONS', 'BOSTON');
    val hc = new org.apache.spark.sql.hive.HiveContext(sc)
    val emp = hc.table("emp")    
    val dept_jdbc = hc.jdbc("jdbc:mysql://hadoop000:3306/hive?user=root&password=root", "dept")
    emp.join(dept_jdbc, emp.col("deptno") === dept_jdbc.col("deptno"), "left_outer").show    
        
        empno ename  job       mgr  hiredate   sal    comm   deptno deptno dname      loc     
        7782  CLARK  MANAGER   7839 1981-6-9   2450.0 null   10     10     ACCOUNTING NEW YORK
        7839  KING   PRESIDENT null 1981-11-17 5000.0 null   10     10     ACCOUNTING NEW YORK
        7934  MILLER CLERK     7782 1982-1-23  1300.0 null   10     10     ACCOUNTING NEW YORK
        7369  SMITH  CLERK     7902 1980-12-17 800.0  null   20     20     RESEARCH   DALLAS  
        7566  JONES  MANAGER   7839 1981-4-2   2975.0 null   20     20     RESEARCH   DALLAS  
        7788  SCOTT  ANALYST   7566 1987-4-19  3000.0 null   20     20     RESEARCH   DALLAS  
        7876  ADAMS  CLERK     7788 1987-5-23  1100.0 null   20     20     RESEARCH   DALLAS  
        7902  FORD   ANALYST   7566 1981-12-3  3000.0 null   20     20     RESEARCH   DALLAS  
        7499  ALLEN  SALESMAN  7698 1981-2-20  1600.0 300.0  30     30     SALES      CHICAGO 
        7521  WARD   SALESMAN  7698 1981-2-22  1250.0 500.0  30     30     SALES      CHICAGO 
        7654  MARTIN SALESMAN  7698 1981-9-28  1250.0 1400.0 30     30     SALES      CHICAGO 
        7698  BLAKE  MANAGER   7839 1981-5-1   2850.0 null   30     30     SALES      CHICAGO 
        7844  TURNER SALESMAN  7698 1981-9-8   1500.0 0.0    30     30     SALES      CHICAGO 
        7900  JAMES  CLERK     7698 1981-12-3  950.0  null   30     30     SALES      CHICAGO 

    DataFrames结合parquet和mysql jdbc external datasource使用测试:

  • 相关阅读:
    八、drop和alter
    undefined reference to ****
    cgdb的认识
    ping: unknown host www.baidu.com
    ubuntu mysql汉字写入只写入了一个字符
    gdb map.insert方法运行异常:program received signal segmentation fault
    ubuntu环境下c++ 模板特化的编写
    putty fatal error software caused connection
    ubuntu共享文件夹不能被访问,其他主机ping不通该服务器
    ERROR 1045 (28000): Access denied for user 'root'@'localhost' (using password: YES) 问题
  • 原文地址:https://www.cnblogs.com/luogankun/p/4311124.html
Copyright © 2020-2023  润新知