• Spark DataFrame 的分分合合:用 join 合并,用 select 分割(选择)列


    # Employee rows: (emp_id, name, superior_emp_id, year_joined, emp_dept_id, gender, salary).
    # NOTE(review): emp_dept_id 50 has no match in dept, and salary -1 marks missing pay — kept as-is.
    emp = [
        (1, "Smith", -1, "2018", "10", "M", 3000),
        (2, "Rose", 1, "2010", "20", "M", 4000),
        (3, "Williams", 1, "2010", "10", "M", 1000),
        (4, "Jones", 2, "2005", "10", "F", 2000),
        (5, "Brown", 2, "2010", "40", "", -1),
        (6, "Brown", 2, "2010", "50", "", -1),
    ]
    empColumns = [
        "emp_id", "name", "superior_emp_id", "year_joined",
        "emp_dept_id", "gender", "salary",
    ]

    # Build and inspect the employee DataFrame (schema inferred from the tuples).
    empDF = spark.createDataFrame(data=emp, schema=empColumns)
    empDF.printSchema()
    empDF.show(truncate=False)

    # Department rows: (dept_name, dept_id).
    dept = [
        ("Finance", 10),
        ("Marketing", 20),
        ("Sales", 30),
        ("IT", 40),
    ]
    deptColumns = ["dept_name", "dept_id"]

    # Build and inspect the department DataFrame.
    deptDF = spark.createDataFrame(data=dept, schema=deptColumns)
    deptDF.printSchema()
    deptDF.show(truncate=False)
    

    运行结果:

    Emp Dataset
    +------+--------+---------------+-----------+-----------+------+------+
    |emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
    +------+--------+---------------+-----------+-----------+------+------+
    |1     |Smith   |-1             |2018       |10         |M     |3000  |
    |2     |Rose    |1              |2010       |20         |M     |4000  |
    |3     |Williams|1              |2010       |10         |M     |1000  |
    |4     |Jones   |2              |2005       |10         |F     |2000  |
    |5     |Brown   |2              |2010       |40         |      |-1    |
    |6     |Brown   |2              |2010       |50         |      |-1    |
    +------+--------+---------------+-----------+-----------+------+------+
    
    Dept Dataset
    +---------+-------+
    |dept_name|dept_id|
    +---------+-------+
    |Finance  |10     |
    |Marketing|20     |
    |Sales    |30     |
    |IT       |40     |
    +---------+-------+


    我自己运行了示例,可以看到 select 是可以直接分割列的(准确地说是选择列):

    >>> emp = [(1,"Smith",-1,"2018","10","M",3000), \
    ...     (2,"Rose",1,"2010","20","M",4000), \
    ...     (3,"Williams",1,"2010","10","M",1000), \
    ...     (4,"Jones",2,"2005","10","F",2000), \
    ...     (5,"Brown",2,"2010","40","",-1), \
    ...       (6,"Brown",2,"2010","50","",-1) \
    ...   ]
    >>> empColumns = ["emp_id","name","superior_emp_id","year_joined", \
    ...        "emp_dept_id","gender","salary"]
    >>>
    >>> empDF = spark.createDataFrame(data=emp, schema = empColumns)
    >>> empDF.printSchema()
    root
     |-- emp_id: long (nullable = true)
     |-- name: string (nullable = true)
     |-- superior_emp_id: long (nullable = true)
     |-- year_joined: string (nullable = true)
     |-- emp_dept_id: string (nullable = true)
     |-- gender: string (nullable = true)
     |-- salary: long (nullable = true)
    
    >>> empDF.show(truncate=False)
    +------+--------+---------------+-----------+-----------+------+------+
    |emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
    +------+--------+---------------+-----------+-----------+------+------+
    |1     |Smith   |-1             |2018       |10         |M     |3000  |
    |2     |Rose    |1              |2010       |20         |M     |4000  |
    |3     |Williams|1              |2010       |10         |M     |1000  |
    |4     |Jones   |2              |2005       |10         |F     |2000  |
    |5     |Brown   |2              |2010       |40         |      |-1    |
    |6     |Brown   |2              |2010       |50         |      |-1    |
    +------+--------+---------------+-----------+-----------+------+------+
    
    >>>
    >>> dept = [("Finance",10), \
    ...     ("Marketing",20), \
    ...     ("Sales",30), \
    ...     ("IT",40) \
    ...   ]
    >>> deptColumns = ["dept_name","dept_id"]
    >>> deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
    >>> deptDF.printSchema()
    root
     |-- dept_name: string (nullable = true)
     |-- dept_id: long (nullable = true)
    
    >>> deptDF.show(truncate=False)
    +---------+-------+
    |dept_name|dept_id|
    +---------+-------+
    |Finance  |10     |
    |Marketing|20     |
    |Sales    |30     |
    |IT       |40     |
    +---------+-------+
    
    >>> empDF.join(deptDF,empDF.emp_dept_id ==  deptDF.dept_id,"inner") \
    ...      .show(truncate=False)
    +------+--------+---------------+-----------+-----------+------+------+---------+-------+
    |emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
    +------+--------+---------------+-----------+-----------+------+------+---------+-------+
    |1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
    |3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
    |4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
    |2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
    |5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
    +------+--------+---------------+-----------+-----------+------+------+---------+-------+
    
    >>> alldf = empDF.join(deptDF,empDF.emp_dept_id ==  deptDF.dept_id,"inner")
    >>> alldf.show()
    +------+--------+---------------+-----------+-----------+------+------+---------+-------+
    |emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
    +------+--------+---------------+-----------+-----------+------+------+---------+-------+
    |     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
    |     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
    |     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
    |     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
    |     5|   Brown|              2|       2010|         40|      |    -1|       IT|     40|
    +------+--------+---------------+-----------+-----------+------+------+---------+-------+
    
    >>> alldf.select("emp_id", "name")
    DataFrame[emp_id: bigint, name: string]
    >>> alldf.select("emp_id", "name").show()
    +------+--------+
    |emp_id|    name|
    +------+--------+
    |     1|   Smith|
    |     3|Williams|
    |     4|   Jones|
    |     2|    Rose|
    |     5|   Brown|
    +------+--------+
    
    >>> alldf.show(truncate=False)
    +------+--------+---------------+-----------+-----------+------+------+---------+-------+
    |emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
    +------+--------+---------------+-----------+-----------+------+------+---------+-------+
    |1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
    |3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
    |4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
    |2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
    |5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
    +------+--------+---------------+-----------+-----------+------+------+---------+-------+
    

      

  • 相关阅读:
    RxJava+okhttp3
    RetrofitOkHttp网络请求
    布局111
    网络请求展示数据
    一级列表展示购物车
    终极MVP二级购物车
    将博客搬至CSDN
    nyoj-开灯问题
    nyoj-数乌龟
    nyoj 正三角形的外接圆面积
  • 原文地址:https://www.cnblogs.com/bonelee/p/16578923.html
Copyright © 2020-2023  润新知