• PySpark DataFrame: Adding New Columns


    Several ways to add a new column to a Spark DataFrame.

    from pyspark.sql import SparkSession
    from pyspark.sql import Row
    spark = SparkSession.builder.getOrCreate()
    • Prepare test data
    test_data = [
                Row(name='China', Population=1439323776, area=960.1),
                Row(name='India', Population=1380004385, area=328.8),
                Row(name='United States', Population=331002651, area=936.4)]
    df = spark.createDataFrame(test_data)
    df.show()
    '''
    +----------+-----+-------------+
    |Population| area|         name|
    +----------+-----+-------------+
    |1439323776|960.1|        China|
    |1380004385|328.8|        India|
    | 331002651|936.4|United States|
    +----------+-----+-------------+
    '''
    1. Using Spark built-in functions
      import pyspark.sql.functions as F
      # F.log(base, col): base-10 logarithm of the Population column
      df_log = df.withColumn("PopulationLog", F.log(10.0, "Population"))
      df_log.show()
      '''
      +----------+-----+-------------+-----------------+
      |Population| area|         name|    PopulationLog|
      +----------+-----+-------------+-----------------+
      |1439323776|960.1|        China|9.158158499505339|
      |1380004385|328.8|        India|9.139880466385495|
      | 331002651|936.4|United States|8.519831472053848|
      +----------+-----+-------------+-----------------+
      '''

      # cross-check the Spark result with Python's math module
      import math
      math.log10(1439323776)  # 9.158158499505339
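
      Built-in functions execute inside the JVM and avoid Python serialization overhead, so prefer them whenever one covers your case. A minimal sketch of two other common withColumn patterns (the column names "planet" and "PopulationMillions" are illustrative, not from the original post):

      # constant column via F.lit, plus plain column arithmetic via F.col
      df_extra = df.withColumn("planet", F.lit("Earth")) \
                   .withColumn("PopulationMillions", F.col("Population") / 1000000)
      df_extra.show()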

    2. Using Spark UDFs
      a. UDFs
      from pyspark.sql.types import StringType, StructType, StructField, LongType, DoubleType
      def get_level(value):
          if value > 1400000000:
              return 'high'
          elif value > 1300000000:
              return 'medium'
          else:
              return 'low'
      # wrap the plain Python function as a Spark UDF returning a string
      udf_level_func = F.udf(get_level, StringType())
      
      df_level = df.withColumn("PopulationLevel", udf_level_func("Population"))
      df_level.show()
      '''
      +----------+-----+-------------+---------------+
      |Population| area|         name|PopulationLevel|
      +----------+-----+-------------+---------------+
      |1439323776|960.1|        China|           high|
      |1380004385|328.8|        India|         medium|
      | 331002651|936.4|United States|            low|
      +----------+-----+-------------+---------------+
      '''
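
      Note that a Python UDF ships every row between the JVM and a Python worker, so it is noticeably slower than a built-in function. Since Spark 2.x, F.udf also works as a decorator; a minimal equivalent sketch (the name get_level_udf is illustrative):

      @F.udf(returnType=StringType())
      def get_level_udf(value):
          # same thresholds as get_level above
          if value > 1400000000:
              return 'high'
          elif value > 1300000000:
              return 'medium'
          return 'low'

      df.withColumn("PopulationLevel", get_level_udf("Population")).show()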

      b. Pandas UDFs

      out_schema = StructType([
                          StructField('Population', LongType(), True),
                          StructField('area', DoubleType(), True),
                          StructField('name', StringType(), True),
                          # density holds Population/area, a float, so it must be DoubleType
                          StructField('density', DoubleType(), True)])

      @F.pandas_udf(out_schema, F.PandasUDFType.GROUPED_MAP)
      def population_density(pdf):
          # pdf is the pandas DataFrame for one group of rows
          pdf['density'] = pdf.Population / pdf.area
          return pdf
      
      df_density = df.groupby("name").apply(population_density)
      df_density.printSchema()
      '''
      root
       |-- Population: long (nullable = true)
       |-- area: double (nullable = true)
       |-- name: string (nullable = true)
       |-- density: double (nullable = true)
      '''
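
      In Spark 3.x, PandasUDFType.GROUPED_MAP is deprecated in favor of DataFrame.groupby(...).applyInPandas(...), which takes a plain Python function plus the output schema. A sketch of the equivalent call, assuming Spark 3.0+:

      def population_density_fn(pdf):
          # plain function: no pandas_udf decorator needed with applyInPandas
          pdf['density'] = pdf.Population / pdf.area
          return pdf

      df_density_v3 = df.groupby("name").applyInPandas(population_density_fn, schema=out_schema)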
    3. Using Spark SQL
      # registerTempTable is deprecated; createOrReplaceTempView is the current API
      df.createOrReplaceTempView('t_population_table')
      newDF = spark.sql('select *, 2*Population as DoublePopulation from t_population_table')
      newDF.show()
      '''
      +----------+-----+-------------+----------------+
      |Population| area|         name|DoublePopulation|
      +----------+-----+-------------+----------------+
      |1439323776|960.1|        China|      2878647552|
      |1380004385|328.8|        India|      2760008770|
      | 331002651|936.4|United States|       662005302|
      +----------+-----+-------------+----------------+
      '''
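
      The same expression can also be applied without registering a view by using selectExpr, which accepts SQL fragments directly; a minimal sketch:

      df.selectExpr("*", "2*Population as DoublePopulation").show()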
    4. Using Spark RDDs
      from pyspark.sql import Row
      def rowwise_function(row):
          # convert row to dict:
          row_dict = row.asDict()
          # set the value of the new column
          row_dict['NameReverse'] = row_dict['name'][::-1]
          # convert dict to row:
          newrow = Row(**row_dict)
          return newrow
      
      # dataframe convert to RDD
      df_rdd = df.rdd
      # apply function to RDD
      df_name = df_rdd.map(rowwise_function)
      # Convert RDD Back to DataFrame
      df_name_reverse = spark.createDataFrame(df_name)
      df_name_reverse.show()
      """
      +-------------+----------+-------------+
      |  NameReverse|Population|         name|
      +-------------+----------+-------------+
      |        anihC|1439323776|        China|
      |        aidnI|1380004385|        India|
      |setatS detinU| 331002651|United States|
      +-------------+----------+-------------+
      """
  • Original post: https://www.cnblogs.com/similarface/p/12974756.html