Several ways to add a new column to a Spark DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import Row

spark = SparkSession.builder.getOrCreate()
- Preparing the test data
# Note: in PySpark 2.x, Row(**kwargs) sorts field names alphabetically,
# which is why show() below displays the columns as Population, area, name.
test_data = [
    Row(name='China', Population=1439323776, area=960.1),
    Row(name='India', Population=1380004385, area=328.8),
    Row(name='United States', Population=331002651, area=936.4)]
df = spark.createDataFrame(test_data)
df.show()
'''
+----------+-----+-------------+
|Population| area|         name|
+----------+-----+-------------+
|1439323776|960.1|        China|
|1380004385|328.8|        India|
| 331002651|936.4|United States|
+----------+-----+-------------+
'''
- Using Spark's built-in functions
import pyspark.sql.functions as F

df_log = df.withColumn("PopulationLog", F.log(10.0, "Population"))
df_log.show()
'''
+----------+-----+-------------+-----------------+
|Population| area|         name|    PopulationLog|
+----------+-----+-------------+-----------------+
|1439323776|960.1|        China|9.158158499505339|
|1380004385|328.8|        India|9.139880466385495|
| 331002651|936.4|United States|8.519831472053848|
+----------+-----+-------------+-----------------+
'''
import math
math.log10(1439323776)   # 9.158158499505339, which matches the PopulationLog value above
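Note that withColumn also accepts plain column expressions, so simple arithmetic on existing columns does not need a UDF at all. A minimal sketch (the density and is_large column names are illustrative, not part of the original example):

df_expr = df.withColumn("density", F.col("Population") / F.col("area")) \
            .withColumn("is_large", F.col("Population") > F.lit(1000000000))
df_expr.show()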
- Using Spark UDFs
a. Regular UDFs
from pyspark.sql.types import *

def get_level(value):
    if value > 1400000000:
        return 'high'
    elif value > 1300000000:
        return 'medium'
    else:
        return 'low'

udf_level_func = F.udf(get_level, StringType())
df_level = df.withColumn("PopulationLevel", udf_level_func("Population"))
df_level.show()
'''
+----------+-----+-------------+---------------+
|Population| area|         name|PopulationLevel|
+----------+-----+-------------+---------------+
|1439323776|960.1|        China|           high|
|1380004385|328.8|        India|         medium|
| 331002651|936.4|United States|            low|
+----------+-----+-------------+---------------+
'''
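The same logic can also be written with the decorator form of F.udf, or registered for use from Spark SQL via spark.udf.register. A brief sketch reusing the get_level function above (the names level_udf and get_level_sql are illustrative):

# decorator form of the same UDF
@F.udf(returnType=StringType())
def level_udf(value):
    return get_level(value)

df.withColumn("PopulationLevel", level_udf("Population")).show()

# register the plain Python function so it can also be called from SQL queries
spark.udf.register("get_level_sql", get_level, StringType())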
b. Pandas UDFs
out_schema = StructType([
    StructField('Population', LongType(), True),
    StructField('area', DoubleType(), True),
    StructField('name', StringType(), True),
    StructField('density', DoubleType(), True)   # density is numeric, so declare DoubleType rather than StringType
])

@F.pandas_udf(out_schema, F.PandasUDFType.GROUPED_MAP)
def population_density(pdf):
    pdf['density'] = pdf.Population / pdf.area
    return pdf

df_density = df.groupby("name").apply(population_density)
df_density.printSchema()
'''
root
 |-- Population: long (nullable = true)
 |-- area: double (nullable = true)
 |-- name: string (nullable = true)
 |-- density: double (nullable = true)
'''
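PandasUDFType.GROUPED_MAP is deprecated in Spark 3.x; the grouped-map pattern there is expressed with applyInPandas, which takes a plain Python function plus the output schema. A sketch of the same computation under that API (assuming Spark 3.0+; population_density_pd is an illustrative name):

def population_density_pd(pdf):
    pdf['density'] = pdf.Population / pdf.area
    return pdf

df_density_v3 = df.groupby("name").applyInPandas(population_density_pd, schema=out_schema)
df_density_v3.printSchema()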
- Using Spark SQL
df.createOrReplaceTempView('t_population_table')   # registerTempTable is deprecated since Spark 2.0
newDF = spark.sql('select *, 2*Population as DoublePopulation from t_population_table')
newDF.show()
'''
+----------+-----+-------------+----------------+
|Population| area|         name|DoublePopulation|
+----------+-----+-------------+----------------+
|1439323776|960.1|        China|      2878647552|
|1380004385|328.8|        India|      2760008770|
| 331002651|936.4|United States|       662005302|
+----------+-----+-------------+----------------+
'''
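If registering a temporary view is more ceremony than needed, the same SQL expression can be applied directly with selectExpr or F.expr. A minimal sketch (newDF2 and newDF3 are illustrative names):

newDF2 = df.selectExpr("*", "2*Population as DoublePopulation")
# or, equivalently, as a single added column
newDF3 = df.withColumn("DoublePopulation", F.expr("2*Population"))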
- Using Spark RDDs
from pyspark.sql import Row

def rowwise_function(row):
    # convert the Row to a dict
    row_dict = row.asDict()
    # set the value of the new column
    row_dict['NameReverse'] = row_dict['name'][::-1]
    # convert the dict back to a Row
    newrow = Row(**row_dict)
    return newrow

# convert the DataFrame to an RDD
df_rdd = df.rdd
# apply the function to every row of the RDD
df_name = df_rdd.map(lambda row: rowwise_function(row))
# convert the RDD back to a DataFrame
df_name_reverse = spark.createDataFrame(df_name)
df_name_reverse.show()
"""
+-------------+----------+-----+-------------+
|  NameReverse|Population| area|         name|
+-------------+----------+-----+-------------+
|        anihC|1439323776|960.1|        China|
|        aidnI|1380004385|328.8|        India|
|setatS detinU| 331002651|936.4|United States|
+-------------+----------+-----+-------------+
"""
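For comparison: the RDD round-trip above works for arbitrary row-wise logic, but it forces extra serialization between the JVM and Python. For this particular transformation a built-in column function is enough; a sketch using F.reverse, which reverses string columns:

df_name_builtin = df.withColumn("NameReverse", F.reverse("name"))
df_name_builtin.show()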