• PySpark data processing study notes


    PySpark Study Notes

    Defining a schema

    # Import the pyspark.sql.types library
    from pyspark.sql.types import *
    
    # Define a new schema using the StructType method
    people_schema = StructType([
      # Define a StructField for each field
      StructField('name', StringType(), False),
      StructField('age', IntegerType(), False),
      StructField('city', StringType(), False)
    ])
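
    A minimal usage sketch (the file name people.csv is hypothetical): pass the schema when reading a CSV so Spark skips schema inference and enforces the declared types.

    # Read a CSV using the predefined schema instead of inferring it
    people_df = spark.read.csv('people.csv', header=True, schema=people_schema)
    people_df.printSchema()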
    

    Displaying the data

    # Import the functions module used below
    import pyspark.sql.functions as F

    # Load the CSV file
    aa_dfw_df = spark.read.format('csv').options(header=True).load('AA_DFW_2018.csv.gz')
    
    # Add the airport column using the F.lower() method
    aa_dfw_df = aa_dfw_df.withColumn('airport', F.lower(aa_dfw_df['Destination Airport'])) # add a column named airport holding the lowercased value
    
    # Drop the Destination Airport column
    aa_dfw_df = aa_dfw_df.drop(aa_dfw_df['Destination Airport']) # remove the original Destination Airport column
    
    # Show the DataFrame
    aa_dfw_df.show() # print the first rows, similar to head()
    

    Reading and writing Parquet files

    To save storage space

    When PySpark writes files to HDFS, they are usually stored as Parquet: Parquet reads and writes faster than JSON or CSV and uses roughly 75% less storage.
    Other Hadoop ecosystem topics are covered in separate notes.

    # View the row count of df1 and df2
    print("df1 Count: %d" % df1.count())
    print("df2 Count: %d" % df2.count())
    
    # Combine the DataFrames into one 
    df3 = df1.union(df2) # like rbind in R: append the rows of df2 to df1
    
    # Save the df3 DataFrame in Parquet format
    df3.write.parquet('AA_DFW_ALL.parquet', mode='overwrite')
    
    # Read the Parquet file into a new DataFrame and run a count
    print(spark.read.parquet('AA_DFW_ALL.parquet').count())
    
    # Read the Parquet file into flights_df
    flights_df = spark.read.parquet('AA_DFW_ALL.parquet')
    
    # Register the temp table
    flights_df.createOrReplaceTempView('flights') # register a temporary view, replacing any existing view with the same name
    
    # Run a SQL query of the average flight duration
    avg_duration = spark.sql('SELECT avg(flight_duration) from flights').collect()[0]
    print('The average flight time is: %d' % avg_duration)
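
    The Parquet writer also accepts compression and partitioning options, and reading only the columns you need benefits from the columnar layout. A sketch, assuming df3 carries the lowercased airport column created earlier (snappy is typically the default codec anyway):

    # Write compressed Parquet files partitioned by airport
    df3.write.parquet('AA_DFW_ALL.parquet', mode='overwrite',
                      partitionBy='airport', compression='snappy')

    # Column pruning: only the airport column is read from disk
    spark.read.parquet('AA_DFW_ALL.parquet').select('airport').distinct().show(5)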
    

    DataFrame column operations

    Operations on DataFrame columns
    Filtering

    # Show the distinct VOTER_NAME entries
    voter_df.select(voter_df['VOTER_NAME']).distinct().show(40, truncate=False) # show the distinct (deduplicated) values
    
    # Filter voter_df where the VOTER_NAME is 1-20 characters in length
    voter_df = voter_df.filter('length(VOTER_NAME) > 0 and length(VOTER_NAME) < 20') # SQL-style filter expression, like subsetting in R
    
    # Filter out voter_df where the VOTER_NAME contains an underscore
    voter_df = voter_df.filter(~ F.col('VOTER_NAME').contains('_'))
    
    # Show the distinct VOTER_NAME entries again
    voter_df.select('VOTER_NAME').distinct().show(40, truncate=False)
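
    The same filters can be written with column expressions instead of SQL strings; a sketch assuming pyspark.sql.functions is imported as F:

    # Keep names of 1-19 characters, then drop names containing an underscore
    voter_df = voter_df.filter((F.length('VOTER_NAME') > 0) & (F.length('VOTER_NAME') < 20))
    voter_df = voter_df.filter(~ F.col('VOTER_NAME').contains('_'))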
    

    withColumn

    # Add a new column called splits separated on whitespace
    voter_df = voter_df.withColumn('splits', F.split(voter_df.VOTER_NAME, r'\s+')) # add a column with the name split on whitespace
    
    # Create a new column called first_name based on the first item in splits
    voter_df = voter_df.withColumn('first_name', voter_df.splits.getItem(0)) # getItem(0) takes the first element of the splits array
    
    # Get the last entry of the splits list and create a column called last_name
    voter_df = voter_df.withColumn('last_name', voter_df.splits.getItem(F.size('splits') - 1))
    
    # Drop the splits column
    voter_df = voter_df.drop('splits') # drop the intermediate column
    
    # Show the voter_df DataFrame
    voter_df.show() # print the first rows, like head()
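
    On Spark 2.4+, the last element can also be taken with element_at(), which accepts a negative index for arrays; a sketch of the equivalent call (run it before splits is dropped):

    # -1 counts from the end of the array, so this is the last name
    voter_df = voter_df.withColumn('last_name', F.element_at(F.col('splits'), -1))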
    

    select

    
    # Add a column to voter_df for any voter with the title **Councilmember**
    voter_df = voter_df.withColumn('random_val',
                                   F.when(voter_df.TITLE == 'Councilmember', F.rand())) # new random_val column: a random number when TITLE == 'Councilmember', otherwise null
    
    # Show some of the DataFrame rows, noting whether the when clause worked
    voter_df.show()
    
    
    # Inspect the output
    <script.py> output:
        +----------+-------------+-------------------+--------------------+
        |      DATE|        TITLE|         VOTER_NAME|          random_val|
        +----------+-------------+-------------------+--------------------+
        |02/08/2017|Councilmember|  Jennifer S. Gates|  0.6072090670654174|
        |02/08/2017|Councilmember| Philip T. Kingston|  0.8779894137938334|
        |02/08/2017|        Mayor|Michael S. Rawlings|                null|
        |02/08/2017|Councilmember|       Adam Medrano|  0.2496996705882797|
        |02/08/2017|Councilmember|       Casey Thomas| 0.20338678125255483|
        |02/08/2017|Councilmember|Carolyn King Arnold|   0.911553073855913|
        |02/08/2017|Councilmember|       Scott Griggs|  0.1134459593298831|
        |02/08/2017|Councilmember|   B. Adam  McGough| 0.42041407481646487|
        |02/08/2017|Councilmember|       Lee Kleinman|  0.9109217573924748|
        |02/08/2017|Councilmember|      Sandy Greyson|0.055814633336865205|
        |02/08/2017|Councilmember|  Jennifer S. Gates|  0.9429223451510873|
        |02/08/2017|Councilmember| Philip T. Kingston|0.022915415927586502|
        |02/08/2017|        Mayor|Michael S. Rawlings|                null|
        |02/08/2017|Councilmember|       Adam Medrano|  0.9833216773540682|
        |02/08/2017|Councilmember|       Casey Thomas|  0.2944981610876857|
        |02/08/2017|Councilmember|Carolyn King Arnold|    0.67447246683049|
        |02/08/2017|Councilmember| Rickey D. Callahan| 0.20480391619888827|
        |01/11/2017|Councilmember|  Jennifer S. Gates| 0.14057384767559866|
        |04/25/2018|Councilmember|     Sandy  Greyson|  0.6598564900991037|
        |04/25/2018|Councilmember| Jennifer S.  Gates|  0.9412719200394332|
        +----------+-------------+-------------------+--------------------+
        only showing top 20 rows
    
    

    when

    # Add a column to voter_df for a voter based on their position
    voter_df = voter_df.withColumn('random_val',
                                   F.when(voter_df.TITLE == 'Councilmember', F.rand()) # when()/otherwise() chains take the place of if/else
                                   .when(voter_df.TITLE == 'Mayor', 2)
                                   .otherwise(0))
    
    # Show some of the DataFrame rows
    voter_df.show()
    
    # Use the .filter() clause with random_val
    voter_df.filter(voter_df.random_val == 0).show()
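
    The same branching logic can be written as a SQL CASE expression through F.expr(); a sketch equivalent to the when()/otherwise() chain above:

    # CASE WHEN mirrors when()/otherwise(): rand() for Councilmember, 2 for Mayor, else 0
    voter_df = voter_df.withColumn('random_val', F.expr(
        "CASE WHEN TITLE = 'Councilmember' THEN rand() "
        "WHEN TITLE = 'Mayor' THEN 2 ELSE 0 END"))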
    
    
    user defined functions

    UDF: user-defined functions

    def getFirstAndMiddle(names):
      # Return a space separated string of names
      return ' '.join(names[:-1])
    
    # Define the method as a UDF
    udfFirstAndMiddle = F.udf(getFirstAndMiddle, StringType()) # wrap the Python function as a UDF and declare its string return type
    
    # Create a new column using your UDF
    voter_df = voter_df.withColumn('first_and_middle_name', udfFirstAndMiddle(voter_df.splits)) # first argument is the new column name; second is the UDF applied to the splits column
    
    # Show the DataFrame
    voter_df.show()
    
    
    <script.py> output:
        +----------+-------------+-------------------+--------------------+----------+---------+---------------------+
        |      DATE|        TITLE|         VOTER_NAME|              splits|first_name|last_name|first_and_middle_name|
        +----------+-------------+-------------------+--------------------+----------+---------+---------------------+
        |02/08/2017|Councilmember|  Jennifer S. Gates|[Jennifer, S., Ga...|  Jennifer|    Gates|          Jennifer S.|
        |02/08/2017|Councilmember| Philip T. Kingston|[Philip, T., King...|    Philip| Kingston|            Philip T.|
        |02/08/2017|        Mayor|Michael S. Rawlings|[Michael, S., Raw...|   Michael| Rawlings|           Michael S.|
        |02/08/2017|Councilmember|       Adam Medrano|     [Adam, Medrano]|      Adam|  Medrano|                 Adam|
        |02/08/2017|Councilmember|       Casey Thomas|     [Casey, Thomas]|     Casey|   Thomas|                Casey|
        |02/08/2017|Councilmember|Carolyn King Arnold|[Carolyn, King, A...|   Carolyn|   Arnold|         Carolyn King|
        |02/08/2017|Councilmember|       Scott Griggs|     [Scott, Griggs]|     Scott|   Griggs|                Scott|
        |02/08/2017|Councilmember|   B. Adam  McGough| [B., Adam, McGough]|        B.|  McGough|              B. Adam|
        |02/08/2017|Councilmember|       Lee Kleinman|     [Lee, Kleinman]|       Lee| Kleinman|                  Lee|
        |02/08/2017|Councilmember|      Sandy Greyson|    [Sandy, Greyson]|     Sandy|  Greyson|                Sandy|
        |02/08/2017|Councilmember|  Jennifer S. Gates|[Jennifer, S., Ga...|  Jennifer|    Gates|          Jennifer S.|
        |02/08/2017|Councilmember| Philip T. Kingston|[Philip, T., King...|    Philip| Kingston|            Philip T.|
        |02/08/2017|        Mayor|Michael S. Rawlings|[Michael, S., Raw...|   Michael| Rawlings|           Michael S.|
        |02/08/2017|Councilmember|       Adam Medrano|     [Adam, Medrano]|      Adam|  Medrano|                 Adam|
        |02/08/2017|Councilmember|       Casey Thomas|     [Casey, Thomas]|     Casey|   Thomas|                Casey|
        |02/08/2017|Councilmember|Carolyn King Arnold|[Carolyn, King, A...|   Carolyn|   Arnold|         Carolyn King|
        |02/08/2017|Councilmember| Rickey D. Callahan|[Rickey, D., Call...|    Rickey| Callahan|            Rickey D.|
        |01/11/2017|Councilmember|  Jennifer S. Gates|[Jennifer, S., Ga...|  Jennifer|    Gates|          Jennifer S.|
        |04/25/2018|Councilmember|     Sandy  Greyson|    [Sandy, Greyson]|     Sandy|  Greyson|                Sandy|
        |04/25/2018|Councilmember| Jennifer S.  Gates|[Jennifer, S., Ga...|  Jennifer|    Gates|          Jennifer S.|
        +----------+-------------+-------------------+--------------------+----------+---------+---------------------+
        only showing top 20 rows
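
    Python UDFs run row by row and move data between the JVM and the Python worker, so built-in functions are usually faster. A sketch of the same first-and-middle-name result using only built-ins (assumes Spark 2.4+ for slice(), and that the splits column still exists):

    # Join every element of splits except the last one with a space
    voter_df = voter_df.withColumn(
        'first_and_middle_name',
        F.concat_ws(' ', F.expr('slice(splits, 1, size(splits) - 1)')))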
    

    Partitioning and lazy processing

    Partitioning and lazy evaluation

    Spark does not execute transformations immediately; it defers the work until an action requires a result (lazy evaluation).
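
    A minimal sketch of what this means, reusing voter_df from above: transformations such as withColumn() only build an execution plan, explain() prints that plan without running it, and an action such as count() finally triggers the computation.

    lazy_df = voter_df.withColumn('ROW_ID', F.monotonically_increasing_id())  # no work happens yet
    lazy_df.explain()       # prints the physical plan that would run
    print(lazy_df.count())  # count() is an action, so the plan executes here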

    # Select all the unique council voters
    voter_df = df.select(df["VOTER NAME"]).distinct() # select the column and drop duplicates; this replaces repeated groupBy calls used only for deduplication
    # Count the rows in voter_df
    print("\nThere are %d rows in the voter_df DataFrame." % voter_df.count()) # count the rows
    
    # Add a ROW_ID
    voter_df = voter_df.withColumn('ROW_ID', F.monotonically_increasing_id()) # add a monotonically increasing ID column
    
    # Show the rows with 10 highest IDs in the set
    voter_df.orderBy(voter_df.ROW_ID.desc()).show(10) # sort by ROW_ID descending and show the top 10
    

    cache

    Caching

    import time

    start_time = time.time()
    
    # Add caching to the unique rows in departures_df
    departures_df = departures_df.distinct().cache()
    
    # Count the unique rows in departures_df, noting how long the operation takes
    print("Counting %d rows took %f seconds" % (departures_df.count(), time.time() - start_time))
    
    # Count the rows again, noting the variance in time of a cached DataFrame
    start_time = time.time()
    print("Counting %d rows again took %f seconds" % (departures_df.count(), time.time() - start_time))
    

    Measuring run time

    # Import the full and split files into DataFrames
    full_df = spark.read.csv('departures_full.txt.gz')
    split_df = spark.read.csv('departures_0*.txt.gz')
    
    # Print the count and run time for each DataFrame
    start_time_a = time.time() # start time
    print("Total rows in full DataFrame:\t%d" % full_df.count())
    print("Time to run: %f" % (time.time() - start_time_a)) # elapsed time
     
    start_time_b = time.time()
    print("Total rows in split DataFrame:	%d" % split_df.count())
    print("Time to run: %f" % (time.time() - start_time_b))
    

    Cluster configuration

    
    # Name of the Spark application instance
    app_name = spark.conf.get('spark.app.name')
    
    # Driver TCP port
    driver_tcp_port = spark.conf.get('spark.driver.port')
    
    # Number of join partitions
    num_partitions = spark.conf.get('spark.sql.shuffle.partitions')
    
    # Show the results
    print("Name: %s" % app_name)
    print("Driver TCP port: %s" % driver_tcp_port)
    print("Number of partitions: %s" % num_partitions)
    
    # Store the number of partitions in variable
    before = departures_df.rdd.getNumPartitions()
    
    # Configure Spark to use 500 partitions
    spark.conf.set('spark.sql.shuffle.partitions', 500)
    
    # Recreate the DataFrame using the departures data file
    departures_df = spark.read.csv('departures.txt.gz').distinct()
    
    # Print the number of partitions for each instance
    print("Partition count before change: %d" % before)
    print("Partition count after change: %d" % departures_df.rdd.getNumPartitions())
    

    json

    Writing JSON output

    # Import the data to a DataFrame
    departures_df = spark.read.csv('2015-departures.csv.gz', header=True)
    
    # Remove any duration of 0
    departures_df = departures_df.filter(departures_df[3] > 0)
    
    # Add an ID column
    departures_df = departures_df.withColumn('id', F.monotonically_increasing_id())
    
    # Write the file out to JSON format
    departures_df.write.json('output.json', mode='overwrite') # write the DataFrame out in JSON format
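
    Reading the result back; write.json() produces a directory of part files, and the schema is inferred on read (a quick check, reusing the path above):

    departures_json_df = spark.read.json('output.json')
    departures_json_df.printSchema()
    departures_json_df.show(5)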
    

    The ~ operator negates a boolean column expression (logical NOT).
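
    For example, these two filters keep the same rows (a sketch on a hypothetical DataFrame df with a colcount column):

    df.filter(~ (df['colcount'] < 5))   # negate the condition with ~
    df.filter(df['colcount'] >= 5)      # equivalent positive form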

    # Split _c0 on the tab character and store the list in a variable
    tmp_fields = F.split(annotations_df['_c0'], '\t')
    
    # Create the colcount column on the DataFrame
    annotations_df = annotations_df.withColumn('colcount', F.size(tmp_fields))
    
    # Remove any rows containing fewer than 5 fields
    annotations_df_filtered = annotations_df.filter(~ (annotations_df["colcount"] < 5))
    
    # Count the number of rows before and after filtering
    initial_count = annotations_df.count()
    final_count = annotations_df_filtered.count()
    print("Initial count: %d\nFinal count: %d" % (initial_count, final_count))
    

  • Original post: https://www.cnblogs.com/gaowenxingxing/p/14511179.html