• PySpark Replace Column Values in DataFrame (field/column data replacement, including regex)


    PySpark Replace Column Values in DataFrame


    Reprinted from: https://sparkbyexamples.com/pyspark/pyspark-replace-column-values/

    1. Create DataFrame

    from pyspark.sql import SparkSession
    spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()
    address = [(1,"14851 Jeffrey Rd","DE"),
        (2,"43421 Margarita St","NY"),
        (3,"13111 Siemon Ave","CA")]
    df = spark.createDataFrame(address,["id","address","state"])
    df.show()
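
    For reference, df.show() on this DataFrame should print:

    #+---+------------------+-----+
    #| id|           address|state|
    #+---+------------------+-----+
    #|  1|  14851 Jeffrey Rd|   DE|
    #|  2|43421 Margarita St|   NY|
    #|  3|  13111 Siemon Ave|   CA|
    #+---+------------------+-----+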
    

    2. Use a Regular Expression to Replace String Column Values

    #Replace part of string with another string
    from pyspark.sql.functions import regexp_replace
    df.withColumn('address', regexp_replace('address', 'Rd', 'Road')) \
      .show(truncate=False)
    # createVar[f"{table_name}_df"] = getattr(sys.modules[__name__], f'{table_name}_df').withColumn('STVINNO',regexp_replace('STVINNO', '�', ''))
    
    #+---+------------------+-----+
    #|id |address           |state|
    #+---+------------------+-----+
    #|1  |14851 Jeffrey Road|DE   |
    #|2  |43421 Margarita St|NY   |
    #|3  |13111 Siemon Ave  |CA   |
    #+---+------------------+-----+
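
    Since the pattern argument of regexp_replace is a regular expression, you can match patterns rather than literal substrings. A minimal sketch (the digit-stripping pattern is an illustrative assumption, not part of the original example):

    #Sketch: regexp_replace with an actual regex pattern (illustrative)
    df.withColumn('address', regexp_replace('address', r'\d+', '')) \
      .show(truncate=False)
    # e.g. "14851 Jeffrey Rd" becomes " Jeffrey Rd"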
    

    3. Replace Column Values Conditionally

    #Replace string column value conditionally
    from pyspark.sql.functions import when
    df.withColumn('address',
        when(df.address.endswith('Rd'),regexp_replace(df.address,'Rd','Road'))
       .when(df.address.endswith('St'),regexp_replace(df.address,'St','Street'))
       .when(df.address.endswith('Ave'),regexp_replace(df.address,'Ave','Avenue'))
       .otherwise(df.address)) \
       .show(truncate=False)
    
    #+---+----------------------+-----+
    #|id |address               |state|
    #+---+----------------------+-----+
    #|1  |14851 Jeffrey Road    |DE   |
    #|2  |43421 Margarita Street|NY   |
    #|3  |13111 Siemon Avenue   |CA   |
    #+---+----------------------+-----+ 
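
    If you have many suffix mappings, the same when/otherwise chain can be built in a loop. A minimal sketch, assuming the suffixes dictionary below (each iteration wraps the previous chain, which is equivalent here because the conditions are mutually exclusive):

    #Sketch: build the when/otherwise chain from a dictionary (assumed mapping)
    suffixes = {'Rd':'Road', 'St':'Street', 'Ave':'Avenue'}
    addr = df.address
    for abbr, full in suffixes.items():
        addr = when(df.address.endswith(abbr),
                    regexp_replace(df.address, abbr, full)).otherwise(addr)
    df.withColumn('address', addr).show(truncate=False)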
    

    4. Replace Column Values with a Dictionary (map)

    #Replace values from Dictionary
    stateDic={'CA':'California','NY':'New York','DE':'Delaware'}
    df2 = df.rdd.map(lambda x: (x.id, x.address, stateDic[x.state])).toDF(["id","address","state"])
    df2.show()
    
    #+---+------------------+----------+
    #| id|           address|     state|
    #+---+------------------+----------+
    #|  1|  14851 Jeffrey Rd|  Delaware|
    #|  2|43421 Margarita St|  New York|
    #|  3|  13111 Siemon Ave|California|
    #+---+------------------+----------+
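
    Note that the lambda above raises a KeyError for any state missing from the dictionary. DataFrame.replace() performs the same lookup natively and leaves unmatched values untouched; a minimal sketch:

    #Sketch: dictionary replacement without the RDD round-trip
    df.replace(stateDic, subset=['state']).show()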
    

    5. Replace Column Values Character by Character

    #Using translate to replace character by character
    from pyspark.sql.functions import translate
    df.withColumn('address', translate('address', '123', 'ABC')) \
      .show(truncate=False)
    
    #+---+------------------+-----+
    #|id |address           |state|
    #+---+------------------+-----+
    #|1  |A485A Jeffrey Rd  |DE   |
    #|2  |4C4BA Margarita St|NY   |
    #|3  |ACAAA Siemon Ave  |CA   |
    #+---+------------------+-----+
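
    translate also deletes characters: any character in the matching string that has no counterpart in the replacement string is removed. A minimal sketch that deletes spaces (an illustrative use, not from the original):

    #Sketch: a shorter replacement string deletes the unmatched characters
    df.withColumn('address', translate('address', ' ', '')) \
      .show(truncate=False)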
    

    6. Replace a Column with Another Column's Value

    #Replace column with another column
    from pyspark.sql.functions import expr
    df = spark.createDataFrame(
       [("ABCDE_XYZ", "XYZ","FGH")], 
        ("col1", "col2","col3")
      )
    df.withColumn("new_column",
                  expr("regexp_replace(col1, col2, col3)")
                  .alias("replaced_value")
                  ).show()
    
    #+---------+----+----+----------+
    #|     col1|col2|col3|new_column|
    #+---------+----+----+----------+
    #|ABCDE_XYZ| XYZ| FGH| ABCDE_FGH|
    #+---------+----+----+----------+
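
    expr() is used here because, in older Spark versions, the Python regexp_replace function accepts only plain strings for the pattern and replacement; wrapping the SQL form lets all three arguments be columns. The same expression can also be written with selectExpr:

    #Equivalent using selectExpr
    df.selectExpr("*", "regexp_replace(col1, col2, col3) AS new_column").show()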
    

    7. All In One

    from pyspark.sql import SparkSession
    spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()
    
    address = [(1,"14851 Jeffrey Rd","DE"),
        (2,"43421 Margarita St","NY"),
        (3,"13111 Siemon Ave","CA")]
    
    df = spark.createDataFrame(address,["id","address","state"])
    df.show()
    
    #Replace part of a string
    from pyspark.sql.functions import regexp_replace
    df.withColumn('address', regexp_replace('address', 'Rd', 'Road')) \
      .show(truncate=False)
    
    #Replace string conditionally
    from pyspark.sql.functions import when
    df.withColumn('address',
          when(df.address.endswith('Rd'),regexp_replace(df.address,'Rd','Road'))
         .when(df.address.endswith('St'),regexp_replace(df.address,'St','Street'))
         .when(df.address.endswith('Ave'),regexp_replace(df.address,'Ave','Avenue'))
         .otherwise(df.address)) \
         .show(truncate=False)
    
    
    #Replace values from Dictionary
    stateDic={'CA':'California','NY':'New York','DE':'Delaware'}
    df2 = df.rdd.map(lambda x: (x.id, x.address, stateDic[x.state])).toDF(["id","address","state"])
    df2.show()
    
    #Using translate
    from pyspark.sql.functions import translate
    df.withColumn('address', translate('address', '123', 'ABC')) \
      .show(truncate=False)
    
    #Replace column with another column
    from pyspark.sql.functions import expr
    df = spark.createDataFrame([("ABCDE_XYZ", "XYZ","FGH")], ("col1", "col2","col3"))
    df.withColumn("new_column",
                  expr("regexp_replace(col1, col2, col3)")
                  .alias("replaced_value")
                  ).show()
      
    #Overlay: replace a substring at a fixed position
    from pyspark.sql.functions import overlay
    df = spark.createDataFrame([("ABCDE_XYZ", "FGH")], ("col1", "col2"))
    df.select(overlay("col1", "col2", 7).alias("overlayed")).show()
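
    overlay replaces the substring of col1 starting at position 7, with the replacement length defaulting to the length of col2, so the statement above should print:

    #+---------+
    #|overlayed|
    #+---------+
    #|ABCDE_FGH|
    #+---------+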
    
• Original post: https://www.cnblogs.com/pandaa/p/15029479.html