PySpark Replace Column Values in DataFrame
PySpark field/column value replacement (including regex)
Reprint: https://sparkbyexamples.com/pyspark/pyspark-replace-column-values/
1. Create DataFrame
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()
address = [(1,"14851 Jeffrey Rd","DE"),
(2,"43421 Margarita St","NY"),
(3,"13111 Siemon Ave","CA")]
df = spark.createDataFrame(address,["id","address","state"])
df.show()
2. Use Regular Expression to Replace String Column Value
#Replace part of string with another string
from pyspark.sql.functions import regexp_replace
df.withColumn('address', regexp_replace('address', 'Rd', 'Road')) \
  .show(truncate=False)
#+---+------------------+-----+
#|id |address |state|
#+---+------------------+-----+
#|1 |14851 Jeffrey Road|DE |
#|2 |43421 Margarita St|NY |
#|3 |13111 Siemon Ave |CA |
#+---+------------------+-----+
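The pattern argument of regexp_replace() is a full (Java) regular expression, not just a literal; the example above never uses one, so here is a small sketch with an actual pattern (the pattern is my own, for illustration):
#Replace a regex match: mask the leading street number
df.withColumn('address', regexp_replace('address', r'^\d+', '*')) \
  .show(truncate=False)
# e.g. "14851 Jeffrey Rd" -> "* Jeffrey Rd"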
3. Replace Column Values Conditionally
#Replace string column value conditionally
from pyspark.sql.functions import when
df.withColumn('address',
    when(df.address.endswith('Rd'), regexp_replace(df.address, 'Rd', 'Road'))
    .when(df.address.endswith('St'), regexp_replace(df.address, 'St', 'Street'))
    .when(df.address.endswith('Ave'), regexp_replace(df.address, 'Ave', 'Avenue'))
    .otherwise(df.address)) \
  .show(truncate=False)
#+---+----------------------+-----+
#|id |address |state|
#+---+----------------------+-----+
#|1 |14851 Jeffrey Road |DE |
#|2 |43421 Margarita Street|NY |
#|3 |13111 Siemon Avenue |CA |
#+---+----------------------+-----+
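When the suffix list grows, the same when-chain can be built in a loop. A minimal sketch; the suffix_map dict and the loop are my own additions, not part of the original article:
#Build the conditional chain programmatically (hypothetical suffix_map)
suffix_map = {'Rd': 'Road', 'St': 'Street', 'Ave': 'Avenue'}
col_expr = None
for suf, full in suffix_map.items():
    cond = df.address.endswith(suf)
    repl = regexp_replace(df.address, suf, full)
    col_expr = when(cond, repl) if col_expr is None else col_expr.when(cond, repl)
df.withColumn('address', col_expr.otherwise(df.address)).show(truncate=False)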
4. Replace Column Value with Dictionary (map)
#Replace values from Dictionary
stateDic = {'CA': 'California', 'NY': 'New York', 'DE': 'Delaware'}
df2 = df.rdd.map(lambda x:
    (x.id, x.address, stateDic[x.state])  # raises KeyError if a state code is missing from stateDic
).toDF(["id", "address", "state"])
df2.show()
#+---+------------------+----------+
#| id| address| state|
#+---+------------------+----------+
#| 1| 14851 Jeffrey Rd| Delaware|
#| 2|43421 Margarita St| New York|
#| 3| 13111 Siemon Ave|California|
#+---+------------------+----------+
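An alternative that avoids the RDD round-trip is DataFrame.replace(), which accepts a dict directly and, unlike the lambda above, leaves unmapped values untouched instead of raising KeyError:
#Replace values from Dictionary, staying in the DataFrame API
df.replace(stateDic, subset=['state']).show()
# same output as df2 above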
5. Replace Column Value Character by Character
#Using translate to replace character by character
from pyspark.sql.functions import translate
df.withColumn('address', translate('address', '123', 'ABC')) \
  .show(truncate=False)
#+---+------------------+-----+
#|id |address |state|
#+---+------------------+-----+
#|1 |A485A Jeffrey Rd |DE |
#|2 |4C4BA Margarita St|NY |
#|3 |ACAAA Siemon Ave |CA |
#+---+------------------+-----+
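translate() can also delete characters: any character in the matching string without a counterpart in the replace string is removed. A short sketch (my own example): drop every digit.
#Delete characters by passing a shorter replace string
df.withColumn('address', translate('address', '0123456789', '')) \
  .show(truncate=False)
# "14851 Jeffrey Rd" -> " Jeffrey Rd"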
6. Replace Column with Another Column Value
#Replace column with another column
from pyspark.sql.functions import expr
df = spark.createDataFrame(
[("ABCDE_XYZ", "XYZ","FGH")],
("col1", "col2","col3")
)
df.withColumn("new_column",
expr("regexp_replace(col1, col2, col3)")
.alias("replaced_value")
).show()
#+---------+----+----+----------+
#| col1|col2|col3|new_column|
#+---------+----+----+----------+
#|ABCDE_XYZ| XYZ| FGH| ABCDE_FGH|
#+---------+----+----+----------+
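Because the second argument is evaluated as a regular expression, the pattern column may hold a real regex, not just literal text. A minimal sketch; df_rx and its data are invented for illustration:
#Pattern column holding a regex
df_rx = spark.createDataFrame([("ABCDE_XYZ", "[A-C]+", "#")], ("col1", "col2", "col3"))
df_rx.withColumn("new_column", expr("regexp_replace(col1, col2, col3)")).show()
# "ABC" matches [A-C]+ -> "#DE_XYZ"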
7. All In One
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()
address = [(1,"14851 Jeffrey Rd","DE"),
(2,"43421 Margarita St","NY"),
(3,"13111 Siemon Ave","CA")]
df = spark.createDataFrame(address, ["id", "address", "state"])
df.show()
#Replace string
from pyspark.sql.functions import regexp_replace
df.withColumn('address', regexp_replace('address', 'Rd', 'Road')) \
  .show(truncate=False)
#Replace string conditionally
from pyspark.sql.functions import when
df.withColumn('address',
    when(df.address.endswith('Rd'), regexp_replace(df.address, 'Rd', 'Road'))
    .when(df.address.endswith('St'), regexp_replace(df.address, 'St', 'Street'))
    .when(df.address.endswith('Ave'), regexp_replace(df.address, 'Ave', 'Avenue'))
    .otherwise(df.address)) \
  .show(truncate=False)
#Replace values from Dictionary
stateDic={'CA':'California','NY':'New York','DE':'Delaware'}
df2 = df.rdd.map(lambda x:
    (x.id, x.address, stateDic[x.state])
).toDF(["id", "address", "state"])
df2.show()
#Using translate
from pyspark.sql.functions import translate
df.withColumn('address', translate('address', '123', 'ABC')) \
  .show(truncate=False)
#Replace column with another column
from pyspark.sql.functions import expr
df = spark.createDataFrame([("ABCDE_XYZ", "XYZ","FGH")], ("col1", "col2","col3"))
df.withColumn("new_column",
expr("regexp_replace(col1, col2, col3)")
.alias("replaced_value")
).show()
#Overlay
from pyspark.sql.functions import overlay
df = spark.createDataFrame([("ABCDE_XYZ", "FGH")], ("col1", "col2"))
df.select(overlay("col1", "col2", 7).alias("overlayed")).show()
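overlay(col1, col2, 7) writes col2 into col1 starting at 1-based position 7; with no length argument the length of col2 is used, so "XYZ" is overwritten by "FGH".
#+---------+
#|overlayed|
#+---------+
#|ABCDE_FGH|
#+---------+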