# PySpark UDF: convert full-width (全角) characters to half-width (半角)
# Locate a local Spark installation and put it on sys.path before importing pyspark.
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
# NOTE(review): wildcard import — only `udf` is used below; prefer an explicit
# `from pyspark.sql.functions import udf` once confirmed nothing else relies on it.
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, FloatType, StringType
# NOTE(review): SQLContext is the legacy entry point; modern code uses
# SparkSession.builder.getOrCreate(). Kept as-is to avoid changing behavior.
sc = SparkContext()
sqlContext = SQLContext(sc)
# 全角转成半角
# Convert full-width characters to half-width
def full2half(s):
    """Convert full-width (zenkaku) characters in *s* to their half-width forms.

    The ideographic space U+3000 maps to an ASCII space; the full-width ASCII
    variants U+FF01..U+FF5E map down by 0xFEE0 to U+0021..U+007E. All other
    characters pass through unchanged.

    Args:
        s: input string, or None (Spark passes None for SQL NULL values).

    Returns:
        The converted string, or None when the input is None.
    """
    if s is None:
        # A Python UDF receives None for NULL cells; propagate it instead of crashing.
        return None
    out = []
    for ch in s:
        code = ord(ch)
        if code == 0x3000:  # full-width (ideographic) space -> ASCII space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:  # full-width ASCII variants -> half-width
            code -= 0xFEE0
        out.append(chr(code))
    # ''.join avoids the quadratic cost of repeated string concatenation.
    return ''.join(out)
# Build a small demo DataFrame: org names with full-width punctuation/letters.
rows = [
    ['DBD', '迪布达'],
    ['GBD', '迪布达(中国)'],
    ['GBD', '迪布达(中国)'],
    ['GBD', '迪布达(中国)'],
]
df = sqlContext.createDataFrame(rows, ['code', 'org'])
# Wrap the plain Python function as a Spark UDF returning a string column.
full_half = udf(full2half, StringType())
# Replace the `org` column in place with its half-width-normalized form.
df = df.withColumn('org', full_half(df['org']))
df.show()
# output (withColumn keeps the column name `org`):
# +----+------------+
# |code|         org|
# +----+------------+
# | DBD|      迪布达|
# | GBD|迪布达(中国)|
# | GBD|迪布达(中国)|
# | GBD|迪布达(中国)|
# +----+------------+
# Register the same UDF under the name `fullToHalf` so it is callable from SQL.
sqlContext.udf.register(name="fullToHalf", f=full_half)
# Expose the DataFrame as a temporary view for SQL queries.
df.createOrReplaceTempView("t1")
sqlContext.sql(
    """
    SELECT code, fullToHalf(org) FROM t1
    """
).show()
# NOTE: the view is registered as `t1`; the query previously said `T1`, which
# only worked because spark.sql.caseSensitive defaults to false. Using the
# exact registered name keeps it correct under case-sensitive configurations.
# output :
# +----+---------------+
# |code|fullToHalf(org)|
# +----+---------------+
# | DBD| 迪布达|
# | GBD| 迪布达(中国)|
# | GBD| 迪布达(中国)|
# | GBD| 迪布达(中国)|
# +----+---------------+