import pandas as pd
import numpy as np
常规的字符串操作
s = pd.Series(['A',"B","C","AaBa","Baca",np.nan,'dog','cat'])
s
0 A
1 B
2 C
3 AaBa
4 Baca
5 NaN
6 dog
7 cat
dtype: object
s.str.lower()
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 dog
7 cat
dtype: object
s.str.upper()
0 A
1 B
2 C
3 AABA
4 BACA
5 NaN
6 DOG
7 CAT
dtype: object
s.str.len()
0 1.0
1 1.0
2 1.0
3 4.0
4 4.0
5 NaN
6 3.0
7 3.0
dtype: float64
idx = pd.Index([' jack','jill ',' jesse','frank'])
idx.str.strip() # 去掉左右两边的空白符
Index(['jack', 'jill', 'jesse', 'frank'], dtype='object')
idx.str.lstrip() # 去掉左边的空白符
Index(['jack', 'jill ', 'jesse', 'frank'], dtype='object')
idx.str.rstrip() # 去掉右边的空白符
Index([' jack', 'jill', ' jesse', 'frank'], dtype='object')
df = pd.DataFrame(np.random.randn(3,2),columns=[' Column A ',' Column B '],index=range(3))
df
    Column A    Column B
0   0.048811   -1.097950
1  -1.099516   -0.514286
2   0.984136   -1.027790
df.columns.str.strip()
Index(['Column A', 'Column B'], dtype='object')
df.columns.str.lower()
Index([' column a ', ' column b '], dtype='object')
df.columns = df.columns.str.strip().str.lower().str.replace(' ',"_")
df
    column_a    column_b
0   0.048811   -1.097950
1  -1.099516   -0.514286
2   0.984136   -1.027790
分割与替换字符
str.split 操作
s2 = pd.Series(['a_b_c',"c_D_e",np.nan,'f_g_H'])
s2.str.split("_")
0 [a, b, c]
1 [c, D, e]
2 NaN
3 [f, g, H]
dtype: object
s2.str.split('_')[1]
['c', 'D', 'e']
s2.str.split('_').str[1] # 分割之后的 Series 仍可通过 .str 访问器按位置取出每个列表中的元素
0 b
1 D
2 NaN
3 g
dtype: object
s2.str.split('_').str.get(1)
0 b
1 D
2 NaN
3 g
dtype: object
s2.str.split('_',expand=True,n=1) # expand=True 将分割结果展开为多列(DataFrame),n 用来限制分割的次数
     0    1
0    a  b_c
1    c  D_e
2  NaN  NaN
3    f  g_H
s2.str.rsplit('_',expand=True,n=1) # rsplit 方法
     0    1
0  a_b    c
1  c_D    e
2  NaN  NaN
3  f_g    H
str.replace操作
s3 = pd.Series(['A',"B","C","AaBa","Baca",np.nan,"CABA","dog","cat"])
s3
0 A
1 B
2 C
3 AaBa
4 Baca
5 NaN
6 CABA
7 dog
8 cat
dtype: object
s3.str.replace('^.a|dog','XX_XX',case=False) # 把开头的“任意字符+a”或者 dog 替换为 XX_XX,忽略大小写;模式按正则表达式解析(pandas 2.0 及以上需显式传入 regex=True)
0 A
1 B
2 C
3 XX_XXBa
4 XX_XXca
5 NaN
6 XX_XXBA
7 XX_XX
8 XX_XXt
dtype: object
dollars = pd.Series(['12', '-$10', '$10,000'])
dollars.str.replace('$', '') # replace $ to ''
0 12
1 -10
2 10,000
dtype: object
dollars.str.replace("-$",'-') # doesn't work: 按正则解析时 $ 表示字符串结尾,而不是字面量 '$'
0 12
1 -$10
2 $10,000
dtype: object
dollars.str.replace(r'-\$','-')
# 用反斜杠转义 $,使其按字面量匹配,把 '-$' 替换成 '-'
0 12
1 -10
2 $10,000
dtype: object
dollars.str.replace('-$', '-', regex=False) # regex=False 时按普通字符串匹配,无需转义
0 12
1 -10
2 $10,000
dtype: object
str.cat操作
s = pd.Series(['A',"B","C","D"])
s.str.cat(sep=',')
'A,B,C,D'
s.str.cat()
'ABCD'
t = pd.Series(['a', 'b', np.nan, 'd'])
t.str.cat(sep=',',na_rep='_')
'a,b,_,d'
s.str.cat(['a',"b","c","d"])
0 Aa
1 Bb
2 Cc
3 Dd
dtype: object
pd.Series(['a1', 'b2', 'c3']).str.extract(r'(?P<letter>[ab])(?P<digit>\d)', expand=False)# 用 (?P<name>...) 给分组命名,组名会成为结果的列名
  letter digit
0      a     1
1      b     2
2    NaN   NaN
match or contain操作
pattern = r'[0-9][a-z]'
pd.Series(['1','2','3a','3b','03c']).str.contains(pattern)# contains: 文本中任意位置出现“数字+小写字母”即为 True
0 False
1 False
2 True
3 True
4 True
dtype: bool
pd.Series(['1','2','3a','3b','03c']).str.match(pattern)# match: 必须从文本开头就匹配“数字+小写字母”,所以 '03c' 为 False
0 False
1 False
2 True
3 True
4 False
dtype: bool
其他方法可以参考官方文档中的字符串方法列表