• pandas文本处理


      1 import pandas as pd
      2 import numpy as np
      3 
      4 s = pd.Series(['A', 'b', 'c', 'bbhello', '123', np.nan, 'hj'])
      5 df = pd.DataFrame({'key1': list('abcdef'),
      6                    'key2': ['hee', 'fv', 'w', 'hija', '123', np.nan]})
      7 print(s)
      8 print('-'*8)
      9 print(df)
     10 print('-'*8)
     11 '''
     12 0          A
     13 1          b
     14 2          c
     15 3    bbhello
     16 4        123
     17 5        NaN
     18 6         hj
     19 dtype: object
     20 --------
     21   key1  key2
     22 0    a   hee
     23 1    b    fv
     24 2    c     w
     25 3    d  hija
     26 4    e   123
     27 5    f   NaN
     28 --------
     29 '''
     30 # 直接通过.str调用字符串方法,可用于Series和Index(例如DataFrame的某一列或df.columns),NaN值会被自动跳过并在结果中保留为NaN
     31 print(s.str.count('b'))
     32 '''
     33 0    0.0
     34 1    1.0
     35 2    0.0
     36 3    2.0
     37 4    0.0
     38 5    NaN
     39 6    0.0
     40 dtype: float64
     41 '''
     42 print(df['key2'].str.upper())
     43 '''
     44 0     HEE
     45 1      FV
     46 2       W
     47 3    HIJA
     48 4     123
     49 5     NaN
     50 Name: key2, dtype: object
     51 '''
     52 # 将所有的列名改为大写
     53 df.columns = df.columns.str.upper()
     54 print(df)
     55 '''
     56   KEY1  KEY2
     57 0    a   hee
     58 1    b    fv
     59 2    c     w
     60 3    d  hija
     61 4    e   123
     62 5    f   NaN
     63 '''
     64 # 字符串常用方法 --lower,upper,len,startswith,endswith
     65 
     66 print('小写,lower()',s.str.lower())
     67 print('大写,upper()',s.str.upper())
     68 print('长度,len()',s.str.len())
     69 print('判断起始是否为b,startswith()',s.str.startswith('b'))
     70 print('判断结束是否为"o",endswith()',s.str.endswith('o'))
     71 '''
     72 小写,lower() 0          a
     73 1          b
     74 2          c
     75 3    bbhello
     76 4        123
     77 5        NaN
     78 6         hj
     79 dtype: object
     80 大写,upper() 0          A
     81 1          B
     82 2          C
     83 3    BBHELLO
     84 4        123
     85 5        NaN
     86 6         HJ
     87 dtype: object
     88 长度,len() 0    1.0
     89 1    1.0
     90 2    1.0
     91 3    7.0
     92 4    3.0
     93 5    NaN
     94 6    2.0
     95 dtype: float64
     96 判断起始是否为b,startswith() 0    False
     97 1     True
     98 2    False
     99 3     True
    100 4    False
    101 5      NaN
    102 6    False
    103 dtype: object
    104 判断结束是否为"o",endswith() 0    False
    105 1    False
    106 2    False
    107 3     True
    108 4    False
    109 5      NaN
    110 6    False
    111 dtype: object
    112 '''
    113 # 字符串常用方法 --strip
    114 
    115 s2 = pd.Series([' jack', 'jill ', ' jesse  '])
    116 df2 = pd.DataFrame(np.random.randn(3, 2), columns=[' A ', ' B'], index=range(3))
    117 print(s2)
    118 print('-'*8)
    119 print(df2)
    120 print('-'*8)
    121 '''
    122 0        jack
    123 1       jill 
    124 2     jesse  
    125 dtype: object
    126 --------
    127          A          B
    128 0 -0.333042 -0.467830
    129 1  0.605179 -0.658910
    130 2 -0.490881 -0.639754
    131 --------
    132 '''
    133 print(s2.str.strip())
    134 print('-'*8)
    135 print(s2.str.lstrip())
    136 print('-'*8)
    137 print(s2.str.rstrip())
    138 '''
    139 0     jack
    140 1     jill
    141 2    jesse
    142 dtype: object
    143 --------
    144 0       jack
    145 1      jill 
    146 2    jesse  
    147 dtype: object
    148 --------
    149 0      jack
    150 1      jill
    151 2     jesse
    152 dtype: object
    153 '''
    154 df2.columns = df2.columns.str.strip()
    155 print(df2)
    156 '''
    157           A         B
    158 0 -0.801508  1.650113
    159 1 -0.669556 -1.195999
    160 2  0.277338 -0.727100
    161 
    162 '''
    163 
    164 # 字符串常用方法  -- replace()
    165 df3 = pd.DataFrame(np.random.randn(3, 2), columns=[' A a', ' B  b'], index=range(3))
    166 df3.columns = df3.columns.str.replace(' ', '-', n=2)
    167 print(df3)
    168 '''
    169        -A-a     -B- b
    170 0 -1.225938  0.296270
    171 1  0.769037  2.794032
    172 2 -1.686818  0.109314
    173 '''
    174 # 字符串常用方法 -- split、rsplit
    175 s4 = pd.Series(['a,b,c', '1,2,3', ['a,,,c'], np.nan])
    176 print(s4)
    177 print(s4.str.split(','))
    178 '''
    179 0      a,b,c
    180 1      1,2,3
    181 2    [a,,,c]
    182 3        NaN
    183 dtype: object
    184 0    [a, b, c]
    185 1    [1, 2, 3]
    186 2          NaN
    187 3          NaN
    188 dtype: object
    189 '''
    190 # 对split的结果直接索引(不经过.str)得到的是一个list
    191 # 可以使用get或[]符号访问拆散列表中的元素
    192 print(s4.str.split(',').str[0])
    193 print(s4.str.split(',').str.get(0))
    194 '''
    195 0      a
    196 1      1
    197 2    NaN
    198 3    NaN
    199 dtype: object
    200 0      a
    201 1      1
    202 2    NaN
    203 3    NaN
    204 dtype: object
    205 '''
    206 
    207 # 使用expand=True可以轻松扩展此操作以返回DataFrame
    208 # n 参数限制分割数
    209 print(s4.str.split(','))
    210 print('-' * 8)
    211 print(s4.str.split(',', expand=True))
    212 '''
    213 0    [a, b, c]
    214 1    [1, 2, 3]
    215 2          NaN
    216 3          NaN
    217 dtype: object
    218 --------
    219      0    1    2
    220 0    a    b    c
    221 1    1    2    3
    222 2  NaN  NaN  NaN
    223 3  NaN  NaN  NaN
    224 '''
    225 print(s4.str.split(',', expand=True, n=1))
    226 '''
    227      0    1
    228 0    a  b,c
    229 1    1  2,3
    230 2  NaN  NaN
    231 3  NaN  NaN
    232 '''
    233 # rsplit类似于split,反向工作,即从字符串的末尾到字符串的开头
    234 print(s4.str.split(',', expand=True, n=1))
    235 print('-' * 8)
    236 print(s4.str.rsplit(',', expand=True, n=1))
    237 '''
    238      0    1
    239 0    a  b,c
    240 1    1  2,3
    241 2  NaN  NaN
    242 3  NaN  NaN
    243 --------
    244      0    1
    245 0  a,b    c
    246 1  1,2    3
    247 2  NaN  NaN
    248 3  NaN  NaN
    249 '''
    250 
    251 df4 = pd.DataFrame({'key1': ['a,b,c', '1,2,3', [':,,, ']],
    252                     'key2': ['a-b-c', '1-2-3', [':-.- ']]})
    253 print(df4)
    254 print('-'*8)
    255 print(df4['key2'].str.split('-'))
    256 '''
    257       key1     key2
    258 0    a,b,c    a-b-c
    259 1    1,2,3    1-2-3
    260 2  [:,,, ]  [:-.- ]
    261 --------
    262 0    [a, b, c]
    263 1    [1, 2, 3]
    264 2          NaN
    265 Name: key2, dtype: object
    266 '''
    267 # 通过索引获取分割后的元素
    268 df4['k201'] = df4['key2'].str.split('-').str[0]
    269 df4['k202'] = df4['key2'].str.split('-').str[1]
    270 df4['k203'] = df4['key2'].str.split('-').str[2]
    271 print(df4)
    272 '''
    273       key1     key2 k201 k202 k203
    274 0    a,b,c    a-b-c    a    b    c
    275 1    1,2,3    1-2-3    1    2    3
    276 2  [:,,, ]  [:-.- ]  NaN  NaN  NaN
    277 '''
  • 相关阅读:
    AddParent
    AddChild
    贝塞尔曲线代码
    顶点纹理shader
    抽象方法与抽象类
    Application类
    布局组件
    C#单例和Unity单例
    Unity&UGUI
    Json解析
  • 原文地址:https://www.cnblogs.com/xshan/p/10803333.html
Copyright © 2020-2023  润新知