• pandas中的数值计算及统计基础


      # pandas numeric computation and basic statistics -- tutorial script.
#
# Every reduction over the mixed-type frame `df` passes numeric_only=True.
# `key3` holds both ints and strings (object dtype); old pandas silently
# dropped such columns from reductions, but pandas >= 2.0 raises TypeError
# instead, so the numeric-only selection has to be requested explicitly.
#
# The "Expected output" comments come from the original article; exact
# formatting may vary slightly between pandas versions.
import pandas as pd
import numpy as np

# Sample frame: two float columns that each contain one NaN, plus one
# object column mixing integers and strings.
df = pd.DataFrame({
    'key1': [4, 5, 3, np.nan, 2],
    'key2': [1, 2, np.nan, 4, 5],
    'key3': [1, 2, 3, 'j', 'k'],
}, index=['a', 'b', 'c', 'd', 'e'])
print(df)
print(df['key1'].dtype, df['key2'].dtype, df['key3'].dtype)
print('-------')
# Expected output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
# float64 float64 object
# -------

# Column means. Only the numeric columns are aggregated and NaN values are
# skipped by default (skipna=True).
print(df.mean(numeric_only=True))
# Expected output:
# key1    3.5
# key2    3.0
# dtype: float64

# Column means without skipping NaN: with skipna=False any column that
# contains a NaN yields NaN.
m3 = df.mean(skipna=False, numeric_only=True)
print(m3)
# Expected output:
# key1   NaN
# key2   NaN
# dtype: float64

# Mean of a single column (a Series reduction returns a scalar).
print('计算单一列的均值', df['key2'].mean())
# Expected output:
# 计算单一列的均值 3.0

df2 = pd.DataFrame({
    'key1': [1, 3, 5],
    'key2': [2, 4, 6],
    'key3': [3, 5, 7],
}, index=['a', 'b', 'c'])
# Row-wise mean (axis=1), stored as a new column of df2.
df2['mean'] = df2.mean(axis=1)
print(df2)
# Expected output:
#    key1  key2  key3  mean
# a     1     2     3   2.0
# b     3     4     5   4.0
# c     5     6     7   6.0

# count(): number of non-NaN values per column (works for every dtype).
print(df)
print('-' * 6)
print(df.count())
# Expected output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
# ------
# key1    4
# key2    4
# key3    5
# dtype: int64

# Descriptive statistics. Frame-wide reductions are limited to the numeric
# columns; the cumulative functions run on the single column key2.
print(df)
print('-' * 6)
print('df的最小值', df.min(numeric_only=True))
print('df的最大值', df.max(numeric_only=True))
print('df的key2列的最大值', df['key2'].max())
# q selects the quantile position (0.75 -> upper quartile).
print('统计df的分位数,参数q确定位置', df.quantile(q=0.75, numeric_only=True))
print('对df求和', df.sum(numeric_only=True))
# The median is the 50% quantile.
print('求df的中位数,median(),50%分位数', df.median(numeric_only=True))
print('求df的标准差,std()', df.std(numeric_only=True))
print('求df的方差,var()', df.var(numeric_only=True))
print('求skew样本的偏度,skew()', df.skew(numeric_only=True))
print('求kurt样本的峰度,kurt()', df.kurt(numeric_only=True))
# Cumulative functions skip NaN: the NaN position stays NaN in the result
# and the accumulation continues with the next valid value.
print('df累计求和,cumsum()', df['key2'].cumsum())
print('df累计求积,cumprod()', df['key2'].cumprod())
print('求df的累计最大值,cummax()', df['key2'].cummax())
print('求df的累计最小值,cummin()', df['key2'].cummin())
# Expected output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
# ------
# df的最小值 key1    2.0
# key2    1.0
# dtype: float64
# df的最大值 key1    5.0
# key2    5.0
# dtype: float64
# df的key2列的最大值 5.0
# 统计df的分位数,参数q确定位置 key1    4.25
# key2    4.25
# Name: 0.75, dtype: float64
# 对df求和 key1    14.0
# key2    12.0
# dtype: float64
# 求df的中位数,median(),50%分位数 key1    3.5
# key2    3.0
# dtype: float64
# 求df的标准差,std() key1    1.290994
# key2    1.825742
# dtype: float64
# 求df的方差,var() key1    1.666667
# key2    3.333333
# dtype: float64
# 求skew样本的偏度,skew() key1    0.0
# key2    0.0
# dtype: float64
# 求kurt样本的峰度,kurt() key1   -1.2
# key2   -3.3
# dtype: float64
# df累计求和,cumsum() a     1.0
# b     3.0
# c     NaN
# d     7.0
# e    12.0
# Name: key2, dtype: float64
# df累计求积,cumprod() a     1.0
# b     2.0
# c     NaN
# d     8.0
# e    40.0
# Name: key2, dtype: float64
# 求df的累计最大值,cummax() a    1.0
# b    2.0
# c    NaN
# d    4.0
# e    5.0
# Name: key2, dtype: float64
# 求df的累计最小值,cummin() a    1.0
# b    1.0
# c    NaN
# d    1.0
# e    1.0
# Name: key2, dtype: float64

# unique(): distinct values in order of first appearance. The result is an
# array rather than a Series, so wrap it in a Series to sort it.
s = pd.Series(list('kjdhsakjdhjfh'))
sq = s.unique()
print(s)
print(sq)
print('sq的类型:', type(sq))
print('对sq进行重新排序:', pd.Series(sq).sort_values())
# Expected output:
# 0     k
# 1     j
# 2     d
# 3     h
# 4     s
# 5     a
# 6     k
# 7     j
# 8     d
# 9     h
# 10    j
# 11    f
# 12    h
# dtype: object
# ['k' 'j' 'd' 'h' 's' 'a' 'f']
# sq的类型: <class 'numpy.ndarray'>
# 对sq进行重新排序: 5    a
# 2    d
# 6    f
# 3    h
# 1    j
# 0    k
# 4    s
# dtype: object

# value_counts(): how often each distinct value occurs in a column (NaN is
# excluded by default). Shown here on a single Series; since pandas 1.1 a
# DataFrame also has value_counts(), which counts unique rows.
print(df['key2'].value_counts())

# isin(): element-wise membership test -- True where the cell's value is
# one of the listed values. The result has the same shape as df.
print(df)
df_isin = df.isin([1, 3])
print(df_isin)
# Expected output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
#     key1   key2   key3
# a  False   True   True
# b  False  False  False
# c   True  False   True
# d  False  False  False
# e  False  False  False
  • 相关阅读:
    从Kratos设计看Go微服务工程实践
    京东到家安全测试实践
    浅谈 Protobuf 编码 原创 gsonli 腾讯技术工程 2021-07-14
    API Design Guide
    The power of two choices in randomized load balancing
    NGINX and the "Power of Two Choices" Load-Balancing Algorithm
    SRE 崩溃
    DDoS木马
    String.fromCharCode(88,83,83) 方法返回由指定的 UTF-16 代码单元序列创建的字符串
    汇编语言的AX,BX,CX,DX,分别表示什么
  • 原文地址:https://www.cnblogs.com/xshan/p/10793011.html
Copyright © 2020-2023  润新知