• pandas之分组


    import numpy as np
    import pandas as pd
    
    df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                       'key2': ['one', 'two', 'one', 'two', 'one'],
                       'data1': np.random.randn(5),
                       'data2': np.random.randn(5)})
    print(df)
    
    # data1列数据,按照key1来进行分组
    g1 = df['data1'].groupby(by=df['key1'])
    print(g1)
    print(g1.mean())
    
    # 一次传入多个数组,按照key1,key2分组计算
    g2 = df['data1'].groupby(by=[df['key1'], df['key2']])
    print(g2.sum())
    
    # 可以将列名(可以是字符串、数字或者其他python对象)用作分组
    g3 = df.groupby(by='key1')
    print(g3.mean())
    
    g4 = df.groupby(by=['key1', 'key2'])
    print(g4.mean())
    
    输出结果:
      key1 key2     data1     data2
    0    a  one  0.374982  1.064534
    1    a  two -0.485860  0.071513
    2    b  one -0.741052  1.868335
    3    b  two -0.757161 -0.536510
    4    a  one -0.092603 -1.647309
    <pandas.core.groupby.generic.SeriesGroupBy object at 0x000001720F810BA8>
    key1
    a   -0.067827
    b   -0.749106
    Name: data1, dtype: float64
    key1  key2
    a     one     0.282379
          two    -0.485860
    b     one    -0.741052
          two    -0.757161
    Name: data1, dtype: float64
             data1     data2
    key1                    
    a    -0.067827 -0.170421
    b    -0.749106  0.665912
                  data1     data2
    key1 key2                    
    a    one   0.141189 -0.291387
         two  -0.485860  0.071513
    b    one  -0.741052  1.868335
         two  -0.757161 -0.536510
    import numpy as np
    import pandas as pd
    
    df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                       'key2': ['one', 'two', 'one', 'two', 'one'],
                       'data1': np.random.randn(5),
                       'data2': np.random.randn(5)})
    print(df)
    
    states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
    years = np.array([2005, 2005, 2006, 2005, 2006])
    # 按照states, years传入数组的方式对data1列数据分组后进行平均值计算
    g1 = df['data1'].groupby(by=[states, years])
    print(g1.mean())
    
    people = pd.DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'],
                          index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
    print(people)
    mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red'}
    g2 = people.groupby(mapping, axis=1).sum()
    print(g2)
    
    
    输出结果:
      key1 key2     data1     data2
    0    a  one -0.388889  0.083999
    1    a  two  0.109513  2.016369
    2    b  one -0.423841  0.800895
    3    b  two -0.899408  0.039836
    4    a  one -0.137741 -1.382602
    California  2005    0.109513
                2006   -0.423841
    Ohio        2005   -0.644148
                2006   -0.137741
    Name: data1, dtype: float64
                   a         b         c         d         e
    Joe     1.179284  0.289269  0.303566  1.493424 -1.432677
    Steve  -0.386514 -0.154331 -0.771970 -0.597251  0.004637
    Wes    -0.933249  0.300309 -1.659948  0.404844 -0.375680
    Jim    -2.427930  0.574752  0.504091 -0.996975 -1.547615
    Travis -1.555477 -1.065392 -1.149300  1.326839 -1.530914
                blue       red
    Joe     1.796990  0.035876
    Steve  -1.369220 -0.536208
    Wes    -1.255104 -1.008621
    Jim    -0.492884 -3.400793
    Travis  0.177539 -4.151784
    import numpy as np
    import pandas as pd
    
    df = pd.DataFrame({
        'A': [1, 1, 2, 2],
        'B': [1, 2, 3, 4],
        'C': np.random.randn(4)})
    
    print(df)
    # 按照A列分组后聚合求最小值
    df1 = df.groupby('A').agg('min')
    print(df1)
    
    # 按照A列分组后聚合求最小值和最大值
    df2 = df.groupby('A').agg(['min', 'max'])
    print(df2)
    
    # 按照A列分组后聚合后对B列求最小值和最大值,同时对C列求和
    df3 = df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
    print(df3)
    
    输出结果:
       A  B         C
    0  1  1  1.630997
    1  1  2 -0.238686
    2  2  3  0.841689
    3  2  4 -0.532344
       B         C
    A             
    1  1 -0.238686
    2  3 -0.532344
        B             C          
      min max       min       max
    A                            
    1   1   2 -0.238686  1.630997
    2   3   4 -0.532344  0.841689
        B             C
      min max       sum
    A                  
    1   1   2  1.392311
    2   3   4  0.309345
    import numpy as np
    import pandas as pd
    
    df = pd.DataFrame(np.random.randn(4, 5), columns=list('ABCDE'))
    print(df)
    
    # 计算各列数据总和并作为新列添加到末尾
    df['col_sum'] = df.apply(lambda x: x.sum(), axis=1)
    print(df)
    # 计算各行数据总和并作为新行添加到末尾
    df.loc['row_sum'] = df.apply(lambda x: x.sum(), axis=0)
    print(df)
    
    
    
    输出结果:
              A         B         C         D         E
    0 -0.978040 -0.820454  0.673568  1.613058 -0.203536
    1  0.879338  0.561962  0.278855 -0.481307  1.097682
    2 -1.244722  1.304386  0.206882 -0.715907 -1.246526
    3  1.419993  0.453622  0.237463  1.292601 -0.184714
              A         B         C         D         E   col_sum
    0 -0.978040 -0.820454  0.673568  1.613058 -0.203536  0.284595
    1  0.879338  0.561962  0.278855 -0.481307  1.097682  2.336530
    2 -1.244722  1.304386  0.206882 -0.715907 -1.246526 -1.695887
    3  1.419993  0.453622  0.237463  1.292601 -0.184714  3.218965
                    A         B         C         D         E   col_sum
    0       -0.978040 -0.820454  0.673568  1.613058 -0.203536  0.284595
    1        0.879338  0.561962  0.278855 -0.481307  1.097682  2.336530
    2       -1.244722  1.304386  0.206882 -0.715907 -1.246526 -1.695887
    3        1.419993  0.453622  0.237463  1.292601 -0.184714  3.218965
    row_sum  0.076569  1.499516  1.396768  1.708445 -0.537094  4.144204
  • 相关阅读:
    Django:重定向
    Django:使用get_template、render实现读取数据并传递数据
    Django:读入html中的数据
    Django:urls配置路由地址
    Django:基础知识-创建模块
    HTMLTestRunner美化
    python2 selenium
    Jmeter检查点
    Jmeter集合点
    jmeter的三种参数化方式
  • 原文地址:https://www.cnblogs.com/yuxiangyang/p/11266457.html
Copyright © 2020-2023  润新知