• pandas-09 DataFrame.groupby()的用法


    pandas-09 DataFrame.groupby()的用法

    pandas 中的 groupby 与 SQL 语句中的 GROUP BY 有异曲同工之妙。这并不奇怪:关系数据库中存放数据的结构本质上也是一张二维表,与 DataFrame 的形式相似。

    import numpy as np
    import pandas as pd
    from pandas import Series, DataFrame
    
    
    # Load the sample weather table. No date parsing is requested, so the
    # `date` column stays as day-first (dd/mm/yyyy) text.
    df = pd.read_csv('./city_weather.csv')
    print(df)
    '''
              date city  temperature  wind
    0   03/01/2016   BJ            8     5
    1   17/01/2016   BJ           12     2
    2   31/01/2016   BJ           19     2
    3   14/02/2016   BJ           -3     3
    4   28/02/2016   BJ           19     2
    5   13/03/2016   BJ            5     3
    6   27/03/2016   SH           -4     4
    7   10/04/2016   SH           19     3
    8   24/04/2016   SH           20     3
    9   08/05/2016   SH           17     3
    10  22/05/2016   SH            4     2
    11  05/06/2016   SH          -10     4
    12  19/06/2016   SH            0     5
    13  03/07/2016   SH           -9     5
    14  17/07/2016   GZ           10     2
    15  31/07/2016   GZ           -1     5
    16  14/08/2016   GZ            1     5
    17  28/08/2016   GZ           25     4
    18  11/09/2016   SZ           20     1
    19  25/09/2016   SZ          -10     4
    '''
    
    # Split the rows by city. Passing the column label is the idiomatic
    # equivalent of passing the column itself (df['city']).
    g = df.groupby('city')
    # g is a DataFrameGroupBy object; its repr looks like:
    # <pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7f10450e12e8>
    
    # Mapping of group label -> index labels of the rows in that group.
    # (The exact repr of the index objects varies with the pandas version.)
    print(g.groups)
    
    # {'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'),
    # 'GZ': Int64Index([14, 15, 16, 17], dtype='int64'),
    # 'SZ': Int64Index([18, 19], dtype='int64'),
    # 'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')}
    
    print(g.size())  # g.size() counts the number of rows in each group
    '''
    city
    BJ    6
    GZ    4
    SH    8
    SZ    2
    dtype: int64
    '''
    
    print(g.get_group('BJ'))  # fetch a single group as a DataFrame
    '''
             date city  temperature  wind
    0  03/01/2016   BJ            8     5
    1  17/01/2016   BJ           12     2
    2  31/01/2016   BJ           19     2
    3  14/02/2016   BJ           -3     3
    4  28/02/2016   BJ           19     2
    5  13/03/2016   BJ            5     3
    '''
    
    df_bj = g.get_group('BJ')
    # Average the numeric columns of this one group. numeric_only=True skips
    # the text columns (date, city); without it pandas >= 2.0 raises TypeError
    # instead of silently dropping them as older versions did.
    print(df_bj.mean(numeric_only=True))
    '''
    temperature    10.000000
    wind            2.833333
    dtype: float64
    '''
    
    # Aggregate directly on the GroupBy object: the mean is computed for
    # every group at once. numeric_only=True limits it to temperature/wind;
    # without it pandas >= 2.0 raises TypeError on the text `date` column.
    print(g.mean(numeric_only=True))
    '''
          temperature      wind
    city                       
    BJ         10.000  2.833333
    GZ          8.750  4.000000
    SH          4.625  3.625000
    SZ          5.000  2.500000
    '''
    
    # max()/min() are evaluated column by column inside each group, so the
    # values in one output row may come from different source rows.
    # NOTE: `date` is plain text here, so its max/min is lexicographic, not
    # chronological (BJ's max is 31/01/2016 although 13/03/2016 is later).
    print(g.max())
    '''
                date  temperature  wind
    city                               
    BJ    31/01/2016           19     5
    GZ    31/07/2016           25     5
    SH    27/03/2016           20     5
    SZ    25/09/2016           20     4
    '''
    
    print(g.min())
    '''
                date  temperature  wind
    city                               
    BJ    03/01/2016           -3     2
    GZ    14/08/2016           -1     2
    SH    03/07/2016          -10     2
    SZ    11/09/2016          -10     1
    '''
    
    # A GroupBy object is iterable: every step yields a
    # (group label, sub-DataFrame) pair.
    for name, group in g:
        print(name, group, sep='\n')
    
    
    
    
    # The (label, DataFrame) pairs can be materialized as a list or a dict.
    print([*g])  # each tuple holds the group label first, then its DataFrame
    '''
    [('BJ',          date city  temperature  wind
    0  03/01/2016   BJ            8     5
    1  17/01/2016   BJ           12     2
    2  31/01/2016   BJ           19     2
    3  14/02/2016   BJ           -3     3
    4  28/02/2016   BJ           19     2
    5  13/03/2016   BJ            5     3), 
    ('GZ',           date city  temperature  wind
    14  17/07/2016   GZ           10     2
    15  31/07/2016   GZ           -1     5
    16  14/08/2016   GZ            1     5
    17  28/08/2016   GZ           25     4), 
    ('SH',           date city  temperature  wind
    6   27/03/2016   SH           -4     4
    7   10/04/2016   SH           19     3
    8   24/04/2016   SH           20     3
    9   08/05/2016   SH           17     3
    10  22/05/2016   SH            4     2
    11  05/06/2016   SH          -10     4
    12  19/06/2016   SH            0     5
    13  03/07/2016   SH           -9     5), 
    ('SZ',           date city  temperature  wind
    18  11/09/2016   SZ           20     1
    19  25/09/2016   SZ          -10     4)]
    '''
    # Key/value view: group label -> DataFrame of that group.
    # NOTE(review): the sample output below shows the keys in a different
    # order than the list above; presumably it was captured on an interpreter
    # without ordered dicts. On Python 3.7+ the keys follow iteration order.
    print({label: frame for label, frame in g})
    '''
    {'SH':           date city  temperature  wind
    6   27/03/2016   SH           -4     4
    7   10/04/2016   SH           19     3
    8   24/04/2016   SH           20     3
    9   08/05/2016   SH           17     3
    10  22/05/2016   SH            4     2
    11  05/06/2016   SH          -10     4
    12  19/06/2016   SH            0     5
    13  03/07/2016   SH           -9     5, 
    'SZ':           date city  temperature  wind
    18  11/09/2016   SZ           20     1
    19  25/09/2016   SZ          -10     4, 
    'GZ':           date city  temperature  wind
    14  17/07/2016   GZ           10     2
    15  31/07/2016   GZ           -1     5
    16  14/08/2016   GZ            1     5
    17  28/08/2016   GZ           25     4, 
    'BJ':          date city  temperature  wind
    0  03/01/2016   BJ            8     5
    1  17/01/2016   BJ           12     2
    2  31/01/2016   BJ           19     2
    3  14/02/2016   BJ           -3     3
    4  28/02/2016   BJ           19     2
    5  13/03/2016   BJ            5     3}
    '''
    
  • 相关阅读:
    linux上传文件到oss的方法
    centos6.5重装python
    nfs共享文件夹
    mysql报错ERROR 2002 (HY000): Can't connect to local MySQL server through socket '/tmp/mysql.sock' (2)
    搭建网关服务器
    面试总结
    innerText兼容性问题
    Title Case
    Character frequency
    Least Common Multiple
  • 原文地址:https://www.cnblogs.com/wenqiangit/p/11252765.html
Copyright © 2020-2023  润新知