• 使用 Python(pandas)进行数据分析:groupby 基础操作


    from pandas import Series,DataFrame
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np

    df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],'key2' : ['one', 'two', 'one', 'two', 'one'],
     'data1' : np.random.randn(5),
    'data2' : np.random.randn(5)})
    grouped=df['data1'].groupby(df['key1'])
    grouped.mean()
    means = df['data1'].groupby([df['key1'], df['key2']]).mean()
    means
    Out[10]:
    key1  key2
    a     one    -0.230076
          two     1.248653
    b     one    -0.196613
          two     0.689761
    Name: data1, dtype: float64
    means.unstack(level=-1)
    Out[12]:
    key2       one       two
    key1                    
    a    -0.230076  1.248653
    b    -0.196613  0.689761

    means.unstack(level=0)
    Out[13]:
    key1         a         b
    key2                    
    one  -0.230076 -0.196613
    two   1.248653  0.689761

    mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}

    people = DataFrame(np.random.randn(5, 5),columns=['a', 'b', 'c', 'd', 'e'],index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])

    mapping
    Out[25]: {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

    people
    Out[26]:
                   a         b         c         d         e
    Joe    -0.676828  0.880429  0.386540  2.961741  1.651430
    Steve   0.269119  0.916097 -0.238961  0.949983 -0.697962
    Wes     0.078493 -0.120419 -1.275525 -0.289106 -0.956001
    Jim    -1.225288  0.120219 -1.173347 -0.792199 -0.417982
    Travis  0.728367  0.906879  0.373102 -1.151590 -1.883083

    people.groupby(mapping,axis=1).sum()
    Out[29]:
                blue       red
    Joe     3.348280  1.855032
    Steve   0.711022  0.487254
    Wes    -1.564631 -0.997927
    Jim    -1.965546 -1.523050
    Travis -0.778488 -0.247836

    people.groupby(len).sum()
    Out[30]:
              a         b         c         d         e
    3 -1.823623  0.880229 -2.062331  1.880435  0.277447
    5  0.269119  0.916097 -0.238961  0.949983 -0.697962
    6  0.728367  0.906879  0.373102 -1.151590 -1.883083

    columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['cty','tenor'])

    columns
    Out[35]:
    MultiIndex(levels=[['JP', 'US'], [1, 3, 5]],
               labels=[[1, 1, 1, 0, 0], [0, 1, 2, 0, 1]],
               names=['cty', 'tenor'])

    hier_df = DataFrame(np.random.randn(4, 5), columns=columns)

    hier_df.groupby(level='cty',axis=1).count()
    Out[37]:
    cty  JP  US
    0     2   3
    1     2   3
    2     2   3
    3     2   3
    states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])

    years = np.array([2005, 2005, 2006, 2005, 2006])

    df
    Out[16]:
          data1     data2 key1 key2
    0 -0.169761 -0.297803    a  one
    1  1.248653  0.116745    a  two
    2 -0.196613 -0.224198    b  one
    3  0.689761  1.436277    b  two
    4 -0.290392  1.292000    a  one

    df['data1'].groupby([states,years]).mean()
    Out[17]:
    California  2005    1.248653
                2006   -0.196613
    Ohio        2005    0.260000
                2006   -0.290392
    Name: data1, dtype: float64

    # 只有数值列会被聚合:非数值列(如 key2)会被自动排除。注意:pandas 2.0 及以上版本需显式传入 numeric_only=True,否则会抛出 TypeError
    df.groupby(df['key1']).mean()
    Out[19]:
             data1     data2
    key1                    
    a     0.262833  0.370314
    b     0.246574  0.606039

    df.groupby(['key1','key2']).mean()
    Out[20]:
                  data1     data2
    key1 key2                    
    a    one  -0.230076  0.497098
         two   1.248653  0.116745
    b    one  -0.196613 -0.224198
         two   0.689761  1.436277

    for name,group in df.groupby(['key1']):
        print (name)
        print(group)
        
    a
          data1     data2 key1 key2
    0 -0.169761 -0.297803    a  one
    1  1.248653  0.116745    a  two
    4 -0.290392  1.292000    a  one
    b
          data1     data2 key1 key2
    2 -0.196613 -0.224198    b  one
    3  0.689761  1.436277    b  two

    for (k1,k2),group in df.groupby(['key1','key2']):
        print (k1,k2)
        print(group)
        
    a one
          data1     data2 key1 key2
    0 -0.169761 -0.297803    a  one
    4 -0.290392  1.292000    a  one
    a two
          data1     data2 key1 key2
    1  1.248653  0.116745    a  two
    b one
          data1     data2 key1 key2
    2 -0.196613 -0.224198    b  one
    b two
          data1     data2 key1 key2
    3  0.689761  1.436277    b  two

    df.dtypes
    Out[29]:
    data1    float64
    data2    float64
    key1      object
    key2      object
    dtype: object

    grouped=df.groupby(df.dtypes,axis=1)

    dict(list(grouped))
    Out[31]:
    {dtype('float64'):       data1     data2
     0 -0.169761 -0.297803
     1  1.248653  0.116745
     2 -0.196613 -0.224198
     3  0.689761  1.436277
     4 -0.290392  1.292000, dtype('O'):   key1 key2
     0    a  one
     1    a  two
     2    b  one
     3    b  two
     4    a  one}

    df.groupby(['key1', 'key2'])[['data2']].mean()
    Out[15]:
                  data2
    key1 key2          
    a    one   0.942926
         two  -0.671692
    b    one   0.366026
         two  -0.794155
         
    df.ix[2:3,['data1','data2']]=np.nan

    df
    Out[22]:
          data1     data2 key1 key2
    0 -1.090066  0.132120    a  one
    1 -0.683913 -0.671692    a  two
    2       NaN       NaN    b  one
    3       NaN       NaN    b  two
    4 -0.003452  1.753732    a  one

  • 相关阅读:
    2018_10_15 堆栈内存
    2018_10_14 java 环境 win10
    2018_09_21 ftp 安装
    2018_09_21 myeclipse git插件
    转 iOS开发中一些常见的并行处理
    转【iOS设计】一款APP从设计稿到切图过程概述
    算法题
    iOS -copy-mutableCopy- NSHashTable -NSMapTable-Keychain
    转--iOS下JS与原生OC互相调用
    ReactiveCocoa
  • 原文地址:https://www.cnblogs.com/qmxj-blog/p/9373711.html
Copyright © 2020-2023  润新知