• python pandas 笔记3


    介绍pandas的有关DataFrame的高级操作。

    1. 数据合并

    import pandas as pd
    
    df = pd.DataFrame([{'Name': 'Chris', 'Item Purchased': 'Sponge', 'Cost': 22.50},
                       {'Name': 'Kevyn', 'Item Purchased': 'Kitty Litter', 'Cost': 2.50},
                       {'Name': 'Filip', 'Item Purchased': 'Spoon', 'Cost': 5.00}],
                      index=['Store 1', 'Store 1', 'Store 2'])
    df

    数据输出:

    1.1 加列

    df['Date'] = ['December 1', 'January 1', 'mid-May']
    df

    df['Delivered'] = True
    df['Feedback'] = ['Positive', None, 'Negative']

    重新设置某一列的值:

    adf = df.reset_index()
    adf['Date'] = pd.Series({0: 'December 1', 2: 'mid-May'})
    adf

    1.2 合并操作

    staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR'},
                             {'Name': 'Sally', 'Role': 'Course liasion'},
                             {'Name': 'James', 'Role': 'Grader'}])
    staff_df = staff_df.set_index('Name')
    student_df = pd.DataFrame([{'Name': 'James', 'School': 'Business'},
                               {'Name': 'Mike', 'School': 'Law'},
                               {'Name': 'Sally', 'School': 'Engineering'}])
    student_df = student_df.set_index('Name')
    print(staff_df.head())
    print()
    print(student_df.head())

    输出:

    1.3 合并所有的值,能够自动用NaN填充

    pd.merge(staff_df, student_df, how='outer', left_index=True, right_index=True)
    #合并所有的值

    1.4 只连接两者共有部分

    pd.merge(staff_df, student_df, how='inner', left_index=True, right_index=True)

    输出: 

    1.5 只考虑左边部分的完整性。

    pd.merge(staff_df, student_df, how='left', left_index=True, right_index=True)

    1.6 只考虑右部分的完整性。

    pd.merge(staff_df, student_df, how='right', left_index=True, right_index=True)

    1.7 或者按照列进行合并。

    staff_df = staff_df.reset_index()
    student_df = student_df.reset_index()
    pd.merge(staff_df, student_df, how='left', left_on='Name', right_on='Name')

    1.8 合并两个具有共同列名字的DataFrame,为了区分同名列,一般用_x表示合并操作中左边的DataFrama,_y表示右边的DataFrame。

    staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR', 'Location': 'State Street'},
                             {'Name': 'Sally', 'Role': 'Course liasion', 'Location': 'Washington Avenue'},
                             {'Name': 'James', 'Role': 'Grader', 'Location': 'Washington Avenue'}])
    student_df = pd.DataFrame([{'Name': 'James', 'School': 'Business', 'Location': '1024 Billiard Avenue'},
                               {'Name': 'Mike', 'School': 'Law', 'Location': 'Fraternity House #22'},
                               {'Name': 'Sally', 'School': 'Engineering', 'Location': '512 Wilson Crescent'}])
    pd.merge(staff_df, student_df, how='left', left_on='Name', right_on='Name')

    Example:

    staff_df = pd.DataFrame([{'First Name': 'Kelly', 'Last Name': 'Desjardins', 'Role': 'Director of HR'},
                             {'First Name': 'Sally', 'Last Name': 'Brooks', 'Role': 'Course liasion'},
                             {'First Name': 'James', 'Last Name': 'Wilde', 'Role': 'Grader'}])
    student_df = pd.DataFrame([{'First Name': 'James', 'Last Name': 'Hammond', 'School': 'Business'},
                               {'First Name': 'Mike', 'Last Name': 'Smith', 'School': 'Law'},
                               {'First Name': 'Sally', 'Last Name': 'Brooks', 'School': 'Engineering'}])
    print(staff_df)
    print(student_df)
    print(pd.merge(staff_df, student_df, how='inner', left_on=['First Name','Last Name'], right_on=['First Name','Last Name']))

    2. Pandas Idioms.

    import pandas as pd
    df = pd.read_csv('census.csv')
    df.head()

    风格1:

    ( df.where(df['Quantity'] != 0)
      .dropna()
      .rename(columns = {'Weight' : 'Weight(oz.)'}))
    # to modify the DataFrame df in one statement to drop any entries 
    # where 'Quantity' is 0 and rename the column 'Weight' to 'Weight
    (oz.)'?.

    或者用了drop()方法。效果同上。

    print(df.drop(df[df['Quantity'] == 0].index).rename(columns={'Weight': 'Weight (oz.)'}))
    # to modify the DataFrame df in one statement to drop any entries 
    # where 'Quantity' is 0 and rename the column 'Weight' to 'Weight (oz.)'?.
    (df.where(df['SUMLEV']==50)
        .dropna()
        .set_index(['STNAME','CTYNAME'])
        .rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'}))

    风格2:

    df = df[df['SUMLEV']==50]
    df.set_index(['STNAME','CTYNAME'], inplace=True)
    df.rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'})

    2.1 关于apply()方法

    import numpy as np
    def min_max(row):
        data = row[['POPESTIMATE2010',
                    'POPESTIMATE2011',
                    'POPESTIMATE2012',
                    'POPESTIMATE2013',
                    'POPESTIMATE2014',
                    'POPESTIMATE2015']]
        return pd.Series({'min': np.min(data), 'max': np.max(data)})
    df.apply(min_max, axis=1)
    # axis 将每一行都遍历,得到每一行的最大最小值。

    或者返回原来的列加min和max。

    import numpy as np
    def min_max(row): 
        data = row[['POPESTIMATE2010',
                    'POPESTIMATE2011',
                    'POPESTIMATE2012',
                    'POPESTIMATE2013',
                    'POPESTIMATE2014',
                    'POPESTIMATE2015']]
        row['max'] = np.max(data)
        row['min'] = np.min(data)
        return row
    df.apply(min_max, axis=1)

     或者使用lambda函数。

    rows = ['POPESTIMATE2010',
            'POPESTIMATE2011',
            'POPESTIMATE2012',
            'POPESTIMATE2013',
            'POPESTIMATE2014',
            'POPESTIMATE2015']
    df.apply(lambda x: np.max(x[rows]), axis=1)

    3. Group by 用法

    3.1 载入数据

    import pandas as pd
    import numpy as np
    df = pd.read_csv('census.csv')
    df = df[df['SUMLEV']==50]
    df.head()

    3.2 计算时长

    unique() 方法

    %%timeit -n 10
    for state in df['STNAME'].unique():
        avg = np.average(df.where(df['STNAME']==state).dropna()['CENSUS2010POP'])
        print('Counties in state ' + state + ' have an average population of ' + str(avg))
    # 10 loops, best of 3: 1.45 s per loop

    或者groupby()方法。

    %%timeit -n 10
    for group, frame in df.groupby('STNAME'):
        avg = np.average(frame['CENSUS2010POP'])
        print('Counties in state ' + group + ' have an average population of ' + str(avg))
    #根据state分组计算平均值
    #运行时长:10 loops, best of 3: 38.1 ms per loop 

    或者将索引设置为groupby()需要用的索引,然后根据利用函数进行分割。

    df = df.set_index('STNAME')
    
    def fun(item):
        if item[0]<'M':
            return 0
        if item[0]<'Q':
            return 1
        return 2
    
    for group, frame in df.groupby(fun):
        print('There are ' + str(len(frame)) + ' records in group ' + str(group) + ' for processing.')
    '''
    There are 1177 records in group 0 for processing.
    There are 1134 records in group 1 for processing.
    There are 831 records in group 2 for processing.
    '''

    lambda函数和apply()

    print(df.groupby('Category').apply(lambda df,a,b: sum(df[a] * df[b]), 'Weight (oz.)', 'Quantity'))
    # Or alternatively without using a lambda:
    # def totalweight(df, w, q):
    #        return sum(df[w] * df[q])
    #        
    # print(df.groupby('Category').apply(totalweight, 'Weight (oz.)', 'Quantity'))

    3.3 关于agg()方法。

    df = pd.read_csv('census.csv')
    df = df[df['SUMLEV']==50]
    #agg()方法的应用
    df.groupby('STNAME').agg({'CENSUS2010POP': np.average})

    print(type(df.groupby(level=0)['POPESTIMATE2010','POPESTIMATE2011']))
    print(type(df.groupby(level=0)['POPESTIMATE2010']))
    """
    <class 'pandas.core.groupby.DataFrameGroupBy'>
    <class 'pandas.core.groupby.SeriesGroupBy'>
    """

    groupby 以及 .agg()

    (df.set_index('STNAME').groupby(level=0)['CENSUS2010POP']
        .agg({'avg': np.average, 'sum': np.sum}))

    (df.set_index('STNAME').groupby(level=0)['POPESTIMATE2010','POPESTIMATE2011']
        .agg({'avg': np.average, 'sum': np.sum}))

    (df.set_index('STNAME').groupby(level=0)['POPESTIMATE2010','POPESTIMATE2011']
        .agg({'POPESTIMATE2010': np.average, 'POPESTIMATE2011': np.sum}))

    4. Scale

    4.1 对数据的排序

    s = pd.Series(['Low', 'Low', 'High', 'Medium', 'Low', 'High', 'Low'])
    
    s.astype('category', categories=['Low', 'Medium', 'High'], ordered=True)
    #output: 
    """
    0       Low
    1       Low
    2      High
    3    Medium
    4       Low
    5      High
    6       Low
    dtype: category
    Categories (3, object): [Low < Medium < High]
    """

    4.2 数据

    df = pd.DataFrame(['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D'],
                      index=['excellent', 'excellent', 'excellent', 'good', 'good', 'good', 'ok', 'ok', 'ok', 'poor', 'poor'])
    df.rename(columns={0: 'Grades'}, inplace=True)
    df

    df['Grades'].astype('category').head()
    #显示
    """
    excellent    A+
    excellent     A
    excellent    A-
    good         B+
    good          B
    Name: Grades, dtype: category
    Categories (11, object): [A, A+, A-, B, ..., C+, C-, D, D+]
    """

    排序:

    grades = df['Grades'].astype('category',
                                 categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'],
                                 ordered=True)
    grades.head()
    #output:
    """
    excellent    A+
    excellent     A
    excellent    A-
    good         B+
    good          B
    Name: Grades, dtype: category
    Categories (11, object): [D < D+ < C- < C ... B+ < A- < A < A+]
    """

    比较

    grades > 'C'
    """
    excellent     True
    excellent     True
    excellent     True
    good          True
    good          True
    good          True
    ok            True
    ok           False
    ok           False
    poor         False
    poor         False
    Name: Grades, dtype: bool
    """

     4.3 pd.cut()方法分成多个区间。

    df = pd.read_csv('census.csv')
    df = df[df['SUMLEV']==50]
    df = df.set_index('STNAME').groupby(level=0)['CENSUS2010POP'].agg({'avg': np.average})
    
    pd.cut(df['avg'],3).head()
    """
    STNAME
    Alabama       (11706.0871, 222327.236]
    Alaska        (11706.0871, 222327.236]
    Arizona       (222327.236, 432318.411]
    Arkansas      (11706.0871, 222327.236]
    California    (432318.411, 642309.586]
    Name: avg, dtype: category
    Categories (3, object): [(11706.0871, 222327.236] < (222327.236, 432318.411] < (432318.411, 642309.586]]
    """

    5. Pivot Tables.

    df = pd.read_csv('cars.csv')
    df

    df.pivot_table(values='(kW)', index='YEAR', columns='Make', aggfunc=np.mean)

    或者用多个aggfunc函数

    df.pivot_table(values='(kW)', index='YEAR', columns='Make', aggfunc=[np.mean,np.min], margins=True)

     6 .列相除

    >> data1 = {'a' : [1., 3., 5., 2.], 'b':[4,8,3,7], 'c' : [5,45,67,34]}
    >>> df = pd.DataFrame(data1)
    >>> df
    #output 输出
    """
       a  b   c
    0  1  4   5
    1  3  8  45
    2  5  3  67
    3  2  7  34
    """
    b = df['b']
    """
    >>> b
    0    4
    1    8
    2    3
    3    7
    """
    >>> a = df['a']
    """
    >>> a
    0    1
    1    3
    2    5
    3    2
    """
    #除法
    >>> a = a.div(b)
    """
    >>> a
    0    0.250000
    1    0.375000
    2    1.666667
    3    0.285714
    Name: a, dtype: float64
    """

     7. 关于DataFrame的applymap()的使用。

    import pandas as pd
    import numpy as np
    df = pd.DataFrame(np.random.randn(3,3))
    #output:
    """
    >>> df
              0         1         2
    0 -0.694165 -0.217500 -0.017622
    1 -1.255941  0.442811 -0.490014
    2 -1.608948  0.022557 -0.750053
    """
    dict = {0: 'A', 1 : 'B', 2 : 'C'}
    #加列, 利用map()函数
    df['cat'] = df.index.to_series().map(dict)
    #输出
    """
    >>> df
           0        1        2       cat
    0  -0.69  -0.22  -0.02   A
    1  -1.26   0.44  -0.49   B
    2  -1.61   0.02  -0.75   C 
    """
    The Safest Way to Get what you Want is to Try and Deserve What you Want.
  • 相关阅读:
    vhdl 语法总结 4
    模拟地数字地连接以及0欧电阻作用
    vhdl 语法总结5
    电容参数X5R,X7R,Y5V等代表什么意思呢?(转)
    传智播客视频资料
    out.println 和<%=%>两种不同输出的形式
    易忘记的java名词
    JavaScript和JSP运用上的区别
    多线程 Time和TimeTask组合
    马路上的井盖为什么是圆的?(微软面试题)
  • 原文地址:https://www.cnblogs.com/Shinered/p/9226646.html
Copyright © 2020-2023  润新知