• pandas 基础学习


    import numpy as np
    import pandas as pd
    
    s = pd.Series([1, 3, 6, np.nan, 10, 23])
    print(s)
    dates = pd.date_range('20180101', periods=4)
    print(dates)
    df = pd.DataFrame(np.random.randn(4, 5), index=dates, columns=['a', 'b', 'c', 'd', 'e'])
    print(df)
    df = pd.DataFrame(np.random.randn(4, 5))
    print(df)

    pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False) 是带轴标签的一维数组

    pandas.DataFrame([data, index, columns, dtype, copy]) 是具有标记轴(行和列)的二维、大小可变、可能异构的表格数据结构。其中 index 代表行标签,columns 代表列标签

    DataFrame:

    DataFrame可以由字典构建,也可以由pandas的其他对象(例如Series)构建

    values属性

    describe()方法

    transpose()方法和T属性是转置

    sort_index()方法,按索引标签排序。其中 axis=0 按行索引排序,axis=1 按列名排序;ascending 决定是升序(True,默认)还是降序(False)

    sort_values()方法,按值排序。其中 by 指定排序依据;axis=0(默认)时按指定列的值对行排序,axis=1 时按指定行的值对列排序;ascending 决定是升序(True,默认)还是降序(False)

     

    选择数据:

    import numpy as np
    import pandas as pd
    
    dates = pd.date_range('20180101', periods=6)
    df = pd.DataFrame(np.arange(24).reshape(6,4),index =dates, columns=['A', 'B', 'C', 'D'])
    print(df.A,df['A'])
    print(df[0:3])
    print(df['20180101':'20180103'])

    这四种选择方法都是可以的

    select by label 即 loc

    import numpy as np
    import pandas as pd
    
    dates = pd.date_range('20180101', periods=6)
    df = pd.DataFrame(np.arange(24).reshape(6,4),index =dates, columns=['A', 'B', 'C', 'D'])
    
    print(df.loc['20180103'])
    print(df.loc['20180101',['A','B']])
    print(df.loc['20180101':'20180102',['A','B']])

    select by position: iloc

    import numpy as np
    import pandas as pd
    
    dates = pd.date_range('20180101', periods=6)
    df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
    
    print(df.iloc[[1, 2], [1, 2]])
    print(df.iloc[1:3,4:5])

    mixed selection:ix

    已经被移除了(.ix 自 pandas 1.0 起不再提供,请改用 loc 或 iloc)

    Boolean indexing

    import numpy as np
    import pandas as pd
    
    dates = pd.date_range('20180101', periods=6)
    df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
    
    print(df[df.A>8])

    首先我们看df.A>8是什么:它是一个与df行数相同的布尔型Series,A列大于8的行对应True,其余为False;df[df.A>8] 只保留其中为True的行

    设置值:

    import numpy as np
    import pandas as pd
    
    dates = pd.date_range('20180101', periods=6)
    df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
    
    df.iloc[2,2] = None        #位置
    df.loc['20180101','B'] = -1  #标签
    df.B[df.A>4] = -2    #逻辑
    df['F'] = np.nan    
    df['G'] = pd.Series([1,2,3,4,5,6],index=df.index) #添加上一个Series

    丢弃缺失数据

    dropna()

    import numpy as np
    import pandas as pd
    
    dates = pd.date_range('20180101', periods=6)
    df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
    
    df.iloc[0, 1] = np.nan
    df.iloc[1, 2] = np.nan
    print(df)
    
    print(df.dropna(axis=0, how='any'))

    其中的how是方法,any代表只要有nan就丢弃,all代表行/列 全部都是nan才丢掉

    fillna()

    import numpy as np
    import pandas as pd
    
    dates = pd.date_range('20180101', periods=6)
    df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
    
    df.iloc[0, 1] = np.nan
    df.iloc[1, 2] = np.nan
    print(df)
    
    print(df.fillna(value=0))

    isnull() 判断每个元素是否为NaN,对DataFrame返回一个形状相同的布尔型DataFrame

    可以配合np.any使用,检查是否存在NaN

    import numpy as np
    import pandas as pd
    
    dates = pd.date_range('20180101', periods=6)
    df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
    
    df.iloc[0, 1] = np.nan
    df.iloc[1, 2] = np.nan
    print(df)
    
    print(np.any(df.isnull()) == True)

    打印出True

    储存和读取:

    import numpy as np
    import pandas as pd
    
    data = pd.read_csv('1.csv')
    data.to_pickle('1.pickel')

    合并concat:

    ignore_index

    import numpy as np
    import pandas as pd
    
    df1 = pd.DataFrame(np.ones([2,3])*0, columns=['a','b','c'])
    df2 = pd.DataFrame(np.ones([2,3])*1, columns=['a','b','c'])
    df3 = pd.DataFrame(np.ones([2,3])*2, columns=['a','b','c'])
    print(df1)
    print(df2)
    print(df3)
    res = pd.concat([df1,df2,df3])
    print(res)
    res = pd.concat([df1,df2,df3], ignore_index=True)
    print(res)

    第一次没有设置ignore_index,结果的索引是 0 1 0 1 0 1(保留了各自原来的索引);第二次设置了ignore_index=True,索引被重新编号为 0 到 5

    join

    参数join模式默认的是outer如下:

    import numpy as np
    import pandas as pd
    
    df1 = pd.DataFrame(np.ones([3,4])*0, columns=['a','b','c','d'])
    df2 = pd.DataFrame(np.ones([3,4])*1, columns=['b','c','d','e'])
    
    print(df1)
    print(df2)
    
    res = pd.concat([df1,df2])   #默认join='outer'
    print(res)

    (在部分旧版本pandas中)会抛出一个warning;df2没有'a'列、df1没有'e'列,合并后这些缺失的位置用NaN填充

    当把join参数设置为inner时

    import numpy as np
    import pandas as pd
    
    df1 = pd.DataFrame(np.ones([3,4])*0, columns=['a','b','c','d'])
    df2 = pd.DataFrame(np.ones([3,4])*1, columns=['b','c','d','e'])
    
    print(df1)
    print(df2)
    
    res = pd.concat([df1,df2],join='inner')
    print(res)

    这时候pandas在合并时会做裁剪:只保留两者共有的列(b、c、d),其余的列被丢弃

    join_axes 参数

    import numpy as np
    import pandas as pd
    
    df1 = pd.DataFrame(np.ones([3,4])*0, columns=['a','b','c','d'],index=[1,2,3])
    df2 = pd.DataFrame(np.ones([3,4])*1, columns=['b','c','d','e'],index=[2,3,4])
    
    print(df1)
    print(df2)
    
    res = pd.concat([df1,df2], axis=1)
    print(res)
    
    res = pd.concat([df1, df2],axis=1, join_axes=[df1.index])
    print(res)

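    join_axes 决定了合并时依照哪个数据的index或者columns。在没有设置这个参数的情况下,它把 1 2 3 4 都写进结果,没有的用NaN填充;当我们把join_axes设置成df1的index时,只保留了df1的 1 2 3。注意:join_axes 参数自 pandas 1.0 起已被移除,新版本中可以用 pd.concat([df1, df2], axis=1).reindex(df1.index) 达到相同效果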

    append() 按照行增加数据(注意:DataFrame.append 自 pandas 2.0 起已被移除,新版本请改用 pd.concat)

    import numpy as np
    import pandas as pd
    
    df1 = pd.DataFrame(np.ones([3,4])*0, columns=['a','b','c','d'])
    df2 = pd.DataFrame(np.ones([3,4])*1, columns=['a','b','c','d'])
    
    res = df1.append(df2,ignore_index=True)
    print(res)

    它也有ignore_index参数

    并入一个Series

    import numpy as np
    import pandas as pd
    
    df1 = pd.DataFrame(np.ones([3,4])*0, columns=['a','b','c','d'])
    df2 = pd.DataFrame(np.ones([3,4])*1, columns=['a','b','c','d'])
    
    
    s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
    res = df1.append(s1, ignore_index=True)
    print(res)

     merge()

     on参数:指明连接键

    import pandas as pd
    
    left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                                      'A': ['A0', 'A1', 'A2', 'A3'],
                                      'B': ['B0', 'B1', 'B2', 'B3']})
    right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                                        'C': ['C0', 'C1', 'C2', 'C3'],
                                        'D': ['D0', 'D1', 'D2', 'D3']})
    print(left)
    print(right)
    res = pd.merge(left, right, on='key')
    print(res)

    how参数:inner(缺省值)、outer、left 或 right

    import pandas as pd
    
    left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                                 'key2': ['K0', 'K1', 'K0', 'K1'],
                                 'A': ['A0', 'A1', 'A2', 'A3'],
                                 'B': ['B0', 'B1', 'B2', 'B3']})
    right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                                  'key2': ['K0', 'K0', 'K0', 'K0'],
                                  'C': ['C0', 'C1', 'C2', 'C3'],
                                  'D': ['D0', 'D1', 'D2', 'D3']})
    print(left)
    print(right)
    res = pd.merge(left, right, on=['key1', 'key2'], how='inner')  # default for how='inner'
    res = pd.merge(left,right)
    print(res)

    当我们设置成inner时,只保留key1和key2都相同的部分,舍去不相同的部分

    当改成outer时,两边所有的key1、key2组合都会保留,某一边没有的位置用NaN填充

    改成left 或者 right也是同理的,分别是以左边或者右边的键为基准

     indicator参数

    indicator用于显示每一行的来源(left_only、right_only 或 both),默认是False;传入True时会新增一个名为_merge的列,传入字符串时则用该字符串作为这一列的名称

    import pandas as pd
    
    df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
    df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
    print(df1)
    print(df2)
    res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
    print(res)
    
    # give the indicator a custom name 自定义indicator名称
    res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
    print(res)

    left_index和right_index参数

    之前的合并都是按照columns进行的;当把left_index和right_index参数设置为True时,merge()会按照index来合并

    import pandas as pd
    
    # merged by index
    left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                                      'B': ['B0', 'B1', 'B2']},
                                      index=['K0', 'K1', 'K2'])
    right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                                         'D': ['D0', 'D2', 'D3']},
                                          index=['K0', 'K2', 'K3'])
    print(left)
    print(right)
    # left_index and right_index
    res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
    print(res)
    res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
    print(res)

    suffixes:当两边存在重名的非连接键列(例如下面的age)时,为它们分别添加后缀以示区分

    import pandas as pd
    
    # merged by index
    # handle overlapping
    boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
    girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
    res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='outer')
    print(res)

    可视化plot

    import pandas as pd
    import numpy as np
    from matplotlib import pyplot as plt
    
    data = pd.DataFrame(np.random.randn(1000,4), columns=list('ABCD'))
    data = data.cumsum()
    data.plot()
    plt.show()

    plot.scatter()

    import pandas as pd
    import numpy as np
    from matplotlib import pyplot as plt
    
    data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
    data = data.cumsum()
    # plot methods:
    # 'bar', 'hist', 'box', 'kde', 'area', scatter', hexbin', 'pie'
    ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
    data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)
    
    plt.show()

    这边把第二次调用data.plot.scatter时的ax参数设置成了第一次调用返回的ax,让两组散点画在同一张图中

  • 相关阅读:
    音乐情感识别
    SoftmaxWithLoss函数和师兄给的loss有哪些区别呢
    内积层和全连接层是一样的
    caffe中的Local Response Normalization (LRN)有什么用,和激活函数区别
    caffe官网的部分翻译及NG的教程
    couldn't import dot_parser
    apt-get -f install
    Spring常用注解总结 hibernate注解
    Set Map List Iterator
    iframe 与frameset
  • 原文地址:https://www.cnblogs.com/francischeng/p/9740689.html
Copyright © 2020-2023  润新知