• pandas-Notes2


    # coding=utf-8
    import pandas as pd
    import numpy as np
    import  matplotlib as plt
    
    # --- Demo frame ---------------------------------------------------------
    # Six consecutive daily timestamps starting 2017-06-01, used as the row index.
    dates = pd.date_range('20170601', periods=6)
    # 6x4 frame of standard-normal samples, columns A-D; values differ on every run.
    df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

    # print(x) with a single argument is valid on Python 2 and Python 3 alike,
    # whereas the bare `print x` statement is a SyntaxError on Python 3.
    print(df)

    # --- Basic statistics (missing data is excluded in general) -------------
    # mean() reduces down the rows by default: one mean per column.
    print(df.mean())
    # Example output (values are random):
    #   A   -0.640908
    #   B   -0.216183
    #   C    0.316962
    #   D   -0.634263
    #   dtype: float64

    # axis=1 reduces across the columns instead: one mean per row.
    print(df.mean(axis=1))

    # --- Shifting -----------------------------------------------------------
    # shift(2) moves every value two rows down: the first two slots become NaN
    # and the last two values (6 and 8) fall off the end.
    s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
    # print(s) would show:
    #   2017-06-01    NaN
    #   2017-06-02    NaN
    #   2017-06-03    1.0
    #   2017-06-04    3.0
    #   2017-06-05    5.0
    #   2017-06-06    NaN
    #   Freq: D, dtype: float64

    # --- DataFrame minus Series ---------------------------------------------
    # sub(..., axis='index') aligns s with the row index and subtracts it from
    # every column. It returns a new frame; df itself is NOT modified, which
    # is why df is printed first for comparison. Rows where s is NaN come out
    # as NaN.
    print(df)
    print(df.sub(s, axis='index'))

    # --- apply --------------------------------------------------------------
    # axis=1: running sum across the columns within each row
    # (the default, axis=0, would accumulate down each column).
    print(df.apply(np.cumsum, axis=1))
    # The lambda receives one column at a time and returns its range (max - min).
    print(df.apply(lambda col: col.max() - col.min()))
    
    # Ten random integers from 0..6 (the upper bound 7 is exclusive). With ten
    # draws over seven possible values, duplicates are guaranteed.
    # Note: this rebinds `s`.
    s = pd.Series(np.random.randint(0, 7, size=10))
    # value_counts() tallies how often each distinct value occurs, much like a
    # histogram. print(...) is used in its function form so the line is valid
    # on Python 3 as well as Python 2.
    print(s.value_counts())
    
    # string methods
    # s.str.lower() means to lowercase
    
    # print(...) in function form is valid on Python 2 and Python 3; the bare
    # print statement is a SyntaxError on Python 3.
    print(df)
    # Slicing a DataFrame with [] works on rows: here the first three.
    print(df[:3])

    # concat takes a list of frames and stacks them row-wise. The row at
    # position 3 is in neither piece, so the result has one row fewer than df.
    pieces = [df[:3], df[4:]]
    print(pd.concat(pieces))
    
    # join: two small frames that share a 'key' column.
    left = pd.DataFrame({'key': ['1', '2'], 'lvar': ['leftVar1', 'leftVar2']})
    right = pd.DataFrame({'key': ['1', '2'], 'rvar': ['rightVar1', 'rightVar2']})
    # print(...) in function form is valid on Python 2 and Python 3.
    print(left)
    print(right)
    # merge pairs up the rows whose 'key' values are equal.
    print(pd.merge(left, right, on='key'))
    # Output:
    #     key      lvar       rvar
    #   0   1  leftVar1  rightVar1
    #   1   2  leftVar2  rightVar2
    
    # Append a copy of the row at position 3 to the tail of df.
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. df.iloc[[3]] (list indexer) keeps the row as a one-row
    # DataFrame so it can be concatenated directly.
    # ignore_index=False keeps the original index label on the added row;
    # with True the result would be renumbered 0..n-1.
    print(pd.concat([df, df.iloc[[3]]], ignore_index=False))
    
    # group: two label columns (A, B) and one column of random floats (C).
    df1 = pd.DataFrame({'A': ['f', 'b', 'f', 'f', 'b'],
                        'B': ['1', '2', '2', '1', '2'],
                        'C': np.random.randn(5)})
    # Sum C within every distinct (A, B) pair; the pairs become the row index.
    # print(...) in function form is valid on Python 2 and Python 3.
    print(df1.groupby(['A', 'B']).sum())
    
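    # stack/unstack move labels between the column axis and the row index
    # (stack: columns -> innermost index level; unstack: the reverse)
    
    # pivot_table groups by the given index and columns and aggregates `values`
    # (mean by default; pass aggfunc to apply a different function)
    # pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])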
    
    # time series for time
    
    # categoricals
    
    # Declare the Series as categorical; its categories are the distinct
    # values A, B, C, E.
    s1 = pd.Series(['A', 'B', 'B', 'C', 'A', 'E']).astype("category")
    # Rename the categories positionally (A->good, B->bad, C->A, E->B). The new
    # list must have exactly as many entries as there are existing categories.
    # Assigning to `s1.cat.categories` was removed in pandas 2.0;
    # rename_categories is the supported way and returns a new Series.
    s1 = s1.cat.rename_categories(["good", "bad", 'A', 'B'])
    # print(...) in function form is valid on Python 2 and Python 3.
    print(s1)
    # Further things to try on a frame with a categorical column:
    # df.sort_values(by="categoryName")
    # df.groupby("categoryName").size()
    
    # --- plot ---------------------------------------------------------------
    # Random-walk demo data: 1000 standard-normal steps per column,
    # accumulated down the rows.
    df2 = pd.DataFrame(np.random.randn(1000, 4), columns=['A', 'B', 'C', 'D'])
    df2 = df2.cumsum()
    # Four lines, four colors, with a legend.
    df2.plot()
    # Uncomment to display the figure when running as a script. `plt` is bound
    # to the top-level matplotlib package by the file's imports, hence the
    # extra `.pyplot` hop.
    # plt.pyplot.show()

    # --- CSV round trip -----------------------------------------------------
    # to_csv writes the row index as the first column of the file.
    df2.to_csv("df2.csv")

    # NOTE: no index_col is given, so the saved index is read back as an
    # ordinary (unnamed) data column rather than as the index.
    df3 = pd.read_csv("df2.csv")
    # print(...) in function form is valid on Python 2 and Python 3.
    print(df3.head(3))

    # --- HDF5 round trip (left disabled) ------------------------------------
    # df2.to_hdf("df2.h5", 'df')
    # pd.read_hdf('df2.h5', 'df')

    # --- Excel round trip (requires the openpyxl module) --------------------
    df2.to_excel('df2.xlsx', sheet_name='sheet1')
    # The read-back frame used to be discarded; keep it and show the first
    # rows, mirroring the CSV section. sheet_name is passed by keyword so the
    # call does not depend on positional parameter order.
    df4 = pd.read_excel('df2.xlsx', sheet_name='sheet1', index_col=None, na_values=['NA'])
    print(df4.head(3))
    

      

  • 相关阅读:
    图片上传前预览、压缩、转blob、转formData等操作
    Vue背景图打包之后访问路径错误
    图片上传前预览的功能
    总结div里面水平垂直居中的实现方法
    IE浏览器报Promise未定义的错误、解决vuex requires a Promise polyfill in this browser问题
    普通项目转换成maven项目
    HTTP 错误 404.0
    电商项目系列文档(四):售后的设计(退换货)
    Sqlserver数据库还原.bak文件失败的两个问题
    数据库字段顺序的【坑】
  • 原文地址:https://www.cnblogs.com/pxy7896/p/6946569.html
Copyright © 2020-2023  润新知