• Python数据处理进阶——pandas


    对于用python进行数据处理来说,pandas是一个不得不用的包,它比numpy更为强大。通过对《利用python进行数据分析》这本书中介绍pandas包的章节的学习,再加上自己的理解,写下这篇随笔,与同样喜欢数据分析的朋友分享并相互学习。

    import numpy as np
    import pandas as pd
    from pandas import Series, DataFrame
    # Function application and mapping.
    df = DataFrame(
        data=np.random.randn(4, 3),
        index=["Utah", "Ohio", "Texas", "Oregon"],
        columns=["b", "d", "e"],
    )
    # print(df)
    # NumPy ufuncs work element-wise on a DataFrame:
    # print(np.abs(df))

    # apply() passes each column (axis=0) or each row (axis=1) to the
    # function as a one-dimensional Series.
    def f(x):
        """Return the spread (max - min) of the Series *x*."""
        return x.max() - x.min()
    # Spread of every column:
    # print(df.apply(f, axis=0))
    # Spread of every row:
    # print(df.apply(f, axis=1))

    # The applied function may also return a Series holding several values.
    # NOTE: this definition replaces the spread version of f above.
    def f(x):
        """Return the minimum and maximum of *x* as a labelled Series."""
        extremes = [x.min(), x.max()]
        return Series(extremes, index=["min", "max"])
    # print(df.apply(f))

    # Element-wise formatting to two decimal places.
    # NOTE: this name shadows the built-in format() for the rest of the script.
    def format(x):
        """Render *x* as a string with two decimal places."""
        return "%.2f" % x
    # print(df.applymap(format))
    # print(df["e"].map(format))
    
    # Sorting and ranking.
    obj = Series(np.arange(4, dtype=float), index=list("badc"))
    # print(obj.sort_index())
    frame = DataFrame(
        np.arange(8).reshape(2, 4),
        columns=list("dabc"),
        index=["three", "one"],
    )
    # Sort by the column labels (axis=1):
    # print(frame.sort_index(axis=1))
    # Sort by the row labels (axis=0):
    # print(frame.sort_index(axis=0))
    # Sort the rows by the values found in column "b":
    # print(frame.sort_values('b', axis=0, ascending=False))
    # Sort the columns by the values found in row "one":
    # print(frame.sort_values("one", axis=1, ascending=False))
    # Sort the rows by several columns at once:
    # print(frame.sort_values(by=["a", "b"]))

    # Ranking (tied values share the mean of their ranks by default).
    obj1 = Series([7, -5, 7, 4, 2, 0, 4])
    # print(obj1.rank())
    
    # Arithmetic methods: add (+), sub (-), div (/), mul (*).
    df1 = DataFrame(np.arange(12).reshape(3, 4), columns=["a", "b", "c", "d"])
    df2 = DataFrame(np.arange(20).reshape(4, 5), columns=["a", "b", "c", "d", "e"])

    # Labels present in only one operand produce NaN:
    # print(df1 + df2)
    # Treat a value missing from one operand as 0 before adding:
    # print(df1.add(df2, fill_value=0))
    # A fill value can also be given when reindexing:
    # print(df1.reindex(columns=df2.columns, fill_value=0))
    
    # Build a DataFrame from a dict of equal-length lists.
    data = {
        "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9],
    }
    frame = DataFrame(data)
    # print(frame)
    # Column labels:
    # print(frame.columns)
    # Row labels:
    # print(frame.index)
    # A column can be fetched as a Series with dict-style or attribute access:
    # print(frame["state"])
    # print(frame.year)
    # A row is fetched through an indexer (label 3 of the default index here):
    # print(frame.loc[3])
    # Assigning a scalar fills the whole column with that value.
    frame["debt"] = 16.5
    # print(frame)
    # Assigning a Series aligns it on the row labels; rows without a
    # matching label end up as NaN.
    frame.index = Series(['one', 'two', 'three', 'four', 'five'])
    val = Series([-1.2, -1.5, -1.7], index=["two", "four", "five"])
    frame["debt"] = val
    # print(frame)
    # Assigning to a column that does not exist creates it (a boolean column
    # here); the del keyword removes a column.
    frame["eastern"] = frame["state"] == "Ohio"
    # print(frame)
    del frame["eastern"]       # deletion needs the item syntax
    # print(frame)
    # Nested dict of dicts.
    pop = {
        "Nevada": {2001: 2.4, 2002: 2.9},
        "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
    }
    # Passed to DataFrame, the outer keys become the columns and the inner
    # keys become the row index.
    frame2 = DataFrame(pop)
    # print(frame2)
    # Transpose the result:
    # print(frame2.T)
    # The inner keys are combined to form the index, unless an explicit
    # index is supplied as below.
    frame3 = DataFrame(pop, index=[2001, 2002, 2003])
    # print(frame3)
    frame3.index.name = "year"
    frame3.columns.name = "state"
    # print(frame3)
    # print frame3
    
    
    # Reindexing.
    obj = Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
    # reindex() rearranges the data to follow the new index; labels that were
    # not present before ("e") get NaN.
    obj2 = obj.reindex(["a", "b", "c", "d", "e"])
    # print(obj2)
    # Use 0 instead of NaN for the labels that were missing.
    obj2 = obj.reindex(["a", "b", "c", "d", "e"], fill_value=0)
    # print(obj2)

    # Filling gaps while reindexing -- Series.
    obj3 = Series(["blue", "purple", "yellow"], index=[0, 2, 4])
    # range() is used instead of the Python 2-only xrange() so the snippet
    # runs on both Python 2 and Python 3.
    # Forward fill: "ffill" or "pad".
    a = obj3.reindex(range(6), method="ffill")
    # print(a)
    # Backward fill: "bfill" or "backfill".
    b = obj3.reindex(range(6), method="bfill")
    # print(b)
    
    # Filling gaps while reindexing -- DataFrame.
    import numpy as np
    f = DataFrame(np.arange(9).reshape((3, 3)), index=["a", "c", "d"],
                  columns=["Ohio", "Texas", "California"])
    # Reindex the rows; the new row "b" is filled with 9.
    f2 = f.reindex(["a", "b", "c", "d"], fill_value=9)
    # print(f2)
    # Reindex the columns; the new column "Utah" is all NaN.
    col = ["Texas", "Utah", "California"]
    f3 = f.reindex(columns=col)
    # print(f3)
    # Reindex rows and columns together.  The fill method is applied to the
    # rows only, in a separate step: passing method="ffill" together with
    # columns= makes pandas apply the method to the column index as well, and
    # that index is not monotonic, which raises
    # "ValueError: index must be monotonic increasing or decreasing".
    f4 = f.reindex(["a", "b", "c", "d"], method="ffill").reindex(
        columns=["Texas", "Utah", "California"])
    # print(f4)
    
    # Dropping entries from an axis -- Series.
    mys = Series(np.arange(5, dtype=float), index=list("abcde"))
    # print(mys)
    # drop() returns a new object without the given label(s) and their values.
    mys_new = mys.drop(labels="c")
    # print(mys_new)
    mys_new1 = mys.drop(labels=["c", "e"])
    # print(mys_new1)

    # Dropping entries from an axis -- DataFrame.
    data = DataFrame(
        np.arange(16).reshape(4, 4),
        columns=["one", "two", "three", "four"],
        index=["Ohio", "Colorado", "Utah", "New York"],
    )
    # Drop rows by label (axis 0 is the row axis).
    data1 = data.drop(["Ohio", "Utah"], axis="index")
    # print(data1)
    # Drop columns by label (axis 1 is the column axis).
    data2 = data.drop(["one", "three"], axis="columns")
    # print(data2)
    
    # Axis indexes with duplicate labels.
    obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
    # is_unique reports whether the index labels (not the values) are unique.
    print(obj.index.is_unique)
    # Selecting a duplicated label returns a Series rather than a scalar:
    # obj['a']
    df = DataFrame(np.random.randn(4, 3), index=['a', 'b', 'a', 'b'])
    # .loc replaces the removed .ix indexer; both the row label "b" and the
    # column label 1 are looked up by label, giving one value per "b" row.
    print(df.loc["b", 1])
    print(df[1])

    pandas中的索引高级处理:

    from pandas import Series, DataFrame
    import pandas as pd
    import numpy as np
    # Indexing, selection and filtering -- Series.
    obj = Series(np.arange(4), index=list("abcd"))
    # By label:
    # print(obj["b"])
    # By position:
    # print(obj.iloc[1])
    # print(obj[2:4])
    # Several labels / several positions:
    # print(obj[["b", "a", "d"]])
    # print(obj.iloc[[1, 3]])
    # Boolean mask:
    # print(obj[obj < 2])
    # Unlike ordinary Python slices, a slice by label includes its end point.
    # print(obj["b":"c"])
    obj.loc["b":"c"] = 5
    # print(obj)
    
    # Indexing, selection and filtering -- DataFrame.
    # The .ix indexer used by older tutorials was removed in pandas 1.0; the
    # examples below use .loc (labels) and .iloc (positions) instead.
    data = DataFrame(np.arange(16).reshape((4, 4)),
                     index=["Ohio", "Colorado", "Utah", "New York"],
                     columns=["one", "two", "three", "four"])
    # Select a single column:
    # print(data["two"])
    # Select several columns:
    # print(data[["two", "one"]])
    # Rows are selected with a slice or with a boolean array.
    a = data[:2]
    b = data[data["three"] > 5]
    # data[data < 5] = 0
    # print(data)
    # Select by row and column label with loc[row, column].
    c = data.loc["Ohio", "two"]
    # print(c, data)
    # print(data.loc["Ohio", ["two", "three"]])
    # Columns can be picked by position by indexing the column labels:
    # print(data.loc[["Ohio", "Colorado"], data.columns[[3, 0, 1]]])
    # Rows and columns can both be picked by position with iloc:
    # print(data.iloc[[0, 1], [3, 0, 1]])
    d = data.loc[:"Utah", "two"]
    # Rows whose "three" value exceeds 5, restricted to the first three columns.
    e = data.loc[data.three > 5, data.columns[:3]]
    # print(e)
    
    # A Series prints with its index on the left and its values on the right.
    # When no index is given, a default integer index 0..N-1 is created.
    # The values and index attributes expose the underlying array and index.
    obj = Series([2, 3, -6, 7])
    # print(obj)
    # print(obj.values)
    # print(obj.index)
    obj2 = Series([2, 3, -6, 7], index=["d", "b", "a", "c"])
    # print(obj2.index)
    # print(obj2["a"])
    obj2["d"] = 6
    # print(obj2[["c", "a", "d"]])
    # print(obj2[obj2 > 0])
    # print(obj2 * 2)
    # print(np.exp(obj2))
    sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
    # A Series can be built directly from a dict; when only the dict is
    # passed, its keys become the index.
    obj3 = Series(sdata)
    # With an explicit index, only the listed labels are kept.
    states = ["California", "Ohio", "Oregon", "Texas"]
    obj4 = Series(sdata, index=states)
    # "California" has no entry in sdata, so its value is NaN (missing).
    # Detecting missing values.  NOTE: the results of these three calls are
    # discarded when run as a script; wrap them in print() to see them.
    pd.isnull(obj4)
    pd.notnull(obj4)
    obj4.isnull()
    # Arithmetic between Series aligns the operands on their index labels.
    # print(obj3 + obj4)

    # Both the Series and its index have a name attribute.
    obj4.name = "population"
    obj4.index.name = "state"
    # print(obj4)
    # The index of a Series can be replaced in place by assignment.
    obj.index = ["Bob", "Steve", "Jeff", "Ryan"]
    # print() with parentheses runs on both Python 2 and Python 3.
    print(obj)

    用pandas包进行简单的统计学计算:

    import numpy as np
    import pandas as pd
    from pandas import Series, DataFrame
    # Small frame with missing values for the summary-statistics examples.
    df = DataFrame(
        [
            [1.4, np.nan],
            [7.1, -4.5],
            [np.nan, np.nan],
            [0.75, -1.3],
        ],
        columns=["one", "two"],
        index=['a', 'b', 'c', 'd'],
    )
    # Column sums:
    # print(df.sum())
    # axis=1 sums across each row instead:
    # print(df.sum(axis=1))
    # NA values are skipped automatically; skipna=False turns that off, so
    # any row containing a NaN yields NaN.
    d = df.mean(axis=1, skipna=False)

    def f(x):
        """Render *x* as a string with two decimal places."""
        return "%.2f" % x
    # print(d.apply(f))

    # Statistics.
    # Indirect statistics (label of the maximum in each column):
    # print(df.idxmax())
    # Cumulative statistics (running sum down each column):
    # print(df.cumsum())
    # Several summary statistics in one call:
    # print(df.describe())
    # print(df.min(axis=1))
    
    # Correlation and covariance.

    obj = DataFrame(
        data=np.random.randn(5, 4),
        columns=["AAPL", "GOOG", "IBM", "MSFT"],
        index=[
            "2009-12-24",
            "2009-12-28",
            "2009-12-29",
            "2009-12-30",
            "2009-12-31",
        ],
    )
    obj.index.name = "Data"
    # print(obj)
    # index labels the rows, columns labels the columns.
    # Series.corr computes the correlation of the overlapping, non-NA,
    # index-aligned values of two Series; Series.cov computes the covariance.
    # print(obj.MSFT.corr(obj.IBM))
    # print(obj.MSFT.cov(obj.IBM))
    # DataFrame.corr and DataFrame.cov return the full matrices.
    # Correlation matrix:
    # print(obj.corr())
    # Covariance matrix:
    # print(obj.cov())
    # corrwith correlates every column (or every row, with axis=1) with
    # another Series or DataFrame.
    # print(obj.corrwith(obj.IBM))
    
    
    # Unique values.
    obj1 = Series(["c", 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
    # unique() returns the distinct values in order of first appearance.
    uniques = obj1.unique()
    # Sorted view of the unique values.  (Printing the result of
    # uniques.sort() would show None, because that call sorts in place.)
    # print(sorted(uniques))
    # Count how often each value occurs.
    # print() with parentheses runs on both Python 2 and Python 3.
    print(obj1.value_counts())
  • 相关阅读:
    edgeR
    R中的运算符,条件语句,控制语句
    library-type:fr-unstanded vs fisrt-stand vs second-stanrd
    R的几个基础函数
    HTseq-count
    HISAT2的运用
    shell的符号总结
    python-tuple
    python -List
    win10 ubuntu18.0 LTS双系统安装
  • 原文地址:https://www.cnblogs.com/llhy1178/p/6762459.html
Copyright © 2020-2023  润新知