一、
# coding:utf8 # !/usr/bin/python # import numpy as np import pandas as pd import np def example2(): ''' Describing a numeric ``Series``. :return: ''' s = pd.Series([1, 2, 3]) print s.describe() ''' count 3.0 mean 2.0 std 1.0 min 1.0 25% 1.5 50% 2.0 75% 2.5 max 3.0 dtype: float64 ''' def example3(): ''' Describing a categorical ``Series``. :return: ''' s = pd.Series(['a', 'a', 'b', 'c']) print s.describe() ''' count 4 unique 3 top a freq 2 dtype: object ''' def example4(): ''' Describing a timestamp ``Series``. :return: ''' s = pd.Series([ np.datetime64("2000-01-01"), np.datetime64("2010-01-01"), np.datetime64("2010-01-01") ]) print s.describe() ''' count 3 unique 2 top 2010-01-01 00:00:00 freq 2 first 2000-01-01 00:00:00 last 2010-01-01 00:00:00 dtype: object ''' def example5(): ''' Describing a ``DataFrame``. By default only numeric fields are returned. :return: ''' df = pd.DataFrame({'categorical': pd.Categorical(['d', 'e', 'f']), 'numeric': [1, 2, 3], 'object': ['a', 'b', 'c']}) print df.describe() ''' #Describing all columns of a ``DataFrame`` regardless of data type. print df.describe(include='all') #Describing a column from a ``DataFrame`` by accessing it as an attribute. print df.numeric.describe() #Including only numeric columns in a ``DataFrame`` description. print df.describe(include=[np.number]) #Including only string columns in a ``DataFrame`` description. print df.describe(include=[np.object]) #Including only categorical columns from a ``DataFrame`` description. print df.describe(include=['category']) #Excluding numeric columns from a ``DataFrame`` description. print df.describe(exclude=[np.number]) #Excluding object columns from a ``DataFrame`` description. 
print df.describe(exclude=[np.object]) ''' def example1(): dic1={'000':{'a':1,'b':2,'c':3},'001':{'d':4,'e':5,'f':6}} df2=pd.DataFrame(dic1) # print df2.describe() ''' 000 001 count 3.0 3.0 mean 2.0 5.0 std 1.0 1.0 min 1.0 4.0 25% 1.5 4.5 50% 2.0 5.0 75% 2.5 5.5 max 3.0 6.0 ''' print "返回非NAN数据项数量=>count() {count} ".format(count = df2.describe().count()) print "返回中位数,等价第50位百分位数的值=>median() {median} ".format(median = df2.describe().median()) print "返回数据的众值=>mode() {mode} ".format(mode = df2.describe().mode()) print "返回数据的标准差(描述离散度)=>std() {std} ".format(std = df2.describe().std()) print "返回方差=>var() {var} ".format(var = df2.describe().var()) print "偏态系数(skewness,表示数据分布的对称程度)=>skew() {skew} ".format(skew = df2.describe().skew()) def main(): example1() if __name__ == '__main__': main()
输出=>
返回非NAN数据项数量=>count() 000    8
001    8
dtype: int64
返回中位数,等价第50位百分位数的值=>median() 000    2.00
001    4.75
dtype: float64
返回数据的众值=>mode()    000  001
0  1.0  5.0
1  2.0  NaN
2  3.0  NaN
返回数据的标准差(描述离散度)=>std() 000    0.801784
001    1.603567
dtype: float64
返回方差=>var() 000    0.642857
001    2.571429
dtype: float64
偏态系数(skewness,表示数据分布的对称程度)=>skew() 000    0.000000
001   -1.299187
dtype: float64