• pandas基础(3)_数据处理


    1:删除重复数据

          使用duplicated()函数检测重复的行,返回元素为bool类型的Series对象,每个元素对应一行;如果该行不是第一次出现(即与前面某一行完全相同),则对应元素为True

    >>> df =DataFrame(np.random.randint(0,150,size=(6,3)),columns=['Chinese','maths','Chinese'],index=['zhangsan','lisi','wangwu','lisi','xiaowu','zhangsan'])

    >>> df

              Chinese  maths  Chinese

    zhangsan       17     58       70

    lisi           88     20      137

    wangwu        130     29       57

    lisi           71     20       65

    xiaowu        133     60        6

    zhangsan       96     48       60

    >>> df.duplicated()

    zhangsan    False

    lisi        False

    wangwu      False

    lisi        False

    xiaowu      False

    zhangsan    False

    dtype: bool

    >>> df =DataFrame(np.random.randint(0,2,size=(6,2)),columns=['Chinese','maths'],index=['zhangsan','lisi','wangwu','lisi','xiaowu','zhangsan'])

    >>> df

              Chinese  maths

    zhangsan        1      1

    lisi            1      0

    wangwu          0      0

    lisi            1      0

    xiaowu          1      1

    zhangsan        0      0

    >>> df.duplicated ()

    zhangsan    False

    lisi        False

    wangwu      False

    lisi         True

    xiaowu       True

    zhangsan     True

    dtype: bool

    >>> #如果某一行的数据与前面已出现过的某一行完全相同,则该行返回True

    >>> #调用drop_duplicates()可以删除重复的数据

    >>> df.drop_duplicates ()

              Chinese  maths

    zhangsan        1      1

    lisi            1      0

    wangwu          0      0

    >>> #删除的是行

    >>> #rename()函数替换索引

    >>> #map():新建一列

    >>> #replace()替换元素

    2:异常值检测和过滤

    >>> #使用describe()函数查看每一列的描述统计量

    >>> df =DataFrame(np.random.randint(0,150,size=(6,2)),columns=['Chinese','maths'],index=[list('ABCDEF')])

    >>> df

       Chinese  maths

    A      119     25

    B       28     33

    C       10    134

    D       44    121

    E       44    119

    F       91     46

    >>> df.describe ()

              Chinese       maths

    count    6.000000    6.000000

    mean    56.000000   79.666667    #mean是平均值

    std     40.943864   50.014665

    min     10.000000   25.000000

    25%     32.000000   36.250000

    50%     44.000000   82.500000

    75%     79.250000  120.500000

    max    119.000000  134.000000

    >>> #std是标准差

    >>> df.std ()

    Chinese    40.943864

    maths      50.014665

    dtype: float64

    >>> df.std(axis=1)

    A    66.468037

    B     3.535534

    C    87.681241

    D    54.447222

    E    53.033009

    F    31.819805

    dtype: float64

    >>> #每个人的标准差

    >>> np.abs(df)>df.std()*2

       Chinese  maths

    A     True  False

    B    False  False

    C    False   True

    D    False   True

    E    False   True

    F     True  False

    >>> #当某个元素的绝对值大于所在列标准差的2倍时,认为该元素是异常值,返回True,随后可以据此把这些行筛选出来

    >>> df.any(axis=1)

    A    True

    B    True

    C    True

    D    True

    E    True

    F    True

    dtype: bool

    >>> df2=np.abs(df)>df.std()*2

    >>> df3=df2.any(axis=1)

    >>> df[df3]

       Chinese  maths

    A      119     25

    C       10    134

    D       44    121

    E       44    119

    F       91     46

    >>> df2=np.abs(df)>df.std()*2

    >>> df2

       Chinese  maths

    A     True  False

    B    False  False

    C    False   True

    D    False   True

    E    False   True

    F     True  False

    >>> df2.any()

    Chinese    True

    maths      True

    dtype: bool

    >>> df2.all()

    Chinese    False

    maths      False

    dtype: bool

    >>> df3=df2.any(axis=1)

    >>> df3

    A     True

    B    False

    C     True

    D     True

    E     True

    F     True

    dtype: bool

    >>> df[df3]

       Chinese  maths

    A      119     25

    C       10    134

    D       44    121

    E       44    119

    F       91     46

    3:随机排序

    >>> x=np.random.permutation (6)

    >>> x

    array([4, 5, 1, 0, 3, 2])

    >>> df.take(x)

       Chinese  maths

    E       44    119

    F       91     46

    B       28     33

    A      119     25

    D       44    121

    C       10    134

    >>> #使用take()函数排序,可以借助np.random.permutation()函数生成随机顺序,从而实现随机排序,也可以用来随机抽样

    4:数据聚合

    >>> #通常是每一个数组生成一个具体的值

    >>> #1分组 2用函数处理  3合并

    >>> #核心函数groupby()

    >>> df = DataFrame({'item':['apple','banana','orange','banana','orange','apple'],'price':[4,3,3,2.5,4,2],'color':['red','yellow','yellow','green','green','green']})

    >>> df

        color    item  price

    0     red   apple    4.0

    1  yellow  banana    3.0

    2  yellow  orange    3.0

    3   green  banana    2.5

    4   green  orange    4.0

    5   green   apple    2.0

    >>> df.groupby('item')

    <pandas.core.groupby.DataFrameGroupBy object at 0x000000000E8EE240>

    >>> g=df.groupby('item')

    >>> g

    <pandas.core.groupby.DataFrameGroupBy object at 0x000000000E76A828>

    >>> g.groups

    {'orange': Int64Index([2, 4], dtype='int64'), 'apple': Int64Index([0, 5], dtype='int64'), 'banana': Int64Index([1, 3], dtype='int64')}

    >>> #分组

    >>> g['price'].mean ()

    item

    apple     3.00

    banana    2.75

    orange    3.50

    Name: price, dtype: float64

    >>> m=g['price'].mean ()

    >>> type(m)

    <class 'pandas.core.series.Series'>

    >>> df_mean=DataFrame(m)

    >>> df_mean

            price

    item         

    apple    3.00

    banana   2.75

    orange   3.50

    >>> pd.merge(df,df_mean,left_on='item',right_index=True)

        color    item  price_x  price_y

    0     red   apple      4.0     3.00

    5   green   apple      2.0     3.00

    1  yellow  banana      3.0     2.75

    3   green  banana      2.5     2.75

    2  yellow  orange      3.0     3.50

    4   green  orange      4.0     3.50

    >>> #以多个属性进行分组

    >>> df.groupby(['color','item']).sum()

                   price

    color  item         

    green  apple     2.0

           banana    2.5

           orange    4.0

    red    apple     4.0

    yellow banana    3.0

           orange    3.0

    >>> #最终变成了多重索引结构

  • 相关阅读:
    html5文件api
    折腾一个自己的UrlRewrite
    hdu 4218 ( IMBA? )
    hdu 4217 Data Structure
    九度OJ 1008
    倒酒趣题详解
    第三届蓝桥杯复赛原题
    第三届蓝桥杯复赛题解析
    hdu 4223 Dynamic Programming
    hdu 4224 Enumeration
  • 原文地址:https://www.cnblogs.com/henuliulei/p/9368350.html
Copyright © 2020-2023  润新知