• Python——pandas的使用


    Pandas的常用数据类型

    • Series一维,带标签的数组
    • DataFrame二维,Series容器

    一、Series

    Series对象本质上是由两个数组组成,一个数组构成对象的键(index),一个数组构成对象的值(values)

    import string

    import numpy as np
    import pandas as pd

    # Walk-through of pandas.Series: construction, selection, and the
    # index/values pair that backs every Series.

    # Build a Series from an ndarray. `index` supplies the labels; when it is
    # omitted the labels default to 0, 1, 2, ...
    t1 = pd.Series(np.arange(5), index=list("abcde"))

    print(t1)
    # a    0
    # b    1
    # c    2
    # d    3
    # e    4
    # dtype: int64
    print(type(t1))  # <class 'pandas.core.series.Series'>

    # Build a Series from a dict: the keys become the labels.
    a = {letter: i for i, letter in enumerate(string.ascii_uppercase[:5])}

    print(pd.Series(a))
    # A    0
    # B    1
    # C    2
    # D    3
    # E    4
    # dtype: int64

    # Passing `index` together with a dict picks those labels out of the dict.
    # Labels the dict does not contain become NaN, which turns the dtype into
    # float64.
    print(pd.Series(a, index=list("CDEFG")))
    # C    2.0
    # D    3.0
    # E    4.0
    # F    NaN
    # G    NaN
    # dtype: float64

    # --- Selecting data ---

    # Positional slice (start:stop:step). `.iloc` states the positional intent
    # explicitly instead of relying on `[]` guessing between labels and positions.
    print(t1.iloc[0:4:2])
    # a    0
    # c    2
    # dtype: int64

    # Positional selection with a list of integers. Plain `t1[[2, 3, 4]]` on a
    # string-labelled Series is deprecated in recent pandas, so use `.iloc`.
    print(t1.iloc[[2, 3, 4]])
    # c    2
    # d    3
    # e    4
    # dtype: int64

    # Boolean mask: keep the elements whose value is greater than 2.
    print(t1[t1 > 2])
    # d    3
    # e    4
    # dtype: int64

    # Single label lookup.
    print(t1["b"])  # 1

    # Selecting a list of labels where some are missing: `t1[["a", "e", "f"]]`
    # raises KeyError on pandas >= 1.0. `reindex` is the supported way to get
    # NaN for the missing label.
    print(t1.reindex(["a", "e", "f"]))
    # a    0.0
    # e    4.0
    # f    NaN
    # dtype: float64

    # --- Index and values ---

    print(t1.index)  # Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
    print(type(t1.index))  # <class 'pandas.core.indexes.base.Index'>

    print(t1.values)  # [0 1 2 3 4]
    print(type(t1.values))  # <class 'numpy.ndarray'>
    

    二、DataFrame

    创建DataFrame

    # Creating DataFrame objects.
    #
    # A DataFrame carries two indexes:
    #   - the row index    ("index"),   axis 0, labels the rows
    #   - the column index ("columns"), axis 1, labels the columns

    # From a 2-D ndarray; both indexes default to 0, 1, 2, ...
    t1 = pd.DataFrame(data=np.arange(12).reshape((3, 4)))

    print(t1)
    #    0  1   2   3
    # 0  0  1   2   3
    # 1  4  5   6   7
    # 2  8  9  10  11

    # Same data with explicit row and column labels.
    t2 = pd.DataFrame(
        data=np.arange(12).reshape((3, 4)),
        index=["a", "b", "c"],
        columns=["E", "F", "G", "H"],
    )

    print(t2)
    #    E  F   G   H
    # a  0  1   2   3
    # b  4  5   6   7
    # c  8  9  10  11

    # From a list of dicts (one dict per row). Keys missing from a record show
    # up as NaN, which is why the numeric columns end up as floats.
    temp_dict = [
        dict(name="zhangsan", age=15, tel=10086),
        dict(name="lisi", age=15),
        dict(name="wangwu", tel=10086),
    ]

    t3 = pd.DataFrame(data=temp_dict)
    print(t3)
    # Current pandas keeps the keys' insertion order for the columns
    # (older releases printed them sorted: age, name, tel).
    #        name   age      tel
    # 0  zhangsan  15.0  10086.0
    # 1      lisi  15.0      NaN
    # 2    wangwu   NaN  10086.0
    

    获取DataFrame的基本信息

    # Inspecting a DataFrame (t1 and t2 are the frames created earlier).

    # (number of rows, number of columns)
    print(t1.shape)

    # dtype of each column
    print(t1.dtypes)

    # number of dimensions
    print(t1.ndim)  # 2

    # row index
    print(t1.index)  # RangeIndex(start=0, stop=3, step=1)

    # column index
    print(t2.columns)  # Index(['E', 'F', 'G', 'H'], dtype='object')

    # the underlying values as an ndarray
    print(t1.values)
    # [[ 0  1  2  3]
    #  [ 4  5  6  7]
    #  [ 8  9 10 11]]

    # first rows (5 when no count is given)
    print(t1.head(n=2))

    # last rows
    print(t1.tail(n=2))

    # Overview: row count, column count, column labels, non-null count per
    # column, dtypes and memory usage. info() writes the report itself and
    # returns None, so wrapping it in print() emits a trailing "None" line.
    # The report states: class DataFrame, RangeIndex of 3 entries (0 to 2),
    # 4 columns with 3 non-null int64 values each; the exact layout and the
    # memory figure depend on the pandas version.
    print(t1.info())

    # Quick summary statistics, computed per column: count, mean, standard
    # deviation, min, quartiles and max.
    print(t2.describe())
    #          E    F     G     H
    # count  3.0  3.0   3.0   3.0
    # mean   4.0  5.0   6.0   7.0
    # std    4.0  4.0   4.0   4.0
    # min    0.0  1.0   2.0   3.0
    # 25%    2.0  3.0   4.0   5.0
    # 50%    4.0  5.0   6.0   7.0
    # 75%    6.0  7.0   8.0   9.0
    # max    8.0  9.0  10.0  11.0
    

    加载csv数据

    import numpy as np
    import pandas as pd

    # Read the CSV file into a DataFrame.
    t = pd.read_csv("./dogNames2.csv")

    # Order the rows by the count column, largest first (ascending=False means
    # descending), then keep the first ten rows.
    ranked = t.sort_values(by="Count_AnimalName", ascending=False)
    t2 = ranked.head(10)

    print(t2)
    

    获取行列数据

    import string

    import numpy as np
    import pandas as pd

    # loc and iloc:
    #   df.loc  selects by label
    #   df.iloc selects by integer position

    t1 = pd.DataFrame(
        data=np.arange(12).reshape((3, 4)),
        index=["a", "b", "c"],
        columns=["E", "F", "G", "H"],
    )

    # One row label and one column label -> a single cell.
    cell = t1.loc["a", "E"]
    print(cell)  # 0

    # One row and a list of columns.
    row_a_ef = t1.loc["a", ["E", "F"]]
    print(row_a_ef)
    # E    0
    # F    1
    # Name: a, dtype: int64

    # One row and a contiguous range of columns (label slice).
    row_a_e_to_g = t1.loc["a", "E":"G"]
    print(row_a_e_to_g)
    # E    0
    # F    1
    # G    2
    # Name: a, dtype: int64

    # A contiguous range of rows and one column.
    # Note: a label slice in loc INCLUDES its end label.
    col_g_a_to_c = t1.loc["a":"c", "G"]
    print(col_g_a_to_c)
    # a     2
    # b     6
    # c    10
    # Name: G, dtype: int64

    # iloc is used the same way, but takes integer positions instead of labels.
    

    布尔索引

    import numpy as np
    import pandas as pd

    # Boolean indexing in pandas.
    t1 = pd.read_csv("./dogNames2.csv")

    # Dogs whose name has been used more than 800 times.
    popular = t1["Count_AnimalName"] > 800
    print(t1[popular])
    #       Row_Labels  Count_AnimalName
    # 1156       BELLA              1195
    # 2660     CHARLIE               856
    # 3251        COCO               852
    # 9140         MAX              1153
    # 12368      ROCKY               823

    # Length of every name, computed through the .str accessor.
    name_len = t1["Row_Labels"].str.len()

    # Dogs whose name is longer than 4 characters.
    print(t1[name_len > 4].head(3))
    #   Row_Labels  Count_AnimalName
    # 2      40804                 1
    # 3      90201                 1
    # 4      90203                 1

    # Several conditions: wrap each one in parentheses and join them with
    # & (and) or | (or).
    longer_than_4 = name_len > 4
    shorter_than_6 = name_len < 6
    print(t1[longer_than_4 & shorter_than_6].head(3))
    

    字符串方法

    处理缺失数据

    import numpy as np
    import pandas as pd

    # Handling missing data.
    #
    # Records with gaps: a key that is absent from a record becomes NaN in the
    # frame. The first record uses age 0 as a placeholder value; it is turned
    # into NaN further down.
    temp_dict = [
        {"name": "zhangsan", "age": 0, "tel": 10086},
        {"name": "lisi", "age": 15},
        {"name": "wangwu", "tel": 10010},
    ]

    t1 = pd.DataFrame(temp_dict)
    print(t1)
    #        name   age      tel
    # 0  zhangsan   0.0  10086.0
    # 1      lisi  15.0      NaN
    # 2    wangwu   NaN  10010.0

    # Option 1: drop the rows (axis=0) or columns (axis=1) that contain NaN.
    # how="any" drops when at least one value is NaN; inplace=False returns a
    # new frame and leaves t1 untouched.
    t2 = t1.dropna(axis=0, how="any", inplace=False)
    print(t2)
    #        name  age      tel
    # 0  zhangsan  0.0  10086.0

    # Option 2: fill the gaps, e.g. t.fillna(t.mean()), t.fillna(t.median())
    # or t.fillna(0). numeric_only=True is required here: the frame has a
    # string column, and pandas >= 2.0 raises TypeError when asked for its mean.
    t3 = t1.fillna(t1.mean(numeric_only=True))
    print(t3)
    #        name   age      tel
    # 0  zhangsan   0.0  10086.0
    # 1      lisi  15.0  10048.0
    # 2    wangwu   7.5  10010.0

    # Placeholder zeros: turn them into NaN first, then apply one of the
    # options above.
    t1[t1 == 0] = np.nan

    print(t1)
    #        name   age      tel
    # 0  zhangsan   NaN  10086.0
    # 1      lisi  15.0      NaN
    # 2    wangwu   NaN  10010.0

    # Test for NaN; the result is a boolean frame of the same shape.
    print(pd.isnull(t1))
    #     name    age    tel
    # 0  False   True  False
    # 1  False  False   True
    # 2  False   True  False

    print(pd.notnull(t1))
    #    name    age    tel
    # 0  True  False   True
    # 1  True   True  False
    # 2  True  False   True
    
  • 相关阅读:
    _ 下划线 Underscores __init__
    Page not found (404) 不被Django的exception中间件捕捉 中间件
    从装修儿童房时的门锁说起
    欧拉定理 费马小定理的推广
    线性运算 非线性运算
    Optimistic concurrency control 死锁 悲观锁 乐观锁 自旋锁
    Avoiding Full Table Scans
    批量的单向的ssh 认证
    批量的单向的ssh 认证
    Corrupted MAC on input at /usr/local/perl/lib/site_perl/5.22.1/x86_64-linux/Net/SSH/Perl/Packet.pm l
  • 原文地址:https://www.cnblogs.com/x54256/p/9841590.html
Copyright © 2020-2023  润新知