• pandas之Series数据分析


    python爬虫之数据分析包pandas

    一.pandas介绍

    pandas 是基于numpy构建的含有更高级数据结构和工具的数据分析包
    类似于numpy的核心是ndarray,pandas也是围绕着Series和DataFrame这两个核心数据结构展开的
    

    pandas的引入方式

    1.安装方式

    pip3 install pandas
    

    2.引入方式

    import pandas as pd
    

    pandas的两大数据结构

    1.Series

    series是pandas的两种数据结构之一,可以理解为一维带标签数组
    

    数组中的数据可以为任意类型(整数,字符串,浮点型,python objects等)

    创建Series

    s = pd.Series(data, index=index)
    """
    data 可以是list,array(数组),dictionary(字典)
    NumPy 提供的 array() 函数可以直接将 Python 列表转换为 ndarray 数组,array() 接受一切序列类型的对象
    """
    
    price = pd.Series([456,716,125])
    price
    """
    0	456
    1	716
    2	125
    dtype:int64
    """
    
    price = pd.Series([456,716,125], name='p')
    
    price
    """
    0	456
    1	716
    2	125
    Name:p, dtype:int64
    """
    
    price.mean()
    """
    432.33333333333333
    """
    
    price.sum()
    """
    1297
    """
    
    price.head(2)
    """
    price
    0	456
    1	716
    Name:p, dtype:int64
    """
    
    price.tail(2)
    """
    1	716
    2	125
    Name:p, dtype:int64
    """
    
    dic = {'three':100,'one':15,"two":78}
    price = pd.Series(dic , name='p')
    price
    """
    three	100
    one	15
    two	78
    Name: p, dtype: int64
    """
    

    Series数据类型

    price = pd.Series([1,2,3,4])
    price.dtype
    """
    dtype('int64')
    """
    
    price = pd.Series([1,2,3,4.6])
    price.dtype
    """
    dtype('float64')
    """
    
    city = pd.Series(['wh','hz','sh','nj'])
    city.dtype
    """
    dtype('object')
    """
    
    temp = pd.Series([{},[],(1,2)])
    temp.dtype
    """
    dtype('object')
    """
    
    x = pd.Series(['2016-01-01','2017-01-01'])
    x.dtype
    """
    dtype('object')
    """
    
    x = pd.Series(['a','b','a','c','d'],dtype='category')
    x
    """
    0	a
    1	b
    2	a
    3	c
    4	d
    dtype:category
    Categories(4,object):[a,b,c,d]
    """
    

    boolean操作(布尔)

    mask = pd.Series([True,True,False,True])
    mask
    """
    0	True
    1	True
    2	False
    3	True
    dtype:bool
    """
    
    price[mask]
    """
    0	1.0
    1	2.0
    3	4.6
    dtype:float64
    """
    
    mask2 = pd.Series([True,False,True,True])
    mask|mask2
    """
    有一个True就是True
    0	True
    1	True
    2	True
    3	True
    """
    
    mask&mask2
    """
    都True才为True
    0	True
    1	False
    2	False
    3	True
    """
    
    ~mask
    """
    取反
    0	False
    1	False
    2	True
    3	False
    """
    

    index操作

    price
    """
    0	1
    1	2
    2	3
    3	4
    """
    
    price[2]
    """
    3
    """
    
    price = pd.Series([1,2,3,4],index=['aa','bb','cc','dd'])
    price
    """
    aa	1
    bb	2
    cc	3
    dd	4
    """
    
    price.index
    """
    Index(['aa','bb','cc','dd'],dtype='object')
    """
    

    日期相关

    dates = pd.date_range('2019-01-01','2019-06-01',freq='M')
    dates
    """
    'M':每月最后一个日历日
    'W': 周
    'D': 天
    'H': 时
    'T/min': 分
    'S': 秒
    DatetimeIndex(['2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30',
                   '2019-05-31'],
                  dtype='datetime64[ns]', freq='M')
    """
    
    temperature = pd.Series([10,11,20,27,29],index=dates)
    temperature
    """
    [五个值]
    2019-01-31    10
    2019-02-28    11
    2019-03-31    20
    2019-04-30    27
    2019-05-31    29
    Freq: M, dtype: int64
    """
    

    切片

    temp = pd.Series([12,14,15,18])
    temp[0]
    temp[2]
    """
    12
    15
    """
    
    temp = pd.Series([12,14,15,18],index=['a','b','c','d'])
    temp 
    temp['c']
    """
    a	12
    b	14
    c	15
    d	18
    dtype:int64
    15
    """
    
    temp.iloc[2]
    """
    15
    """
    

    修改/增加/删除Series中的值

    temp['a'] = 100
    temp.iloc[1] = 200
    temp
    """
    修改
    a	100
    b	200
    c	15
    d	18
    dtype:int64
    """
    
    temp
    

    统计函数summary,statistics

    temp.min()	# 最小
    temp.sum()	# 求和
    temp.median()	# 中位数
    
    # 注:以下 quantile/describe 的输出对应 temp = pd.Series([7,15]),而非上文修改后的 temp
    temp.quantile(0.1)
    temp.quantile(0.25)
    temp.quantile(0.5)
    """
    7.8
    9.0
    11.0
    """
    
    temp.describe()
    """
    count     2.000000
    mean     11.000000
    std       5.656854
    min       7.000000
    25%       9.000000
    50%      11.000000
    75%      13.000000
    max      15.000000
    dtype: float64
    """
    
    temp=pd.Series(['hw','apple','vivo','mi','hw','oppo','samsung','vivo'],dtype='category')
    temp.value_counts()
    """
    vivo	2
    hw		2
    samsung  1
    oppo	1
    mi		1
    apple	1
    dtype:int64
    """
    

    向量化操作与广播

    price = pd.Series([10,20,30,40], index=['o','t','t','t'])
    price*2
    """
    运算: + - * /
    o	20
    t	40
    t	60
    t	80
    """
    
    price+100
    """
    o	110
    t	120
    t	130
    t	140
    """
    
    s = pd.Series([10,20,30], index=[0,1,2])
    s1 = pd.Series([40,50,60,70], index=[1,2,3,4])
    s+s1
    """
    NaN 在pandas中表示不是一个数字
    0	NaN
    1	60
    2	80
    3	NaN
    4	NaN
    """
    

    迭代iteration

    for num in s:
        print(num)
    """
    10
    20
    30
    """
    
    20 in s
    """
    False
    """
    
    20 in s.values
    """
    True
    """
    
    2 in s
    """
    2 在index中
    True
    """
    
    for k,v in s.items():
        print(k,v)
    """
    0	10.0
    1	20.0
    2	30.0
    """
    

    参考链接--https://lupython.gitee.io/2017/04/07/pandas的介绍/

  • 相关阅读:
    Win7远程连接凭据不工作的诡异问题解决
    pip介绍与使用
    Java Web整合开发(35) -- JPA规范
    爬虫
    零基础自学用Python 3开发网络爬虫
    learn资料
    Linux定时任务Crontab命令详解 转
    VirtualBox Host-only Adapter,Failed to create the host-only adapter 转
    Nginx报 No input file specified. 的问题解决之路 转
    ci上传图片
  • 原文地址:https://www.cnblogs.com/bbiu/p/11550175.html
Copyright © 2020-2023  润新知