简介
- Pandas 是 Python 的外部模块,它非常像 Excel,提供了分析数据的功能。它提供了两个数据类型 Series 和 DataFrame。
- 什么是 Series?
- Series 是 Pandas 提供的一种数据类型,你可以把它想象成 Excel 的一行或一列。(一维,带标签数组)
- Series对象本质上由两个数组组成(index索引,value值)
- 什么是 DataFrame?
- DataFrame 是 Pandas 提供的一种数据类型,你可以把它想象成 Excel 的表格。(二维,Series容器)
- 什么是 Series?
创建Series
import pandas as pd

# Build a Series from a plain list; with no index given, pandas labels the
# entries 0..n-1.
values = [11, 22, 33, 44, 55]
p1 = pd.Series(values)
print(p1)
print(type(p1))
import pandas as pd

# `index` supplies explicit labels in place of the default integer positions.
labels = list("abcde")
p1 = pd.Series([11, 12, 13, 14, 15], index=labels)
print(p1)
import pandas as pd

# A dict converts to a Series whose labels are the dict keys.
p1 = dict(name="gemoumou", age="18", tel="10086")
p2 = pd.Series(p1)
print(p2)
import pandas as pd

p1 = pd.Series([11, 22, 33, 44, 55])
print(p1)
print(type(p1))

# astype returns a converted copy; p1 keeps its integer dtype.
p2 = p1.astype("float64")
print(p2)
Series的切片和索引
import pandas as pd

p1 = {"name": "gemoumou", "age": "18", "tel": "10086"}
p2 = pd.Series(p1)
print(p2)
# name    gemoumou
# age           18
# tel        10086
# dtype: object

# Label-based access: a single key returns the stored value.
print(p2["name"])
print(p2["age"])
# gemoumou
# 18

# Position-based access goes through .iloc. Writing p2[1] on a Series with
# string labels relies on an "integer means position" fallback that pandas
# has deprecated, so the explicit accessor is used instead.
second_value = p2.iloc[1]
third_value = p2.iloc[2]
print(second_value)
print(third_value)
# 18
# 10086

# A list of positions selects several entries at once.
first_two = p2.iloc[[0, 1]]
print(first_two)
# name    gemoumou
# age           18

# A list of labels does the same by name.
print(p2[["name", "tel"]])
# name    gemoumou
# tel        10086
import pandas as pd

p1 = pd.Series([11, 22, 33, 44, 55, 66, 77, 88, 99, 100])
print(p1)

# Boolean indexing: the comparison yields a True/False mask, and indexing
# with that mask keeps only the entries greater than 50.
above_50 = p1 > 50
print(p1[above_50])
索引
import pandas as pd

p1 = {"name": "gemoumou", "age": "18", "tel": "10086"}
p2 = pd.Series(p1)
print(p2)
print(p2.index)  # Index(['name', 'age', 'tel'], dtype='object')

# An Index is iterable, so its labels can be looped over directly.
for label in p2.index:
    print(label)
# name
# age
# tel

print(type(p2.index))  # <class 'pandas.core.indexes.base.Index'>
print(list(p2.index))  # ['name', 'age', 'tel']
值
import pandas as pd

p1 = {"name": "gemoumou", "age": "18", "tel": "10086"}
p2 = pd.Series(p1)
print(p2)

# .values exposes the stored data without the index labels.
raw_values = p2.values
print(raw_values)  # ['gemoumou' '18' '10086']
print(type(raw_values))  # <class 'numpy.ndarray'>
import pandas as pd

p = pd.Series(range(5))
print(p)

positive = p > 0
# where: keep entries where the condition holds, NaN elsewhere.
print(p.where(positive))
# mask: the inverse - blank out entries where the condition holds.
print(p.mask(positive))
# A second argument replaces the non-matching entries instead of NaN.
print(p.where(p > 1, 10))
pandas 读取外部数据
import pandas as pd

# Load the CSV file into a DataFrame and show it.
csv_path = "数据.csv"
df = pd.read_csv(csv_path)
print(df)
pandas之DataFrame
- DataFrame对象既有行索引也有列索引
- 行索引,表明不同行,横向索引,叫index 0轴,axis=0
- 列索引,表明不同列,纵向索引,叫columns 1轴,axis=1
import pandas as pd
import numpy as np

# The numbers 0..11 arranged as 3 rows x 4 columns; with no labels given,
# both axes get default integer labels.
grid = np.arange(12).reshape(3, 4)
p1 = pd.DataFrame(grid)
print(p1)
import pandas as pd
import numpy as np

# Same 3x4 grid, now with explicit row labels (index) and column labels.
row_labels = list("abc")
column_labels = list("wxyz")
p1 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=row_labels,
    columns=column_labels,
)
print(p1)
import pandas as pd
import numpy as np

# Dict of lists: each key becomes a column, each list its values.
p1 = {
    "name": ["zhangsan", "lisi"],
    "age": [18, 20],
    "tel": [10086, 10010],
}
p2 = pd.DataFrame(p1)
print(p2)
print(type(p2))
import pandas as pd
import numpy as np

# A sequence of dicts: each dict is one row, its keys name the columns.
# (Comma-separated dicts without brackets form a tuple; the parentheses just
# make that explicit.)
p1 = (
    {"name": "zhangsan", "age": 18, "tel": 10086},
    {"name": "lisi", "age": 20, "tel": 10010},
    {"name": "wangmazi", "age": 22, "tel": 100000},
)
p2 = pd.DataFrame(p1)
print(p2)
print(type(p2))
import pandas as pd
import numpy as np

# Rows that lack a key get NaN in that column: only the first record
# carries "age" here.
p1 = [
    {"name": "zhangsan", "age": 18, "tel": 10086},
    {"name": "lisi", "tel": 10010},
    {"name": "wangmazi", "tel": 100000},
]
p2 = pd.DataFrame(p1)
print(p2)
print(type(p2))
DataFrame的基础属性
import pandas as pd
import numpy as np

p1 = (
    {"name": "zhangsan", "age": 18, "tel": 10086},
    {"name": "lisi", "age": 20, "tel": 10010},
    {"name": "wangmazi", "age": 22, "tel": 100000},
)
p2 = pd.DataFrame(p1)
print(p2)

# Basic attributes, printed in turn:
#   index   - row labels          columns - column labels
#   values  - data as an array    shape   - (rows, columns)
#   dtypes  - dtype per column    ndim    - number of dimensions
for attr in ("index", "columns", "values", "shape", "dtypes", "ndim"):
    print(getattr(p2, attr))
import pandas as pd
import numpy as np

p1 = [
    {"name": "zhangsan", "age": 18, "tel": 10086},
    {"name": "lisi", "age": 20, "tel": 10010},
    {"name": "wangmazi", "age": 22, "tel": 100000},
    {"name": "xiaoming", "age": 22, "tel": 100000},
    {"name": "xiaohong", "age": 22, "tel": 100000},
]
p2 = pd.DataFrame(p1)
print(p2)

print("-"*20+"显示前几行"+"-"*20)
print(p2.head(2))  # first two rows
print("-"*20+"显示后几行"+"-"*20)
print(p2.tail(2))  # last two rows
print("-"*20+"显示p2的概览"+"-"*20)
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would append a stray "None" line.
p2.info()
print("-"*20+"快速对数字类型(int,float)进行统计"+"-"*20)
print(p2.describe())  # summary statistics for the numeric columns
import pandas as pd

p1 = pd.read_csv("test01.csv")
print(p1.head())
# info() prints its own report and returns None; calling it bare avoids the
# extra "None" line that print(p1.info()) would emit.
p1.info()
import pandas as pd

p1 = pd.read_csv("test01.csv")

# sort_values orders the rows by a column; ascending=False puts the largest
# NUM first (ascending=True would put the smallest first).
sort_column = "NUM"
p1 = p1.sort_values(by=sort_column, ascending=False)
print(p1)
切片索引
import pandas as pd

p1 = pd.read_csv("数据.csv")  # load the CSV file
# Sort by NUM, largest first (ascending=False means descending order).
p1 = p1.sort_values(by="NUM", ascending=False)

# Plain [] indexing on a DataFrame:
#   - a slice selects rows by position
#   - a string selects a column by name
print("-"*20+"取前五行"+"-"*20)
print(p1[:5])
print("-"*20+"取后五行"+"-"*20)
# The banner announces the last five rows, so slice from the end; p1[5:]
# would instead return every row after the first five.
print(p1[-5:])
print("-"*20+"取NAME列的数据"+"-"*20)
print(p1["NAME"])
print("-"*20+"取前五行 NUM列的数据"+"-"*20)
# Row slice first, then column selection.
print(p1[:5]["NUM"])
p1.loc 通过标签索引来获取数据
import pandas as pd
import numpy as np

p1 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("WXYZ"))
print(p1)

# .loc selects by label: row label(s) first, column label(s) second.
rule = "-" * 20

print(rule + "坐标:a行Z列" + rule)
print(p1.loc["a", "Z"])  # single cell

print(rule + "取a行所有" + rule)
print(p1.loc["a", :])  # one whole row

print(rule + "取Z列所有" + rule)
print(p1.loc[:, "Z"])  # one whole column

print(rule + "取指定行" + rule)
print(p1.loc[["a", "c"], :])  # a list of row labels

print(rule + "取指定列" + rule)
print(p1.loc[:, ["W", "Z"]])  # a list of column labels

print(rule + "取连续的多行多列" + rule)
print(p1.loc[["a", "b"], ["W", "Z"]])  # rows and columns together

print(rule + "冒号" + rule)
# Label slices include BOTH endpoints, unlike positional slices.
print(p1.loc["a":"c"])
p1.iloc 通过位置来获取数据
import pandas as pd
import numpy as np

p1 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("WXYZ"))
print(p1)

# .iloc selects by integer position: row position(s) first, then column.
print("-"*20+"通过位置来获取行数据"+"-"*20)
print(p1.iloc[1])
print("-"*20+"通过位置来获取列数据"+"-"*20)
print(p1.iloc[:, 1])
print("-"*20+"通过位置来获取指定数据"+"-"*20)
print(p1.iloc[1, 1])
print("-"*20+"通过位置来获取多行多列"+"-"*20)
print(p1.iloc[1:, 1:])
print("-"*20+"通过位置来获取指定的多行多列"+"-"*20)
# Positions may repeat: row 1 is returned twice here.
print(p1.iloc[[1, 1], [2, 1]])

print("-"*20+"通过位置来获取多行多列并赋值"+"-"*20)
p1.iloc[1:, :2] = 100
print(p1)

# NaN is a float, and W/X are integer columns. Cast them to float explicitly
# before writing NaN: recent pandas deprecates the silent int -> float
# upcast on assignment. Y and Z are untouched and stay integer.
p1 = p1.astype({"W": float, "X": float})
p1.iloc[1:, :2] = np.nan
print(p1)
pandas之布尔索引
import pandas as pd

p1 = pd.read_csv("数据.csv")  # load the CSV file
# ascending=False sorts in descending order (True would be ascending).
p1 = p1.sort_values(by="NUM", ascending=False)
print(p1)

num = p1["NUM"]
print("-"*20+"显示大于14的数据"+"-"*20)
print(p1[num > 14])

print("-"*20+"显示大于10小于22的数据"+"-"*20)
# & means "and", | means "or"; each condition needs its own parentheses
# because these operators bind tighter than the comparisons.
print(p1[(num > 10) & (num < 22)])

print("-"*20+"字符串显示大于5小于7的数据"+"-"*20)
# Same idea applied to the length of the NAME strings.
name_len = p1["NAME"].str.len()
print(p1[(name_len > 5) & (name_len < 7)])
缺失数据的处理
删除nan
import pandas as pd
import numpy as np

p1 = pd.DataFrame(np.arange(20).reshape(4, 5), index=list("abcd"), columns=list("VWXYZ"))

# Blank out rows 1-2 of the first two columns (V and W). The columns are
# cast to float first because NaN is a float and recent pandas deprecates
# the silent int -> float upcast on assignment.
p1 = p1.astype({"V": float, "W": float})
p1.iloc[1:3, :2] = np.nan
print(p1)

print("-"*20+"判断是否存在nan"+"-"*20)
# notnull gives True where a value is present, False where it is NaN.
print(pd.notnull(p1))
print("-"*20+"通过位置W列来获取不是nan的数据"+"-"*20)
print(p1[pd.notnull(p1["W"])])
print("-"*20+"数据中只要有nan的行全部删除"+"-"*20)
# how="any": drop a row if at least one value in it is NaN.
print(p1.dropna(axis=0, how="any"))
print("-"*20+"删除全部为nan的一行"+"-"*20)
# how="all": drop a row only if every value in it is NaN.
print(p1.dropna(axis=0, how="all"))
print("-"*20+"inplace原地修改p1数据"+"-"*20)
# inplace=True modifies p1 itself instead of returning a new frame.
p1.dropna(axis=0, how="any", inplace=True)
print(p1)
import pandas as pd
import numpy as np

p1 = pd.DataFrame(np.arange(20).reshape(4, 5), index=list("abcd"), columns=list("VWXYZ"))

# Blank out rows 1-2 of the first two columns (V and W). The columns are
# cast to float first because NaN is a float and recent pandas deprecates
# the silent int -> float upcast on assignment.
p1 = p1.astype({"V": float, "W": float})
p1.iloc[1:3, :2] = np.nan
print(p1)

print("-"*20+"填充nan"+"-"*20)
# Replace every NaN with a constant.
print(p1.fillna(100))
print("-"*20+"填充全部nan均值"+"-"*20)
# p1.mean() is computed per column (NaN skipped), so each NaN is filled
# with the mean of its own column.
print(p1.fillna(p1.mean()))
print("-"*20+"填充V列nan均值"+"-"*20)
# Fill a single column only.
print(p1["V"].fillna(p1["V"].mean()))
案例
import pandas as pd
import numpy as np

file_path = "IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)
# df.info() lists the available columns; df.head(1) shows the first row.

# Average movie rating.
print(df["Rating"].mean())

# Number of distinct directors (nunique skips missing values).
print(df["Director"].nunique())

# Number of distinct actors. Each cell holds several comma-separated names:
# split them, give every name its own row, and strip surrounding whitespace
# so that "X" and " X" are not counted as two different people. Missing
# cells are dropped first so they cannot break the split.
actors = df["Actors"].dropna().str.split(",").explode().str.strip()
actors_num = actors.nunique()
print(actors_num)
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

file_path = "IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Approach: build an all-zero table with one column per genre, then set a
# cell to 1 wherever that movie lists that genre. Column sums are then the
# number of movies per genre.
temp_list = df["Genre"].str.split(",").tolist()  # one list of genres per movie
# Sorted so the column order is the same on every run (set order is not).
genre_list = sorted({genre for genres in temp_list for genre in genres})

zeros_df = pd.DataFrame(np.zeros((df.shape[0], len(genre_list))), columns=genre_list)

# zeros_df has the default 0..n-1 index, so the enumerate position is also
# the row label that .loc expects.
for row, genres in enumerate(temp_list):
    zeros_df.loc[row, genres] = 1

# Count per genre, then sort ascending for plotting.
genre_count = zeros_df.sum(axis=0)
print(genre_count)
genre_count = genre_count.sort_values()

# Bar chart: one bar per genre.
_x = genre_count.index
_y = genre_count.values
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(range(len(_x)), _y)
plt.xticks(range(len(_x)), _x)
plt.show()
数组合并
import pandas as pd
import numpy as np

rule = "-" * 50

df1 = pd.DataFrame(np.ones((2, 4)), index=["A", "B"], columns=list("abcd"))
print(df1)  # 2 rows x 4 columns
print(rule)

df2 = pd.DataFrame(np.zeros((3, 3)), index=["A", "B", "C"], columns=list("xyz"))
print(df2)  # 3 rows x 3 columns
print(rule)

# join aligns on the row index and keeps the rows of the frame it is called
# on: df1.join(df2) has rows A, B; df2.join(df1) has rows A, B, C, with NaN
# where df1 has no matching row.
left_joined = df1.join(df2)
print(left_joined)
print(rule)
right_joined = df2.join(df1)
print(right_joined)
import pandas as pd
import numpy as np

df1 = pd.DataFrame(np.ones((2, 4)), index=["A", "B"], columns=list("abcd"))
print(df1)  # 2 rows x 4 columns
df2 = pd.DataFrame(np.zeros((3, 3)), columns=list("asd"))
print(df2)
print("-" * 50)

# merge matches rows on the VALUES of the `on` column. df1["a"] is all 1 and
# df2["a"] is all 0, so nothing matches and the result is empty.
merge_key = "a"
print(df1.merge(df2, on=merge_key))

# Put a 1 in row 1 of df2's "a" column: that row now matches both df1 rows.
df2.loc[1, "a"] = 1
print(df2)
print(df1.merge(df2, on=merge_key))
import pandas as pd
import numpy as np

rule = "-" * 50

df1 = pd.DataFrame(np.ones((2, 4)), index=["A", "B"], columns=list("abcd"))
print(df1)
print(rule)
df2 = pd.DataFrame(np.arange(9).reshape((3, 3)), columns=list("sad"))
print(df2)
print(rule)

# Default merge is an inner join: only key values present on both sides.
print(df1.merge(df2, on="a"))
print(rule)

df1.loc["A", "a"] = 100
print(df1)
print(rule)
print(df1.merge(df2, on="a"))

# `how` picks which keys survive: outer = union of both sides,
# left = every key of df1, right = every key of df2.
for title, how in (("外连接(并集)", "outer"), ("左链接", "left"), ("右链接", "right")):
    print("-" * 20 + title + "-" * 20)
    print(df1.merge(df2, on="a", how=how))
分组和聚合
import pandas as pd
import numpy as np

file_path = "starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
# df.head(1) / df.info() show which columns the file contains.

# groupby returns a DataFrameGroupBy object. It can be iterated, yielding
# (group key, sub-DataFrame) pairs, or reduced with an aggregation such as
# count().
grouped = df.groupby(by="Country")

# Count the non-missing "Brand" entries in each country.
brand_by_country = grouped["Brand"]
country_count = brand_by_country.count()
for country_code in ("US", "CN"):
    print(country_count[country_code])
import pandas as pd
import numpy as np

file_path = "starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Grouping by several keys. Selecting with a one-element LIST ([["Brand"]])
# keeps the result a DataFrame; a bare string (["Brand"]) would give a
# Series. The three spellings below differ only in where the column
# selection happens.
group_keys = [df["Country"], df["State/Province"]]
grouped1 = df[["Brand"]].groupby(by=group_keys).count()  # select, then group
grouped2 = df.groupby(by=group_keys)[["Brand"]].count()  # group, select, count
grouped3 = df.groupby(by=group_keys).count()[["Brand"]]  # group, count, select

print(grouped1, type(grouped1))
print("-" * 50)
print(grouped2, type(grouped2))
print("-" * 50)
print(grouped3, type(grouped3))
索引和复合索引
import pandas as pd
import numpy as np

file_path = "starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Number of stores in each province of China.
is_china = df["Country"] == "CN"
china_data = df[is_china]
grouped = china_data.groupby(by="State/Province")["Brand"].count()
print(grouped)

group_keys = [df["Country"], df["State/Province"]]

# Grouping by several keys on a single column returns a Series whose index
# has one level per key.
grouped = df["Brand"].groupby(by=group_keys).count()
print(grouped)

# Selecting with a one-element list ([["Brand"]]) returns a DataFrame
# instead; the three spellings differ only in where the selection happens.
grouped1 = df[["Brand"]].groupby(by=group_keys).count()
grouped2 = df.groupby(by=group_keys)[["Brand"]].count()
grouped3 = df.groupby(by=group_keys).count()[["Brand"]]

print(grouped1, type(grouped1))
print("-" * 50)
print(grouped2, type(grouped2))
print("-" * 50)
print(grouped3, type(grouped3))
import pandas as pd
import numpy as np

rule = "-" * 50

df1 = pd.DataFrame(np.ones((2, 4)), index=["A", "B"], columns=list("abcd"))
print(df1)
print(rule)
print(df1.index)
print(rule)

# Assigning to .index relabels the rows in place; the new list must have
# one label per row.
new_labels = ["c", "d"]
df1.index = new_labels
print(df1)
print(df1.index)
import pandas as pd
import numpy as np

rule = "-" * 50

df1 = pd.DataFrame(np.ones((2, 4)), index=["A", "B"], columns=list("abcd"))
print(df1)
df1.loc["A", "a"] = 100
print(rule)

# reindex picks rows by label; labels that do not exist ("C") become rows
# filled with NaN.
print(df1.reindex(["A", "C"]))
print(rule)

# set_index turns a COLUMN into the row index ...
by_a = df1.set_index("a")
print(by_a)
print(by_a.index)
# ... and a list of columns into a multi-level index.
by_a_b = df1.set_index(["a", "b"])
print(by_a_b)
print(by_a_b.index)
print(rule)

# drop=False keeps the column in the frame as well as using it as index.
print(df1.set_index("a", drop=False))
print(rule)

# unique() lists the distinct values of a column.
print(df1["d"].unique())
print(df1["a"].unique())
print(rule)

# Length of an index = number of rows.
print(len(df1.set_index("b").index))
print(rule)
import pandas as pd
import numpy as np

rule = "-" * 50

a = pd.DataFrame({
    "a": range(7),
    "b": range(7, 0, -1),
    "c": ["one", "one", "one", "two", "two", "two", "two"],
    "d": list("hjklmno"),
})
print(a)
print(rule)

# Two columns as index -> a two-level index (c outer, d inner).
b = a.set_index(["c", "d"])
print(b)
print(rule)

# Taking one column of that frame gives a Series with the same two levels.
c = b["a"]
print(c)
print(rule)
# Index the outer level first, then the inner one.
print(c["one"]["j"])
print(rule)

# With the levels the other way round (d outer, c inner), selecting by "c"
# needs swaplevel() to bring that level to the outside first.
d = a.set_index(["d", "c"])["a"]
print(d)
print(rule)
swapped = d.swaplevel()
print(swapped)
print(rule)
print(swapped["one"])
import pandas as pd
import numpy as np

rule = "-" * 50

a = pd.DataFrame({
    "a": range(7),
    "b": range(7, 0, -1),
    "c": ["one", "one", "one", "two", "two", "two", "two"],
    "d": list("hjklmno"),
})
b = a.set_index(["c", "d"])
print(b)
print(rule)

# On a DataFrame, .loc walks the index levels one at a time:
# outer label first, then the inner label.
rows_for_one = b.loc["one"]
print(rows_for_one.loc["k"])
print(rule)

# To select by the inner level directly, swap the levels first.
print(b.swaplevel().loc["j"])
import pandas as pd
from matplotlib import pyplot as plt

file_path = "starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)

# Bar chart of the 10 countries with the most stores.
# Count stores per country, sort DESCENDING (ascending=False), keep the
# first ten.
store_counts = df.groupby(by="Country")["Brand"].count()
data1 = store_counts.sort_values(ascending=False).head(10)
_x = data1.index
_y = data1.values

# One bar per country, labelled with the country code.
positions = range(len(_x))
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(positions, _y)
plt.xticks(positions, _x)
plt.show()
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Font used for the tick labels.
# NOTE(review): this is a Windows-specific font path - adjust on other systems.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')

file_path = "starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
df = df[df["Country"] == "CN"]

# Bar chart of store counts per Chinese city: count per city, sort
# DESCENDING (ascending=False), keep the first 50.
city_counts = df.groupby(by="City")["Brand"].count()
data1 = city_counts.sort_values(ascending=False).head(50)
_x = data1.index
_y = data1.values

# Narrow orange bars; city names rotated 90 degrees so they do not overlap.
positions = range(len(_x))
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(positions, _y, width=0.3, color="orange")
plt.xticks(positions, _x, fontproperties=my_font, rotation=90)
plt.show()
pandas 时间序列
# -*- coding: utf-8 -*-
# pd.date_range builds a DatetimeIndex from any two of start/end/periods plus
# a frequency string.
import pandas as pd
print(pd.date_range(start="20171230",end="20180131",freq="D"))# "D" = one entry per day
print("-"*50)
print(pd.date_range(start="20171230",end="20180131",freq="10D")) # one entry every 10 days
print("-"*50)
print(pd.date_range(start="20171230",periods=10,freq="D"))# 10 consecutive days from the start date
print("-"*50)
# NOTE(review): pandas 2.2 deprecates the "M" alias for date_range in favour
# of "ME" - confirm against the pandas version in use.
print(pd.date_range(start="20180101",periods=12,freq="M"))# monthly
案例
# -*- coding: utf-8 -*-
import pandas as pd
pd.set_option('expand_frame_repr', False)# True allows wide frames to wrap across lines when printed; False disables wrapping
file_path = "BeijingPM20100101_20151231.csv"
df =pd.read_csv(file_path)
# Combine the separate year/month/day/hour columns into one pandas time type
# via PeriodIndex.
# NOTE(review): pandas 2.2 deprecates these PeriodIndex keyword arguments (in
# favour of PeriodIndex.from_fields) and the "H" alias (in favour of "h") -
# confirm against the pandas version in use.
periond1 = pd.PeriodIndex(year=df["year"],month=df["month"],day=df["day"],hour=df["hour"],freq="H")
# print(periond1)
df["datetime"] = periond1
print(df.head(10))
# -*- coding: utf-8 -*-
import pandas as pd
from matplotlib import pyplot as plt
pd.set_option('expand_frame_repr', False)# True allows wide frames to wrap across lines when printed; False disables wrapping
file_path = "BeijingPM20100101_20151231.csv"
df =pd.read_csv(file_path)
# Combine the separate year/month/day/hour columns into one pandas time type
# via PeriodIndex.
# NOTE(review): pandas 2.2 deprecates these PeriodIndex keyword arguments (in
# favour of PeriodIndex.from_fields) and the "H" alias (in favour of "h") -
# confirm against the pandas version in use.
periond1 = pd.PeriodIndex(year=df["year"],month=df["month"],day=df["day"],hour=df["hour"],freq="H")
# print(periond1)
df["datetime"] = periond1
print(df.head(10))
# Use the datetime column as the row index.
df.set_index("datetime",inplace=True)
# Handle missing data: drop the NaN readings of the plotted column.
data = df["PM_US Post"].dropna()
# Plot the readings against their position, labelling every 20th point.
# NOTE(review): with one row per hour this is presumably a very large number
# of tick labels - consider a larger step or resampling first.
_x = data.index
_y = data.values
plt.figure(figsize=(20,8),dpi=80)
plt.plot(range(len(_x)),_y)
plt.xticks(range(0,len(_x),20),list(_x)[::20])
plt.show()
import pandas as pd
from matplotlib import pyplot as plt

# Keep wide frames on one line when printed (False disables wrapping).
pd.set_option('expand_frame_repr', False)

file_path = "BeijingPM20100101_20151231.csv"
df = pd.read_csv(file_path)

# Assemble one timestamp per row from the separate year/month/day/hour
# columns. pd.to_datetime accepts a frame with exactly these column names,
# which avoids the PeriodIndex(year=..., ...) keyword constructor and "H"
# alias that pandas 2.2 deprecates.
df["datetime"] = pd.to_datetime(df[["year", "month", "day", "hour"]])

# Use the timestamp as the row index so the data can be resampled.
df.set_index("datetime", inplace=True)

# There are too many raw readings to plot, so down-sample to 7-day means
# (use a monthly rule instead for per-month averages). Only the plotted
# column is averaged: calling .mean() on the whole frame raises on recent
# pandas when any column is non-numeric.
data = df["PM_US Post"].resample("7D").mean().dropna()

# Plot the weekly means, labelling every 10th point with its date.
_x = [stamp.strftime("%Y%m%d") for stamp in data.index]
_y = data.values
plt.figure(figsize=(20, 8), dpi=80)
plt.plot(range(len(_x)), _y)
plt.xticks(range(0, len(_x), 10), _x[::10], rotation=45)
plt.show()