Pandas的常用数据类型
- Series一维,带标签的数组
- DataFrame二维,Series容器
一、Series
Series对象本质上是有两个数组组成,一个数组构成对象的键(index),一个数组构成对象的值(values)
import string import pandas as pd import numpy as np # 创建Series t1 = pd.Series(np.arange(5),index=list("abcde")) print(t1) """ 索引可以指定,默认为012... a 0 b 1 c 2 d 3 e 4 dtype: int64 """ print(type(t1)) # <class 'pandas.core.series.Series'> # 使用字典创建Series a = {string.ascii_uppercase[i]:i for i in range(5)} # 创建Series print(pd.Series(a)) """ A 0 B 1 C 2 D 3 E 4 dtype: int64 """ print(pd.Series(a,index=list("CDEFG"))) """ C 2.0 D 3.0 E 4.0 F NaN G NaN dtype: float64 """ # 切片 print(t1[0:4:2]) """ a 0 c 2 dtype: int64 """ print(t1[[2,3,4]]) """ c 2 d 3 e 4 dtype: int64 """ print(t1[t1>2]) """ d 3 e 4 dtype: int64 """ print(t1["b"]) # 1 print(t1[["a","e","f"]]) """ a 0.0 e 4.0 f NaN dtype: float64 """ # 索引和值 print(t1.index) # Index(['a', 'b', 'c', 'd', 'e'], dtype='object') print(type(t1.index)) # <class 'pandas.core.indexes.base.Index'> print(t1.values) # [0 1 2 3 4] print(type(t1.values)) # <class 'numpy.ndarray'>
二、DataFrame
创建DataFrame
# 创建DataFrame对象 t1 = pd.DataFrame(np.arange(12).reshape(3,4)) print(t1) """ DataFrame对象既有行索引,又有列索引 行索引,表明不同行,横向索引,叫index,0轴,axis=0 列索引,表名不同列,纵向索引,叫columns,1轴,axis=1 0 1 2 3 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 """ t2 = pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("EFGH")) print(t2) """ E F G H a 0 1 2 3 b 4 5 6 7 c 8 9 10 11 """ # 将字典转换成dataframe temp_dict = [{"name":"zhangsan","age":15,"tel":10086}, {"name":"lisi","age":15}, {"name":"wangwu","tel":10086} ] t3 = pd.DataFrame(temp_dict) print(t3) """ age name tel 0 15.0 zhangsan 10086.0 1 15.0 lisi NaN 2 NaN wangwu 10086.0 """
获取DataFrame的基本信息
# 获取DataFrame的基本信息 # 行数,列数 print(t1.shape) # 列数据类型 print(t1.dtypes) # 数据维度 print(t1.ndim) # 2 # 行索引 print(t1.index) # RangeIndex(start=0, stop=3, step=1) # 列索引 print(t2.columns) # Index(['E', 'F', 'G', 'H'], dtype='object') # 对象值 print(t1.values) """ [[ 0 1 2 3] [ 4 5 6 7] [ 8 9 10 11]] """ # 显示头几行,默认是5 print(t1.head(2)) # 显示末尾几行 print(t1.tail(2)) # 相关信息概览:行数,列数,列索引,咧非空值个数,行列类型,内存占用 print(t1.info()) """ <class 'pandas.core.frame.DataFrame'> RangeIndex: 3 entries, 0 to 2 Data columns (total 4 columns): 0 3 non-null int64 1 3 non-null int64 2 3 non-null int64 3 3 non-null int64 dtypes: int64(4) memory usage: 176.0 bytes None """ # 快速综合统计结果:计数,均值,标准差,最大值,1/4值,最小值 print(t2.describe()) """ 是根据列来计算的 E F G H count 3.0 3.0 3.0 3.0 mean 4.0 5.0 6.0 7.0 std 4.0 4.0 4.0 4.0 min 0.0 1.0 2.0 3.0 25% 2.0 3.0 4.0 5.0 50% 4.0 5.0 6.0 7.0 75% 6.0 7.0 8.0 9.0 max 8.0 9.0 10.0 11.0 """
加载csv数据
import numpy as np import pandas as pd # 加载csv数据 t = pd.read_csv("./dogNames2.csv") # 按照字段进行排序,ascending desc/asc t2 = t.sort_values("Count_AnimalName",ascending=False).head(10) print(t2)
获取行列数据
import string import numpy as np import pandas as pd # loc和iloc方法 # df.loc:通过标签获取行数据 # df.iloc:通过位置获取行数据 t1 = pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("EFGH")) # 获取行列交叉部分 print(t1.loc["a","E"]) # 0 # 获取行与多列交叉部分 print(t1.loc["a",["E","F"]]) """ E 0 F 1 """ # 获取行与多列交叉部分 print(t1.loc["a","E":"G"]) """ E 0 F 1 G 2 """ # 获取行与连续多列交叉部分 print(t1.loc["a":"c","G"]) """ 注意:loc里的:是包括最后的那个的 a 2 b 6 c 10 """ # iloc和loc是一样的,只不过采用的是索引来进行的操作
布尔索引
import numpy as np import pandas as pd # pandas中的布尔索引 t1 = pd.read_csv("./dogNames2.csv") # 找出其中名字使用次数超过800的狗 print(t1[t1["Count_AnimalName"]>800]) """ Row_Labels Count_AnimalName 1156 BELLA 1195 2660 CHARLIE 856 3251 COCO 852 9140 MAX 1153 12368 ROCKY 823 """ # 找出狗名字符串长度超过4的狗 print(t1[t1["Row_Labels"].str.len()>4].head(3)) """ Row_Labels Count_AnimalName 2 40804 1 3 90201 1 4 90203 1 """ # 多条件,要使用()分割,&或|做连接符 print(t1[(t1["Row_Labels"].str.len()>4)&(t1["Row_Labels"].str.len()<6)].head(3))
字符串方法
处理缺失数据
import pandas as pd import numpy as np temp_dict = [{"name":"zhangsan","age":0,"tel":10086}, {"name":"lisi","age":15}, {"name":"wangwu","tel":10010}] t1 = pd.DataFrame(temp_dict) print(t1) """ age name tel 0 15.0 zhangsan 10086.0 1 15.0 lisi NaN 2 NaN wangwu 10010.0 """ ## 缺失数据的处理 # - 处理方式1:删除NaN所在的行列dropna (axis=0, how='any', inplace=False) t2 = t1.dropna(axis=0, how='any', inplace=False) print(t2) """ age name tel 0 15.0 zhangsan 10086.0 """ # - 处理方式2:填充数据,t.fillna(t.mean()),t.fiallna(t.median()),t.fillna(0) t3 = t1.fillna(t1.mean()) print(t3) """ age name tel 0 15.0 zhangsan 10086.0 1 15.0 lisi 10048.0 2 15.0 wangwu 10010.0 """ ### 处理为0的数据:将0改为nan,然后使用上面的方法进行填充 t1[t1==0] = np.nan print(t1) ### 查看是否为nan,返回布尔索引 print(pd.isnull(t1)) """ age name tel 0 True False False 1 False False True 2 True False False """ print(pd.notnull(t1)) """ age name tel 0 False True True 1 True True False 2 False True True """