1. 导入numpy和pandas
import numpy as np
import pandas as pd
2. 生成对象
- 用值列表生成Series,索引默认为整数
- 用含日期时间索引与标签的numpy数组生成DataFrame
- 用Series字典对象生成DataFrame
总之,pandas生成对象很灵活。
s= pd.Series([1,3,4,np.nan, 6, 8])
s
0 1.0
1 3.0
2 4.0
3 NaN
4 6.0
5 8.0
dtype: float64
dates = pd.date_range('20220514', periods=6)
dates
DatetimeIndex(['2022-05-14', '2022-05-15', '2022-05-16', '2022-05-17',
'2022-05-18', '2022-05-19'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4), index = dates, columns= list('ABCD'))
df
|
A |
B |
C |
D |
2022-05-14 |
1.714341 |
-1.979479 |
-0.300576 |
-1.088862 |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
1.364722 |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
2.958108 |
2022-05-17 |
0.495665 |
-0.270284 |
-0.243175 |
-0.715608 |
2022-05-18 |
-0.259270 |
0.256750 |
0.619412 |
1.899276 |
2022-05-19 |
1.216169 |
-0.220087 |
-1.994829 |
-0.866881 |
df2 = pd.DataFrame({'A': 1.,
'B': pd.Timestamp('20220514'),
'C': pd.Series(1, index = list(range(4)),dtype='float32'),
'D':np.array([3]*4,dtype='int32'),
'E': pd.Categorical(["test","train","test","train"]),
'F': 'foo'})
df2
|
A |
B |
C |
D |
E |
F |
0 |
1.0 |
2022-05-14 |
1.0 |
3 |
test |
foo |
1 |
1.0 |
2022-05-14 |
1.0 |
3 |
train |
foo |
2 |
1.0 |
2022-05-14 |
1.0 |
3 |
test |
foo |
3 |
1.0 |
2022-05-14 |
1.0 |
3 |
train |
foo |
df2.dtypes
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
3. 查看数据
df.head() # 查看头部数据,参数缺省时显示前5行,传入数字n则显示前n行
|
A |
B |
C |
D |
2022-05-14 |
1.714341 |
-1.979479 |
-0.300576 |
-1.088862 |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
1.364722 |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
2.958108 |
2022-05-17 |
0.495665 |
-0.270284 |
-0.243175 |
-0.715608 |
2022-05-18 |
-0.259270 |
0.256750 |
0.619412 |
1.899276 |
df.tail(3) # 查看尾部数据
|
A |
B |
C |
D |
2022-05-17 |
0.495665 |
-0.270284 |
-0.243175 |
-0.715608 |
2022-05-18 |
-0.259270 |
0.256750 |
0.619412 |
1.899276 |
2022-05-19 |
1.216169 |
-0.220087 |
-1.994829 |
-0.866881 |
df.index
DatetimeIndex(['2022-05-14', '2022-05-15', '2022-05-16', '2022-05-17',
'2022-05-18', '2022-05-19'],
dtype='datetime64[ns]', freq='D')
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
DataFrame.to_numpy()
输出底层数据的NumPy对象。
注意,DataFrame
的列由多种数据类型组成时,该操作耗费系统资源较大,这也是Pandas和NumPy的本质区别:
- NumPy 数组只有一种数据类型,
- DataFrame 每列的数据类型各不相同。
- Pandas查找支持DataFrame里所有数据类型的NumPy数据类型。
- .to_numpy()输出结果不包含索引和列标签
还有一种数据类型是object,可以把DataFrame列里的值强制转换为Python对象。
df.to_numpy()
array([[ 1.71434051e+00, -1.97947895e+00, -3.00576349e-01,
-1.08886170e+00],
[ 8.20437240e-01, 1.26699927e+00, -1.40821878e+00,
1.36472164e+00],
[-8.07610335e-01, 1.97375765e-01, 2.86653595e-03,
2.95810777e+00],
[ 4.95664812e-01, -2.70283518e-01, -2.43174905e-01,
-7.15608296e-01],
[-2.59269764e-01, 2.56749508e-01, 6.19412044e-01,
1.89927592e+00],
[ 1.21616869e+00, -2.20086698e-01, -1.99482874e+00,
-8.66880756e-01]])
df2.to_numpy()
array([[1.0, Timestamp('2022-05-14 00:00:00'), 1.0, 3, 'test', 'foo'],
[1.0, Timestamp('2022-05-14 00:00:00'), 1.0, 3, 'train', 'foo'],
[1.0, Timestamp('2022-05-14 00:00:00'), 1.0, 3, 'test', 'foo'],
[1.0, Timestamp('2022-05-14 00:00:00'), 1.0, 3, 'train', 'foo']],
dtype=object)
df.describe()
|
A |
B |
C |
D |
count |
6.000000 |
6.000000 |
6.000000 |
6.000000 |
mean |
0.529955 |
-0.124787 |
-0.554087 |
0.591792 |
std |
0.935005 |
1.063632 |
0.964729 |
1.706927 |
min |
-0.807610 |
-1.979479 |
-1.994829 |
-1.088862 |
25% |
-0.070536 |
-0.257734 |
-1.131308 |
-0.829063 |
50% |
0.658051 |
-0.011355 |
-0.271876 |
0.324557 |
75% |
1.117236 |
0.241906 |
-0.058644 |
1.765637 |
max |
1.714341 |
1.266999 |
0.619412 |
2.958108 |
df2.describe() # 描述 各种计数、均值、标准差、最小、最大等
|
A |
C |
D |
count |
4.0 |
4.0 |
4.0 |
mean |
1.0 |
1.0 |
3.0 |
std |
0.0 |
0.0 |
0.0 |
min |
1.0 |
1.0 |
3.0 |
25% |
1.0 |
1.0 |
3.0 |
50% |
1.0 |
1.0 |
3.0 |
75% |
1.0 |
1.0 |
3.0 |
max |
1.0 |
1.0 |
3.0 |
df.T # 转置
|
2022-05-14 |
2022-05-15 |
2022-05-16 |
2022-05-17 |
2022-05-18 |
2022-05-19 |
A |
1.714341 |
0.820437 |
-0.807610 |
0.495665 |
-0.259270 |
1.216169 |
B |
-1.979479 |
1.266999 |
0.197376 |
-0.270284 |
0.256750 |
-0.220087 |
C |
-0.300576 |
-1.408219 |
0.002867 |
-0.243175 |
0.619412 |
-1.994829 |
D |
-1.088862 |
1.364722 |
2.958108 |
-0.715608 |
1.899276 |
-0.866881 |
df.sort_index(axis=1, ascending = False ) # 按列降序
|
D |
C |
B |
A |
2022-05-14 |
-1.088862 |
-0.300576 |
-1.979479 |
1.714341 |
2022-05-15 |
1.364722 |
-1.408219 |
1.266999 |
0.820437 |
2022-05-16 |
2.958108 |
0.002867 |
0.197376 |
-0.807610 |
2022-05-17 |
-0.715608 |
-0.243175 |
-0.270284 |
0.495665 |
2022-05-18 |
1.899276 |
0.619412 |
0.256750 |
-0.259270 |
2022-05-19 |
-0.866881 |
-1.994829 |
-0.220087 |
1.216169 |
df.sort_values(by='B',ascending=False) # 按B列的降序,ascending = False 降序,默认升序。
|
A |
B |
C |
D |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
1.364722 |
2022-05-18 |
-0.259270 |
0.256750 |
0.619412 |
1.899276 |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
2.958108 |
2022-05-19 |
1.216169 |
-0.220087 |
-1.994829 |
-0.866881 |
2022-05-17 |
0.495665 |
-0.270284 |
-0.243175 |
-0.715608 |
2022-05-14 |
1.714341 |
-1.979479 |
-0.300576 |
-1.088862 |
df.at # 生产环境使用 .at .iat .loc .iloc
<pandas.core.indexing._AtIndexer at 0x10fda6f20>
df.iat
<pandas.core.indexing._iAtIndexer at 0x11cbcef20>
df.loc
<pandas.core.indexing._LocIndexer at 0x11cc5d940>
df.iloc
<pandas.core.indexing._iLocIndexer at 0x11cc5fdd0>
4. 获取数据
4.1 直接获取
输出Series对象或者DataFrame对象,取决于不同的用法:
- 列表切片,后面用方括号[]
- 属性访问方式(attribute access),df.A
df['A'] # 产生单列,生产的数据结构与Series一样, 等于 df.A
2022-05-14 1.714341
2022-05-15 0.820437
2022-05-16 -0.807610
2022-05-17 0.495665
2022-05-18 -0.259270
2022-05-19 1.216169
Freq: D, Name: A, dtype: float64
df.A # 验证一下看看, 如果是中文标签咋搞? 'ABCD' 改成 "阿波茨坦" 是可以操作的,用 df.阿
2022-05-14 1.714341
2022-05-15 0.820437
2022-05-16 -0.807610
2022-05-17 0.495665
2022-05-18 -0.259270
2022-05-19 1.216169
Freq: D, Name: A, dtype: float64
df[0:3].A #切片行df[0:3].A和df.A[0:2]也可以执行
2022-05-14 1.714341
2022-05-15 0.820437
2022-05-16 -0.807610
Freq: D, Name: A, dtype: float64
df['20220514':'20220517']
|
A |
B |
C |
D |
2022-05-14 |
1.714341 |
-1.979479 |
-0.300576 |
-1.088862 |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
1.364722 |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
2.958108 |
2022-05-17 |
0.495665 |
-0.270284 |
-0.243175 |
-0.715608 |
4.2 按标签获取数据
.loc
方法的使用主要就是操作标签值来取数
.loc[[行标签列表], [列标签列表]]
- 形参都是列表,输出DataFrame对象
- 形参有具体值,输出Series对象
- 形参全是值,输出Value值
df.loc[dates[0]]
A 1.714341
B -1.979479
C -0.300576
D -1.088862
Name: 2022-05-14 00:00:00, dtype: float64
df.loc[:,['A','B']]
|
A |
B |
2022-05-14 |
1.714341 |
-1.979479 |
2022-05-15 |
0.820437 |
1.266999 |
2022-05-16 |
-0.807610 |
0.197376 |
2022-05-17 |
0.495665 |
-0.270284 |
2022-05-18 |
-0.259270 |
0.256750 |
2022-05-19 |
1.216169 |
-0.220087 |
df.loc['20220514':'20220517',['A','C']]
|
A |
C |
2022-05-14 |
1.714341 |
-0.300576 |
2022-05-15 |
0.820437 |
-1.408219 |
2022-05-16 |
-0.807610 |
0.002867 |
2022-05-17 |
0.495665 |
-0.243175 |
df.loc['20220514',['A','C']] #.loc[行标签,列标签列表],但是如果用这种用法,是降维了DataFrame,取到的是Series对象(见下方输出的 Name 和 dtype)。
A 1.714341
C -0.300576
Name: 2022-05-14 00:00:00, dtype: float64
df.loc[['20220514'],['A','B']] # 注意看与上面输出数据的区别,这里还是DataFrame
|
A |
B |
2022-05-14 |
1.714341 |
-1.979479 |
df.loc[dates[0],'A']
1.7143405095532576
df.at[dates[0],'A'] # .at 是标量快速访问接口,取单个值时比 .loc 更快
1.7143405095532576
df.loc[[dates[0]],['A']] # 果然,.loc[[行列表],[列列表]]始终输出的是DataFrame,还是个表,不是值
4.3 按位置获取
用法与.loc有点像,不重复
df.iloc[3]
A 0.495665
B -0.270284
C -0.243175
D -0.715608
Name: 2022-05-17 00:00:00, dtype: float64
df.iloc[0:5, 0:2] # 这里的切片好像是不包含最后一个,不论行列,从0开始,0是第一行/列
|
A |
B |
2022-05-14 |
1.714341 |
-1.979479 |
2022-05-15 |
0.820437 |
1.266999 |
2022-05-16 |
-0.807610 |
0.197376 |
2022-05-17 |
0.495665 |
-0.270284 |
2022-05-18 |
-0.259270 |
0.256750 |
df.iloc[[0,1,2,3,5],[0,3]] # 不连续列表取行or列
|
A |
D |
2022-05-14 |
1.714341 |
-1.088862 |
2022-05-15 |
0.820437 |
1.364722 |
2022-05-16 |
-0.807610 |
2.958108 |
2022-05-17 |
0.495665 |
-0.715608 |
2022-05-19 |
1.216169 |
-0.866881 |
df.iloc[0:3,:] #显式取整行
|
A |
B |
C |
D |
2022-05-14 |
1.714341 |
-1.979479 |
-0.300576 |
-1.088862 |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
1.364722 |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
2.958108 |
df.iloc[:, 1:3] #显式取整列
|
B |
C |
2022-05-14 |
-1.979479 |
-0.300576 |
2022-05-15 |
1.266999 |
-1.408219 |
2022-05-16 |
0.197376 |
0.002867 |
2022-05-17 |
-0.270284 |
-0.243175 |
2022-05-18 |
0.256750 |
0.619412 |
2022-05-19 |
-0.220087 |
-1.994829 |
df.iloc[1,2] # .iloc[单个值,单个值] 时,直接返回值,而不是DataFrame对象了
-1.408218776287673
df.iat[1,3] #与上面一致
1.364721643082539
4.4 布尔索引(条件选择)
类似其他库的where
df[df.B > 0] # B列大于0的行
|
A |
B |
C |
D |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
1.364722 |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
2.958108 |
2022-05-18 |
-0.259270 |
0.256750 |
0.619412 |
1.899276 |
df[df > 0] # 表里面所有大于0的,不满足的直接显示NaN
|
A |
B |
C |
D |
2022-05-14 |
1.714341 |
NaN |
NaN |
NaN |
2022-05-15 |
0.820437 |
1.266999 |
NaN |
1.364722 |
2022-05-16 |
NaN |
0.197376 |
0.002867 |
2.958108 |
2022-05-17 |
0.495665 |
NaN |
NaN |
NaN |
2022-05-18 |
NaN |
0.256750 |
0.619412 |
1.899276 |
2022-05-19 |
1.216169 |
NaN |
NaN |
NaN |
df3 = df.copy()
df3['E']= ['one','two','two','three','four','five'] # 不能通过调用属性来直接赋值:df.E
df3
|
A |
B |
C |
D |
E |
2022-05-14 |
1.714341 |
-1.979479 |
-0.300576 |
-1.088862 |
one |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
1.364722 |
two |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
2.958108 |
two |
2022-05-17 |
0.495665 |
-0.270284 |
-0.243175 |
-0.715608 |
three |
2022-05-18 |
-0.259270 |
0.256750 |
0.619412 |
1.899276 |
four |
2022-05-19 |
1.216169 |
-0.220087 |
-1.994829 |
-0.866881 |
five |
df3[df3.E.isin(['two','four'])] # 可以调用属性来做判断
|
A |
B |
C |
D |
E |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
1.364722 |
two |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
2.958108 |
two |
2022-05-18 |
-0.259270 |
0.256750 |
0.619412 |
1.899276 |
four |
4.5 赋值
# 用索引自动对齐,注意起始日期 是原df中的第2天/行
df['F'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20220515', periods=6))
df
|
A |
B |
C |
D |
F |
2022-05-14 |
1.714341 |
-1.979479 |
-0.300576 |
-1.088862 |
NaN |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
1.364722 |
1.0 |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
2.958108 |
2.0 |
2022-05-17 |
0.495665 |
-0.270284 |
-0.243175 |
-0.715608 |
3.0 |
2022-05-18 |
-0.259270 |
0.256750 |
0.619412 |
1.899276 |
4.0 |
2022-05-19 |
1.216169 |
-0.220087 |
-1.994829 |
-0.866881 |
5.0 |
# 按标签、位置、numpy数组赋值
df.at[dates[0],'A'] = 0 # 第1行,第1列
df.iat[0,1] = 0 # 第1行,第2列
df.loc[:, 'D'] = np.array([5]*len(df)) # 用np数组赋值
df
|
A |
B |
C |
D |
F |
2022-05-14 |
0.000000 |
0.000000 |
-0.300576 |
5 |
NaN |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
5 |
1.0 |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
5 |
2.0 |
2022-05-17 |
0.495665 |
-0.270284 |
-0.243175 |
5 |
3.0 |
2022-05-18 |
-0.259270 |
0.256750 |
0.619412 |
5 |
4.0 |
2022-05-19 |
1.216169 |
-0.220087 |
-1.994829 |
5 |
5.0 |
5. 缺失值
np.nan
# 重构(reindex)时产生的缺失值
df5 = df.reindex(index=dates[0:4], columns= list(df.columns) + ['E'])
df5.loc[dates[0]:dates[1], 'E'] = 1 # .loc 的标签切片包含两端端点,不像 .iloc 的位置切片不含末端
df5
|
A |
B |
C |
D |
F |
E |
2022-05-14 |
0.000000 |
0.000000 |
-0.300576 |
5 |
NaN |
1.0 |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
5 |
1.0 |
1.0 |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
5 |
2.0 |
NaN |
2022-05-17 |
0.495665 |
-0.270284 |
-0.243175 |
5 |
3.0 |
NaN |
# 删除所有含NaN的行
df5.dropna(how='any')
|
A |
B |
C |
D |
F |
E |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
5 |
1.0 |
1.0 |
# 填充缺失值
df5.fillna(value=5)
|
A |
B |
C |
D |
F |
E |
2022-05-14 |
0.000000 |
0.000000 |
-0.300576 |
5 |
5.0 |
1.0 |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
5 |
1.0 |
1.0 |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
5 |
2.0 |
5.0 |
2022-05-17 |
0.495665 |
-0.270284 |
-0.243175 |
5 |
3.0 |
5.0 |
# 提取na的布尔值掩码
pd.isna(df5)
|
A |
B |
C |
D |
F |
E |
2022-05-14 |
False |
False |
False |
False |
True |
False |
2022-05-15 |
False |
False |
False |
False |
False |
False |
2022-05-16 |
False |
False |
False |
False |
False |
True |
2022-05-17 |
False |
False |
False |
False |
False |
True |
6. 运算
6.1. 二元运算(Binary ops)
6.2. 统计
# 求均值
df.mean(0) # 首个参数 axis=0 或者默认值,显示按列求平均值
A 0.244232
B 0.205126
C -0.554087
D 5.000000
F 3.000000
dtype: float64
df.mean(1) # axis=1 按行求平均值
2022-05-14 1.174856
2022-05-15 1.335844
2022-05-16 1.278526
2022-05-17 1.596441
2022-05-18 1.923378
2022-05-19 1.800251
Freq: D, dtype: float64
不同维度对象运算时,要先对齐。
pandas 自动沿指定维度广播(扩展可能更合适吧)
# 对齐的案例
s = pd.Series([1,3,5, np.nan, 6, 8], index = dates).shift(2)
# 通过 .shift(n) 向后移动n个。
s
2022-05-14 NaN
2022-05-15 NaN
2022-05-16 1.0
2022-05-17 3.0
2022-05-18 5.0
2022-05-19 NaN
Freq: D, dtype: float64
6.3. Apply()函数
df
|
A |
B |
C |
D |
F |
2022-05-14 |
0.000000 |
0.000000 |
-0.300576 |
5 |
NaN |
2022-05-15 |
0.820437 |
1.266999 |
-1.408219 |
5 |
1.0 |
2022-05-16 |
-0.807610 |
0.197376 |
0.002867 |
5 |
2.0 |
2022-05-17 |
0.495665 |
-0.270284 |
-0.243175 |
5 |
3.0 |
2022-05-18 |
-0.259270 |
0.256750 |
0.619412 |
5 |
4.0 |
2022-05-19 |
1.216169 |
-0.220087 |
-1.994829 |
5 |
5.0 |
# 使用np的计算方法
df.apply(np.cumsum) # 对每列沿行方向累计求和(apply 默认 axis=0,逐列应用函数)
# 注:下方 10 行、列名为 0-3 的输出与此处 6 行 5 列的 df 不符,疑为原笔记中另一个 DataFrame 的输出被错误保留
|
0 |
1 |
2 |
3 |
0 |
0.804641 |
-0.217559 |
2.073518 |
0.345552 |
1 |
-0.327518 |
0.562475 |
0.841837 |
0.954023 |
2 |
1.114386 |
-0.654050 |
0.387675 |
-0.704480 |
3 |
1.633977 |
0.134903 |
0.734002 |
-1.505859 |
4 |
-0.194504 |
-0.852858 |
-1.305951 |
0.722428 |
5 |
1.980137 |
-0.907263 |
0.297719 |
1.357773 |
6 |
1.890712 |
2.498787 |
0.320914 |
-0.351401 |
7 |
-1.901487 |
-0.539700 |
-1.418863 |
-1.481537 |
8 |
-1.021292 |
0.860927 |
1.171972 |
0.208986 |
9 |
-1.380867 |
-1.287114 |
-1.492172 |
-0.932805 |
# 注:此处开始演示 7.1 拼接(concat),原笔记省略了构造 10 行 4 列新 df 的步骤(如 df = pd.DataFrame(np.random.randn(10, 4)))
pieces = [df[:3], df[7:]]
pd.concat(pieces)
|
0 |
1 |
2 |
3 |
0 |
0.804641 |
-0.217559 |
2.073518 |
0.345552 |
1 |
-0.327518 |
0.562475 |
0.841837 |
0.954023 |
2 |
1.114386 |
-0.654050 |
0.387675 |
-0.704480 |
7 |
-1.901487 |
-0.539700 |
-1.418863 |
-1.481537 |
8 |
-1.021292 |
0.860927 |
1.171972 |
0.208986 |
9 |
-1.380867 |
-1.287114 |
-1.492172 |
-0.932805 |
7.2. 连接(JOIN)
SQL风格
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left, right
( key lval
0 foo 1
1 foo 2,
key rval
0 foo 4
1 foo 5)
pd.merge(left, right, on = 'key')
|
key |
lval |
rval |
0 |
foo |
1 |
4 |
1 |
foo |
1 |
5 |
2 |
foo |
2 |
4 |
3 |
foo |
2 |
5 |
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [4, 5]})
left, right
( key lval
0 foo 4
1 bar 5,
key rval
0 foo 4
1 bar 5)
pd.merge(left, right, on = 'key')
|
key |
lval |
rval |
0 |
foo |
4 |
4 |
1 |
bar |
5 |
5 |
7.3. 追加
# 为df追加一行
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df
|
A |
B |
C |
D |
0 |
0.317374 |
-1.337608 |
-2.674631 |
-0.420806 |
1 |
-0.428625 |
0.401127 |
1.002601 |
-0.600688 |
2 |
1.032571 |
0.239903 |
-0.153389 |
-2.722230 |
3 |
0.346756 |
-0.464619 |
0.874594 |
0.438894 |
4 |
-0.705527 |
0.719372 |
0.907008 |
0.656376 |
5 |
0.423865 |
-1.262852 |
0.721386 |
-0.614503 |
6 |
2.170255 |
1.199904 |
-1.468446 |
-0.809605 |
7 |
-1.030749 |
1.182177 |
-0.325211 |
-0.919963 |
s = df.iloc[3]
s
A 0.346756
B -0.464619
C 0.874594
D 0.438894
Name: 3, dtype: float64
# 这个方法要被干掉了。
# FutureWarning:
# The frame.append method is deprecated and will be removed from pandas in a future version.
# Use pandas.concat instead.
df.append(s,ignore_index=True)
/var/folders/wj/nc3k2r8x1l9blh3bp02_y01r0000gn/T/ipykernel_62802/1540360925.py:5: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df.append(s,ignore_index=True)
|
A |
B |
C |
D |
0 |
0.317374 |
-1.337608 |
-2.674631 |
-0.420806 |
1 |
-0.428625 |
0.401127 |
1.002601 |
-0.600688 |
2 |
1.032571 |
0.239903 |
-0.153389 |
-2.722230 |
3 |
0.346756 |
-0.464619 |
0.874594 |
0.438894 |
4 |
-0.705527 |
0.719372 |
0.907008 |
0.656376 |
5 |
0.423865 |
-1.262852 |
0.721386 |
-0.614503 |
6 |
2.170255 |
1.199904 |
-1.468446 |
-0.809605 |
7 |
-1.030749 |
1.182177 |
-0.325211 |
-0.919963 |
8 |
0.346756 |
-0.464619 |
0.874594 |
0.438894 |
8. 分组 Groupby
“group by” 指的是涵盖下列一项或多项步骤的处理流程:
- 分割:按条件把数据分割成多组;
- 应用:为每组单独应用函数;
- 组合:将处理结果组合成一个数据结构。
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})
df
|
A |
B |
C |
D |
0 |
foo |
one |
0.319217 |
0.930735 |
1 |
bar |
one |
-1.410576 |
-0.613675 |
2 |
foo |
two |
0.426761 |
0.318037 |
3 |
bar |
three |
-0.310835 |
-2.186474 |
4 |
foo |
two |
1.340606 |
2.311946 |
5 |
bar |
two |
-0.531829 |
-0.047955 |
6 |
foo |
one |
0.588822 |
-0.115847 |
7 |
foo |
three |
0.274960 |
-0.639542 |
df.groupby('A').sum()
|
C |
D |
A |
|
|
bar |
-2.253241 |
-2.848104 |
foo |
2.950364 |
2.805329 |
df.groupby(['A','B']).sum()
|
|
C |
D |
A |
B |
|
|
bar |
one |
-1.410576 |
-0.613675 |
three |
-0.310835 |
-2.186474 |
two |
-0.531829 |
-0.047955 |
foo |
one |
0.908038 |
0.814888 |
three |
0.274960 |
-0.639542 |
two |
1.767366 |
2.629982 |
9. 重塑 reshaping
没弄明白,姑且先知道有这么个东西吧。
# 数据准备
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz','foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two','one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8,2),index = index, columns= ['A', 'B'])
df
df2 = df[:4]
df2
|
|
A |
B |
first |
second |
|
|
bar |
one |
1.082895 |
-0.845524 |
two |
-1.459212 |
-0.990315 |
baz |
one |
1.330689 |
0.391560 |
two |
1.543693 |
-0.546439 |
# 用stack()压缩至1层:DataFrame对象变成有多层索引
stacked = df2.stack()
stacked
first second
bar one A 1.082895
B -0.845524
two A -1.459212
B -0.990315
baz one A 1.330689
B 0.391560
two A 1.543693
B -0.546439
dtype: float64
# 反向操作 unstack():默认把最内层索引(此处是原来的列标签 A/B)重新展开为列,是 stack() 的逆操作;传入层级编号可指定展开哪一层索引
stacked.unstack()
|
|
A |
B |
first |
second |
|
|
bar |
one |
1.082895 |
-0.845524 |
two |
-1.459212 |
-0.990315 |
baz |
one |
1.330689 |
0.391560 |
two |
1.543693 |
-0.546439 |
stacked.unstack(1)
|
second |
one |
two |
first |
|
|
|
bar |
A |
1.082895 |
-1.459212 |
B |
-0.845524 |
-0.990315 |
baz |
A |
1.330689 |
1.543693 |
B |
0.391560 |
-0.546439 |
stacked.unstack(2)
|
|
A |
B |
first |
second |
|
|
bar |
one |
1.082895 |
-0.845524 |
two |
-1.459212 |
-0.990315 |
baz |
one |
1.330689 |
0.391560 |
two |
1.543693 |
-0.546439 |
10. 数据透视
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
'B': ['A', 'B', 'C'] * 4,
'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D': np.random.randn(12),
'E': np.random.randn(12)})
df
|
A |
B |
C |
D |
E |
0 |
one |
A |
foo |
-0.460053 |
0.696100 |
1 |
one |
B |
foo |
-0.377592 |
-1.117457 |
2 |
two |
C |
foo |
0.480715 |
1.824891 |
3 |
three |
A |
bar |
-0.327904 |
0.374917 |
4 |
one |
B |
bar |
-0.242910 |
-0.644575 |
5 |
one |
C |
bar |
-0.088203 |
-1.164071 |
6 |
two |
A |
foo |
-0.811716 |
-1.360298 |
7 |
three |
B |
foo |
0.013366 |
0.088306 |
8 |
one |
C |
foo |
1.680185 |
-0.770378 |
9 |
one |
A |
bar |
0.813511 |
-0.576679 |
10 |
two |
B |
bar |
0.888552 |
0.424111 |
11 |
three |
C |
bar |
-0.017553 |
0.335018 |
pd.pivot_table(df,values='D',index= ['A', 'B'], columns= 'C')
|
C |
bar |
foo |
A |
B |
|
|
one |
A |
0.813511 |
-0.460053 |
B |
-0.242910 |
-0.377592 |
C |
-0.088203 |
1.680185 |
three |
A |
-0.327904 |
NaN |
B |
NaN |
0.013366 |
C |
-0.017553 |
NaN |
two |
A |
NaN |
-0.811716 |
B |
0.888552 |
NaN |
C |
NaN |
0.480715 |
11. 时间序列
主要是针对时间序列的频率、时区、时间戳等的转换。
# 数据频率
rng = pd.date_range('1/1/2022', periods=100, freq='S') # 不同的时间格式都可以
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts
2022-01-01 00:00:00 353
2022-01-01 00:00:01 33
2022-01-01 00:00:02 411
2022-01-01 00:00:03 408
2022-01-01 00:00:04 236
...
2022-01-01 00:01:35 260
2022-01-01 00:01:36 60
2022-01-01 00:01:37 226
2022-01-01 00:01:38 361
2022-01-01 00:01:39 288
Freq: S, Length: 100, dtype: int64
ts.resample('5Min').sum()
# ts.resample()之后是一个<pandas.core.resample.DatetimeIndexResampler object at 0x111f64730>对象
# ts1 = ts.resample('5Min')
# ts1
2022-01-01 23659
Freq: 5T, dtype: int64
rng = pd.date_range('3/6/2022 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
ts
2022-03-06 0.229783
2022-03-07 1.244212
2022-03-08 -0.246107
2022-03-09 0.678679
2022-03-10 1.117801
Freq: D, dtype: float64
ts_utc = ts.tz_localize('UTC') # tz_localize 和 tz_convert
ts_utc
2022-03-06 00:00:00+00:00 0.229783
2022-03-07 00:00:00+00:00 1.244212
2022-03-08 00:00:00+00:00 -0.246107
2022-03-09 00:00:00+00:00 0.678679
2022-03-10 00:00:00+00:00 1.117801
Freq: D, dtype: float64
ts_utc.tz_convert('US/Eastern')
2022-03-05 19:00:00-05:00 0.229783
2022-03-06 19:00:00-05:00 1.244212
2022-03-07 19:00:00-05:00 -0.246107
2022-03-08 19:00:00-05:00 0.678679
2022-03-09 19:00:00-05:00 1.117801
Freq: D, dtype: float64
rng = pd.date_range('1/1/2022', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
2022-01-31 -0.084049
2022-02-28 -0.291836
2022-03-31 -1.579093
2022-04-30 0.332739
2022-05-31 -0.247872
Freq: M, dtype: float64
ps = ts.to_period() # 日期转换为月份
ps
2022-01 -0.084049
2022-02 -0.291836
2022-03 -1.579093
2022-04 0.332739
2022-05 -0.247872
Freq: M, dtype: float64
ps.to_timestamp() # 日期转换为时间戳
2022-01-01 -0.084049
2022-02-01 -0.291836
2022-03-01 -1.579093
2022-04-01 0.332739
2022-05-01 -0.247872
Freq: MS, dtype: float64
案例:把以11月为结束年份的季度频率转换为下一季度月末上午9点
- .asfreq()
- .period_range()
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 # asfreq('M','e') 取季度末月份,+1 得到下一季度首月;asfreq('H','s') 转为小时频率并取月初0点,+9 即上午9点
ts.head()
1990-03-01 09:00 1.473524
1990-06-01 09:00 2.420399
1990-09-01 09:00 0.157428
1990-12-01 09:00 1.361583
1991-03-01 09:00 0.654051
Freq: H, dtype: float64
12. 类别型(Categoricals)
对DataFrame中的数据指定类别,这个好像可以用到对资产负债表里面不同科目的设定“资产”、“负债”这样的用途。好像还不是这样哦
df = pd.DataFrame({
"id": [1, 2, 3, 4, 5, 6],
"raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']
})
df
|
id |
raw_grade |
0 |
1 |
a |
1 |
2 |
b |
2 |
3 |
b |
3 |
4 |
a |
4 |
5 |
a |
5 |
6 |
e |
df['grade'] = df['raw_grade'].astype('category')
df['grade'], df
(0 a
1 b
2 b
3 a
4 a
5 e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e'],
id raw_grade grade
0 1 a a
1 2 b b
2 3 b b
3 4 a a
4 5 a a
5 6 e e)
Series.cat.categories
: 用有含义的名字命名不同的类型。
df['grade'].cat.categories= ['very good','good', 'bad'] # 注意:新版 pandas 已不支持直接给 .cat.categories 赋值,应改用 .cat.rename_categories([...])
Series.cat.set_categories
: 重新排序类别,并添加缺失值
df['grade'] = df['grade'].cat.set_categories(['very bad', 'bad','medium','good','very good'])
df['grade'] # 这里的重命名是按位置对应的:原类别 ['a','b','e'] 的第3个 'e' 被改名为 'bad';set_categories 只负责重排/扩充类别列表,不会改变已有值的映射
0 very good
1 good
2 good
3 very good
4 very good
5 bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']
df.sort_values(by='grade') # 意思是不是按 a=verygood,b=good,... 这样子分等级哦?
|
id |
raw_grade |
grade |
5 |
6 |
e |
bad |
1 |
2 |
b |
good |
2 |
3 |
b |
good |
0 |
1 |
a |
very good |
3 |
4 |
a |
very good |
4 |
5 |
a |
very good |
df.groupby('grade').size() # 分类列分组时默认(observed=False)会列出所有已定义类别,包括计数为0的 very bad 和 medium
grade
very bad 0
bad 1
medium 0
good 2
very good 3
dtype: int64
13. 可视化
用matplotlib实现可视化。
# 1条时间序列
ts = pd.Series(np.random.randn(1000),index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
<AxesSubplot:>
# n条时间序列
import matplotlib.pyplot as plt # 绘图需要配合 matplotlib 使用
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure() # matplotlib 函数:新建一个图形(Figure)窗口
df.plot()
plt.legend(loc='best')
<matplotlib.legend.Legend at 0x11e2d6a70>
<Figure size 432x288 with 0 Axes>
14. 输入/输出
对不同数据源的读写:
格式 |
写 |
读 |
CSV |
df.to_csv('/path') |
pd.read_csv('/path') |
HDF5 |
df.to_hdf('/path') |
pd.read_hdf('/path') |
Excel |
df.to_excel('/path', sheetname= ) |
pd.read_excel('/path', 'sheetname', index_col, na_values=['NA'] ) |
- df: DataFrame对象
- pd:pandas对象
# CSV
df.to_csv('test.csv')
pd.read_csv('test.csv')
|
Unnamed: 0 |
A |
B |
C |
D |
0 |
2000-01-01 |
-0.202417 |
0.456969 |
-2.924227 |
0.515216 |
1 |
2000-01-02 |
-0.438973 |
-0.123265 |
-2.138087 |
0.180029 |
2 |
2000-01-03 |
-1.314234 |
0.729317 |
-3.167284 |
-0.970958 |
3 |
2000-01-04 |
-0.334582 |
1.380405 |
-3.591633 |
-0.134346 |
4 |
2000-01-05 |
-2.179493 |
1.489109 |
-3.695347 |
-1.412114 |
... |
... |
... |
... |
... |
... |
995 |
2002-09-22 |
-0.249507 |
43.834701 |
-15.442732 |
-2.111668 |
996 |
2002-09-23 |
-1.792658 |
45.252003 |
-15.524048 |
-1.902708 |
997 |
2002-09-24 |
-2.880995 |
47.077331 |
-16.008287 |
-2.276801 |
998 |
2002-09-25 |
-2.008815 |
48.172294 |
-16.671531 |
-2.360542 |
999 |
2002-09-26 |
-1.788311 |
47.357009 |
-18.158131 |
-2.731052 |
1000 rows × 5 columns
# HDF5
df.to_hdf('foo.h5', 'df') # 需要pytable库,安装:pip install tables.
pd.read_hdf('foo.h5', 'df')
|
A |
B |
C |
D |
2000-01-01 |
-0.202417 |
0.456969 |
-2.924227 |
0.515216 |
2000-01-02 |
-0.438973 |
-0.123265 |
-2.138087 |
0.180029 |
2000-01-03 |
-1.314234 |
0.729317 |
-3.167284 |
-0.970958 |
2000-01-04 |
-0.334582 |
1.380405 |
-3.591633 |
-0.134346 |
2000-01-05 |
-2.179493 |
1.489109 |
-3.695347 |
-1.412114 |
... |
... |
... |
... |
... |
2002-09-22 |
-0.249507 |
43.834701 |
-15.442732 |
-2.111668 |
2002-09-23 |
-1.792658 |
45.252003 |
-15.524048 |
-1.902708 |
2002-09-24 |
-2.880995 |
47.077331 |
-16.008287 |
-2.276801 |
2002-09-25 |
-2.008815 |
48.172294 |
-16.671531 |
-2.360542 |
2002-09-26 |
-1.788311 |
47.357009 |
-18.158131 |
-2.731052 |
1000 rows × 4 columns
# Excel
df.to_excel('test.xlsx', sheet_name='pandas')
pd.read_excel('test.xlsx', 'pandas', index_col = None, na_values=['NA'])
|
Unnamed: 0 |
A |
B |
C |
D |
0 |
2000-01-01 |
-0.202417 |
0.456969 |
-2.924227 |
0.515216 |
1 |
2000-01-02 |
-0.438973 |
-0.123265 |
-2.138087 |
0.180029 |
2 |
2000-01-03 |
-1.314234 |
0.729317 |
-3.167284 |
-0.970958 |
3 |
2000-01-04 |
-0.334582 |
1.380405 |
-3.591633 |
-0.134346 |
4 |
2000-01-05 |
-2.179493 |
1.489109 |
-3.695347 |
-1.412114 |
... |
... |
... |
... |
... |
... |
995 |
2002-09-22 |
-0.249507 |
43.834701 |
-15.442732 |
-2.111668 |
996 |
2002-09-23 |
-1.792658 |
45.252003 |
-15.524048 |
-1.902708 |
997 |
2002-09-24 |
-2.880995 |
47.077331 |
-16.008287 |
-2.276801 |
998 |
2002-09-25 |
-2.008815 |
48.172294 |
-16.671531 |
-2.360542 |
999 |
2002-09-26 |
-1.788311 |
47.357009 |
-18.158131 |
-2.731052 |
1000 rows × 5 columns
15. 各种坑(报错) Gotchas
链接断了。