import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
创建一个Series ,同时让pandas自动生成索引列
# The NaN entry makes pandas store the values as float64; no index is passed,
# so pandas generates the default integer index.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
# Display s
s
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
创建一个DataFrame数据框
### 创建一个DataFrame ,可以传入一个numpy array 可以自己构建索引以及列标
# Build a daily DatetimeIndex of 7 consecutive dates starting at 2018-11-01.
dates = pd.date_range(start="2018-11-01", periods=7, freq="D")
dates
DatetimeIndex(['2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04',
'2018-11-05', '2018-11-06', '2018-11-07'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(7,4),index= dates,columns=list('ABCD'))
df
# 产生随机正态分布的数据,7行4列,分别对应的index的长度以及column的长度
|
A |
B |
C |
D |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
-1.325547 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
0.637321 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
### A DataFrame can also be created from a dict that maps column names to values
# Scalar entries (A, B, E) are broadcast to the length of the array-like
# columns (C, D), giving a 4-row frame with mixed dtypes.
df2 = pd.DataFrame(
    {
        "A": 1,
        "B": "20181101",
        "C": np.full(4, 3, dtype="int32"),
        "D": pd.Categorical(["test", "train", "test", "train"]),
        "E": 1.5,
    }
)
df2
|
A |
B |
C |
D |
E |
0 |
1 |
20181101 |
3 |
test |
1.5 |
1 |
1 |
20181101 |
3 |
train |
1.5 |
2 |
1 |
20181101 |
3 |
test |
1.5 |
3 |
1 |
20181101 |
3 |
train |
1.5 |
df2.dtypes
### 查看数据框中的数据类型,常见的数据类型还有时间类型以及float类型
A int64
B object
C int32
D category
E float64
dtype: object
查看数据
# 比如说看前5行
df.head()
|
A |
B |
C |
D |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
-1.325547 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
0.637321 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
# 后4行
df.tail(4)
|
A |
B |
C |
D |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
# 查看DataFrame的索引
df.index
DatetimeIndex(['2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04',
'2018-11-05', '2018-11-06', '2018-11-07'],
dtype='datetime64[ns]', freq='D')
# 查看DataFrame的列索引
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
# 查看DataFrame的数据,将DataFrame转化为numpy array 的数据形式
df.values
array([[ 2.19709382, 0.90891281, -0.64802911, -1.32554721],
[ 0.35466158, -1.22424591, -0.50120854, -1.49017025],
[-0.24583358, -1.04959585, 2.36622453, 0.6373212 ],
[-0.6899396 , 0.47128154, -1.41740143, 0.26890482],
[-0.54804068, -0.84193368, 0.57312781, -1.05517487],
[-0.6910726 , 0.93301611, 1.85764662, 0.77552552],
[ 0.46707509, 0.36240665, 2.31937488, -0.721314 ]])
数据的简单统计
# 可以使用describe函数对DataFrame中的数值型数据进行统计
df.describe()
|
A |
B |
C |
D |
count |
7.000000 |
7.000000 |
7.000000 |
7.000000 |
mean |
0.120563 |
-0.062880 |
0.649962 |
-0.415779 |
std |
1.031487 |
0.942664 |
1.553537 |
0.955789 |
min |
-0.691073 |
-1.224246 |
-1.417401 |
-1.490170 |
25% |
-0.618990 |
-0.945765 |
-0.574619 |
-1.190361 |
50% |
-0.245834 |
0.362407 |
0.573128 |
-0.721314 |
75% |
0.410868 |
0.690097 |
2.088511 |
0.453113 |
max |
2.197094 |
0.933016 |
2.366225 |
0.775526 |
df2.describe()
### 对于其他的数据类型的数据describe函数会自动过滤掉
|
A |
C |
E |
count |
4.0 |
4.0 |
4.0 |
mean |
1.0 |
3.0 |
1.5 |
std |
0.0 |
0.0 |
0.0 |
min |
1.0 |
3.0 |
1.5 |
25% |
1.0 |
3.0 |
1.5 |
50% |
1.0 |
3.0 |
1.5 |
75% |
1.0 |
3.0 |
1.5 |
max |
1.0 |
3.0 |
1.5 |
### DataFrame 的转置,将列索引与行索引进行调换,行数据与列数进行调换
df.T
|
2018-11-01 00:00:00 |
2018-11-02 00:00:00 |
2018-11-03 00:00:00 |
2018-11-04 00:00:00 |
2018-11-05 00:00:00 |
2018-11-06 00:00:00 |
2018-11-07 00:00:00 |
A |
2.197094 |
0.354662 |
-0.245834 |
-0.689940 |
-0.548041 |
-0.691073 |
0.467075 |
B |
0.908913 |
-1.224246 |
-1.049596 |
0.471282 |
-0.841934 |
0.933016 |
0.362407 |
C |
-0.648029 |
-0.501209 |
2.366225 |
-1.417401 |
0.573128 |
1.857647 |
2.319375 |
D |
-1.325547 |
-1.490170 |
0.637321 |
0.268905 |
-1.055175 |
0.775526 |
-0.721314 |
df
|
A |
B |
C |
D |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
-1.325547 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
0.637321 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
数据的排序
df.sort_index(ascending=False)
### Sort the rows by the index (the dates) in descending order
|
A |
B |
C |
D |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
0.637321 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
-1.325547 |
print(df.sort_values(by=['B','A']))
# Ascending by default. Several columns may be given: rows are sorted by B first, and rows with equal B are then ordered by A
df.sort_values(by='B')
A B C D
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596 2.366225 0.637321
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175
2018-11-07 0.467075 0.362407 2.319375 -0.721314
2018-11-04 -0.689940 0.471282 -1.417401 0.268905
2018-11-01 2.197094 0.908913 -0.648029 -1.325547
2018-11-06 -0.691073 0.933016 1.857647 0.775526
|
A |
B |
C |
D |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
0.637321 |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
-1.325547 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
选择数据(类似于数据库中sql语句)
df['A']
# 取出单独的一列数据,等价于df.A
2018-11-01 2.197094
2018-11-02 0.354662
2018-11-03 -0.245834
2018-11-04 -0.689940
2018-11-05 -0.548041
2018-11-06 -0.691073
2018-11-07 0.467075
Freq: D, Name: A, dtype: float64
# 通过[]进行行选择切片
df[0:3]
|
A |
B |
C |
D |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
-1.325547 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
0.637321 |
# 同时对于时间索引而言,可以直接使用比如
df['2018-11-01':'2018-11-04']
|
A |
B |
C |
D |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
-1.325547 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
0.637321 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
另外可以使用标签来选择
df.loc['2018-11-01']
A 2.197094
B 0.908913
C -0.648029
D -1.325547
Name: 2018-11-01 00:00:00, dtype: float64
#### 通过标签来进行多个轴上的进行选择
df.loc[:,["A","B"]] # 等价于df[["A","B"]]
|
A |
B |
2018-11-01 |
2.197094 |
0.908913 |
2018-11-02 |
0.354662 |
-1.224246 |
2018-11-03 |
-0.245834 |
-1.049596 |
2018-11-04 |
-0.689940 |
0.471282 |
2018-11-05 |
-0.548041 |
-0.841934 |
2018-11-06 |
-0.691073 |
0.933016 |
2018-11-07 |
0.467075 |
0.362407 |
df.loc["2018-11-01":"2018-11-03",["A","B"]]
|
A |
B |
2018-11-01 |
2.197094 |
0.908913 |
2018-11-02 |
0.354662 |
-1.224246 |
2018-11-03 |
-0.245834 |
-1.049596 |
#### 获得一个标量数据
df.loc['2018-11-01','A']
2.1970938156943904
通过位置获取数据
df.iloc[3] # 获得第四行的数据
A -0.689940
B 0.471282
C -1.417401
D 0.268905
Name: 2018-11-04 00:00:00, dtype: float64
df.iloc[1:3,1:4] # 与numpy中的ndarray类似
|
B |
C |
D |
2018-11-02 |
-1.224246 |
-0.501209 |
-1.490170 |
2018-11-03 |
-1.049596 |
2.366225 |
0.637321 |
# 可以选取不连续的行或者列进行取值
df.iloc[[1,3],[1,3]]
|
B |
D |
2018-11-02 |
-1.224246 |
-1.490170 |
2018-11-04 |
0.471282 |
0.268905 |
# 对行进行切片处理
df.iloc[1:3,:]
|
A |
B |
C |
D |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
0.637321 |
# 对列进行切片
df.iloc[:,1:4]
|
B |
C |
D |
2018-11-01 |
0.908913 |
-0.648029 |
-1.325547 |
2018-11-02 |
-1.224246 |
-0.501209 |
-1.490170 |
2018-11-03 |
-1.049596 |
2.366225 |
0.637321 |
2018-11-04 |
0.471282 |
-1.417401 |
0.268905 |
2018-11-05 |
-0.841934 |
0.573128 |
-1.055175 |
2018-11-06 |
0.933016 |
1.857647 |
0.775526 |
2018-11-07 |
0.362407 |
2.319375 |
-0.721314 |
# 获取特定的值
df.iloc[1,3]
-1.4901702546027098
布尔值索引
# 使用单列的数据作为条件进行筛选
df[df.A>0]
|
A |
B |
C |
D |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
-1.325547 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
#很少用到,很少使用这种大范围的条件进行筛选
df[df>0]
|
A |
B |
C |
D |
2018-11-01 |
2.197094 |
0.908913 |
NaN |
NaN |
2018-11-02 |
0.354662 |
NaN |
NaN |
NaN |
2018-11-03 |
NaN |
NaN |
2.366225 |
0.637321 |
2018-11-04 |
NaN |
0.471282 |
NaN |
0.268905 |
2018-11-05 |
NaN |
NaN |
0.573128 |
NaN |
2018-11-06 |
NaN |
0.933016 |
1.857647 |
0.775526 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
NaN |
# 使用isin()方法过滤
df2.head()
|
A |
B |
C |
D |
E |
0 |
1 |
20181101 |
3 |
test |
1.5 |
1 |
1 |
20181101 |
3 |
train |
1.5 |
2 |
1 |
20181101 |
3 |
test |
1.5 |
3 |
1 |
20181101 |
3 |
train |
1.5 |
df2[df2['D'].isin(['test'])]
|
A |
B |
C |
D |
E |
0 |
1 |
20181101 |
3 |
test |
1.5 |
2 |
1 |
20181101 |
3 |
test |
1.5 |
设定数值(类似于sql update 或者add)
df['E'] = [1,2,3,4,5,6,7]
df
|
A |
B |
C |
D |
E |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
-1.325547 |
1 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
2 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
0.637321 |
3 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
4 |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
5 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
6 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
7 |
df.loc['2018-11-01','E']= 10 # 第一行,E列的数据修改为10
df
|
A |
B |
C |
D |
E |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
-1.325547 |
10 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
2 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
0.637321 |
3 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
4 |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
5 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
6 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
7 |
df.iloc[1,4]=5000 # 第二行第五列数据修改为5000
df
|
A |
B |
C |
D |
E |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
-1.325547 |
10 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
5000 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
0.637321 |
3 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
4 |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
5 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
6 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
7 |
# Take the element-wise absolute value so that every entry is non-negative.
# DataFrame.abs() returns a new frame, so df itself is left unchanged.
df3 = df.abs()
df3  # all values are now non-negative
|
A |
B |
C |
D |
E |
2018-11-01 |
2.197094 |
0.908913 |
0.648029 |
1.325547 |
10 |
2018-11-02 |
0.354662 |
1.224246 |
0.501209 |
1.490170 |
5000 |
2018-11-03 |
0.245834 |
1.049596 |
2.366225 |
0.637321 |
3 |
2018-11-04 |
0.689940 |
0.471282 |
1.417401 |
0.268905 |
4 |
2018-11-05 |
0.548041 |
0.841934 |
0.573128 |
1.055175 |
5 |
2018-11-06 |
0.691073 |
0.933016 |
1.857647 |
0.775526 |
6 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
0.721314 |
7 |
缺失值处理
df
|
A |
B |
C |
D |
E |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
-1.325547 |
10 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
-1.490170 |
5000 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
0.637321 |
3 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
4 |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
5 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
6 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
7 |
df['E']=[1,np.nan,2,np.nan,4,np.nan,6]
df.loc['2018-11-01':'2018-11-03','D']=np.nan
df
|
A |
B |
C |
D |
E |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
NaN |
1.0 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
NaN |
NaN |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
NaN |
2.0 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
NaN |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
4.0 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
NaN |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
6.0 |
df4 = df.copy()
df4.dropna(how='any')
|
A |
B |
C |
D |
E |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
4.0 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
6.0 |
df4.dropna(how='all')
# """DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)"""
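# axis: 0 / 'index' drops rows, 1 / 'columns' drops columns
# how: 'any' drops when any value is missing, 'all' only when every value is missing
# thresh: minimum number of non-missing values required to keep a row/column
# subset: labels along the other axis to inspect for missing values
# inplace: if True, modify the DataFrame in place instead of returning a new one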
|
A |
B |
C |
D |
E |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
NaN |
1.0 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
NaN |
NaN |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
NaN |
2.0 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
NaN |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
4.0 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
NaN |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
6.0 |
df4.fillna(1000)
|
A |
B |
C |
D |
E |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
1000.000000 |
1.0 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
1000.000000 |
1000.0 |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
1000.000000 |
2.0 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
1000.0 |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
4.0 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
1000.0 |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
6.0 |
pd.isnull(df4)
|
A |
B |
C |
D |
E |
2018-11-01 |
False |
False |
False |
True |
False |
2018-11-02 |
False |
False |
False |
True |
True |
2018-11-03 |
False |
False |
False |
True |
False |
2018-11-04 |
False |
False |
False |
False |
True |
2018-11-05 |
False |
False |
False |
False |
False |
2018-11-06 |
False |
False |
False |
False |
True |
2018-11-07 |
False |
False |
False |
False |
False |
数据操作
#统计的工作一般情况下都不包含缺失值,
df4.mean()
# 默认是对列进行求平均,沿着行方向也就是axis=0
A 0.120563
B -0.062880
C 0.649962
D -0.183015
E 3.250000
dtype: float64
df4.mean(axis=1)
# 沿着列方向求每行的平均
2018-11-01 0.864494
2018-11-02 -0.456931
2018-11-03 0.767699
2018-11-04 -0.341789
2018-11-05 0.425596
2018-11-06 0.718779
2018-11-07 1.685509
Freq: D, dtype: float64
# 对于拥有不同维度,需要对齐的对象进行操作。Pandas会自动的沿着指定的维度进行广播:
s = pd.Series([1,3,4,np.nan,6,7,8],index=dates)
s
2018-11-01 1.0
2018-11-02 3.0
2018-11-03 4.0
2018-11-04 NaN
2018-11-05 6.0
2018-11-06 7.0
2018-11-07 8.0
Freq: D, dtype: float64
df4.sub(s,axis='index')
|
A |
B |
C |
D |
E |
2018-11-01 |
1.197094 |
-0.091087 |
-1.648029 |
NaN |
0.0 |
2018-11-02 |
-2.645338 |
-4.224246 |
-3.501209 |
NaN |
NaN |
2018-11-03 |
-4.245834 |
-5.049596 |
-1.633775 |
NaN |
-2.0 |
2018-11-04 |
NaN |
NaN |
NaN |
NaN |
NaN |
2018-11-05 |
-6.548041 |
-6.841934 |
-5.426872 |
-7.055175 |
-2.0 |
2018-11-06 |
-7.691073 |
-6.066984 |
-5.142353 |
-6.224474 |
NaN |
2018-11-07 |
-7.532925 |
-7.637593 |
-5.680625 |
-8.721314 |
-2.0 |
df4
|
A |
B |
C |
D |
E |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
NaN |
1.0 |
2018-11-02 |
0.354662 |
-1.224246 |
-0.501209 |
NaN |
NaN |
2018-11-03 |
-0.245834 |
-1.049596 |
2.366225 |
NaN |
2.0 |
2018-11-04 |
-0.689940 |
0.471282 |
-1.417401 |
0.268905 |
NaN |
2018-11-05 |
-0.548041 |
-0.841934 |
0.573128 |
-1.055175 |
4.0 |
2018-11-06 |
-0.691073 |
0.933016 |
1.857647 |
0.775526 |
NaN |
2018-11-07 |
0.467075 |
0.362407 |
2.319375 |
-0.721314 |
6.0 |
df4.apply(np.cumsum)
|
A |
B |
C |
D |
E |
2018-11-01 |
2.197094 |
0.908913 |
-0.648029 |
NaN |
1.0 |
2018-11-02 |
2.551755 |
-0.315333 |
-1.149238 |
NaN |
NaN |
2018-11-03 |
2.305922 |
-1.364929 |
1.216987 |
NaN |
3.0 |
2018-11-04 |
1.615982 |
-0.893647 |
-0.200415 |
0.268905 |
NaN |
2018-11-05 |
1.067942 |
-1.735581 |
0.372713 |
-0.786270 |
7.0 |
2018-11-06 |
0.376869 |
-0.802565 |
2.230360 |
-0.010745 |
NaN |
2018-11-07 |
0.843944 |
-0.440158 |
4.549735 |
-0.732059 |
13.0 |
df4.apply(lambda x: x.max()-x.min())
A 2.888166
B 2.157262
C 3.783626
D 1.830700
E 5.000000
dtype: float64
统计个数与离散化
s = pd.Series(np.random.randint(0,7,size=15))
s
0 1
1 6
2 3
3 1
4 1
5 0
6 4
7 1
8 3
9 4
10 6
11 1
12 4
13 3
14 5
dtype: int32
s.value_counts()
# 统计元素的个数,并按照元素统计量进行排序,未出现的元素不会显示出来
1 5
4 3
3 3
6 2
5 1
0 1
dtype: int64
# Count the occurrences of each value and list them in the fixed order 0..6;
# values that never occur get a count of 0 through fill_value.
# NOTE: the output recorded below was produced by the earlier
# `s.reindex(range(0,7))` call, which merely selects the elements of s at
# index 0..6 and is not a count.
s.value_counts().reindex(range(0, 7), fill_value=0)
0 1
1 6
2 3
3 1
4 1
5 0
6 4
dtype: int32
s.mode()
# 众数
0 1
dtype: int32
# Continuous values can be discretized with cut (bins based on values) or
# qcut (bins based on sample quantiles)
arr = np.random.randint(0,20,size=15) # 15 random integers drawn uniformly from [0, 20) (not normally distributed)
arr
array([ 3, 14, 10, 2, 2, 0, 17, 13, 7, 0, 15, 14, 4, 19, 9])
factor = pd.cut(arr,3)
factor
[(-0.019, 6.333], (12.667, 19.0], (6.333, 12.667], (-0.019, 6.333], (-0.019, 6.333], ..., (12.667, 19.0], (12.667, 19.0], (-0.019, 6.333], (12.667, 19.0], (6.333, 12.667]]
Length: 15
Categories (3, interval[float64]): [(-0.019, 6.333] < (6.333, 12.667] < (12.667, 19.0]]
# Count how many values fall into each bin. The top-level pd.value_counts
# helper is deprecated in recent pandas releases; the Series method is the
# supported equivalent and sorts by count in the same way.
pd.Series(factor).value_counts()
(12.667, 19.0] 6
(-0.019, 6.333] 6
(6.333, 12.667] 3
dtype: int64
# Cut with explicit bin edges: (-1, 5], (5, 10], (10, 15], (15, 20].
factor1 = pd.cut(arr, bins=[-1, 5, 10, 15, 20])
# Series.value_counts replaces the deprecated top-level pd.value_counts.
pd.Series(factor1).value_counts()
(-1, 5] 6
(10, 15] 4
(5, 10] 3
(15, 20] 2
dtype: int64
# Quantile-based bins: the edges are the 0/25/50/75/100% sample quantiles of arr.
factor2 = pd.qcut(arr, q=[0, 0.25, 0.5, 0.75, 1])
# Series.value_counts replaces the deprecated top-level pd.value_counts.
pd.Series(factor2).value_counts()
(9.0, 14.0] 4
(2.5, 9.0] 4
(-0.001, 2.5] 4
(14.0, 19.0] 3
dtype: int64