处理丢失数据
有两种丢失数据:None 和 np.nan(NaN)。
import numpy as np
type(None)
NoneType
type(np.nan)
float
1. None
None是Python自带的,其类型为NoneType;包含None的NumPy数组,其dtype为object。因此,None不能参与到算术运算中。
object类型的运算要比int类型的运算慢得多
计算不同数据类型求和时间
%timeit np.arange(1E6, dtype=xxx).sum()
1E7
10000000.0
%timeit np.arange(1E6, dtype= int).sum()
1.67 ms ± 79.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit np.arange(1E6, dtype = float).sum()
1.58 ms ± 14.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit np.arange(1E6,dtype = object).sum()
68.1 ms ± 226 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
2. np.nan(NaN)
np.nan是浮点类型,能参与到计算中。但计算的结果总是NaN。
但可以使用np.nan*()系列函数(如np.nansum、np.nanmean)来计算含nan的数据,此时nan会被忽略(np.nansum相当于把nan当作0;np.nanmean只对非nan的元素求平均)。
nd = np.array([10,20,30,np.nan,None])
#None 不能够参加到运算当中
nd.sum()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-8-eb79efca2123> in <module>
1 nd = np.array([10,20,30,np.nan,None])
2 #None 不能够参加到运算当中
----> 3 nd.sum()
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/numpy/core/_methods.py in _sum(a, axis, dtype, out, keepdims, initial, where)
36 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
37 initial=_NoValue, where=True):
---> 38 return umr_sum(a, axis, dtype, out, keepdims, initial, where)
39
40 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,
TypeError: unsupported operand type(s) for +: 'float' and 'NoneType'
nd = np.array([10,20,30,np.nan])
nd
array([10., 20., 30., nan])
nd.sum()
nan
np.mean(nd)
nan
np.nanmean(nd)
20.0
np.nansum(nd)
60.0
np.nan
nan
3. pandas中的None与NaN
1) pandas中None与np.nan都视作np.nan
创建DataFrame
import pandas as pd
from pandas import Series,DataFrame
df = DataFrame([10,20,57,None,np.nan], index = list('abcde'), columns = ["Python"])
df
|
Python |
a |
10.0 |
b |
20.0 |
c |
57.0 |
d |
NaN |
e |
NaN |
df.sum()
Python 87.0
dtype: float64
df = DataFrame([[10,20,57,None,np.nan],
[22,33,56,16,None],
[np.nan,1,2,3,4]], index = list("abc"), columns = ["Python","Java","物理","数学","H5"])
df
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20 |
57 |
NaN |
NaN |
b |
22.0 |
33 |
56 |
16.0 |
NaN |
c |
NaN |
1 |
2 |
3.0 |
4.0 |
df.sum(axis = 0)
Python 32.0
Java 54.0
物理 115.0
数学 19.0
H5 4.0
dtype: float64
使用DataFrame行索引与列索引修改DataFrame数据(下面这种链式索引赋值会触发SettingWithCopyWarning,推荐写成 df.loc["c", "Python"] = 12)
df["Python"]["c"] = 12
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
df
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20 |
57 |
NaN |
NaN |
b |
22.0 |
33 |
56 |
16.0 |
NaN |
c |
12.0 |
1 |
2 |
3.0 |
4.0 |
2) pandas中None与np.nan的操作
df = DataFrame([[10,20,57,None,np.nan],
[22,33,56,16,None],
[np.nan,1,2,3,4]], index = list("abc"), columns = ["Python","Java","物理","数学","H5"])
df
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20 |
57 |
NaN |
NaN |
b |
22.0 |
33 |
56 |
16.0 |
NaN |
c |
NaN |
1 |
2 |
3.0 |
4.0 |
#下面讲的是一个重点!!!!
isnull():判断元素是否为空值
notnull():判断元素是否不为空值
dropna():过滤丢失数据
fillna():填充丢失数据
df
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20 |
57 |
NaN |
NaN |
b |
22.0 |
33 |
56 |
16.0 |
NaN |
c |
NaN |
1 |
2 |
3.0 |
4.0 |
#DataFrame 的isnull这个函数返回值就是一个DataFrame
is_null = df.isnull()
is_null
#
|
Python |
Java |
物理 |
数学 |
H5 |
a |
False |
False |
False |
True |
True |
b |
False |
False |
False |
False |
True |
c |
True |
False |
False |
False |
False |
#需求:查看哪一行有空值,举数据分析的例子的时候,会用这个方法
is_null = is_null.any(axis = 1)
is_null
a True
b True
c True
dtype: bool
df2 = DataFrame([[10,20,57,90,28],[22,35,46,78,67],[21,34,23,77,66]],
index = list("efg"),columns = ["Python","Java","物理","数学","H5"] )
#没空值的数据
df2
|
Python |
Java |
物理 |
数学 |
H5 |
e |
10 |
20 |
57 |
90 |
28 |
f |
22 |
35 |
46 |
78 |
67 |
g |
21 |
34 |
23 |
77 |
66 |
df3 = df.add(df2, fill_value = 0)
df3
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20.0 |
57.0 |
NaN |
NaN |
b |
22.0 |
33.0 |
56.0 |
16.0 |
NaN |
c |
NaN |
1.0 |
2.0 |
3.0 |
4.0 |
e |
10.0 |
20.0 |
57.0 |
90.0 |
28.0 |
f |
22.0 |
35.0 |
46.0 |
78.0 |
67.0 |
g |
21.0 |
34.0 |
23.0 |
77.0 |
66.0 |
df3_isnull = df3.isnull()
df3_isnull = df3_isnull.any(axis = 1)
df3_isnull
a True
b True
c True
e False
f False
g False
dtype: bool
df3[df3_isnull]
#过滤问题 过滤的是没有空值的,留下来的是带空值的!!!
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20.0 |
57.0 |
NaN |
NaN |
b |
22.0 |
33.0 |
56.0 |
16.0 |
NaN |
c |
NaN |
1.0 |
2.0 |
3.0 |
4.0 |
df
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20 |
57 |
NaN |
NaN |
b |
22.0 |
33 |
56 |
16.0 |
NaN |
c |
NaN |
1 |
2 |
3.0 |
4.0 |
df[is_null]
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20 |
57 |
NaN |
NaN |
b |
22.0 |
33 |
56 |
16.0 |
NaN |
c |
NaN |
1 |
2 |
3.0 |
4.0 |
(1)判断函数
df3
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20.0 |
57.0 |
NaN |
NaN |
b |
22.0 |
33.0 |
56.0 |
16.0 |
NaN |
c |
NaN |
1.0 |
2.0 |
3.0 |
4.0 |
e |
10.0 |
20.0 |
57.0 |
90.0 |
28.0 |
f |
22.0 |
35.0 |
46.0 |
78.0 |
67.0 |
g |
21.0 |
34.0 |
23.0 |
77.0 |
66.0 |
df3_notnull = df3.notnull().all(axis = 1)
df3_notnull
a False
b False
c False
e True
f True
g True
dtype: bool
df3[df3_notnull]
#过滤的是空值,留下来的是没有空值的情况
|
Python |
Java |
物理 |
数学 |
H5 |
e |
10.0 |
20.0 |
57.0 |
90.0 |
28.0 |
f |
22.0 |
35.0 |
46.0 |
78.0 |
67.0 |
g |
21.0 |
34.0 |
23.0 |
77.0 |
66.0 |
#还可以通过条件来进行过滤
df3
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20.0 |
57.0 |
NaN |
NaN |
b |
22.0 |
33.0 |
56.0 |
16.0 |
NaN |
c |
NaN |
1.0 |
2.0 |
3.0 |
4.0 |
e |
10.0 |
20.0 |
57.0 |
90.0 |
28.0 |
f |
22.0 |
35.0 |
46.0 |
78.0 |
67.0 |
g |
21.0 |
34.0 |
23.0 |
77.0 |
66.0 |
cond = (df3 >= 10).all(axis= 1)
cond
a False
b False
c False
e True
f True
g True
dtype: bool
df3[cond]
|
Python |
Java |
物理 |
数学 |
H5 |
e |
10.0 |
20.0 |
57.0 |
90.0 |
28.0 |
f |
22.0 |
35.0 |
46.0 |
78.0 |
67.0 |
g |
21.0 |
34.0 |
23.0 |
77.0 |
66.0 |
(2) 过滤函数
df3
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20.0 |
57.0 |
NaN |
NaN |
b |
22.0 |
33.0 |
56.0 |
16.0 |
NaN |
c |
NaN |
1.0 |
2.0 |
3.0 |
4.0 |
e |
10.0 |
20.0 |
57.0 |
90.0 |
28.0 |
f |
22.0 |
35.0 |
46.0 |
78.0 |
67.0 |
g |
21.0 |
34.0 |
23.0 |
77.0 |
66.0 |
df3.dropna()
|
Python |
Java |
物理 |
数学 |
H5 |
e |
10.0 |
20.0 |
57.0 |
90.0 |
28.0 |
f |
22.0 |
35.0 |
46.0 |
78.0 |
67.0 |
g |
21.0 |
34.0 |
23.0 |
77.0 |
66.0 |
df3["H5"] = None
df3
#这不是pandas的bug:整列赋值为None后该列是object类型,所以显示为None;但pandas仍然把它当作缺失值(与NaN一样)处理
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20.0 |
57.0 |
NaN |
None |
b |
22.0 |
33.0 |
56.0 |
16.0 |
None |
c |
NaN |
1.0 |
2.0 |
3.0 |
None |
e |
10.0 |
20.0 |
57.0 |
90.0 |
None |
f |
22.0 |
35.0 |
46.0 |
78.0 |
None |
g |
21.0 |
34.0 |
23.0 |
77.0 |
None |
df3.dropna(axis = 1,how = "all")
|
Python |
Java |
物理 |
数学 |
a |
10.0 |
20.0 |
57.0 |
NaN |
b |
22.0 |
33.0 |
56.0 |
16.0 |
c |
NaN |
1.0 |
2.0 |
3.0 |
e |
10.0 |
20.0 |
57.0 |
90.0 |
f |
22.0 |
35.0 |
46.0 |
78.0 |
g |
21.0 |
34.0 |
23.0 |
77.0 |
可以选择过滤的是行还是列(默认为行,即axis = 0)
也可以选择过滤的方式:how = 'any'(默认,只要有空值就过滤)或 how = 'all'(全部为空值才过滤)
(3) 填充函数 Series/DataFrame
df3
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20.0 |
57.0 |
NaN |
None |
b |
22.0 |
33.0 |
56.0 |
16.0 |
None |
c |
NaN |
1.0 |
2.0 |
3.0 |
None |
e |
10.0 |
20.0 |
57.0 |
90.0 |
None |
f |
22.0 |
35.0 |
46.0 |
78.0 |
None |
g |
21.0 |
34.0 |
23.0 |
77.0 |
None |
df3.fillna(-1)
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20.0 |
57.0 |
-1.0 |
-1 |
b |
22.0 |
33.0 |
56.0 |
16.0 |
-1 |
c |
-1.0 |
1.0 |
2.0 |
3.0 |
-1 |
e |
10.0 |
20.0 |
57.0 |
90.0 |
-1 |
f |
22.0 |
35.0 |
46.0 |
78.0 |
-1 |
g |
21.0 |
34.0 |
23.0 |
77.0 |
-1 |
可以选择前向填充(ffill)还是后向填充(bfill)。注意:新版pandas(2.1及以后)中fillna的method参数已被弃用,推荐直接使用df3.ffill()或df3.bfill()。
df3
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20.0 |
57.0 |
NaN |
None |
b |
22.0 |
33.0 |
56.0 |
16.0 |
None |
c |
NaN |
1.0 |
2.0 |
3.0 |
None |
e |
10.0 |
20.0 |
57.0 |
90.0 |
None |
f |
22.0 |
35.0 |
46.0 |
78.0 |
None |
g |
21.0 |
34.0 |
23.0 |
77.0 |
None |
df3.fillna(method = "bfill")
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20.0 |
57.0 |
16.0 |
None |
b |
22.0 |
33.0 |
56.0 |
16.0 |
None |
c |
10.0 |
1.0 |
2.0 |
3.0 |
None |
e |
10.0 |
20.0 |
57.0 |
90.0 |
None |
f |
22.0 |
35.0 |
46.0 |
78.0 |
None |
g |
21.0 |
34.0 |
23.0 |
77.0 |
None |
df3.fillna(method = "ffill")
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20.0 |
57.0 |
NaN |
None |
b |
22.0 |
33.0 |
56.0 |
16.0 |
None |
c |
22.0 |
1.0 |
2.0 |
3.0 |
None |
e |
10.0 |
20.0 |
57.0 |
90.0 |
None |
f |
22.0 |
35.0 |
46.0 |
78.0 |
None |
g |
21.0 |
34.0 |
23.0 |
77.0 |
None |
#f forward 向前
df3.fillna(method='ffill', axis = 1)
|
Python |
Java |
物理 |
数学 |
H5 |
a |
10.0 |
20.0 |
57.0 |
57.0 |
57.0 |
b |
22.0 |
33.0 |
56.0 |
16.0 |
16.0 |
c |
NaN |
1.0 |
2.0 |
3.0 |
3.0 |
e |
10.0 |
20.0 |
57.0 |
90.0 |
90.0 |
f |
22.0 |
35.0 |
46.0 |
78.0 |
78.0 |
g |
21.0 |
34.0 |
23.0 |
77.0 |
77.0 |
df3.fillna(method = "bfill",axis = 1)
对于DataFrame来说,还要选择填充的轴axis。记住,对于DataFrame来说:
- axis=0:index/行(默认,沿行索引方向,即纵向用上方/下方的值填充)
- axis=1:columns/列(沿列索引方向,即横向用左侧/右侧的值填充)
============================================
练习7:
-
简述None与NaN的区别
-
假设张三李四参加模拟考试,但张三因为突然想明白人生放弃了英语考试,因此记为None,请据此创建一个DataFrame,命名为ddd3
-
老师决定用数学的分数填充张三的英语成绩,如何实现?
用李四的英语成绩填充张三的英语成绩?
============================================