part1数据探索及数据处理
数据处理
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3004 entries, 0 to 3003
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 3004 non-null int64
1 name 3004 non-null object
2 gender 2868 non-null object
3 age 2904 non-null float64
4 edu 1073 non-null object
5 custom_amt 3004 non-null object
6 order_date 3004 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 164.4+ KB
# 数值型转字符串类型
df3["id"] = df3["id"].astype("str")
# 字符串类型转浮点型
df3["custom_amt"] = df3["custom_amt"].str.strip("¥").astype("float") # 去掉部分字符
# 字符串类型转日期型
df3["order_date"] = pd.to_datetime(df3["order_date"],format="%Y年%m月%d日")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3004 entries, 0 to 3003
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 3004 non-null object
1 name 3004 non-null object
2 gender 2868 non-null object
3 age 2904 non-null float64
4 edu 1073 non-null object
5 custom_amt 3002 non-null float64
6 order_date 3004 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 164.4+ KB
df3.dtypes # 方法是针对特定对象的函数
id object
name object
gender object
age float64
edu object
custom_amt float64
order_date datetime64[ns]
dtype: object
4
df3.drop_duplicates() # 只是删除了行,索引不变
3000 rows × 7 columns
id | name | gender | age | edu | custom_amt | order_date |
0 |
890 |
李小胆李l |
female |
43.0 |
NaN |
2177.94 |
2018-12-25 |
1 |
2391 |
881xt |
male |
52.0 |
NaN |
2442.18 |
2017-05-24 |
2 |
2785 |
haoah |
male |
39.0 |
NaN |
849.79 |
2018-05-15 |
3 |
1361 |
snaen |
female |
26.0 |
NaN |
2482.22 |
2018-05-16 |
4 |
888 |
sue女少 |
female |
61.0 |
本科 |
2027.90 |
2018-01-21 |
... |
... |
... |
... |
... |
... |
... |
... |
2999 |
2926 |
宝哒哇是我 |
female |
NaN |
NaN |
542.02 |
2018-09-01 |
3000 |
1616 |
和花花豆豆 |
female |
64.0 |
本科 |
2593.38 |
2018-11-03 |
3001 |
1129 |
小清新桉蓝 |
female |
61.0 |
NaN |
139.68 |
2018-11-07 |
3002 |
674 |
xyyx毓 |
female |
34.0 |
本科 |
670.89 |
2018-06-08 |
3003 |
229 |
大侦探野荒 |
male |
64.0 |
NaN |
118.37 |
2018-07-13 |
# 删除重复值,重排索引
df3.drop_duplicates(inplace=True
,ignore_index=True)
3000 rows × 7 columns
id | name | gender | age | edu | custom_amt | order_date |
0 |
890 |
李小胆李l |
female |
43.0 |
NaN |
2177.94 |
2018-12-25 |
1 |
2391 |
881xt |
male |
52.0 |
NaN |
2442.18 |
2017-05-24 |
2 |
2785 |
haoah |
male |
39.0 |
NaN |
849.79 |
2018-05-15 |
3 |
1361 |
snaen |
female |
26.0 |
NaN |
2482.22 |
2018-05-16 |
4 |
888 |
sue女少 |
female |
61.0 |
本科 |
2027.90 |
2018-01-21 |
... |
... |
... |
... |
... |
... |
... |
... |
2995 |
2926 |
宝哒哇是我 |
female |
NaN |
NaN |
542.02 |
2018-09-01 |
2996 |
1616 |
和花花豆豆 |
female |
64.0 |
本科 |
2593.38 |
2018-11-03 |
2997 |
1129 |
小清新桉蓝 |
female |
61.0 |
NaN |
139.68 |
2018-11-07 |
2998 |
674 |
xyyx毓 |
female |
34.0 |
本科 |
670.89 |
2018-06-08 |
2999 |
229 |
大侦探野荒 |
male |
64.0 |
NaN |
118.37 |
2018-07-13 |
# 异常值的处理
df3.describe([0.25,0.5,0.75,0.99])
age | custom_amt |
count |
2900.000000 |
2998.000000 |
mean |
41.605862 |
1535.732572 |
std |
14.162957 |
842.658685 |
min |
19.000000 |
68.100000 |
25% |
30.000000 |
828.232500 |
50% |
40.000000 |
1510.275000 |
75% |
55.000000 |
2276.527500 |
99% |
65.000000 |
2963.879000 |
max |
230.000000 |
2999.950000 |
df3.loc[df3["age"]>200,:]
| id | name | gender | age | edu | custom_amt | order_date |
118 |
2113 |
089bt |
female |
230.0 |
NaN |
1193.06 |
2018-04-03 |
2999 rows × 7 columns
id | name | gender | age | edu | custom_amt | order_date |
0 |
890 |
李小胆李l |
female |
43.0 |
NaN |
2177.94 |
2018-12-25 |
1 |
2391 |
881xt |
male |
52.0 |
NaN |
2442.18 |
2017-05-24 |
2 |
2785 |
haoah |
male |
39.0 |
NaN |
849.79 |
2018-05-15 |
3 |
1361 |
snaen |
female |
26.0 |
NaN |
2482.22 |
2018-05-16 |
4 |
888 |
sue女少 |
female |
61.0 |
本科 |
2027.90 |
2018-01-21 |
... |
... |
... |
... |
... |
... |
... |
... |
2995 |
2926 |
宝哒哇是我 |
female |
NaN |
NaN |
542.02 |
2018-09-01 |
2996 |
1616 |
和花花豆豆 |
female |
64.0 |
本科 |
2593.38 |
2018-11-03 |
2997 |
1129 |
小清新桉蓝 |
female |
61.0 |
NaN |
139.68 |
2018-11-07 |
2998 |
674 |
xyyx毓 |
female |
34.0 |
本科 |
670.89 |
2018-06-08 |
2999 |
229 |
大侦探野荒 |
male |
64.0 |
NaN |
118.37 |
2018-07-13 |
# 删除异常值,~取反
df3 = df3.loc[~(df3["age"]>200),:]
# 查看缺失值
df3.isnull().sum()
# 缺失值的比例
df3.isnull().sum()/df3.shape[0]
id 0
name 0
gender 136
age 100
edu 1926
custom_amt 2
order_date 0
dtype: int64
id 0.000000
name 0.000000
gender 0.045348
age 0.033344
edu 0.642214
custom_amt 0.000667
order_date 0.000000
dtype: float64
id 0.000000
name 0.000000
gender 0.045348
age 0.033344
edu 0.642214
custom_amt 0.000667
order_date 0.000000
dtype: float64
# 删除某一列
df3.drop(columns="edu")
2999 rows × 6 columns
id | name | gender | age | custom_amt | order_date |
0 |
890 |
李小胆李l |
female |
43.0 |
2177.94 |
2018-12-25 |
1 |
2391 |
881xt |
male |
52.0 |
2442.18 |
2017-05-24 |
2 |
2785 |
haoah |
male |
39.0 |
849.79 |
2018-05-15 |
3 |
1361 |
snaen |
female |
26.0 |
2482.22 |
2018-05-16 |
4 |
888 |
sue女少 |
female |
61.0 |
2027.90 |
2018-01-21 |
... |
... |
... |
... |
... |
... |
... |
2995 |
2926 |
宝哒哇是我 |
female |
NaN |
542.02 |
2018-09-01 |
2996 |
1616 |
和花花豆豆 |
female |
64.0 |
2593.38 |
2018-11-03 |
2997 |
1129 |
小清新桉蓝 |
female |
61.0 |
139.68 |
2018-11-07 |
2998 |
674 |
xyyx毓 |
female |
34.0 |
670.89 |
2018-06-08 |
2999 |
229 |
大侦探野荒 |
male |
64.0 |
118.37 |
2018-07-13 |
df3.loc[:,df3.isnull().mean()<=0.5]
2999 rows × 6 columns
id | name | gender | age | custom_amt | order_date |
0 |
890 |
李小胆李l |
female |
43.0 |
2177.94 |
2018-12-25 |
1 |
2391 |
881xt |
male |
52.0 |
2442.18 |
2017-05-24 |
2 |
2785 |
haoah |
male |
39.0 |
849.79 |
2018-05-15 |
3 |
1361 |
snaen |
female |
26.0 |
2482.22 |
2018-05-16 |
4 |
888 |
sue女少 |
female |
61.0 |
2027.90 |
2018-01-21 |
... |
... |
... |
... |
... |
... |
... |
2995 |
2926 |
宝哒哇是我 |
female |
NaN |
542.02 |
2018-09-01 |
2996 |
1616 |
和花花豆豆 |
female |
64.0 |
2593.38 |
2018-11-03 |
2997 |
1129 |
小清新桉蓝 |
female |
61.0 |
139.68 |
2018-11-07 |
2998 |
674 |
xyyx毓 |
female |
34.0 |
670.89 |
2018-06-08 |
2999 |
229 |
大侦探野荒 |
male |
64.0 |
118.37 |
2018-07-13 |
df_dropna = df3.loc[:,df3.isnull().mean()<=0.5].dropna() # 剔除缺失值比例大于一半的列,删除有缺失值的行
df_dropna.isnull().sum()
id 0
name 0
gender 0
age 0
custom_amt 0
order_date 0
dtype: int64
# 填补缺失值
df3["age"] = df3["age"].fillna(df3["age"].mean())
df3["age"].fillna(method="bfill")# method{‘backfill’, ‘bfill’, ‘pad’, ‘ffill’, None}, default None
0 43.000000
1 52.000000
2 39.000000
3 26.000000
4 61.000000
...
2995 41.540876
2996 64.000000
2997 61.000000
2998 34.000000
2999 64.000000
Name: age, Length: 2999, dtype: float64
part2文本数据的处理
# 文本数据的处理
df4 = df_dropna.copy()
id 0
name 0
gender 0
age 0
custom_amt 0
order_date 0
dtype: int64
id | name | gender | age | custom_amt | order_date |
0 |
890 |
李小胆李l |
female |
43.0 |
2177.94 |
2018-12-25 |
1 |
2391 |
881xt |
male |
52.0 |
2442.18 |
2017-05-24 |
2 |
2785 |
haoah |
male |
39.0 |
849.79 |
2018-05-15 |
3 |
1361 |
snaen |
female |
26.0 |
2482.22 |
2018-05-16 |
4 |
888 |
sue女少 |
female |
61.0 |
2027.90 |
2018-01-21 |
df4["gender"] = df4["gender"].apply(lambda x:x.title()) # 对每一个自变量都执行首字母大写 map
df4["gender"] = df4["gender"].map({"Female":0,"Male":1}) # 分类变量
# 去掉数据集中一些特殊的符号
df["custom_amt"].str.replace("¥","")
0 2177.94
1 2442.18
2 849.79
3 2482.22
4 2027.9
...
2999 542.02
3000 2593.38
3001 139.68
3002 670.89
3003 118.37
Name: custom_amt, Length: 3004, dtype: object