- 需求:
- 导入文件,查看原始数据
- 将人口数据和各州简称数据进行合并
- 将合并的数据中重复的abbreviation列进行删除
- 查看存在缺失数据的列
- 找到有哪些state/region使得state的值为NaN,进行去重操作
- 为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN
- 合并各州面积数据areas
- 我们会发现area(sq.mi)这一列有缺失数据,找出是哪些行
- 去除含有缺失数据的行
- 找出2010年的全民人口数据
- 计算各州的人口密度
- 排序,并找出人口密度最高的州
import numpy as np
import pandas as pd
from pandas import DataFrame
#导入文件,查看原始数据
abb = pd.read_csv('./data/state-abbrevs.csv') #state(州的全称)abbreviation(州的简称)
area = pd.read_csv('./data/state-areas.csv') #state州的全称,area (sq. mi)州的面积
pop = pd.read_csv('./data/state-population.csv')#state/region简称,ages年龄,year时间,population人口数量
#将人口数据和各州简称数据进行合并
abb_pop = pd.merge(abb,pop,left_on='abbreviation',right_on='state/region',how='outer')
abb_pop.head()
|
state |
abbreviation |
state/region |
ages |
year |
population |
0 |
Alabama |
AL |
AL |
under18 |
2012 |
1117489.0 |
1 |
Alabama |
AL |
AL |
total |
2012 |
4817528.0 |
2 |
Alabama |
AL |
AL |
under18 |
2010 |
1130966.0 |
3 |
Alabama |
AL |
AL |
total |
2010 |
4785570.0 |
4 |
Alabama |
AL |
AL |
under18 |
2011 |
1125763.0 |
#将合并的数据中重复的abbreviation列进行删除
abb_pop.drop(labels='abbreviation',axis=1,inplace=True)
abb_pop.head()
|
state |
state/region |
ages |
year |
population |
0 |
Alabama |
AL |
under18 |
2012 |
1117489.0 |
1 |
Alabama |
AL |
total |
2012 |
4817528.0 |
2 |
Alabama |
AL |
under18 |
2010 |
1130966.0 |
3 |
Alabama |
AL |
total |
2010 |
4785570.0 |
4 |
Alabama |
AL |
under18 |
2011 |
1125763.0 |
#查看存在缺失数据的列
#方式1:isnull,notll,any,all
abb_pop.isnull().any(axis=0)
#state,population这两列中是存在空值
state True
state/region False
ages False
year False
population True
dtype: bool
#方式2:
abb_pop.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2544 entries, 0 to 2543
Data columns (total 5 columns):
state 2448 non-null object
state/region 2544 non-null object
ages 2544 non-null object
year 2544 non-null int64
population 2524 non-null float64
dtypes: float64(1), int64(1), object(3)
memory usage: 119.2+ KB
#找到有哪些state/region使得state的值为NaN,进行去重操作(将state中的空值对应的简称找到,且对简称进行去重)
abb_pop.head()
|
state |
state/region |
ages |
year |
population |
0 |
Alabama |
AL |
under18 |
2012 |
1117489.0 |
1 |
Alabama |
AL |
total |
2012 |
4817528.0 |
2 |
Alabama |
AL |
under18 |
2010 |
1130966.0 |
3 |
Alabama |
AL |
total |
2010 |
4785570.0 |
4 |
Alabama |
AL |
under18 |
2011 |
1125763.0 |
#思路:可以将state这一列中的空值对应的行数据取出,从该行数据中就可以取出简称的值
#1.将state中的空值定位到
abb_pop['state'].isnull()
#2.将上述的布尔值作为源数据的行索引
abb_pop.loc[abb_pop['state'].isnull()]#将state中空对应的行数据取出
#3.将简称取出
abb_pop.loc[abb_pop['state'].isnull()]['state/region']
#4.对简称去重
abb_pop.loc[abb_pop['state'].isnull()]['state/region'].unique()
#结论:只有PR和USA对应的全称数据为空值
array([], dtype=object)
#为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN
#思考:填充该需求中的空值可不可以使用fillna?
# - 不可以。fillna可以使用空的紧邻值做填充。fillna(value='xxx')使用指定的值填充空值
# 使用给元素赋值的方式进行填充!
#1.先给USA的全称对应的空值进行批量赋值
#1.1将USA对应的行数据找出(行数据中就存在state的空值)
abb_pop['state/region'] == 'USA'
abb_pop.loc[abb_pop['state/region'] == 'USA']#将usa对应的行数据取出
|
state |
state/region |
ages |
year |
population |
2496 |
NaN |
USA |
under18 |
1990 |
64218512.0 |
2497 |
NaN |
USA |
total |
1990 |
249622814.0 |
2498 |
NaN |
USA |
total |
1991 |
252980942.0 |
2499 |
NaN |
USA |
under18 |
1991 |
65313018.0 |
2500 |
NaN |
USA |
under18 |
1992 |
66509177.0 |
2501 |
NaN |
USA |
total |
1992 |
256514231.0 |
2502 |
NaN |
USA |
total |
1993 |
259918595.0 |
2503 |
NaN |
USA |
under18 |
1993 |
67594938.0 |
2504 |
NaN |
USA |
under18 |
1994 |
68640936.0 |
2505 |
NaN |
USA |
total |
1994 |
263125826.0 |
2506 |
NaN |
USA |
under18 |
1995 |
69473140.0 |
2507 |
NaN |
USA |
under18 |
1996 |
70233512.0 |
2508 |
NaN |
USA |
total |
1995 |
266278403.0 |
2509 |
NaN |
USA |
total |
1996 |
269394291.0 |
2510 |
NaN |
USA |
total |
1997 |
272646932.0 |
2511 |
NaN |
USA |
under18 |
1997 |
70920738.0 |
2512 |
NaN |
USA |
under18 |
1998 |
71431406.0 |
2513 |
NaN |
USA |
total |
1998 |
275854116.0 |
2514 |
NaN |
USA |
under18 |
1999 |
71946051.0 |
2515 |
NaN |
USA |
total |
2000 |
282162411.0 |
2516 |
NaN |
USA |
under18 |
2000 |
72376189.0 |
2517 |
NaN |
USA |
total |
1999 |
279040181.0 |
2518 |
NaN |
USA |
total |
2001 |
284968955.0 |
2519 |
NaN |
USA |
under18 |
2001 |
72671175.0 |
2520 |
NaN |
USA |
total |
2002 |
287625193.0 |
2521 |
NaN |
USA |
under18 |
2002 |
72936457.0 |
2522 |
NaN |
USA |
total |
2003 |
290107933.0 |
2523 |
NaN |
USA |
under18 |
2003 |
73100758.0 |
2524 |
NaN |
USA |
total |
2004 |
292805298.0 |
2525 |
NaN |
USA |
under18 |
2004 |
73297735.0 |
2526 |
NaN |
USA |
total |
2005 |
295516599.0 |
2527 |
NaN |
USA |
under18 |
2005 |
73523669.0 |
2528 |
NaN |
USA |
total |
2006 |
298379912.0 |
2529 |
NaN |
USA |
under18 |
2006 |
73757714.0 |
2530 |
NaN |
USA |
total |
2007 |
301231207.0 |
2531 |
NaN |
USA |
under18 |
2007 |
74019405.0 |
2532 |
NaN |
USA |
total |
2008 |
304093966.0 |
2533 |
NaN |
USA |
under18 |
2008 |
74104602.0 |
2534 |
NaN |
USA |
under18 |
2013 |
73585872.0 |
2535 |
NaN |
USA |
total |
2013 |
316128839.0 |
2536 |
NaN |
USA |
total |
2009 |
306771529.0 |
2537 |
NaN |
USA |
under18 |
2009 |
74134167.0 |
2538 |
NaN |
USA |
under18 |
2010 |
74119556.0 |
2539 |
NaN |
USA |
total |
2010 |
309326295.0 |
2540 |
NaN |
USA |
under18 |
2011 |
73902222.0 |
2541 |
NaN |
USA |
total |
2011 |
311582564.0 |
2542 |
NaN |
USA |
under18 |
2012 |
73708179.0 |
2543 |
NaN |
USA |
total |
2012 |
313873685.0 |
#1.2将USA对应的全称空对应的行索引取出
indexs = abb_pop.loc[abb_pop['state/region'] == 'USA'].index
abb_pop.iloc[indexs]
abb_pop.loc[indexs,'state'] = 'United States'
#2.可以将PR的全称进行赋值
abb_pop['state/region'] == 'PR'
abb_pop.loc[abb_pop['state/region'] == 'PR'] #PR对应的行数据
indexs = abb_pop.loc[abb_pop['state/region'] == 'PR'].index
abb_pop.loc[indexs,'state'] = 'PPPRRR'
#合并各州面积数据areas
abb_pop_area = pd.merge(abb_pop,area,how='outer')
#我们会发现area(sq.mi)这一列有缺失数据,找出是哪些行
abb_pop_area['area (sq. mi)'].isnull()
abb_pop_area.loc[abb_pop_area['area (sq. mi)'].isnull()] #空对应的行数据
indexs = abb_pop_area.loc[abb_pop_area['area (sq. mi)'].isnull()].index
#去除含有缺失数据的行
abb_pop_area.drop(labels=indexs,axis=0,inplace=True)
#找出2010年的全民人口数据(基于df做条件查询)
abb_pop_area.query('ages == "total" & year == 2010')
|
state |
state/region |
ages |
year |
population |
area (sq. mi) |
3 |
Alabama |
AL |
total |
2010.0 |
4785570.0 |
52423.0 |
91 |
Alaska |
AK |
total |
2010.0 |
713868.0 |
656425.0 |
101 |
Arizona |
AZ |
total |
2010.0 |
6408790.0 |
114006.0 |
189 |
Arkansas |
AR |
total |
2010.0 |
2922280.0 |
53182.0 |
197 |
California |
CA |
total |
2010.0 |
37333601.0 |
163707.0 |
283 |
Colorado |
CO |
total |
2010.0 |
5048196.0 |
104100.0 |
293 |
Connecticut |
CT |
total |
2010.0 |
3579210.0 |
5544.0 |
379 |
Delaware |
DE |
total |
2010.0 |
899711.0 |
1954.0 |
389 |
District of Columbia |
DC |
total |
2010.0 |
605125.0 |
68.0 |
475 |
Florida |
FL |
total |
2010.0 |
18846054.0 |
65758.0 |
485 |
Georgia |
GA |
total |
2010.0 |
9713248.0 |
59441.0 |
570 |
Hawaii |
HI |
total |
2010.0 |
1363731.0 |
10932.0 |
581 |
Idaho |
ID |
total |
2010.0 |
1570718.0 |
83574.0 |
666 |
Illinois |
IL |
total |
2010.0 |
12839695.0 |
57918.0 |
677 |
Indiana |
IN |
total |
2010.0 |
6489965.0 |
36420.0 |
762 |
Iowa |
IA |
total |
2010.0 |
3050314.0 |
56276.0 |
773 |
Kansas |
KS |
total |
2010.0 |
2858910.0 |
82282.0 |
858 |
Kentucky |
KY |
total |
2010.0 |
4347698.0 |
40411.0 |
869 |
Louisiana |
LA |
total |
2010.0 |
4545392.0 |
51843.0 |
954 |
Maine |
ME |
total |
2010.0 |
1327366.0 |
35387.0 |
965 |
Montana |
MT |
total |
2010.0 |
990527.0 |
147046.0 |
1050 |
Nebraska |
NE |
total |
2010.0 |
1829838.0 |
77358.0 |
1061 |
Nevada |
NV |
total |
2010.0 |
2703230.0 |
110567.0 |
1146 |
New Hampshire |
NH |
total |
2010.0 |
1316614.0 |
9351.0 |
1157 |
New Jersey |
NJ |
total |
2010.0 |
8802707.0 |
8722.0 |
1242 |
New Mexico |
NM |
total |
2010.0 |
2064982.0 |
121593.0 |
1253 |
New York |
NY |
total |
2010.0 |
19398228.0 |
54475.0 |
1338 |
North Carolina |
NC |
total |
2010.0 |
9559533.0 |
53821.0 |
1349 |
North Dakota |
ND |
total |
2010.0 |
674344.0 |
70704.0 |
1434 |
Ohio |
OH |
total |
2010.0 |
11545435.0 |
44828.0 |
1445 |
Oklahoma |
OK |
total |
2010.0 |
3759263.0 |
69903.0 |
1530 |
Oregon |
OR |
total |
2010.0 |
3837208.0 |
98386.0 |
1541 |
Maryland |
MD |
total |
2010.0 |
5787193.0 |
12407.0 |
1626 |
Massachusetts |
MA |
total |
2010.0 |
6563263.0 |
10555.0 |
1637 |
Michigan |
MI |
total |
2010.0 |
9876149.0 |
96810.0 |
1722 |
Minnesota |
MN |
total |
2010.0 |
5310337.0 |
86943.0 |
1733 |
Mississippi |
MS |
total |
2010.0 |
2970047.0 |
48434.0 |
1818 |
Missouri |
MO |
total |
2010.0 |
5996063.0 |
69709.0 |
1829 |
Pennsylvania |
PA |
total |
2010.0 |
12710472.0 |
46058.0 |
1914 |
Rhode Island |
RI |
total |
2010.0 |
1052669.0 |
1545.0 |
1925 |
South Carolina |
SC |
total |
2010.0 |
4636361.0 |
32007.0 |
2010 |
South Dakota |
SD |
total |
2010.0 |
816211.0 |
77121.0 |
2021 |
Tennessee |
TN |
total |
2010.0 |
6356683.0 |
42146.0 |
2106 |
Texas |
TX |
total |
2010.0 |
25245178.0 |
268601.0 |
2117 |
Utah |
UT |
total |
2010.0 |
2774424.0 |
84904.0 |
2202 |
Vermont |
VT |
total |
2010.0 |
625793.0 |
9615.0 |
2213 |
Virginia |
VA |
total |
2010.0 |
8024417.0 |
42769.0 |
2298 |
Washington |
WA |
total |
2010.0 |
6742256.0 |
71303.0 |
2309 |
West Virginia |
WV |
total |
2010.0 |
1854146.0 |
24231.0 |
2394 |
Wisconsin |
WI |
total |
2010.0 |
5689060.0 |
65503.0 |
2405 |
Wyoming |
WY |
total |
2010.0 |
564222.0 |
97818.0 |
#计算各州的人口密度(人口除以面积)
abb_pop_area['midu'] = abb_pop_area['population'] / abb_pop_area['area (sq. mi)']
abb_pop_area
|
state |
state/region |
ages |
year |
population |
area (sq. mi) |
midu |
0 |
Alabama |
AL |
under18 |
2012.0 |
1117489.0 |
52423.0 |
21.316769 |
1 |
Alabama |
AL |
total |
2012.0 |
4817528.0 |
52423.0 |
91.897221 |
2 |
Alabama |
AL |
under18 |
2010.0 |
1130966.0 |
52423.0 |
21.573851 |
3 |
Alabama |
AL |
total |
2010.0 |
4785570.0 |
52423.0 |
91.287603 |
4 |
Alabama |
AL |
under18 |
2011.0 |
1125763.0 |
52423.0 |
21.474601 |
5 |
Alabama |
AL |
total |
2011.0 |
4801627.0 |
52423.0 |
91.593900 |
6 |
Alabama |
AL |
total |
2009.0 |
4757938.0 |
52423.0 |
90.760506 |
7 |
Alabama |
AL |
under18 |
2009.0 |
1134192.0 |
52423.0 |
21.635389 |
8 |
Alabama |
AL |
under18 |
2013.0 |
1111481.0 |
52423.0 |
21.202163 |
9 |
Alabama |
AL |
total |
2013.0 |
4833722.0 |
52423.0 |
92.206131 |
10 |
Alabama |
AL |
total |
2007.0 |
4672840.0 |
52423.0 |
89.137211 |
11 |
Alabama |
AL |
under18 |
2007.0 |
1132296.0 |
52423.0 |
21.599222 |
12 |
Alabama |
AL |
total |
2008.0 |
4718206.0 |
52423.0 |
90.002594 |
13 |
Alabama |
AL |
under18 |
2008.0 |
1134927.0 |
52423.0 |
21.649410 |
14 |
Alabama |
AL |
total |
2005.0 |
4569805.0 |
52423.0 |
87.171757 |
15 |
Alabama |
AL |
under18 |
2005.0 |
1117229.0 |
52423.0 |
21.311810 |
16 |
Alabama |
AL |
total |
2006.0 |
4628981.0 |
52423.0 |
88.300574 |
17 |
Alabama |
AL |
under18 |
2006.0 |
1126798.0 |
52423.0 |
21.494344 |
18 |
Alabama |
AL |
total |
2004.0 |
4530729.0 |
52423.0 |
86.426359 |
19 |
Alabama |
AL |
under18 |
2004.0 |
1113662.0 |
52423.0 |
21.243767 |
20 |
Alabama |
AL |
total |
2003.0 |
4503491.0 |
52423.0 |
85.906778 |
21 |
Alabama |
AL |
under18 |
2003.0 |
1113083.0 |
52423.0 |
21.232722 |
22 |
Alabama |
AL |
total |
2001.0 |
4467634.0 |
52423.0 |
85.222784 |
23 |
Alabama |
AL |
under18 |
2001.0 |
1120409.0 |
52423.0 |
21.372470 |
24 |
Alabama |
AL |
total |
2002.0 |
4480089.0 |
52423.0 |
85.460370 |
25 |
Alabama |
AL |
under18 |
2002.0 |
1116590.0 |
52423.0 |
21.299620 |
26 |
Alabama |
AL |
under18 |
1999.0 |
1121287.0 |
52423.0 |
21.389218 |
27 |
Alabama |
AL |
total |
1999.0 |
4430141.0 |
52423.0 |
84.507583 |
28 |
Alabama |
AL |
total |
2000.0 |
4452173.0 |
52423.0 |
84.927856 |
29 |
Alabama |
AL |
under18 |
2000.0 |
1122273.0 |
52423.0 |
21.408027 |
... |
... |
... |
... |
... |
... |
... |
... |
2419 |
Wyoming |
WY |
under18 |
2003.0 |
124182.0 |
97818.0 |
1.269521 |
2420 |
Wyoming |
WY |
total |
2004.0 |
509106.0 |
97818.0 |
5.204625 |
2421 |
Wyoming |
WY |
under18 |
2004.0 |
123974.0 |
97818.0 |
1.267395 |
2422 |
Wyoming |
WY |
total |
2002.0 |
500017.0 |
97818.0 |
5.111707 |
2423 |
Wyoming |
WY |
under18 |
2002.0 |
125495.0 |
97818.0 |
1.282944 |
2424 |
Wyoming |
WY |
total |
2001.0 |
494657.0 |
97818.0 |
5.056912 |
2425 |
Wyoming |
WY |
under18 |
2001.0 |
126212.0 |
97818.0 |
1.290274 |
2426 |
Wyoming |
WY |
total |
2000.0 |
494300.0 |
97818.0 |
5.053262 |
2427 |
Wyoming |
WY |
under18 |
2000.0 |
128774.0 |
97818.0 |
1.316465 |
2428 |
Wyoming |
WY |
total |
1999.0 |
491780.0 |
97818.0 |
5.027500 |
2429 |
Wyoming |
WY |
under18 |
1999.0 |
130793.0 |
97818.0 |
1.337106 |
2430 |
Wyoming |
WY |
total |
1997.0 |
489452.0 |
97818.0 |
5.003701 |
2431 |
Wyoming |
WY |
under18 |
1997.0 |
134328.0 |
97818.0 |
1.373244 |
2432 |
Wyoming |
WY |
under18 |
1998.0 |
132602.0 |
97818.0 |
1.355599 |
2433 |
Wyoming |
WY |
total |
1998.0 |
490787.0 |
97818.0 |
5.017349 |
2434 |
Wyoming |
WY |
under18 |
1996.0 |
135698.0 |
97818.0 |
1.387250 |
2435 |
Wyoming |
WY |
total |
1996.0 |
488167.0 |
97818.0 |
4.990564 |
2436 |
Wyoming |
WY |
total |
1995.0 |
485160.0 |
97818.0 |
4.959823 |
2437 |
Wyoming |
WY |
under18 |
1995.0 |
136785.0 |
97818.0 |
1.398362 |
2438 |
Wyoming |
WY |
under18 |
1994.0 |
137733.0 |
97818.0 |
1.408054 |
2439 |
Wyoming |
WY |
total |
1994.0 |
480283.0 |
97818.0 |
4.909965 |
2440 |
Wyoming |
WY |
under18 |
1992.0 |
137308.0 |
97818.0 |
1.403709 |
2441 |
Wyoming |
WY |
total |
1992.0 |
466251.0 |
97818.0 |
4.766515 |
2442 |
Wyoming |
WY |
total |
1993.0 |
473081.0 |
97818.0 |
4.836339 |
2443 |
Wyoming |
WY |
under18 |
1993.0 |
137458.0 |
97818.0 |
1.405242 |
2444 |
Wyoming |
WY |
total |
1991.0 |
459260.0 |
97818.0 |
4.695046 |
2445 |
Wyoming |
WY |
under18 |
1991.0 |
136720.0 |
97818.0 |
1.397698 |
2446 |
Wyoming |
WY |
under18 |
1990.0 |
136078.0 |
97818.0 |
1.391135 |
2447 |
Wyoming |
WY |
total |
1990.0 |
453690.0 |
97818.0 |
4.638103 |
2544 |
Puerto Rico |
NaN |
NaN |
NaN |
NaN |
3515.0 |
NaN |
2449 rows × 7 columns
#排序,并找出人口密度最高的州
abb_pop_area.sort_values(by='midu',axis=0,ascending=False).iloc[0]['state']
'District of Columbia'