# 使用的 Python 3.6
# 民政网站不同年份数据的页面结构可能不一致,这点踩了很多坑,这也是代码越写越长的原因。
# 如果以后此段代码不可用,请先仔细检查页面结构是否发生了变更。
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 10 14:40:41 2019

@author: Administrator

Scrape administrative division codes from the Ministry of Civil Affairs
website.  The listing pages for 1980-2018 and for 2019 share the same
HTML structure and differ only in how much of the link title is kept,
so both collectors delegate to one private helper.
"""

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

url1 = 'http://www.mca.gov.cn/article/sj/xzqh//1980/'
headers = {'content-type': 'application/json',
           'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}


# 1. Collect all article links ========================================================
def _collect_links(url, title_len):
    """Return [[absolute_url, title_prefix], ...] for one listing page.

    url       -- listing-page URL to fetch.
    title_len -- number of leading characters of the anchor title to keep
                 (4 keeps just the year, 7 keeps year + month).

    NOTE(review): verify=False skips TLS certificate validation; the site
    apparently requires it, but urllib3 will emit InsecureRequestWarning.
    """
    response = requests.get(url, headers=headers, timeout=200, verify=False)
    soup = BeautifulSoup(response.text, 'lxml')
    links = []
    for cell in soup.select('td.arlisttd'):
        anchor = cell.select('a')[0]
        links.append(['http://www.mca.gov.cn' + anchor.get('href'),
                      anchor.get('title')[:title_len]])
    return links


def f1(url1):
    'All links for the 1980-2018 administrative division codes (title -> year).'
    return _collect_links(url1, 4)


# The 1980-2018 listing is paginated: page 1, then '?2' and '?3'.
end_2 = []
for i in ['', '?2', '?3']:
    end_2 = end_2 + f1(url1 + i)


def f2(url1='http://www.mca.gov.cn/article/sj/xzqh/2019/'):
    'All links for the 2019 administrative division codes (title -> year+month).'
    return _collect_links(url1, 7)


end_2 = end_2 + f2()
# 2. Fetch the data ========================================================
def f3(url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201903/20190300014989.shtml'):
    """Fetch one year's code table and return it as [[code, name], ...].

    The article page at *url1* is only a redirect shell; the real table
    lives behind either a JS redirect or an in-page anchor, and different
    years use different table layouts — hence the cascading fallbacks.
    NOTE(review): soup.select('script')[4] hard-codes the position of the
    redirect script; if the site changes, this index is the first suspect.
    """
    #url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201507/20150715854922.shtml'
    #url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201507/20150715854918.shtml'
    #
    response = requests.get(url1, headers=headers, timeout=200, verify=False)
    soup = BeautifulSoup(response.text,'lxml')
    # Extract the JS redirect target: window.location.href="...";
    _txt = soup.select('script')[4].get_text().strip().replace('window.location.href="','').strip('";')
    if _txt[-4:]=='html':
        # The script held a usable redirect URL.
        print('script!')
        url2 = _txt
    else:
        # Fall back to an in-page anchor; older pages use div.artext,
        # others div#zoom.
        _tmp1 = soup.select('div.artext > div > p > a')
        if len(_tmp1)==0:
            _tmp1 = soup.select('div#zoom > a')
        url2 = _tmp1[0].get('href')
    print(url2)
    #return url2
    #url2='http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220856.html'
    time.sleep(0.5)  # be polite to the server before the second request
    response = requests.get(url2, headers=headers, timeout=200, verify=False)
    # Parse the page source into BeautifulSoup.
    soup = BeautifulSoup(response.text,'lxml')
    # Layout A: data rows have height="19", code in td[1], name in td[2].
    _tmp1 = soup.select('table > tr[height="19"]')
    end_1 = []
    if len(_tmp1)>5:
        for i in _tmp1:
            _a = i.select('td')[1].get_text().strip()
            if len(_a)>15: # on some data pages the last row is a remark
                continue
            else:
                _b = i.select('td')[2].get_text().strip()
                end_1.append([_a,_b])
    else:
        # Layout B: rows have height="20", code in td[0], name in td[1];
        # skip the header row and trailing remark rows.
        _tmp1 = soup.select('table > tr[height="20"]')
        for i in _tmp1:
            _a = i.select('td')[0].get_text().strip()
            if len(_a)>15 or _a=='行政区划代码': # remark row or header row
                continue
            else:
                _b = i.select('td')[1].get_text().strip()
                end_1.append([_a,_b])

    return end_1

# Loop over every link and fetch its data.
end_3=[];#end_4=[]
for j in range(len(end_2)):
    item = end_2[j]
    # Only titles that look like a year (19xx / 20xx) carry data.
    if '19' in item[1] or '20' in item[1]:
        print(j,item[0],item[1])
        tmp2 = f3(item[0])
        print('.')
        # Prefix every [code, name] row with its year/month label.
        end_3.extend([[item[1]]+i for i in tmp2])
        #end_4.append(tmp2)
        time.sleep(0.1)

df_result = pd.DataFrame(end_3)
#pd.DataFrame(end_4).to_excel('所有连接.xlsx',index=False)
# Persist the collected [label, code, name] rows to an Excel workbook.
df_result.to_excel('地区编码.xlsx', index=False)


'''
#3 2019年5月份县以上行政区划代码_3852 > table > tbody > tr:nth-child(4)
#list_content > div.list_right > div > ul > table > tbody > tr:nth-child(1) > td.arlisttd > a
'''