• 使用python爬取国家民政部最新的省份代码的程序,requests,beautifulsoup,lxml


    使用的python3.6

    民政网站,不同年份数据可能页面结构不一致,这点踩了很多坑,这也是代码越写越长的原因。

    如果以后此段代码不可用,希望再仔细学习下 页面结构是否发生了变更。

      1 # -*- coding: utf-8 -*-
      2 """
      3 Created on Wed Jul 10 14:40:41 2019
      4 
      5 @author: Administrator
      6 """
      7 
      8 import pandas as pd
      9 import requests 
     10 from bs4 import BeautifulSoup
     11 import time 
     12 
     13 url1 = 'http://www.mca.gov.cn/article/sj/xzqh//1980/'
     14 headers = {'content-type': 'application/json',
     15                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
     16 
     17 # 1. 获取所有链接========================================================================
     18 def f1(url1):
     19     '2018-1980年中华人民共和国行政区划代码 的所有链接'
     20     #requests发出请求,设置url,header参数
     21     response = requests.get(url1, headers=headers, timeout=200, verify=False)
     22     soup = BeautifulSoup(response.text,'lxml') #将网页源码返回为BeautifulSoup类型
     23     _tmp1 = soup.select('td.arlisttd')
     24     end_1 = []
     25     for i in _tmp1:
     26         _a = i.select('a')[0].get('href')
     27         _b = i.select('a')[0].get('title')[:4]
     28         end_1.append(['http://www.mca.gov.cn'+_a,_b])
     29     return end_1
     30 
     31 end_2=[]
     32 for i in ['','?2','?3']:
     33     end_2 = end_2+f1(url1+i)
     34     
     35     
     36 def f2(url1='http://www.mca.gov.cn/article/sj/xzqh/2019/'):
     37     '2019年中华人民共和国行政区划代码'
     38     response = requests.get(url1, headers=headers, timeout=200, verify=False)
     39     soup = BeautifulSoup(response.text,'lxml')
     40     _tmp1 = soup.select('td.arlisttd')
     41     end_1 = []
     42     for i in _tmp1:
     43         _a = i.select('a')[0].get('href')
     44         _b = i.select('a')[0].get('title')[:7]
     45         end_1.append(['http://www.mca.gov.cn'+_a,_b])
     46     return end_1
     47 
     48 end_2 = end_2+f2()
     49 
     50 # 2. 获取数据========================================================================
def f3(url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201903/20190300014989.shtml'):
    """Scrape one article page and return its division-code table.

    The article page is only an intermediate hop: it either redirects via an
    inline ``window.location.href`` script, or links to the real data page
    from its body.  The data page's table layout also differs between years,
    so two row selectors are tried.  Returns a list of ``[code, name]`` pairs.

    Sample article URLs seen during development:
    #url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201507/20150715854922.shtml'
    #url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201507/20150715854918.shtml'
    """
    response = requests.get(url1, headers=headers, timeout=200, verify=False)
    soup = BeautifulSoup(response.text,'lxml')
    # Some article pages redirect with an inline script of the form
    # window.location.href="...html"; strip that wrapper to get the target.
    # NOTE(review): script index [4] is position-dependent — fragile if the
    # page template changes.
    _txt = soup.select('script')[4].get_text().strip().replace('window.location.href="','').strip('";')
    if _txt[-4:]=='html':
        # Redirect script found: follow it.
        print('script!')
        url2 = _txt
    else:
        # No redirect: find the data-page link in the article body.
        _tmp1 = soup.select('div.artext > div > p > a')
        if len(_tmp1)==0:
            # Fallback selector used by other years' page structure.
            _tmp1 = soup.select('div#zoom > a')
        url2 = _tmp1[0].get('href')
    print(url2)
    #return url2
    time.sleep(0.5)  # throttle between the two requests
    response = requests.get(url2, headers=headers, timeout=200, verify=False)
    # Parse the data page source.
    soup = BeautifulSoup(response.text,'lxml')
    # Layout variant 1: data rows have height="19"; code is in column 1,
    # name in column 2.
    _tmp1 = soup.select('table > tr[height="19"]')
    end_1 = []
    if len(_tmp1)>5:
        for i in _tmp1:
            _a = i.select('td')[1].get_text().strip()
            if len(_a)>15:  # on some data pages the last row is a remark
                continue
            else:
                _b = i.select('td')[2].get_text().strip()
                end_1.append([_a,_b])
    else:
        # Layout variant 2: rows have height="20"; code in column 0,
        # name in column 1, and a header row to skip.
        _tmp1 = soup.select('table > tr[height="20"]')
        for i in _tmp1:
            _a = i.select('td')[0].get_text().strip()
            if len(_a)>15 or _a=='行政区划代码':  # skip remark and header rows
                continue
            else:
                _b = i.select('td')[1].get_text().strip()
                end_1.append([_a,_b])
    
    return end_1
     94 
     95 #循环对每个链接 获取数据
     96 end_3=[];#end_4=[]
     97 for j in range(len(end_2)):
     98     item = end_2[j]
     99     if '19'  in item[1] or '20'  in item[1]:
    100         print(j,item[0],item[1])
    101         tmp2 = f3(item[0])
    102         print('.')
    103         end_3.extend([[item[1]]+i for i in tmp2])
    104         #end_4.append(tmp2)
    105         time.sleep(0.1)
    106     
    107 df_result = pd.DataFrame(end_3)
    108 #pd.DataFrame(end_4).to_excel('所有连接.xlsx',index=False)
    109 df_result.to_excel('地区编码.xlsx',index=False)
    110 
    111 
    112 '''
    113 #3 2019年5月份县以上行政区划代码_3852 > table > tbody > tr:nth-child(4)
    114 #list_content > div.list_right > div > ul > table > tbody > tr:nth-child(1) > td.arlisttd > a
    115 '''
  • 相关阅读:
    剑指offer--------重建二叉树
    剑指offer--------二进制中1的个数
    剑指offer--------最小的K个数 (待补充)
    剑指offer--------二维数组的查找
    剑指offer--------替换空格
    Redis集群
    一致性hash算法
    Zab协议(转)
    Redis线程模型
    Http Cookie和session
  • 原文地址:https://www.cnblogs.com/andylhc/p/11490563.html
Copyright © 2020-2023  润新知