"""Crawl administrative division codes from the PRC Ministry of Civil Affairs site
(requests + lxml version).

Key points:
1> The second-level (data) page is redirected by JavaScript, so the real URL
   must be extracted from the second-level page source with a regex.
2> Incremental crawling: the real URL is saved in the ``version`` table; on the
   next run the stored URL is compared first — if it is unchanged the data is
   not re-fetched, otherwise the data is updated.
"""
import re

import pymysql
import requests
from lxml import etree


class GovementSpider:
    def __init__(self):
        # Index page listing the monthly "行政区划代码" articles.
        self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
        }
        # FIX: pymysql >= 1.0 removed positional connect() arguments;
        # use explicit keywords so the script works with current pymysql.
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', database='govermentdb',
                                  charset='utf8')
        self.cursor = self.db.cursor()

    def get_false_link(self):
        """Return the newest second-level link (the js-redirected "fake" one).

        Returns:
            The absolute URL of the newest matching article, or ``None``
            when no matching link is found on the index page.
        """
        html = requests.get(url=self.one_url,
                            headers=self.headers).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        a_list = parse_html.xpath('//a [@class="artitlelist"]')
        for a in a_list:
            title = a.xpath('./@title')[0].strip()
            # The newest article is listed first, so return on the first match.
            if re.findall(r'.*以上行政区划代码', title, re.S):
                return 'http://www.mca.gov.cn' + a.get('href')
        # FIX: explicit None so the caller can detect "nothing found"
        # instead of crashing later on an implicit None.
        return None

    def get_true_link(self):
        """Resolve the real data URL and crawl it if it is new (incremental)."""
        false_link = self.get_false_link()
        # FIX: guard against a missing link instead of passing None to requests.
        if false_link is None:
            print('数据已是最新')
            return
        html = requests.get(url=false_link,
                            headers=self.headers).content.decode('utf-8', 'ignore')
        # FIX: escape the dots so only the literal "window.location.href" matches.
        pattern = re.compile(r'window\.location\.href="(.*?)"', re.S)
        matches = pattern.findall(html)
        # FIX: guard against a page without the js redirect (no IndexError).
        if not matches:
            return
        real_link = matches[0]
        print(real_link)
        # Incremental crawl: look real_link up in the version table; if present
        # the data is already current, otherwise fetch and record it.
        # FIX: parameterized query instead of str.format (SQL-injection-safe,
        # consistent with the insert below).
        sel = 'select * from version where link=%s'
        self.cursor.execute(sel, [real_link])
        # Non-empty result tuple means the link already exists — skip crawling.
        if self.cursor.fetchall():
            print('数据已是最新')
        else:
            # Fetch the data first, then record real_link in the version table.
            self.get_data(real_link)
            ins = 'insert into version values(%s)'
            self.cursor.execute(ins, [real_link])
            self.db.commit()

    def get_data(self, real_link):
        """Extract (code, name) pairs from the data page and print them."""
        html = requests.get(url=real_link, headers=self.headers).text
        parse_html = etree.HTML(html)
        tr_list = parse_html.xpath('//tr[@height="19"]')
        for tr in tr_list:
            code = tr.xpath('./td[2]/text()')[0]
            name = tr.xpath('./td[3]/text()')[0]
            print(name, code)

    def main(self):
        """Entry point: run one incremental crawl and always release the DB.

        FIX: main() was an empty ``pass`` stub while the __main__ guard called
        get_true_link() directly; the DB connection was never closed.
        """
        try:
            self.get_true_link()
        finally:
            self.cursor.close()
            self.db.close()


if __name__ == '__main__':
    spider = GovementSpider()
    spider.main()
"""Crawl administrative division codes with selenium + chrome.

Selenium executes the JavaScript redirect on the second-level page for us,
so the crawl is simpler than the requests version.
"""
import time

import pymysql
from selenium import webdriver


class GovementSpider:
    def __init__(self):
        self.browser = webdriver.Chrome()
        self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
        # FIX: pymysql >= 1.0 removed positional connect() arguments;
        # use explicit keywords so the script works with current pymysql.
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', db='govdb',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        # Three buckets accumulated during the crawl, for executemany().
        self.province_list = []
        self.city_list = []
        self.county_list = []

    def get_false_url(self):
        """Open the index page, pick the newest data link, crawl if it is new.

        The "fake" (js-redirected) link is sufficient here because selenium
        follows the redirect; the real link is never needed.
        """
        self.browser.get(self.one_url)
        td_list = self.browser.find_elements_by_xpath(
            '//td[@class="arlisttd"]/a[contains(@title,"代码")]')
        if td_list:
            # Keep the element object because we need to click() it.
            two_url_element = td_list[0]
            # Incremental crawl: compare the link against the version table.
            two_url = two_url_element.get_attribute('href')
            sel = 'select * from version where link=%s'
            self.cursor.execute(sel, [two_url])
            result = self.cursor.fetchall()
            if len(result) != 0:
                print('数据已最新,无需爬取')
            else:
                two_url_element.click()
                time.sleep(3)
                # The data page opens in a new window — switch the driver to it.
                # NOTE(review): switch_to_window/find_elements_by_xpath are the
                # selenium-3 API and were removed in selenium 4; migrate to
                # switch_to.window / find_elements(By.XPATH, ...) when upgrading.
                all_handles = self.browser.window_handles
                self.browser.switch_to_window(all_handles[1])
                # Crawl the data, then record two_url so the next run can skip.
                self.get_data()
                ins = 'insert into version values(%s)'
                self.cursor.execute(ins, [two_url])
                self.db.commit()

    def get_data(self):
        """Extract codes from the data page and bucket them by admin level."""
        tr_list = self.browser.find_elements_by_xpath('//tr[@height="19"]')
        for tr in tr_list:
            code = tr.find_element_by_xpath('./td[2]').text.strip()
            name = tr.find_element_by_xpath('./td[3]').text.strip()
            print(name, code)
            # Classify by level and append to the matching table bucket:
            #   xx0000 -> province (municipalities also count as their own city)
            #   xxxx00 -> city     (parent is the province xx0000)
            #   else   -> county   (parent is the city xxxx00)
            if code[-4:] == '0000':
                self.province_list.append([name, code])
                if name in ['北京市', '天津市', '上海市', '重庆市']:
                    city = [name, code, code[:2] + '0000']
                    self.city_list.append(city)
            elif code[-2:] == '00':
                city = [name, code, code[:2] + '0000']
                self.city_list.append(city)
            else:
                county = [name, code, code[:4] + '00']
                self.county_list.append(county)
        # All rows collected — write everything in one executemany() pass.
        self.insert_mysql()

    def insert_mysql(self):
        """Replace the province/city/county tables with the freshly crawled data."""
        # Updating means full refresh: clear the old rows first.
        del_province = 'delete from province'
        del_city = 'delete from city'
        del_county = 'delete from county'
        self.cursor.execute(del_province)
        self.cursor.execute(del_city)
        self.cursor.execute(del_county)
        # Bulk-insert the new rows.
        ins_province = 'insert into province values(%s,%s)'
        ins_city = 'insert into city values(%s,%s,%s)'
        ins_county = 'insert into county values(%s,%s,%s)'
        self.cursor.executemany(ins_province, self.province_list)
        self.cursor.executemany(ins_city, self.city_list)
        self.cursor.executemany(ins_county, self.county_list)
        self.db.commit()
        print('数据抓取完成,成功存入数据库')

    def main(self):
        """Entry point: run one crawl and always release every resource.

        FIX: cleanup previously ran only on success — an exception during the
        crawl leaked the Chrome process and the DB connection.
        """
        try:
            self.get_false_url()
        finally:
            self.cursor.close()
            self.db.close()
            self.browser.quit()


if __name__ == "__main__":
    spider = GovementSpider()
    spider.main()