• 民政部网站行政区划代码数据爬取


    '''
        中华人民共和国民政部官网中的行政区划代码爬取:
            技术点:
                1>进入二级页面(数据展示页)时,url发生跳转(js作用的),需要在二级页面源码中找到真实url
                2>数据入库实时更新:保存url,下次爬取时,先对比url,若相同,不更新,否则更新
    '''
    
    import requests
    from lxml import etree
    import re
    import pymysql
    
    
    class GovementSpider:
        """Crawl administrative-division codes from the mca.gov.cn listing page.

        The listing page links to an intermediate page whose source assigns the
        address of the page holding the data to ``window.location.href``. That
        address is stored in the ``version`` table so that a later run can skip
        a page that has already been crawled.
        """

        # Seconds to wait for each HTTP response before giving up.
        timeout = 10

        def __init__(self):
            self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
            }
            # Keyword arguments only: PyMySQL 1.0 no longer accepts positional ones.
            self.db = pymysql.connect(
                host='localhost',
                user='root',
                password='123456',
                database='govermentdb',
                charset='utf8',
            )
            self.cursor = self.db.cursor()

        def _fetch(self, url):
            """Download ``url`` and return the response body decoded as UTF-8."""
            response = requests.get(url=url, headers=self.headers, timeout=self.timeout)
            return response.content.decode('utf-8', 'ignore')

        @staticmethod
        def _extract_real_link(html):
            """Return the URL assigned to ``window.location.href`` in ``html``.

            Returns ``None`` when the page source contains no such assignment.
            """
            match = re.search(r'window\.location\.href\s*=\s*"(.*?)"', html, re.S)
            return match.group(1) if match else None

        def get_false_link(self):
            """Return the link to the intermediate page of the first matching entry.

            The first anchor on the listing page whose title contains
            '以上行政区划代码' wins. Returns ``None`` when no anchor matches.
            """
            parse_html = etree.HTML(self._fetch(self.one_url))
            for a in parse_html.xpath('//a [@class="artitlelist"]'):
                title = (a.get('title') or '').strip()
                href = a.get('href')
                # Anchors without a title or an href cannot be the target entry.
                if href and '以上行政区划代码' in title:
                    return 'http://www.mca.gov.cn' + href
            return None

        def get_true_link(self):
            """Resolve the data page address and crawl it unless already recorded."""
            false_link = self.get_false_link()
            if false_link is None:
                print('未找到行政区划代码链接')
                return

            real_link = self._extract_real_link(self._fetch(false_link))
            if real_link is None:
                print('未找到真实链接')
                return
            print(real_link)

            # Incremental crawl: a link already present in `version` means the
            # data was fetched before. The value is bound as a parameter instead
            # of being formatted into the SQL text.
            self.cursor.execute('select * from version where link=%s', [real_link])
            if self.cursor.fetchall():
                print('数据已是最新')
                return

            # Record the link only after rows were actually extracted, so that a
            # failed parse is retried on the next run.
            if self.get_data(real_link):
                self.cursor.execute('insert into version values(%s)', [real_link])
                self.db.commit()

        def get_data(self, real_link):
            """Print the name and code of every data row on ``real_link``.

            Returns the number of rows printed.
            """
            html = requests.get(url=real_link, headers=self.headers, timeout=self.timeout).text
            parse_html = etree.HTML(html)
            count = 0
            for tr in parse_html.xpath('//tr[@height="19"]'):
                code = tr.xpath('./td[2]/text()')
                name = tr.xpath('./td[3]/text()')
                # Skip rows whose code or name cell carries no text.
                if not code or not name:
                    continue
                print(name[0], code[0])
                count += 1
            return count

        def close(self):
            """Close the cursor and the database connection."""
            self.cursor.close()
            self.db.close()

        def main(self):
            """Run one incremental crawl."""
            self.get_true_link()


    if __name__ == '__main__':
        spider = GovementSpider()
        try:
            spider.main()
        finally:
            # Release the database connection even when the crawl fails.
            spider.close()
    '''
        使用selenium+chrome进行爬取,可以避免js对二级页面链接的渲染,爬取更简单
    '''
    
    from selenium import webdriver
    import time
    import pymysql
    
    
    class GovementSpider:
        """Crawl administrative-division codes with selenium and store them in MySQL.

        The browser follows the link on the listing page itself, so the address
        of the data page never has to be extracted by hand. The link is stored
        in the ``version`` table so a later run can skip an already crawled page.
        """

        def __init__(self):
            # Keyword arguments only: PyMySQL 1.0 no longer accepts positional ones.
            self.db = pymysql.connect(
                host='localhost',
                user='root',
                password='123456',
                database='govdb',
                charset='utf8',
            )
            self.cursor = self.db.cursor()
            self.browser = webdriver.Chrome()
            self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
            # Rows collected for the three executemany() calls.
            self.province_list = []
            self.city_list = []
            self.county_list = []

        def get_false_url(self):
            """Open the listing page and crawl the first matching entry if it is new."""
            self.browser.get(self.one_url)
            # find_elements(by, value) replaces the find_elements_by_* helpers
            # that Selenium 4 removed.
            links = self.browser.find_elements(
                'xpath', '//td[@class="arlisttd"]/a[contains(@title,"代码")]'
            )
            if not links:
                return

            # Keep the element itself because it has to be clicked later.
            link_element = links[0]
            two_url = link_element.get_attribute('href')

            # Incremental crawl: compare the link with the `version` table.
            self.cursor.execute('select * from version where link=%s', [two_url])
            if self.cursor.fetchall():
                print('数据已最新,无需爬取')
                return

            handles_before = set(self.browser.window_handles)
            link_element.click()
            time.sleep(3)
            # Switch only when the click opened a new window; if the page loaded
            # in the current one there is nothing to switch to.
            new_handles = [
                handle for handle in self.browser.window_handles
                if handle not in handles_before
            ]
            if new_handles:
                self.browser.switch_to.window(new_handles[-1])

            # Record the link only after data was stored, so that an empty
            # scrape is retried on the next run.
            if self.get_data():
                self.cursor.execute('insert into version values(%s)', [two_url])
                self.db.commit()

        @staticmethod
        def _classify_rows(rows):
            """Split ``(name, code)`` pairs into province, city and county records.

            Returns ``(provinces, cities, counties)`` where provinces are
            ``[name, code]`` and the other two are ``[name, code, parent_code]``.
            A municipality is listed both as a province and as a city with its
            own code, and its districts point at that code, so every county's
            parent code refers to a city record that is actually produced.
            """
            municipalities = ('北京市', '天津市', '上海市', '重庆市')
            rows = list(rows)
            municipality_prefixes = {
                code[:2] for name, code in rows
                if code[-4:] == '0000' and name in municipalities
            }
            provinces, cities, counties = [], [], []
            for name, code in rows:
                province_code = code[:2] + '0000'
                if code[-4:] == '0000':
                    provinces.append([name, code])
                    if name in municipalities:
                        cities.append([name, code, province_code])
                elif code[-2:] == '00':
                    cities.append([name, code, province_code])
                elif code[:2] in municipality_prefixes:
                    counties.append([name, code, province_code])
                else:
                    counties.append([name, code, code[:4] + '00'])
            return provinces, cities, counties

        def get_data(self):
            """Read the code table from the current page and store it.

            Returns the number of rows read. When the page yields no rows the
            database is left untouched.
            """
            rows = []
            for tr in self.browser.find_elements('xpath', '//tr[@height="19"]'):
                cells = tr.find_elements('xpath', './td')
                if len(cells) < 3:
                    continue
                code = cells[1].text.strip()
                name = cells[2].text.strip()
                # Skip rows whose code or name cell is blank.
                if not code or not name:
                    continue
                print(name, code)
                rows.append((name, code))

            if not rows:
                # Nothing was parsed; do not wipe the existing tables.
                return 0

            provinces, cities, counties = self._classify_rows(rows)
            self.province_list.extend(provinces)
            self.city_list.extend(cities)
            self.county_list.extend(counties)
            # Insert everything at once after the whole page has been read.
            self.insert_mysql()
            return len(rows)

        def insert_mysql(self):
            """Replace the contents of the three tables with the collected rows."""
            try:
                # An update always starts from empty tables.
                for table in ('province', 'city', 'county'):
                    self.cursor.execute('delete from ' + table)
                self.cursor.executemany('insert into province values(%s,%s)', self.province_list)
                self.cursor.executemany('insert into city values(%s,%s,%s)', self.city_list)
                self.cursor.executemany('insert into county values(%s,%s,%s)', self.county_list)
                self.db.commit()
            except Exception:
                # Undo the deletes so a failed insert does not leave empty tables.
                self.db.rollback()
                raise
            print('数据抓取完成,成功存入数据库')

        def main(self):
            """Run the crawl, then release the database connection and the browser."""
            try:
                self.get_false_url()
            finally:
                try:
                    self.cursor.close()
                    self.db.close()
                finally:
                    self.browser.quit()
    
    
    if __name__ == "__main__":
        # Script entry point: build the crawler and run it once.
        GovementSpider().main()
  • 相关阅读:
    mongodb nodemailer
    mongodb session
    mongodb cookie
    mongodb multer
    mongodb operate update and delete
    mongodb find opearate
    echart
    Git学习
    PHP海补知识(11)-- 自定义exception
    ThinkPHP U方法
  • 原文地址:https://www.cnblogs.com/yuxiangyang/p/11245508.html
Copyright © 2020-2023  润新知