• Python: crawling the National Bureau of Statistics 2019 administrative division data into MSSQL


    Install Python 3.7: at the time this article was written, newer Python versions could not use the pymssql package (more recent pymssql releases, 2.2 and later, do ship wheels for newer Pythons).
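
    A quick environment sanity check before running the crawler can save debugging time. This is a minimal sketch; it only assumes that pymssql is installed and importable:

    import sys
    import pymssql

    print(sys.version_info)        # the article targets Python 3.7
    print(pymssql.__version__)     # confirm which pymssql release is installed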

    #!/usr/bin/python
    # -*- coding: UTF-8 -*-
    #   Purpose: fetch province/city/county data from stats.gov.cn
    #   Version: v1.1
    import os
    import requests
    import lxml.etree as etree
    import pymssql
    class chinese_city():
        # Initializer: XPath for the table rows at each administrative level
        # 1 = province, 2 = city, 3 = county, 4 = town, 5 = village
        def __init__(self):
            self.trdic = {
                1: '//tr[@class="provincetr"]',
                2: '//tr[@class="citytr"]',
                3: '//tr[@class="countytr"]',
                4: '//tr[@class="towntr"]',
                5: '//tr[@class="villagetr"]'
            }
     
        def crawl_page(self, url):
            # Fetch one page; the site serves GBK-encoded HTML
            try:
                headers = {
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
                }
                # note: requests timeouts are in seconds, so 1000 is effectively "wait very long"
                html = requests.get(url, headers=headers, timeout=1000)
                html.encoding = 'gbk'
                print(str(html.status_code) + ' ' + url)
                return html.text
            except requests.exceptions.RequestException:
                print('request failed: ' + url)
                return None
        
        # Parse a child page according to trid (the administrative level)
        def parse(self, trid, pid, url):
            if url.strip() == '':
                return None
            else:
                html = self.crawl_page(url)
                if html is None:  # request failed; skip this branch
                    return None
                tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
                if trid == 3:
                    nodes = tree.xpath(self.trdic.get(trid))
                    if len(nodes) == 0:
                        # a few city pages skip the county level and list towns directly
                        nodes = tree.xpath(self.trdic.get(4))
                        print('city with towns as direct children: ' + url)
                else:
                    nodes = tree.xpath(self.trdic.get(trid))
                path = os.path.basename(url)
                base_url = url.replace(path, '')
                values = []
                for node in nodes:
                    nexturl = node.xpath('./td[1]/a/@href')
                    # rows without a link have no child page; they are skipped here
                    if len(nexturl) > 0:
                        code = node.xpath('./td[1]/a/text()')
                        if len(code) == 0:
                            code = node.xpath('./td[1]/text()')
                        name = node.xpath('./td[2]/a/text()')
                        if len(name) == 0:
                            name = node.xpath('./td[2]/text()')
                        value = {}
                        value['url'] = base_url + "".join(nexturl)
                        value['code'] = "".join(code)
                        value['name'] = "".join(name)
                        value['pcode'] = pid
                        value['grade'] = trid
                        value['cat'] = 0
                        values.append(value)
                return values
     
        # Parse a village (bottom-level) page; these rows carry an extra classification column
        def parseVillager(self, trid, pid, url):
            if url.strip() == '':
                return None
            else:
                html = self.crawl_page(url)
                if html is None:  # request failed; skip this branch
                    return None
                tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
                nodes = tree.xpath(self.trdic.get(trid))
                values = []
                for node in nodes:
                    code = node.xpath('./td[1]/text()')
                    cate = node.xpath('./td[2]/text()')  # urban/rural classification code
                    name = node.xpath('./td[3]/text()')
                    value = {}
                    value['code'] = "".join(code)
                    value['name'] = "".join(name)
                    value['pcode'] = pid
                    value['grade'] = trid
                    # join and convert: the original stored the raw xpath list, which breaks the int DB column
                    value['cat'] = int("".join(cate))
                    values.append(value)
                return values
     
        # Insert a batch of parsed rows into the database
        def insert_to_db(self, records):
            conn = pymssql.connect(
                host=r'127.0.0.1',
                user=r'sa',
                password='123',
                database='areadb'
            )
            cursor = conn.cursor()
            try:
                param = []
                for p in records:
                    param.append((p.get("code"), p.get("grade"), p.get("pcode"), p.get("name").strip(), p.get("cat")))
                # name the columns explicitly so the insert matches the table definition below
                sql = 'INSERT INTO fetch_area (code, grade, pcode, name, cat) VALUES (%s, %d, %s, %s, %d)'
                cursor.executemany(sql, param)
                conn.commit()
            except Exception as e:
                conn.rollback()
                print(e)
            finally:
                if cursor:
                    cursor.close()
                if conn:
                    conn.close()
     
        # Run the full crawl from the top down: city -> county -> town -> village
        # Note: hard-coded to a single province (52 = Guizhou); adjust the code and URL to crawl others
        def parseChineseCity(self):
            citys = self.parse(2, '520000000000', 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/52.html')
            if citys is not None:
                self.insert_to_db(citys)
                for city in citys:
                    countys = self.parse(3, city['code'], city['url'])
                    if countys is not None:
                        self.insert_to_db(countys)
                        for county in countys:
                            towns = self.parse(4, county['code'], county['url'])
                            if towns is not None:
                                self.insert_to_db(towns)
                                for town in towns:
                                    villagers = self.parseVillager(5, town['code'], town['url'])
                                    if villagers is not None:
                                        self.insert_to_db(villagers)
     
    if __name__ == '__main__':
        crawler = chinese_city()
        crawler.parseChineseCity()
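
    A crawl over every county, town, and village makes thousands of sequential requests, so transient failures are likely on a long run. One way to harden it is a retry wrapper with a growing delay, sketched here with a hypothetical fetch_with_retry helper that is not part of the original code:

    import time

    def fetch_with_retry(crawler, url, retries=3, backoff=2.0):
        # Hypothetical helper: retry crawl_page a few times, backing off between attempts
        for attempt in range(retries):
            html = crawler.crawl_page(url)
            if html is not None:
                return html
            time.sleep(backoff * (attempt + 1))
        return None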

    Create the database table:

    CREATE TABLE [dbo].[fetch_area](
        [code] [varchar](16) NULL,
        [grade] [int] NULL,
        [pcode] [varchar](16) NULL,
        [name] [nvarchar](64) NULL,
        [cat] [int] NULL
    )
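
    Once the crawl finishes, the pcode column links every row to its parent code, so the hierarchy can be walked with plain queries. A minimal sketch using the connection settings hard-coded above; the sample pcode '520100000000' (Guiyang) is illustrative and assumes the Guizhou crawl:

    import pymssql

    conn = pymssql.connect(host='127.0.0.1', user='sa', password='123', database='areadb')
    cursor = conn.cursor()
    # list the counties (grade 3) under one city by matching on the parent code
    cursor.execute(
        'SELECT code, name FROM fetch_area WHERE pcode = %s AND grade = %d',
        ('520100000000', 3))
    for code, name in cursor:
        print(code, name)
    conn.close()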
  • Original article: https://www.cnblogs.com/hllive/p/12992858.html