• python3爬虫全国地址信息


    PHP方式写的一团糟,所以就用python3重写了一遍。因为是第二次写了,思路也更清晰了些。
    提醒:可能会有502的错误,所以做了异常以及数据库事务处理,暂时没有想到更好的优化方法,所以就先这样吧。待更懂python再进一步优化哈
    欢迎留言赐教~

    
    #!C:\Users\12550\AppData\Local\Programs\Python\Python37\python.exe
    # -*- coding: utf-8 -*-
    
    
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import pymysql
    import urllib.request
    import re
    from urllib.error import URLError, HTTPError
    
    
    # Module-level MySQL connection and cursor shared by every function below.
    # NOTE(review): credentials and DB name are hard-coded -- move to config/env for real use.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='test', charset='utf8')
    db = conn.cursor()
    
    # NOTE(review): declared here (and referenced via `global` in get_html) but
    # never assigned or read anywhere in this file -- appears to be unused.
    curr_url = ''
    
    # Fetch a page and parse it.
    def get_html(url):
        """Fetch *url* with a browser-like User-Agent and return the parsed page.

        Returns a BeautifulSoup tree built with the stdlib "html.parser",
        decoding the body as GBK (the stats.gov.cn pages are GBK-encoded).
        Propagates urllib.error.HTTPError / URLError on network failure.
        """
        # Removed the dead `global curr_url` statement: curr_url was never
        # assigned or read inside this function.
        user_agent = 'Mozilla/6.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.6796.99 Safari/537.36'
        # Build the request, then fetch it -- the original reused the single
        # name `response` for both the Request object and the HTTP response.
        request = urllib.request.Request(url)
        request.add_header('User-Agent', user_agent)
        response = urllib.request.urlopen(request)

        html = BeautifulSoup(response.read(), "html.parser", from_encoding='gbk')
        return html
    
    
    def get_list(url, level=1, pid=0, get_level=2):
        """Recursively crawl region rows from a stats.gov.cn division page.

        url       -- page to scrape (stored with each inserted row)
        level     -- administrative level: 1=province .. 5=village
        pid       -- database id of the parent region (0 at the top level)
        get_level -- deepest level to recurse into

        Inserts rows into the `region` table, committing once per province
        and rolling back on HTTPError (the site frequently answers 502).
        Returns an (always empty) list, matching the original contract.
        """
        data = []
        level_arr = {'1': 'provincetr', '2': 'citytr', '3': 'countytr', '4': 'towntr', '5': 'villagetr'}

        try:
            print(url)
            html = get_html(url)
            c_url = url

            tr_list = html.findAll('tr', {'class': level_arr[str(level)]})
            for tr in tr_list:
                region_name, href, page = '', '', ''
                # Guard against NameError: the original left last_id unbound
                # when a <td> had a link before any name row was inserted.
                last_id = pid
                td_list = tr.findAll('td')
                for td in td_list:
                    region_name = td.get_text()
                    # Skip provinces that are already in the database.
                    if level == 1:
                        # Parameterized query -- the original concatenated the
                        # scraped text straight into the SQL string.
                        db.execute("select * from region where region_name=%s", (region_name,))
                        if db.fetchone():
                            continue

                    # Pure-digit cells are region codes, not names -- skip them.
                    if region_name.isdigit():
                        continue

                    if region_name:
                        # Parameterized insert: injection-safe and no longer
                        # broken by names containing quote characters. Insert
                        # c_url (the page being scraped) rather than the
                        # rebindable parameter -- see child_url note below.
                        db.execute(
                            "insert into region(region_name,pid,level,url) value(%s,%s,%s,%s)",
                            (region_name, str(pid), str(level), c_url))
                        # cursor.lastrowid replaces the original extra
                        # "SELECT LAST_INSERT_ID()" round trip.
                        last_id = db.lastrowid

                    if td.a:
                        page = td.a.attrs['href']
                        # Restored backslashes lost in transcription: the
                        # unescaped r'w*.html' matched only ".html" and
                        # produced mangled child URLs like "index11.html".
                        pattern = re.compile(r'\w*\.html')
                        # Bind the child link to its own name: the original
                        # rebound `url`, so later inserts within the same page
                        # stored a child link instead of the current page URL.
                        child_url = re.sub(pattern, page, c_url)

                        if level <= get_level:
                            get_list(child_url, level + 1, last_id)

                # Commit after each province so a failure loses at most one.
                if level == 1:
                    conn.commit()
            return data
        except HTTPError as e:
            # Roll back the half-finished province on a failed request.
            conn.rollback()
            print(e)  # e.g. HTTP Error 502: Proxy Error
    
    
    # Entry point: crawl starting from the 2017 national division index page,
    # descending as far as get_list's default get_level allows.
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
    get_list(url)
    print('执行完成')
    
    
    

    来源:https://blog.csdn.net/qq_27974479/article/details/85634360

  • 相关阅读:
    Qt之QLabel
    在Servlet中使用spring注入的bean
    Matlab中图片保存的5种方法
    LATEX中优化问题如何排列Max——s.t.格式
    Latex 初学者入门(四)-- 多个作者共享同一个地址
    一份不太简短的LaTeX教程 lshort – A short in­tro­duc­tion to LATEX 2elshort – A short in­tro­duc­tion to LATEX 2e
    LaTeX技巧:LaTeX括号总结
    Bibtex使用方法
    Latex初学者入门(三)-- 用BibTeX生成参考文献
    LaTeX之参考文献的写法
  • 原文地址:https://www.cnblogs.com/qixidi/p/10229236.html
Copyright © 2020-2023  润新知