• 国家统计局区划码爬取


    目标数据

    oracle存储表格

    -- Create table
    -- One row per administrative division written by the crawler's INSERT
    -- statements (area_code, area_name, city_village_code).
    -- No TABLESPACE clause is given on purpose: the table is created in the
    -- owning schema's default tablespace instead of SYSTEM, which should be
    -- left to the Oracle data dictionary.
    -- NOTE(review): the column comment describes id as an auto-increment ID,
    -- but nothing in this DDL populates it and the crawler's INSERT does not
    -- supply it, so id stays NULL. Add a sequence/trigger or an identity
    -- column once the target Oracle version is confirmed.
    create table VILLAGE_CODE
    (
      id                INTEGER,
      area_code         VARCHAR2(500),
      city_village_code VARCHAR2(500),
      area_name         VARCHAR2(500)
    )
      pctfree 10
      pctused 40
      initrans 1
      maxtrans 255
      storage
      (
        initial 64K
        next 1M
        minextents 1
        maxextents unlimited
      );
    -- Add comments to the columns 
    comment on column VILLAGE_CODE.id
      is '自增ID';
    comment on column VILLAGE_CODE.area_code
      is '统计用区划代码';
    comment on column VILLAGE_CODE.city_village_code
      is '城乡分类代码    ';
    comment on column VILLAGE_CODE.area_name
      is '名称';

    爬取代码

    #!/usr/bin/env python
    # encoding: utf-8
    '''
    @author: lurenjia
    @contact: 1499418300@qq.com
    @file: areacode.py
    @time: 2018/9/29 14:40
    @desc:
    '''
    
    import urllib2, re
    from time import sleep
    from random import random
    from config import DBSession
    
    
    # HTTP headers sent with every request. The value must be the bare
    # user-agent string: the header name is already the dict key, so repeating
    # "User-Agent: " inside the value would send "User-Agent: User-Agent: ...".
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    # Single module-wide database session (factory imported from config);
    # insertVillage executes and commits every row through it.
    session = DBSession()
    
    
    def insertVillage(code, name, city_village_code='-1'):
        print code, name, city_village_code
        session.execute("insert into village_code(area_code, area_name, city_village_code) VALUES ('%s','%s','%s')" %(code, name, city_village_code))
        session.commit()
    
    
    def openUrl(url, type):
        """Download url and return its body decoded from GBK, or None on failure.

        Args:
            url: absolute URL to fetch.
            type: crawl level of the caller (1-5 in this script); it is only
                written next to the URL in error.txt when the fetch fails.

        Returns:
            The decoded page text, or None if the request or decoding failed.
        """
        try:
            # Random pause of up to 0.5 s before each request.
            sleep(random()*0.5)
            request = urllib2.Request(url,headers=headers)
            return urllib2.urlopen(request,timeout=10).read().decode('gbk')
        except Exception:
            # Best-effort crawl: note the failed URL and its level, then carry on.
            # Catching Exception (not a bare except) lets Ctrl-C / SystemExit
            # stop the crawler instead of being swallowed.
            with open('error.txt', 'a+') as f:
                f.write(url+'                   '+str(type)+'\n')
            return None
        
        
    def parseCode1(baseUrl, lastUrl):
        """Level 1: read the province index page and descend into each province.

        Every link found inside a 'provincetr' row is handed to parseCode2
        together with the unchanged baseUrl. Nothing is inserted at this level.
        """
        html = openUrl(baseUrl+lastUrl,1)
        if not html:
            return
        provinceRows = re.findall("<tr class='provincetr'>.+?</tr>", html)
        for row in provinceRows:
            for link in re.findall("<a href='(.+?html)'>(.+?)<br/>", row):
                href = link[0]
                parseCode2(baseUrl, href)
            
    
    def parseCode2(baseUrl, lastUrl):
        """Level 2: store every linked row of a province page, then descend.

        For each 'citytr' row with two anchors, the first anchor's text is
        inserted as the code and the second anchor's text as the name; the
        anchor's href is then followed by parseCode3.
        """
        html = openUrl(baseUrl + lastUrl,2)
        if not html:
            return
        anchorPattern = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
        for row in re.findall("<tr class='citytr'>.+?</tr>", html):
            for href, code, name in re.findall(anchorPattern, row):
                insertVillage(code, name)
                parseCode3(baseUrl, href)
            
    
    def parseCode3(baseUrl, lastUrl):
        """Level 3: store every linked 'countytr' row of a page, then descend.

        The first path segment of lastUrl is moved onto baseUrl so that the
        relative links found on the fetched page resolve against that folder.
        """
        folder, _, page = lastUrl.partition('/')
        baseUrl = baseUrl + folder + '/'
        html = openUrl(baseUrl + page,3)
        if not html:
            return
        anchorPattern = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
        for row in re.findall("<tr class='countytr'>.+?</tr>", html):
            for href, code, name in re.findall(anchorPattern, row):
                insertVillage(code, name)
                parseCode4(baseUrl, href)
            
    
    def parseCode4(baseUrl, lastUrl):
        """Level 4: store every linked 'towntr' row of a page, then descend.

        The first path segment of lastUrl is moved onto baseUrl so that the
        relative links found on the fetched page resolve against that folder.
        """
        folder, _, page = lastUrl.partition('/')
        baseUrl = baseUrl + folder + '/'
        html = openUrl(baseUrl + page,4)
        if not html:
            return
        anchorPattern = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
        for row in re.findall("<tr class='towntr'>.+?</tr>", html):
            for href, code, name in re.findall(anchorPattern, row):
                insertVillage(code, name)
                parseCode5(baseUrl, href)
            
    
    def parseCode5(baseUrl, lastUrl):
        """Level 5: store every 'villagetr' row of a page (leaf level).

        Each row's three cells are, in page order, the value stored as
        area_code, the value stored as city_village_code and the value stored
        as area_name. No further links are followed.
        """
        folder, _, page = lastUrl.partition('/')
        baseUrl = baseUrl + folder + '/'
        html = openUrl(baseUrl + page,5)
        if not html:
            return
        cellPattern = "<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>"
        for row in re.findall("<tr class='villagetr'>.+?</tr>", html):
            for areaCode, classCode, areaName in re.findall(cellPattern, row):
                insertVillage(areaCode, areaName, classCode)
            
    
    if __name__=="__main__":
        # Entry point: start the depth-first crawl at the 2017 index page.
        # baseUrl keeps its trailing slash because page paths are appended to it.
        baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        parseCode1(baseUrl, 'index.html')

    分布式爬取

    纯手写

    #!/usr/bin/env python
    # encoding: utf-8
    '''
    @author: lurenjia
    @contact: 1499418300@qq.com
    @file: areacode.py
    @time: 2018/9/29 14:40
    @desc:
    '''
    
    import urllib2, re, os, redis
    from time import sleep
    from random import random
    from sqlalchemy import *
    from sqlalchemy.orm import sessionmaker
    from multiprocessing import Process
    
    # Set NLS_LANG before the engine is created.
    # NOTE(review): presumably this makes the Oracle client exchange text as UTF-8 - confirm.
    os.environ['NLS_LANG'] = 'AMERICAN_AMERICA.AL32UTF8'
    # SQLAlchemy engine for the Oracle instance (credentials and host are
    # placeholders) with pool_size=100.
    engine = create_engine('oracle://xxx:xxx@xxx:1521/xe', pool_size=100, encoding='utf8')
    # Session factory bound to the engine, and the single module-wide session
    # that insertVillage executes and commits through.
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    # Redis client (host is a placeholder) holding the per-level URL queues
    # area_code2..area_code5 and the area_code_error list.
    pool = redis.ConnectionPool(host='xxx', port=6379)
    MRedis = redis.Redis(connection_pool=pool)
    
    # HTTP headers sent with every request. The value must be the bare
    # user-agent string: the header name is already the dict key, so repeating
    # "User-Agent: " inside the value would send "User-Agent: User-Agent: ...".
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    
    
    def insertVillage(code, name, city_village_code='-1'):
        print code, name, city_village_code
        session.execute("insert into village_code(area_code, area_name, city_village_code) VALUES ('%s','%s','%s')" %(code, name, city_village_code))
        session.commit()
    
    
    def openUrl(url):
        """Download url and return its body decoded from GBK, or None on failure.

        Args:
            url: absolute URL to fetch.

        Returns:
            The decoded page text, or None if the request or decoding failed;
            in that case the URL is pushed onto the 'area_code_error' Redis list.
        """
        try:
            # Random pause of up to 0.5 s before each request.
            sleep(random() * 0.5)
            request = urllib2.Request(url, headers=headers)
            return urllib2.urlopen(request, timeout=10).read().decode('gbk')
        except Exception:
            # Best-effort crawl: remember the failed URL and carry on.
            # Catching Exception (not a bare except) lets Ctrl-C / SystemExit
            # stop a worker instead of being swallowed.
            MRedis.lpush('area_code_error', url)
            return None
    
    
    def run():
        """Worker loop: drain the per-level URL queues in Redis forever.

        Each pass empties 'area_code2', then 'area_code3', 'area_code4' and
        'area_code5' in that order, fetching every popped URL and handing the
        page to the parser for that level. The function never returns.
        """
        # (queue name, parser) pairs in crawl order. Every parser is invoked as
        # parser(html, url); parseCode5 is wrapped because it is called with the
        # page body only.
        stages = (
            ('area_code2', parseCode2),
            ('area_code3', parseCode3),
            ('area_code4', parseCode4),
            ('area_code5', lambda html, url: parseCode5(html)),
        )
        while True:
            handledAny = False
            for queueName, parse in stages:
                url = MRedis.lpop(queueName)
                while url:
                    handledAny = True
                    html = openUrl(url)
                    if html:
                        parse(html, url)
                    url = MRedis.lpop(queueName)
            if not handledAny:
                # All queues were empty: back off briefly instead of polling
                # Redis in a tight loop.
                sleep(1)
    
    
    def parseCode1(baseUrl, lastUrl):
        """Seed step: read the province index and queue every province page.

        Each link found inside a 'provincetr' row is pushed, as an absolute
        URL, onto the 'area_code2' Redis list.
        """
        html = openUrl(baseUrl+lastUrl)
        if not html:
            return
        provinceRows = re.findall("<tr class='provincetr'>.+?</tr>", html)
        for row in provinceRows:
            for link in re.findall("<a href='(.+?html)'>(.+?)<br/>", row):
                MRedis.lpush('area_code2', baseUrl+link[0])
    
    
    def parseCode2(html, url):
        """Store every linked 'citytr' row of a page and queue its child page.

        Args:
            html: decoded body of the page.
            url: URL the page was fetched from; child links are resolved
                against it with its last path segment removed.
        """
        parentUrl = '/'.join(url.split('/')[:-1])
        anchorPattern = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
        for row in re.findall("<tr class='citytr'>.+?</tr>", html):
            for href, code, name in re.findall(anchorPattern, row):
                insertVillage(code, name)
                MRedis.lpush('area_code3', parentUrl +'/'+ href)
    
    
    def parseCode3(html, url):
        """Store every linked 'countytr' row of a page and queue its child page.

        Args:
            html: decoded body of the page.
            url: URL the page was fetched from; child links are resolved
                against it with its last path segment removed.
        """
        parentUrl = '/'.join(url.split('/')[:-1])
        anchorPattern = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
        for row in re.findall("<tr class='countytr'>.+?</tr>", html):
            for href, code, name in re.findall(anchorPattern, row):
                insertVillage(code, name)
                MRedis.lpush('area_code4', parentUrl + '/' + href)
    
    
    def parseCode4(html, url):
        """Store every linked 'towntr' row of a page and queue its child page.

        Args:
            html: decoded body of the page.
            url: URL the page was fetched from; child links are resolved
                against it with its last path segment removed.
        """
        parentUrl = '/'.join(url.split('/')[:-1])
        anchorPattern = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
        for row in re.findall("<tr class='towntr'>.+?</tr>", html):
            for href, code, name in re.findall(anchorPattern, row):
                insertVillage(code, name)
                MRedis.lpush('area_code5', parentUrl + '/' + href)
    
    
    def parseCode5(html, url=None):
        """Store every 'villagetr' row of a page (leaf level).

        Args:
            html: decoded body of the page.
            url: unused; accepted so this parser can be called with the same
                (html, url) arguments as parseCode2-4, which is how run()
                invokes it. Calling with html alone still works.

        Each row's three cells are, in page order, the value stored as
        area_code, the value stored as city_village_code and the value stored
        as area_name.
        """
        for tr in re.findall("<tr class='villagetr'>.+?</tr>", html):
            for td in re.findall("<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>", tr):
                insertVillage(td[0], td[2], td[1])
            
    
    if __name__=="__main__":
        # Seed step: read the 2017 index page and push the province page URLs
        # onto the 'area_code2' Redis list.
        baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        parseCode1(baseUrl, 'index.html')
        # Worker launch is left disabled: uncomment to start three processes
        # that each execute run() and consume the Redis queues.
        # p1 = Process(target=run)
        # p1.start()
        # p2 = Process(target=run)
        # p2.start()
        # p3 = Process(target=run)
        # p3.start()
  • 相关阅读:
    Cocos2d-x 3.0 beta 中加入附加项目,解决无法打开包括文件:“extensions/ExtensionMacros.h”: No such file or directory”
    C、Shell、Perl基于Tomcat开发CGI程序环境配置
    Windows机器配置启动加载器的高级选项后,机器出现蓝屏,无法RDP
    Linux由于物理节点故障导致的异常重启-Case1
    Azure经典虚拟机(Windows)如何监测单个磁盘的使用空间
    ARM VM安装Linux Diagnostic 2.3扩展
    rsyslog服务日志报错分析1
    登陆Linux服务器时触发邮件提醒
    部署Azure Log Analytics
    获取指定订阅下所有Azure ARM虚拟机配置(CPU核数,内存大小,磁盘信息)的使用情况
  • 原文地址:https://www.cnblogs.com/lurenjia1994/p/9724372.html
Copyright © 2020-2023  润新知