• Python抓取网页例子


    功能

    1. 一级分页列表页, 二级数据页
    2. 不定表头, 写入CSV
    3. 正则匹配, 在()中使用?:实现只匹配, 不捕获
    4. HTTP头设置
    #!/usr/bin/python3
    # -*- coding: UTF-8 -*-
    
    import re
    import time
    import requests
    import csv
    
    # One shared HTTP session so cookies and pooled connections are reused
    # across all requests. requests.Session() is the supported constructor;
    # the lowercase requests.session() factory is a deprecated alias.
    session = requests.Session()

    # Browser-like request headers sent with every GET issued via request_get().
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
        'Referer': 'http://jtgl.beijing.gov.cn/jgj/93950/check_car/ycl/index.html',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.5,en;q=0.3,de-DE;q=0.2',
        'Accept-Encoding': 'gzip, deflate',
        'Cache-Control': 'max-age=0',
    }

    # Scheme + host that every list-page path and detail-page href is appended to.
    base_link = 'http://somewhere'
    
    # Reference only: a raw HTTP GET request for one list page, kept as a bare
    # string literal (it is evaluated and discarded, never sent or parsed).
    # Several of its header values are the same as those in the `header` dict.
    '''
    GET /jgj/93950/check_car/ycl/22719550-2.html HTTP/1.1
    Host: jtgl.beijing.gov.cn
    User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
    Accept-Language: en-US,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.5,en;q=0.3,de-DE;q=0.2
    Accept-Encoding: gzip, deflate
    Referer: http://jtgl.beijing.gov.cn/jgj/93950/check_car/ycl/index.html
    Connection: keep-alive
    Cookie: _trs_uv=kde6gvt0_365_j8m; _va_id=ec5b1ecd783dfc52.1596438844.30.1615800849.1615799023.; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22173b32ba72347-0efda1230057718-4c302273-2073600-173b32ba72436a%22%7D; _va_ref=%5B%22%22%2C%22%22%2C1615800833%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D54coDM4TQyJUBe5UqX7uRRIy7UXZ8zHoufRE-ufZk-8fMFrJRtMfs_TQ-sPD2UYtZnPmS86B0DUph3QiQ_8j0sKSmN4ZJVuCRurpig3dJ3W%26wd%3D%26eqid%3Db7d847b70004074600000003604f0468%22%5D; __jsluid_h=9c085f396ee2cc91e800331f4d8fd4a8; _va_ses=*
    Upgrade-Insecure-Requests: 1
    If-Modified-Since: Mon, 15 Mar 2021 06:28:19 GMT
    If-None-Match: W/"604efe83-a36a"
    Cache-Control: max-age=0
    '''
    
    def request_get(url, encoding='UTF-8', tout=20, retries=10):
        """Fetch ``url`` with the shared session and return the body as text.

        Args:
            url: Absolute URL to GET.
            encoding: Encoding forced onto the response before decoding.
            tout: Timeout in seconds passed to ``session.get``.
            retries: Maximum number of attempts before giving up.

        Returns:
            The decoded response body, or ``None`` once ``retries`` attempts
            have all failed (after printing 'Exceed retry limit').
        """
        for _ in range(retries):
            # Short pause before every attempt, including the first one.
            time.sleep(0.2)
            try:
                response = session.get(url, timeout=tout, headers=header)
            except requests.ReadTimeout:
                print('ReadTimeout')
            except (requests.ConnectionError, ConnectionError):
                # requests raises its own ConnectionError class; the builtin of
                # the same name is unrelated, so both are listed to make sure
                # connection failures are reported under the right label.
                print('ConnectionError')
            except requests.RequestException:
                print('RequestException')
            else:
                response.encoding = encoding
                return response.text

        print('Exceed retry limit')
        return None
    
    def lv1_to_lv2(page):
        """Return the detail-page hrefs found on list page number ``page``.

        Args:
            page: Page number substituted into the list-page URL.

        Returns:
            A list of href strings, one per ``content_li_title`` entry, in page
            order. The list is empty when nothing matches or when the page
            could not be downloaded.
        """
        link_lv1 = base_link + '/jgj/93950/check_car/ycl/22719550-' + str(page) + '.html'
        content = request_get(link_lv1, 'UTF-8', 20, 10)
        if content is None:
            # request_get ran out of retries; treat the page as having no links
            # instead of crashing inside the regex with a TypeError.
            return []

        # With a single capture group, findall returns just the href values.
        return re.findall(r'<p class="content_li_title"><a href="([^"]+)" onclick', content)
    
    def lv2_to_data(link_lv2):
        """Download one detail page and extract its date and table rows.

        Args:
            link_lv2: Href of the detail page, appended to ``base_link``.

        Returns:
            A dict with keys ``year``, ``month`` and ``date`` (digit strings
            taken from the page title, or the integer 0 when the title is not
            found) and ``rows``, a list of ``{'name': ..., 'value': ...}``
            dicts, one per matched table row. When the page cannot be
            downloaded the date fields are 0 and ``rows`` is empty.
        """
        link_lv2 = base_link + link_lv2
        content = request_get(link_lv2, 'UTF-8', 20, 10)

        data = {'year': 0, 'month': 0, 'date': 0, 'rows': []}
        if content is None:
            # Download failed after all retries; return the placeholder record
            # rather than raising a TypeError from the regex calls below.
            return data

        # The title is matched on the raw HTML because remove_style() strips
        # the class attribute this pattern anchors on. (?:检|验) groups the
        # alternatives without capturing them.
        title = re.search(
            r'<p class="titles">(\d+).?年(\d+)月(\d+)日全市检测场实际(?:检|验)',
            content,
        )
        if title:
            data['year'] = title.group(1)
            data['month'] = title.group(2)
            data['date'] = title.group(3)

        # Strip whitespace and presentational markup so each table row becomes
        # one contiguous <tr><td>..</td>..</tr> run the row pattern can match.
        content = remove_style(content)
        row_pattern = re.compile(
            r'<tr><td>\d+</td>(?:<td>[^<]+</td>)?<td>([^<]+)</td><td>(\d+)</td>(?:<td></td>)?</tr>'
        )
        data['rows'] = [
            {'name': name.replace('&nbsp;', ''), 'value': value}
            for name, value in row_pattern.findall(content)
        ]
        return data
    
    
    # Whitespace runs plus the presentational tags/attributes that are stripped
    # before table parsing. Raw string so \s and \d reach the regex engine.
    _STYLE_PATTERN = re.compile(
        r'(\s+|<span[^>]*>|</span>|<p[^>]*>|</p>|</?strong>|<font[^>]+>|</font>'
        r'|rowspan="\d+"|class="[^"]+"|style="[^"]+"|height="\d+"|width="\d+"'
        r'|x:num="\d+"|x:str="")'
    )


    def remove_style(text):
        """Return ``text`` with whitespace and presentational markup removed.

        Deletes every whitespace run, ``<span>``/``<p>``/``<strong>``/``<font>``
        tags, and the ``rowspan``, ``class``, ``style``, ``height``, ``width``,
        ``x:num`` and ``x:str`` attributes, leaving all other characters as-is.

        Args:
            text: HTML source to clean.

        Returns:
            The cleaned string.
        """
        return _STYLE_PATTERN.sub('', text)
    
    
    # main process
    # Phase 1: crawl list pages 1..80, follow every detail link, and collect
    # the parsed records together with the set of row names seen.
    csv_rows = []
    csv_headers = {}  # dict used as an insertion-ordered set of column names

    for page in range(1, 81):
        print(page)
        for link_lv2 in lv1_to_lv2(page):
            print(link_lv2)
            data = lv2_to_data(link_lv2)
            csv_rows.append(data)
            for row in data['rows']:
                csv_headers.setdefault(row['name'], 1)

    # Phase 2: write one CSV line per detail page. Columns are 'date' followed
    # by every row name in first-seen order; names a page lacks stay empty.
    with open('output.csv', 'w', encoding='utf8', newline='') as csvfile:
        fieldnames = ['date', *csv_headers]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for csv_row in csv_rows:
            row = {
                'date': '{}.{}.{}'.format(csv_row['year'], csv_row['month'], csv_row['date']),
            }
            row.update((item['name'], item['value']) for item in csv_row['rows'])
            writer.writerow(row)
    
    
    
  • 相关阅读:
    Java中的面向对象以及java的基本特性
    Redis数据结构(三):双向链表和压缩链表
    java对象晋升的四种情况
    Redis数据结构(一):对外数据类型和底层数据结构
    InnoDB中的的聚合函数count(?)哪个效率高?
    群智能算法标准测试函数集
    运行vue项目时报错“ValidationError: Progress Plugin Invalid Options”
    uswipeaction 宽度计算的延迟导致组件加载时内部样式错误
    地址智能识别相关插件
    js滑动验证
  • 原文地址:https://www.cnblogs.com/milton/p/14541138.html
Copyright © 2020-2023  润新知