• 1.4.3 ID遍历爬虫(每天一更)


    # -*- coding: utf-8 -*-
    '''
    Created on 2019年5月7日
    
    @author: 薛卫卫
    '''
    import itertools
    import urllib.request
    import re
    
    def download(url, user_agent="wswp", num_retries=2):
        """Fetch ``url`` and return the raw response body, or ``None`` on failure.

        Args:
            url: Address to fetch.
            user_agent: Value sent in the ``User-agent`` request header.
            num_retries: Number of further attempts allowed after an error
                that carries a 5xx ``code``. Other errors are not retried.

        Returns:
            Whatever ``read()`` returns for the opened response, or ``None``
            when the download failed and no retry succeeded.
        """
        print("Downloading: " , url)
        headers = {'User-agent': user_agent}
        request = urllib.request.Request(url, headers=headers)
        try:
            # Use the response as a context manager so it is always closed,
            # even when read() raises part-way through.
            with urllib.request.urlopen(request) as response:
                return response.read()
        except urllib.request.URLError as e:
            print('Download error:' , e.reason)
            # Only errors exposing a 5xx status code are retried; anything
            # else (e.g. 4xx, unreachable host) gives up immediately.
            retryable = hasattr(e, 'code') and 500 <= e.code < 600
            if num_retries > 0 and retryable:
                return download(url, user_agent, num_retries - 1)
            return None
    
    # Walk the record IDs 1, 2, 3, ... and stop at the first ID whose
    # download fails.
    for page in itertools.count(start=1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            # a failed download ends the crawl
            break
        # success - can scrape the result
        
    #     
    # # maximum number of consecutive download errors allowed
    # max_errors = 5
    # # current number of consecutive download errors
    # num_errors = 0
    # for page in itertools.count(1):
    #     url = 'http://example.webscraping.com/view/-%d' % page
    #     html = download(url)
    #     if html is None:
    #         # received an error trying to download this webpage
    #         num_errors +=1
    #         if num_errors == max_errors:
    #             # reached maximum number of
    #             # consecutive errors so exit
    #             break
    #     else:
    #         # success - can scrape the result
    #         # ...
    #         num_errors = 0
    

      

  • 相关阅读:
    HTML 网页创建
    CSS3 opacity
    两数相加的和
    九九乘法表
    Linux下的Makefile初入
    linux 下定义寄存器宏 实现类似于STM32的寄存器操作
    Linux 编译与交叉编译
    linux IMX6 汇编点亮一个LED灯
    Linux基本指令与作用
    C# Task 源代码阅读(2)
  • 原文地址:https://www.cnblogs.com/xww115/p/10835223.html
Copyright © 2020-2023  润新知