I wrote a small crawler; why does it always throw errors as soon as I add a proxy IP?


    import urllib.request
    import re
    import os
    import random
    import threading

    def url_open(url):  # The commented-out proxy block below never runs correctly; the proxy IPs came from a free online proxy list.
        #ips = ['117.136.234.12:80', '218.189.26.20:8080','202.194.101.150:80','180.166.112.47:8888']

        #proxy = urllib.request.ProxyHandler({'http':random.choice(ips)})#{'http':'124.202.174.66:8118'}
        #opener = urllib.request.build_opener(proxy)
        #opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36')]

        #urllib.request.install_opener(opener)

        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0')
        urlobject = urllib.request.urlopen(req)
        response = urlobject.read()
        return response

    def find_page(html):
        # Match the current page number in brackets, e.g. "[1333]"; the brackets and \d must be escaped/used correctly.
        s2 = r'\[\d{4}\]'
        m = re.search(s2, html)
        page = m.group()
        print("find_page")
        return page

    def find_page_link(html):
        # Collect image links like http://ww3.sinaimg.cn/mw600/xxxx.jpg; dots need escaping and \w+ matches the file name.
        s = r'http://ww[0-9]\.sinaimg\.cn/mw600/\w+\.jpg'
        m = re.findall(s, html)
        return m

    def save_page(jpg):
        for file in jpg:
            data = url_open(file)
            #print("wwwwwwwwww")
            # Backslashes in the Windows path must be escaped; a trailing "\" would otherwise swallow the closing quote.
            name = 'E:\\作业\\j_d\\' + file.split('/')[-1]
            with open(name, 'wb') as f:
                f.write(data)

    def down_jpg(dir_name='E:\\作业\\j_d', page=10, pages=10):
        #os.mkdir(dir_name)
        os.chdir(dir_name)
        #red = url_open('http://jandan.net/ooxx')
        #print(type(red))
        #red = red.decode('utf-8')

        #page = find_page(red)
        #page = int(page[1:-1])
        #page = 1333
        for i in range(pages):
            page += 1
            url = 'http://jandan.net/ooxx/page-' + str(page) + '#comments'
            print(url)
            data = url_open(url)
            data = data.decode('utf-8')
            print("dddddddddddddd")
            page_list = find_page_link(data)
            #print("sssssssssssssss")
            save_page(page_list)

    if __name__ == '__main__':
        p = threading.Thread(target=down_jpg, args=('E:\\作业\\j_d', 1555, 10))
        c = threading.Thread(target=down_jpg, args=('E:\\作业\\j_d', 1024, 10))
        #down_jpg()
        p.start()
        c.start()

        p.join()
        c.join()
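
The commented-out proxy code follows the standard urllib.request pattern (ProxyHandler, build_opener, install_opener), so the errors most likely come from the free proxies themselves: addresses scraped from free lists are usually dead, very slow, or refuse connections, and a single request through a bad proxy raises URLError or times out. Below is a minimal sketch of a more defensive fetch function; the proxy list is the one from the post (assumed stale placeholders), and the retry count, timeout, and direct-connection fallback are my additions, not part of the original code.

    import random
    import urllib.request

    # Free-proxy addresses copied from the post above; assume they are stale placeholders.
    ips = ['117.136.234.12:80', '218.189.26.20:8080',
           '202.194.101.150:80', '180.166.112.47:8888']

    def url_open_with_proxy(url, retries=3):
        """Fetch url through a random proxy, retrying on failure and
        falling back to a direct connection if every proxy attempt fails."""
        headers = {'User-Agent': 'Mozilla/5.0'}
        for _ in range(retries):
            proxy = random.choice(ips)
            opener = urllib.request.build_opener(
                urllib.request.ProxyHandler({'http': proxy}))
            opener.addheaders = list(headers.items())
            try:
                # A short timeout keeps a dead proxy from hanging the thread.
                return opener.open(url, timeout=10).read()
            except OSError as err:  # URLError and socket timeouts both derive from OSError
                print('proxy %s failed: %s' % (proxy, err))
        # Last resort: request the page directly, without any proxy.
        req = urllib.request.Request(url, headers=headers)
        return urllib.request.urlopen(req, timeout=10).read()

Building the opener per call and using opener.open() instead of urllib.request.install_opener() also keeps each call's proxy choice local, rather than swapping the process-wide opener while both download threads are running.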
Original post: https://www.cnblogs.com/nethk/p/4825131.html