• Basic proxy-pool scraper (Python coroutines) -- target site: Xici (xicidaili; more sites to be added later)


    # coding = utf-8

    __author__ = 'litao'

    # Monkey-patch the standard library first, so the sockets that
    # urllib uses later are gevent-cooperative.
    from gevent import monkey
    monkey.patch_all()

    import urllib.request
    import gevent
    from bs4 import BeautifulSoup
    import time
    import random

    home = "http://www.xicidaili.com/wt/"
    first_proxy_list = []  # proxies scraped from the listing pages
    end_proxy_list = []    # proxies that passed the reachability test
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36"
    }

    def test_proxy(proxy_key):
        """Fetch a test page through the proxy; keep it if the request succeeds."""
        print(proxy_key)
        proxy = {"http": proxy_key}
        # Use an http:// URL so the {"http": ...} proxy is actually applied;
        # an https:// URL would bypass an http-only ProxyHandler.
        url = "http://www.baidu.com/"

        proxy_support = urllib.request.ProxyHandler(proxy)
        opener = urllib.request.build_opener(proxy_support)
        req = urllib.request.Request(url=url, headers=headers)
        try:
            # Open through this opener directly; install_opener() would set a
            # process-wide default that the concurrent greenlets race over.
            response = opener.open(req, timeout=5)
            if response.code == 200:
                end_proxy_list.append(proxy_key)
        except Exception as e:
            print("error:", e)

    def get_proxy_list():
        """Scrape the first 20 listing pages and collect ip:port strings."""
        for page in range(20):
            url = home + str(page + 1)
            print(url)
            req = urllib.request.Request(url=url, headers=headers)
            response = urllib.request.urlopen(req, timeout=20).read().decode()
            soup = BeautifulSoup(response, 'html.parser')
            rows = soup.find_all("table", attrs={"id": "ip_list"})[0].find_all('tr')[1:]
            for row in rows:
                cells = row.find_all('td')
                proxy_enum = cells[1].text + ":" + cells[2].text
                print(proxy_enum)
                first_proxy_list.append(proxy_enum)
            # Throttle between pages to avoid getting banned by the site.
            time.sleep(random.randint(120, 240))

    def join_gevent(first_proxy_list, gevent_list):
        # Spawn one greenlet per proxy so the checks run concurrently.
        for proxy_key in first_proxy_list:
            gevent_list.append(gevent.spawn(test_proxy, proxy_key))

    def main():
        gevent_list = []
        get_proxy_list()
        with open("proxy_first.txt", 'a', encoding='utf-8') as f:
            for item in first_proxy_list:
                f.write(item + '\n')
        join_gevent(first_proxy_list, gevent_list)
        gevent.joinall(gevent_list)
        print(end_proxy_list)
        with open("proxy_end.txt", 'a', encoding='utf-8') as f:
            for item in end_proxy_list:
                f.write(item + '\n')

    if __name__ == "__main__":
        main()
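
After a run, proxy_end.txt holds one verified ip:port per line. As a minimal sketch of how that file might be consumed (this snippet is not part of the original post, and the target URL is an arbitrary example):

    import random
    import urllib.request

    # Load the verified proxies written by the script above.
    with open("proxy_end.txt", encoding="utf-8") as f:
        proxies = [line.strip() for line in f if line.strip()]

    # Route a request through a randomly chosen verified proxy.
    proxy_key = random.choice(proxies)
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({"http": proxy_key})
    )
    response = opener.open("http://www.baidu.com/", timeout=5)
    print(proxy_key, response.code)

Building a dedicated opener keeps the proxy choice local to this one request, instead of changing the process-wide default via install_opener().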
• Original post: https://www.cnblogs.com/crawer-1/p/7638799.html