• Scraping Xici proxy IP data with Python 3


    import requests
    from lxml import etree


    # Write the usable proxy IPs to a file
    def write_proxy(proxies):
        print(proxies)
        for proxy in proxies:
            with open("ip_proxy.txt", 'a+') as f:
                print("Writing:", proxy)
                f.write(proxy + '\n')
        print("Done writing!")


    # Parse the page and collect the proxy IPs it lists
    def get_proxy(html):
        # Parse the fetched page
        selector = etree.HTML(html)
        # print(selector.xpath("//title/text()"))
        proxies = []
        # Pull the IP and port out of each table row, skipping the header
        for each in selector.xpath('//table[@id="ip_list"]/tr')[1:]:
            ip = each.xpath("./td[2]/text()")[0]
            port = each.xpath("./td[3]/text()")[0]
            proxy = ip + ":" + port
            proxies.append(proxy)
        print(len(proxies))
        test_proxies(proxies)


    # Check each proxy by requesting Baidu through it and judging
    # usability from the response status code.
    def test_proxies(proxies):
        url = "http://www.baidu.com/"
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        }
        normal_proxies = []
        count = 1
        for proxy in proxies:
            print("Testing proxy #%s..." % count)
            count += 1
            try:
                response = requests.get(url, headers=header, proxies={"http": proxy}, timeout=1)
                if response.status_code == 200:
                    print("Proxy works:", proxy)
                    normal_proxies.append(proxy)
                else:
                    print("Proxy not usable:", proxy)
            except Exception:
                print("Proxy invalid:", proxy)
        # print(normal_proxies)
        write_proxy(normal_proxies)


    def get_html(url):
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        }
        response = requests.get(url, headers=header)
        # print(response.text)
        get_proxy(response.text)


    if __name__ == "__main__":
        base_url = "http://www.xicidaili.com/nn/%s/"
        # Scrape 3 pages of listings
        for i in range(1, 4):
            url = base_url % i
            get_html(url)
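
To actually use the harvested proxies later, something like the following minimal sketch would work. It assumes the ip_proxy.txt file produced by write_proxy() above; http://httpbin.org/ip is only an illustrative test URL (it echoes the caller's IP) and is not part of the original script:

    import random

    import requests


    # Load the validated proxies written out by write_proxy()
    def load_proxies(path="ip_proxy.txt"):
        with open(path) as f:
            return [line.strip() for line in f if line.strip()]


    if __name__ == "__main__":
        proxies = load_proxies()
        # Route a request through a randomly chosen proxy; httpbin echoes
        # the caller's IP, so the response should show the proxy's address.
        proxy = random.choice(proxies)
        response = requests.get("http://httpbin.org/ip",
                                proxies={"http": "http://" + proxy},
                                timeout=5)
        print(response.text)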
• Original post: https://www.cnblogs.com/daihao9527/p/9508246.html