• 代理池-豆瓣电影


    代理池

    实现了一个简单的代理池:抓取免费代理 IP,筛选出可用的 IP,再通过它完成爬取工作。

    import requests
    import re
    from lxml import etree
    
    
    # Browser-like request headers sent with every request made by this script.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/78.0.3904.108 Safari/537.36"
        ),
    }

    # Page that lists the free proxies to try.
    url = "https://www.xicidaili.com/nn/"
    
    def get(proxies, timeout=10):
        """Scrape the Douban Top 250 listing through the given proxy.

        Requests the ten listing pages (``start`` = 0, 25, ..., 225), pulls
        four XPath selections out of each page and appends them to the
        module-level lists ``title``, ``movie_url``, ``fen`` and ``ping``.
        NOTE(review): ``fen`` / ``ping`` are presumably the rating and the
        number of ratings -- confirm against the live page markup.

        The global lists are only updated after every page has been fetched
        and parsed, so a failure part-way through leaves them untouched and a
        retry with another proxy cannot produce duplicated rows.

        Args:
            proxies: Proxy mapping handed straight to ``requests.get``.
            timeout: Seconds to wait on each page request before giving up.

        Raises:
            requests.RequestException: if a page request fails, times out or
                comes back with an HTTP error status.
        """
        url2 = 'https://movie.douban.com/top250'

        # Collect locally first; commit to the shared lists only on success.
        page_titles, page_links, page_scores, page_votes = [], [], [], []

        for start in range(0, 250, 25):
            response = requests.get(
                url=url2,
                headers=headers,
                proxies=proxies,
                params={'start': start},
                timeout=timeout,  # without this a dead proxy can hang forever
            )
            # Do not parse an error page as if it were an empty result page.
            response.raise_for_status()

            tree = etree.HTML(response.text)
            page_titles.extend(tree.xpath("//div[@class='item']//a/span[1]/text()"))
            page_links.extend(tree.xpath("//div[@class='item']//a/@href"))
            page_scores.extend(tree.xpath("//div[@class='star']//span[2]/text()"))
            page_votes.extend(tree.xpath("//div[@class='star']//span[4]/text()"))

        title.extend(page_titles)
        movie_url.extend(page_links)
        fen.extend(page_scores)
        ping.extend(page_votes)
    
    
    # Build the proxy pool: collect candidate proxies and scrape through the first one that works
    def ip_run():
        """Find a working free proxy and run the scrape through it.

        Downloads the proxy listing at the module-level ``url``, extracts
        candidate IP addresses and ports from its table cells, and probes each
        candidate in turn with a short request. The first candidate for which
        both the probe and ``get(proxies)`` succeed ends the search; any
        failure moves on to the next candidate.

        Returns:
            None. Progress is reported with ``print``.

        Raises:
            requests.RequestException: if the proxy listing itself cannot be
                downloaded.
        """
        # Bound the listing download as well, so an unresponsive listing site
        # cannot hang the script.
        ip_response = requests.get(url=url, headers=headers, timeout=10).text

        # The patterns need ``\d`` for digits and ``\.`` for the literal dots;
        # without the backslashes they only match the letter "d" and never
        # find a real address or port.
        ips = re.findall(r"<td>(\d+\.\d+\.\d+\.\d+)</td>", ip_response)
        ports = re.findall(r"<td>(\d+)</td>", ip_response)

        # NOTE(review): addresses and ports are paired purely by position;
        # this assumes the page has exactly one digits-only cell per address.
        for host, port in zip(ips, ports):
            proxy_url = "http://" + host + ":" + port
            proxies = {
                "http": proxy_url,
                "https": proxy_url,
            }
            try:
                # Cheap liveness probe before committing to the full scrape.
                requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
                print("ip能使用")
                get(proxies)
            except Exception:
                # Deliberately broad: free proxies fail in many ways (refused
                # connections, timeouts, junk responses) and any failure just
                # means "try the next candidate".
                print("ip不能使用")
            else:
                break
    
    
    if __name__ == '__main__':
        # Module-level accumulators that get() appends its results to.
        title, movie_url, fen, ping = [], [], [], []

        ip_run()

        # Print one tuple per position across the four result lists.
        for record in zip(title, movie_url, fen, ping):
            print(record)
    
    
  • 相关阅读:
    java调用打印机方式二
    java调用系统打印机
    Centos7开放端口(永久)
    java毫秒级别定时器
    java计算接口调用时间
    java实现当前时间往前推N小时
    java注解日志记录到数据库
    Java后端HttpClient Post提交文件流 及服务端接收文件流
    springboot整合websocket
    注解@Slf4j使用
  • 原文地址:https://www.cnblogs.com/zx125/p/12121829.html
Copyright © 2020-2023  润新知