• python_day06(ip代理池)


    from urllib.request import Request, ProxyHandler
    from urllib.request import build_opener
    from bs4 import BeautifulSoup
    import MySQLdb;
    import redis
    from urllib.request import urlopen
    from lxml import etree
    from lxml import etree
    import re;
    # Site root; get_next_page() prepends it to the href of the "next page" link.
    urlfront = "http://www.xicidaili.com"
    # First listing page to crawl; the main loop rebinds this to each next page.
    url = "http://www.xicidaili.com/nn/1"
    # Redis client on the local server, database 0. The script uses the list
    # 'mylist' for scraped candidates and 'usable_ip' for proxies that worked.
    result = redis.Redis(host='127.0.0.1', port=6379,db=0)
    
    # def spider_IP(url):
    # Fetch the whole page
    def get_allcode(url):
        """Download ``url`` and return its body decoded as UTF-8 text.

        The request goes through an opener configured with a fixed proxy
        mapping and a browser-like ``User-agent`` header.

        Args:
            url: Address of the page to download.

        Returns:
            The response body as ``str``.

        Raises:
            urllib.error.URLError: If the request cannot be completed.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # NOTE: the mapping only has an 'https' entry, so ProxyHandler routes
        # https:// requests through this proxy; other schemes go out directly.
        proxy = {'https': '110.73.0.45:8123'}
        opener = build_opener(ProxyHandler(proxy))
        # Send a browser User-agent instead of urllib's default one.
        opener.addheaders = [
            ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')]
        # The context manager closes the response (and its socket) even when
        # reading or decoding fails; the original left it open.
        with opener.open(url) as response:
            return response.read().decode("UTF-8")
    # Extract the IPs using lxml
    def find_ip(s):
        """Extract proxy addresses from a listing page and queue them in Redis.

        The text of every table cell in rows whose ``class`` is ``"odd"`` or
        empty is classified in this order: cells containing ``-`` are ignored,
        all-digit cells are collected as ports, and cells containing ``.`` are
        collected as IP addresses. The n-th IP is then paired with the n-th
        port and pushed as ``"ip:port"`` onto the Redis list ``mylist``.

        Args:
            s: HTML source of the listing page.
        """
        selector = etree.HTML(s)
        cells = selector.xpath(
            '//tr[@class="odd"]/td/text()|//tr[@class=""]/td/text()')
        ips = []
        ports = []
        for cell in cells:
            if '-' in cell:
                # Checked first, so any cell with a dash is dropped even if it
                # would also match one of the tests below.
                continue
            if cell.isdigit():
                ports.append(cell)
            elif '.' in cell:
                ips.append(cell)
        # Store the pairs in a Redis list. zip() stops at the shorter list, so
        # a page yielding fewer ports than IPs no longer raises IndexError.
        for ip, port in zip(ips, ports):
            result.lpush('mylist', ip + ":" + port)
    def get_next_page(s):
        """Return the absolute URL of the next listing page, if there is one.

        Looks for ``a.next_page`` inside ``div.pagination`` and prefixes the
        first matching ``href`` with ``urlfront``.

        Args:
            s: HTML source of the current listing page.

        Returns:
            The next page URL, or ``None`` when no such link is found.
        """
        tree = etree.HTML(s)
        hrefs = tree.xpath('//div[@class="pagination"]/a[@class="next_page"]/@href')
        if not hrefs:
            return None
        # Only the first match is ever considered.
        first = hrefs[0]
        return None if first is None else urlfront + first
    def get_allcode_ip(url, ip):
        """Try to fetch ``url`` through proxy ``ip`` and record it if it works.

        A proxy counts as usable when the request completes within the
        timeout and the body decodes as UTF-8. Usable proxies are pushed onto
        the Redis list ``usable_ip``. Any failure is printed and swallowed so
        that one bad proxy does not stop the caller's loop.

        Args:
            url: Page to request through the proxy.
            ip: Proxy address as ``"host:port"``, given as ``bytes`` (UTF-8)
                or ``str``.
        """
        try:
            # The original decoded unconditionally, which raised TypeError for
            # a str argument; decode only when bytes are passed in.
            if isinstance(ip, bytes):
                ip = str(ip, encoding="utf-8")
            timeout = 5
            opener = build_opener(ProxyHandler({'http': ip}))
            # Send a browser User-agent instead of urllib's default one.
            opener.addheaders = [
                ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')]
            # Bound the request time and always close the response.
            with opener.open(url, None, timeout) as response:
                # The decoded text is not needed; reading and decoding it is
                # what makes a truncated or non-UTF-8 reply count as a failure.
                response.read().decode("UTF-8")
            print('+++++++++++++++')
            # Remember the working proxy in the Redis list 'usable_ip'.
            result.lpush('usable_ip', ip)
            print(ip)
            print('+++++++++++++++')
        except Exception as err:
            # Deliberate best-effort: report the failure and move on.
            print(err)
    # Crawl the listing pages one after another: scrape candidate proxies from
    # each page into Redis, then drain that queue and test every candidate.
    while url is not None:
        print(url)
        s = get_allcode(url)
        next_url = get_next_page(s)
        print(next_url)
        # Harvest the current page before deciding whether to stop; the
        # original broke out first and therefore never processed the last page.
        find_ip(s)
        # Candidates are tested against the next listing page, as before; on
        # the last page there is none, so the current page is used instead.
        check_url = next_url if next_url is not None else url
        while True:
            ip = result.lpop('mylist')
            print(ip)
            if ip is None:
                # Queue drained; move on to the next page.
                break
            get_allcode_ip(check_url, ip)
        url = next_url
  • 相关阅读:
    AVR单片机教程——PWM调光
    AVR单片机教程——蜂鸣器
    AVR单片机教程——串口接收
    做个别出心裁的圣诞礼物
    电源选择电路的原理
    AVR单片机教程——串口发送
    C++类成员默认初始值
    AVR单片机教程——旋转编码器
    10.Django-form表单
    09.Django-数据库优化
  • 原文地址:https://www.cnblogs.com/qieyu/p/7846110.html
Copyright © 2020-2023  润新知