• scrapy--ipproxy


    不要急于求成,你只要做的是比昨天的你更优秀一点
                           --匿名

    今天给大家讲一下--IpProxy,代理IP数据是从"http://www.xicidaili.com/nn"爬取的,以下是我转载的博客

    https://www.jianshu.com/p/8975a3997ab6

    需要解决的问题

    1.ip,端口和协议都是在静态页面中爬取
    2.验证代理ip是否可用

    这里就给大家看看爬取的代码怎么写,其他的配置可以看我之前的博客,具体代码可以进我的GitHub。QAQ!!

    # -*- coding: utf-8 -*-
    import scrapy
    from Iproxy.items import IproxyItem
    import pdb
    from Iproxy.settings import USER_AGENT
    import re
    from scrapy.linkextractors import LinkExtractor
    import telnetlib
    
    class IproxySpider(scrapy.Spider):
        """Crawl http://www.xicidaili.com/nn and yield verified proxies.

        Each <tr class="odd"> row of the listing page carries an IP (td[2]),
        a port (td[3]) and a protocol (td[6]).  A proxy is emitted as an
        IproxyItem only after a quick Telnet connectivity probe succeeds.
        Pagination links under div.pagination are followed recursively.
        """
        name = 'iproxy'
        allowed_domains = ['www.xicidaili.com']
        start_urls = ['http://www.xicidaili.com/nn']

        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Content-Length': '11',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.xicidaili.com',
            'Origin': 'www.xicidaili.com',
            'Referer': 'http://www.xicidaili.com/',
            'User-Agent': USER_AGENT,
            'X-Requested-With': 'XMLHttpRequest',
        }

        # Extracts the text between the '>' and '<' of a <td>...</td> cell.
        _TD_TEXT = re.compile(r'>(.*?)<')

        def telnet(self, item):
            """Return True if item['origin_ip']:item['port'] accepts a TCP connection.

            :param item: mapping with 'origin_ip' and 'port' keys.
            :returns: True when the proxy is reachable, False otherwise.
            """
            try:
                conn = telnetlib.Telnet(item['origin_ip'], port=item['port'],
                                        timeout=10.0)
            # Narrowed from a bare ``except:`` — only connection-level errors
            # (socket.error / timeout are OSError subclasses) mean "dead proxy".
            except OSError:
                print('connect failure')
                return False
            else:
                # Fix: the original leaked the probe socket; close it explicitly.
                conn.close()
                print('connect success')
                return True

        def parse(self, response):
            """Parse one listing page, yield working proxies, follow pagination.

            Fixes over the original: one fresh IproxyItem per proxy (the
            original re-yielded a single shared item for every row, including
            rows whose telnet probe failed), no byte-encoding of the cell HTML
            (the regex works on text), and no shadowing of builtin ``type``.
            """
            for sel in response.xpath('//tr[@class="odd"]'):
                ip_cell    = sel.xpath('./td[2]').extract()[0]
                port_cell  = sel.xpath('./td[3]').extract()[0]
                type_cell  = sel.xpath('./td[6]').extract()[0]
                proxy_type = self._TD_TEXT.findall(type_cell)[0]

                # Only plain HTTP/HTTPS proxies are of interest.
                if proxy_type not in ('HTTP', 'HTTPS'):
                    continue

                candidate = {
                    'origin_ip': self._TD_TEXT.findall(ip_cell)[0],
                    'port':      self._TD_TEXT.findall(port_cell)[0],
                }
                # Skip (instead of yielding a stale item) when unreachable.
                if not self.telnet(candidate):
                    continue

                scheme = 'http://' if proxy_type == 'HTTP' else 'https://'
                item = IproxyItem()
                item['ip_name'] = scheme + candidate['origin_ip']
                item['port']    = candidate['port']
                print(item)
                yield item

            # Follow the pager links so every listing page gets parsed.
            links = LinkExtractor(restrict_css='div.pagination')
            for link in links.extract_links(response):
                yield scrapy.Request(link.url, callback=self.parse)
  • 相关阅读:
    iOS
    iOS
    iOS
    iOS
    iOS
    iOS
    iOS
    iOS
    iOS
    iOS
  • 原文地址:https://www.cnblogs.com/eilinge/p/9830079.html
Copyright © 2020-2023  润新知