• Scrapy random middleware configuration (User-Agent and proxy)


    Adding a random User-Agent to Scrapy:

    1. Install the dependency:
    pip install scrapy-fake-useragent
    
    2. In settings.py, enable the custom middleware and disable Scrapy's default one:
    DOWNLOADER_MIDDLEWARES = {
        'lagoujob.middlewares.RandomUserAgent': 1,
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    }
    
    3. In middlewares.py, add:
    from fake_useragent import UserAgent
    
    class RandomUserAgent(object):
        def process_request(self, request, spider):
            # set a random User-Agent header on every outgoing request
            ua = UserAgent()
            request.headers.setdefault("User-Agent", ua.random)
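
    To check that the header really is being randomized, a throwaway spider can log what was sent (httpbin.org is used here only as a convenient echo service; it is not part of the original setup):

    import scrapy

    class UACheckSpider(scrapy.Spider):
        name = 'ua_check'
        start_urls = ['https://httpbin.org/headers']

        def parse(self, response):
            # the request object carries the User-Agent injected by the middleware
            self.logger.info(response.request.headers.get('User-Agent'))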
    

    Adding an IP proxy pool to Scrapy:

       Some free proxy-list sites (mainland China) for reference:
                        http://www.xicidaili.com/wt
                        https://www.kuaidaili.com/free/
                        http://www.youdaili.net/Daili/guonei/
                        http://ip.zdaye.com/FreeIPlist.html
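
    Free proxies die quickly, so it is worth checking them before they go into IPPOOL. A rough sketch using the requests library (the test URL, timeout, and helper name are my own choices for illustration):

    import requests

    def is_alive(ip_port, timeout=5):
        # return True if the proxy answers a simple HTTP request
        proxies = {'http': 'http://' + ip_port, 'https': 'http://' + ip_port}
        try:
            return requests.get('http://httpbin.org/ip', proxies=proxies, timeout=timeout).ok
        except requests.RequestException:
            return False

    # keep only working proxies, in the dict format IPPOOL uses below
    candidates = ['123.55.1.75:30325', '220.184.213.12:6666']
    IPPOOL = [{'ipaddr': p} for p in candidates if is_alive(p)]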
    
    

    Configuration:

    In middlewares.py, add a proxy middleware class:
    import random
    from proxy1.settings import IPPOOL


    class ProxychiMiddleware(object):
        # called for every request before it is sent downstream
        def process_request(self, request, spider):
            # for a private (authenticated) proxy, set something like:
            # request.meta['proxy'] = 'http://user:password@114.212.12.4:3128'
            # otherwise pick a random entry from the pool
            this_ip = random.choice(IPPOOL)
            # IPPOOL stores dicts, so take the "ipaddr" value
            request.meta['proxy'] = 'http://' + this_ip['ipaddr']
            return None
    
    Enable the proxy class from middlewares.py in settings.py:
    DOWNLOADER_MIDDLEWARES = {
            # the path here must match the class defined in middlewares.py
            'proxy1.middlewares.ProxychiMiddleware': 543,
    }

    # define the proxy pool (also in settings.py, so the middleware can import it)
    IPPOOL = [
            {"ipaddr": "123.55.1.75:30325"},
            {"ipaddr": "220.184.213.12:6666"},
            {"ipaddr": "171.38.85.82:8123"},
            {"ipaddr": "111.121.193.214:3128"},
            {"ipaddr": "58.48.193.180:3128"},
            {"ipaddr": "171.37.29.26:9797"},
            {"ipaddr": "119.188.162.165:8081"},
    ]
    
    

    Overriding start_requests:

    import scrapy
    import random

    # proxy pool; each entry is a full proxy URL of the form "http://ip:port"
    proxy_pool = ['http://111.155.116.215:8123']


    class ProxydemoSpider(scrapy.Spider):
        name = 'proxydemo'
        allowed_domains = ['www.baidu.com']
        start_urls = ['http://www.baidu.com/']

        def start_requests(self):
            for url in self.start_urls:
                proxy_addr = random.choice(proxy_pool)  # pick a random proxy
                # attach the proxy to the request via the meta parameter
                yield scrapy.Request(url=url, callback=self.parse, meta={'proxy': proxy_addr})

        def parse(self, response):
            print('crawled %s via proxy %s' % (response.url, response.meta.get('proxy')))

    Note that the value passed in meta['proxy'] must be a complete URL such as "http://ip:port".
    A private proxy additionally needs credentials, as sketched below.
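
    A common way to use a private (authenticated) proxy is to set the proxy in meta and attach a base64-encoded Proxy-Authorization header from a downloader middleware. A minimal sketch, with placeholder address and credentials:

    import base64

    class PrivateProxyMiddleware(object):
        def process_request(self, request, spider):
            # placeholder proxy address and credentials
            request.meta['proxy'] = 'http://114.212.12.4:3128'
            auth = base64.b64encode(b'username:password')
            request.headers['Proxy-Authorization'] = b'Basic ' + auth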
    

    Crawling multiple page levels with Scrapy:

    # -*- coding: utf-8 -*-
    import scrapy
    from Tencent.items import TencentItem


    class TencentSpider(scrapy.Spider):
        # spider name
        name = 'tencent'
        # domains the spider is allowed to crawl
        allowed_domains = ['www.xxx.com']
        # base URL used to build absolute links
        base_url = 'https://www.xxx.com/'
        # entry URL for the crawl
        start_urls = ['https://www.xxx.com/position.php']
        # page counter, starts at 1
        count = 1
        # total pages to crawl; 1 means only the first page
        page_end = 1
     
        def parse(self, response):
            # each job posting is a table row with class "odd" or "even"
            nodeList = response.xpath("//table[@class='tablelist']/tr[@class='odd'] | //table[@class='tablelist']/tr[@class='even']")
            for node in nodeList:
                item = TencentItem()

                item['title'] = node.xpath("./td[1]/a/text()").extract()[0]
                if len(node.xpath("./td[2]/text()")):
                    item['position'] = node.xpath("./td[2]/text()").extract()[0]
                else:
                    item['position'] = ''
                item['num'] = node.xpath("./td[3]/text()").extract()[0]
                item['address'] = node.xpath("./td[4]/text()").extract()[0]
                item['time'] = node.xpath("./td[5]/text()").extract()[0]
                item['url'] = self.base_url + node.xpath("./td[1]/a/@href").extract()[0]
                # follow the detail (inner) page, passing the partial item along in meta
                yield scrapy.Request(item['url'], meta={'item': item}, callback=self.detail_parse)

                # when a deeper page is crawled, do not yield the item here
                # yield item

            # pagination
            nextPage = response.xpath("//a[@id='next']/@href").extract()[0]
            # respect the page limit and stop on the last page
            if self.count < self.page_end and nextPage != 'javascript:;':
                if nextPage is not None:
                    # increment the page counter
                    self.count = self.count + 1
                    # request the next listing page
                    yield scrapy.Request(self.base_url + nextPage, callback=self.parse)
            else:
                # end of crawl
                return
            
        def detail_parse(self, response):
            # pick up the item passed down from parse()
            item = response.meta['item']
            # extract data from the first-level detail page
            item['zhize'] = response.xpath("//*[@id='position_detail']/div/table/tr[3]/td/ul[1]").xpath('string(.)').extract()[0]
            item['yaoqiu'] = response.xpath("//*[@id='position_detail']/div/table/tr[4]/td/ul[1]").xpath('string(.)').extract()[0]
            # follow a second-level page (the "&123" suffix is only a demo URL)
            yield scrapy.Request(item['url'] + "&123", meta={'item': item}, callback=self.detail_parse2)
            # when a deeper page is crawled, do not return the item here
            # return item

        def detail_parse2(self, response):
            # pick up the item passed down from detail_parse()
            item = response.meta['item']
            # extract data from the second-level page
            item['test'] = "111111111111111111"
            # finally hand the completed item back to the engine
            return item
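
    For completeness, an items.py matching the fields used above would look roughly like this (reconstructed from the spider, not taken from the original post):

    import scrapy

    class TencentItem(scrapy.Item):
        title = scrapy.Field()     # job title
        position = scrapy.Field()  # job category
        num = scrapy.Field()       # number of openings
        address = scrapy.Field()   # work location
        time = scrapy.Field()      # publish date
        url = scrapy.Field()       # detail page URL
        zhize = scrapy.Field()     # responsibilities, from the first-level detail page
        yaoqiu = scrapy.Field()    # requirements, from the first-level detail page
        test = scrapy.Field()      # placeholder filled on the second-level page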
    