• 封装IP池和用户代理相应的类(python3)


    一、middlewares.py源代码:

     1 # -*- coding: utf-8 -*-
     2 # Standard-library random module; random.choice is used below to pick a pool entry
     3 import random
     4 # Base class for the proxy middleware. NOTE(review): scrapy.contrib.* is the legacy (pre-1.0) path; newer Scrapy exposes this module as scrapy.downloadermiddlewares.httpproxy - confirm against the installed version
     5 from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware
     6 # Base class for the User-Agent middleware. NOTE(review): same legacy path; settings.py in this article already refers to scrapy.downloadermiddlewares.useragent
     7 from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
     8 
     9 # Downloader middleware that assigns a randomly chosen proxy from IPPOOL to every request
    10 class HTTPPROXY(HttpProxyMiddleware):
    11     # Constructor; the author stresses the parameter must be ip=''. NOTE(review): HttpProxyMiddleware.__init__ is not called here
    12     def __init__(self, ip=''):
    13         self.ip = ip
    14 
    15     def process_request(self, request, spider):
    16         item = random.choice(IPPOOL)  # pick one pool entry at random for this request
    17         print("当前的IP是:"+item["ipaddr"])
    18         request.meta["proxy"] = "http://"+item["ipaddr"]  # store the proxy URL in request.meta["proxy"]
    19 
    20 
    21 # Proxy pool: each entry is a dict whose "ipaddr" value is a "host:port" string
    22 IPPOOL = [
    23     {"ipaddr": "182.117.102.10:8118"},
    24     {"ipaddr": "121.31.102.215:8123"},
    25     {"ipaddr": "1222.94.128.49:8118"}  # NOTE(review): first octet 1222 is outside 0-255, so this is not a valid IPv4 address - likely a typo; confirm the real address
    26 ]
    27 
    28 
    29 # Downloader middleware that sets a randomly chosen User-Agent from UPPOOL on every request
    30 class USERAGENT(UserAgentMiddleware):
    31     # Constructor; the author stresses the parameter must be user_agent=''
    32     def __init__(self, user_agent=''):
    33         self.user_agent = user_agent
    34 
    35     def process_request(self, request, spider):
    36         item = random.choice(UPPOOL)  # pick one User-Agent string at random
    37         try:
    38             print("当前的User-Agent是:"+item)
    39             request.headers.setdefault('User-Agent', item)  # setdefault: only takes effect if no User-Agent header is present yet
    40         except Exception as e:
    41             print(e)  # best-effort: any failure is printed and then swallowed
    42             pass
    43 
    44 
    45 # User-Agent pool: candidate User-Agent header strings chosen from at random
    46 UPPOOL = [
    47     "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
    48 ]

    二、settings.py文件设置,添加下面代码:

     1 #==============================================
     2 
     3 # Disable cookies
     4 COOKIES_ENABLED = False
     5 
     6 # Downloader middleware registration ("工程名" in the keys is a placeholder for your project package name)
     7 DOWNLOADER_MIDDLEWARES = {
     8     # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware':123,
     9     # '工程名.middlewares.HTTPPROXY' : 125,
    10     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,  # NOTE(review): Scrapy disables a built-in middleware by mapping it to None; here it stays enabled at order 2 - confirm this is intended
    11     '工程名.middlewares.USERAGENT': 1
    12 }
    13 
    14 # Item pipeline registration (replace the placeholder key with your own pipeline class path)
    15 ITEM_PIPELINES = {
    16     '工程名.pipelines.管道中对应的类名': 300,
    17 }
    18 
    19 # Make sure robots.txt obedience is turned off (the settings file already enables it earlier, so find that line and set it to False)
    20 ROBOTSTXT_OBEY = False
    21 
    22 #==============================================

    三、总结一些需要的东西

    1.在引入(与IP池和用户代理有关的)模块时,注意下面的导入路径

    1 # 导入有关IP池有关的模块
    2 from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware
    3 # 导入有关用户代理有关的模块
    4 from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware

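    在旧版 Scrapy(1.0 之前)中,路径里的"contrib"不能丢了;从 Scrapy 1.0 起,这两个模块已迁移到 scrapy.downloadermiddlewares.httpproxy 和 scrapy.downloadermiddlewares.useragent(也就是上文 settings.py 中使用的路径),新版本应改用新路径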

    2.配置下载中间件的连接信息时,注意文件指向,也就是调用类的指向

    3.“.”表示这一级目录,".."表示上一级目录

  • 相关阅读:
    【Linux 编程】进程间通信
    毕设进行时——4.3寸在富士通ARM中实现
    spcomm使用:在编译运行时为什么总出现"unable to open include file 'spcomm.hpp'"?
    Xilinx LVDS
    Xilinx selectIO
    xilinx 原理图输入
    http消息头(转)
    用java语言将数据库中的数据表转换为xml文件的通用程序(转)
    数据字典实例
    Web Service工作原理初探
  • 原文地址:https://www.cnblogs.com/xiaomingzaixian/p/7122783.html
Copyright © 2020-2023  润新知