custom_setting

    I. Definition

    custom_settings is a class attribute on a Scrapy Spider that overrides the
    project-wide values from settings.py for that one spider. It lets different
    spiders in the same project run with different configurations (middlewares,
    download delays, pipelines, and so on). It must be defined as a class
    attribute, because Scrapy reads it before the spider is instantiated.
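
    A minimal sketch of the idea (the spider below is illustrative and not part
    of the original post; quotes.toscrape.com is Scrapy's usual demo site):

    import scrapy

    class QuotesSpider(scrapy.Spider):
        name = 'quotes'
        start_urls = ['http://quotes.toscrape.com/']

        # these values override settings.py for this spider only
        custom_settings = {
            'DOWNLOAD_DELAY': 2,
            'CONCURRENT_REQUESTS': 4,
        }

        def parse(self, response):
            for text in response.css('div.quote span.text::text').getall():
                yield {'text': text}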

    II. Configuration

      1. middlewares

    # ----- SeleniumMiddleware: not enabled globally, only via custom_settings -----
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.keys import Keys
    from scrapy.http import HtmlResponse
    import time

    class SeleniumMiddleware(object):
        # Scrapy passes the spider into the middleware, so the Chrome browser and
        # WebDriverWait created in the spider's __init__ are available here
        def process_request(self, request, spider):
            """
            Fetch the page with Chrome.
            :param request: the Request object
            :param spider: the Spider object
            :return: an HtmlResponse, or None to fall through to the default downloader
            """
            print("chrome is getting page")
            # the 'usedSelenium' flag in request.meta decides whether Selenium is used
            usedSelenium = request.meta.get('usedSelenium', False)
            if not usedSelenium:
                return None
            try:
                spider.browser.get(request.url)
                # wait for the search box to appear
                search_input = spider.wait.until(
                    EC.presence_of_element_located((By.XPATH, "//div[@class='nav-search-field ']/input"))
                )
                time.sleep(2)
                search_input.clear()
                search_input.send_keys("iphone 7s")
                # press Enter to run the search
                search_input.send_keys(Keys.RETURN)
                # wait for the search results to appear
                searchRes = spider.wait.until(
                    EC.presence_of_all_elements_located((By.XPATH, "//div[@id='resultsCol']"))
                )
            except Exception as err:
                print(f"chrome getting page error, Exception = {err}")
                return HtmlResponse(url=request.url, status=500, request=request)
            time.sleep(3)
            # page fetched successfully: returning an HtmlResponse from process_request
            # short-circuits the downloader and hands the rendered page to the spider
            return HtmlResponse(url=request.url,
                                body=spider.browser.page_source,
                                request=request,
                                # page encoding
                                encoding='UTF-8',
                                status=200)

      2. settings: create custom_settings.py in the same directory as settings.py

    # -*- coding: utf-8 -*-
    custom_settings_for_spider1 = {
        'LOG_LEVEL': 'INFO',
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,  # enabled by default
        'DOWNLOADER_MIDDLEWARES': {
            # proxy middleware
            'video_spider.middlewares.ProxiesMiddleware': 400,
            # the SeleniumMiddleware defined above
            'video_spider.middlewares.SeleniumMiddleware': 543,
            # disable Scrapy's default user-agent middleware
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        },
    }
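
    The DOWNLOADER_MIDDLEWARES dict above also enables a ProxiesMiddleware that
    the original post never shows. A hypothetical minimal version (the class body
    and proxy address below are placeholders, not the author's code) could look
    like:

    # middlewares.py: hypothetical ProxiesMiddleware; the proxy URL is a placeholder
    class ProxiesMiddleware(object):
        def process_request(self, request, spider):
            # setting request.meta['proxy'] is Scrapy's standard way to use a proxy
            request.meta['proxy'] = 'http://127.0.0.1:8888'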
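
    The spider in the next step also reads several Selenium-related keys from the
    project's settings.py via get_project_settings(). Those entries are not shown
    in the original post; the names below match what the spider reads and the
    values are only examples:

    # settings.py (key names taken from the spider code; values are illustrative)
    SELENIUM_TIMEOUT = 30   # page-load timeout, in seconds
    LOAD_IMAGE = True       # whether Chrome should load images
    WINDOW_HEIGHT = 900     # browser window height, in pixels
    WINDOW_WIDTH = 900      # browser window width, in pixels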

      3. Import the custom settings in the spider file

    import scrapy
    from scrapy import Request
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    # scrapy signal-related imports
    from scrapy.utils.project import get_project_settings
    from scrapy import signals
    from pydispatch import dispatcher
    # settings
    from ..custom_settings import *
    from ..items import ShanbaySpiderItem

    class ShanbaySpider(scrapy.Spider):
        name = 'shanbay'
        allowed_domains = ['shanbay.com']
        # start_urls = ['http://shanbay.com/']
        custom_settings = custom_settings_for_spider1

        # initialize Chrome inside the spider so the browser becomes a spider attribute
        def __init__(self, timeout=30, isLoadImage=True, windowHeight=None, windowWidth=None):
            # read the configuration values from settings.py
            print("starting the browser")
            self.mySetting = get_project_settings()
            self.timeout = self.mySetting['SELENIUM_TIMEOUT']
            self.isLoadImage = self.mySetting['LOAD_IMAGE']
            self.windowHeight = self.mySetting['WINDOW_HEIGHT']
            self.windowWidth = self.mySetting['WINDOW_WIDTH']
            # initialize the Chrome browser
            self.browser = webdriver.Chrome()
            if self.windowHeight and self.windowWidth:
                self.browser.set_window_size(self.windowWidth, self.windowHeight)
            self.browser.set_page_load_timeout(self.timeout)
            self.wait = WebDriverWait(self.browser, 25)
            # initialize the parent class so different spider files can run their own way
            super(ShanbaySpider, self).__init__()
            # connect a signal handler: when spider_closed fires, CloseHandle quits Chrome
            dispatcher.connect(receiver=self.CloseHandle,
                               signal=signals.spider_closed)

        # signal handler: quit the Chrome browser
        def CloseHandle(self, spider):
            print("CloseHandle: enter")
            self.browser.quit()

        # ------------ spider execution starts here --------------
        # --- requests ---
        def start_requests(self):
            for i in range(29):
                page = 540709 + i * 3
                url_base = 'https://www.shanbay.com/wordlist/187711/' + str(page) + '/?page={}'
                for x in range(10):
                    url = url_base.format(x + 1)
                    yield Request(
                        url,
                        meta={'usedSelenium': True, 'dont_redirect': True},
                        callback=self.parse,
                        errback=self.error,
                    )

        def error(self, failure):
            # an errback receives a twisted Failure object, not a Response
            pass

        def parse(self, response):
            html_contents = response.xpath('/html/body/div[3]/div/div[1]/div[2]/div/table/tbody/tr//*/text()')
            item = ShanbaySpiderItem()
            for result in html_contents:
                item['Chinese'] = result.extract()
                print(item['Chinese'])
                yield item
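
    With this in place, custom_settings_for_spider1 applies only to this spider;
    other spiders in the project keep the values from settings.py. Run it from
    the project root using the spider's name attribute:

    scrapy crawl shanbay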
  • Original article: https://www.cnblogs.com/guozepingboke/p/10794202.html