    Using selenium + webdriver in Scrapy to fetch the rendered page source and scrape jianshu.com

    Some of the data on jianshu.com is rendered by JavaScript, so the HTML returned by a normal request does not contain it.
    That is why selenium + webdriver is used here to fetch the fully rendered page source.
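
    A quick way to confirm this for any page (a sketch, not part of the project below; the URL and User-Agent header are just examples) is to compare the raw HTML returned by a plain request with the DOM that Chrome actually renders:

    import requests
    from selenium import webdriver

    url = "https://www.jianshu.com/"
    # raw HTML exactly as the server returns it, before any JavaScript has run
    raw_html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text

    # the same page after Chrome has executed its JavaScript
    driver = webdriver.Chrome()
    driver.get(url)
    rendered_html = driver.page_source
    driver.quit()

    # data that appears only in rendered_html is filled in by JavaScript
    print(len(raw_html), len(rendered_html))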

    1. Define the data to scrape

    import scrapy
    class JianshuItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        author = scrapy.Field()
        author_img = scrapy.Field()
        time = scrapy.Field()
        read_count = scrapy.Field()
        subjects = scrapy.Field()
    

    2. Use selenium + webdriver in a downloader middleware

    from scrapy import signals
    from scrapy.http.response.html import HtmlResponse
    from selenium import webdriver
    # WebDriverWait for explicit waits
    from selenium.webdriver.support.ui import WebDriverWait
    
    
    class SeleniumDownloaderMiddleware:
        def __init__(self):
            # Load the Chrome driver. If chromedriver.exe is in the same directory as python.exe,
            # executable_path can be omitted, i.e. self.driver = webdriver.Chrome() is enough
            self.driver = webdriver.Chrome(executable_path=r"D:\python\chromedriver.exe")
    
        def process_request(self, request, spider):
            print("-"*40)
            print(id(self))
            print("-"*40)
    
            self.driver.get(request.url)
            try:
                # keep clicking the "Show more" button until it can no longer be found
                while True:
                    WebDriverWait(self.driver, 3).until(lambda x: x.find_element_by_class_name("H7E3vT"))
                    # locate the "Show more" button
                    # show_more = self.driver.find_element_by_xpath("//div[@class='H7E3vT']")
                    show_more = self.driver.find_element_by_class_name("H7E3vT")
                    show_more.click()
            except Exception:
                # the wait times out once the button no longer appears
                print('"Show more" button not found')
            # grab the fully rendered page source
            html = self.driver.page_source
            # use url=self.driver.current_url rather than url=request.url, because a redirect may have changed the URL
            response = HtmlResponse(url=self.driver.current_url, body=html, request=request, encoding="utf-8")
            # returning a Response here sends it straight back to the Scrapy engine; the request never reaches the downloader
            return response
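
    The Chrome instance opened in __init__ is never shut down when the crawl ends. One way to handle that (a sketch, reusing the signals import already at the top of this file) is to add two methods to SeleniumDownloaderMiddleware that hook Scrapy's spider_closed signal:

        @classmethod
        def from_crawler(cls, crawler):
            # Scrapy builds the middleware through this hook; register the signal handler here
            middleware = cls()
            crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
            return middleware

        def spider_closed(self, spider):
            # quit the browser once the crawl has finished
            self.driver.quit()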
    

    3. Write the spider that parses the data

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    from scrapylearn.jianshu.jianshu.items import JianshuItem
    
    
    class JianshuspiderSpider(CrawlSpider):
        name = 'jianshuspider'
        allowed_domains = ['jianshu.com']
        start_urls = ['http://jianshu.com/']
    
        rules = (
            Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
        )
    
        def parse_detail(self, response):
            title = response.xpath("//h1[@class='_1RuRku']/text()").get()
            author = response.xpath("//span[@class='FxYr8x']/a/text()").get()
            author_img = response.xpath("//img[@class='_13D2Eh']/@src").get()
            time = response.xpath("//div[@class='s-dsoj']/time/text()").get()
            read_count = response.xpath("//div[@class='s-dsoj']/span[2]/text()").get().split()[1].replace(",", "")
            subjects = ",".join(response.xpath("//div[@class='_2Nttfz']/a/span/text()").getall())
            yield JianshuItem(title=title, author=author, author_img=author_img, time=time, read_count=read_count,
                        subjects=subjects)
    
        # unused callback stub generated by the CrawlSpider template; the Rule above uses parse_detail
        def parse_item(self, response):
            item = {}
            # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
            # item['name'] = response.xpath('//div[@id="name"]').get()
            # item['description'] = response.xpath('//div[@id="description"]').get()
            return item
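
    The class names used in the XPaths above (_1RuRku, FxYr8x, s-dsoj, ...) look machine-generated and may change when the site is redeployed, so they should be re-checked against the live pages before running. The Rule's URL pattern itself can be sanity-checked in isolation (a sketch; both slugs below are made-up examples):

    import re

    # same pattern as in the Rule above: article detail pages carry a 12-character slug after /p/
    detail_pattern = re.compile(r'.*/p/[0-9a-z]{12}.*')
    print(bool(detail_pattern.match("https://www.jianshu.com/p/0123456789ab")))  # True  - detail page
    print(bool(detail_pattern.match("https://www.jianshu.com/u/0123456789ab")))  # False - not a /p/ URL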
    

    4. Save the data to MySQL

    import pymysql
    
    
    class JianshuPipeline:
        def __init__(self):
            self.conn = pymysql.connect(
                host='localhost',
                port=3307,
                user='root',
                password='1612480331',
                database='houses',
                charset='utf8'
            )
    
        def process_item(self, item, spider):
            print("=" * 40)
            print(id(self))
            print("=" * 40)
            # open the database connection (already done once in __init__)
            # conn = pymysql.connect("localhost", "root", "1612480331", "houses", 3307)
            # create a cursor
            cursor = self.conn.cursor()
            sql = "insert into jianshu values (%s,%s,%s,%s,%s,%s)"
            cursor.execute(sql, (
                item["title"], item["author"], item["author_img"], item["time"], item["read_count"], item["subjects"]))
            self.conn.commit()
            # print(values)
            # for v in values:
            #     print(v)
            cursor.close()
    
            return item
    
        # called when the spider is closed
        def close_spider(self, spider):
            self.conn.close()
            print("Spider finished")
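
    The pipeline expects a jianshu table with six columns in the same order as the INSERT above. A one-off setup script might look like this (a sketch; the column types and lengths are assumptions, only the column count and order matter for the positional INSERT):

    import pymysql

    conn = pymysql.connect(host='localhost', port=3307, user='root',
                           password='1612480331', database='houses', charset='utf8')
    with conn.cursor() as cursor:
        # columns mirror the JianshuItem fields
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS jianshu (
                title      VARCHAR(255),
                author     VARCHAR(64),
                author_img VARCHAR(512),
                time       VARCHAR(64),
                read_count VARCHAR(32),
                subjects   VARCHAR(255)
            ) DEFAULT CHARSET = utf8
        """)
    conn.commit()
    conn.close()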
    

    5. Configure settings.py

    DOWNLOADER_MIDDLEWARES = {
        # 'jianshu.middlewares.JianshuDownloaderMiddleware': 543,
        'jianshu.middlewares.SeleniumDownloaderMiddleware': 1
    }
    
    ITEM_PIPELINES = {
       'jianshu.pipelines.JianshuPipeline': 300,
    }
    
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
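
    With these settings in place, the crawl can be started from the project root with "scrapy crawl jianshuspider", or from an IDE through a small launcher script (a sketch; the file name start.py is arbitrary):

    # start.py - runs the spider without typing the command manually
    from scrapy import cmdline

    cmdline.execute("scrapy crawl jianshuspider".split())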
    
  • Original article: https://www.cnblogs.com/yloved/p/12990474.html