• python爬虫学习:从数据库读取目标爬虫站点及爬虫规则,批量爬取目标站点指定数据(scrapy框架)


    1. 数据库databaseConfig.py
    from urllib.parse import quote_plus
    from pymongo import MongoClient
    import settings
    
    
    class DB:
        """MongoDB connection helper.

        Reads connection parameters from the scrapy ``settings`` module and
        exposes:
          * ``self.collection`` -- the selected database handle (note: despite
            the name, this is a *database*, not a collection).
          * ``self.spider_result_sheet_name`` -- name of the collection that
            stores scraped items.
          * ``self.spider_station_sheet_name`` -- name of the collection that
            stores the target-site definitions / crawl rules.
          * ``self.client`` -- the underlying MongoClient, kept so callers can
            close() it.
        """

        def __init__(self):
            # Pull connection parameters from the settings module.
            host = settings.MONGODB_HOST
            port = settings.MONGODB_PORT
            dbname = settings.MONGODB_DBNAME
            user_name = settings.MONGODB_USERNAME
            password = settings.MONGODB_PASSWORD

            # Collection that stores the scraped data.
            self.spider_result_sheet_name = settings.MONGODB_SAVE_SPIDER_RESULT_SHEET_NAME
            # Collection that stores the target-site info (urls + xpath rules).
            self.spider_station_sheet_name = settings.MONGODB_SPIDER_STATION_SHEET_NAME

            # Build the MongoDB URI. quote_plus() escapes special characters in
            # the credentials; str() guards against MONGODB_PORT (or host) being
            # configured as an int -- quote_plus() only accepts str/bytes and
            # would raise TypeError otherwise.
            uri = "mongodb://%s:%s@%s:%s" % (quote_plus(user_name), quote_plus(password),
                                             quote_plus(str(host)), quote_plus(str(port)))
            self.client = MongoClient(uri)
            # Select the configured database.
            self.collection = self.client[dbname]
    
    1. 修改 scrapy 框架的 pipelines.py 文件,添加爬虫数据保存到数据库的方法
    # -*- coding: utf-8 -*-
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    # useful for handling different item types with a single interface
    import codecs
    import json
    import os
    from MySpider.databaseConfig import DB
    
    
    class MyScrapyPipeline:
        """Default no-op pipeline generated by Scrapy's project template."""

        def process_item(self, item, spider):
            # Nothing to do here -- hand the item straight to the next pipeline.
            return item
    
    # # 以json文件保存
    # class JsonPipeline(object):
    #     def process_item(self, item, spider):
    #         # base_dir = os.getcwd()
    #         # filename = base_dir + '/spiderData.json'
    #         filename = 'D:/development/datas' + '/spiderData.json'
    #         # 打开json文件,向里面以dumps的方式吸入数据
    #         # 注意需要有一个参数ensure_ascii=False ,不然数据会直接为utf编码的方式存入比如
    #         # :“/xe15”
    #         with codecs.open(filename, 'a', encoding='utf-8') as f:
    #             line = json.dumps(dict(item), ensure_ascii=False) + '\n'
    #             f.write(line)
    #         return item
    
    
    # Persist scraped items to MongoDB.
    class SpiderMongoPipeline(object):
        """Pipeline that writes every scraped item into the configured
        MongoDB result collection."""

        def __init__(self):
            # Open ONE database connection for the whole run. The original
            # code instantiated DB() (and thus a new MongoClient) inside
            # process_item, i.e. once per scraped item.
            self.db = DB()

        def process_item(self, item, spider):
            data = dict(item)
            # insert_one() replaces Collection.insert(), which was deprecated
            # in PyMongo 3.x and removed entirely in PyMongo 4.
            self.db.collection[self.db.spider_result_sheet_name].insert_one(data)
            return item
    
    1. 编辑items.py 对应数据库字段
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    from scrapy.item import Item, Field
    
    
    class MyDataItem(Item):
        """Item schema; one field per column of the MongoDB result collection."""

        title = Field()         # article title
        author = Field()        # author, or the station name as a fallback
        release_time = Field()  # publication time scraped from the page
        url = Field()           # detail-page url the item was scraped from
        create_time = Field()   # timestamp of when the item was scraped
    
    1. 核心爬虫方法mySpider.py
    # coding=utf-8
    
    import time
    import scrapy
    from scrapy.selector import Selector
    from mySpider.databaseConfig import DB
    from mySpider.items import MyDataItem
    
    
    class MySpider(scrapy.Spider):
        """Generic spider: loads target sites and their xpath rules from
        MongoDB, then crawls each site's article list and detail pages."""

        name = 'mySpider'  # unique spider id; used to launch the crawl, must not be duplicated

        # Override Scrapy's start_requests to seed requests from the database.
        def start_requests(self):
            # One DB handle for the whole loop -- the original instantiated
            # DB() twice here, opening two separate MongoClient connections.
            db = DB()
            collection = db.collection[db.spider_station_sheet_name]
            # Each document describes one target site plus its xpath rules.
            for item in collection.find():
                station_url = item["station_url"]  # entry url of the target site
                # Forward the whole rule document as the request meta.
                yield scrapy.Request(url=station_url, meta=item, callback=self.parse_station)

        # Parse a site's article-list page and queue detail-page requests.
        def parse_station(self, response):
            meta = response.meta  # the rule document passed in via meta
            articles = Selector(response).xpath(meta["table_xpath"])  # article list rows
            for article in articles:
                article_detail_url = meta["station_root_url"] + article.xpath(meta["article_detail_xpath"]).extract()[0]
                # dont_filter=True disables the duplicate filter; without it
                # parse_detail would only ever be executed once.
                yield scrapy.Request(url=article_detail_url, meta=meta, callback=self.parse_detail, dont_filter=True)

        # Scrape one article detail page into a MyDataItem.
        def parse_detail(self, response):
            items = MyDataItem()
            current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

            meta = response.meta
            selector = Selector(response)
            items['title'] = selector.xpath(meta["title_xpath"]).extract()[0]
            # Fall back to the station name when no author xpath is configured.
            items['author'] = meta["station_name"] if meta["author_xpath"] == "" else selector.xpath(meta["author_xpath"]).extract()[0]
            items['release_time'] = selector.xpath(meta["release_time_xpath"]).extract()[0]
            items['url'] = response.url
            items['create_time'] = current_time
            yield items  # hand the item to the pipelines (pipelines.py)
    
    

    dont_filter=True 表示不过滤,不然会导致parse_detail只执行一次,这是一个坑点,前期由于查询资料的方向和关键字不对,导致卡壳很久。最后搜到《scrapy - Request 中的回调函数不执行或者只执行一次》这篇文章才得以解决

    1. settings.py 修改(以下为settings.py的部分配置内容)
    BOT_NAME = 'mySpider'
    
    # Project path of the spider modules, starting from the project root.
    # NOTE(review): the 'myScrapy' package name below differs in spelling/case
    # from the 'mySpider'/'MySpider' imports used in the other snippets --
    # confirm it matches the real scrapy project package name.
    SPIDER_MODULES = ['myScrapy.spiders'] # path of the package holding the spider modules
    NEWSPIDER_MODULE = 'myScrapy.spiders'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = True
    # LOG_LEVEL = 'ERROR'
    
    # Override the default request headers:
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
    }
    
    # Enabled item pipelines; the number is the execution priority (lower runs first).
    ITEM_PIPELINES = {
        'myScrapy.pipelines.SpiderMongoPipeline': 200 
    }
    
    # The MongoDB connection settings are omitted here.
    
    1. 启动类main.py
    # Option 1: run a single spider module; items are saved according to
    # the ITEM_PIPELINES configuration (pipelines.py).
    from scrapy import cmdline
    cmdline.execute("scrapy crawl recruit".split())
    
    # Option 2: run a single spider module, saving output to a file
    # (created in the current project root).
    # cmdline.execute("scrapy crawl recruit -o rsj.json".split())
    
    
    # Option 3: run an explicit batch of spider modules.
    # Batch variant 1:
    # cmdline.execute("scrapy crawlProcess rsj cqgsdx".split())
    # Batch variant 2:
    # cmdline.execute(['scrapy', 'crawl', 'recruit'])
    
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    
    # Option 4: run all spiders in-process, with an exclusion list.
    # process = CrawlerProcess(get_project_settings())
    # didntWorkSpider = ['rsj', 'cqgsdx'] # spider modules that should NOT run
    
    # process_spider_list = process.spiders.list() # every spider module under the spiders path
    # for the_spider_name in process_spider_list:
    #     if the_spider_name in didntWorkSpider:
    #         continue
    #     print("Running spider %s" % (the_spider_name))
    #     process.crawl(the_spider_name)
    
    # process.start()
    
  • 相关阅读:
    xpath定向爬取
    正则表达式的零散知识
    正则表达式中的零宽断言
    Cookies
    一行代码从PDF提取Excel文件
    学习kafka的内容总结
    深度学习模型部署
    舆情情感分析
    关键词提取的几种常用方法总结以及代码实现
    语义预训练模型ERNIE
  • 原文地址:https://www.cnblogs.com/httpc/p/14265649.html
Copyright © 2020-2023  润新知