- 数据库databaseConfig.py
from urllib.parse import quote_plus
from pymongo import MongoClient
import settings
class DB:
    """MongoDB connection helper.

    Reads the connection parameters from the project ``settings`` module,
    builds an authenticated connection URI, and exposes the target database
    (``self.collection``) plus the two collection names the spider uses.
    """

    def __init__(self):
        # Read the database connection parameters from the settings module.
        host = settings.MONGODB_HOST
        port = settings.MONGODB_PORT
        dbname = settings.MONGODB_DBNAME
        user_name = settings.MONGODB_USERNAME
        password = settings.MONGODB_PASSWORD
        # Collection that stores the crawled article data.
        self.spider_result_sheet_name = settings.MONGODB_SAVE_SPIDER_RESULT_SHEET_NAME
        # Collection that stores the target-site (station) definitions.
        self.spider_station_sheet_name = settings.MONGODB_SPIDER_STATION_SHEET_NAME
        # Build the MongoDB URI. Username/password must be percent-escaped
        # (quote_plus) per the MongoDB connection-string rules. The port is
        # commonly configured as an int, which quote_plus() rejects with a
        # TypeError — coerce it to str first.
        uri = "mongodb://%s:%s@%s:%s" % (
            quote_plus(user_name),
            quote_plus(password),
            quote_plus(str(host)),
            quote_plus(str(port)),
        )
        client = MongoClient(uri)
        # Select the configured database; despite the attribute name this is
        # a Database object, not a Collection.
        self.collection = client[dbname]
- 修改 scrapy 框架的 pipelines.py 文件,添加爬虫数据保存到数据库的方法
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import codecs
import json
import os
from MySpider.databaseConfig import DB
class MyScrapyPipeline:
    """Default no-op pipeline generated by Scrapy.

    Hands every item through unchanged so later pipelines (or the feed
    exporter) can process it.
    """

    def process_item(self, item, spider):
        # Nothing to transform here — simply forward the item downstream.
        return item
# # 以json文件保存
# class JsonPipeline(object):
# def process_item(self, item, spider):
# # base_dir = os.getcwd()
# # filename = base_dir + '/spiderData.json'
# filename = 'D:/development/datas' + '/spiderData.json'
# # 打开json文件,向里面以dumps的方式吸入数据
# # 注意需要有一个参数ensure_ascii=False ,不然数据会直接为utf编码的方式存入比如
# # :“/xe15”
# with codecs.open(filename, 'a', encoding='utf-8') as f:
#             line = json.dumps(dict(item), ensure_ascii=False) + '\n'
# f.write(line)
# return item
# 保存到mongodb数据库
class SpiderMongoPipeline(object):
    """Persist every crawled item into the MongoDB result collection."""

    def process_item(self, item, spider):
        """Insert the item into the result collection and pass it on.

        Returns the item unchanged so any later pipeline still receives it.
        """
        data = dict(item)
        # NOTE(review): DB() opens a new MongoClient for every item; consider
        # creating it once in open_spider() instead — confirm item volume.
        db = DB()
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported replacement.
        db.collection[db.spider_result_sheet_name].insert_one(data)
        return item
- 编辑items.py 对应数据库字段
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
class MyDataItem(Item):
    """Scraped-article item; fields mirror the MongoDB result documents."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = Field()         # article title
    author = Field()        # article author (spider falls back to the station name)
    release_time = Field()  # publication time scraped from the detail page
    url = Field()           # URL of the article detail page
    create_time = Field()   # timestamp recorded when the item was crawled
- 核心爬虫方法mySpider.py
# coding=utf-8
import time
import scrapy
from scrapy.selector import Selector
from mySpider.databaseConfig import DB
from mySpider.items import MyDataItem
class MySpider(scrapy.Spider):
    """Generic spider driven by station definitions stored in MongoDB.

    Each document in the station collection supplies the target URL plus
    the XPath expressions needed to parse that site, so one spider can
    crawl many differently-structured sites.
    """

    # Unique spider identifier (must not clash with other spiders); used
    # when starting the crawl: `scrapy crawl mySpider`.
    name = 'mySpider'

    def start_requests(self):
        """Override Scrapy's start_requests: seed one request per station."""
        # Open the connection once — the original built two DB() instances
        # (two MongoClients) just to reach one collection.
        db = DB()
        collection = db.collection[db.spider_station_sheet_name]
        # Every document describes one site to crawl (url + XPath config).
        for item in collection.find():
            station_url = item["station_url"]  # target site url
            # The whole station document rides along as request meta so the
            # parse callbacks can read its XPath configuration.
            yield scrapy.Request(url=station_url, meta=item, callback=self.parse_station)

    def parse_station(self, response):
        """Parse a station listing page and follow each article link."""
        meta = response.meta  # station config passed in via the request
        # XPath for the article list comes from the station document.
        articles = Selector(response).xpath(meta["table_xpath"])
        for article in articles:
            links = article.xpath(meta["article_detail_xpath"]).extract()
            if not links:
                # Row without a detail link — the original `[0]` indexing
                # would have raised IndexError here.
                continue
            article_detail_url = meta["station_root_url"] + links[0]
            # dont_filter=True: without it Scrapy's dupe filter drops the
            # detail requests and parse_detail only runs once.
            yield scrapy.Request(url=article_detail_url, meta=meta,
                                 callback=self.parse_detail, dont_filter=True)

    def parse_detail(self, response):
        """Extract one article from its detail page and yield a MyDataItem."""
        items = MyDataItem()
        current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        meta = response.meta
        selector = Selector(response)
        # extract_first() returns None on a missing node instead of the
        # IndexError that the original `.extract()[0]` raised.
        items['title'] = selector.xpath(meta["title_xpath"]).extract_first()
        # Fall back to the station name when no author XPath is configured.
        if meta["author_xpath"] == "":
            items['author'] = meta["station_name"]
        else:
            items['author'] = selector.xpath(meta["author_xpath"]).extract_first()
        items['release_time'] = selector.xpath(meta["release_time_xpath"]).extract_first()
        items['url'] = response.url
        items['create_time'] = current_time
        yield items  # hand the item to the pipelines
dont_filter=True 表示不过滤,不然会导致parse_detail只执行一次,这是一个坑点,前期由于查询资料的方向和关键字不对,导致卡壳很久。最后搜到scrapy - Request 中的回调函数不执行或者只执行一次这篇文章才得以解决
- settings.py 修改(以下为settings.py的部分配置内容)
BOT_NAME = 'mySpider'

# Project path holding the spider modules (relative to the project root).
# NOTE(review): the 'myScrapy' prefix here differs from BOT_NAME 'mySpider'
# and from the import paths used elsewhere in this write-up — confirm the
# real package name before copying.
SPIDER_MODULES = ['myScrapy.spiders']
NEWSPIDER_MODULE = 'myScrapy.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# LOG_LEVEL = 'ERROR'

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Pipeline ordering: lower number = higher priority (runs earlier, 0-1000).
ITEM_PIPELINES = {
    'myScrapy.pipelines.SpiderMongoPipeline': 200
}
# The MongoDB connection settings are omitted below.
- 启动类main.py
# Option 1: run a single spider module; items are persisted according to
# the pipeline configuration in pipelines.py.
from scrapy import cmdline
cmdline.execute(['scrapy', 'crawl', 'recruit'])

# Option 2: run a single spider and dump items to a file (in the project root):
# cmdline.execute("scrapy crawl recruit -o rsj.json".split())

# Option 3: run several named spider modules in one go.
# Batch variant 1:
# cmdline.execute("scrapy crawlProcess rsj cqgsdx".split())
# Batch variant 2:
# cmdline.execute(['scrapy', 'crawl', 'recruit'])

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Option 4: run every spider found on the spiders path, skipping a blacklist.
# process = CrawlerProcess(get_project_settings())
# skipped = ['rsj', 'cqgsdx']  # spider modules that should NOT run
# for spider_name in process.spiders.list():  # all spiders on the spiders path
#     if spider_name in skipped:
#         continue
#     print("Running spider %s" % spider_name)
#     process.crawl(spider_name)
# process.start()