- Installing the Scrapy framework (on Windows)
- pip install pywin32
- Download the Twisted wheel, then install it with: pip install <path to the downloaded Twisted wheel>
- pip install scrapy
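- Optionally verify the installation from a Python shell (both imports should succeed; the printed version numbers will vary):

```python
# Quick sanity check that the installation above worked.
import scrapy
import twisted

print('Scrapy', scrapy.__version__)
print('Twisted', twisted.__version__)
```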
- Basic usage of Scrapy
- Change into the directory where the project should be created
- scrapy startproject my_first_spider creates a new project
- cd my_first_spider, then scrapy genspider first www.xxx.com generates the spider (named first, matching the code below)
- The generated directory structure is as follows
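- A sketch of the layout produced by the two commands above (the standard Scrapy project skeleton, using the project and spider names from this example):

```
my_first_spider/
├── scrapy.cfg                # project / deployment configuration
└── my_first_spider/
    ├── __init__.py
    ├── items.py              # Item definitions
    ├── middlewares.py        # spider / downloader middlewares
    ├── pipelines.py          # item pipelines (persistence)
    ├── settings.py           # project settings
    └── spiders/
        ├── __init__.py
        └── first.py          # the spider created by genspider
```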
- spiders/first.py (the spider):
```python
# -*- coding: utf-8 -*-
import scrapy

from my_first_spider.items import MyFirstSpiderItem


class FirstSpider(scrapy.Spider):
    # name of this spider (used by `scrapy crawl`)
    name = 'first'
    # domains the spider is allowed to crawl
    # allowed_domains = ['www.xxx.com']
    # the first URL to crawl
    start_urls = ['https://www.zhipin.com/c101010100/?query=python开发&page=1&ka=page-1']

    # template for the paginated URLs
    url = 'https://www.zhipin.com/c101010100/?query=python开发&page=%d&ka=page-1'
    page = 1

    # parse callback: receives the response and extracts data with XPath
    def parse(self, response):
        li_list = response.xpath('//div[@class="job-list"]/ul/li')
        for li in li_list:
            job = li.xpath('./div/div[1]/h3/a/div[1]/text()').extract_first()
            salary = li.xpath('./div/div[1]/h3/a/span/text()').extract_first()
            company = li.xpath('./div/div[2]/div/h3/a/text()').extract_first()

            item = MyFirstSpiderItem()
            item['job'] = job
            item['salary'] = salary
            item['company'] = company

            # hand the item over to the item pipelines for persistence
            yield item

        # paginated crawling: keep requesting the next page
        if self.page <= 7:
            print(f'Page {self.page} done, crawling page {self.page + 1}')
            self.page += 1
            yield scrapy.Request(url=self.url % self.page, callback=self.parse)
```
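- The XPath expressions in parse() can be sanity-checked offline with Scrapy's Selector; a sketch using a hand-written HTML fragment that mirrors the assumed job-list structure (not the real zhipin.com markup):

```python
# Offline test of the XPath used in parse(), against a hand-written
# fragment that mirrors the assumed <div class="job-list"> structure.
from scrapy import Selector

html = '''
<div class="job-list"><ul><li><div>
  <div><h3><a><div>Python developer</div><span>15k-25k</span></a></h3></div>
  <div><div><h3><a>ACME</a></h3></div></div>
</div></li></ul></div>
'''

sel = Selector(text=html)
for li in sel.xpath('//div[@class="job-list"]/ul/li'):
    print(li.xpath('./div/div[1]/h3/a/div[1]/text()').extract_first())  # job title
    print(li.xpath('./div/div[1]/h3/a/span/text()').extract_first())    # salary
    print(li.xpath('./div/div[2]/div/h3/a/text()').extract_first())     # company
```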
- items.py (the Item model):
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


# The Item wraps the scraped data in a single object so it can be passed
# from the spider to the pipelines; field access works like a dict.
class MyFirstSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    job = scrapy.Field()
    salary = scrapy.Field()
    company = scrapy.Field()
```
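- Items support dict-style field access, which is what the spider and the pipelines rely on; a short sketch (the sample values are made up):

```python
# Sketch: an Item behaves like a dict for field access and conversion.
from my_first_spider.items import MyFirstSpiderItem

item = MyFirstSpiderItem(job='Python developer', salary='15k-25k', company='ACME')
print(item['job'])   # read a single field
print(dict(item))    # plain dict, handy for json.dumps or DB rows
```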
- pipelines.py (the item pipelines that persist the data):
```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

import pymysql
import redis


# Pipeline 1: write every item to boss.txt
class MyFirstSpiderPipeline(object):
    fp = None

    # called once when the spider starts: open the output file
    def open_spider(self, spider):
        print('Spider started')
        self.fp = open('./boss.txt', 'w', encoding='utf8')

    # called once when the spider closes: release the file handle
    def close_spider(self, spider):
        self.fp.close()
        print('Spider finished')

    # called for every item; return it so later pipelines receive it too
    def process_item(self, item, spider):
        self.fp.write(item['job'] + ':' + item['salary'] + ':' + item['company'] + '\n')
        return item


# Pipeline 2: write every item into MySQL
class MySqlPipeline(object):
    conn = None
    cursor = None

    # step 1: open the MySQL connection when the spider starts
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                    password='root', db='scrapy', charset='utf8')
        self.cursor = self.conn.cursor()
        print('MySQL connection opened, starting to write')

    # step 2: insert each item; return it in case another pipeline follows
    def process_item(self, item, spider):
        try:
            self.cursor.execute(
                'INSERT INTO boss (job_name, salary, company) VALUES (%s, %s, %s)',
                (item['job'], item['salary'], item['company']))
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            print('Insert failed, transaction rolled back:', e)
        return item

    # step 3: close the cursor and connection when the spider finishes
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
        print('MySQL writing finished')


# Pipeline 3: push every item onto a Redis list
class RedisPipeline(object):
    conn = None

    def open_spider(self, spider):
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    # serialise the item to JSON and push it onto the 'boss' list
    def process_item(self, item, spider):
        dic = {
            'job_name': item['job'],
            'salary': item['salary'],
            'company': item['company'],
        }
        try:
            self.conn.lpush('boss', json.dumps(dic))
            print('Written to Redis')
        except Exception as e:
            print('Redis write failed:', e)
        return item
```
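- MySqlPipeline expects the scrapy database and a boss table to exist already; a one-off setup sketch with pymysql (the column names come from the INSERT above, the types are assumptions):

```python
# One-off setup for MySqlPipeline: create the 'scrapy' database and the
# 'boss' table it inserts into. Column types are assumptions.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('CREATE DATABASE IF NOT EXISTS scrapy CHARACTER SET utf8')
    cursor.execute('USE scrapy')
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS boss ('
        ' id INT PRIMARY KEY AUTO_INCREMENT,'
        ' job_name VARCHAR(255),'
        ' salary VARCHAR(64),'
        ' company VARCHAR(255)'
        ') DEFAULT CHARSET=utf8'
    )
conn.commit()
conn.close()
```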
- settings.py (the relevant settings):
```python
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'my_first_spider.pipelines.MyFirstSpiderPipeline': 300,
    'my_first_spider.pipelines.RedisPipeline': 301,
    'my_first_spider.pipelines.MySqlPipeline': 302,
}

BOT_NAME = 'my_first_spider'

SPIDER_MODULES = ['my_first_spider.spiders']
NEWSPIDER_MODULE = 'my_first_spider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
```
- Run scrapy crawl first --nolog from the command line; the --nolog flag suppresses log output
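- During development it can be handier to start the crawl from a small script in the project root (next to scrapy.cfg), e.g. to debug in an IDE; a minimal sketch:

```python
# start.py, placed next to scrapy.cfg: runs the same crawl command
# programmatically so the spider can be launched or debugged from an IDE.
from scrapy import cmdline

cmdline.execute('scrapy crawl first --nolog'.split())
```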
- The crawl completes successfully
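- To double-check what RedisPipeline stored, the list can be read back; a sketch assuming the same local Redis instance and the 'boss' key used above:

```python
# Read back the JSON records that RedisPipeline pushed onto the 'boss' list.
import json

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)
print(conn.llen('boss'), 'records stored')
for raw in conn.lrange('boss', 0, 4):   # inspect the first five entries
    print(json.loads(raw))
```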