• Using the Scrapy framework --- crawling a novel site and storing the data in a database


    My steps:

    1>settings.py:

    BOT_NAME = 'newding'

    SPIDER_MODULES = ['newding.spiders']
    NEWSPIDER_MODULE = 'newding.spiders'


    ROBOTSTXT_OBEY = True

    ITEM_PIPELINES = {
        'newding.pipelines.NewdingPipeline': 300,
    }

    The settings above are generated automatically when the project is created.

    The settings below are what I added for the database-storage stage:

    MYSQL_USER = 'root'
    MYSQL_PASSWORD = '12345678'
    MYSQL_HOST = '127.0.0.1'
    MYSQL_PORT = '3306'
    MYSQL_DB = 'xiaoshuo'
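
    The pipeline in step 5 reaches these values by importing settings.py directly. Scrapy can also hand them to a pipeline through its settings API; below is a minimal sketch of that pattern (the class name MySQLPipelineSketch is my own and is not part of this project):

    import mysql.connector


    class MySQLPipelineSketch(object):
        def __init__(self, user, password, host, port, db):
            self.db = mysql.connector.connect(user=user, password=password,
                                              host=host, port=port, db=db)

        @classmethod
        def from_crawler(cls, crawler):
            # Scrapy calls this hook and passes in the crawler; crawler.settings
            # exposes everything defined in settings.py, including custom keys
            s = crawler.settings
            return cls(s.get('MYSQL_USER'), s.get('MYSQL_PASSWORD'),
                       s.get('MYSQL_HOST'), int(s.get('MYSQL_PORT')), s.get('MYSQL_DB'))

        def process_item(self, item, spider):
            return item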


    2>RUN.py
    from scrapy.cmdline import execute
    execute(['scrapy', 'crawl', 'newding1s'])  # run the crawl command for this project
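
    The same run can also be started without scrapy.cmdline, e.g. with CrawlerProcess; a small sketch using Scrapy's standard API:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())  # loads this project's settings.py
    process.crawl('newding1s')                        # same spider name as defined in newding1s.py
    process.start()                                   # blocks until the crawl finishes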


    3>items.py
    import scrapy


    class NewdingItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        types = scrapy.Field()
        zijie = scrapy.Field()
        book_url = scrapy.Field()
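
    A NewdingItem behaves like a dict restricted to the fields declared above, which is what the pipeline relies on later; a tiny illustration (the value below is made up):

    from newding.items import NewdingItem

    item = NewdingItem()
    item['title'] = 'some title'   # only declared fields may be assigned; anything else raises KeyError
    print(item['title'])
    print(dict(item))              # can be converted to a plain dict when needed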


    4>sql.py
    from newding.settings import *
    import mysql.connector

    db = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PASSWORD, host=MYSQL_HOST,
                                 port=MYSQL_PORT, db=MYSQL_DB)
    cursor = db.cursor(buffered=True)


    class Sql():
        @classmethod
        def insert_book(cls, title, types, zijie, book_url):
            sql = ("insert INTO book_table(`title`,`types`,`zijie`,`book_url`) VALUES ('"
                   + title + "','" + types + "','" + zijie + "','" + book_url + "')")
            cursor.execute(sql)  # run the SQL statement through the cursor
            db.commit()          # commit the data

        @classmethod
        def select_book(cls, book_url):
            # checks whether a record with this book_url already exists:
            # returns 1 if it does, 0 if it does not
            sql = "select EXISTS (select 1 FROM book_table WHERE book_url='" + book_url + "')"
            cursor.execute(sql)
            result = cursor.fetchall()  # fetch the query result
            return result
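
    The post assumes book_table already exists but never shows its schema, and the string concatenation above breaks as soon as a title contains a quote character. The sketch below creates an assumed schema (the column types are my guess) and rewrites the two methods with %s placeholders so mysql.connector does the escaping:

    from newding.settings import *
    import mysql.connector

    db = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PASSWORD, host=MYSQL_HOST,
                                 port=MYSQL_PORT, db=MYSQL_DB)
    cursor = db.cursor(buffered=True)

    # assumed schema -- adjust the column types/lengths to the real data
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS book_table (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            types VARCHAR(100),
            zijie VARCHAR(255),
            book_url VARCHAR(255)
        )
    """)


    class Sql():
        @classmethod
        def insert_book(cls, title, types, zijie, book_url):
            # parameterized insert: the driver escapes quotes and special characters
            sql = ("INSERT INTO book_table (`title`, `types`, `zijie`, `book_url`) "
                   "VALUES (%s, %s, %s, %s)")
            cursor.execute(sql, (title, types, zijie, book_url))
            db.commit()

        @classmethod
        def select_book(cls, book_url):
            # returns 1 if a row with this book_url exists, 0 otherwise
            cursor.execute("SELECT EXISTS (SELECT 1 FROM book_table WHERE book_url = %s)",
                           (book_url,))
            return cursor.fetchall()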

    5>pipelines.py
    from .sql import Sql  # import the Sql helper defined in sql.py


    class NewdingPipeline(object):
        def process_item(self, item, spider):
            title = item['title']
            types = item['types']
            zijie = item['zijie']
            book_url = item['book_url']

            if not Sql.select_book(book_url)[0][0]:
                Sql.insert_book(title, types, zijie, book_url)
            else:
                print('This novel is already in the database')
            return item
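
    sql.py opens its connection at import time and never closes it. A Scrapy pipeline also has open_spider/close_spider hooks, so the connection's lifetime can be tied to the crawl instead; a sketch of that pattern (not what this project does, and the class name is my own):

    import mysql.connector
    from newding.settings import MYSQL_USER, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DB


    class NewdingDBPipeline(object):
        def open_spider(self, spider):
            # called once when the spider starts
            self.db = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PASSWORD,
                                              host=MYSQL_HOST, port=MYSQL_PORT, db=MYSQL_DB)
            self.cursor = self.db.cursor(buffered=True)

        def close_spider(self, spider):
            # called once when the spider finishes
            self.cursor.close()
            self.db.close()

        def process_item(self, item, spider):
            self.cursor.execute(
                "INSERT INTO book_table (`title`, `types`, `zijie`, `book_url`) VALUES (%s, %s, %s, %s)",
                (item['title'], item['types'], item['zijie'], item['book_url']))
            self.db.commit()
            return item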

    6>newding1s.py (the spider file)
    import scrapy
    from scrapy.http import Request
    from scrapy.spiders import CrawlSpider, Rule  # CrawlSpider and Rule used together can crawl an entire site; Request needs no explanation
    from scrapy.linkextractors import LinkExtractor
    from newding.items import NewdingItem


    class Newding1sSpider(scrapy.Spider):
        # name, allowed_domains and start_urls are fixed attribute names and must not be renamed
        name = 'newding1s'
        allowed_domains = ['23us.so']
        start_urls = ['http://www.23us.so/']

        def parse(self, response):
            start_urls = "http://www.23us.so/list/"
            end_url = ".html"
            for i in range(1, 10):  # loop over the Dingdian category sections
                the_url = start_urls + str(i) + '_1' + end_url  # build the category-page URL
                # print(the_url)
                yield Request(the_url, self.san)  # hand the response to the next callback, san()

        def san(self, response):
            yeshu = response.xpath('//*[@id="pagelink"]/a/text()').extract()[-1]  # xpath: the largest page number
            yeshu_url = response.xpath('//*[@id="pagelink"]/a/@href').extract()[0]  # xpath: a pagination link, used to build the prefix
            qie = yeshu_url[:-6]  # http://www.23us.so/list/1_

            for i in range(1, int(yeshu) + 1):  # loop through every page
                qie_html = qie + str(i) + ".html"  # prefix + page number
                # print(qie_html)
                yield Request(qie_html, self.si)

        def si(self, response):
            # follow the link to the book's detail page
            shu_url = response.xpath('//*[@id="content"]/dd[1]/table/tr[2]/td[1]/a/@href').extract()[0]
            # print(shu_url)
            yield Request(shu_url, self.wu)

        def wu(self, response):
            item = NewdingItem()  # the item with the fields defined in items.py
            types = response.xpath('//*[@id="at"]/tr[1]/td[1]/a/text()').extract()[0]  # novel category
            zijie = response.xpath('//*[@id="at"]/tr[2]/td[2]/text()').extract()[0].replace('\xa0', '')  # strip the non-breaking space to avoid garbled text
            title = response.xpath('//*[@id="content"]/dd[1]/h1/text()').extract()[0]
            book_url = response.xpath('//a[@class="read"]/@href').extract()[0]

            item['title'] = title
            item['types'] = types
            item['zijie'] = zijie
            item['book_url'] = book_url

            return item
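
    After a run finishes, whether anything actually landed in MySQL can be checked with a quick throwaway script (it reuses the connection values from settings.py above):

    import mysql.connector

    db = mysql.connector.connect(user='root', password='12345678', host='127.0.0.1',
                                 port=3306, db='xiaoshuo')
    cursor = db.cursor()
    cursor.execute("SELECT COUNT(*) FROM book_table")
    print(cursor.fetchone()[0], 'rows in book_table')
    cursor.execute("SELECT title, types, book_url FROM book_table LIMIT 5")
    for row in cursor.fetchall():
        print(row)
    db.close()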



    The project here stores Dingdian novel information in a database (Scrapy's strength is that the structure stays clear and explicit; its downside is that the wiring between all the pieces is fiddly).
    For comparison, see my other code note that scrapes the same Dingdian pages with plain xpath; the difference between the two approaches is obvious there.


