• Using the Scrapy framework --- crawling a novel site and storing the data in a database


    My steps:

    1>settings.py:

    BOT_NAME = 'newding'

    SPIDER_MODULES = ['newding.spiders']
    NEWSPIDER_MODULE = 'newding.spiders'


    ROBOTSTXT_OBEY = True

    ITEM_PIPELINES = {
        'newding.pipelines.NewdingPipeline': 300,
    }

    The settings above are generated automatically when the project is created.

    The settings below are what I added for the database-storage stage:

    MYSQL_USER = 'root'
    MYSQL_PASSWORD = '12345678'
    MYSQL_HOST = '127.0.0.1'
    MYSQL_PORT = '3306'
    MYSQL_DB = 'xiaoshuo'
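
    The pipeline in step 5 reaches these values by importing settings.py directly. Scrapy can also hand them to a pipeline through its settings API; below is a minimal sketch of that pattern (the class name MySQLPipelineSketch is my own and is not part of this project):

    import mysql.connector


    class MySQLPipelineSketch(object):
        def __init__(self, user, password, host, port, db):
            self.db = mysql.connector.connect(user=user, password=password,
                                              host=host, port=port, db=db)

        @classmethod
        def from_crawler(cls, crawler):
            # Scrapy calls this hook and passes in the crawler; crawler.settings
            # exposes everything defined in settings.py, including custom keys
            s = crawler.settings
            return cls(s.get('MYSQL_USER'), s.get('MYSQL_PASSWORD'),
                       s.get('MYSQL_HOST'), int(s.get('MYSQL_PORT')), s.get('MYSQL_DB'))

        def process_item(self, item, spider):
            return item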


    2>RUN.py
    from scrapy.cmdline import execute
    execute(['scrapy', 'crawl', 'newding1s'])  # run the crawl command for this project
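
    The same run can also be started without scrapy.cmdline, e.g. with CrawlerProcess; a small sketch using Scrapy's standard API:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())  # loads this project's settings.py
    process.crawl('newding1s')                        # same spider name as defined in newding1s.py
    process.start()                                   # blocks until the crawl finishes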


    3>items.py
    import scrapy


    class NewdingItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        types = scrapy.Field()
        zijie = scrapy.Field()
        book_url = scrapy.Field()
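
    A NewdingItem behaves like a dict restricted to the fields declared above, which is what the pipeline relies on later; a tiny illustration (the value below is made up):

    from newding.items import NewdingItem

    item = NewdingItem()
    item['title'] = 'some title'   # only declared fields may be assigned; anything else raises KeyError
    print(item['title'])
    print(dict(item))              # can be converted to a plain dict when needed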


    4>sql.py
    from newding.settings import *
    import mysql.connector

    db = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PASSWORD, host=MYSQL_HOST,
                                 port=MYSQL_PORT, db=MYSQL_DB)
    cursor = db.cursor(buffered=True)


    class Sql():
        @classmethod
        def insert_book(cls, title, types, zijie, book_url):
            sql = ("insert INTO book_table(`title`,`types`,`zijie`,`book_url`) VALUES ('"
                   + title + "','" + types + "','" + zijie + "','" + book_url + "')")
            cursor.execute(sql)  # run the SQL statement through the cursor
            db.commit()          # commit the data

        @classmethod
        def select_book(cls, book_url):
            # checks whether a record with this book_url already exists:
            # returns 1 if it does, 0 if it does not
            sql = "select EXISTS (select 1 FROM book_table WHERE book_url='" + book_url + "')"
            cursor.execute(sql)
            result = cursor.fetchall()  # fetch the query result
            return result
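
    The post assumes book_table already exists but never shows its schema, and the string concatenation above breaks as soon as a title contains a quote character. The sketch below creates an assumed schema (the column types are my guess) and rewrites the two methods with %s placeholders so mysql.connector does the escaping:

    from newding.settings import *
    import mysql.connector

    db = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PASSWORD, host=MYSQL_HOST,
                                 port=MYSQL_PORT, db=MYSQL_DB)
    cursor = db.cursor(buffered=True)

    # assumed schema -- adjust the column types/lengths to the real data
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS book_table (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            types VARCHAR(100),
            zijie VARCHAR(255),
            book_url VARCHAR(255)
        )
    """)


    class Sql():
        @classmethod
        def insert_book(cls, title, types, zijie, book_url):
            # parameterized insert: the driver escapes quotes and special characters
            sql = ("INSERT INTO book_table (`title`, `types`, `zijie`, `book_url`) "
                   "VALUES (%s, %s, %s, %s)")
            cursor.execute(sql, (title, types, zijie, book_url))
            db.commit()

        @classmethod
        def select_book(cls, book_url):
            # returns 1 if a row with this book_url exists, 0 otherwise
            cursor.execute("SELECT EXISTS (SELECT 1 FROM book_table WHERE book_url = %s)",
                           (book_url,))
            return cursor.fetchall()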

    5>pipelines.py
    from .sql import Sql  # import the Sql helper defined in sql.py


    class NewdingPipeline(object):
        def process_item(self, item, spider):
            title = item['title']
            types = item['types']
            zijie = item['zijie']
            book_url = item['book_url']

            if not Sql.select_book(book_url)[0][0]:
                Sql.insert_book(title, types, zijie, book_url)
            else:
                print('This novel is already in the database')
            return item
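
    sql.py opens its connection at import time and never closes it. A Scrapy pipeline also has open_spider/close_spider hooks, so the connection's lifetime can be tied to the crawl instead; a sketch of that pattern (not what this project does, and the class name is my own):

    import mysql.connector
    from newding.settings import MYSQL_USER, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DB


    class NewdingDBPipeline(object):
        def open_spider(self, spider):
            # called once when the spider starts
            self.db = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PASSWORD,
                                              host=MYSQL_HOST, port=MYSQL_PORT, db=MYSQL_DB)
            self.cursor = self.db.cursor(buffered=True)

        def close_spider(self, spider):
            # called once when the spider finishes
            self.cursor.close()
            self.db.close()

        def process_item(self, item, spider):
            self.cursor.execute(
                "INSERT INTO book_table (`title`, `types`, `zijie`, `book_url`) VALUES (%s, %s, %s, %s)",
                (item['title'], item['types'], item['zijie'], item['book_url']))
            self.db.commit()
            return item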

    6>newding1s.py (the spider file)
    import scrapy
    from scrapy.http import Request
    from scrapy.spiders import CrawlSpider, Rule  # CrawlSpider and Rule used together can crawl an entire site; Request needs no explanation
    from scrapy.linkextractors import LinkExtractor
    from newding.items import NewdingItem


    class Newding1sSpider(scrapy.Spider):
        # name, allowed_domains and start_urls are fixed attribute names and must not be renamed
        name = 'newding1s'
        allowed_domains = ['23us.so']
        start_urls = ['http://www.23us.so/']

        def parse(self, response):
            start_urls = "http://www.23us.so/list/"
            end_url = ".html"
            for i in range(1, 10):  # loop over the Dingdian category sections
                the_url = start_urls + str(i) + '_1' + end_url  # build the category-page URL
                # print(the_url)
                yield Request(the_url, self.san)  # hand the response to the next callback, san()

        def san(self, response):
            yeshu = response.xpath('//*[@id="pagelink"]/a/text()').extract()[-1]  # xpath: the largest page number
            yeshu_url = response.xpath('//*[@id="pagelink"]/a/@href').extract()[0]  # xpath: a pagination link, used to build the prefix
            qie = yeshu_url[:-6]  # http://www.23us.so/list/1_

            for i in range(1, int(yeshu) + 1):  # loop through every page
                qie_html = qie + str(i) + ".html"  # prefix + page number
                # print(qie_html)
                yield Request(qie_html, self.si)

        def si(self, response):
            # follow the link to the book's detail page
            shu_url = response.xpath('//*[@id="content"]/dd[1]/table/tr[2]/td[1]/a/@href').extract()[0]
            # print(shu_url)
            yield Request(shu_url, self.wu)

        def wu(self, response):
            item = NewdingItem()  # the item with the fields defined in items.py
            types = response.xpath('//*[@id="at"]/tr[1]/td[1]/a/text()').extract()[0]  # novel category
            zijie = response.xpath('//*[@id="at"]/tr[2]/td[2]/text()').extract()[0].replace('\xa0', '')  # strip the non-breaking space to avoid garbled text
            title = response.xpath('//*[@id="content"]/dd[1]/h1/text()').extract()[0]
            book_url = response.xpath('//a[@class="read"]/@href').extract()[0]

            item['title'] = title
            item['types'] = types
            item['zijie'] = zijie
            item['book_url'] = book_url

            return item
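
    After a run finishes, whether anything actually landed in MySQL can be checked with a quick throwaway script (it reuses the connection values from settings.py above):

    import mysql.connector

    db = mysql.connector.connect(user='root', password='12345678', host='127.0.0.1',
                                 port=3306, db='xiaoshuo')
    cursor = db.cursor()
    cursor.execute("SELECT COUNT(*) FROM book_table")
    print(cursor.fetchone()[0], 'rows in book_table')
    cursor.execute("SELECT title, types, book_url FROM book_table LIMIT 5")
    for row in cursor.fetchall():
        print(row)
    db.close()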



    The project here stores Dingdian novel information in a database (Scrapy's strength is that the structure stays clear and explicit; its downside is that the wiring between all the pieces is fiddly).
    For comparison, see my other code note that scrapes the same Dingdian pages with plain xpath; the difference between the two approaches is obvious there.


