• Crawler series, part 6: crawling the whole Jianshu site with Scrapy's CrawlSpider


    Create the project

    # create a new project
    $ scrapy startproject jianshu
    # enter the project directory
    $ cd jianshu
    # generate a CrawlSpider-based spider
    $ scrapy genspider -t crawl jianshu_spider jianshu.com
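
    Before running the crawl, a few settings.py tweaks are usually needed. A minimal sketch with typical values; the delay and User-Agent string here are assumptions, not from the original post:

    # settings.py (sketch; adjust values as needed)
    ROBOTSTXT_OBEY = False   # skip robots.txt checks for this demo crawl
    DOWNLOAD_DELAY = 1       # throttle to roughly one request per second
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'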

    items.py

    import scrapy
    
    
    class ArticleItem(scrapy.Item):
        title = scrapy.Field()
        content = scrapy.Field()
        article_id = scrapy.Field()
        origin_url = scrapy.Field()
        author = scrapy.Field()
        avatar = scrapy.Field()
        pub_time = scrapy.Field()

    jianshu_spider.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from jianshu.items import ArticleItem
    
    
    class JianshuSpiderSpider(CrawlSpider):
        name = 'jianshu_spider'
        allowed_domains = ['jianshu.com']
        start_urls = ['https://www.jianshu.com/']
    
        rules = (
            # follow every link whose path contains /p/<12-character article id>
            Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
        )
    
        def parse_detail(self, response):
            title = response.xpath("//h1[@class='title']/text()").get()
            content = response.xpath("//div[@class='show-content-free']").get()
            avatar = response.xpath("//a[@class='avatar']/img/@src").get()
            author = response.xpath("//div[@class='info']/span/a/text()").get()
            pub_time = response.xpath("//span[@class='publish-time']/text()").get()
            # the article id is the last path segment, with any query string stripped
            article_id = response.url.split("?")[0].split("/")[-1]
            origin_url = response.url
            item = ArticleItem(
                title=title,
                content=content,
                avatar=avatar,
                pub_time=pub_time,
                article_id=article_id,
                origin_url=origin_url,
                author=author
            )
            yield item
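
    With the spider in place, the crawl can be started from the project root (the directory containing scrapy.cfg); CrawlSpider follows every matching link and feeds each article page to parse_detail. Items are only persisted once one of the pipelines below is enabled.

    # start the crawl
    $ scrapy crawl jianshu_spider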

    Synchronous MySQL inserts

    import pymysql
    
    
    class JianshuPipeline(object):
        def __init__(self):
            dbparams = {
                'host': '127.0.0.1',
                'user': 'root',
                'password': '123456',
                'database': 'jianshu',
                'port': 3306,
                'charset': 'utf8'
            }
            self.conn = pymysql.connect(**dbparams)
            self.cursor = self.conn.cursor()
            self._sql = None
    
        def process_item(self, item, spider):
            self.cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'], 
                                           item['pub_time'], item['origin_url'], item['article_id']))
            self.conn.commit()
            return item
    
        @property
        def sql(self):
            # build the statement once and cache it
            if not self._sql:
                self._sql = """
                insert into article(title, content, author, avatar, pub_time, origin_url, article_id)
                values (%s, %s, %s, %s, %s, %s, %s)
                """
            return self._sql
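
    The insert statement above assumes an article table already exists in the jianshu database. A one-off setup sketch follows; the column names match the SQL above, but the types are assumptions inferred from the item fields:

    import pymysql
    
    # create the target table once before running the spider
    conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                           database='jianshu', charset='utf8')
    with conn.cursor() as cursor:
        cursor.execute("""
            create table if not exists article(
                id int primary key auto_increment,
                title varchar(200),
                content longtext,           -- full HTML of the article body
                author varchar(100),
                avatar varchar(300),
                pub_time varchar(50),
                origin_url varchar(300),
                article_id varchar(20)
            ) default charset=utf8
        """)
    conn.commit()
    conn.close()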

    Asynchronous MySQL inserts

    from twisted.enterprise import adbapi
    from pymysql import cursors
    
    
    class JianshuTwistedPipeline(object):
        def __init__(self):
            dbparams = {
                'host': '127.0.0.1',
                'user': 'root',
                'password': '123456',
                'database': 'jianshu',
                'port': 3306,
                'charset': 'utf8',
                'cursorclass': cursors.DictCursor
            }
            self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
            self._sql = None
    
        @property
        def sql(self):
            # build the statement once and cache it
            if not self._sql:
                self._sql = """
                insert into article(title, content, author, avatar, pub_time, origin_url, article_id)
                values (%s, %s, %s, %s, %s, %s, %s)
                """
            return self._sql
    
        def process_item(self, item, spider):
            # run the insert on a pooled connection without blocking the reactor
            defer = self.dbpool.runInteraction(self.insert_item, item)
            defer.addErrback(self.handle_error, item, spider)
            # return the item immediately; the write finishes in the background
            return item
    
        def insert_item(self, cursor, item):
            # runInteraction commits the transaction automatically on success
            cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'],
                                      item['pub_time'], item['origin_url'], item['article_id']))
    
        def handle_error(self, error, item, spider):
            print('=' * 10 + 'error' + '=' * 10)
            print(error)
            print('=' * 10 + 'error' + '=' * 10)
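
    Neither pipeline runs until it is registered in settings.py. Enable exactly one of the two; 300 is just the conventional priority value:

    # settings.py: register one of the pipelines
    ITEM_PIPELINES = {
        # 'jianshu.pipelines.JianshuPipeline': 300,        # synchronous version
        'jianshu.pipelines.JianshuTwistedPipeline': 300,   # asynchronous version
    }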

     

  • Original post: https://www.cnblogs.com/leijing0607/p/8075324.html