• Jianshu full-site crawl with CrawlSpider, asynchronous saving to MySQL


    # Jianshu (jianshu.com)
    # Save the data in MySQL; integrate selenium + chromedriver into Scrapy; crawl the whole site
    # Scrape AJAX-loaded data
    
    # Spider file
    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from jianshu_spider.items import ArticleItem
    
    class JsSpider(CrawlSpider):
        name = 'js'
        allowed_domains = ['jianshu.com']
        start_urls = ['https://www.jianshu.com/']  # start crawling from the homepage
    
        rules = (
            # The recommended-article links at the bottom of a detail page have hrefs of the form /p/<12-char id>
            Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
                 callback='parse_detail', follow=True),
        )
    
        def parse_detail(self, response):
            # print(response.text)
            title = response.xpath("//div[@class='note']/div[@class='post']/div[@class='article']/h1[@class='title']/text()").get()
            # print(title)
            avatar = response.xpath("//a[@class='avatar']/img/@src").get()
            # print(avatar)
            author = response.xpath("//span[@class='name']/a/text()").get()
            # print(author)
            pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*","")
            # print(pub_time)
    
            # Normally the URL contains at most one '?'
            url = response.url
            url1 = url.split("?")[0]
            article_id = url1.split("/")[-1]
            # print(article_id)
    
            # Keep the HTML tags in the content so it can be rendered later
            content = response.xpath("//div[@class='show-content']").get()
            # print(content)
            item = ArticleItem(
                title=title,
                avatar=avatar,
                author=author,
                pub_time=pub_time,
                origin_url=response.url,
                article_id=article_id,
                content=content
            )
            yield item
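
    # To run the spider from the root of the jianshu_spider project:
    #   scrapy crawl js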
    
    # Item file
    import scrapy
    
    class ArticleItem(scrapy.Item):
        # define the fields for your item here like:
        title = scrapy.Field()
        content = scrapy.Field()
        article_id = scrapy.Field()
        origin_url = scrapy.Field()
        author = scrapy.Field()
        avatar = scrapy.Field()
        pub_time = scrapy.Field()
        
        
    # Pipeline file: save the items to MySQL
    import pymysql
    from twisted.enterprise import adbapi       # Twisted's module for asynchronous database access
    from pymysql import cursors
    
    class JianshuSpiderPipeline(object):
        def __init__(self):
            dbparams={
                'host':'127.0.0.1',
                'port':3306,
                'user':'root',
                'password':'',
                'database':'jianshu',
                'charset':'utf8'
            }
            self.conn = pymysql.connect(**dbparams)
            # **dbparams unpacks the dict, i.e. equivalent to passing host='127.0.0.1', ... as keyword arguments
    
            self.cursor = self.conn.cursor()
            self._sql = None
    
        def process_item(self, item, spider):
            self.cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],
                                          item['pub_time'],item['origin_url'],item['article_id']))
            self.conn.commit() # commit() here is synchronous, which makes this pipeline slow
            return item
    
        @property
        def sql(self):
            if not self._sql:  # build the statement lazily on first access
                self._sql = '''
                insert into article2(id,title,content,author,avatar,pub_time,
                origin_url,article_id) values(null,%s,%s,%s,%s,%s,%s,%s)
                '''
            return self._sql
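
    # A possible schema for the article2 table used by the INSERT above. The column names
    # come from the SQL statement; the types and lengths below are assumptions, not from
    # the original post:
    #
    #   CREATE TABLE article2 (
    #       id INT PRIMARY KEY AUTO_INCREMENT,
    #       title VARCHAR(255),
    #       content LONGTEXT,
    #       author VARCHAR(255),
    #       avatar VARCHAR(255),
    #       pub_time VARCHAR(50),
    #       origin_url VARCHAR(255),
    #       article_id VARCHAR(50)
    #   ) DEFAULT CHARSET=utf8;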
    # Optimize the pipeline above to save asynchronously
    # Use the database connection pool provided by Twisted (adbapi.ConnectionPool) to turn the inserts into asynchronous operations (a good point to mention in interviews)

    # The storage above is synchronous and slow; optimize it into an asynchronous version
    class JianshuTwistedPipeline(object):
        def __init__(self):
            # create the connection pool
            dbparams = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '',
                'database': 'jianshu',
                'charset': 'utf8',
                'cursorclass':cursors.DictCursor
            }
            self.dbpool = adbapi.ConnectionPool('pymysql',**dbparams)
            self._sql = None
    
        @property
        def sql(self):
            if not self._sql:  # build the statement lazily on first access
                self._sql = '''
                insert into article2(id,title,content,author,avatar,pub_time,
                origin_url,article_id) values(null,%s,%s,%s,%s,%s,%s,%s)
                '''
            return self._sql
    
        def process_item(self, item, spider):
            # runInteraction runs insert_item asynchronously in a thread from the pool
            defer = self.dbpool.runInteraction(self.insert_item, item)
            defer.addErrback(self.handle_error, item, spider)
            return item
    
        def insert_item(self, cursor, item):  # insert the item into the database (runs in a pool thread)
            cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],
                                          item['pub_time'],item['origin_url'],item['article_id']))
    
        def handle_error(self,error,item,spider):
            print('='*20)
            print("error:",error)
            print('='*20)
    
    # Update the ITEM_PIPELINES setting in settings.py
    ITEM_PIPELINES = {
       # 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
       'jianshu_spider.pipelines.JianshuTwistedPipeline': 300, # save data asynchronously
    }
    # Handle dynamically loaded data (fields rendered via AJAX)
    # Use selenium + chromedriver for this
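
    # Sketch of a downloader middleware that renders pages with selenium + chromedriver so the
    # AJAX-loaded fields (read count, like count, ...) are present in the response. This is a
    # minimal, assumed implementation -- the class name SeleniumDownloadMiddleware, the fixed
    # sleep and the middleware priority are illustrative, not from the original post; an explicit
    # WebDriverWait on the dynamic elements would be more robust.
    from scrapy.http import HtmlResponse
    from selenium import webdriver
    import time

    class SeleniumDownloadMiddleware(object):
        def __init__(self):
            # one shared Chrome instance for the whole crawl; pass the chromedriver path
            # explicitly if it is not on PATH
            self.driver = webdriver.Chrome()

        def process_request(self, request, spider):
            self.driver.get(request.url)
            time.sleep(1)  # crude wait for the AJAX content to load
            source = self.driver.page_source
            # returning an HtmlResponse here skips Scrapy's default downloader for this request
            return HtmlResponse(url=self.driver.current_url, body=source,
                                request=request, encoding='utf-8')

    # Enable it in settings.py:
    # DOWNLOADER_MIDDLEWARES = {
    #     'jianshu_spider.middlewares.SeleniumDownloadMiddleware': 543,
    # }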
    
    
    # Spider file: also extract the read count, like count, word count, collections (subjects) and comment count, and save them to the item
        def parse_detail(self, response):
            # print(response.text)
            title = response.xpath("//div[@class='note']/div[@class='post']/div[@class='article']/h1[@class='title']/text()").get()
            print(title)
            avatar = response.xpath("//a[@class='avatar']/img/@src").get()
            # print(avatar)
            author = response.xpath("//span[@class='name']/a/text()").get()
            # print(author)
            pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*","")
            # print(pub_time)
    
            # Normally the URL contains at most one '?'
            url = response.url
            url1 = url.split("?")[0]
            article_id = url1.split("/")[-1]
            # print(article_id)
    
            # Keep the HTML tags in the content so it can be rendered later
            content = response.xpath("//div[@class='show-content']").get()
            # print(content)
    
            # The fields below are loaded dynamically (only present after the page is rendered)
            word_count = response.xpath("//span[@class='wordage']/text()").get().split(" ")[-1]
            read_count = response.xpath("//span[@class='views-count']/text()").get().split(" ")[-1]
            comment_count = response.xpath("//span[@class='comments-count']/text()").get().split(" ")[-1]
            like_count = response.xpath("//span[@class='likes-count']/text()").get().split(" ")[-1]
            subject = response.xpath("//div[@class='include-collection']/a/div/text()").getall()
            # subject is extracted as a list; MySQL cannot store a list directly, so join it into a string
            subject = ",".join(subject)
    
            item = ArticleItem(
                title=title,
                avatar=avatar,
                author=author,
                pub_time=pub_time,
                origin_url=response.url,
                article_id=article_id,
                content=content,
                
                word_count=word_count,
                read_count=read_count,
                comment_count=comment_count,
                like_count=like_count,
                subject=subject,
            )
            yield item
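
    # The ArticleItem defined earlier also needs fields for the new values, e.g. (field names
    # taken from the spider above):
    class ArticleItem(scrapy.Item):
        title = scrapy.Field()
        content = scrapy.Field()
        article_id = scrapy.Field()
        origin_url = scrapy.Field()
        author = scrapy.Field()
        avatar = scrapy.Field()
        pub_time = scrapy.Field()
        # fields for the dynamically loaded data
        word_count = scrapy.Field()
        read_count = scrapy.Field()
        comment_count = scrapy.Field()
        like_count = scrapy.Field()
        subject = scrapy.Field()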
    
    
    
    # Pipeline file
    # The same asynchronous pipeline as above, extended to insert the new fields
    class JianshuTwistedPipeline(object):
        def __init__(self):
            # create the connection pool
            dbparams = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '',
                'database': 'jianshu',
                'charset': 'utf8',
                'cursorclass':cursors.DictCursor
            }
            self.dbpool = adbapi.ConnectionPool('pymysql',**dbparams)
            self._sql = None
    
        @property
        def sql(self):
            if not self._sql:  # build the statement lazily on first access
                self._sql = '''
                insert into article2(id,title,content,author,avatar,pub_time,
                origin_url,article_id,read_count,word_count,like_count,comment_count,subject)
                values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                '''
            return self._sql
    
        def process_item(self, item, spider):
            # runInteraction runs insert_item asynchronously in a thread from the pool
            defer = self.dbpool.runInteraction(self.insert_item, item)
            defer.addErrback(self.handle_error, item, spider)
            return item
    
        def insert_item(self, cursor, item):  # insert the item into the database (runs in a pool thread)
            cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],
                                          item['pub_time'],item['origin_url'],item['article_id'],
                                     item['read_count'],item['word_count'],item['like_count'],item['comment_count'],item['subject']))
    
        def handle_error(self,error,item,spider):
            print('='*20+'error'+'='*20)
            print("error:",error)
            print('='*20+'error'+'='*20)
  • Original post: https://www.cnblogs.com/kenD/p/11123696.html