• Crawling all articles from the Jianshu site (saving to the database synchronously)


    import scrapy
    import re
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    from jianshu.items import JianshuItem
    
    
    class JsSpider(CrawlSpider):
        name = 'js'
        allowed_domains = ['jianshu.com']
        start_urls = ['https://www.jianshu.com/']
    
        rules = (
            # Match article detail links: paths like /p/<12-char slug>
            Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
        )
    
        def parse_detail(self, response):
            title = response.xpath('//h1[@class="title"]/text()').get()
            avatar = response.xpath('//a[@class="avatar"]/img/@src').get()
            author = response.xpath('//span[@class="name"]//text()').get()
            content = response.xpath('//div[@class="show-content"]').get()
            pub_time = response.xpath('//span[@class="publish-time"]/text()').get()
            pub_time = pub_time.replace('*', '') if pub_time else None  # strip the trailing asterisk
            read_count = response.xpath('//span[@class="views-count"]/text()').get()
            comment_count = response.xpath('//span[@class="comments-count"]/text()').get()
            like_count = response.xpath('//span[@class="likes-count"]/text()').get()
            rewards_count = response.xpath('//span[@class="wordage"]/text()').get()
            print('-------------', rewards_count)
            # Extract the digits from the count strings (guard against missing values)
            read_count = re.findall(r'\d+', read_count)[0] if read_count and re.findall(r'\d+', read_count) else None
            comment_count = re.findall(r'\d+', comment_count)[0] if comment_count and re.findall(r'\d+', comment_count) else None
            like_count = re.findall(r'\d+', like_count)[0] if like_count and re.findall(r'\d+', like_count) else None
            rewards_count = re.findall(r'\d+', rewards_count)[0] if rewards_count and re.findall(r'\d+', rewards_count) else None
            item = JianshuItem(
                title=title,
                avatar=avatar,
                author=author,
                content=content,
                pub_time=pub_time,
                read_count=read_count,
                comment_count=comment_count,
                like_count=like_count,
                rewards_count=rewards_count
            )
            yield item
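
  For the spider to reach the detail pages and hand items to the pipeline, the project settings need a few entries. A minimal sketch, assuming the project module is named jianshu and the pipeline class below lives in pipelines.py; the user agent and delay values are placeholders, not from the original post:

    # settings.py (sketch)
    ROBOTSTXT_OBEY = False            # jianshu's robots.txt would otherwise block most requests
    DOWNLOAD_DELAY = 1                # assumed throttle; tune for the target site
    DEFAULT_REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0',  # placeholder UA; Scrapy's default UA is often rejected
    }
    ITEM_PIPELINES = {
        'jianshu.pipelines.JianShuPipeline': 300,
    }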

    items.py:

    import scrapy
    
    
    class JianshuItem(scrapy.Item):
        title = scrapy.Field()
        avatar = scrapy.Field()
        author = scrapy.Field()
        content = scrapy.Field()
        pub_time = scrapy.Field()
        read_count = scrapy.Field()
        comment_count = scrapy.Field()
        like_count = scrapy.Field()
        rewards_count = scrapy.Field()

    pipelines.py:

    import pymysql


    class JianShuPipeline(object):
        def __init__(self):
            db_params = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '',
                'database': 'jianshu',
                'charset': 'utf8'
            }
            self.conn = pymysql.connect(**db_params)
            self.cursor = self.conn.cursor()
            self._sql = None
    
        def process_item(self, item, spider):
            self.cursor.execute(self.sql,
                                (item['title'], item['content'], item['author'], item['avatar'], item['pub_time'],
                                 item['like_count'], item['read_count'], item['comment_count'], item['rewards_count']))
            self.conn.commit()
            return item
    
        @property
        def sql(self):
            # Build the INSERT statement once and cache it
            if not self._sql:
                self._sql = """
                INSERT INTO article(id, title, content, author, avatar, pub_time,
                    like_count, read_count, comment_count, rewards_count)
                VALUES (null, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                """
            return self._sql
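
  The INSERT above assumes an article table already exists in the jianshu database. A possible schema, reconstructed from the item fields; the column names follow the pipeline's column list, but the types are assumptions, not from the original post:

    -- Hypothetical DDL matching the pipeline's INSERT column list
    CREATE TABLE article (
        id            INT PRIMARY KEY AUTO_INCREMENT,
        title         VARCHAR(255),
        content       LONGTEXT,        -- full HTML of the article body
        author        VARCHAR(64),
        avatar        VARCHAR(255),
        pub_time      VARCHAR(32),     -- stored as the scraped string
        like_count    INT,
        read_count    INT,
        comment_count INT,
        rewards_count INT
    ) DEFAULT CHARSET=utf8;

  With the table in place, the crawl runs with scrapy crawl js. Each yielded item triggers one blocking INSERT plus commit, which is what "synchronously" in the title refers to.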
  • Original article: https://www.cnblogs.com/yuqiangli0616/p/10338756.html