• scrapy crawl: scraping WeChat mini-program articles with a CrawlSpider (saving the data to the database asynchronously)
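    The project uses the standard Scrapy layout. A minimal sketch of the commands to generate and run it, assuming the project is named wxapp and the spider wx, as in the code below:

    scrapy startproject wxapp
    cd wxapp
    scrapy genspider -t crawl wx wxapp-union.com
    scrapy crawl wx

    spider: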


    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    from wxapp.items import WxappItem
    
    
    class WxSpider(CrawlSpider):
        name = 'wx'
        allowed_domains = ['wxapp-union.com']
        start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']
    
        rules = (
            # follow the paginated list pages
            Rule(LinkExtractor(allow=r'.*mod=list&catid=2&page=\d+'), follow=True),
            # hand each article detail page to parse_detail
            Rule(LinkExtractor(allow=r'.*article-.+\.html'), callback='parse_detail', follow=False),
        )
    
        def parse_detail(self, response):
            detail_href = response.request.url
            title = response.xpath('//h1[@class="ph"]/text()').get()
            content = response.xpath('//td[@id="article_content"]//text()').getall()
            content = [c.strip() for c in content]
            content = ''.join(content).strip()
            pub_time = response.xpath('//p[@class="authors"]/span/text()').get()
            author = response.xpath('//p[@class="authors"]/a/text()').get()
            item = WxappItem(title=title, content=content, detail_href=detail_href, pub_time=pub_time, author=author)
            yield item

    items:

    import scrapy


    class WxappItem(scrapy.Item):
        # field names must match the keyword arguments the spider uses to build the item
        title = scrapy.Field()
        pub_time = scrapy.Field()
        content = scrapy.Field()
        author = scrapy.Field()
        detail_href = scrapy.Field()

    pipeline:

    import pymysql
    from pymysql import cursors
    from twisted.enterprise import adbapi
    
    
    class WxAppPipeline(object):
        def __init__(self):
            db_params = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '',
                'database': 'wxapp',
                'charset': 'utf8',
                'cursorclass': cursors.DictCursor  # return result rows as dicts
            }
            # Twisted's adbapi connection pool: queries run in a thread pool,
            # so database writes do not block the reactor
            self.db_pool = adbapi.ConnectionPool('pymysql', **db_params)
            self._sql = None
    
        def process_item(self, item, spider):
            # runInteraction executes insert_item in a worker thread, so the
            # insert happens asynchronously and does not block the crawl
            defer = self.db_pool.runInteraction(self.insert_item, item)
            defer.addErrback(self.handle_error, item, spider)
            return item
    
        def insert_item(self, cursor, item):
            cursor.execute(self.sql, (item['title'], item['content'], item['detail_href'], item['pub_time'], item['author']))
    
        def handle_error(self, error, item, spider):
            print('=' * 10 + 'error' + '=' * 10)
            print(error)
    
        @property
        def sql(self):
            # build the INSERT statement once and cache it
            if not self._sql:
                self._sql = """
                    INSERT INTO article(id, title, content, detail_href, pub_time, author)
                    VALUES (null, %s, %s, %s, %s, %s)
                """
            return self._sql
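
    settings:

    The pipeline only runs if it is enabled in settings.py (a minimal sketch, assuming the default layout of a Scrapy project named wxapp):

    ITEM_PIPELINES = {
        'wxapp.pipelines.WxAppPipeline': 300,
    }

    The INSERT above also assumes an article table exists in the wxapp MySQL database; a possible schema matching the columns the pipeline writes:

    CREATE TABLE article (
        id INT PRIMARY KEY AUTO_INCREMENT,
        title VARCHAR(255),
        content TEXT,
        detail_href VARCHAR(255),
        pub_time VARCHAR(100),
        author VARCHAR(100)
    ) DEFAULT CHARSET=utf8;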
  • Original post: https://www.cnblogs.com/yuqiangli0616/p/10338708.html