• scrapy存储mysql


    使用 Scrapy 将抓取到的数据存储到 MySQL

     
    复制代码
    #spider.py
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

    from Cwpjt.items import CwpjtItem


    class FulongSpider(CrawlSpider):
        """Crawl sina.com.cn and collect the title and keywords of article pages.

        Links are followed when their URL matches the date/``doc-`` pattern in
        ``rules``; every matching page is handed to :meth:`parse_item`.
        """

        name = 'fulong'
        allowed_domains = ['sina.com.cn']
        start_urls = ['http://sina.com.cn/']

        # Example of an article URL the rule below is meant to match:
        #   http://news.sina.com.cn/c/2017-05-09/doc-ifyeycte9324112.shtml
        rules = (
            Rule(
                LinkExtractor(
                    # The pattern text is unchanged; the unescaped dots match
                    # any separator character between the date parts.
                    allow=r'.*?/[0-9]{4}.[0-9]{2}.[0-9]{2}.doc-.*?shtml',
                    allow_domains='sina.com.cn',
                ),
                callback='parse_item',
                follow=True,
            ),
        )

        def parse_item(self, response):
            """Build a ``CwpjtItem`` from one article page.

            Args:
                response: The downloaded page passed in by the crawl rule.

            Returns:
                A ``CwpjtItem`` whose ``name`` holds the extracted ``<title>``
                text nodes and whose ``kws`` holds the extracted ``content`` of
                the ``keywords`` meta tag. Both are the raw results of
                ``.extract()``; either may be empty when the page lacks the
                element.
            """
            item = CwpjtItem()
            item['name'] = response.xpath('/html/head/title/text()').extract()
            item['kws'] = response.xpath(
                '/html/head/meta[@name="keywords"]/@content'
            ).extract()
            return item
    复制代码

    pipeline(数据管道,通常位于项目的 pipelines.py)

    复制代码
    import pymysql
    from pymysql import connections
    class CwpjtPipeline(object):
        """Scrapy item pipeline that writes each item's title and keywords to MySQL.

        One connection and one cursor are opened when the pipeline is created
        and reused for every item; both are released in :meth:`close_spider`.
        """

        def __init__(self):
            """Open the MySQL connection and the cursor used for all inserts."""
            # NOTE(review): host and credentials are hard-coded; consider
            # reading them from the Scrapy settings instead.
            self.conn = pymysql.connect(
                host='127.0.0.1',
                user='root',
                password='123456',
                database='mydb',
                # The scraped titles/keywords are Chinese text; an explicit
                # charset avoids encode errors with a latin1 default.
                charset='utf8mb4',
            )
            self.cursor = self.conn.cursor()

        @staticmethod
        def _first(values, default=''):
            """Return the first element of *values*, or *default* if it is empty/None."""
            return values[0] if values else default

        def process_item(self, item, spider):
            """Insert one item into the ``hehe`` table and pass it on.

            Args:
                item: Mapping with ``name`` and ``kws`` entries, each a list of
                    extracted strings (possibly empty or missing).
                spider: The spider that produced the item (unused).

            Returns:
                The same *item*, so later pipelines can keep processing it.

            Raises:
                Exception: Whatever the database driver raises on a failed
                    insert or commit; the transaction is rolled back first.
            """
            # Pages without a <title> or keywords meta tag yield empty lists;
            # store an empty string instead of failing on ``[0]``.
            name = self._first(item.get('name'))
            kws = self._first(item.get('kws'))
            sql = "insert into hehe(title,kws) VALUES(%s,%s)"
            try:
                self.cursor.execute(sql, (name, kws))
                self.conn.commit()
            except Exception:
                # Leave the connection in a clean state for the next item.
                self.conn.rollback()
                raise
            return item

        def close_spider(self, spider):
            """Release the cursor and the connection when the spider finishes."""
            try:
                self.cursor.close()
            finally:
                self.conn.close()
    复制代码

    item(数据项定义,对应 Cwpjt/items.py)

    复制代码
    import scrapy
    
    
    class CwpjtItem(scrapy.Item):
        """Container for the data scraped from one page."""

        # Declares the ``name`` field of the item.
        name = scrapy.Field()

        # Declares the ``kws`` field of the item.
        kws = scrapy.Field()
    复制代码
  • 相关阅读:
    mysql 45讲 索引的使用 09-11
    mysql 45讲 相关锁的概念 06-08
    mysql 45讲 深入浅出索引04-05
    mysql 45讲 概览 01-03
    AQS源码解析第二回
    面试相关-怎么实现限流功能
    人工智能必备数学基础:线性代数基础(2)
    Elasticsearch问题总结和解决方法
    spring boot中打印所有日志
    Java中Stream流里面的findFirst()和findAny()区别
  • 原文地址:https://www.cnblogs.com/duanlinxiao/p/10827264.html
Copyright © 2020-2023  润新知