• The Scrapy framework: a website product-scraping example, with MongoDB storage


    I. Create the project

    Step 1: scrapy startproject boyuan

    Step 2: cd boyuan

        scrapy genspider -t crawl product boyuan.com

    The generated project structure looks like this:
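    (The original post showed this as a screenshot; the layout below is the standard output of the two commands above.)

    boyuan/
        scrapy.cfg                # project deployment config
        boyuan/
            __init__.py
            items.py              # item definitions (section 1)
            middlewares.py
            pipelines.py          # item pipelines (section 3)
            settings.py           # project settings (section 4)
            spiders/
                __init__.py
                product.py        # the CrawlSpider (section 2)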

    II. Write the code

    1. items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class BoyuanItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        name = scrapy.Field()
        address = scrapy.Field()
        company = scrapy.Field()
        img = scrapy.Field()
        time = scrapy.Field()
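
    A scrapy.Item behaves like a dict restricted to its declared fields, which is why the pipeline later in this post can simply call dict(item). A quick illustration (runnable from the project root; the sample value is made up):

    from boyuan.items import BoyuanItem
    
    item = BoyuanItem()
    item["name"] = "example product"   # declared field: OK
    print(dict(item))                  # {'name': 'example product'}
    # item["price"] = 1                # undeclared field: raises KeyError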

    2. product.py (the spider)

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.spiders import Rule, CrawlSpider
    from scrapy.linkextractors import LinkExtractor
    from ..items import BoyuanItem
    
    
    class ProductSpider(CrawlSpider):
        name = 'product'
        allowed_domains = ['boyuan.com']
        offset = 1
        url = "http://www.boyuan.com/sell/?page={0}"
        start_urls = [url.format(str(offset))]
    
        # extract pagination links such as /sell/?page=2
        page_link = LinkExtractor(allow=(r"\?page=\d+",))
    
        rules = [
            Rule(page_link, callback="parse_content", follow=True)
        ]
    
        def parse_content(self, response):
            # each product in the listing is one <tr> row inside the list <div>
            for each in response.xpath("//div[@class='list']//tr"):
                item = BoyuanItem()
                # the fourth cell holds the text details; the second holds the image
                item['name'] = each.xpath("./td[4]//strong/text()").extract()[0]
                item['company'] = each.xpath("./td[4]//li[4]/a/text()").extract()[0]
                address = each.xpath("./td[4]//li[3]/text()").extract()[0]
                # the address text is wrapped in square brackets on the page
                item['address'] = str(address).strip("[").strip("]")
                time = each.xpath("./td[4]//li[3]/span/text()").extract()[0]
                item['time'] = str(time).strip()
                # the image URL sits in the non-standard "original" attribute,
                # typical of lazy-loaded images
                item['img'] = each.xpath("./td[2]//img/@original").extract()[0]
                yield item
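
    The allow pattern above is an ordinary regular expression searched against each candidate URL, so it can be sanity-checked outside Scrapy. A standalone sketch (the detail-page URL is made up for illustration):

    import re
    
    # the same pattern passed to LinkExtractor above
    pattern = re.compile(r"\?page=\d+")
    
    urls = [
        "http://www.boyuan.com/sell/?page=2",    # pagination link: matches
        "http://www.boyuan.com/sell/detail-1/",  # hypothetical detail page: no match
    ]
    for url in urls:
        print(url, "->", bool(pattern.search(url)))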

    3. pipelines.py (the item pipeline)

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import pymongo
    
    
    class BoyuanPipeline(object):
    
        @classmethod
        def from_crawler(cls, crawler):
            # read the MongoDB parameters from settings.py via the crawler
            settings = crawler.settings
            return cls(
                host=settings.get("MONGO_HOST"),
                port=settings.get("MONGO_PORT"),
                db_name=settings.get("MONGO_DB"),
                collection=settings.get("MONGO_COLLECTION"),
            )
    
        def __init__(self, host, port, db_name, collection):
            self.client = pymongo.MongoClient(host=host, port=int(port))
            # MongoDB creates the database and collection lazily on first insert
            self.col = self.client[db_name][collection]
    
        def process_item(self, item, spider):
            # save the item to MongoDB
            self.col.insert_one(dict(item))
            return item
    
        def close_spider(self, spider):
            self.client.close()

    4. settings.py (configuration)

    # MongoDB connection parameters
    MONGO_HOST = "localhost"
    MONGO_PORT = "27017"
    MONGO_DB = "boyuan"
    MONGO_COLLECTION = "product"
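
    As the header comment in pipelines.py notes, the pipeline also has to be enabled in settings.py, otherwise process_item is never called:

    ITEM_PIPELINES = {
        "boyuan.pipelines.BoyuanPipeline": 300,
    }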

    5. start.py (launcher)

    from scrapy import cmdline
    
    if __name__ == '__main__':
        cmdline.execute("scrapy crawl product".split())
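
    Run start.py from the project root (the directory containing scrapy.cfg); it is equivalent to typing scrapy crawl product on the command line, but makes it convenient to launch and debug the crawl from an IDE.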

    Scraped results (shown as a screenshot in the original post).
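
    To confirm the data actually landed in MongoDB, a quick standalone check with pymongo (a sketch, assuming the default connection settings above) could look like this:

    import pymongo
    
    # connect with the same parameters configured in settings.py
    client = pymongo.MongoClient(host="localhost", port=27017)
    col = client["boyuan"]["product"]
    
    print(col.count_documents({}))   # number of products saved
    for doc in col.find().limit(3):  # peek at a few documents
        print(doc["name"], doc["time"], doc["address"])
    
    client.close()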

  • Original article: https://www.cnblogs.com/yang-2018/p/10984814.html