• Crawling Dangdang product category pages with Scrapy


    # spider section
    import scrapy
    from Autopjt.items import AutopjtItem
    from scrapy.http import Request


    class AutospdSpider(scrapy.Spider):
        name = "autospd"
        allowed_domains = ["dangdang.com"]
        start_urls = ['http://category.dangdang.com/pg1-cid4007379.html']

        def parse(self, response):
            item = AutopjtItem()
            # Each XPath query returns a list with one entry per product on
            # the listing page: title, price, detail link and review count.
            item['name'] = response.xpath('//a[@name="itemlist-title"]/@title').extract()
            item['price'] = response.xpath('//span[@class="price_n"]/text()').extract()
            item['link'] = response.xpath('//a[@name="itemlist-title"]/@href').extract()
            item['comnum'] = response.xpath('//a[@name="itemlist-review"]/text()').extract()
            yield item
            # Queue all 100 pages of this category; Scrapy's built-in
            # duplicate filter drops URLs that were already scheduled.
            for i in range(1, 101):
                url = 'http://category.dangdang.com/pg' + str(i) + '-cid4007379.html'
                yield Request(url, callback=self.parse)
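    Because parse() re-yields all 100 page URLs on every callback, the crawl stays bounded only thanks to that duplicate filter. A minimal alternative sketch (hypothetical class and spider name, not the original code) schedules each listing page exactly once via start_requests():

    import scrapy
    from Autopjt.items import AutopjtItem

    # Hypothetical variant: generate the 100 listing-page requests up front,
    # so parse() no longer has to re-yield them on every response.
    class AutospdOnceSpider(scrapy.Spider):
        name = "autospd_once"          # hypothetical spider name
        allowed_domains = ["dangdang.com"]

        def start_requests(self):
            for i in range(1, 101):
                url = 'http://category.dangdang.com/pg%d-cid4007379.html' % i
                yield scrapy.Request(url, callback=self.parse)

        def parse(self, response):
            # Same field extraction as the original spider above.
            item = AutopjtItem()
            item['name'] = response.xpath('//a[@name="itemlist-title"]/@title').extract()
            item['price'] = response.xpath('//span[@class="price_n"]/text()').extract()
            item['link'] = response.xpath('//a[@name="itemlist-title"]/@href').extract()
            item['comnum'] = response.xpath('//a[@name="itemlist-review"]/text()').extract()
            yield item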

    pipeline section

    import codecs
    import json
    
    class AutopjtPipeline(object):
        def __init__(self):
            # codecs.open wraps the file so unicode text is written as UTF-8.
            self.file = codecs.open('D:/mydata.json', 'wb', encoding='utf-8')

        def process_item(self, item, spider):
            # Each field holds a parallel list with one entry per product,
            # so walk the lists by index and emit one JSON object per line.
            for j in range(len(item['name'])):
                goods = {'name': item['name'][j],
                         'price': item['price'][j],
                         'comnum': item['comnum'][j],
                         'link': item['link'][j]}
                line = json.dumps(goods, ensure_ascii=False) + '\n'
                self.file.write(line)
            return item

        def close_spider(self, spider):
            self.file.close()
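    One caveat about the index-based loop above: it assumes the four lists always have the same length. If a product lacks one of the fields (for instance, no review anchor in the page markup, which is an assumption about Dangdang's HTML rather than something verified here), the lists misalign or the loop raises IndexError. A hedged sketch using zip(), which simply stops at the shortest list:

    import json

    # Hypothetical helper (not part of the original pipeline): serialize one
    # parsed item to JSON lines, pairing the four parallel lists with zip()
    # so a shorter list cannot raise IndexError; unpaired entries are dropped.
    def item_to_json_lines(item):
        for name, price, comnum, link in zip(item['name'], item['price'],
                                             item['comnum'], item['link']):
            yield json.dumps({'name': name, 'price': price,
                              'comnum': comnum, 'link': link},
                             ensure_ascii=False) + '\n'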

    item section

    import scrapy
    
    
    class AutopjtItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        name = scrapy.Field()
        price = scrapy.Field()
        link = scrapy.Field()
        comnum = scrapy.Field()
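    For AutopjtPipeline to receive items, it must also be enabled in the project's settings.py. A minimal sketch, assuming the default Scrapy layout where the pipeline class lives in Autopjt/pipelines.py:

    # settings.py (sketch; the dotted path assumes the default project layout)
    ITEM_PIPELINES = {
        'Autopjt.pipelines.AutopjtPipeline': 300,  # 300 is the run-order priority
    }

    The crawl is then started from the project directory with scrapy crawl autospd.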
  • Related reading:
    Chapter 4: Introduction to Network Layer Protocols
    Chapter 3: Switch Fundamentals and Configuration
    Comprehensive Network Service Testing
    Two-Way Key-Pair Authentication
    Split DNS Resolution and Building an Intelligent DNS Server
    The NFS Shared Service
    Deploying a YUM Repository Service
    Efficient Large-Scale Network OS Installation with PXE
    A Comprehensive DNS Lab
    Building DNS Master and Slave Servers
  • Original article: https://www.cnblogs.com/Erick-L/p/6835391.html