• 京东进口牛奶的爬取


    # -*- coding: utf-8 -*-
    import scrapy
    import json
    import csv
    from milk.items import MilkItem
    
    class MilkspiderSpider(scrapy.Spider):
        """Crawl JD.com search results for imported milk (进口牛奶).

        Flow: search page -> per-product item -> product detail page ->
        comment-summary JSON endpoint. Each finished item is appended to
        ``data_list`` and the whole CSV is rewritten on every comment
        callback, so the file is always complete even mid-crawl.
        """
        name = 'milkspider'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://search.jd.com/Search?keyword=%E8%BF%9B%E5%8F%A3%E7%89%9B%E5%A5%B6&enc=utf-8&suggest=3.def.0.V09--12s0,20s0,38s0&wq=%E8%BF%9B%E5%8F%A3&pvid=96ab0296e9ce494fb251b716911d93ec']
        # Accumulates every completed item; shared across callbacks so the
        # CSV can be rewritten in full each time (see parse_comment).
        data_list = []

        def parse(self, response):
            """Parse one search-results page; yield a detail-page request per product."""
            li_list = response.xpath('//li[@class="gl-item"]')
            for li in li_list:
                good_id = li.xpath('./@data-sku').get()  # SKU id, searched relative to this <li>
                shop_name = li.xpath('.//a[@class="curr-shop"]/text()').get()
                # The title is split across several <em> text nodes; join them,
                # then strip the commas and newline+tab runs the markup introduces.
                # (Fix: the original had a string literal broken across two
                # physical lines — a syntax error — where "\n\t" was intended.)
                good_name = li.xpath('.//div[@class="p-name p-name-type-2"]/a/em/text()').getall()
                good_name = ','.join(good_name).strip().replace(",", "").replace("\n\t", "")
                good_url = li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href').get()
                if not good_url:
                    # No product link: nothing to follow for this entry.
                    continue
                if not good_url.startswith('https:'):
                    # hrefs on the search page are protocol-relative (//item.jd.com/...).
                    good_url = 'https:' + good_url
                good_price = li.xpath('.//div[@class="p-price"]/strong//text()').getall()
                good_price = ','.join(good_price).replace(",", "")

                # Comment counts are loaded dynamically and absent from this
                # page's HTML; they are fetched via the detail-page chain below.
                item = MilkItem()
                item["shop_name"] = shop_name
                item["good_name"] = good_name
                item["good_price"] = good_price
                item["good_id"] = good_id
                item['good_url'] = good_url
                yield scrapy.Request(url=good_url, meta={"item": item}, callback=self.parse_detail)

        def parse_detail(self, response):
            """From a product detail page, request the comment-summary JSON endpoint."""
            # Comments are loaded dynamically on the detail page, so we hit the
            # summary API directly, keyed by the product's SKU id.
            item = response.meta['item']
            comment_info_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + item['good_id']
            yield scrapy.Request(url=comment_info_url, meta={"item": item}, callback=self.parse_comment)

        def parse_comment(self, response):
            """Parse the comment-summary JSON, finish the item, rewrite the CSV."""
            item = response.meta['item']

            # response.body is bytes; the endpoint's payload is not clean UTF-8,
            # so decode with replacement and strip the U+FFFD markers before
            # parsing.  (Avoid shadowing the builtins `str` and `dict`.)
            raw_text = response.body.decode('utf-8', 'replace')
            json_str = raw_text.replace('\ufffd', '')
            payload = json.loads(json_str)

            counts = payload['CommentsCount'][0]
            item['total_comment'] = counts['CommentCountStr']
            item['good_comment'] = counts['GoodCountStr']
            item['video_count'] = counts['VideoCountStr']
            item['general_count'] = counts['GeneralCountStr']
            item['poor_count'] = counts['PoorCountStr']

            self.data_list.append(item)

            # Rewrite the whole CSV every time so the file stays complete even
            # if the crawl is interrupted.  NOTE(review): a pipeline or
            # closed() hook would avoid the repeated rewrites.
            with open('./京东进口牛奶.csv', 'w', encoding='utf-8', errors='ignore', newline="") as csvfile:
                fieldnames = ['good_id', 'good_name', 'shop_name', 'good_url', 'total_comment', 'good_comment',
                              'video_count', 'general_count', 'poor_count', 'good_price']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(self.data_list)

            return self.data_list

    items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class MilkItem(scrapy.Item):
        """Container for one JD.com milk product scraped by ``milkspider``.

        Fields are populated in two stages: the identity/price fields on the
        search-results page, the comment-count fields from the comment-summary
        JSON endpoint.
        """
        # define the fields for your item here like:
        # -- filled in MilkspiderSpider.parse (search-results page) --
        good_id = scrapy.Field()      # product SKU (data-sku attribute)
        good_name = scrapy.Field()    # product title, cleaned of separators
        shop_name = scrapy.Field()    # seller/shop name
        good_url = scrapy.Field()     # absolute product detail-page URL

        # -- filled in MilkspiderSpider.parse_comment (summary JSON) --
        total_comment = scrapy.Field()   # CommentCountStr
        good_comment = scrapy.Field()    # GoodCountStr
        video_count = scrapy.Field()     # VideoCountStr
        general_count = scrapy.Field()   # GeneralCountStr
        poor_count = scrapy.Field()      # PoorCountStr

        good_price = scrapy.Field()   # price text from the search-results page

    start.py (run script)

    from scrapy import cmdline

    if __name__ == '__main__':
        # Launch the spider programmatically; equivalent to running
        # `scrapy crawl milkspider` from the project root.  The __main__
        # guard keeps the crawl from starting if this module is imported.
        cmdline.execute(['scrapy', 'crawl', 'milkspider'])
  • 相关阅读:
    LoggingApplicationListener
    Repeated meta-data items
    善待Redis里的数据--Unable to validate object
    mysql启动的四种方式
    mybatis操作动态表+动态字段+存储过程
    VMware 11安装Mac OS X 10.10
    JMS开源比较
    VMware 11安装Mac OS X 10.10
    网页设计的标准尺寸
    FullPage.js
  • 原文地址:https://www.cnblogs.com/kenD/p/11123581.html
Copyright © 2020-2023  润新知