• Some practice exercises with Scrapy


    The practice code is as follows:

    Main module (the spider):

    # _*_ coding:utf-8 _*_
    
    import scrapy
    from ScrapyTest.items import ScrapytestItem
    
    class testSpider(scrapy.Spider):
        # spider name
        name = "itcast"
        # domains the spider is allowed to crawl (domain only, no URL scheme)
        # allowed_domains = ["hs-bianma.com"]
        # URL to start crawling from
        start_urls = (
            "http://www.hs-bianma.com/search.php?ser=%E5%8F%A3%E7%BA%A2&ai=1",
        )
    
        def parse(self, response):
            # first grab every row of the lipstick result list
            rows = response.xpath("/html/body/div[2]/div")
            # relative to each row, extract the detail fields with XPath,
            # strip the leftover HTML tags with replace(),
            # and yield each populated item to the item pipeline
            for row in rows:
                m = row.xpath('.//div[1]/text()').extract_first()
                n = row.xpath('.//div[2]').extract_first()
                b = row.xpath(".//div[3]/a[1]/@href").extract_first()
                c = row.xpath(".//div[3]/a[2]/@href").extract_first()
                a1 = (n.replace('</div>', '')
                       .replace('<div>', '')
                       .replace('<span class="red">', '')
                       .replace('</span>', '')
                       .replace('<b>商品描述</b>', ''))
                item = ScrapytestItem()
                item['hscode'] = m
                item['商品描述'] = a1
                item['归类实例'] = b
                item['详情'] = c
                yield item
    
            # if a "next page" link exists, hand it to myParse;
            # response.urljoin() resolves the relative href against
            # http://www.hs-bianma.com, so no manual prefixing is needed
            next_page = response.xpath("//*[@id='main']/div/div[3]/a[1]/@href").extract_first()
            if next_page is not None:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.myParse)
    
        def myParse(self, response):
            # rows on the follow-up result page
            second_items = response.xpath(".//*[@id='main']/div")
            for second_item in second_items:
                second_hscode = second_item.xpath(".//div[1]").extract_first()
                # strip the wrapper markup around the hscode cell
                second_hscode = (second_hscode.replace('<span class="red">', '')
                                              .replace('</span>', '')
                                              .replace('<div>', '')
                                              .replace('</div>', '')
                                              .replace('<b>hscode</b>', ''))
                second_description = second_item.xpath(".//div[2]/text()").extract_first()
                second_details = second_item.xpath(".//div[3]/a/@href").extract_first()
                item = ScrapytestItem()
                item['second_hscode'] = second_hscode
                item['二级商业描述'] = second_description
                item['二级详情'] = second_details
                yield item
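
    The chained replace() calls above only strip the exact tags they list. A more robust alternative (a sketch, not part of the original post) is remove_tags from w3lib, which ships as a Scrapy dependency and strips all markup in one call; the <b>商品描述</b> label text still has to be removed by hand, since remove_tags keeps the inner text of every tag:

    # sketch: tag stripping with w3lib instead of chained replace() calls
    from w3lib.html import remove_tags
    
    raw = '<div><b>商品描述</b>some description <span class="red">match</span></div>'
    clean = remove_tags(raw).replace('商品描述', '')  # drop all markup, then the label text
    print(clean)  # -> 'some description match'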

    settings.py:

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for ScrapyTest project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'ScrapyTest'
    
    SPIDER_MODULES = ['ScrapyTest.spiders']
    NEWSPIDER_MODULE = 'ScrapyTest.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # masquerade as a Firefox browser
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'
    
    # export encoding set to gbk; utf-8 did not work for this feed
    FEED_EXPORT_ENCODING = 'gbk'
    # write the scraped items to a csv file
    FEED_URI = 'file:///F:/douban.csv'
    FEED_FORMAT = 'csv'
    
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
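
    Note: on Scrapy 2.1 and later the FEED_URI / FEED_FORMAT pair is deprecated in favor of the FEEDS dict. An equivalent sketch for newer versions, assuming the same path and encoding as above:

    # sketch for Scrapy >= 2.1: the FEEDS setting replaces FEED_URI / FEED_FORMAT
    FEEDS = {
        'file:///F:/douban.csv': {
            'format': 'csv',
            'encoding': 'gbk',
        },
    }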

    items.py:

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class ScrapytestItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
    
        hscode = scrapy.Field()
        商品描述 = scrapy.Field()
        归类实例 = scrapy.Field()
        详情 = scrapy.Field()
    
        second_hscode = scrapy.Field()
        二级商业描述 = scrapy.Field()
        二级详情 = scrapy.Field()

    start.py:
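
    The original post leaves start.py empty. A minimal runner sketch (assuming the spider name "itcast" defined above) that launches the crawl from an IDE instead of the command line:

    # start.py - sketch of a minimal runner, equivalent to `scrapy crawl itcast`
    from scrapy import cmdline
    
    cmdline.execute("scrapy crawl itcast".split())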

    pipelines.py:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    class ScrapytestPipeline(object):
        def process_item(self, item, spider):
            return item
    
    class FilePipeLine(object):
        def __init__(self):
            # write items to a tab-separated txt file
            self.file = open('f:/hjd', 'wb')
    
        def process_item(self, item, spider):
            # one %s per exported field, joined by tabs
            line = "%s\t%s\t%s\t%s\n" % (item['hscode'],
                                         item['商品描述'],
                                         item['归类实例'],
                                         item['详情'])
            self.file.write(line.encode("utf-8"))
            return item
    
        def close_spider(self, spider):
            # close the file handle when the spider finishes
            self.file.close()
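
    As the boilerplate comment above notes, a pipeline only runs once it is registered in the ITEM_PIPELINES setting. A sketch of the settings.py entry (the numbers are run-order priorities; lower runs first):

    # settings.py - register the pipelines so Scrapy actually calls them
    ITEM_PIPELINES = {
        'ScrapyTest.pipelines.ScrapytestPipeline': 300,
        'ScrapyTest.pipelines.FilePipeLine': 400,
    }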

  • Related reading:
    Understanding RESTful architecture
    Javascript closures
    Javascript immediately-invoked function expressions
    Port conflicts across multiple Jboss instances; configuring multiple server ports on one Jboss
    MyEclipse deployment to Jboss throws java.lang.OutOfMemoryError: PermGen space
    css notes
    The difference between css floats and absolute positioning
    Building a first web project: pdf reports with jasperReports + ireport
    Hibernate retrieval methods
    A brief look at JavaScript scope
  • Original post: https://www.cnblogs.com/-hjd/p/9670992.html