• scrapy


    # -*- coding: utf-8 -*-
    # the encoding declaration must be on the first (or second) line of the file
    __author__ = 'Administrator'
    import scrapy
    class QuoteSpider(scrapy.Spider):
        name = 'poxiao'
        start_urls = ['https://www.poxiao.com/type/movie/']

        def parse(self, response):  # the default callback for every start URL
            quotes = response.xpath('//li/h3')  # one <h3> node per movie entry
            for quote in quotes:
                yield {
                    'name': quote.xpath('./a/text()').extract_first(),
                    # absolute URL of the detail page
                    'link': 'https://www.poxiao.com' + quote.xpath('./a/@href').extract_first()
                }
            # pagination sits outside the loop so it fires once per page
            next_page = response.xpath('//div[@class="list-pager"]/a[last()-1]/@href').extract_first()
            if next_page:
                yield response.follow(next_page, self.parse)

    Using Scrapy to scrape the link address of each entry on a page

    scrapy runspider ***.py                      run this spider script

    scrapy runspider ***.py -o aa.json           save the output as a JSON file

    scrapy runspider ***.py -o aa.csv -t csv     save as CSV, which Excel can open
    (newer Scrapy versions infer the format from the file extension, so -t is optional)
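    Instead of passing -o every time, exports can also be configured once in the
    project's settings.py via the FEEDS setting (available since Scrapy 2.1); a
    minimal sketch, with illustrative filenames rather than anything from the
    original post:

    # settings.py -- roughly equivalent to "-o aa.json" / "-o aa.csv"
    FEEDS = {
        'aa.json': {'format': 'json', 'encoding': 'utf8'},
        'aa.csv': {'format': 'csv'},
    }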

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class MovieSpider(scrapy.Spider):
        name = 'movie'
        allowed_domains = ['poxiao.com']
        start_urls = ['https://www.poxiao.com/type/movie/index_2.html',
                      'https://www.poxiao.com/type/movie/index_3.html'
                      ]
    
        def parse(self, response):
            # e.g. 'index_2.html' -> 'index_2'; add the extension back so the
            # saved copy opens in a browser
            filename = response.url.split('/')[-1].split('.')[-2]
            with open(filename + '.html', 'wb') as f:
                f.write(response.body)

    Scraping and saving the raw HTML source of each page
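    One caveat with the filename logic above: it raises an IndexError when a URL
    ends in "/". A slightly more defensive variant (my sketch, not from the
    original post):

    import os
    from urllib.parse import urlparse

    def url_to_filename(url):
        # 'https://www.poxiao.com/type/movie/index_2.html' -> 'index_2.html'
        name = os.path.basename(urlparse(url).path)
        return name or 'index.html'  # fall back when the URL ends in '/'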

    # -*- coding: utf-8 -*-
    import scrapy
    from meiju.items import MeijuItem
    
    class Mj100Spider(scrapy.Spider):
        name = 'mj100'
        allowed_domains = ['meijutt.com']
        start_urls = ['https://www.meijutt.com/new100.html']
    
        def parse(self, response):
            movies = response.xpath('//h5/a')  # one link per show title
            for each_movie in movies:
                item = MeijuItem()
                item['name'] = each_movie.xpath('./text()').extract_first()
                yield item

    # pipelines.py
    class MeijuPipeline(object):
        def process_item(self, item, spider):
            with open('my_meiju.txt', 'a') as fp:
                fp.write(item['name'] + '\n')
            return item  # pass the item along to any later pipeline

    # items.py
    class MeijuItem(scrapy.Item):
        # define the fields for your item here
        name = scrapy.Field()

    The "top 100 US TV shows" (meijutt) example. Note: you also have to uncomment
    the ITEM_PIPELINES entry in settings.py, the one that maps the pipeline to
    priority 300, or the pipeline never runs; see the snippet below.
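    For reference, the entry that note refers to looks like this in the
    settings.py that scrapy startproject generates (300 is the template's
    default priority; lower numbers run earlier):

    # settings.py -- uncomment this block so Scrapy actually runs the pipeline
    ITEM_PIPELINES = {
        'meiju.pipelines.MeijuPipeline': 300,
    }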

    # -*- coding: utf-8 -*-
    import scrapy
    from poxiao.items import PoxiaoItem
    
    
    class NameSpider(scrapy.Spider):
        name = 'name'
        allowed_domains = ['poxiao.com']
        start_urls = ['https://www.poxiao.com/type/movie/']
    
        def parse(self, response):
    
            movie = response.xpath('//div[@class="gkpic"]//img')
            for i in movie:
                item = PoxiaoItem()
                item['src'] = i.xpath('./@src').extract_first()
                item['name'] = i.xpath('./@alt').extract_first()
                yield item
            # pagination sits outside the loop so it fires once per page;
            # response.follow would also resolve a relative href on its own
            next_page = response.xpath('//div[@class="list-pager"]/a[last()-1]/@href').extract_first()
            if next_page:
                yield response.follow('https://www.poxiao.com' + next_page, self.parse)

    The first little spider
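    The project's items.py is not shown in the original post; a minimal sketch
    with the two fields this spider fills in:

    # items.py -- reconstructed from the spider above, not shown in the post
    import scrapy

    class PoxiaoItem(scrapy.Item):
        src = scrapy.Field()   # image URL taken from the @src attribute
        name = scrapy.Field()  # movie title taken from the @alt attribute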

    # pipelines.py
    import os
    import requests

    class PoxiaoPipeline(object):
        def process_item(self, item, spider):
            # assumed path: the post shows "d:untitled1poxiao", with the
            # backslashes apparently stripped when it was published
            filename = os.path.join(r"d:\untitled1\poxiao", item['name'] + '.jpg')
            with open(filename, 'wb') as f:
                f.write(requests.get(item['src']).content)
            return item
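
    Calling requests.get() inside process_item blocks Scrapy's otherwise
    asynchronous downloader. The built-in ImagesPipeline is the idiomatic
    alternative; a sketch of the configuration (it requires Pillow, and it
    expects the item to carry an image_urls field instead of src):

    # settings.py -- built-in alternative to the hand-rolled pipeline above
    ITEM_PIPELINES = {
        'scrapy.pipelines.images.ImagesPipeline': 1,
    }
    IMAGES_STORE = r'd:\untitled1\poxiao'  # assumed path, see the note above

    # items.py would then declare:
    #     image_urls = scrapy.Field()  # ImagesPipeline downloads these URLs
    #     images = scrapy.Field()      # and stores the download results here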