• scrapy


    # -*- coding: utf-8 -*-
    __author__ = 'Administrator'
    import scrapy

    class QuoteSpider(scrapy.Spider):
        name = 'poxiao'
        start_urls = ['https://www.poxiao.com/type/movie/']

        def parse(self, response):  # parse() is the default callback for every downloaded page
            quotes = response.xpath('//li/h3')  # one <h3> node per movie entry
            for quote in quotes:
                yield {
                    'name': quote.xpath('./a/text()').extract_first(),
                    # despite the key name, this field holds the movie's detail-page URL
                    'author': 'https://www.poxiao.com' + quote.xpath('./a/@href').extract_first()
                }
            # follow the "next page" link once per listing page
            next_page = response.xpath('//div[@class="list-pager"]/a[last()-1]/@href').extract_first()
            if next_page:
                yield response.follow(next_page, self.parse)

    Crawl a page's link addresses with Scrapy

    scrapy runspider ***.py                    run this spider

    scrapy runspider ***.py -o aa.json         save the scraped items as a JSON file

    scrapy runspider ***.py -o aa.csv -t csv   save the scraped items as CSV (opens in Excel)
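
    As an alternative to the -o flag, newer Scrapy releases (2.1+) let a spider declare its own export feed through the FEEDS setting; a minimal sketch (the aa.json filename is just an example):

    # sketch: the equivalent of "-o aa.json", declared inside the spider (Scrapy >= 2.1)
    import scrapy

    class QuoteSpider(scrapy.Spider):
        name = 'poxiao'
        start_urls = ['https://www.poxiao.com/type/movie/']
        custom_settings = {
            'FEEDS': {'aa.json': {'format': 'json', 'encoding': 'utf8'}},
        }

        def parse(self, response):
            for quote in response.xpath('//li/h3'):
                yield {'name': quote.xpath('./a/text()').extract_first()}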

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class MovieSpider(scrapy.Spider):
        name = 'movie'
        allowed_domains = ['poxiao.com']
        start_urls = ['https://www.poxiao.com/type/movie/index_2.html',
                      'https://www.poxiao.com/type/movie/index_3.html'
                      ]
    
        def parse(self, response):
            filename = response.url.split('/')[-1].split('.')[-2]   # e.g. "index_2"
            with open(filename, 'wb') as f:
                f.write(response.body)

    Crawl and save the raw HTML source of each page
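
    A small variation (a sketch, not part of the original post) that keeps the .html extension so the saved pages open directly in a browser; the spider name here is made up:

    # sketch: save each response under its original .html file name
    from urllib.parse import urlparse

    import scrapy

    class MovieSourceSpider(scrapy.Spider):   # hypothetical name
        name = 'movie_source'
        allowed_domains = ['poxiao.com']
        start_urls = ['https://www.poxiao.com/type/movie/index_2.html',
                      'https://www.poxiao.com/type/movie/index_3.html']

        def parse(self, response):
            filename = urlparse(response.url).path.split('/')[-1] or 'index.html'
            with open(filename, 'wb') as f:
                f.write(response.body)   # raw bytes of the page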

    # -*- coding: utf-8 -*-
    import scrapy
    from meiju.items import MeijuItem
    
    class Mj100Spider(scrapy.Spider):
        name = 'mj100'
        allowed_domains = ['meijutt.com']
        start_urls = ['https://www.meijutt.com/new100.html']
    
        def parse(self, response):
            movies=response.xpath('//h5/a')
            for each_movie in movies:
                item=MeijuItem()
                item['name']=each_movie.xpath('./text()').extract_first()
                yield item
    # pipelines.py
    class MeijuPipeline(object):
        def process_item(self, item, spider):
            with open('my_meiju.txt', 'a') as fp:
                fp.write(item['name'] + '\n')
            return item

    # items.py
    import scrapy

    class MeijuItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        name = scrapy.Field()

    Meiju Top-100 example. Note: you also need to un-comment the ITEM_PIPELINES block in the project's settings.py (the entry with priority 300), otherwise the pipeline never runs; see the sketch below.
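
    For reference, the block that ships commented out in the generated settings.py looks roughly like this (the project name meiju is taken from the import above):

    # settings.py -- enable the pipeline; 300 is its priority (lower numbers run earlier, range 0-1000)
    ITEM_PIPELINES = {
        'meiju.pipelines.MeijuPipeline': 300,
    }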

    # -*- coding: utf-8 -*-
    import scrapy
    from poxiao.items import PoxiaoItem
    
    
    class NameSpider(scrapy.Spider):
        name = 'name'
        allowed_domains = ['poxiao.com']
        start_urls = ['https://www.poxiao.com/type/movie/']
    
        def parse(self, response):
            movie = response.xpath('//div[@class="gkpic"]//img')    # poster <img> nodes
            for i in movie:
                item = PoxiaoItem()
                item['src'] = i.xpath('./@src').extract_first()     # poster image URL
                item['name'] = i.xpath('./@alt').extract_first()    # movie title
                yield item
            # follow the "next page" link once per listing page, not once per item
            next_page = response.xpath('//div[@class="list-pager"]/a[last()-1]/@href').extract_first()
            if next_page:
                yield response.follow("https://www.poxiao.com" + next_page, self.parse)

    The first small spider (scrapes the movie poster images)
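
    The project's items.py is not shown in the post; judging from the 'src' and 'name' fields used above, it presumably looks like this:

    # items.py (sketch, inferred from the fields the spider fills in)
    import scrapy

    class PoxiaoItem(scrapy.Item):
        src = scrapy.Field()    # poster image URL
        name = scrapy.Field()   # movie title

    The pipeline below then downloads each poster and writes it to disk: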

    import os

    import requests

    class PoxiaoPipeline(object):
        def process_item(self, item, spider):
            # save the poster as <movie name>.jpg inside the project directory
            filename = os.path.join(r"d:\untitled1\poxiao", item['name'] + '.jpg')
            with open(filename, 'wb') as f:
                f.write(requests.get(item['src']).content)
            return item
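
    Calling requests synchronously inside a pipeline blocks Scrapy's downloader; Scrapy's built-in ImagesPipeline can fetch the files through the normal scheduler instead. A rough sketch (assumes Pillow is installed; the class name and store path are examples, not from the original post):

    # sketch: a subclass of Scrapy's ImagesPipeline that reads the existing 'src' field
    import scrapy
    from scrapy.pipelines.images import ImagesPipeline

    class PoxiaoImagesPipeline(ImagesPipeline):   # hypothetical name
        def get_media_requests(self, item, info):
            yield scrapy.Request(item['src'])

    # settings.py -- enable it and point IMAGES_STORE at a directory (Pillow required)
    # ITEM_PIPELINES = {'poxiao.pipelines.PoxiaoImagesPipeline': 1}
    # IMAGES_STORE = r'd:\untitled1\poxiao'
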
  • Original article: https://www.cnblogs.com/xupanfeng/p/11765545.html