__author__ = 'Administrator'
# -*- encoding:utf-8 -*-
import scrapy


class QuoteSpider(scrapy.Spider):
    name = 'poxiao'
    start_urls = ['https://www.poxiao.com/type/movie/']

    def parse(self, response):  # the default callback name Scrapy expects
        quotes = response.xpath('//li/h3')  # the content nodes we want
        for quote in quotes:
            yield {
                'name': quote.xpath('./a/text()').extract_first(),
                # despite the key, 'author' holds the absolute detail-page link
                'author': 'https://www.poxiao.com' + quote.xpath('./a/@href').extract_first()
            }
        next_page = response.xpath('//div[@class="list-pager"]/a[last()-1]/@href').extract_first()
        if next_page:
            yield response.follow(next_page, self.parse)
Using Scrapy to crawl the link addresses on a page
scrapy runspider ***.py                    run the spider
scrapy runspider ***.py -o aa.json         save the scraped items as a JSON file
scrapy runspider ***.py -o aa.csv -t csv   save as a CSV file (not a real Excel file, but Excel opens it); -t csv is redundant here, since the .csv extension already sets the feed format
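Each dict the spider yields becomes one record in the output feed; a record in aa.json would look roughly like this (the values are illustrative, not real scraped data):

{"name": "Some Movie Title", "author": "https://www.poxiao.com/some-movie/"}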
# -*- coding: utf-8 -*-
import scrapy


class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['poxiao.com']
    start_urls = [
        'https://www.poxiao.com/type/movie/index_2.html',
        'https://www.poxiao.com/type/movie/index_3.html',
    ]

    def parse(self, response):
        # e.g. '.../index_2.html' -> 'index_2'
        filename = response.url.split('/')[-1].split('.')[-2]
        with open(filename, 'wb') as f:
            f.write(response.body)
Crawling raw HTML source files
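Since this spider declares allowed_domains and a name, it presumably lives inside a generated Scrapy project, so it can also be run by name from the project directory (the runspider form shown above still works on the standalone file):

scrapy crawl movie

Each start URL is then written to a local file named after it, e.g. index_2 and index_3.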
# -*- coding: utf-8 -*-
import scrapy

from meiju.items import MeijuItem


class Mj100Spider(scrapy.Spider):
    name = 'mj100'
    allowed_domains = ['meijutt.com']
    start_urls = ['https://www.meijutt.com/new100.html']

    def parse(self, response):
        movies = response.xpath('//h5/a')  # one <a> per show title
        for each_movie in movies:
            item = MeijuItem()
            item['name'] = each_movie.xpath('./text()').extract_first()
            yield item  # hand the item to the pipeline below
class MeijuPipeline(object):
    def process_item(self, item, spider):
        with open('my_meiju.txt', 'a') as fp:
            fp.write(item['name'] + '\n')
        return item  # process_item should return the item (or raise DropItem)
import scrapy


class MeijuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
A crawl-the-top-100-US-TV-shows (美剧100) example. Note: you also have to uncomment the ITEM_PIPELINES block in settings.py (the one with the 300 priority), otherwise the pipeline never runs.
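For reference, this is the block Scrapy generates (commented out) in settings.py; the dotted path matches the meiju project used in the import above:

ITEM_PIPELINES = {
    'meiju.pipelines.MeijuPipeline': 300,
}

The number is the pipeline's ordering (lower runs first); 300 is just the template default.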
# -*- coding: utf-8 -*-
import scrapy

from poxiao.items import PoxiaoItem


class NameSpider(scrapy.Spider):
    name = 'name'
    allowed_domains = ['poxiao.com']
    start_urls = ['https://www.poxiao.com/type/movie/']

    def parse(self, response):
        movie = response.xpath('//div[@class="gkpic"]//img')
        for i in movie:
            item = PoxiaoItem()
            item['src'] = i.xpath('./@src').extract_first()   # poster image URL
            item['name'] = i.xpath('./@alt').extract_first()  # movie title
            yield item
        next_page = response.xpath('//div[@class="list-pager"]/a[last()-1]/@href').extract_first()
        if next_page:
            # response.follow can resolve a relative href by itself;
            # the domain prefix is kept from the original code
            yield response.follow('https://www.poxiao.com' + next_page, self.parse)
The first little spider
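The items.py for this project is not shown above; judging from the two fields the spider fills, it would look like this (a sketch, not the original file):

import scrapy


class PoxiaoItem(scrapy.Item):
    src = scrapy.Field()   # image URL
    name = scrapy.Field()  # movie title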
import os

import requests


class PoxiaoPipeline(object):
    def process_item(self, item, spider):
        # download folder on the author's machine; adjust to yours
        filename = os.path.join(r"d:\untitled1\poxiao", item['name'] + '.jpg')
        with open(filename, 'wb') as f:
            f.write(requests.get(item['src']).content)
        return item
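Scrapy also ships a built-in ImagesPipeline that handles downloading, retries, and deduplication by itself; a minimal sketch of the settings side, assuming the item exposes the standard image_urls (a list of URLs) and images fields that this pipeline expects, and that Pillow is installed:

# settings.py
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = r'd:\untitled1\poxiao'  # where downloaded images land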