# -*- coding: utf-8 -*-
"""Scrapy spider that crawls the Douban Movie Top 250 list."""
import scrapy

from douban.items import DoubanItem

# The credits line separates the director part from the cast part with three
# non-breaking spaces (U+00A0).
_CREDITS_SEPARATOR = '\xa0\xa0\xa0'


def _part(parts, index):
    """Return ``parts[index]`` stripped, or ``''`` when that index is missing."""
    return parts[index].strip() if index < len(parts) else ''


class DoubanspiderSpider(scrapy.Spider):
    """Spider that yields one ``DoubanItem`` per movie and follows pagination."""

    name = "doubanspider"
    # NOTE: allowed_domains is deliberately not set to
    # "movie.douban.com/top250"; with that restriction, requests for the
    # following pages could fall outside the allowed range.
    start_urls = ['http://movie.douban.com/top250']

    def parse(self, response):
        """Parse one listing page.

        Yields a ``DoubanItem`` for every movie entry on the page, followed by
        a ``scrapy.Request`` for the next page when a "next" link is present.
        Fields whose source text is missing are set to ``''`` (or ``None`` for
        values read with ``extract_first``) instead of aborting the page.
        """
        for entry in response.css('.article .grid_view li'):
            # A fresh item per movie: reusing one instance would make every
            # yielded item alias the same underlying object.
            item = DoubanItem()

            # Text nodes of the description paragraph: the first holds the
            # director/cast credits, the second "year / country / genre".
            text_lines = entry.css('.item .bd p::text').extract()
            credits = _part(text_lines, 0).split(_CREDITS_SEPARATOR)
            info = _part(text_lines, 1).split('/')

            # Movie title
            item['title'] = entry.css(
                '.item .hd .title:nth-child(1)::text').extract_first()
            # Director
            item['director'] = _part(credits, 0)
            # Cast (may be absent from the credits line)
            item['actor'] = _part(credits, 1)
            # Release year
            item['year'] = _part(info, 0)
            # Country
            item['country'] = _part(info, 1)
            # Genre
            item['type'] = _part(info, 2)
            # Rating
            item['rating_num'] = entry.css(
                '.item .bd .star .rating_num::text').extract_first()
            # Featured quote
            item['quote'] = entry.css(
                '.item .bd .quote span::text').extract_first()
            # Poster image URL
            item['image'] = entry.css(
                '.item .pic a img::attr(src)').extract_first()

            yield item

        # Build the request for the next page, if there is one.
        next_href = response.css('.paginator .next a::attr(href)').extract_first()
        if next_href:
            # Resolve the link against the URL of the current response rather
            # than concatenating onto a hard-coded base.
            url = response.urljoin(next_href)
            self.logger.debug('Following next page: %s', url)
            yield scrapy.Request(url=url, callback=self.parse)