No more preamble. Straight to the code.
Project layout
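The post only names the files, so here is a plausible sketch of the Scrapy project tree it assumes (the default project template plus a middlewares package; the module paths come from DOWNLOADER_MIDDLEWARES in settings.py):

douban/
├── scrapy.cfg
└── douban/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    ├── middlewares/
    │   ├── __init__.py
    │   ├── proxy.py
    │   └── useragent.py
    └── spiders/
        ├── __init__.py
        └── movie_spider.py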
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Field


class MoviesItem(scrapy.Item):
    # define the fields for your item here like:
    # name = Field()
    # movie name
    movieName = Field()
    # movie id
    movieId = Field()
    # poster URL
    img = Field()
    # movie detail page URL
    info_website = Field()
    # rating
    data_score = Field()
    # runtime
    data_duration = Field()
    # release date
    data_release = Field()
    # director
    data_director = Field()
    # leading actors
    data_actors = Field()
    # country/region of production
    data_region = Field()
    # screenwriter
    data_attrs = Field()
    # number of ratings
    data_number = Field()
    # synopsis
    introduction = Field()
    # genre
    movie_type = Field()
    # language
    movie_language = Field()
    # also known as
    also_called = Field()
    # ranking
    movie_ranking = Field()
    # short-comments page URL
    comment_website = Field()
    # record type marker
    mark = Field()


class MoviesCommentItem(scrapy.Item):
    # commenter's name
    netName = Field()
    # movie name
    movieName = Field()
    # movie id
    movieId = Field()
    # comment text
    content = Field()
    # comment time
    contentTime = Field()
    # star rating
    states = Field()
    # positive reviews
    praise_rate = Field()
    # neutral reviews
    general_rate = Field()
    # negative reviews
    negative_rate = Field()
    # commenter's avatar
    netImg = Field()
    # short-comments page URL
    comment_website = Field()
    # poster
    img = Field()
    # movie detail page URL
    info_website = Field()
    # movie page URL
    movie_website = Field()
    # record type marker
    mark = Field()
spiders
movie_spider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from ..items import MoviesItem, MoviesCommentItem
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import *
from scrapy import log


class MovieSpiderSpider(CrawlSpider):
    name = 'movie_spider'
    # allowed_domains = ['https://movie.douban.com/']
    # start_urls = ['https://movie.douban.com/top250']
    # start_urls = ['https://movie.douban.com/subject/1291828/']
    custom_settings = {
        "COOKIES_ENABLED": False,
        "DOWNLOAD_DELAY": 3,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            'Accept-Encoding': "gzip, deflate, sdch",
            'Accept-Language': "zh-CN,zh;q=0.8",
            'Cache-Control': "no-cache",
            'Connection': "keep-alive",
            'Host': "movie.douban.com",
            'Pragma': "no-cache",
            'Upgrade-Insecure-Requests': "1",
        }
    }

    rules = (
        Rule(LinkExtractor(allow=r'^https://movie.douban.com/subject/\d+/$'),
             callback='parse_item', follow=True),
    )

    def __init__(self, *args, **kwargs):
        super(MovieSpiderSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs-2.1.1-windows\bin\phantomjs.exe')

    def __del__(self):
        self.driver.close()

    def start_requests(self):
        # Top 250 list pages, 25 movies per page
        for i in range(0, 250, 25):
            url = 'https://movie.douban.com/top250?start={}&filter='.format(i)
            request = scrapy.Request(url)
            yield request

    def parse_item(self, response):
        response = response.replace(body=response.body.decode("utf-8").replace(' ', '').replace(' ', ''))
        info_url = response.url
        log.msg('url列表:{}'.format(info_url))
        item = self.selenium_js(info_url)
        url_list = info_url.split('/')
        for i in url_list:
            if i == '':
                url_list.remove(i)
        # movie id
        item['movieId'] = url_list[-1]
        info_website = 'http://localhost:8000/movie/{}/'.format(item['movieId'])
        # info_website = 'http://longlove.wang/movie/{}/'.format(item['movieId'])
        # movie detail page URL
        item['info_website'] = info_website
        movie_introduction = response.xpath('//*[@id="link-report"]/span[1]/text()').extract()
        introduction = ''
        for i in movie_introduction:
            introduction += i.strip().replace(' ', '').replace(' ', '')
        if not introduction:
            movie_introduction = response.xpath('//*[@id="link-report"]/span[1]/span/text()').extract()
            for i in movie_introduction:
                introduction += i.strip().replace(' ', '').replace(' ', '')
        # synopsis
        item['introduction'] = introduction
        # ranking
        item['movie_ranking'] = response.xpath('//*[@id="content"]/div[1]/span[1]/text()').extract()
        img_list = response.xpath('//*[@id="mainpic"]/a/img/@src').extract()
        # movie name
        item['movieName'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        for img in img_list:
            # poster
            item['img'] = img.replace('.webp', '.jpg')
        # rating
        data_score = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()
        for i in data_score:
            item['data_score'] = i.strip()
        douban_url = 'https://movie.douban.com/subject/{}/comments'.format(item['movieId'])
        comment_url = 'http://localhost:8000/movie/{}/comments'.format(item['movieId'])
        # comment_url = 'http://longlove.wang/movie/{}/comments'.format(item['movieId'])
        request = scrapy.Request(douban_url, callback=self.parse_comment)
        # short-comments page URL
        item['comment_website'] = comment_url
        item['mark'] = '1'
        # log.msg(item)
        request.meta['movieId'] = item['movieId']
        request.meta['movieName'] = item['movieName']
        request.meta['comment_website'] = item['comment_website']
        request.meta['img'] = item['img']
        request.meta['info_website'] = item['info_website']
        yield request
        yield item

    def parse_comment(self, response):
        response = response.replace(body=response.body.decode("utf-8").replace(' ', '').replace(' ', ''))
        item_comment = MoviesCommentItem()
        # positive reviews
        item_comment['praise_rate'] = response.xpath('//*[@id="content"]/div/div[1]/div[3]/label[2]/span[2]/text()').extract()
        # neutral reviews
        item_comment['general_rate'] = response.xpath('//*[@id="content"]/div/div[1]/div[3]/label[3]/span[2]/text()').extract()
        # negative reviews
        item_comment['negative_rate'] = response.xpath('//*[@id="content"]/div/div[1]/div[3]/label[4]/span[2]/text()').extract()
        info = response.xpath('//*[@id="comments"]')[0]
        # comment text  //*[@id="comments"]/div[1]/div[2]/p/span/text()
        # item_comment['content'] = info.xpath('./div/div[@class="comment"]/p/span/text()').extract()
        content_list = info.xpath('./div/div[@class="comment"]/p/span/text()').extract()
        content_format = []
        for content in content_list:
            content = content.strip().replace(' ', '').replace(' ', '')
            # print('content', content)
            if content != '':
                content_format.append(content)
        item_comment['content'] = content_format
        # commenter's name
        item_comment['netName'] = info.xpath('./div/div[2]/h3/span[2]/a/text()').extract()
        # comment time
        item_comment['contentTime'] = info.xpath('./div/div[2]/h3/span[2]/span[3]/text()').extract()
        # star rating
        item_comment['states'] = info.xpath('./div/div[2]/h3/span[2]/span[2]/@class').extract()
        # avatar
        item_comment['netImg'] = info.xpath('./div/div/a/img/@src').extract()
        # movie id
        item_comment['movieId'] = response.meta['movieId']
        # movie name
        item_comment['movieName'] = response.meta['movieName']
        # short-comments page URL
        item_comment['comment_website'] = response.meta['comment_website']
        # poster
        item_comment['img'] = response.meta['img']
        # movie detail page URL
        item_comment['info_website'] = response.meta['info_website']
        # record type marker
        item_comment['mark'] = '2'
        # log.msg('短评:{}'.format(item_comment))
        yield item_comment

    def selenium_js(self, info_url):
        item = MoviesItem()
        self.driver.get(info_url)
        self.driver.implicitly_wait(10)
        locator = ('xpath', '//div[@class="subject clearfix"]/div[2]')
        data = self.get_text(locator)
        data_list = data.split('\n')
        for d in data_list:
            if d != '':
                j = d.split(':', 1)
                if '导演' in j[0]:
                    item['data_director'] = j[1]
                elif '编剧' in j:
                    item['data_attrs'] = j[1]
                elif '主演' in j:
                    item['data_actors'] = j[1]
                elif '类型' in j:
                    item['movie_type'] = j[1]
                elif '制片国家/地区' in j:
                    item['data_region'] = j[1]
                elif '语言' in j:
                    item['movie_language'] = j[1]
                elif '上映日期' in j:
                    item['data_release'] = j[1]
                elif '片长' in j:
                    item['data_duration'] = j[1]
                elif '又名' in j:
                    item['also_called'] = j[1]
                else:
                    pass
        # number of ratings
        locator = ('xpath', '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span')
        data = self.get_text(locator)
        item['data_number'] = data
        return item

    def find_element(self, locator, timeout=5):
        """Locate an element with an explicit wait."""
        try:
            element = WebDriverWait(self.driver, timeout, 1).until(EC.presence_of_element_located(locator))
            return element
        except:
            return ""

    def get_text(self, locator):
        """Return the text of the located element, or '' if it was not found."""
        element = self.find_element(locator)
        if element:
            return element.text
        else:
            return ''
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import time
import redis
from scrapy.exceptions import DropItem
from scrapy import log


class DoubanPipeline(object):

    def __init__(self):
        self.check = 0

    def process_item(self, item, spider):
        if spider.name == 'movie_spider':
            # drop the item if the poster field is missing
            if not item.get('img'):
                raise DropItem('缺少字段:{}'.format('img'))
            # drop the item if the movie name field is missing
            if not item.get('movieName'):
                raise DropItem('缺少字段:{}'.format('movieName'))
            if item['mark'] == '1':
                if not item.get('also_called'):
                    item['also_called'] = '无'
                if not item.get('movie_type'):
                    item['movie_type'] = '无'
                if not item.get('data_duration'):
                    item['data_duration'] = '无'
                if item.get('movie_type').strip() == '纪录片':
                    item['data_actors'] = '无'
        return item


class RedisPipeline(object):

    def __init__(self):
        self.r = redis.Redis(host='localhost', port=6379)

    def process_item(self, item, spider):
        if spider.name == 'u6':
            self.r.wait(item['website'], timeout=10)
            self.r.sadd(spider.name, item['movieName'])
        return item


class MysqlPipeline(object):

    def __init__(self):
        """Initialize placeholders for the MySQL connection and cursor."""
        self.conn = None
        self.cur = None
        self.movies = False
        self.commentary = False
        self.states = []

    def open_spider(self, spider):
        """Open the MySQL connection."""
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='123456',
            db='douban',
            charset='utf8mb4',
        )
        # create the cursor object
        self.cur = self.conn.cursor()
        if spider.name == 'movie_spider':
            self.delete_data('movieId', 'movie_home')
            self.delete_data('movieId', 'movie_info')
            self.delete_data('movieId', 'movie_comment')

    def delete_data(self, field, table):
        """Recursively empty the tables before saving newly crawled data."""
        sql = 'select {} from {}'.format(field, table)
        self.cur.execute(sql)
        if self.cur.fetchone():
            sql = 'delete from {}'.format(table)
            self.cur.execute(sql)
            self.conn.commit()
            time.sleep(1)
            self.delete_data(field, table)
        else:
            log.msg('{}, 数据库初始化完成!'.format(table))

    def check_data(self, field, table):
        sql = 'select {} from {}'.format(field, table)
        self.cur.execute(sql)
        self.conn.commit()
        s = self.cur.fetchall()
        id_list = []
        # collect the ids that already exist in the table
        for i in range(len(s)):
            for j in s[i]:
                id_list.append(j)
        return set(id_list)

    def process_item(self, item, spider):
        if spider.name == 'movie_spider':
            if item['mark'] == '1':
                sql = ('insert into `movie_info`(`movieName`, `movieId`, `img`,`info_website`,`data_score`, '
                       '`data_duration`, `data_release`, `data_director`, `data_actors`, `data_region`, `data_attrs`, '
                       '`data_number`, `introduction`, `movie_type`, `movie_language`, `also_called`, `movie_ranking`, '
                       '`comment_website`) values '
                       '(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
                self.cur.execute(sql, (item['movieName'], item['movieId'], item['img'], item['info_website'],
                                       item['data_score'], item['data_duration'], item['data_release'],
                                       item['data_director'], item['data_actors'], item['data_region'],
                                       item['data_attrs'], item['data_number'], item['introduction'],
                                       item['movie_type'], item['movie_language'], item['also_called'],
                                       item['movie_ranking'], item['comment_website']))
                self.conn.commit()
                log.msg('movie_info {},保存成功!'.format(item['movieName']))
            if item['mark'] == '2':
                # convert item['states'] class names into readable star labels
                if item['states']:
                    self.states = self.format_states(item['states'])
                for i in range(len(item['contentTime'])):
                    sql = ('insert into `movie_comment`(`movieName`, `movieId`, `netName`,`states`,`content`, '
                           '`contentTime`, `comment_website`, `netImg`) values '
                           '(%s, %s, %s, %s, %s, %s, %s, %s)')
                    self.cur.execute(sql, (item['movieName'], item['movieId'], item['netName'][i], self.states[i],
                                           item['content'][i], item['contentTime'][i].strip(),
                                           item['comment_website'], item['netImg'][i]))
                id_list = self.check_data('movieId', 'movie_home')
                if int(item['movieId']) not in id_list:
                    sql = ('insert into `movie_home`(`movieName`, `movieId`, `praise_rate`,`general_rate`,'
                           '`negative_rate`, `comment_website`, `img`, `info_website`) values '
                           '(%s, %s, %s, %s, %s, %s, %s, %s)')
                    self.cur.execute(sql, (item['movieName'], item['movieId'], item['praise_rate'],
                                           item['general_rate'], item['negative_rate'], item['comment_website'],
                                           item['img'], item['info_website']))
                self.conn.commit()
                log.msg('movie_comment, movie_home {},保存成功!'.format(item['movieName']))
        return item

    def format_states(self, states_list):
        """Map star-rating class names to Chinese star labels."""
        for i in states_list:
            if '10' in i:
                i = '一星级'
            elif '15' in i:
                i = '一星半'
            elif '20' in i:
                i = '二星级'
            elif '25' in i:
                i = '二星半'
            elif '30' in i:
                i = '三星级'
            elif '35' in i:
                i = '三星半'
            elif '40' in i:
                i = '四星级'
            elif '45' in i:
                i = '四星半'
            elif '50' in i:
                i = '五星级'
            else:
                i = '三星半'
            self.states.append(i)
        return self.states
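MysqlPipeline assumes a database named douban with three tables (movie_info, movie_comment, movie_home) that already exist; the post does not show their schema. A minimal sketch matching the columns used by the INSERT statements above could look like this (column names follow the pipeline; types and lengths are assumptions):

-- minimal schema sketch; types/lengths are assumptions
CREATE TABLE movie_info (
    movieName VARCHAR(255), movieId VARCHAR(32), img VARCHAR(255),
    info_website VARCHAR(255), data_score VARCHAR(16), data_duration VARCHAR(64),
    data_release VARCHAR(255), data_director VARCHAR(255), data_actors TEXT,
    data_region VARCHAR(255), data_attrs VARCHAR(255), data_number VARCHAR(32),
    introduction TEXT, movie_type VARCHAR(255), movie_language VARCHAR(255),
    also_called VARCHAR(255), movie_ranking VARCHAR(16), comment_website VARCHAR(255)
) DEFAULT CHARSET=utf8mb4;

CREATE TABLE movie_comment (
    movieName VARCHAR(255), movieId VARCHAR(32), netName VARCHAR(255),
    states VARCHAR(32), content TEXT, contentTime VARCHAR(64),
    comment_website VARCHAR(255), netImg VARCHAR(255)
) DEFAULT CHARSET=utf8mb4;

CREATE TABLE movie_home (
    movieName VARCHAR(255), movieId VARCHAR(32), praise_rate VARCHAR(16),
    general_rate VARCHAR(16), negative_rate VARCHAR(16),
    comment_website VARCHAR(255), img VARCHAR(255), info_website VARCHAR(255)
) DEFAULT CHARSET=utf8mb4;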
middlewares
proxy.py
# !/usr/bin/env python
# coding=utf-8
import random
from urllib.request import _parse_proxy

import requests
from scrapy.exceptions import NotConfigured
from scrapy import log


def reform_url(url):
    # rebuild the url without the username/password part
    proxy_type, *_, hostport = _parse_proxy(url)
    return '{}://{}'.format(proxy_type, hostport)


class RandomProxyMiddleware:

    # maximum number of failures before a proxy is dropped from the pool
    max_failed = 3

    def __init__(self, settings):
        # load the proxy pool
        # self.proxies = settings.getlist('PROXIES')
        self.proxies = self.choice_proxies()

    def choice_proxies(self):
        self.proxies = []
        url = '返回 ip url'  # placeholder: the proxy-pool API url
        r = requests.get(url)
        # eval() evaluates the response text as a Python literal
        ip_dict = eval(r.text)
        if ip_dict['code'] == '0':
            for i in ip_dict['msg']:
                # build a usable proxy url
                ip = 'http://' + i['ip'] + ':' + i['port']
                self.proxies.append(ip)
            log.msg(self.proxies)
            if self.proxies:
                # initialize the failure counters to 0
                self.stats = {}.fromkeys(map(reform_url, self.proxies), 0)
            return self.proxies
        elif ip_dict['code'] == '3006':
            log.msg(ip_dict['msg'])
            return '-2'
        else:
            log.msg('代理ip接口返回状态码异常...{}'.format(ip_dict['code']))
            return '-1'

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # if request.meta has no proxy yet, pick one at random from the pool
        if 'proxy' not in request.meta:
            request.meta['proxy'] = random.choice(self.proxies)

    def process_response(self, request, response, spider):
        # the proxy used for this request
        cur_proxy = request.meta['proxy']
        # check whether the response indicates an error
        if response.status in [301, 302]:
            log.msg('访问被重定向:{}'.format(response))
            return response
        if response.status >= 400:
            # increment the failure count for this proxy
            self.stats[cur_proxy] += 1
            # drop the proxy once it exceeds the maximum failure count
            if self.stats[cur_proxy] >= self.max_failed:
                log.msg('{} 获得一个 {} 返回结果'.format(cur_proxy, response.status))
                # remove the proxy from the pool
                # if cur_proxy in self.proxies:
                #     self.proxies.remove(cur_proxy)
                for proxy in self.proxies:
                    if reform_url(proxy) == cur_proxy:
                        self.proxies.remove(proxy)
                        break
                log.msg('{} 超过最大失败次数,从代理列表删除'.format(cur_proxy))
            # retry the request with a freshly chosen proxy
            if not self.proxies:
                self.proxies = self.choice_proxies()
                log.msg('超过最大失败次数,代理池为空...再次请求api')
                # return
            request.meta['proxy'] = random.choice(self.proxies)
            return request
        return response

    def process_exception(self, request, exception, spider):
        cur_proxy = request.meta['proxy']
        # on timeouts or refused connections, drop the proxy
        if cur_proxy in self.proxies:
            self.proxies.remove(cur_proxy)
            log.msg('{} 代理ip出现错误,从代理列表删除'.format(cur_proxy))
        # retry the request with a freshly chosen proxy
        if not self.proxies:
            self.proxies = self.choice_proxies()
            log.msg('代理ip出现错误,代理池为空...再次请求api')
            # return
        request.meta['proxy'] = random.choice(self.proxies)
        return request
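The proxy-pool API address is left as a placeholder ('返回 ip url') in the code above. Judging from how choice_proxies() parses the response, the service is assumed to return a body roughly like the following Python literal (the concrete values here are illustrative only):

# assumed shape of the proxy API response, inferred from choice_proxies();
# the real service's format may differ
{
    'code': '0',                      # '0' = success; '3006' is treated as an error above
    'msg': [
        {'ip': '1.2.3.4', 'port': '8080'},
        {'ip': '5.6.7.8', 'port': '3128'},
    ],
}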
useragent.py
import faker
from scrapy import log


class RandomUserAgentMiddleware(object):
    """Assign a random User-Agent to every request."""

    def __init__(self, settings):
        self.faker = faker.Faker()

    @classmethod
    def from_crawler(cls, crawler):
        # create and return a middleware instance
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # set the User-Agent header on the outgoing request
        request.headers['User-Agent'] = self.faker.user_agent()

    def process_response(self, request, response, spider):
        # log.msg(request.headers['User-Agent'])
        return response
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import time, os

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, '
              'like Gecko) Chrome/49.0.2623.112 Safari/537.36')

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# whether to use proxies
HTTPPROXY_ENABLED = True

# download timeout
DOWNLOAD_TIMEOUT = 5

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 2

# increase the reactor thread pool size
REACTOR_THREADPOOL_MAXSIZE = 20

# enable logging
# LOG_ENABLED = True
# # log file encoding
# LOG_ENCODING = 'utf-8'
# # log file location
# today = time.strftime('%Y-%m-%d')
# LOG_FILE = "./log/{}.log".format(today)

# raise the log level
LOG_LEVEL = 'INFO'

# disable redirects
REDIRECT_ENABLED = False

DOWNLOADER_MIDDLEWARES_BASE = {
    'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
    'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
    'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
    # 'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
    'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 800,
    'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
    'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
}

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# disable retries
RETRY_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'Accept-Encoding': "gzip, deflate, sdch",
    'Accept-Language': "zh-CN,zh;q=0.8",
    'Cache-Control': "no-cache",
    'Connection': "keep-alive",
    'Host': "book.douban.com",
    'Pragma': "no-cache",
    'Upgrade-Insecure-Requests': "1",
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban.middlewares.DoubanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.useragent.RandomUserAgentMiddleware': 543,
    'douban.middlewares.proxy.RandomProxyMiddleware': 749,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'douban.pipelines.DoubanPipeline': 300,
#}
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
    # 'douban.pipelines.RedisPipeline': 301,
    'douban.pipelines.MysqlPipeline': 302,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Run: scrapy crawl movie_spider (matching the name defined in movie_spider.py)
Result: