# -*- coding: utf-8 -*-
# Scrape Tencent Video comments by replaying the requests the dynamic page makes.
import json
import re
import time

import scrapy

from tencent.items import TencentItem


class TenspiderSpider(scrapy.Spider):
    """Collect the comments attached to Tencent Video cover pages.

    The crawl is a three-step chain:

    1. ``start_requests`` extracts the cover id from each start URL and asks
       ``comment_url`` for the matching ``comment_id``.
    2. ``parse_video`` reads that ``comment_id`` and requests the comment
       listing built from ``base_url``.
    3. ``parse_comment`` turns every listed comment into a ``TencentItem``.
    """

    name = "tenspider"
    # allowed_domains = ["v.qq.com/x/cover/ga7nei8pd5i9mek.html"]
    start_urls = ['http://v.qq.com/x/cover/ga7nei8pd5i9mek.html/']
    # Endpoint used to obtain the comment_id; the cover id is appended to it.
    comment_url = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid='
    # Template for the URL of the comment listing of a given comment_id.
    base_url = 'https://coral.qq.com/article/{comment_id}/comment?commentid=0&reqnum=1000'

    def start_requests(self):
        """Yield one comment-id lookup request per start URL.

        The cover id is the path segment between ``cover/`` and ``.html``.
        Start URLs that do not contain such a segment are skipped with a
        warning instead of aborting the crawl.
        """
        for page_url in self.start_urls:
            # The dot is escaped so that only a literal ".html" ends the id.
            match = re.search(r'cover/(.*?)\.html', page_url)
            if match is None:
                self.logger.warning("No cover id found in start URL %s", page_url)
                continue
            video_comment_url = self.comment_url + match.group(1)
            yield scrapy.Request(url=video_comment_url, callback=self.parse_video)

    def parse_video(self, response):
        """Read the ``comment_id`` from the lookup response and request the comments.

        The response is expected to wrap a JSON object in an assignment
        (``<name>=<json>;``) -- NOTE(review): inferred from how the payload
        was extracted before; confirm against a live response.  The object
        is taken from the first ``{`` to the last ``}``, so semicolons or
        line breaks inside the JSON do not truncate it.

        Nothing is yielded (and an error is logged) when the payload cannot
        be located, is not valid JSON, or has no ``comment_id``.
        """
        body = response.text
        start = body.find('{')
        end = body.rfind('}')
        if start == -1 or end < start:
            self.logger.error("No JSON object found in response from %s", response.url)
            return

        try:
            data = json.loads(body[start:end + 1])
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            self.logger.error("Invalid JSON in response from %s", response.url)
            return

        comment_id = data.get('comment_id')
        if not comment_id:
            # Without this guard the literal text "None" would end up in the URL.
            self.logger.error("No comment_id in response from %s", response.url)
            return

        f_comment_url = self.base_url.format(comment_id=comment_id)
        yield scrapy.Request(url=f_comment_url, callback=self.parse_comment)

    def parse_comment(self, response):
        """Yield one ``TencentItem`` per comment found in the listing response.

        Comments are read from ``data.commentid`` of the JSON body.  Each
        item carries ``comment``, ``user``, ``date``, ``region`` and
        ``userid``; ``date`` is the comment timestamp rendered in the local
        time zone of the machine running the spider, formatted as
        ``YYYY-MM-DD HH:MM:SS``.

        A comment entry lacking one of the required fields is skipped with a
        warning so that the remaining comments are still emitted.
        """
        try:
            data = json.loads(response.text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            self.logger.error("Invalid JSON in comment response from %s", response.url)
            return

        comments = (data.get('data') or {}).get('commentid') or []
        for each in comments:
            try:
                userinfo = each['userinfo']
                comment = each['content']
                # Convert the epoch timestamp to local time, then to text
                # such as "2016-05-05 20:28:54".
                time_local = time.localtime(int(each['time']))
                date = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
                user = userinfo['nick']
                region = userinfo['region']
                userid = userinfo['userid']
            except (KeyError, TypeError, ValueError):
                self.logger.warning("Skipping malformed comment entry from %s", response.url)
                continue

            # A fresh item per comment: reusing a single instance would make
            # every yielded item alias the same, repeatedly overwritten, object.
            item = TencentItem()
            item['comment'] = comment
            item['user'] = user
            item['date'] = date
            item['region'] = region
            item['userid'] = userid
            yield item