分析动态网页请求爬取腾讯视频评论

# -*- coding: utf-8 -*-
# 分析动态网页请求爬取腾讯视频评论
import scrapy
import re
import json
import time
from tencent.items import TencentItem


class TenspiderSpider(scrapy.Spider):
    """Scrape the comments of one Tencent Video cover page.

    The comments are fetched in three steps:

    1. ``start_requests`` extracts the cover id from ``start_urls[0]`` and
       requests ``comment_url`` + cover id.
    2. ``parse_video`` reads ``comment_id`` from that response and requests
       ``base_url`` formatted with it.
    3. ``parse_comment`` turns every entry of the returned comment list into
       a ``TencentItem``.
    """

    name = "tenspider"
    # allowed_domains = ["v.qq.com/x/cover/ga7nei8pd5i9mek.html"]
    start_urls = ['http://v.qq.com/x/cover/ga7nei8pd5i9mek.html/']
    # Endpoint used to look up the comment_id of a video (cover id is appended).
    comment_url = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid='
    # Template for the URL that returns the comments as JSON.
    base_url = 'https://coral.qq.com/article/{comment_id}/comment?commentid=0&reqnum=1000'

    def start_requests(self):
        """Yield the request that resolves the video's comment_id.

        The cover id is the path segment between ``cover/`` and ``.html`` in
        ``start_urls[0]``. Nothing is yielded (and an error is logged) when
        the URL does not have that shape.
        """
        page_url = self.start_urls[0]
        # The dot is escaped so that only a literal ".html" ends the id.
        match = re.search(r'cover/(.*?)\.html', page_url)
        if match is None:
            self.logger.error("No cover id found in start URL %s", page_url)
            return
        video_comment_url = self.comment_url + match.group(1)
        yield scrapy.Request(url=video_comment_url, callback=self.parse_video)

    def parse_video(self, response):
        """Extract ``comment_id`` from the response and request the comments.

        The body is expected to look like ``<name>=<json object>;``. The
        object is taken greedily from the first ``{`` after an ``=`` to the
        last ``}``, so a ``;`` inside a JSON string does not truncate it.
        """
        match = re.search(r'=\s*(\{.*\})', response.text, re.DOTALL)
        if match is None:
            self.logger.error("No JSON payload found in %s", response.url)
            return
        try:
            data = json.loads(match.group(1))
        except ValueError:
            # json.JSONDecodeError is a subclass of ValueError.
            self.logger.exception("Invalid JSON payload in %s", response.url)
            return
        comment_id = data.get('comment_id')
        if not comment_id:
            # Formatting the URL with None would request ".../article/None/...".
            self.logger.error("No comment_id in response from %s", response.url)
            return
        f_comment_url = self.base_url.format(comment_id=comment_id)
        yield scrapy.Request(url=f_comment_url, callback=self.parse_comment)

    def parse_comment(self, response):
        """Yield one ``TencentItem`` per comment in the JSON response.

        Comments are read from ``data -> commentid``; a missing or empty
        list yields nothing. Each item carries the comment text, the author's
        nick, region and user id, and the comment time formatted as
        ``YYYY-MM-DD HH:MM:SS`` in the local time zone.
        """
        try:
            data = json.loads(response.text)
        except ValueError:
            self.logger.exception("Invalid comment JSON in %s", response.url)
            return
        comments = (data.get('data') or {}).get('commentid') or []
        for each in comments:
            userinfo = each.get('userinfo') or {}
            # Convert the Unix timestamp to local time, then to a readable
            # string such as "2016-05-05 20:28:54". int() also accepts a
            # numeric string in case the API serializes the time that way.
            time_local = time.localtime(int(each['time']))
            date = time.strftime("%Y-%m-%d %H:%M:%S", time_local)

            # A fresh item per comment: reusing one instance would make every
            # yielded item alias the same object, so later comments would
            # overwrite earlier ones still held by pipelines or exporters.
            item = TencentItem()
            item['comment'] = each['content']
            item['user'] = userinfo.get('nick')
            item['date'] = date
            item['region'] = userinfo.get('region')
            item['userid'] = userinfo.get('userid')

            yield item

相关阅读:
软件测试人员的年终绩效考核怎么应对
收藏
顶踩组件前后两版
订阅组件
hdu 1963 Investment 完全背包
hdu 4939 Stupid Tower Defense 动态规划
hdu 4405 Aeroplane chess 动态规划
cf 414B Mashmokh and ACM 动态规划
BUPT 202 Chocolate Machine 动态规划
hdu 3853 LOOPS 动态规划

原文地址：https://www.cnblogs.com/themost/p/7102518.html