• 分析动态网页请求爬取腾讯视频评论


    # -*- coding: utf-8 -*-
    # 分析动态网页请求爬取腾讯视频评论
    import scrapy
    import re
    import json
    import time
    from tencent.items import TencentItem
    
    
    class TenspiderSpider(scrapy.Spider):
        """Scrape the comments of one Tencent Video page via its JSON endpoints.

        The crawl runs in three steps:

        1. ``start_requests`` extracts the cover id from ``start_urls[0]`` and
           requests the comment-id lookup endpoint (``comment_url``).
        2. ``parse_video`` reads ``comment_id`` from that response and requests
           the comment listing (``base_url``).
        3. ``parse_comment`` yields one ``TencentItem`` per comment entry.
        """

        name = "tenspider"
        # No allowed_domains: the follow-up requests go to hosts other than
        # the one in start_urls (ncgi.video.qq.com and coral.qq.com).
        start_urls = ['http://v.qq.com/x/cover/ga7nei8pd5i9mek.html/']
        # Endpoint used to look up the comment_id; the cover id is appended.
        comment_url = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid='
        # Template for the URL of the JSON comment listing.
        base_url = 'https://coral.qq.com/article/{comment_id}/comment?commentid=0&reqnum=1000'

        def start_requests(self):
            """Yield the request that resolves the video's comment id.

            The cover id is the path segment between ``cover/`` and ``.html``
            in the first start URL. Nothing is yielded (and an error is
            logged) when the URL does not have that shape.
            """
            # The dot is escaped so that only a literal ".html" ends the id.
            match = re.search(r'cover/(.*?)\.html', self.start_urls[0])
            if match is None:
                self.logger.error(
                    "No cover id found in start URL: %s", self.start_urls[0])
                return
            video_comment_url = self.comment_url + match.group(1)
            yield scrapy.Request(url=video_comment_url, callback=self.parse_video)

        def parse_video(self, response):
            """Extract ``comment_id`` and yield the comment-listing request.

            The response body is expected to wrap a JSON object in an
            assignment terminated by a semicolon (``<name>=<json>;``).
            Nothing is yielded when the body does not have that shape, is not
            valid JSON, or carries no ``comment_id``.
            """
            # Greedy match from the first "=" to the LAST ";" so a semicolon
            # inside the JSON payload does not truncate it.
            match = re.search(r'=(.*);', response.text, re.S)
            if match is None:
                self.logger.error(
                    "Unexpected comment-id response from %s", response.url)
                return
            try:
                data = json.loads(match.group(1))
            except ValueError:
                self.logger.error(
                    "Comment-id response from %s is not valid JSON", response.url)
                return
            comment_id = data.get('comment_id')
            if not comment_id:
                # Avoid requesting ".../article/None/comment".
                self.logger.error("No comment_id in response from %s", response.url)
                return
            f_comment_url = self.base_url.format(comment_id=comment_id)
            yield scrapy.Request(url=f_comment_url, callback=self.parse_comment)

        def parse_comment(self, response):
            """Yield one ``TencentItem`` per comment in the JSON listing.

            Comments are read from ``data.commentid`` in the response JSON; a
            missing or null ``data``/``commentid`` yields nothing. Each item
            carries the comment text, the author's nick, region and user id,
            and the comment time formatted as ``YYYY-mm-dd HH:MM:SS`` in the
            local timezone of the machine running the spider.
            """
            try:
                data = json.loads(response.text)
            except ValueError:
                self.logger.error(
                    "Comment listing from %s is not valid JSON", response.url)
                return
            comments = (data.get('data') or {}).get('commentid') or []
            for each in comments:
                userinfo = each['userinfo']
                # A fresh item per comment: reusing one instance would make
                # every yielded item alias the same (last-written) data.
                item = TencentItem()
                item['comment'] = each['content']
                item['user'] = userinfo['nick']
                # Convert the epoch timestamp to a readable local time,
                # e.g. 2016-05-05 20:28:54.
                item['date'] = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(each['time']))
                item['region'] = userinfo['region']
                item['userid'] = userinfo['userid']
                yield item
  • 相关阅读:
    软件测试人员的年终绩效考核怎么应对
    收藏
    顶踩组件 前后两版
    订阅组件
    hdu 1963 Investment 完全背包
    hdu 4939 Stupid Tower Defense 动态规划
    hdu 4405 Aeroplane chess 动态规划
    cf 414B Mashmokh and ACM 动态规划
    BUPT 202 Chocolate Machine 动态规划
    hdu 3853 LOOPS 动态规划
  • 原文地址:https://www.cnblogs.com/themost/p/7102518.html
Copyright © 2020-2023  润新知