• 分析动态网页请求爬取腾讯视频评论


    # -*- coding: utf-8 -*-
    # 分析动态网页请求爬取腾讯视频评论
    import scrapy
    import re
    import json
    import time
    from tencent.items import TencentItem
    
    
    class TenspiderSpider(scrapy.Spider):
        """Scrape viewer comments for one Tencent Video cover page.

        The crawl is a three-step chain:

        1. ``start_requests`` extracts the cover id from ``start_urls[0]`` and
           requests ``comment_url + <cover id>``.
        2. ``parse_video`` reads ``comment_id`` from that response and requests
           the comment listing built from ``base_url``.
        3. ``parse_comment`` yields one ``TencentItem`` per comment in the
           listing. Only that single listing request is made; no further
           pages are followed.
        """

        name = "tenspider"
        # allowed_domains is left unset: follow-up requests go to
        # ncgi.video.qq.com and coral.qq.com rather than the start URL's host.
        start_urls = ['http://v.qq.com/x/cover/ga7nei8pd5i9mek.html/']
        # Endpoint used to look up the comment_id; the cover id is appended.
        comment_url = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid='
        # Template for the URL of the comment listing (JSON) for a comment_id.
        base_url = 'https://coral.qq.com/article/{comment_id}/comment?commentid=0&reqnum=1000'

        def start_requests(self):
            """Request the comment-id lookup for the cover in ``start_urls[0]``.

            Yields:
                A single ``scrapy.Request`` handled by ``parse_video``, or
                nothing when the start URL contains no ``cover/<id>.html``.
            """
            # The dot is escaped so only a literal ".html" suffix ends the id.
            match = re.search(r'cover/(.*?)\.html', self.start_urls[0])
            if match is None:
                self.logger.error(
                    'No cover id found in start URL %s', self.start_urls[0])
                return
            video_comment_url = self.comment_url + match.group(1)
            yield scrapy.Request(url=video_comment_url, callback=self.parse_video)

        def parse_video(self, response):
            """Extract ``comment_id`` and request the comment listing.

            The response body is expected to have the form ``<name>=<json>;``.
            Everything after the first ``=`` is parsed as JSON once a trailing
            ``;`` has been stripped, so a ``;`` inside a JSON string value does
            not truncate the payload.

            Yields:
                A single ``scrapy.Request`` handled by ``parse_comment``, or
                nothing when the payload or its ``comment_id`` is missing.

            Raises:
                ValueError: if the extracted payload is not valid JSON.
            """
            _, separator, payload = response.text.partition('=')
            payload = payload.strip().rstrip(';')
            if not separator or not payload:
                self.logger.error(
                    'Unexpected comment-id response from %s', response.url)
                return
            data = json.loads(payload)
            comment_id = data.get('comment_id')
            if not comment_id:
                # Without this guard the literal "None" would be formatted
                # into the listing URL and requested.
                self.logger.error('No comment_id in response from %s', response.url)
                return
            f_comment_url = self.base_url.format(comment_id=comment_id)
            yield scrapy.Request(url=f_comment_url, callback=self.parse_comment)

        def parse_comment(self, response):
            """Yield one ``TencentItem`` per comment in the JSON listing.

            Each item carries ``comment``, ``user``, ``date``, ``region`` and
            ``userid``. ``date`` is the comment's epoch timestamp rendered in
            the local timezone as ``YYYY-MM-DD HH:MM:SS``.

            Raises:
                ValueError: if the response body is not valid JSON.
                KeyError: if a comment entry lacks one of the expected fields.
            """
            data = json.loads(response.text)
            # Tolerate a missing/empty "data" or "commentid" instead of
            # failing on None.
            comments = (data.get('data') or {}).get('commentid') or []
            for each in comments:
                userinfo = each['userinfo']
                # int() additionally accepts a timestamp delivered as a
                # numeric string.
                time_local = time.localtime(int(each['time']))

                # A fresh item per comment: reusing one instance would let
                # later iterations overwrite items that downstream consumers
                # still hold a reference to.
                item = TencentItem()
                item['comment'] = each['content']
                item['user'] = userinfo['nick']
                # e.g. 2016-05-05 20:28:54
                item['date'] = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
                item['region'] = userinfo['region']
                item['userid'] = userinfo['userid']

                yield item
  • 相关阅读:
    扑克牌顺子
    多任务Multitask Learning
    智能指针
    左旋转字符串
    和为s的两个数字
    07.极简主义读后感
    06.极简主义——汇流(笔记)
    05.极简主义——奉献(笔记)
    04.极简主义——热情(笔记)
    03.极简主义——人际关系(笔记)
  • 原文地址:https://www.cnblogs.com/themost/p/7102518.html
Copyright © 2020-2023  润新知