• A script for crawling Sina Weibo comment data


    # -*- coding:utf-8 -*-
    import requests
    import json
    import os
    import time
    import random
    from multiprocessing.dummy import Pool as ThreadPool
    from bs4 import BeautifulSoup

    class CommentCrawl(object):
        '''
        Crawls comment data from Sina Weibo.
        '''
        # Fill these in from a logged-in browser session before running.
        headers = {
            'User-Agent': '',
            'Cookie': ''}
        ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

        def __init__(self, url, file_name):
            self.url = url
            self.file_name = file_name
            self.all_comment = []  # comments accumulated across all pages

        def base62_decode(self, string, alphabet=ALPHABET):
            # Interpret `string` as a big-endian base-62 number.
            base = len(alphabet)
            num = 0
            for char in string:
                num = num * base + alphabet.index(char)
            return num
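
        # Under the alphabet above, for example, base62_decode('10') == 62
        # and base62_decode('E') == 40 ('E' sits at index 40: ten digits,
        # then 26 lowercase letters, then 'A'..'E').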

        def parser_url(self):
            # A status URL path segment such as 'EFdPHe50Z' encodes the
            # numeric mid: split it into a leading character plus two
            # 4-character chunks (this assumes a 9-character code, as in the
            # URL below), base62-decode each chunk, and concatenate the
            # decimal values.
            code = self.url.split('?')[0].split('/')[-1]
            code1 = code[0]
            code2 = code[1:5]
            code3 = code[5:]
            id1 = str(self.base62_decode(code1))
            # Trailing chunks must be zero-padded to 7 digits; without the
            # padding, a chunk that decodes to a short number corrupts the id.
            id2 = str(self.base62_decode(code2)).zfill(7)
            id3 = str(self.base62_decode(code3)).zfill(7)
            plus = id1 + id2 + id3
            comment_url = ('http://weibo.com/aj/v6/comment/big?ajwvr=6&id='
                           + plus + '&root_comment_max_id_type=0&page={}')
            return comment_url
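
        # Worked example for the URL used below: 'EFdPHe50Z' splits into
        # 'E' -> 40, 'FdPH' -> 9824625 and 'e50Z' -> 3355873, giving the
        # comment-API id 4098246253355873.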

        def get_url_page(self):
            # Page 1 of the comment endpoint reports the total number of pages.
            r = requests.get(self.parser_url().format(1), headers=self.headers)
            data = json.loads(r.text)
            total_page = data['data']['page']['totalpage']
            return total_page

        def all_urls(self):
            return [self.parser_url().format(i + 1) for i in range(self.get_url_page())]

        def comment_parser(self, html):
            soup = BeautifulSoup(html, 'html.parser')
            data = soup.select('.WB_text')
            # Each .WB_text node reads "username：comment"; keep the part
            # after the first full-width colon.
            comment = [i.text.split('：', 1)[-1] for i in data]
            return comment
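
        # E.g. comment_parser('<div class="WB_text">user：不错的手机</div>')
        # returns ['不错的手机'].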

        def final_text(self, url):
            # Fetch one page of comments and return it as newline-joined text.
            r = requests.get(url, headers=self.headers)
            time.sleep(random.randint(1, 5))  # throttle so Weibo doesn't block us
            data = json.loads(r.text)
            html = data['data']['html']
            page_comments = self.comment_parser(html)
            self.all_comment += page_comments
            print(len(self.all_comment))  # running total, as a progress indicator
            return '\n'.join(page_comments)

        def save_file(self, url):
            # Append this page's comments to <file_name>.txt in the working dir.
            file = os.path.join(os.getcwd(), self.file_name + '.txt')
            with open(file, 'a+', encoding='utf-8') as f:
                f.write(self.final_text(url) + '\n')

    if __name__ == "__main__":
        crawler = CommentCrawl('http://weibo.com/2202387347/EFdPHe50Z?from=page_1006062202387347_profile&wvr=6&mod=weibotime', '小米6发布会')
        all_link = crawler.all_urls()
        pool = ThreadPool(4)  # 4 worker threads share the page URLs
        pool.map(crawler.save_file, all_link)
        pool.close()
        pool.join()
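
As a quick offline sanity check of the id decoding, a minimal sketch that needs no network access or cookies; the expected id follows from the base-62 arithmetic above rather than from the Weibo API, and the 'demo' file name is just a placeholder:

    c = CommentCrawl('http://weibo.com/2202387347/EFdPHe50Z', 'demo')
    print(c.parser_url())
    # http://weibo.com/aj/v6/comment/big?ajwvr=6&id=4098246253355873&root_comment_max_id_type=0&page={}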
• Original article: https://www.cnblogs.com/Erick-L/p/6733080.html