# -*- coding: utf-8 -*-
import csv
import os
import re
import time

import requests
from fake_useragent import UserAgent


class WeiBoSpider():
    def __init__(self, page):
        self.path = os.getcwd() + "/weibo.csv"
        self.csvfile = open(self.path, "a", newline="", encoding="utf-8-sig")
        self.writer = csv.writer(self.csvfile)
        # CSV header row
        self.writer.writerow(('话题链接', '话题内容', '楼主ID', '楼主昵称', '楼主性别',
                              '发布日期', '发布时间', '转发量', '评论量', '点赞量',
                              '评论者ID', '评论者昵称', '评论者性别', '评论日期',
                              '评论时间', '评论内容'))
        # The Cookie must come from your own logged-in m.weibo.cn session
        self.headers = {
            'Cookie': '_T_WM=22822641575; H5_wentry=H5; backURL=https%3A%2F%2Fm.weibo.cn%2F; ALF=1584226439; MLOGIN=1; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5RJaVYrb.BEuOvUQ8Ca2OO5JpX5K-hUgL.FoqESh-7eKzpShM2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMceoBfeh2EeKBN; SCF=AnRSOFp6QbWzfH1BqL4HB8my8eWNC5C33KhDq4Ko43RUIzs6rjJC49kIvz5_RcOJV2pVAQKvK2UbAd1Uh6j0pyo.; SUB=_2A25zQaQBDeRhGeBM71cR8SzNzzuIHXVQzcxJrDV6PUJbktAKLXD-kW1NRPYJXhsrLRnku_WvhsXi81eY0FM2oTtt; SUHB=0mxU9Kb_Ce6s6S; SSOLoginState=1581634641; WEIBOCN_FROM=1110106030; XSRF-TOKEN=dc7c27; M_WEIBOCN_PARAMS=oid%3D4471980021481431%26luicode%3D20000061%26lfid%3D4471980021481431%26uicode%3D20000061%26fid%3D4471980021481431',
            'Referer': 'https://m.weibo.cn/detail/4312409864846621',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.comments_ID = []
        self.page = page

    def get_title_id(self):
        """Collect the status ids of the trending topics into self.comments_ID."""
        for page in range(1, self.page):
            # Rotate only the User-Agent so the login Cookie is kept for later requests
            self.headers['User-Agent'] = UserAgent().chrome
            time.sleep(1)
            api_url = ('https://m.weibo.cn/api/feed/trendtop?'
                       'containerid=102803_ctg1_600059_-_ctg1_600059&page=' + str(page))
            rep = requests.get(url=api_url, headers=self.headers)
            # Extract every status id and append it to the list
            for status in rep.json()['data']['statuses']:
                comment_ID = status['id']
                self.comments_ID.append(comment_ID)

    def spider_title(self, id):
        """Crawl the detail page of one topic and write the topic row to the CSV."""
        try:
            title_url = 'https://m.weibo.cn/detail/' + str(id)
            html_text = requests.get(url=title_url, headers=self.headers).text
            # Topic text
            title = re.findall('.*?"text": "(.*?)",.*?', html_text)[0]
            # Strip the HTML tags embedded in the text
            text = re.sub('<(S*?)[^>]*>.*?|<.*? />', '', title)
            # Poster id
            user_id = re.findall('.*?"id": "(.*?)",.*?', html_text)[0]
            # Poster nickname
            user_nicname = re.findall('.*?"screen_name": "(.*?)",.*?', html_text)[0]
            # Poster gender
            user_gender = re.findall('.*?"gender": "(.*?)",.*?', html_text)[0]
            # created_at is a space-separated date string (e.g. "Sat Feb 15 10:23:11 +0800 2020")
            created_title_time = re.findall('.*?"created_at": "(.*?)",.*?', html_text)[0].split(" ")
            # Publication date
            if 'Mar' in created_title_time:
                title_created_YMD = "{}/{}/{}".format(created_title_time[-1], '03', created_title_time[2])
            elif 'Feb' in created_title_time:
                title_created_YMD = "{}/{}/{}".format(created_title_time[-1], '02', created_title_time[2])
            elif 'Jan' in created_title_time:
                title_created_YMD = "{}/{}/{}".format(created_title_time[-1], '01', created_title_time[2])
            else:
                # Months outside Jan-Mar: keep the raw string so the variable is always defined
                title_created_YMD = " ".join(created_title_time)
            # Publication time (HH:MM:SS)
            add_title_time = created_title_time[3]
            # Repost count
            reposts_count = re.findall('.*?"reposts_count": (.*?),.*?', html_text)[0]
            # Comment count
            comments_count = re.findall('.*?"comments_count": (.*?),.*?', html_text)[0]
            # Like count
            attitudes_count = re.findall('.*?"attitudes_count": (.*?),.*?', html_text)[0]
            # Each AJAX request returns 20 comments, so this is the number of pages to fetch
            comment_count = int(int(comments_count) / 20)
            position1 = (title_url, text, user_id, user_nicname, user_gender,
                         title_created_YMD, add_title_time, reposts_count,
                         comments_count, attitudes_count,
                         " ", " ", " ", " ", " ", " ")
            # Write the topic row
            print(title_url, text, user_id, user_nicname, user_gender, title_created_YMD,
                  add_title_time, reposts_count, comments_count, attitudes_count)
            self.writer.writerow(position1)
            return comment_count
        except Exception:
            pass

    def get_page(self, id, max_id, id_type):
        """Fetch one page of comments for a status via the hotflow API."""
        params = {
            'id': id,
            'mid': id,
            'max_id': max_id,
            'max_id_type': id_type
        }
        url = 'https://m.weibo.cn/comments/hotflow'
        try:
            r = requests.get(url, params=params, headers=self.headers)
            if r.status_code == 200:
                return r.json()
        except requests.ConnectionError as e:
            print('error', e.args)

    def parse_page(self, jsondata):
        """Pull the pagination cursor (max_id / max_id_type) out of a comments response."""
        if jsondata:
            items = jsondata.get('data')
            item_max_id = {}
            item_max_id['max_id'] = items['max_id']
            item_max_id['max_id_type'] = items['max_id_type']
            return item_max_id

    def write_csv(self, jsondata):
        """Write one page of comments to the CSV."""
        for comment in jsondata['data']['data']:
            # Commenter id
            user_id = comment['user']['id']
            # Commenter nickname
            user_name = comment['user']['screen_name']
            # Commenter gender: 'm' for male, 'f' for female
            user_gender = comment['user']['gender']
            # Comment text, with the HTML tags stripped by the regex
            comments_text = comment['text']
            comment_text = re.sub('<(S*?)[^>]*>.*?|<.*? />', '', comments_text)
            # Comment timestamp, split into tokens like the topic date above
            created_times = comment['created_at'].split(' ')
            if 'Feb' in created_times:
                created_YMD = "{}/{}/{}".format(created_times[-1], '02', created_times[2])
            elif 'Jan' in created_times:
                created_YMD = "{}/{}/{}".format(created_times[-1], '01', created_times[2])
            else:
                print('该时间不在疫情范围内,估计数据有误!')
                continue  # skip comments outside the expected date range
            created_time = created_times[3]  # comment time (HH:MM:SS)
            position2 = (" ", " ", " ", " ", " ", " ", " ", " ", " ", " ",
                         user_id, user_name, user_gender, created_YMD, created_time,
                         comment_text)
            # Write the comment row
            self.writer.writerow(position2)

    def main(self):
        self.get_title_id()
        count_title = len(self.comments_ID)
        for count, comment_ID in enumerate(self.comments_ID):
            print("正在爬取第%s个话题,一共找到%s个话题需要爬取" % (count + 1, count_title))
            # maxPage is the number of comment pages (20 comments per page)
            maxPage = self.spider_title(comment_ID)
            m_id = 0
            id_type = 0
            if maxPage:  # topics with fewer than 20 comments need no pagination loop
                try:
                    # The comment count bounds the loop
                    for page in range(0, maxPage):
                        # Fetch one page of comments
                        jsondata = self.get_page(comment_ID, m_id, id_type)
                        # Write it to the CSV
                        self.write_csv(jsondata)
                        # Read the cursor for the next page
                        results = self.parse_page(jsondata)
                        time.sleep(1)
                        m_id = results['max_id']
                        id_type = results['max_id_type']
                except Exception:
                    pass
            print("--------------------------分隔符---------------------------")
        self.csvfile.close()


if __name__ == '__main__':
    startTime = time.time()
    spider = WeiBoSpider(15)
    spider.main()
    endTime = time.time()
    useTime = (endTime - startTime) / 60
    print("该次所获的信息一共使用%s分钟" % useTime)
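
# Usage sketch (assumptions: the Cookie above is replaced with a valid, logged-in
# m.weibo.cn session of your own, and the trendtop / hotflow endpoints still behave
# as they did when this script was written): run this file directly with Python 3.
# WeiBoSpider(15) walks pages 1-14 of the trending feed, expands each status id into
# its detail page, then pages through /comments/hotflow using the max_id / max_id_type
# cursor returned by the previous response, appending everything to weibo.csv.
#
# The month if/elif chains above could also be replaced by datetime parsing; a
# minimal sketch, assuming created_at keeps the "Sat Feb 15 10:23:11 +0800 2020"
# shape that the index-based code above already expects:
#
#     from datetime import datetime
#     dt = datetime.strptime(created_at, "%a %b %d %H:%M:%S %z %Y")
#     created_YMD, created_time = dt.strftime("%Y/%m/%d"), dt.strftime("%H:%M:%S")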