• Python-爬取微博信息


    # -*- coding: utf-8 -*-
    import requests, re
    import time
    import os
    import csv
    import sys
    import importlib
    from fake_useragent import UserAgent
    
    # Re-executes the already-imported ``sys`` module. Nothing in the visible
    # code reads ``sys`` afterwards.
    # NOTE(review): this looks like a leftover of the Python 2
    # ``reload(sys)`` / ``sys.setdefaultencoding(...)`` idiom -- confirm
    # whether it can be removed.
    importlib.reload(sys)
    class WeiBoSpider:
        """Scrape trending Weibo statuses and their hot comments into ``weibo.csv``.

        Typical use is ``WeiBoSpider(page).main()``: status ids are collected
        from the trending feed, one CSV row is written per status, and further
        rows are written for every comment returned by the ``hotflow`` endpoint.

        Attributes:
            path: Location of the output file (``weibo.csv`` in the current
                working directory).
            csvfile: The open output file (append mode); closed by ``main``.
            writer: ``csv.writer`` bound to ``csvfile``.
            headers: HTTP headers sent with every request.
            comments_ID: Status ids collected by ``get_title_id``.
            page: Exclusive upper bound of the trending-feed page numbers.
        """

        # Column titles of the output file. Status rows fill the first ten
        # columns and blank the last six; comment rows do the opposite.
        CSV_HEADER = ('话题链接', '话题内容', '楼主ID', '楼主昵称', '楼主性别', '发布日期',
                      '发布时间', '转发量', '评论量', '点赞量', '评论者ID', '评论者昵称',
                      '评论者性别', '评论日期', '评论时间', '评论内容')
        # The original author's note says each comment request loads 20 items.
        COMMENTS_PER_PAGE = 20
        # Seconds to wait for any single HTTP request before giving up.
        REQUEST_TIMEOUT = 10
        # English month abbreviation (as it appears in ``created_at``) -> MM.
        _MONTHS = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
                   'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
                   'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
        # Pattern kept verbatim from the original code. The first alternative
        # already matches any ``<...>`` span, so each such tag is removed.
        _TAG_RE = re.compile('<(S*?)[^>]*>.*?|<.*? />')

        def __init__(self, page):
            """Open the output CSV and prepare the request state.

            Args:
                page: Exclusive upper bound of trending-feed pages; pages
                    ``1 .. page - 1`` are requested by ``get_title_id``.
            """
            self.path = os.path.join(os.getcwd(), "weibo.csv")
            # The file is opened in append mode, so only write the header when
            # the file is new or empty; otherwise every rerun would add another
            # header row in the middle of the data.
            needs_header = (not os.path.exists(self.path)
                            or os.path.getsize(self.path) == 0)
            self.csvfile = open(self.path, "a", newline="", encoding="utf-8-sig")
            self.writer = csv.writer(self.csvfile)
            if needs_header:
                self.writer.writerow(self.CSV_HEADER)
            # NOTE(security): a session cookie is hard-coded here; it should be
            # supplied from configuration rather than committed with the code.
            self.headers = {
                'Cookie': '_T_WM=22822641575; H5_wentry=H5; backURL=https%3A%2F%2Fm.weibo.cn%2F; ALF=1584226439; MLOGIN=1; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5RJaVYrb.BEuOvUQ8Ca2OO5JpX5K-hUgL.FoqESh-7eKzpShM2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMceoBfeh2EeKBN; SCF=AnRSOFp6QbWzfH1BqL4HB8my8eWNC5C33KhDq4Ko43RUIzs6rjJC49kIvz5_RcOJV2pVAQKvK2UbAd1Uh6j0pyo.; SUB=_2A25zQaQBDeRhGeBM71cR8SzNzzuIHXVQzcxJrDV6PUJbktAKLXD-kW1NRPYJXhsrLRnku_WvhsXi81eY0FM2oTtt; SUHB=0mxU9Kb_Ce6s6S; SSOLoginState=1581634641; WEIBOCN_FROM=1110106030; XSRF-TOKEN=dc7c27; M_WEIBOCN_PARAMS=oid%3D4471980021481431%26luicode%3D20000061%26lfid%3D4471980021481431%26uicode%3D20000061%26fid%3D4471980021481431',
                'Referer': 'https://m.weibo.cn/detail/4312409864846621',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest',
            }
            self.comments_ID = []
            self.page = page

        @classmethod
        def _split_created_at(cls, created_at):
            """Split a ``created_at`` string into ``(YYYY/MM/DD, HH:MM:SS)``.

            The value is split on single spaces; the month abbreviation is
            expected at index 1, the day at index 2, the time at index 3 and
            the year last (e.g. ``'Fri Feb 14 10:30:00 +0800 2020'``).

            Raises:
                ValueError: If the value does not have that layout.
            """
            try:
                parts = created_at.split(' ')
                month = cls._MONTHS[parts[1]]
                return '{}/{}/{}'.format(parts[-1], month, parts[2]), parts[3]
            except (AttributeError, KeyError, IndexError):
                raise ValueError(
                    'unrecognised created_at value: %r' % (created_at,)) from None

        @classmethod
        def _parse_title_page(cls, html_text):
            """Extract the status fields from a detail page.

            Args:
                html_text: Body of ``https://m.weibo.cn/detail/<id>``.

            Returns:
                ``(fields, comment_pages)`` where ``fields`` is the 9-tuple
                (text, user id, nickname, gender, date, time, reposts,
                comments, likes) and ``comment_pages`` is the comment count
                floor-divided by ``COMMENTS_PER_PAGE``.

            Raises:
                ValueError: If a field is missing, the date is malformed, or
                    the comment count is not an integer.
            """
            def field(key, quoted=True):
                # First occurrence of `"key": "value",` (or an unquoted value).
                pattern = '"%s": "(.*?)",' if quoted else '"%s": (.*?),'
                found = re.search(pattern % key, html_text)
                if found is None:
                    raise ValueError('field %r missing from detail page' % key)
                return found.group(1)

            text = cls._TAG_RE.sub('', field('text'))
            # NOTE(review): this is the first quoted "id" on the page; it may
            # be the status id rather than the author's id -- confirm against
            # a live page.
            user_id = field('id')
            user_nickname = field('screen_name')
            user_gender = field('gender')
            created_ymd, created_time = cls._split_created_at(field('created_at'))
            reposts_count = field('reposts_count', quoted=False)
            comments_count = field('comments_count', quoted=False)
            attitudes_count = field('attitudes_count', quoted=False)
            comment_pages = int(comments_count) // cls.COMMENTS_PER_PAGE
            fields = (text, user_id, user_nickname, user_gender, created_ymd,
                      created_time, reposts_count, comments_count, attitudes_count)
            return fields, comment_pages

        def get_title_id(self):
            """Collect status ids from the trending feed into ``comments_ID``.

            Requests pages ``1 .. self.page - 1`` one second apart. A page that
            cannot be fetched or decoded is reported and skipped.
            """
            for page in range(1, self.page):
                # Rotate only the User-Agent. Replacing the whole dict would
                # drop the Cookie/Referer that __init__ configured for every
                # later request.
                self.headers['User-Agent'] = UserAgent().chrome
                time.sleep(1)
                api_url = 'https://m.weibo.cn/api/feed/trendtop?containerid=102803_ctg1_600059_-_ctg1_600059&page=' + str(page)
                try:
                    rep = requests.get(url=api_url, headers=self.headers,
                                       timeout=self.REQUEST_TIMEOUT)
                    ids = [status['id'] for status in rep.json()['data']['statuses']]
                except (requests.RequestException, ValueError, KeyError, TypeError) as e:
                    print('error', e.args)
                    continue
                self.comments_ID.extend(ids)

        def spider_title(self, id):
            """Fetch one status detail page and write its row to the CSV.

            Args:
                id: Status id appended to ``https://m.weibo.cn/detail/``.

            Returns:
                The number of full comment pages (comment count // 20), or
                ``None`` when the page could not be fetched, parsed or written.
            """
            title_url = 'https://m.weibo.cn/detail/' + str(id)
            try:
                html_text = requests.get(url=title_url, headers=self.headers,
                                         timeout=self.REQUEST_TIMEOUT).text
                fields, comment_pages = self._parse_title_page(html_text)
                row = (title_url,) + fields
                print(*row)
                # The six comment columns stay blank on a status row.
                self.writer.writerow(row + (" ",) * 6)
            except (requests.RequestException, ValueError, csv.Error, OSError) as e:
                print('error', e.args)
                return None
            return comment_pages

        def get_page(self, id, max_id, id_type):
            """Request one page of hot comments for a status.

            Args:
                id: Status id, sent as both ``id`` and ``mid``.
                max_id: Paging cursor; ``main`` passes 0 for the first page.
                id_type: Value sent as ``max_id_type``.

            Returns:
                The decoded JSON body on HTTP 200, otherwise ``None`` (also
                when the request fails or the body is not JSON).
            """
            params = {
                'id': id,
                'mid': id,
                'max_id': max_id,
                'max_id_type': id_type,
            }
            url = 'https://m.weibo.cn/comments/hotflow'
            try:
                r = requests.get(url, params=params, headers=self.headers,
                                 timeout=self.REQUEST_TIMEOUT)
                if r.status_code == 200:
                    return r.json()
            except (requests.RequestException, ValueError) as e:
                print('error', e.args)
            return None

        def parse_page(self, jsondata):
            """Return the paging cursor of a comment response.

            Returns:
                ``{'max_id': ..., 'max_id_type': ...}`` taken from
                ``jsondata['data']``, or ``None`` when the response is empty
                or lacks those keys.
            """
            items = jsondata.get('data') if isinstance(jsondata, dict) else None
            if not isinstance(items, dict):
                return None
            try:
                return {'max_id': items['max_id'],
                        'max_id_type': items['max_id_type']}
            except KeyError:
                return None

        def write_csv(self, jsondata):
            """Write one CSV row per comment in ``jsondata['data']['data']``.

            A response without that list writes nothing. A comment with a
            missing field or an unparseable date is reported and skipped, so a
            bad row can never inherit the previous comment's date.
            """
            try:
                comments = jsondata['data']['data'] or []
            except (KeyError, TypeError):
                return
            for comment in comments:
                try:
                    user = comment['user']
                    # Original note: gender 'm' means male (the other code is
                    # presumably 'f').
                    commenter = (user['id'], user['screen_name'], user['gender'])
                    comment_text = self._TAG_RE.sub('', comment['text'])
                    created_ymd, created_time = self._split_created_at(
                        comment['created_at'])
                except (KeyError, TypeError, ValueError) as e:
                    print('error', e.args)
                    continue
                # The ten status columns stay blank on a comment row.
                self.writer.writerow(
                    (" ",) * 10 + commenter + (created_ymd, created_time, comment_text))

        def _crawl_comments(self, status_id, max_pages):
            """Fetch and store up to ``max_pages`` comment pages of one status."""
            m_id = 0
            id_type = 0
            for _ in range(max_pages):
                jsondata = self.get_page(status_id, m_id, id_type)
                results = self.parse_page(jsondata)
                if results is None:
                    break
                self.write_csv(jsondata)
                m_id = results['max_id']
                id_type = results['max_id_type']
                if not m_id:
                    # A cursor of 0 is what the first request uses, so going on
                    # would fetch the first page again and duplicate its rows.
                    break
                time.sleep(1)

        def main(self):
            """Run the whole crawl and close the CSV file when done.

            Statuses with fewer than 20 comments get no comment rows, because
            ``spider_title`` reports 0 pages for them. A failure on one status
            is reported and the crawl moves on to the next one.
            """
            try:
                self.get_title_id()
                count_title = len(self.comments_ID)
                for count, comment_ID in enumerate(self.comments_ID):
                    print("正在爬取第%s个话题,一共找到个%s话题需要爬取" % (count + 1, count_title))
                    try:
                        maxPage = self.spider_title(comment_ID)
                        if maxPage:
                            self._crawl_comments(comment_ID, maxPage)
                    except Exception as e:
                        # Per-status boundary: keep the crawl best-effort, but
                        # report the problem instead of hiding it.
                        print('error', e.args)
                    print("--------------------------分隔符---------------------------")
            finally:
                self.csvfile.close()
    
    
    if __name__ == '__main__':
        # Script entry point: run one crawl with page bound 15 and report how
        # many minutes it took.
        began_at = time.time()
        WeiBoSpider(15).main()
        elapsed_minutes = (time.time() - began_at) / 60
        print("该次所获的信息一共使用%s分钟" % elapsed_minutes)
    
  • 相关阅读:
    memcached+magent的集群部署详细过程
    HBase的安装配置
    vim操作知识累积
    Missing artifact jdk.tools:jdk.tools:jar:1.6
    hadoop2.X解压后的配置步骤
    免密码的SSH配置过程
    Linux网卡重启出现"No Suitable Device found:no device found for XXX"
    钉钉、钉应用(微应用和E应用)开发介绍
    Intellij-Idea使用小细节
    SpringMVC项目使用elastic search搜索
  • 原文地址:https://www.cnblogs.com/zhouzetian/p/12569176.html
Copyright © 2020-2023  润新知