• 爬微博


    # -*- coding:utf-8 -*-
    import time
    import json
    import uuid
    import traceback
    import requests
    import datetime
    import os
    import random
    from utils.qiniu_util import QiNiu
    from bs4 import BeautifulSoup
    __version__ = '1.0.0.0'
    """
    @brief : 简介
    @details: 详细信息
    @author : zhphuang
    @date : 2018-11-29
    """


    class SpiderWeibo(object):
        """Crawl posts from the m.weibo.cn category listings.

        For every category in ``self.url`` the crawler requests the listing,
        turns each card that carries an ``mblog`` entry into a flat record
        (author, text, images, first page of comments, optional video) and
        dumps the records of that category to a JSON file.

        Media URLs are passed through ``QiNiu.fetch``; that helper is defined
        outside this file (presumably it mirrors the resource to Qiniu storage
        and returns the new URL -- TODO confirm).
        """

        # NOTE(review): this is a hard-coded logged-in session cookie. It will
        # expire and it is a credential; it should come from configuration.
        # The fragments are wrapped in parentheses so that they are actually
        # concatenated into one header value.
        DEFAULT_COOKIES = (
            "_T_WM=6e170ca7910c8a0400cc34f8812ee08a; "
            "ALF=1548469366; SCF=AgozafmBO6saBbFys4DjAOQFlYFxRK6CuW_YwqYMgRKRkIh2Or_PEsE7BKZwQNDSRBoJu9EbU9DiOGNoTiRp7As.; "
            "SUB=_2A25xIEcnDeRhGeNM7FMW8ybEzT6IHXVS62lvrDV6PUNbktAKLRDWkW1NSetNTi5Tt63QYZkk2y4kc76CgKb4VoPf; "
            "SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5MoCM8JudP_s60h7KX4dpy5JpX5KMhUgL.Fo-ES02Ne0nRSoz2dJLoI7_VIPHVIPHoPN9DM5tt; "
            "SUHB=0z1wYnUQHMLUAy; SSOLoginState=1545877367; MLOGIN=1; WEIBOCN_FROM=1110006030; "
            "M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D102803_ctg1_3288_-_ctg1_3288%26uicode%3D20000174%26fid%3D102803"
        )

        # Seconds to wait for an HTTP response before giving up; without a
        # timeout a stalled connection would block the crawl forever.
        REQUEST_TIMEOUT = 30

        def __init__(self, window=None):
            """Set up the category list, request cookie and user-agent pool.

            :param window: optional UI object exposing ``write_log_to_text(msg)``;
                when given, ``log`` forwards messages to it.

            The original author's note listed these further category/number
            pairs, which are not crawled here (how the numbers are used is not
            shown in this file): Following 25, Food 33, Reading 37, Design 39,
            Fashion 41, Anime 43, Pets 45, Variety shows 10, Movies 12,
            Sports & fitness 16, Travel 30, Horoscope 36, Campus 32, Art 40.
            """
            self.qiniu = QiNiu()
            self.url = [
                {"sub_type": "热门", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0"},
                {"sub_type": "新鲜事", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_7978_-_ctg1_7978&openApp=0"},
                {"sub_type": "搞笑", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4388_-_ctg1_4388&openApp=0"},
                {"sub_type": "情感", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_1988_-_ctg1_1988&openApp=0"},
                {"sub_type": "明星", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4288_-_ctg1_4288&openApp=0"},
                {"sub_type": "社会", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4188_-_ctg1_4188&openApp=0"},
                {"sub_type": "数码", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_5088_-_ctg1_5088&openApp=0"},
                {"sub_type": "体育", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_1388_-_ctg1_1388&openApp=0"},
                {"sub_type": "汽车", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_5188_-_ctg1_5188&openApp=0"},
                # NOTE(review): the next entry uses the same containerid (5188)
                # as the entry above -- this looks like a copy-paste slip, so
                # both categories fetch the same listing. Confirm the correct
                # id before changing it.
                {"sub_type": "电影", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_5188_-_ctg1_5188&openApp=0"},
                {"sub_type": "游戏", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4888_-_ctg1_4888&openApp=0"},
                {"sub_type": "美食", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_2688_-_ctg1_2688&openApp=0"},
                {"sub_type": "读书", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4588_-_ctg1_4588&openApp=0"},
                {"sub_type": "设计", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_5388_-_ctg1_5388&openApp=0"},
                {"sub_type": "时尚", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4488_-_ctg1_4488&openApp=0"},
                {"sub_type": "动漫", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_2388_-_ctg1_2388&openApp=0"},
                {"sub_type": "萌宠", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_2788_-_ctg1_2788&openApp=0"},
                {"sub_type": "综艺", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4688_-_ctg1_4688&openApp=0"},
                {"sub_type": "旅游", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_2588_-_ctg1_2588&openApp=0"},
                {"sub_type": "星座", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_1688_-_ctg1_1688&openApp=0"},
                {"sub_type": "校园", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_1488_-_ctg1_1488&openApp=0"},
                {"sub_type": "艺术", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_5488_-_ctg1_5488&openApp=0"}
            ]
            self.cookies = self.DEFAULT_COOKIES
            # Records of the category currently being crawled (reset per category).
            self.records = None
            # Pool of User-Agent strings; one is picked at random per request.
            self.agents = [
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
                "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
                "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
                "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
                "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
                "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
                "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
                "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
            ]
            self.window = window

        def log(self, msg):
            """Forward ``msg`` to the attached window, if there is one."""
            if self.window:
                self.window.write_log_to_text(msg)

        def save_to_csv(self, key):
            """Dump ``self.records`` as JSON for the category ``key``.

            Despite the name, the output is a UTF-8 JSON file named
            ``result_<key>_<date>.json`` inside ``data/weibo/<date>/`` next to
            this module. Missing parent directories are created.

            :param key: category name used in the file name.
            :return: path of the file that was written.
            """
            today = datetime.date.today()
            out_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "data", "weibo", "%s" % today)
            # makedirs also creates "data/weibo" on the first run, where a
            # plain mkdir would fail.
            os.makedirs(out_dir, exist_ok=True)
            # Write into the directory that was just created instead of a path
            # relative to the current working directory.
            out_path = os.path.join(out_dir, "result_%s_%s.json" % (key, today))
            with open(out_path, "w", encoding='utf-8') as json_file:
                json.dump(self.records, json_file, ensure_ascii=False)
            return out_path

        def get_random_header(self, path):
            """Build request headers for a listing call with a random User-Agent.

            :param path: request path (URL without the ``https://m.weibo.cn`` prefix).
            :return: dict of header names to values.
            """
            return {
                "authority": "m.weibo.cn",
                "method": "GET",
                "path": path,
                "scheme": "https",
                "accept": "application/json, text/plain, */*",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh-CN,zh;q=0.9",
                # Same cookie as the comment requests; previously a truncated copy.
                "cookie": self.cookies,
                "mweibo-pwa": "1",
                "referer": "https://m.weibo.cn/",
                "x-requested-with": "XMLHttpRequest",
                "user-agent": random.choice(self.agents)
            }

        @staticmethod
        def _strip_links(soup):
            """Remove every ``<a>`` element from ``soup`` in place and return it."""
            for anchor in soup('a'):
                anchor.extract()
            return soup

        def get_comments(self, mid):
            """Fetch the first page of comments for the post ``mid``.

            Best effort: any network or parsing error is printed and whatever
            was collected so far (possibly nothing) is returned.

            :param mid: post id used in the comments API URL.
            :return: list of dicts with ``nickname``, ``publish_time`` and ``content``.
            """
            comment_list = []
            comments_url = "https://m.weibo.cn/api/comments/show?id=%s&page=%s" % (mid, 1)
            header = {
                'User-agent': random.choice(self.agents),
                'Host': 'm.weibo.cn',
                'Accept': 'application/json, text/plain, */*',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch, br',
                'Referer': "https://m.weibo.cn/u/%s" % mid,
                'Connection': 'keep-alive',
                "cookie": self.cookies,
            }
            try:
                # Random pause between requests to avoid hammering the API.
                time.sleep(random.randint(1, 3))
                res = requests.get(comments_url, headers=header, timeout=self.REQUEST_TIMEOUT)
                payload = res.json()  # parse the body once
                if payload.get("ok") == 1:
                    items = (payload.get("data") or {}).get("data") or []
                    for item in items:
                        try:
                            soup = self._strip_links(BeautifulSoup(item["text"], "html.parser"))
                            comment_content = soup.text.replace("回复:", "")
                            if comment_content.strip():
                                # The stored time is not the real comment time:
                                # it is "now" plus a random offset of up to a day.
                                comment_time = int(time.time()) + random.randint(0, 3600 * 24)
                                comment_list.append({"nickname": item["user"]["screen_name"],
                                                     "publish_time": comment_time,
                                                     "content": comment_content})
                        except Exception:
                            # One malformed comment must not drop the others.
                            print(traceback.format_exc())
            except Exception:
                print(traceback.format_exc())
            return comment_list

        def _fetch_video(self, mblog):
            """Return ``{"pre_image", "video_url"}`` for a video post, else ``{}``."""
            page_info = mblog.get("page_info")
            if not page_info or "media_info" not in page_info or page_info.get("type") != "video":
                return {}
            media_info = page_info["media_info"]
            # Prefer the SD stream and fall back to HD when SD is missing.
            video_url = media_info.get("mp4_sd_url") or media_info.get("mp4_hd_url")
            if not video_url:
                return {}
            page_pic = page_info["page_pic"]
            video_url = self.qiniu.fetch(video_url, str(uuid.uuid1()) + "_weibo_video.mp4")
            page_pic = self.qiniu.fetch(page_pic["url"], str(uuid.uuid1()) + "_weibo_pre_video_image.jpg")
            return {"pre_image": page_pic, "video_url": video_url}

        def _build_record(self, card, sub_type):
            """Convert one listing card (which must contain ``mblog``) to a record dict."""
            mblog = card["mblog"]
            user = mblog["user"]

            record = dict()
            record['_type'] = 0  # 0 marks a Weibo post
            record['sub_type'] = sub_type  # category
            record['uid'] = mblog["id"]
            record['username'] = user["screen_name"]
            record["signature"] = user["description"]
            record['user_photo'] = self.qiniu.fetch(user["avatar_hd"], "%s_avatar.jpg" % record['uid'])
            record['sex'] = 1 if user["gender"] == "m" else 0
            record["publish_time"] = int(time.time())

            bs = BeautifulSoup(mblog["text"], "html.parser")
            # Collect the labels BEFORE the links are removed: once the <a>
            # elements are extracted, any span nested inside them is gone too.
            labels = [label_span.text for label_span in bs.select("span.surl-text")]
            record["content"] = self._strip_links(bs).text
            record["face_list"] = []
            record["src_list"] = [
                self.qiniu.fetch(image["url"], str(uuid.uuid1()) + "weibo_image_%s.jpg" % index)
                for index, image in enumerate(mblog.get("pics", []))
            ]
            record['label_list'] = labels
            record["zhuanfa_count"] = mblog["reposts_count"]
            record["comment_count"] = mblog["comments_count"]
            record["dianzan_count"] = mblog["attitudes_count"]
            record["comment_list"] = self.get_comments(record['uid'])
            print(len(record["comment_list"]))
            record['link'] = card["scheme"]
            record["video_url"] = self._fetch_video(mblog)
            return record

        def get_data(self):
            """Crawl every configured category and save one JSON file per category.

            A category whose listing cannot be fetched or decoded is skipped
            (the error is printed) so that the remaining categories still run.
            A single card that fails to convert is skipped the same way.
            """
            for url in self.url:
                self.records = []
                header = self.get_random_header(url.get("url").replace("https://m.weibo.cn", ""))
                # Long random pause between listing requests.
                time.sleep(random.randint(5, 20))
                try:
                    res = requests.get(url.get("url"), headers=header,
                                       timeout=self.REQUEST_TIMEOUT).json()
                except Exception:
                    print(traceback.format_exc())
                    continue
                if res.get("ok") != 1:
                    continue
                for data in (res.get("data") or {}).get("cards") or []:
                    if 'mblog' not in data:
                        continue
                    try:
                        record = self._build_record(data, url["sub_type"])
                        print(record)
                        self.records.append(record)
                    except Exception:
                        print(traceback.format_exc())
                self.save_to_csv(url["sub_type"])


    if __name__ == '__main__':
        # Script entry point: build a crawler without a UI window and crawl
        # every configured category.
        SpiderWeibo().get_data()
    当值未一旬,而视茫茫,而发苍苍,而齿牙动摇
  • 相关阅读:
    (转)接口100
    (转)WordPress常用模板函数 修改或自制WordPress主题必备
    (转)Memcached 之 .NET(C#)实例分析
    Ubuntu(Linux) + mono + xsp4 + nginx +asp.net MVC3 部署
    urls.py路由系统分发的本质
    Django-缓存
    Django-信号
    MySQL表按月切割
    通过ModelForm实现主机添加和编辑
    自定义JSON序列化支持datetime格式序列化
  • 原文地址:https://www.cnblogs.com/niuniuc/p/10549534.html
Copyright © 2020-2023  润新知