• Fetching Weibo data with reduced coupling


    Weibo crawler

        import json
        import queue
        import re
        import threading

        import requests
        import xlrd
        from lxml import etree
        from pymongo import MongoClient

        # Credentials for the Abuyun dynamic HTTP proxy
        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": "http-dyn.abuyun.com",
            "port": "9020",
            "user": "H6VZC52B4BF2986D",
            "pass": "7850C72DC876E723",
        }


        class WB:
            def __init__(self):
                self.start_temp_url = "https://m.weibo.cn/api/container/getIndex?type=uid&value={}"
                self.headers = {
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
                }
                self.proxies = {
                    "http": proxyMeta,
                    "https": proxyMeta,
                }
                self.save_content_q = queue.Queue()
                self.url_q = queue.Queue()
                self.client = MongoClient(host='localhost', port=27017)
                # MongoDB database that holds one collection per blogger
                self.db = self.client.WbImageSet

            def save_data_mongodb(self, collect_name, data):
                # The Weibo post id doubles as _id, which gives deduplication for free
                self.collect_name = self.db[collect_name]
                history_record = self.collect_name.find_one({"_id": data['id']})
                if history_record:
                    # Already stored, skip
                    pass
                else:
                    # Not stored yet: insert via an upsert keyed on _id
                    self.collect_name.update_one({'_id': data['id']}, {'$set': data}, upsert=True)

            def get_author_info(self, url):
                """
                Fetch the blogger's profile and return the total number of posts.
                """
                response = requests.get(url=url, headers=self.headers, proxies=self.proxies, verify=True)
                self.data = response.content.decode('utf8')
                content = json.loads(self.data).get('data')
                max_content = content.get('userInfo').get('statuses_count')
                return max_content

            def get_containerid(self):
                """Extract the containerid of the blogger's Weibo tab; it is needed to crawl the posts."""
                content = json.loads(self.data).get('data')
                containerid = None
                for data in content.get('tabsInfo').get('tabs'):
                    if data.get('tab_type') == 'weibo':
                        containerid = data.get('containerid')
                return containerid

            def get_url(self, containerid, max_content):
                # Ten posts per page, so max_content // 10 pages in total
                for x in range(int(max_content) // 10):
                    wb_content_url = ('https://m.weibo.cn/api/container/getIndex?type=uid&value=' + self.name
                                      + '&containerid=' + containerid + '&page=' + str(x))
                    self.url_q.put(wb_content_url)

            def get_wb_content(self):
                """Fetch each post (text, detail-page URL, likes, comments, reposts, images) and hand it to MongoDB."""
                num = 0
                while True:
                    try:
                        if self.url_q.empty():
                            break
                        weibo_url = self.url_q.get()
                        response = requests.get(url=weibo_url, headers=self.headers, proxies=self.proxies, verify=True)
                        content = json.loads(response.content.decode('utf8')).get('data')
                        cards = content.get('cards')
                        if len(cards) > 0:
                            for j in range(len(cards)):
                                num += 1
                                card_type = cards[j].get('card_type')
                                item = {}
                                if card_type == 9:
                                    mblog = cards[j].get('mblog')
                                    scheme = cards[j].get('scheme')  # detail-page URL of the post
                                    print(scheme)
                                    print("--crawling--{}--post {}--".format(self.name, num))
                                    text_id = mblog.get("id")  # post id, used for deduplication
                                    text = mblog.get('text')  # post body, as HTML
                                    # Strip the HTML and clean the text to build a title
                                    html = etree.HTML(text)
                                    x = html.xpath('//text()')
                                    title = ','.join(x)
                                    title = title.replace('\n', '').replace('\r', '').replace('\t', '')
                                    title = re.sub('(#.*#)', '', title)  # drop hashtag topics
                                    title = re.sub('@', '', title)
                                    title = re.sub(' ', '', title)
                                    pictures = mblog.get('pics')  # attached images, a list
                                    pic_urls = []  # collected image URLs
                                    if pictures:
                                        for picture in pictures:
                                            pic_url = picture.get('large').get('url')
                                            pic_urls.append(pic_url)
                                    if pic_urls == []:
                                        continue  # skip posts without images
                                    item['id'] = text_id
                                    item['category'] = self.category
                                    item['author'] = self.name
                                    item['title'] = title
                                    item['url'] = pic_urls
                                    item['select'] = 0  # 0 = not downloaded yet
                                    # Deduplicated write
                                    self.save_data_mongodb(self.name, item)
                    except Exception as e:
                        print(e)

            def run(self):
                # Relies on the module-level wb_content_q filled by Excel_path.get_excel_info
                while True:
                    if wb_content_q.empty():
                        break
                    dict_wb = wb_content_q.get()
                    self.category = dict_wb['category']
                    self.name = dict_wb['name']
                    self.wb_id = dict_wb['id']
                    max_content = self.get_author_info(self.start_temp_url.format(self.wb_id))
                    containerid = self.get_containerid()
                    self.get_url(containerid, max_content)

                    threads = []
                    for x in range(5):
                        t1 = threading.Thread(target=self.get_wb_content)
                        threads.append(t1)
                    # Start every worker before joining; start()/join() in one loop would run them one at a time
                    for t in threads:
                        t.start()
                    for t in threads:
                        t.join()


        class Excel_path:

            @staticmethod
            def get_excel_info(path, num):
                """
                Read the source Excel sheet, split the blogger id out of each profile URL,
                and queue one dict per blogger (name / category / id).
                :param path: path to the Excel file
                :param num: sheet index
                """
                excel_sheet = xlrd.open_workbook(path).sheet_by_index(num)
                category_name = excel_sheet.col_values(0)[1:]
                wb_url = excel_sheet.col_values(2)[1:]
                name = excel_sheet.col_values(1)[1:]
                # enumerate instead of list.index(), which misfires on duplicate URLs
                for a, i in enumerate(wb_url):
                    item = {}
                    c = i.split('?')[0]
                    d = c.split('/')[-1]
                    item['name'] = name[a]
                    item['category'] = category_name[a]
                    item['id'] = d
                    print(item)
                    wb_content_q.put(item)


        if __name__ == '__main__':
            wb_content_q = queue.Queue()
            # Raw string so the backslashes in the Windows path are not treated as escapes
            excel_path = r'D:\gongsi_code\ImageSpider\微博\Image_set\数据源.xlsx'
            excel_index = 0
            Excel_path.get_excel_info(excel_path, int(excel_index))
            WB().run()

    The crawler collects Weibo posts and stores them in MongoDB: every image link lives in MongoDB rather than in the crawler's memory, which is what decouples it from the downloader below.
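
    Because the two stages share nothing but the MongoDB collection, each record has to carry everything the downloader needs, plus the select flag. Here is a minimal sketch of that handoff, following the collection layout in the code above; the blogger name and sample values are invented:

        from pymongo import MongoClient

        client = MongoClient(host='localhost', port=27017)
        collection = client.WbImageSet['some_blogger']  # one collection per blogger, as above

        # Shape of one record written by the crawler; all values here are made up
        record = {
            'id': '4398765432101234',       # Weibo post id, reused as _id
            'category': 'travel',
            'author': 'some_blogger',
            'title': 'cleaned post text',
            'url': ['https://wx1.sinaimg.cn/large/example.jpg'],
            'select': 0,                    # 0 = waiting for the downloader
        }

        # Upsert keyed on _id: re-crawling the same post never creates a duplicate
        collection.update_one({'_id': record['id']}, {'$set': record}, upsert=True)

    Keeping the flag next to the data is what lets the downloader run at a different time, or on a different machine, with no coordination beyond MongoDB itself.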

    Downloader

        import os
        import queue
        import re
        import threading
        import time
        from concurrent.futures.thread import ThreadPoolExecutor

        import pymongo
        import requests
        import xlrd


        class WbDownload:
            def __init__(self):
                self.client = pymongo.MongoClient(host='localhost', port=27017)
                self.db = self.client.WbImageSet
                self.info_q = queue.Queue()

            def get_info(self, collection_name):
                # Queue every record that has not been downloaded yet (select == 0)
                self.collection = self.db[collection_name]
                for record in self.collection.find({"select": 0}):
                    self.info_q.put(record)

            def save_(self):
                """Write each record's text and images to the local disk."""
                while True:
                    if self.info_q.empty():
                        break
                    image = self.info_q.get()
                    category_name = image['category']
                    upload_time = time.strftime("%Y-%m-%d", time.localtime())
                    # Strip whitespace and non-word characters so the title is a valid directory name
                    rule = re.compile(r'\s*', re.S)
                    rule2 = re.compile(r'\W*', re.S)
                    title = rule.sub('', image['title'])
                    title = rule2.sub('', title)
                    path = 'D:/微博/' + category_name + '/' + str(upload_time) + '/' + title
                    if os.path.exists(path):
                        continue
                    else:
                        os.makedirs(path)
                    with open(path + '/content.txt', 'w', encoding='utf8') as fb:
                        fb.write(str([image['title']]))
                    for x_index, x in enumerate(image['url']):
                        if x.endswith('.gif'):  # skip animated images
                            continue
                        with open(path + '/{}.jpg'.format(str(x_index)), 'wb') as f:
                            response = requests.get(url=x)
                            f.write(response.content)
                    # Flip the flag so this record is never downloaded again
                    self.collection.update_one({"_id": image['_id']}, {"$set": {"select": 1}})
                    print('-----  ' + title + '  saved  ------')

            def run(self):
                # Relies on the module-level name_q filled by Excel_path.get_excel_info
                while True:
                    if name_q.empty():
                        break
                    name = name_q.get()
                    self.get_info(name)
                    threads = []
                    for i in range(20):
                        t_down = threading.Thread(target=self.save_)
                        t_down.start()
                        threads.append(t_down)
                    for t in threads:
                        t.join()


        class Excel_path:

            @staticmethod
            def get_excel_info(path, num):
                """
                Read the blogger names from the source Excel sheet; each name is also
                the MongoDB collection that the crawler wrote into.
                :param path: path to the Excel file
                :param num: sheet index
                """
                excel_sheet = xlrd.open_workbook(path).sheet_by_index(num)
                name = excel_sheet.col_values(1)[1:]
                for x in name:
                    print(x)
                    name_q.put(x)


        if __name__ == '__main__':
            name_q = queue.Queue()
            path = r'D:\gongsi_code\ImageSpider\微博\Image_set\数据源.xlsx'
            excel_index = 0
            Excel_path.get_excel_info(path, excel_index)
            with ThreadPoolExecutor(5) as executor:
                wb = WbDownload()
                executor.submit(wb.run)

    This script only pulls records from MongoDB and downloads them; once a record's files are written, it updates the record's select flag in MongoDB so the same data is never processed twice.
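
    Note that the flag is flipped only after the files are written, so two overlapping downloader runs could still pick up the same record. One way to tighten this, sketched here under the schema above rather than taken from the original code, is to claim each record atomically with pymongo's find_one_and_update (the collection name is hypothetical):

        from pymongo import MongoClient, ReturnDocument

        client = MongoClient(host='localhost', port=27017)
        collection = client.WbImageSet['some_blogger']  # hypothetical collection name

        def claim_next_record():
            """Atomically flip select 0 -> 1 and return the claimed record, or None when drained."""
            return collection.find_one_and_update(
                {'select': 0},              # only records no worker has claimed yet
                {'$set': {'select': 1}},    # claim and completion share one flag in this sketch
                return_document=ReturnDocument.AFTER,
            )

        record = claim_next_record()
        while record is not None:
            print('downloading', record['id'])  # the image download from save_() would go here
            record = claim_next_record()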

• Original post: https://www.cnblogs.com/lqn404/p/11325252.html