• 开发笔记 -- MongoDB 单个数据集合拆分脚本


    场景描述:

        数据清洗过程中发现,Mongodb中单个集合(表)数据量比较大(千万),定时任务处理加工数据时,由于服务器内存有限,会在处理过程中,出现内存不足的异常,于是希望把单个集合进行拆分成多个,在网上找了相关文章,发现处理这种的场景的案例比较少,基本上考虑了如下3中方案来实现:

        1. mongodb sharding 分片 ---还未研究透彻

        2. mongodb 聚合查询,将结果封装成变量,循环遍历,写入新的集合 ---测试了下,效率较低  --文末会说明该方案,适用于小数据量

        3. 自定义python脚本  ---自己动手,丰衣足食

    处理方式:

        这里采用了自己写脚本处理的方式,好了!逻辑比较简单,就是将源数据对象读取,循环遍历,到达自己指定的count计数后,开多线程,将符合条件的数据写入新集合(表)。

    脚本内容:小伙伴们,可以根据自己实际需求修改脚本内相应的条件,有更好的实现方案,欢迎交流。---脚本运行依赖包 pymongo

    ---说明:由于pool.map(thread_insert_001, source_rows_list) map方法,没有找到传递多个参数的方式,因此脚本中if else和重复函数较多,封装的不好。

    # -*- coding: utf-8 -*-
    import datetime
    import json
    
    # from bson import ObjectId
    from pymongo import MongoClient
    from multiprocessing.pool import ThreadPool
    
    
    # 将3千万条数据拆分成多个集合,每400万一组
    # 大概拆分成8组
    
    
    # mongodb settings
    settings = {
        "ip": '127.0.0.1',  # ip
        "port": 27007,  # 端口
        "db_name": "add_your_db_name",  # 数据库名字
        "set_name": "add_your_collection_name" ,  # 集合名字
        "new_db_name": "add_your_target_db_name",
        "target_set_name_001": "coll_001",
        "target_set_name_002": "coll_002",
        "target_set_name_003": "coll_003",
        "target_set_name_004": "coll_004",
        "target_set_name_005": "coll_005",
        "target_set_name_006": "coll_006",
        "target_set_name_007": "coll_007",
        "target_set_name_008": "coll_008",
    }
    
    
    class MyMongoDB():
        def __init__(self):
            try:
                self.conn = MongoClient(settings["ip"], settings["port"])
            except Exception as e:
                print(e)
            self.db = self.conn[settings["db_name"]]
            self.my_set = self.db[settings["set_name"]]
    
        def insert(self, dic):
            print("inser...")
            if dic:
                self.my_set.insert(dic)
            else:
                pass
    
        def update(self, dic, newdic):
            print("update...")
            self.my_set.update(dic, newdic)
    
        def delete(self, dic):
            print("delete...")
            self.my_set.remove(dic)
    
        def dbfind(self, dic=None):
            print("find...")
            current_set = self.my_set
            data = current_set.find()
    
            return data
    
    
    def thread_insert_001(rows_dic):
        conn_mongodb = MongoClient(settings["ip"], settings["port"])
        new_db = conn_mongodb[settings["new_db_name"]]
        new_collect_set = new_db[settings["target_set_name_001"]]
        new_collect_set.insert(rows_dic)
    
    
    def thread_insert_002(rows_dic):
        conn_mongodb = MongoClient(settings["ip"], settings["port"])
        new_db = conn_mongodb[settings["new_db_name"]]
        new_collect_set = new_db[settings["target_set_name_002"]]
        new_collect_set.insert(rows_dic)
    
    
    def thread_insert_003(rows_dic):
        conn_mongodb = MongoClient(settings["ip"], settings["port"])
        new_db = conn_mongodb[settings["new_db_name"]]
        new_collect_set = new_db[settings["target_set_name_003"]]
        new_collect_set.insert(rows_dic)
    
    
    def thread_insert_004(rows_dic):
        conn_mongodb = MongoClient(settings["ip"], settings["port"])
        new_db = conn_mongodb[settings["new_db_name"]]
        new_collect_set = new_db[settings["target_set_name_004"]]
    
        new_collect_set.insert(rows_dic)
    
    
    def thread_insert_005(rows_dic):
        conn_mongodb = MongoClient(settings["ip"], settings["port"])
        new_db = conn_mongodb[settings["new_db_name"]]
        new_collect_set = new_db[settings["target_set_name_005"]]
        new_collect_set.insert(rows_dic)
    
    
    def thread_insert_006(rows_dic):
        conn_mongodb = MongoClient(settings["ip"], settings["port"])
        new_db = conn_mongodb[settings["new_db_name"]]
        new_collect_set = new_db[settings["target_set_name_006"]]
        new_collect_set.insert(rows_dic)
    
    
    def thread_insert_007(rows_dic):
        conn_mongodb = MongoClient(settings["ip"], settings["port"])
        new_db = conn_mongodb[settings["new_db_name"]]
        new_collect_set = new_db[settings["target_set_name_007"]]
        new_collect_set.insert(rows_dic)
    
    
    def thread_insert_008(rows_dic):
        conn_mongodb = MongoClient(settings["ip"], settings["port"])
        new_db = conn_mongodb[settings["new_db_name"]]
        new_collect_set = new_db[settings["target_set_name_008"]]
        new_collect_set.insert(rows_dic)
    
    
    def split_mg_dt_col_to_new_col():
        # mongodb 旧集合数据 拆分 写入新集合
        my_mongodb_obj = MyMongoDB()
        data = my_mongodb_obj.dbfind()
    
        # 开10个线程,根据服务器配置修改
        pool = ThreadPool(10)
    
        count = 1
        source_rows_list = []
        for result in data:
            count += 1
            source_rows_list.append(result)
            # print(source_rows_list)
            if count < 4000000:
                if len(source_rows_list) == 2000:
                    print('11111111111111111')
                    print(count)
                    print(len(source_rows_list))
                    insert_dic = pool.map(thread_insert_001, source_rows_list)
                    del source_rows_list[:]
            elif 4000000 <= count < 8000000:
                if len(source_rows_list) == 2000:
                    print('222222222222222222')
                    print(count)
                    print(len(source_rows_list))
                    insert_dic = pool.map(thread_insert_002, source_rows_list)
                    del source_rows_list[:]
            elif 8000000 <= count < 12000000:
                if len(source_rows_list) == 2000:
                    print('333333333333333333')
                    print(count)
                    print(len(source_rows_list))
                    insert_dic = pool.map(thread_insert_003, source_rows_list)
                    del source_rows_list[:]
            elif 12000000 <= count < 16000000:
                if len(source_rows_list) == 2000:
                    print('44444444444444444444')
                    print(count)
                    print(len(source_rows_list))
                    insert_dic = pool.map(thread_insert_004, source_rows_list)
                    del source_rows_list[:]
            elif 16000000 <= count < 20000000:
                if len(source_rows_list) == 2000:
                    print('5555555555555555555555')
                    print(count)
                    print(len(source_rows_list))
                    insert_dic = pool.map(thread_insert_005, source_rows_list)
                    del source_rows_list[:]
            elif 20000000 <= count < 24000000:
                if len(source_rows_list) == 2000:
                    print('666666666666666666666')
                    print(count)
                    print(len(source_rows_list))
                    insert_dic = pool.map(thread_insert_006, source_rows_list)
                    del source_rows_list[:]
            elif 24000000 <= count < 28000000:
                if len(source_rows_list) == 2000:
                    print('7777777777777777777')
                    print(count)
                    print(len(source_rows_list))
                    insert_dic = pool.map(thread_insert_007, source_rows_list)
                    del source_rows_list[:]
            elif 28000000 <= count < 32000000:
                if len(source_rows_list) == 2000:
                    print('88888888888888888')
                    print(count)
                    print(len(source_rows_list))
                    insert_dic = pool.map(thread_insert_008, source_rows_list)
                    del source_rows_list[:]
    
        pool.close()
    
    if __name__ == '__main__':
        split_mg_dt_col_to_new_col()
    拆分脚本-展开查看

    ---补充:

    mongodb 查询结果,循环遍历写入,边读边取,不是一次直接返回所有数据集,类似mysql的游标或者指针

    var result = db.csmdR.find({"查询条件"})
    while(result.hasNext()) db.新集合(表)名.insert(result.next())
  • 相关阅读:
    Spring注解运行时抛出null
    关于apache服务器加载so的报错
    apache apr的编译和引用
    FreeSWITCH在会议室中持续播放音频文件
    64位FreeSWITCH编译安装(版本1.4.20)
    Spring整合Tiles
    eclipse启动报错eclipse failed to create the java virutal machine
    菜鸟新闻2--设置沉浸式状态栏的三种方法
    OkHttp3源码详解(三) 拦截器
    Android N(API level 24.)废弃了Html.fromHtml(String)
  • 原文地址:https://www.cnblogs.com/hellojesson/p/13037562.html
Copyright © 2020-2023  润新知