• 增量学习/训练


    针对大型数据集,数据过大无法加载到内存,使用增量训练方式

    sklearn

    def generator(all_file_path):
        """Lazily yield ``(byte_ngram, label)`` pairs for the given files.

        Files are read one at a time so the whole data set never has to be held
        in memory.  A file that cannot be read is skipped instead of aborting
        the stream.

        Args:
            all_file_path: Iterable of file paths; each file is read as raw bytes.

        Yields:
            Tuple ``(byte_ngram, label)``: ``byte_ngram`` is the result of
            ``byteNgram`` on the file's bytes, ``label`` is the
            ``label_to_index`` entry for the name of the directory five levels
            above the file.
        """
        for filename in all_file_path:
            try:
                # Close the handle as soon as the bytes are read instead of
                # leaving one open descriptor per file to the garbage collector.
                with open(filename, "rb") as handle:
                    bytedata = handle.read()
            except (OSError, MemoryError):
                # Unreadable (or too large to load) file: skip it, but let
                # KeyboardInterrupt / SystemExit propagate.
                continue
            byte_ngram = byteNgram(bytedata)
            # The label key is the name of the fifth ancestor directory.
            label = label_to_index[pathlib.Path(filename).parent.parent.parent.parent.parent.name]
            yield byte_ngram, label
    
    def get_batch(data_iter, batch_size):
        """Return the next (at most) ``batch_size`` items of ``data_iter`` as a list.

        Fewer items are returned when ``data_iter`` runs out; an exhausted
        iterator gives an empty list.
        """
        return list(itertools.islice(data_iter, batch_size))
    
    def iter_batch(data_iter, batch_size):
        """Yield mini-batches from ``data_iter`` as ``(x, y)`` pairs of tuples.

        Batches of up to ``batch_size`` items are pulled with ``get_batch``
        until it returns an empty batch.  Each item is expected to be a
        2-element pair; a batch that cannot be transposed into exactly two
        sequences is reported with ``"data error"`` and skipped.

        Args:
            data_iter: Iterator of ``(x, y)`` items.
            batch_size: Maximum number of items per batch.

        Yields:
            ``(x, y)`` where ``x`` is the tuple of first elements and ``y`` the
            tuple of second elements of the batch.
        """
        data = get_batch(data_iter, batch_size)
        while data:
            try:
                # Only the transposition is guarded.  The yield stays outside
                # the try so that closing the generator (GeneratorExit) or an
                # exception thrown in by the consumer is not swallowed.
                x, y = zip(*data)
            except (TypeError, ValueError):
                print("data error")
            else:
                yield x, y
            data = get_batch(data_iter, batch_size)
    

    https://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html#sphx-glr-auto-examples-applications-plot-out-of-core-classification-py

    文中用到了 HashingVectorizer,在这里解释一下

    使用两个 hash 函数(避免多个原始特征被哈希到同一位置时,词频直接累加导致该位置的特征值突然变大)

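    image-20220128105902480(原图未能显示;论文中对应的哈希特征映射为 $\phi_i^{(h,\xi)}(x) = \sum_{j:\,h(j)=i} \xi(j)\,x_j$,其中 $h$ 是第一个 hash 函数,$\xi$ 是第二个 hash 函数)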

    第一个hash函数:相当于分桶降维;第二个hash函数:hash到 {-1,1}

    引自 Feature Hashing for Large Scale Multitask Learning

    lightgbm

    自定义生成器

    # Running record of the LightGBM predictions made on every batch right after
    # fitting on it, together with the matching labels, for the periodic report.
    predicts = []
    y_train = ()

    for i, (x_batch_text, y_batch) in enumerate(minibatch_iterators):
        x_batch = vectorizer.transform(x_batch_text)

        # SGD: one partial_fit call per mini-batch.
        sgd_clf.partial_fit(x_batch, y_batch, classes=all_classes)

        # LightGBM: continue from the model fitted so far via init_model.
        try:
            lgb_clf.fit(x_batch, y_batch, init_model=lgb_clf)
        except ValueError:
            # Expected only while lgb_clf is still unfitted: LightGBM's
            # not-fitted error derives from ValueError.  Catching just that
            # (instead of a bare except) keeps Ctrl-C and real training
            # failures from silently restarting the model from scratch.
            lgb_clf.fit(x_batch, y_batch)

        # presumably y_batch is a tuple (tuple concatenation) - verify against
        # the batch iterator that feeds this loop.
        y_train += y_batch
        predicts = np.hstack([predicts, lgb_clf.predict(x_batch)])

        if i % 500 == 0:
            print("iter %s ============== " % i)
            metrics(y_train, predicts)
    

    借助pandas

    import lightgbm as lgb
    
    def increment():
        """Train a LightGBM multiclass model incrementally over CSV chunks.

        The CSV at ``path`` is streamed ``CHUNK_SIZE`` rows at a time, and each
        chunk continues training from the booster produced by the previous
        chunk.  ``path``, ``pd``, ``shuffle``, ``pipeline``, ``x_test`` and
        ``y_test`` are not defined in this function and must exist in the
        enclosing scope.

        Returns:
            The booster after the last chunk, or ``None`` if the reader yielded
            no chunk.
        """
        # Step 1: start without a model and set the training parameters.
        gbm = None
        params = {
            'task': 'train',
            'objective': 'multiclass',
            'num_class': 3,
            'boosting_type': 'gbdt',
            'learning_rate': 0.1,
            'num_leaves': 31,
            'tree_learner': 'serial',
            'min_data_in_leaf': 100,
            'metric': ['multi_logloss', 'multi_error'],
            'max_bin': 255,
            # NOTE(review): 'num_trees' and the num_boost_round argument passed
            # to lgb.train below are aliases of the same LightGBM setting -
            # confirm which of 300 / 1000 is intended.
            'num_trees': 300,
        }
        # Step 2: stream the data, CHUNK_SIZE (1,000,000) rows per chunk.
        CHUNK_SIZE = 1000000

        all_data = pd.read_csv(path, chunksize=CHUNK_SIZE)

        for data_chunk in all_data:
            print('Size of uploaded chunk: %i instances, %i features' % (data_chunk.shape))

            # Preprocess: shuffle the rows, then split into features and labels.
            data_chunk = shuffle(data_chunk)
            x_train, y_train = pipeline(data_chunk)

            # Build the LightGBM datasets; the validation data is the same for
            # every chunk.
            lgb_train = lgb.Dataset(x_train, y_train)
            lgb_eval = lgb.Dataset(x_test, y_test)

            # Step 3: incremental training.  init_model and
            # keep_training_booster are the two arguments that make it work.
            gbm = lgb.train(params,
                            lgb_train,
                            num_boost_round=1000,
                            valid_sets=lgb_eval,
                            init_model=gbm,              # None on the first chunk, afterwards the previous booster
                            early_stopping_rounds=10,
                            verbose_eval=False,
                            keep_training_booster=True)  # keep the booster usable for further training

            # Report the metrics, keyed by the metric name found at index 1 of
            # every evaluation tuple (the value is at index 2).
            score_train = {s[1]: s[2] for s in gbm.eval_train()}
            score_valid = {s[1]: s[2] for s in gbm.eval_valid()}
            print('当前模型在训练集的得分是:loss=%.4f, erro=%.4f' % (score_train['multi_logloss'], score_train['multi_error']))
            print('当前模型在测试集的得分是:loss=%.4f, erro=%.4f' % (score_valid['multi_logloss'], score_valid['multi_error']))
        return gbm
    # Run the chunked training and keep the model it returns.
    gbm = increment()
    

    tensorflow

    加载上次保存的网络,接着训练就好了

    # 定义dataset
    def load_and_preprocess_from_path_label(path, label):
        """Return ``(load_and_preprocess_image(path), label)``; the label passes through unchanged."""
        image = load_and_preprocess_image(path)
        return image, label
    
    def make_dataset(image_paths, image_labels, image_count, BATCH_SIZE=32, AUTOTUNE=tf.data.experimental.AUTOTUNE):
        """Build a shuffled, batched and prefetched ``tf.data`` input pipeline.

        Args:
            image_paths: Image paths, sliced element-wise together with
                ``image_labels``.
            image_labels: Labels, one per path.
            image_count: Shuffle buffer size; pass the data-set size so the
                data is fully shuffled.
            BATCH_SIZE: Number of elements per batch.
            AUTOTUNE: Value used for ``num_parallel_calls`` of the map step and
                for the prefetch buffer size.

        Returns:
            A dataset of batches of ``(preprocessed image, label)`` pairs.
        """
        path_label_ds = tf.data.Dataset.from_tensor_slices((image_paths, image_labels))

        # Shuffle BEFORE loading the images: with a buffer as large as the data
        # set, shuffling after map() would keep every decoded image in memory,
        # whereas here the buffer only holds (path, label) pairs.
        path_label_ds = path_label_ds.shuffle(buffer_size=image_count)

        ds = path_label_ds.map(load_and_preprocess_from_path_label, num_parallel_calls=AUTOTUNE)

        # No ds.repeat(): the dataset ends after one pass over the data.
        ds = ds.batch(BATCH_SIZE)
        # prefetch lets the pipeline prepare batches in the background while
        # the model is training.
        ds = ds.prefetch(buffer_size=AUTOTUNE)
        return ds
    

    references

    https://zhuanlan.zhihu.com/p/41422048
    https://www.cnblogs.com/pinard/p/6688348.html

  • 相关阅读:
    MMoE论文笔记
    李宏毅-ELMO, BERT, GPT
    vue 手写一个时间选择器
    this 的几种使用场景
    如何使用markdown编辑器编写文章
    sublime text 3 入门技巧与常见问题解决
    Flex布局介绍
    在github上实现页面托管预览功能
    数据挖掘经典算法——最大期望算法
    数据挖掘经典算法——先验算法
  • 原文地址:https://www.cnblogs.com/gongyanzh/p/15841929.html
Copyright © 2020-2023  润新知