增量学习/训练

针对大型数据集：当数据过大、无法一次性加载到内存时，可以分批读取数据，采用增量训练的方式。

sklearn
lightgbm
tensorflow

sklearn

def generator(all_file_path):
    """Yield ``(byte_ngram, label)`` pairs for every readable file.

    Args:
        all_file_path: Iterable of file paths; each file is read as raw bytes.

    Yields:
        A tuple of ``byteNgram(file_bytes)`` and the value looked up in
        ``label_to_index`` under the name of the directory five levels above
        the file.

    Files that cannot be opened or read are skipped.
    """
    for filename in all_file_path:
        try:
            # The context manager closes the handle even when read() fails,
            # so streaming over many files does not leak descriptors.
            with open(filename, "rb") as handle:
                bytedata = handle.read()
        except (OSError, ValueError):
            # Unreadable file or malformed path: best effort, skip it.
            continue
        byte_ngram = byteNgram(bytedata)
        # The label key is the name of the fifth ancestor directory of the file.
        label = label_to_index[pathlib.Path(filename).parent.parent.parent.parent.parent.name]
        yield byte_ngram, label

def get_batch(data_iter, batch_size):
    """Pull up to ``batch_size`` items off ``data_iter`` and return them as a list.

    The list is shorter than ``batch_size`` (possibly empty) once the
    iterator runs out.
    """
    return list(itertools.islice(data_iter, batch_size))

def iter_batch(data_iter, batch_size):
    """Yield ``(x, y)`` tuples built from successive batches of ``data_iter``.

    Args:
        data_iter: Iterator of ``(x, y)`` pairs.
        batch_size: Maximum number of pairs per batch.

    Yields:
        Two tuples ``(x, y)``: the first and second elements of every pair in
        the batch. A batch that cannot be unzipped into exactly two sequences
        is reported with ``"data error"`` and skipped.
    """
    data = get_batch(data_iter, batch_size)
    while data:
        try:
            x, y = zip(*data)
        except (TypeError, ValueError):
            # Malformed batch (items are not 2-element iterables): skip it.
            print("data error")
        else:
            # The yield sits outside the try body on purpose: a handler
            # wrapping it would also intercept GeneratorExit when the consumer
            # closes the generator early, and the next yield would then raise
            # "generator ignored GeneratorExit".
            yield x, y
        data = get_batch(data_iter, batch_size)

https://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html#sphx-glr-auto-examples-applications-plot-out-of-core-classification-py

上面链接的示例中用到了 HashingVectorizer，这里简单解释一下它的原理：

特征哈希使用两个 hash 函数，目的是避免多个原始特征被哈希到同一位置后词频直接累加、导致该位置的特征值突然变大。

第一个 hash 函数把特征映射到桶的下标，相当于分桶降维；第二个 hash 函数把特征映射到 {-1, +1}，作为累加时的符号，使发生冲突的特征在期望上相互抵消。

引自 Feature Hashing for Large Scale Multitask Learning

lightgbm

自定义生成器

predicts = []
y_train = ()

# Stream mini-batches through both models, updating each one incrementally.
for i, (x_batch_text, y_batch) in enumerate(minibatch_iterators):
    x_batch = vectorizer.transform(x_batch_text)

    # sgd: update the model with this batch only.
    sgd_clf.partial_fit(x_batch, y_batch, classes=all_classes)

    # lgb: continue boosting from the current model when possible.
    try:
        lgb_clf.fit(x_batch, y_batch, init_model=lgb_clf)
    except Exception:
        # Fall back to a fit without init_model (presumably needed on the
        # first batch, when lgb_clf has not been fitted yet -- confirm).
        # `Exception` instead of a bare except so that KeyboardInterrupt and
        # SystemExit stop the run instead of triggering a second fit.
        lgb_clf.fit(x_batch, y_batch)

    # Accumulate labels and predictions over all batches seen so far.
    y_train += y_batch
    predicts = np.hstack([predicts, lgb_clf.predict(x_batch)])

    # NOTE(review): predictions are made on the batch that was just used for
    # fitting, so these scores are likely optimistic.
    if i % 500 == 0:
        print("iter %s ============== " % i)
        metrics(y_train, predicts)

借助pandas

import lightgbm as lgb

def increment():
    """Train a LightGBM multiclass model incrementally over chunks of a CSV.

    Reads the CSV at the module-level ``path`` in chunks, preprocesses every
    chunk with ``shuffle`` and ``pipeline``, and keeps boosting the same model
    on each chunk while validating against the module-level ``x_test`` /
    ``y_test``.

    Returns:
        The booster returned by the last ``lgb.train`` call, or ``None`` when
        the CSV yields no chunks.
    """
    # Step 1: start without a model and define the training parameters.
    gbm = None
    params = {
        'task': 'train',
        'objective': 'multiclass',
        'num_class': "3",
        'boosting_type': 'gbdt',
        'learning_rate': 0.1,
        'num_leaves': 31,
        'tree_learner': 'serial',
        'min_data_in_leaf': 100,
        'metric': ['multi_logloss', 'multi_error'],
        'max_bin': 255,
        # NOTE(review): 'num_trees' is a LightGBM alias for the number of
        # boosting iterations and appears to take precedence over
        # num_boost_round=1000 passed to lgb.train below -- confirm which
        # value is intended.
        'num_trees': 300,
    }
    # Step 2: stream the data, CHUNK_SIZE (1,000,000) rows at a time.
    CHUNK_SIZE = 1000000

    all_data = pd.read_csv(path, chunksize=CHUNK_SIZE)

    for data_chunk in all_data:
        print('Size of uploaded chunk: %i instances, %i features' % (data_chunk.shape))

        # Preprocess the chunk.
        data_chunk = shuffle(data_chunk)
        x_train, y_train = pipeline(data_chunk)

        # Build the LightGBM datasets for this chunk and the fixed test set.
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_eval = lgb.Dataset(x_test, y_test)

        # Step 3: incremental training. `init_model` continues from the model
        # of the previous chunk (None on the first chunk), and
        # `keep_training_booster` keeps the returned booster trainable.
        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of older LightGBM releases; newer releases expect
        # callbacks instead -- confirm against the installed version.
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=1000,
                        valid_sets=lgb_eval,
                        init_model=gbm,
                        early_stopping_rounds=10,
                        verbose_eval=False,
                        keep_training_booster=True)

        # Report the evaluation scores, keyed by metric name.
        score_train = {s[1]: s[2] for s in gbm.eval_train()}
        score_valid = {s[1]: s[2] for s in gbm.eval_valid()}
        print('当前模型在训练集的得分是：loss=%.4f, erro=%.4f'%(score_train['multi_logloss'], score_train['multi_error']))
        print('当前模型在测试集的得分是：loss=%.4f, erro=%.4f' % (score_valid['multi_logloss'], score_valid['multi_error']))
    return gbm
# Run the incremental training and keep the returned model.
gbm = increment()

tensorflow

加载上次保存的模型（网络结构与权重），然后在新的数据上继续训练即可。

# 定义dataset
def load_and_preprocess_from_path_label(path, label):
  """Return ``(load_and_preprocess_image(path), label)``; the label passes through unchanged."""
  image = load_and_preprocess_image(path)
  return image, label

def make_dataset(image_paths, image_labels, image_count, BATCH_SIZE=32, AUTOTUNE=tf.data.experimental.AUTOTUNE):
    """Build a shuffled, batched and prefetched dataset of (image, label) pairs.

    Args:
        image_paths: Image paths, sliced element-wise together with the labels.
        image_labels: Labels aligned with ``image_paths``.
        image_count: Used as the shuffle buffer size.
        BATCH_SIZE: Number of elements per batch.
        AUTOTUNE: Value passed as ``num_parallel_calls`` and as the prefetch
            buffer size.

    Returns:
        The resulting ``tf.data`` dataset.
    """
    path_label_pairs = tf.data.Dataset.from_tensor_slices((image_paths, image_labels))
    return (
        path_label_pairs
        .map(load_and_preprocess_from_path_label, num_parallel_calls=AUTOTUNE)
        # A shuffle buffer as large as the dataset, so the data is fully shuffled.
        .shuffle(buffer_size=image_count)
        # .repeat() is intentionally not applied (it was commented out in the original).
        .batch(BATCH_SIZE)
        # `prefetch` lets the dataset fetch batches in the background during training.
        .prefetch(buffer_size=AUTOTUNE)
    )

references：

https://zhuanlan.zhihu.com/p/41422048
https://www.cnblogs.com/pinard/p/6688348.html

相关阅读:
MMoE论文笔记
 李宏毅-ELMO, BERT, GPT
vue 手写一个时间选择器
 this 的几种使用场景
 如何使用markdown编辑器编写文章
 sublime text 3 入门技巧与常见问题解决
 Flex布局介绍
 在github上实现页面托管预览功能
 数据挖掘经典算法——最大期望算法
 数据挖掘经典算法——先验算法
原文地址：https://www.cnblogs.com/gongyanzh/p/15841929.html