

    Model-Based Recall with SimCSE

    dataset

    unsupervised

    import math
    import random
    import time

    import numpy as np
    import tensorflow as tf
    
    
    class UnsuperviseData(tf.keras.utils.Sequence):
        def __init__(self, x_set, batch_size):
            self.x = x_set
            self.batch_size = batch_size
    
        def __len__(self):
            return math.ceil(len(self.x) / self.batch_size)
    
        def __getitem__(self, idx):
            batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
            n = len(batch_x)  # may be smaller than batch_size for the last batch
            # repeat each sentence so rows 2k and 2k+1 hold the same text; dropout in the
            # encoder turns the two copies into two different views (the positive pair)
            batch_x = batch_x + batch_x
            bx = np.array([batch_x[i::n] for i in range(n)]).flatten().tolist()
            return self._tokenizer(bx)
    
        def _tokenizer(self,x):
            return tokenizer(x, max_length=50, padding=True,truncation=True,return_tensors="tf")
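
    To make the row layout concrete, here is a minimal sketch with toy sentences; it assumes the
    Hugging Face tokenizer created in the model section below already exists:

    docs = ["今天天气不错", "我想去跑步", "明天有会议"]
    ds = UnsuperviseData(docs, batch_size=3)
    batch = ds[0]
    # each sentence appears twice in adjacent rows, so input_ids has shape (6, seq_len);
    # rows (0,1), (2,3), (4,5) are the pairs that dropout will turn into different views
    print(batch["input_ids"].shape)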
    

    supervised

    class SuperviseData(tf.keras.utils.Sequence):
        def __init__(self, query_set, doc_set, corpus, batch_size):
            self.querys = query_set
            self.docs = doc_set
            self.corpus = corpus
            self.batch_size = batch_size
            self.size = len(self.corpus)
    
        def __len__(self):
            return math.ceil(len(self.querys) / self.batch_size)
    
        def __getitem__(self, idx):
            batch_query = self.querys[idx * self.batch_size: (idx + 1) * self.batch_size]
            batch_doc = self.docs[idx * self.batch_size: (idx + 1) * self.batch_size]
            n = len(batch_query)  # may be smaller than batch_size for the last batch
            # naive in-batch negatives: rotate the positive docs by a random non-zero offset,
            # so each query's negative is another query's positive doc from the same batch
            randix = random.randint(1, n - 1) if n > 1 else 0
            neg_doc = batch_doc[randix:] + batch_doc[:randix]
            # flatten into (query, positive doc, negative doc) triplets: rows 3k, 3k+1, 3k+2
            bx = np.array([(batch_query[i], batch_doc[i], neg_doc[i]) for i in range(n)]).flatten().tolist()
            return self._tokenizer(bx)
    
        def _tokenizer(self, inputs):
            return tokenizer(inputs, max_length=50, padding=True,truncation=True,return_tensors="tf")
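
    The supervised batch is laid out as flattened (query, positive doc, negative doc) triplets.
    A minimal sketch with toy data (again assuming the tokenizer from the model section exists):

    queries = ["蓝牙耳机推荐", "马拉松跑鞋"]
    docs = ["高性价比蓝牙耳机测评", "适合马拉松的跑鞋评测"]
    ds = SuperviseData(queries, docs, corpus=docs, batch_size=2)
    batch = ds[0]
    # rows are [q0, pos0, neg0, q1, pos1, neg1], so input_ids has shape (6, seq_len);
    # each query's negative is simply another query's positive doc from the same batch
    print(batch["input_ids"].shape)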
    

    loss

    The core of contrastive learning is the loss function; here is the TensorFlow implementation of the SimCSE losses.
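
    For reference, the unsupervised SimCSE objective from the original paper is, for a batch of
    N sentences where h_i and h_i^+ are two dropout views of sentence i and \tau is the
    temperature (alpha = 0.05 in the code below):

    \ell_i = -\log \frac{e^{\cos(h_i, h_i^{+}) / \tau}}{\sum_{j=1}^{N} e^{\cos(h_i, h_j^{+}) / \tau}}

    The supervised variant replaces h_j^+ with annotated positive and hard-negative documents.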

    Assume the embedding vectors have dimension 3:

    y_pred = tf.random.uniform((6,3))

    def unsupervise_loss(y_pred, alpha=0.05):
        idxs = tf.range(y_pred.shape[0])
        y_true = idxs + 1 - idxs % 2 * 2	# [1 0 3 2 5 4]
        y_pred = tf.math.l2_normalize(y_pred, axis=1)
        similarities = tf.matmul(y_pred, y_pred,adjoint_b = True)
        similarities = similarities - tf.eye(tf.shape(y_pred)[0]) * 1e12
        similarities = similarities / alpha	# (6,6)
        loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, similarities, from_logits=True)	# softmax (6,)
        return tf.reduce_mean(loss)
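
    A quick sanity check of the label trick (a sketch; with dimension 3 the numbers are only
    illustrative):

    base = tf.random.normal((3, 3))
    paired = tf.repeat(base, repeats=2, axis=0)           # rows (0,1), (2,3), (4,5) are identical pairs
    print(unsupervise_loss(paired))                       # typically near zero: each row's positive is its twin
    print(unsupervise_loss(tf.random.normal((6, 3))))     # typically much larger: positives are unrelated rows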
    
    def supervise_loss(y_pred, alpha=0.05):
        row = tf.range(0, y_pred.shape[0], 3)	# 0 3
        col = tf.range(y_pred.shape[0])	
        col = tf.squeeze(tf.where(col % 3 != 0),axis=1)	# 1 2 4 5
        y_true = tf.range(0, len(col), 2)	# [0 2]
        y_pred = tf.math.l2_normalize(y_pred, axis=1)
        similarities = tf.matmul(y_pred, y_pred,adjoint_b = True)
    
        similarities = tf.gather(similarities, row, axis=0)
        similarities = tf.gather(similarities, col, axis=1)
    
        similarities = similarities / alpha	# (2,4)
        loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, similarities, from_logits=True)
        return tf.reduce_mean(loss)
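
    To see what the gathers do: with two triplets the six rows are [q0, p0, n0, q1, p1, n1].
    A small sketch:

    y_pred = tf.random.normal((6, 3))     # [q0, p0, n0, q1, p1, n1]
    # rows 0 and 3 (the queries) are scored against columns 1, 2, 4, 5 (all pos/neg docs),
    # giving a (2, 4) matrix in which q0's positive sits at column 0 and q1's at column 2,
    # hence y_true = [0 2]; every other column acts as a negative
    print(supervise_loss(y_pred))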
    

    model

    from transformers import AutoConfig,AutoTokenizer,TFAutoModel
    
    MODEL_NAME = "hfl/chinese-roberta-wwm-ext"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    config = AutoConfig.from_pretrained(MODEL_NAME) 
    # backbone = TFAutoModel.from_pretrained(MODEL_NAME)
    
    class baseModel(tf.keras.Model):
        def __init__(self,MODEL_NAME,finetune=False):
            super().__init__()
            self.backbone = TFAutoModel.from_pretrained(MODEL_NAME)
            if not finetune:
                self.backbone.trainable = False
                print("backbone frozen (not fine-tuned)")
            self.drop = tf.keras.layers.Dropout(0.2)
            self.dense_layer = tf.keras.layers.Dense(128)
            
        def call(self, inputs, training=False):
            # pass training through so the backbone's dropout stays active during training;
            # unsupervised SimCSE relies on dropout to create two different views of each sentence
            x = self.backbone(inputs, training=training)[1]  # [1] is the pooler_output
            # x = self.drop(x)
            x = self.dense_layer(x)
            return x
    
    model = baseModel(MODEL_NAME,finetune=False)
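
    A quick shape check of the encoder output (a sketch):

    sample = tokenizer(["今天天气不错", "附近有什么好吃的"], max_length=50,
                       padding=True, truncation=True, return_tensors="tf")
    print(model(sample).shape)    # (2, 128): one 128-dimensional embedding per sentence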
    

    train

    unsupervised training
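
    The original post does not show the optimizer; a plain Adam with a small learning rate is
    assumed for the training loops below:

    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)   # assumed, not specified in the original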

    epochs = 5
    batch_size = 64
    
    t0 = time.time()
    for i in range(epochs):
        ds = UnsuperviseData(doc_df["doc_content"].values.tolist(), batch_size)
        print(f"epoch {i}, training ")
        for step, batchx in enumerate(ds):
            with tf.GradientTape() as tape:
                y_pred = model(batchx, training=True)
                loss = unsupervise_loss(y_pred)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
            if step % 50 == 0:
                print("step {}; loss {:.3f}; elapsed {:.1f}s".format(step, float(loss), time.time() - t0))
    

    supervised training

    epochs = 5
    batch_size = 32
    
    t0 = time.time()
    for i in range(epochs):
        ds = SuperviseData(train_data["query_content"].values.tolist(),train_data["doc_content"].values.tolist(), doc_df["doc_content"].values.tolist(), batch_size)
        print(f"epoch {i}, training ")
        for step, batchx in enumerate(ds):
            with tf.GradientTape() as tape:
                y_pred = model(batchx, training=True)
                loss = supervise_loss(y_pred)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
            if step % 50 == 0:
                print("step {}; loss {:.3f}; elapsed {:.1f}s".format(step, float(loss), time.time() - t0))
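
    After training, recall itself is nearest-neighbour search over the document embeddings.
    A minimal sketch using brute-force cosine similarity (a production system would typically
    build a Faiss or similar ANN index instead); `encode` is a hypothetical helper:

    def encode(texts, batch_size=128):
        """Encode texts into L2-normalized embeddings with the trained model."""
        embs = []
        for i in range(0, len(texts), batch_size):
            inputs = tokenizer(texts[i:i + batch_size], max_length=50, padding=True,
                               truncation=True, return_tensors="tf")
            embs.append(tf.math.l2_normalize(model(inputs), axis=1))
        return tf.concat(embs, axis=0)

    doc_emb = encode(doc_df["doc_content"].values.tolist())    # (num_docs, 128)
    query_emb = encode(["无线耳机 推荐"])                        # (1, 128)
    scores = tf.matmul(query_emb, doc_emb, transpose_b=True)   # cosine similarities
    topk = tf.math.top_k(scores, k=10)                         # top-10 recalled doc indices in topk.indices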
    

    References:

    E-commerce search recall. https://github.com/muyuuuu/E-commerce-Search-Recall?spm=5176.21852664.0.0.79006ebf02bd2j

    SimCSE in PyTorch. https://github.com/zhengyanzhao1997/NLP-model/tree/main/model/model/Torch_model/SimCSE-Chinese

    A walkthrough of the SimCSE loss implementation. https://zhuanlan.zhihu.com/p/377862950

    An introduction to SimCSE with core code explained: unsupervised sentence embeddings. https://zhuanlan.zhihu.com/p/462763973

    In-batch negative sampling: https://github.com/facebookresearch/DPR/issues/110
    (the figure in that issue assumes batch_size=4 and hard_negatives=1)
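
    Under that layout (each query's positive doc immediately followed by its hard negative, the
    same ordering used in the supervised batch above), the positive column index for query i is
    i * 2. A tiny sketch of the label construction, assuming batch_size=4 and one hard negative:

    batch_size, hard_negatives = 4, 1
    # score matrix: 4 queries x 8 documents (4 positives + 4 hard negatives);
    # every non-positive column, including other queries' documents, acts as a negative
    labels = tf.range(0, batch_size * (1 + hard_negatives), 1 + hard_negatives)   # [0 2 4 6]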
