• transformer 源码


    训练时:
    1. 将正确标签(完整的目标序列)一次性输入解码器,一次前向计算即可并行解码出所有位置的输出

    预测时:
    1. 第一次输入1个词,解码出一个词
    2. 第二次把第一次输入的词和第一次解码出来的词一起输入,解码出下一个词;这样依次解码,直到达到最大长度或者解码出<pad>,就结束。
    训练时一次性输入全部标签,与预测时一个一个输入的效果是一样的:因为解码器的自注意力使用了causality mask(屏蔽未来位置),每个位置只能看到它之前的词。

    1. 需要传入词向量

        def __init__(self, hp):
            """Keep the hyper-parameters and build the vocabulary maps and embedding table."""
            self.hp = hp

            # In a real application, pass in your own vocabulary here.
            vocab_maps = load_vocab(hp.vocab)
            self.token2idx, self.idx2token = vocab_maps

            # The author trains the embeddings from a freshly defined variable; in
            # production, pretrained vectors such as word2vec or BERT could be used.
            self.embeddings = get_token_embeddings(
                self.hp.vocab_size,
                self.hp.d_model,
                zero_pad=True,
            )

    2. positional_encoding

    def positional_encoding(inputs,
                            num_units,
                            zero_pad=True,
                            scale=True,
                            scope="positional_encoding",
                            reuse=None):
        '''Sinusoidal positional encoding.

        Args:
          inputs: A 2d Tensor with shape of (N, T). Only its shape is used. T must
            be statically known (the table is built with NumPy at graph
            construction time); N may be dynamic (None).
          num_units: Output dimensionality.
          zero_pad: Boolean. If True, the row for position 0 is replaced by zeros.
          scale: Boolean. If True, the output is multiplied by sqrt(num_units).
          scope: Optional scope for `variable_scope`.
          reuse: Boolean, forwarded to `variable_scope`.

        Returns:
          A Tensor with one more rank than `inputs`: (N, T, num_units). Its dtype
          is the dtype of the NumPy table (float64), so callers that need float32
          must cast.

        Raises:
          ValueError: If the time dimension of `inputs` is not statically known.
        '''
        # Unpacking two values also checks that `inputs` is rank 2.
        _, T = inputs.get_shape().as_list()
        if T is None:
            raise ValueError(
                "positional_encoding needs a statically known time dimension")

        with tf.variable_scope(scope, reuse=reuse):
            # Take the batch size from the dynamic shape so that placeholders
            # declared with a `None` batch dimension work as well.
            batch_size = tf.shape(inputs)[0]
            position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [batch_size, 1])

            # First part of the PE function: the sin/cos argument.
            # Columns 2i and 2i+1 share the frequency 10000^(2i/num_units), so the
            # exponent uses the even index of each pair (i - i % 2), not the raw
            # column index.
            pos = np.arange(T, dtype=np.float64)[:, np.newaxis]
            col = np.arange(num_units)[np.newaxis, :]
            position_enc = pos / np.power(10000.0, (col - col % 2) / float(num_units))

            # Second part: sin on even columns, cos on odd columns.
            position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
            position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1

            # Convert to a tensor
            lookup_table = tf.convert_to_tensor(position_enc)

            if zero_pad:
                # The zero row must have the table's dtype; tf.zeros defaults to
                # float32 and tf.concat rejects mixed dtypes.
                zero_row = tf.zeros(shape=[1, num_units], dtype=lookup_table.dtype)
                lookup_table = tf.concat((zero_row, lookup_table[1:, :]), 0)
            outputs = tf.nn.embedding_lookup(lookup_table, position_ind)

            if scale:
                outputs = outputs * num_units**0.5

            return outputs

    3. multihead_attention

    def multihead_attention(queries,
                            keys,
                            num_units=None,
                            num_heads=8,
                            dropout_rate=0,
                            is_training=True,
                            causality=False,
                            scope="multihead_attention",
                            reuse=None):
        '''Applies multihead attention, followed by a residual connection and
        layer normalization.

        Args:
          queries: A 3d tensor with shape of [N, T_q, C_q].
          keys: A 3d tensor with shape of [N, T_k, C_k]. Also used as the values.
          num_units: A scalar. Attention size. Defaults to C_q. Because of the
            residual connection it has to equal C_q.
          num_heads: An int. Number of heads.
          dropout_rate: A floating point number.
          is_training: Boolean. Controller of mechanism for dropout.
          causality: Boolean. If true, units that reference the future are masked.
          scope: Optional scope for `variable_scope`.
          reuse: Boolean, whether to reuse the weights of a previous layer
            by the same name.

        Returns
          A 3d tensor with shape of (N, T_q, C)
        '''
        with tf.variable_scope(scope, reuse=reuse):
            # Set the fall back option for num_units
            if num_units is None:
                num_units = queries.get_shape().as_list()[-1]

            # Linear projections (C is num_units)
            Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C)
            K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)
            V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)

            # Split and concat: heads are stacked along the batch axis
            Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h)
            K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h)
            V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h)

            # Multiplication
            outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k)

            # Scale
            outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)

            # Key Masking: a key position whose feature vector is all zeros gets
            # mask value 0 and is excluded from attention.
            key_masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1)) # (N, T_k)
            key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k)
            key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k)

            # `paddings` has the same shape as `outputs` and holds a very large
            # negative number everywhere. Where the key mask is 0 the attention
            # score is replaced by that number (so it vanishes after the softmax);
            # elsewhere the original score is kept. The shape stays
            # (h*N, T_q, T_k).
            paddings = tf.ones_like(outputs)*(-2**32+1)
            is_masked_key = tf.equal(key_masks, 0)
            outputs = tf.where(is_masked_key, paddings, outputs) # (h*N, T_q, T_k)

            # Causality = Future blinding
            if causality: # hide information from future positions
                diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k)
                tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense() # (T_q, T_k)
                masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k)

                paddings = tf.ones_like(masks)*(-2**32+1)
                outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k)

            # Activation
            outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k)

            # Query Masking: zero the attention rows of all-zero query positions
            query_masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1)) # (N, T_q)
            query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q)
            query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k)
            outputs *= query_masks # (h*N, T_q, T_k)

            # Dropouts
            outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))

            # Weighted sum
            outputs = tf.matmul(outputs, V_) # (h*N, T_q, C/h)

            # Restore shape
            outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) # (N, T_q, C)

            # Residual connection
            outputs += queries

            # Normalize
            outputs = normalize(outputs) # (N, T_q, C)

        return outputs

    4. feedforward

    def feedforward(inputs,
                    num_units=(2048, 512),
                    scope="multihead_attention",
                    reuse=None):
        '''Point-wise feed forward net with residual connection and layer
        normalization.

        Args:
          inputs: A 3d tensor with shape of [N, T, C].
          num_units: A sequence of two integers: the inner layer size and the
            output size. Because of the residual connection the output size has
            to equal C. The default is an immutable tuple so it cannot be shared
            and mutated across calls.
          scope: Optional scope for `variable_scope`.
            NOTE(review): the default "multihead_attention" looks like a
            copy-paste leftover; it is kept because changing it would rename the
            variables this function creates.
          reuse: Boolean, whether to reuse the weights of a previous layer
            by the same name.

        Returns:
          A 3d tensor with the same shape and dtype as inputs
        '''
        inner_units, output_units = num_units[0], num_units[1]

        with tf.variable_scope(scope, reuse=reuse):
            # Inner layer: a kernel-size-1 convolution acts on every position
            # independently.
            outputs = tf.layers.conv1d(inputs=inputs,
                                       filters=inner_units,
                                       kernel_size=1,
                                       activation=tf.nn.relu,
                                       use_bias=True)

            # Readout layer
            outputs = tf.layers.conv1d(inputs=outputs,
                                       filters=output_units,
                                       kernel_size=1,
                                       activation=None,
                                       use_bias=True)

            # Residual connection
            outputs += inputs

            # Normalize
            outputs = normalize(outputs)

        return outputs

    5. normalize

    def normalize(inputs,
                  epsilon = 1e-8,
                  scope="ln",
                  reuse=None):
        '''Layer normalization over the last axis of `inputs`.

        Args:
          inputs: A tensor with 2 or more dimensions, where the first dimension has
            `batch_size`.
          epsilon: A small float added to the variance before the square root so
            the division never hits zero.
          scope: Optional scope for `variable_scope`.
          reuse: Boolean, forwarded to `variable_scope`. The scale and shift
            parameters are created with `tf.Variable`, so every call creates a
            new pair.

        Returns:
          A tensor with the same shape and data dtype as `inputs`.
        '''
        with tf.variable_scope(scope, reuse=reuse):
            # Per-position statistics over the feature axis.
            mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)

            # One learnable shift (beta) and scale (gamma) per feature.
            feature_shape = inputs.get_shape()[-1:]
            beta = tf.Variable(tf.zeros(feature_shape))
            gamma = tf.Variable(tf.ones(feature_shape))

            stddev = (variance + epsilon) ** (.5)
            centered = inputs - mean
            outputs = gamma * (centered / stddev) + beta

        return outputs

    6. encoder-decoder

                # Encoder: embed the source ids in self.x, add position
                # information, apply dropout, then run hp.num_blocks blocks of
                # self-attention followed by a feed-forward net. The result is
                # kept in self.enc.
                with tf.variable_scope("encoder"):
                    ## Embedding
                    self.enc = embedding(self.x, 
                                          vocab_size=len(de2idx), 
                                          num_units=hp.hidden_units, 
                                          scale=True,
                                          scope="enc_embed")

                    # key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1)), -1)

                    ## Positional Encoding
                    # hp.sinusoid selects the fixed sinusoidal table; otherwise a
                    # position embedding with hp.maxlen entries is looked up.
                    if hp.sinusoid:
                        self.enc += tf.cast(positional_encoding(self.x,
                                            num_units=hp.hidden_units,
                                            zero_pad=False,
                                            scale=False,
                                            scope="enc_pe"), tf.float32)
                    else:
                        self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]),
                                              vocab_size=hp.maxlen,
                                              num_units=hp.hidden_units,
                                              zero_pad=False,
                                              scale=False,
                                              scope="enc_pe")

                    # self.enc *= key_masks

                    ## Dropout
                    self.enc = tf.layers.dropout(self.enc, 
                                                 rate=hp.dropout_rate,
                                                 training=tf.convert_to_tensor(is_training))

                    ## Blocks
                    for i in range(hp.num_blocks):
                        with tf.variable_scope("num_blocks_{}".format(i)):
                            ### Multihead Attention
                            # Self-attention: queries and keys are both self.enc;
                            # no causal mask is applied.
                            self.enc = multihead_attention(queries=self.enc, 
                                                            keys=self.enc, 
                                                            num_units=hp.hidden_units, 
                                                            num_heads=hp.num_heads, 
                                                            dropout_rate=hp.dropout_rate,
                                                            is_training=is_training,
                                                            causality=False)

                            ### Feed Forward
                            self.enc = feedforward(self.enc, num_units=[4*hp.hidden_units, hp.hidden_units])
                
                # Decoder
                # Embeds self.decoder_inputs, adds position information, applies
                # dropout, then runs hp.num_blocks blocks of causal
                # self-attention, attention over the encoder output (self.enc)
                # and a feed-forward net. The result is kept in self.dec.
                with tf.variable_scope("decoder"):
                    ## Embedding
                    self.dec = embedding(self.decoder_inputs, 
                                          vocab_size=len(en2idx), 
                                          num_units=hp.hidden_units,
                                          scale=True, 
                                          scope="dec_embed")
                    # Keep a reference to the embedding before positions are added.
                    self.dec_ = self.dec

                    # key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

                    ## Positional Encoding
                    # hp.sinusoid selects the fixed sinusoidal table; otherwise a
                    # position embedding with hp.maxlen entries is looked up.
                    if hp.sinusoid:
                        self.dec += tf.cast(positional_encoding(self.decoder_inputs,
                                                        num_units=hp.hidden_units,
                                                        zero_pad=False,
                                                        scale=False,
                                                        scope="dec_pe"), tf.float32)
                    else:
                        self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen, 
                                          num_units=hp.hidden_units, 
                                          zero_pad=False, 
                                          scale=False,
                                          scope="dec_pe")
                    # self.dec *= key_masks

                    ## Dropout
                    self.dec = tf.layers.dropout(self.dec, 
                                                rate=hp.dropout_rate, 
                                                training=tf.convert_to_tensor(is_training))

                    ## Blocks
                    for i in range(hp.num_blocks):
                        with tf.variable_scope("num_blocks_{}".format(i)):
                            ## Multihead Attention ( self-attention)
                            # causality=True masks future positions.
                            self.dec = multihead_attention(queries=self.dec, 
                                                            keys=self.dec, 
                                                            num_units=hp.hidden_units, 
                                                            num_heads=hp.num_heads, 
                                                            dropout_rate=hp.dropout_rate,
                                                            is_training=is_training,
                                                            causality=True, 
                                                            scope="self_attention")

                            ## Multihead Attention ( vanilla attention)
                            # Decoder states are the queries; the encoder output
                            # supplies the keys.
                            self.dec = multihead_attention(queries=self.dec, 
                                                           keys=self.enc,
                                                            num_units=hp.hidden_units,
                                                            num_heads=hp.num_heads,
                                                            dropout_rate=hp.dropout_rate,
                                                            is_training=is_training,
                                                            causality=False,
                                                            scope="vanilla_attention")
                            ## Feed Forward
                            self.dec = feedforward(self.dec, num_units=[4*hp.hidden_units, hp.hidden_units])
    # Final linear projection
    # Project the decoder output to one logit per entry of en2idx.
    self.logits = tf.layers.dense(self.dec, len(en2idx))
    # Predicted id = index of the largest logit along the last axis.
    self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
    # 1.0 where the label id is non-zero, 0.0 elsewhere
    # (id 0 is presumably the padding id -- TODO confirm against the vocabulary).
    self.istarget = tf.to_float(tf.not_equal(self.y, 0))
    # Accuracy counted only over positions where istarget is 1.
    self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget) / (tf.reduce_sum(self.istarget))
    tf.summary.scalar('acc', self.acc)

    7. train

                # The loss, optimizer and merged summaries are only built when
                # is_training is true.
                if is_training:  
                    # Loss
                    # Cross entropy against label-smoothed one-hot targets,
                    # averaged over the positions where self.istarget is 1.
                    self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
                    self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
                    self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget))

                    # Training Scheme
                    # global_step is passed to minimize() below as its step counter.
                    self.global_step = tf.Variable(0, name='global_step', trainable=False)
                    self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
                    self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

                    # Summary 
                    tf.summary.scalar('mean_loss', self.mean_loss)
                    self.merged = tf.summary.merge_all()
  • 相关阅读:
    Win10
    编码
    [转帖] Tomcat安全配置小技巧
    关于redis bind
    query data filtered by a JSON Column in SQLAlchemy
    Flask多线程环境下logging
    Flask request
    [转] MySQL树结构递归查询处理
    [转]了解BFF架构
    转载:ELK实战系列3-RabbitMQ+ELK搭建日志平台
  • 原文地址:https://www.cnblogs.com/callyblog/p/10430731.html
Copyright © 2020-2023  润新知