• A TF 2.0 Transformer implementation (with minor modifications)


    import sys
    
    # import keras
    import numpy as np
    import tensorflow as tf
    import matplotlib.pyplot as plt
    from tensorflow.keras import layers
    import os
    
    
    #   Low-level device / GPU configuration
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    
    def my_padding(x,size):
        # For each (seq_len, features) sample in the batch: prepend a 0.3-valued start row,
        # zero-pad the sequence up to `size` rows, then append a 0.3-valued end row,
        # so every sample comes out with size + 2 rows.
        result = np.array([])
        bat_size = x.shape[0]
        for i in range(bat_size):
            zero_mat = np.zeros((size - x.shape[1],x.shape[2]))
            start_mat = np.ones((1,x.shape[2])) * 0.3
            end_mat = np.ones((1,x.shape[2])) * 0.3
            each_x = x[i]
    
            each_x = np.vstack([start_mat,each_x])
            pad_x = np.vstack([each_x,zero_mat])
            pad_x = np.vstack([pad_x,end_mat])
            pad_x = pad_x.reshape(1, size+2, x.shape[2])
            if result.shape[0] == 0:
                result = pad_x
            else:
                result = np.vstack([result,pad_x])
    
        return result
    # test_x = np.array([
    #     [[111, 112, 113],
    #      [121, 122, 123],
    #      ],
    #     [[211, 212, 213],
    #      [221, 222, 223],
    #      ],
    #     [[311, 312, 313],
    #      [321, 322, 323],
    #      ],
    # ])
    # a = my_padding(test_x,6)
    # print(a.shape)
    # print(a)
    # sys.exit(2)
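    # If the test above is uncommented, a.shape comes out as (3, 8, 3): each 2-row
    # sample gains a 0.3 start row, four zero rows of padding, and a 0.3 end row.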
    
    def get_angles(pos, i, d_model):
        # here i plays the role of both 2i and 2i+1 in the positional-encoding formula (written out below)
        angle_rates = 1 / np.power(10000, (2*(i // 2))/ np.float32(d_model))
        return pos * angle_rates
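    # Reference: the sinusoidal positional encoding from "Attention Is All You Need":
    #   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
    #   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))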
    
    def positional_encoding(position, d_model):
        # print('position:',position)
    
        angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                                np.arange(d_model)[np.newaxis, :],
                                d_model)
        # use sin for the 2i-th dimensions
        sines = np.sin(angle_rads[:, 0::2])
        # use cos for the (2i+1)-th dimensions
        cosines = np.cos(angle_rads[:, 1::2])
        pos_encoding = np.concatenate([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[np.newaxis, ...]
    
        return tf.cast(pos_encoding, dtype=tf.float32)
    # pos_encoding = positional_encoding(50, 512)
    
    # def create_padding_mark(seq):
    #     # find the positions padded with 0
    #     # print('seq:', seq)
    #     seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    #     # add dimensions so the mask can be broadcast over the attention matrix
    #     return seq[:, np.newaxis, np.newaxis, :]  # (batch_size,1,1,seq_len)
    #     # print('seq_mask:',seq)
    #     return seq  # (batch_size,seq_len)
    def create_padding_mark(targets):
        # find the padding rows (rows whose maximum over the last axis is 0)
        # print('targets:',targets.shape)
        zero_mask = np.max(targets,axis=-1)
        zero_mask = tf.cast(tf.math.equal(zero_mask, 0), tf.float32)
        zero_mask = tf.reshape(zero_mask,(-1,1))
        one_mat = np.ones((targets.shape[0],targets.shape[0]))
        # print('zero_mask:',zero_mask.shape,zero_mask)
        result_mask1 = np.multiply(one_mat,zero_mask)
        result_mask2 = np.multiply(one_mat, tf.transpose(zero_mask))
        result_mask = tf.maximum(result_mask1,result_mask2)
        # print('result_mask:', result_mask.shape, result_mask)
        # sys.exit(2)
        return result_mask  # (targets.shape[0], targets.shape[0])
    # mask test (note: the input should be an np.array so that .shape works)
    # create_padding_mark(np.array([[1,2,0,0,3],[3,4,5,0,0],[0,0,0,0,0]]))
    # sys.exit(2)
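    # For the sample above (rows 1 and 2 contain data, row 3 is all zeros) the
    # expected mask is:
    # [[0., 0., 1.],
    #  [0., 0., 1.],
    #  [1., 1., 1.]]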
    def create_look_ahead_mark(size):
        # print('size:',size)
        # 1 minus the lower-triangular part (num_lower=-1 keeps every lower diagonal)
        # this builds, for each time step, a mask over the tokens that have not been predicted yet
        mark = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)        #   (a, numLower, numUpper)
        # print('mark.shapeL:',mark.shape)
        # sys.exit(2)
        # mark = np.reshape(mark,())
        mark = tf.cast(mark,tf.float32)
        return mark  # (seq_len, seq_len)
    # temp = create_look_ahead_mark(3)
    # print(temp)
    # sys.exit(2)
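    # For size 3 the expected look-ahead mask is:
    # [[0., 1., 1.],
    #  [0., 0., 1.],
    #  [0., 0., 0.]]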
    # build the masks
    def create_mask(targets):
    
        # loss_mark = np.max(targets,axis=-1)
        # loss_mark = tf.cast(tf.math.equal(loss_mark, 0), tf.float32)
        # loss_mark = tf.reshape(loss_mark,(-1,1))
        # # print('loss_mark:',loss_mark,loss_mark.shape)
        # decode_targets_padding_mask = create_padding_mark(targets)
        # decode_targets_padding_mask = tf.cast(decode_targets_padding_mask,tf.float32)
        # look-ahead mask: hides the tokens that have not been predicted yet
        look_ahead_mask = create_look_ahead_mark(targets.shape[0])
        look_ahead_mask = tf.cast(look_ahead_mask, tf.float32)
        # print('look_ahead_mask:', look_ahead_mask.shape, type(look_ahead_mask.shape))
        # combine the masks for the first decoder sub-layer
        # combine_mask = tf.maximum(decode_targets_padding_mask, look_ahead_mask)
        # print('combine_mask:', combine_mask.shape)
        # return combine_mask, loss_mark
        return look_ahead_mask
    
    # sys.exit(2)
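
    # The next function implements the standard scaled dot-product attention:
    #   Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V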
    
    def scaled_dot_product_attention(q, k, v, mask):
        # multiply query and key to get the matching scores
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        # scale by dk
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # apply the mask
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        # softmax to obtain the attention weights
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

        # weight the values by the attention
        output = tf.matmul(attention_weights, v)  # (.., seq_len_q, depth)

        return output, attention_weights
    #   attention test
    # def print_out(q, k, v):
    #     temp_out, temp_att = scaled_dot_product_attention(
    #     q, k, v, None)
    #     print('attention weight:')
    #     print(temp_att)
    #     print('output:')
    #     print(temp_out)
    # np.set_printoptions(suppress=True)
    # temp_k = tf.constant([[10,0,0],
    #                       [0,10,0],
    #                       [0,0,10],
    #                       [0,0,10]], dtype=tf.float32)  # (4, 3)
    #
    # temp_v = tf.constant([[   1,0],
    #                       [  10,0],
    #                       [ 100,5],
    #                       [1000,6]], dtype=tf.float32)  # (4, 3)
    # # attend to the 2nd key and return the corresponding value
    # temp_q = tf.constant([[0,10,0]], dtype=tf.float32)
    # print_out(temp_q, temp_k, temp_v)
    
    
    # build the multi-head attention layer
    class MutilHeadAttention(tf.keras.layers.Layer):
        def __init__(self, d_model, num_heads):
            super(MutilHeadAttention, self).__init__()
            self.num_heads = num_heads
            self.d_model = d_model
    
            # d_model must be evenly divisible by the number of heads
            assert d_model % num_heads == 0
            # dimensionality of each head after the split
            self.depth = d_model // num_heads
    
            self.wq = tf.keras.layers.Dense(d_model)
            self.wk = tf.keras.layers.Dense(d_model)
            self.wv = tf.keras.layers.Dense(d_model)
    
            self.dense = tf.keras.layers.Dense(d_model)
    
        def split_heads(self, x, batch_size):
            # split into heads and move the head dimension in front of seq_len
            x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
            return tf.transpose(x, perm=[0, 2, 1, 3])
    
        def call(self, v, k, q, mask):
            batch_size = tf.shape(q)[0]
    
            # linear projections before the split, producing the q, k, v representations
            # print('or_q:',q.shape)
            q = self.wq(q)  # (batch_size, seq_len, d_model)
            # print('or_q:', q.shape)
            # sys.exit(2)
            k = self.wk(k)
            v = self.wv(v)
    
            # split heads
            q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
            k = self.split_heads(k, batch_size)
            v = self.split_heads(v, batch_size)
            # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
            # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)

            # scaled dot-product attention
            scaled_attention, attention_weights = scaled_dot_product_attention(
                q, k, v, mask)
            # move the head dimension back
            scaled_attention = tf.transpose(scaled_attention, [0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

            # concatenate the heads
            concat_attention = tf.reshape(scaled_attention,
                                          (batch_size, -1, self.d_model))

            # final linear projection
            output = self.dense(concat_attention)
            return output, attention_weights
    # temp_mha = MutilHeadAttention(d_model=512, num_heads=8)
    # # y = tf.random.uniform((1, 60, 512))
    # y = tf.random.uniform((2, 60, 512))
    # # y = tf.random.uniform((1, 2,60, 32))
    # output, att = temp_mha(y, k=y, q=y, mask=None)
    # print('x:{}'.format(y.shape))
    # print("out:{},att:{}".format(output.shape, att.shape))
    
    def point_wise_feed_forward_network(d_model, diff):
        return tf.keras.Sequential([
            tf.keras.layers.Dense(diff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
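    # A quick shape check (hypothetical example, not part of the original script):
    # sample_ffn = point_wise_feed_forward_network(512, 2048)
    # print(sample_ffn(tf.random.uniform((64, 50, 512))).shape)  # expected: (64, 50, 512)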
    
    class LayerNormalization(tf.keras.layers.Layer):
        def __init__(self, epsilon=1e-6, **kwargs):
            self.eps = epsilon
            super(LayerNormalization, self).__init__(**kwargs)
        def build(self, input_shape):
            self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:],
                                         initializer=tf.ones_initializer(), trainable=True)
            self.beta = self.add_weight(name='beta', shape=input_shape[-1:],
                                        initializer=tf.zeros_initializer(), trainable=True)
            super(LayerNormalization, self).build(input_shape)
        def call(self, x):
            mean = tf.keras.backend.mean(x, axis=-1, keepdims=True)
            std = tf.keras.backend.std(x, axis=-1, keepdims=True)
            return self.gamma * (x - mean) / (std + self.eps) + self.beta
        def compute_output_shape(self, input_shape):
            return input_shape
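    # Note: this hand-rolled layer is roughly equivalent to the built-in
    # tf.keras.layers.LayerNormalization(epsilon=1e-6); the difference is that it
    # divides by (std + eps) rather than sqrt(variance + eps).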
    
    class EncoderLayer(tf.keras.layers.Layer):
        def __init__(self, d_model, n_heads, ddf, dropout_rate=0.1):
            super(EncoderLayer, self).__init__()
    
            self.mha = MutilHeadAttention(d_model, n_heads)             #   return (output, attention_weights)
            self.ffn = point_wise_feed_forward_network(d_model, ddf)
    
            self.layernorm1 = LayerNormalization(epsilon=1e-6)
            self.layernorm2 = LayerNormalization(epsilon=1e-6)
    
            self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
            self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
    
        ###########################################################     an extra block-level residual connection could be added here     ##########################################################
        def call(self, inputs, training, mask):
            # multi-head attention
            att_output, _ = self.mha(inputs, inputs, inputs, mask)      #   return (output, attention_weights)
            att_output = self.dropout1(att_output, training=training)
            out1 = self.layernorm1(inputs + att_output)  # (batch_size, input_seq_len, d_model)
            # feed-forward network
            ffn_output = self.ffn(out1)
            ffn_output = self.dropout2(ffn_output, training=training)
            out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
            return out2
    # sample_encoder_layer = EncoderLayer(512, 8, 2048)   #(d_model, n_heads, ddf, dropout_rate=0.1)
    # sample_encoder_layer_output = sample_encoder_layer(tf.random.uniform((64, 43, 512)), False, None)
    # print('sample_encoder_layer_output.shape:',sample_encoder_layer_output.shape)
    
    class DecoderLayer(tf.keras.layers.Layer):
        def __init__(self, d_model, num_heads, dff, drop_rate=0.1):
            super(DecoderLayer, self).__init__()
    
            self.mha1 = MutilHeadAttention(d_model, num_heads)
            self.mha2 = MutilHeadAttention(d_model, num_heads)
    
            self.ffn = point_wise_feed_forward_network(d_model, dff)
    
            self.layernorm1 = LayerNormalization(epsilon=1e-6)
            self.layernorm2 = LayerNormalization(epsilon=1e-6)
            self.layernorm3 = LayerNormalization(epsilon=1e-6)
    
            self.dropout1 = layers.Dropout(drop_rate)
            self.dropout2 = layers.Dropout(drop_rate)
            self.dropout3 = layers.Dropout(drop_rate)
    
        def call(self, inputs, encode_out, training,
                 look_ahead_mask, padding_mask):
            # masked multi-head attention
            att1, att_weight1 = self.mha1(inputs, inputs, inputs, look_ahead_mask)     #(v, k, q)
            att1 = self.dropout1(att1, training=training)
            out1 = self.layernorm1(inputs + att1)
    
            # multi-head attention over the encoder output
            ###########################################################   the query here can be `inputs` or `out1`; the `out1` version is used below   ##########################################################
            # att2, att_weight2 = self.mha2(encode_out, encode_out, inputs, padding_mask)     #(v, k, q)
            att2, att_weight2 = self.mha2(encode_out, encode_out, out1, padding_mask)  # (v, k, q)
            att2 = self.dropout2(att2, training=training)
            out2 = self.layernorm2(out1 + att2)
    
            ffn_out = self.ffn(out2)
            ffn_out = self.dropout3(ffn_out, training=training)
            out3 = self.layernorm3(out2 + ffn_out)
    
            return out3, att_weight1, att_weight2
    # sample_encoder_layer = EncoderLayer(512, 8, 2048)   #(d_model, n_heads, ddf, dropout_rate=0.1)
    # sample_encoder_layer_output = sample_encoder_layer(tf.random.uniform((64, 43, 512)), False, None)
    # sample_decoder_layer = DecoderLayer(512, 8, 2048)   #   (d_model, num_heads, dff, drop_rate=0.1)
    # sample_decoder_layer_output, _, _ = sample_decoder_layer(
    # tf.random.uniform((64, 50, 512)), sample_encoder_layer_output,False, None, None)
    # print('sample_decoder_layer_output.shape:',sample_decoder_layer_output.shape)
    class Encoder(layers.Layer):
        # def __init__(self, n_layers, d_model, n_heads, ddf,
        #              input_vocab_size, max_seq_len, drop_rate=0.1):
        def __init__(self, n_layers, d_model, n_heads, ddf,  drop_rate=0.1):
            super(Encoder, self).__init__()
    
            self.n_layers = n_layers
            self.d_model = d_model
    
            # self.embedding = layers.Embedding(input_vocab_size, d_model)
            self.embedding = layers.Dense(d_model,activation='relu')
            # self.pos_embedding = positional_encoding(max_seq_len, d_model)
    
            self.encode_layer = [EncoderLayer(d_model, n_heads, ddf, drop_rate)
                                 for _ in range(n_layers)]
    
            self.dropout = layers.Dropout(drop_rate)
    
        def call(self, inputs, training, mark):
            seq_len = inputs.shape[1]
            word_emb = self.embedding(inputs)
            word_emb *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
            # emb = word_emb + self.pos_embedding[:, :seq_len, :]
            emb = word_emb + positional_encoding(seq_len, self.d_model)
            x = self.dropout(emb, training=training)
            for i in range(self.n_layers):
                x = self.encode_layer[i](x, training, mark)
    
            return x
    # sample_encoder = Encoder(2, 512, 8, 1024)      # (n_layers, d_model, n_heads, ddf, drop_rate=0.1)
    # print('here')
    # sample_encoder_output = sample_encoder(tf.random.uniform((64, 180,120)),False, None)
    # print('sample_encoder_output.shape:',sample_encoder_output.shape)
    # sys.exit(2)
    
    class Decoder(layers.Layer):
        def __init__(self, n_layers, d_model, n_heads, ddf, drop_rate=0.1):
            super(Decoder, self).__init__()
    
            self.d_model = d_model
            self.n_layers = n_layers
    
            # self.embedding = layers.Embedding(target_vocab_size, d_model)
            self.embedding = layers.Dense(d_model, activation='relu')
            # self.pos_embedding = positional_encoding(max_seq_len, d_model)
    
            self.decoder_layers = [DecoderLayer(d_model, n_heads, ddf, drop_rate)
                                   for _ in range(n_layers)]
    
            self.dropout = layers.Dropout(drop_rate)
    
        def call(self, inputs, encoder_out, training,
                 look_ahead_mark, padding_mark):
            # seq_len = tf.shape(inputs)[1]
            seq_len = inputs.shape[1]
            attention_weights = {}
            h = self.embedding(inputs)
            h *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
            # h += self.pos_embedding[:, :seq_len, :]
            h += positional_encoding(seq_len, self.d_model)
    
            h = self.dropout(h, training=training)
            #         print('--------------------\n',h, h.shape)
            # stack the decoder layers
            for i in range(self.n_layers):
                h, att_w1, att_w2 = self.decoder_layers[i](h, encoder_out,
                                                           training, look_ahead_mark,
                                                           padding_mark)
                attention_weights['decoder_layer{}_att_w1'.format(i + 1)] = att_w1
                attention_weights['decoder_layer{}_att_w2'.format(i + 1)] = att_w2
    
            return h, attention_weights
    # sample_encoder = Encoder(2, 512, 8, 1024)      # (n_layers, d_model, n_heads, ddf, drop_rate=0.1)
    # sample_encoder_output = sample_encoder(tf.random.uniform((64, 180,120)),False, None)
    # # print('sample_encoder_output.shape:',sample_encoder_output.shape)
    # sample_decoder = Decoder(2, 512,8,1024)       # (n_layers, d_model, n_heads, ddf, drop_rate=0.1)
    # sample_decoder_output, attn = sample_decoder(tf.random.uniform((64, 180,100)),sample_encoder_output, False, None, None)
    # print(sample_decoder_output.shape, attn['decoder_layer1_att_w1'].shape)
    # sys.exit(2)
    
    class Transformer(tf.keras.Model):
        def __init__(self, n_layers, d_model, n_heads, diff, target_vocab_size, drop_rate=0.1):
            super(Transformer, self).__init__()
            # self.bn1 = layers.BatchNormalization()
            self.encoder = Encoder(n_layers, d_model, n_heads, diff,drop_rate)
            self.decoder = Decoder(n_layers, d_model, n_heads, diff, drop_rate)
            self.bn = tf.keras.layers.BatchNormalization()
            self.final_layer = tf.keras.layers.Dense(target_vocab_size)
            # self.final_layer = tf.keras.layers.Dense(target_vocab_size,activation='tanh')
    
        # def call(self, inputs, targets, training, encode_padding_mask,
        #          look_ahead_mask, decode_padding_mask):
        def call(self, inputs, targets, training,look_ahead_mask = None, encode_padding_mask = None,
                  decode_padding_mask = None):
            # inputs = self.bn1(inputs)
            encode_out = self.encoder(inputs, training, encode_padding_mask)
            # print(encode_out.shape)
            decode_out, att_weights = self.decoder(targets, encode_out, training,
                                                   look_ahead_mask, decode_padding_mask)
            # print('decode_out.shape:',decode_out.shape)
            decode_out = self.bn(decode_out)
            final_out = self.final_layer(decode_out)
            # final_out = self.final_layer(decode_out) *10
    
            return final_out, att_weights
    # sample_transformer = Transformer(n_layers=2, d_model=512, n_heads=8, diff=1024,target_vocab_size=20)
    # temp_input = tf.random.uniform((64,180, 62))
    # temp_target = tf.random.uniform((64, 180,26))
    # fn_out, _ = sample_transformer(temp_input, temp_target, training=False,
    #                               encode_padding_mask=None,
    #                                look_ahead_mask=None,
    #                                decode_padding_mask=None,
    #                               )
    # print('fn_out.shape:',fn_out.shape)
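    # If the test above is uncommented, the expected shape is fn_out.shape == (64, 180, 20).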
    
    
    # @tf.function
    
    global_num = 0
    global_train_acc = 0
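    # The training step below uses several objects that are defined elsewhere in the
    # full script and are not shown in this excerpt. A minimal sketch of what they
    # might look like (assumptions, not the original definitions):
    # transformer = Transformer(n_layers=2, d_model=128, n_heads=8, diff=512, target_vocab_size=1)
    # optimizer   = tf.keras.optimizers.Adam(1e-4)
    # train_loss  = tf.keras.metrics.Mean(name='train_loss')
    # loss_fun    = lambda y_true, y_pred: tf.reduce_mean(tf.losses.MSE(y_true, y_pred))
    # look_mask   = create_look_ahead_mark(dec_seq_len)  # dec_seq_len (hypothetical) = decoder input length
    # get_acc     = ...                                  # task-specific accuracy metric, not shown here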
    
    def train_step(inputs, targets):
        # print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^6")
        global global_num
        global global_train_acc
        global_num +=1
        # tar_inp = targets[:, :-1,:6]
        # tar_real = targets[:, 1:,:6]
        tar_inp = targets[:, :-1, 5][:,:,np.newaxis]
        tar_real = targets[:, 1:, 5][:,:,np.newaxis]
        # print()
        with tf.GradientTape() as tape:
            predictions, _ = transformer(inputs, tar_inp,training = True,look_ahead_mask=look_mask,decode_padding_mask = None)
            # print("pre:{}".format(predictions[:2,:5]))
            # print("True:{}".format(tar_real[:2,:5]))
            # predictions, _ = transformer(inputs, tar_inp, training=True, look_ahead_mask=None,decode_padding_mask=None)
            # loss = loss_fun(tar_real, predictions,loss_mask)
            loss = loss_fun(tar_real, predictions)
        if global_num % 10 == 0:
            acc = get_acc(predictions,tar_real)
            global_train_acc = acc.numpy()
            # print('train_acc:{:.2f}'.format(acc.numpy()))
    
        # compute gradients
        gradients = tape.gradient(loss, transformer.trainable_variables)
        # backpropagation (apply gradients)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
        mse_loss = tf.reduce_mean(tf.losses.MSE(predictions,tar_real))
        train_loss(mse_loss)
  • Original article: https://www.cnblogs.com/cxhzy/p/15857010.html