• TensorFlow single-machine multi-GPU distributed training (From_server2_gpus)


    # Official tutorial: https://www.tensorflow.org/guide/distributed_training?hl=zh-cn

    import sys
    # import keras
    import numpy as np
    import tensorflow as tf
    import matplotlib.pyplot as plt
    from tensorflow.keras import layers
    import os
    from Model_encode_decode import Transformer2
    import time

    # Make the first gpu_num GPUs visible and enable memory growth so TensorFlow
    # does not grab all device memory up front.
    gpu_num = 8
    gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
    tf.config.experimental.set_visible_devices(devices=gpus[0:gpu_num], device_type='GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    # One MirroredStrategy instance drives all visible GPUs on this machine.
    mirrored_strategy = tf.distribute.MirroredStrategy()

    # Alternative low-level device configuration, kept commented out:
    # physical_devices = tf.config.experimental.list_physical_devices('GPU')
    # assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
    # tf.config.experimental.set_memory_growth(physical_devices[0], True)

    global_num = 0
    global_train_acc = 0
    global_train_acc2 = 0
    global_loss2 = 0


    @tf.function
    def train_step2(inputs, targets):
        def step_fn(inputs, targets):
            inputs = inputs[:, :, :-1]
            tar_real = targets[:, 1:, -1]
            tar_real = tf.cast(tar_real, tf.int32)
            tar_real = tf.one_hot(tar_real, depth=target_size)
            tar_real = tf.squeeze(tar_real, axis=1)
            decode_in = targets[:, -1, 5:7]
            real_zhangfu = targets[:, 1:, 5]
            with tf.GradientTape() as tape:
                pre_class, pre_zhangfu = transformer(inputs, decode_in, training=True)
                loss, mse_loss = loss_fun(tar_real, real_zhangfu, pre_class, pre_zhangfu)
            grads = tape.gradient(loss, transformer.trainable_variables)
            optimizer.apply_gradients(list(zip(grads, transformer.trainable_variables)))
            return loss, mse_loss

        # Run one replica step per GPU, then combine the per-replica losses.
        per_loss, per_mse_loss = mirrored_strategy.run(step_fn, args=(inputs, targets))
        mean_loss = mirrored_strategy.reduce(
            tf.distribute.ReduceOp.SUM, per_loss, axis=None)
        mean_mse_loss = mirrored_strategy.reduce(
            tf.distribute.ReduceOp.SUM, per_mse_loss, axis=None)
        return mean_loss, mean_mse_loss
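
    # Note on the reductions above: loss_fun (defined below) scales the summed
    # per-example loss by 1 / bat, where bat is the *global* batch size. Each
    # replica therefore returns loss_sum_on_replica / global_batch, and
    # ReduceOp.SUM across replicas reconstructs the global-batch mean. This is
    # the loss scaling the tf.distribute guide recommends for custom training
    # loops; ReduceOp.MEAN here would divide by the replica count a second time.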

    def train_loss_compute(inputs, targets):
        inputs = inputs[:, :, :-1]
        # inputs = inputs[:, :, 3][:, :, np.newaxis]
        tar_real = targets[:, 1:, -1]
        tar_real = tf.cast(tar_real, tf.int32)
        tar_real = tf.one_hot(tar_real, depth=target_size)
        tar_real = tf.squeeze(tar_real, axis=1)
        real_zhangfu = targets[:, 1:, 5]
        decode_in = targets[:, -1, 5:7]
        pre_class, pre_zhangfu = transformer(inputs, decode_in, training=False)
        loss, mse_loss = loss_fun(tar_real, real_zhangfu, pre_class, pre_zhangfu)
        acc, one_one_acc, two_two_acc, c3_c3_acc, c4_c4_acc, c5_c5_acc, c6_c6_acc, c7_acc = \
            get_acc(pre_class, tar_real)
        acc2 = get_acc2(pre_zhangfu, real_zhangfu)
        return loss, mse_loss, tf.convert_to_tensor([acc, one_one_acc, two_two_acc, c3_c3_acc,
                                                     c4_c4_acc, c5_c5_acc, c6_c6_acc, c7_acc]), acc2


    @tf.function
    def graph_run(func, args_in):
        train_loss, train_mse_loss, train_acc, train_acc2 = mirrored_strategy.run(func, args=args_in)
        return train_loss, train_mse_loss, train_acc, train_acc2


    def test_loss_compute(inputs, targets):
        inputs = inputs[:, :, :-1]
        tar_real = targets[:, 1:, -1]
        tar_real = tf.cast(tar_real, tf.int32)
        tar_real = tf.one_hot(tar_real, depth=target_size)
        tar_real = tf.squeeze(tar_real, axis=1)
        real_zhangfu = targets[:, 1:, 5]
        decode_in = targets[:, -1, 5:7]
        pre_class, pre_zhangfu = transformer(inputs, decode_in, training=False)
        # loss_fun returns (loss_mix, mse_loss); the original assigned the whole
        # tuple to `loss`, so unpack it here.
        loss, _ = loss_fun(tar_real, real_zhangfu, pre_class, pre_zhangfu)
        acc, one_one_acc, two_two_acc, c3_c3_acc, c4_c4_acc, c5_c5_acc, c6_c6_acc, c7_acc = \
            get_acc(pre_class, tar_real)
        acc2 = get_acc2(pre_zhangfu, real_zhangfu)
        return loss, np.array([acc, one_one_acc, two_two_acc, c3_c3_acc,
                               c4_c4_acc, c5_c5_acc, c6_c6_acc, c7_acc]), acc2


    # loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
    #                                                             reduction='none')
    loss_object = tf.keras.losses.MeanSquaredError()


    def loss_fun(y_true, y_true_zhangfu, y_pred, pre_zhangfu):
        loss_ = tf.losses.categorical_crossentropy(y_true, y_pred)
        loss_zhangfu = tf.losses.MSE(y_true_zhangfu, pre_zhangfu)
        loss_zhangfu = tf.reduce_mean(loss_zhangfu)
        add_weight = 2
        y_true_argmax = tf.argmax(y_true, axis=-1)
        y_pre_argmax = tf.argmax(y_pred, axis=-1)
        # True class is a "down" bucket (4/5/6) but the prediction is a
        # "flat/up" bucket (0-3): weight these errors more heavily.
        preB_tS = ((y_true_argmax == 4) | (y_true_argmax == 5) | (y_true_argmax == 6)) & (
            (y_pre_argmax == 0) | (y_pre_argmax == 1) | (y_pre_argmax == 2) | (y_pre_argmax == 3))
        loss = tf.where(preB_tS, loss_ * add_weight, loss_)
        # Scale by the global batch size (see the note after train_step2).
        loss = tf.reduce_sum(loss) * (1.0 / bat)
        loss_mix = loss + 1 * loss_zhangfu
        return loss_mix, loss_zhangfu
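
    # loss_fun mixes two objectives: categorical cross-entropy over the seven
    # price-movement classes and an MSE term on the predicted price change
    # (zhangfu). add_weight doubles the cross-entropy for the costliest
    # confusion -- a true "down" class (4/5/6) predicted as "flat/up" (0-3) --
    # the same event that get_acc below reports separately as c7_acc.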

    def get_acc(pre, y):
        y = tf.argmax(y, axis=-1)
        pre = tf.argmax(pre, axis=-1)
        # Per-class label counts.
        y_0 = tf.cast(y == 0, tf.int16)
        y_0 = tf.reduce_sum(tf.cast(y_0, tf.float32))
        y_1 = tf.cast(y == 1, tf.int16)
        y_1 = tf.reduce_sum(tf.cast(y_1, tf.float32))
        y_2 = tf.cast(y == 2, tf.int16)
        y_2 = tf.reduce_sum(tf.cast(y_2, tf.float32))
        y_3 = tf.cast(y == 3, tf.int16)
        y_3 = tf.reduce_sum(tf.cast(y_3, tf.float32))
        y_4 = tf.cast(y == 4, tf.int16)
        y_4 = tf.reduce_sum(tf.cast(y_4, tf.float32))
        y_5 = tf.cast(y == 5, tf.int16)
        y_5 = tf.reduce_sum(tf.cast(y_5, tf.float32))
        y_6 = tf.cast(y == 6, tf.int16)
        y_6 = tf.reduce_sum(tf.cast(y_6, tf.float32))
        pre_1 = tf.cast(pre == 1, tf.int16)
        pre_1 = tf.reduce_sum(tf.cast(pre_1, tf.float32))

        # Overall accuracy.
        acc_eq = tf.cast(tf.equal(pre, y), tf.int16)
        acc_float = tf.cast(acc_eq, tf.float32)
        acc = tf.reduce_mean(acc_float)

        # Per-class recall (1e-9 avoids division by zero for empty classes).
        one_one = (y == pre) & (y == 1)
        one_one = tf.cast(one_one, tf.int16)
        one_one_acc = tf.reduce_sum(tf.cast(one_one, tf.float32)) / (y_1 + 1e-9)
        two_two = (pre == y) & (y == 2)
        two_two = tf.cast(two_two, tf.int16)
        two_two_acc = tf.reduce_sum(tf.cast(two_two, tf.float32)) / (y_2 + 1e-9)
        c3_c3 = (pre == y) & (y == 3)
        c3_c3 = tf.cast(c3_c3, tf.int16)
        c3_c3_acc = tf.reduce_sum(tf.cast(c3_c3, tf.float32)) / (y_3 + 1e-9)
        c4_c4 = (pre == y) & (y == 4)
        c4_c4 = tf.cast(c4_c4, tf.int16)
        c4_c4_acc = tf.reduce_sum(tf.cast(c4_c4, tf.float32)) / (y_4 + 1e-9)
        c5_c5 = (pre == y) & (y == 5)
        c5_c5 = tf.cast(c5_c5, tf.int16)
        c5_c5_acc = tf.reduce_sum(tf.cast(c5_c5, tf.float32)) / (y_5 + 1e-9)
        c6_c6 = (pre == y) & (y == 6)
        c6_c6 = tf.cast(c6_c6, tf.int16)
        c6_c6_acc = tf.reduce_sum(tf.cast(c6_c6, tf.float32)) / (y_6 + 1e-9)
        # Fraction of true "down" samples predicted as "flat/up".
        c7 = ((y == 4) | (y == 5) | (y == 6)) & ((pre == 0) | (pre == 1) | (pre == 2) | (pre == 3))
        c7 = tf.cast(c7, tf.int16)
        c7_acc = tf.reduce_mean(tf.cast(c7, tf.float32))
        return (acc, one_one_acc, two_two_acc, c3_c3_acc,
                c4_c4_acc, c5_c5_acc, c6_c6_acc, c7_acc)


    def get_acc2(pre, y):
        # Direction-only accuracy: was the sign of the move predicted correctly?
        y_updown = tf.where(y > 0, 1, 0)
        pre_updown = tf.where(pre > 0, 1, 0)
        acc_eq = tf.cast(tf.equal(pre_updown, y_updown), tf.int16)
        acc_float = tf.cast(acc_eq, tf.float32)
        acc = tf.reduce_mean(acc_float)
        return acc


    def fun_fenbu(x):
        # Print and return the label distribution (fenbu) of x.
        z_0 = tf.cast(x == 0, tf.float32)
        z_0 = tf.reduce_sum(z_0)
        one = tf.cast(x == 1, tf.float32)
        one = tf.reduce_sum(one)
        two = tf.cast(x == 2, tf.float32)
        two = tf.reduce_sum(two)
        three = tf.cast(x == 3, tf.float32)
        three = tf.reduce_sum(three)
        f_4 = tf.cast(x == 4, tf.float32)
        f_4 = tf.reduce_sum(f_4)
        f_5 = tf.cast(x == 5, tf.float32)
        f_5 = tf.reduce_sum(f_5)
        s_6 = tf.cast(x == 6, tf.float32)
        s_6 = tf.reduce_sum(s_6)
        print('Zero:{},\tOne:{},\tTwo:{},\tThree:{},\tFour:{},\tFive:{},\tSix:{}'.format(
            z_0, one, two, three, f_4, f_5, s_6))
        # Class legend:
        #   0: other
        #   1: +2% within 3 days
        #   2: +4% within 3 days
        #   3: +8% within 5 days
        #   4: -2% within 3 days
        #   5: -4% within 3 days
        #   6: -8% within 3 days
        return (z_0, one, two, three, f_4, f_5, s_6)


    if __name__ == '__main__':
        # tf.random.set_seed(1)
        # gu_num = 'sz.002782'  # 可立克
        # gu_num = 'sh.600460'  # 士兰微
        # gu_num = 'sz.002739'  # 万达
        # gu_num = 'sh.600313'  # 农发种业
        # gu_num = 'sh.000001'  # 上证
        qian = 60  # lookback window (days before)
        hou = 1    # prediction horizon (days after)
        # qian = 30
        # hou = 5

        # Hyperparameters
        # EPOCHS = 1
        EPOCHS = 2000000
        Threshold = 3
        num_layers = 3      # num_layers = 1
        d_model = 512       # d_model = 256
        dff = 1024          # dff = 256
        num_heads = 8
        dropout_rate = 0.11
        test_split = -1     # test_split = 40000
        test_index = -1     # test_index = 20000
        target_size = 7     # target_size = 3
        learning_rate = 1e-5
        bat = 700           # bat = 300
        bat = bat * gpu_num  # global batch size: per-GPU batch * number of GPUs
        guiyi_price = False  # guiyi = normalize; guiyi_price = True
        load_flag = True     # load_flag = False

        with mirrored_strategy.scope():
            transformer = Transformer2(num_layers, d_model, num_heads, dff,
                                       target_size, dropout_rate)

        # ========================== Load pretrained weights ==========================
        model_save_path = './save_model'
        if load_flag:
            # load_name = r"data_410000,epoch_70_trainloss_1.38_testloss_0.00"  # pretrained on 410000 samples, drop = 0.1, 80 %
            # load_name = r"data_-1,epoch_80_trainloss_0.77_testloss_0.00"      # pretrained on full data, drop = 0.1
            # load_name = r"data_-1,epoch_7_trainloss_0.53_testloss_3.46"       # pretrained on full data, drop = 0.1
            # load_name = r"drop_0.31,epoch_138_trainloss_1.30_testloss_3.42"   # pretrained on full data, drop = 0.31, 70 %
            load_name = r"drop_0.34,epoch_11_trainloss_1.27_testloss_3.60"      # pretrained on full data, drop = 0.32, 73 %
            temp_path = './decode_model'
            load_path = os.path.join(temp_path, load_name)
            load_path = os.path.join(load_path, load_name)
            transformer.load_weights(load_path)
            print('Loaded pretrained weights')
        # =============================================================================

        # optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9,
        #                                      beta_2=0.999, epsilon=1e-9)
        # optimizer = tf.keras.optimizers.Nadam(learning_rate=learning_rate, beta_1=0.9,
        #                                       beta_2=0.999, epsilon=1e-09, schedule_decay=0.004)
        # optimizer = tf.keras.optimizers.Adadelta(learning_rate=learning_rate)
        # optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
        optimizer = tf.keras.optimizers.Adamax(learning_rate=learning_rate, beta_1=0.9,
                                               beta_2=0.999, epsilon=1e-09)
        # optimizer = tf.keras.optimizers.Adamax(learning_rate=learning_rate, beta_1=0.99,
        #                                        beta_2=0.999, epsilon=1e-09)

        train_list = []
        test_list = []
        train_acc_list = []
        test_acc_list = []
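
        # The Transformer2 model above is created under mirrored_strategy.scope()
        # so its variables are mirrored across GPUs; the optimizer's slot
        # variables are created on the first apply_gradients call, which also
        # happens under the scope opened around the training loop below. The
        # training dataset is wrapped with experimental_distribute_dataset so
        # each global batch of bat = 700 * gpu_num samples is split across replicas.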
        # Data loading and preprocessing
        # train_x, train_y = concat_db()
        # data_dir = r'C:\股票\股票包\new_dir\data\all_gu'
        # data = np.load(os.path.join(data_dir, 'all_gu.npz'))
        data_dir = './data/all_test'
        data = np.load(os.path.join(data_dir, 'guiyi_False_all_data.npz'))
        train_x, train_y = data['x'][:test_split], data['y'][:test_split]
        db = tf.data.Dataset.from_tensor_slices((train_x, train_y)).shuffle(1000000).batch(bat)
        db = mirrored_strategy.experimental_distribute_dataset(db)
        print()
        print('******************** Train label distribution ********************')
        z_0, one, two, three, f_4, f_5, s_6 = fun_fenbu(train_y[:, -1, -1])

        # =================================== Test ====================================
        val_dir = './data/val_data'
        val_data = np.load(os.path.join(val_dir, 'guiyi_False_val_data.npz'))
        mix_part_x, mix_part_y = val_data['x'][:test_index], val_data['y'][:test_index]
        # mix_part_x, mix_part_y = get_data(gu_num)
        db_test = tf.data.Dataset.from_tensor_slices((mix_part_x, mix_part_y)).batch(bat)
        print('******************** Test label distribution ********************')
        z_0, one, two, three, f_4, f_5, s_6 = fun_fenbu(mix_part_y[:, -1, -1])

        with mirrored_strategy.scope():
            for epoch in range(EPOCHS):
                start_time = time.time()
                for batch, (inputs, targets) in enumerate(db):
                    train_step2(inputs, targets)

                    if batch % 100 == 0:
                        train_loss, train_mse_loss, train_acc, train_acc2 = graph_run(
                            train_loss_compute, args_in=(inputs, targets))
                        mean_loss = mirrored_strategy.reduce(
                            tf.distribute.ReduceOp.SUM, train_loss, axis=None)
                        mean_mse_loss = mirrored_strategy.reduce(
                            tf.distribute.ReduceOp.SUM, train_mse_loss, axis=None)
                        mean_acc = mirrored_strategy.reduce(
                            tf.distribute.ReduceOp.MEAN, train_acc, axis=None)
                        mean_acc2 = mirrored_strategy.reduce(
                            tf.distribute.ReduceOp.MEAN, train_acc2, axis=None)
                        end_time = time.time()
                        print('epoch:{},batch:{}'.format(epoch, batch))
                        print('train_loss:{},\tMSE_loss:{}'.format(mean_loss.numpy(), mean_mse_loss.numpy()))
                        print('ACC column order: acc,\t acc_1,\t acc_2,\t acc_3,\t acc_4,\t acc_5,\t acc_6,\t acc_7')
                        print('train_acc:{}'.format(mean_acc.numpy().round(3)))
                        print('zhangfu direction acc, train:{:.2f}'.format(float(mean_acc2)))
                        print('time:{:.3f}'.format(end_time - start_time))
                        start_time = time.time()

                    # Evaluate on the test set
                    if batch % 500 == 0:
                        test_loss_all = 0
                        num = 0
                        acc = np.array([0, 0, 0, 0, 0, 0, 0, 0], dtype=float)  # np.float was removed in NumPy >= 1.24
                        acc2 = 0
                        for batch_test, (test_inputs, test_targets) in enumerate(db_test):
                            num += 1
                            test_loss, test_acc, test_acc2 = test_loss_compute(test_inputs, test_targets)
                            test_loss = tf.reduce_mean(test_loss).numpy()
                            test_loss_all += test_loss
                            acc += test_acc
                            acc2 += test_acc2
                        test_loss_all = test_loss_all / (num + 1e-9)
                        acc_all = acc / (num + 1e-9)
                        acc_all2 = acc2 / (num + 1e-9)
                        print('test_loss_all :', test_loss_all)
                        print('test_acc :{}'.format(acc_all.round(3)))
                        # Note: global_train_acc2 is never updated in this script, so it prints 0.
                        print('zhangfu direction acc, train:{:.2f},test:{:.2f}'.format(
                            global_train_acc2, float(acc_all2)))

                        if (epoch % 1 == 0) and (batch != 0):
                            print('Saving model')
                            temp_model_name = 'drop_{},epoch_{}_trainloss_{:.2f}_testloss_{:.2f}'.format(
                                dropout_rate, epoch, mean_loss.numpy(), test_loss_all)
                            if not os.path.exists('./decode_model'):
                                os.mkdir('./decode_model')
                            temp_model_save = os.path.join('./decode_model', temp_model_name)
                            if not os.path.exists(temp_model_save):
                                os.mkdir(temp_model_save)
                            transformer.save_weights(os.path.join(temp_model_save, temp_model_name))
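
For reference, the script above is one instance of the standard tf.distribute custom-training-loop pattern: build the model and optimizer under strategy.scope(), distribute the dataset, run a per-replica step with strategy.run(), and combine per-replica losses with strategy.reduce(). Below is a minimal, self-contained sketch of just that pattern on toy data; every name in it (GLOBAL_BATCH, the Dense toy model, and so on) is illustrative and not part of the original script.

    import tensorflow as tf

    strategy = tf.distribute.MirroredStrategy()
    GLOBAL_BATCH = 64 * strategy.num_replicas_in_sync

    # Toy regression data, batched to the *global* batch size.
    x = tf.random.normal((1024, 8))
    y = tf.random.normal((1024, 1))
    dataset = tf.data.Dataset.from_tensor_slices((x, y)).shuffle(1024).batch(GLOBAL_BATCH)
    dist_dataset = strategy.experimental_distribute_dataset(dataset)

    # Variables (model weights, optimizer slots) must be created under the scope.
    with strategy.scope():
        model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
        optimizer = tf.keras.optimizers.Adam(1e-3)

    @tf.function
    def train_step(inputs, targets):
        def step_fn(inputs, targets):
            with tf.GradientTape() as tape:
                pred = model(inputs, training=True)
                # Scale by the global batch size so ReduceOp.SUM across
                # replicas yields the global-batch mean loss.
                loss = tf.reduce_sum(tf.square(pred - targets)) / GLOBAL_BATCH
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            return loss

        per_replica_loss = strategy.run(step_fn, args=(inputs, targets))
        return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)

    for step, (inputs, targets) in enumerate(dist_dataset):
        loss = train_step(inputs, targets)
        if step % 10 == 0:
            print('step {}: loss {:.4f}'.format(step, float(loss)))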
  • Original source: https://www.cnblogs.com/cxhzy/p/16008176.html