# English-German translation data
# paddle.dataset.wmt14: https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/data/dataset_cn/wmt14_cn.html
# paddle.dataset.wmt16: https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/data/dataset_cn/wmt16_cn.html
# fluid.io.shuffle: https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/io_cn/shuffle_cn.html#shuffle
import paddle
import paddle.fluid as fluid
import numpy as np
# Vocabulary sizes handed to the WMT16 reader below.
# Size of the source-language (English) dictionary.
source_dict_size = 1000
# Size of the target-language (German) dictionary.
target_dict_size = 500
batch_size = 64

# English dictionary, built with the same size as the reader's source vocabulary.
en_dict = paddle.dataset.wmt16.get_dict('en', source_dict_size)
# German dictionary.
# NOTE(review): built with 1000 entries although target_dict_size is 500 --
# confirm whether this mismatch is intentional.
de_dict = paddle.dataset.wmt16.get_dict('de', 1000)

# PS: written against PaddlePaddle 1.6; reported to fail on 1.5.
# wmt16.train returns a reader creator; every sample produced by the reader is
# (source-language word-ID sequence, target-language word-ID sequence,
#  next-word ID sequence).
# The sizes are passed positionally: the documented signature is
# train(src_dict_size, trg_dict_size, src_lang='en'), so the keyword names
# source_dict_size / target_dict_size are not accepted by that function.
train = paddle.dataset.wmt16.train(source_dict_size,
                                   target_dict_size,
                                   src_lang='en')

# Shuffle samples through a 16-element buffer, then group them into batches.
data_generator = fluid.io.shuffle(train, buf_size=16)
batch_generator = fluid.io.batch(data_generator, batch_size=batch_size)

# Pull a single batch out of the batched reader.
a_batch = next(batch_generator())
# Backward-compatible alias for the original (misspelled) variable name.
a_bacth = a_batch
print(len(a_batch))  # 64
i = 2
print(len(a_batch[i]))  # 3
# (source-language word IDs, target-language word IDs, next-word IDs)
print(a_batch[i])  # e.g. ([0, 5, 59, 32, 228, 69, 3, 250, 2, 4, 1], [0, 4, 68, 26, 219, 6, 14, 2, 54, 2, 3], [4, 68, 26, 219, 6, 14, 2, 54, 2, 3, 1])