IMDB数据集
此模块将从 http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 下载数据集。
- 这个数据集包含了25000条训练用电影评论数据,25000条测试用评论数据,且这些评论带有明显情感倾向
- 负面评论的得分小于等于4,正面评论的得分大于等于7,满分10分
- label:0/1
paddle.dataset.imdb:https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/data/dataset_cn/imdb_cn.html
import numpy as np
import paddle as paddle
import paddle.dataset.imdb as imdb
import paddle.fluid as fluid
# Word -> integer-id vocabulary handed to the IMDB readers below.
# When the archive is not cached locally the call downloads it; the recorded
# run logged:
#   db/imdb%2FaclImdb_v1.tar.gz not found, downloading https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz
# Values observed in that run:
#   word_dict['<unk>'] -> 5146
#   list(word_dict.keys())[5146] -> <unk>
word_dict = imdb.word_dict()

BATCH_SIZE = 8

# Training samples go through a shuffle buffer (buf_size=15) and are then
# grouped into batches of BATCH_SIZE.
_shuffled_train_samples = paddle.reader.shuffle(imdb.train(word_dict), buf_size=15)
train_reader = fluid.io.batch(_shuffled_train_samples, batch_size=BATCH_SIZE)

# Test samples are batched without shuffling.
_test_samples = imdb.test(word_dict)
test_reader = fluid.io.batch(_test_samples, batch_size=BATCH_SIZE)
# Pull one batch from each reader so the sample layout can be inspected.
train_data = next(iter(train_reader()))
# The test batch must come from test_reader, not train_reader.
test_data = next(iter(test_reader()))

# Samples are not padded to a common length. Lengths of the first three
# encoded reviews in one recorded run (values vary because of shuffling):
# len(train_data[0][0]), len(train_data[1][0]), len(train_data[2][0]): 129, 471, 585

# Number of samples in the batch; matches BATCH_SIZE.
print(len(train_data))  # 8
# A single sample: the encoded sentence followed by its label.
print(train_data[0])  # ([406, 3, 1252, 534, ....., 614], 0)
# Each sample is a (data, label) pair.
print(len(train_data[0]))  # 2