Movie review classification
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
data = keras.datasets.imdb # load the IMDB movie review dataset
Data preprocessing
max_word = 10000 # keep only the 10,000 most frequent words (larger indices are dropped)
(x_train,y_train),(x_test,y_test) = data.load_data(num_words=max_word)
x_train.shape,y_train.shape,x_test.shape,y_test.shape # inspect the array shapes
x_train[0] # each review is a list of integer word indices
data.get_word_index() # dict mapping each word to its integer index
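To read a review back as text, the index can be inverted; a minimal sketch, assuming the default index_from=3 offset that load_data applies (indices 0-2 are reserved for padding/start/unknown):
word_index = data.get_word_index()
reverse_index = {v: k for k, v in word_index.items()}
" ".join(reverse_index.get(i - 3, "?") for i in x_train[0]) # decode the first review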
How the text is handled: map each word index to a trainable dense vector (an embedding)
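A toy illustration of that idea (the sizes here are made up, not the real model): an Embedding layer looks each integer id up in a trainable table and returns a dense vector.
emb = layers.Embedding(input_dim=10, output_dim=4) # 10-word vocabulary, 4-d vectors
emb(tf.constant([[1, 2, 3]])).shape # a 3-word "sentence" -> TensorShape([1, 3, 4])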
# check the length of every review: they vary widely
[len(x) for x in x_train]
x_train = keras.preprocessing.sequence.pad_sequences(x_train,300) # force every review to length 300: shorter ones are padded, longer ones truncated
x_test = keras.preprocessing.sequence.pad_sequences(x_test,300)
# check the lengths again: every review is now 300
[len(x) for x in x_train]
y_train # the labels need no processing (0/1 values, so this is a binary classification problem)
# build the model
model = keras.models.Sequential()
# vocabulary of at most 10000 words, each mapped to a 50-d vector; each input sequence is 300 indices long
model.add(layers.Embedding(10000,50,input_length=300))
# the embedding output has shape (batch, 300, 50), e.g. (25000, 300, 50) for the full training set
# model.add(layers.Flatten()) # would flatten (batch, 300, 50) into 2-D: (batch, 15000)
model.add(layers.GlobalAveragePooling1D()) # instead, average over the 300 positions: (batch, 300, 50) -> (batch, 50)
model.add(layers.Dense(128,activation="relu")) # 128 units with relu activation
model.add(layers.Dropout(0.5)) # dropout layer to curb overfitting
model.add(layers.Dense(1,activation="sigmoid")) # single unit; sigmoid activation for binary classification
model.summary()
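A quick sanity check of the two shape options mentioned above, on a throwaway tensor rather than the model itself:
x = tf.random.uniform((2, 300, 50))
layers.Flatten()(x).shape # TensorShape([2, 15000])
layers.GlobalAveragePooling1D()(x).shape # TensorShape([2, 50])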
# compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss="binary_crossentropy",
              metrics=["acc"])
# train the model
history = model.fit(x_train,y_train,epochs=15,batch_size=256,validation_data=(x_test,y_test))
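One way to visualize the curves recorded in history; a sketch, and matplotlib is an extra dependency not imported anywhere above:
import matplotlib.pyplot as plt
plt.plot(history.epoch, history.history["acc"], label="acc")
plt.plot(history.epoch, history.history["val_acc"], label="val_acc")
plt.legend()
plt.show()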
Cats vs. dogs dataset example
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import glob
image_filenames = glob.glob("F:/py/ziliao/数据集/猫狗数据集/dc/train/*.jpg") # paths of all training images
dataset_test = glob.glob("F:/py/ziliao/数据集/猫狗数据集/dc/test/*.jpg") # paths of all test images
image_filenames = np.random.permutation(image_filenames) # shuffle the image paths
# label is 1.0 for cat, 0.0 for dog, parsed from the filename (on Windows, glob puts a backslash before the basename)
labels = list(map(lambda x: float(x.split("\\")[1].split(".")[0] == "cat"), image_filenames))
dataset = tf.data.Dataset.from_tensor_slices((image_filenames, labels))
dataset
labels_test = list(map(lambda x: float(x.split("\\")[1].split(".")[0] == "cat"), dataset_test))
test_dataset = tf.data.Dataset.from_tensor_slices((dataset_test, labels_test))
test_dataset
# image preprocessing function
def _pre_read(img_filename, label):
    image = tf.io.read_file(img_filename)             # read the raw bytes
    image = tf.image.decode_jpeg(image, channels=3)   # decode to an RGB tensor
    image = tf.image.resize(image, (200, 200))        # resize to 200x200
    image = tf.reshape(image, [200, 200, 3])          # pin down the static shape
    image = tf.image.per_image_standardization(image) # normalize to zero mean, unit variance
    return image, label
dataset = dataset.map(_pre_read)
dataset = dataset.shuffle(300)
dataset = dataset.repeat() # with no argument, repeat indefinitely
dataset = dataset.batch(32)
dataset
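A quick check that the pipeline yields what the model expects; this pulls a single batch and is only a sketch:
imgs, labs = next(iter(dataset))
imgs.shape, labs.shape # (TensorShape([32, 200, 200, 3]), TensorShape([32]))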
test_dataset = test_dataset.map(_pre_read)
test_dataset = test_dataset.shuffle(300)
test_dataset = test_dataset.repeat() # with no argument, repeat indefinitely
test_dataset = test_dataset.batch(32)
test_dataset
# CNN tuning: stacking more conv layers and widening them increases capacity (a tighter fit); dropout counteracts overfitting
# build the model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv2D(64,(3,3),
input_shape=(200,200,3),
activation="relu",
padding="same"))
model.add(tf.keras.layers.Conv2D(64,(3,3),activation="relu",padding="same"))
model.add(tf.keras.layers.Conv2D(64,(3,3),activation="relu",padding="same"))
model.add(tf.keras.layers.MaxPool2D())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Conv2D(128,(3,3),activation="relu",padding="same"))
model.add(tf.keras.layers.Conv2D(128,(3,3),activation="relu",padding="same"))
model.add(tf.keras.layers.MaxPool2D())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Conv2D(256,(3,3),activation="relu",padding="same"))
model.add(tf.keras.layers.Conv2D(256,(3,3),activation="relu",padding="same"))
model.add(tf.keras.layers.MaxPool2D())
model.add(tf.keras.layers.GlobalAveragePooling2D())
model.add(tf.keras.layers.Dense(256,activation="relu"))
model.add(tf.keras.layers.Dense(1,activation="sigmoid"))
model.summary()
# compile the model
model.compile(optimizer="adam",
loss="binary_crossentropy",
metrics=["acc"])
# train the model
history = model.fit(dataset, epochs=10,
                    steps_per_epoch=781, # ≈ number of training images / batch size of 32
                    validation_data=test_dataset,
                    validation_steps=781)
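After training, a single image can be scored by preprocessing it the same way _pre_read does; a sketch, where "some_image.jpg" is a placeholder path, not a file from the original:
img = tf.io.read_file("some_image.jpg") # placeholder path, substitute a real file
img = tf.image.decode_jpeg(img, channels=3)
img = tf.image.resize(img, (200, 200))
img = tf.image.per_image_standardization(img)
pred = model.predict(tf.expand_dims(img, 0)) # add a batch dimension: (1, 200, 200, 3)
"cat" if pred[0][0] > 0.5 else "dog" # sigmoid output > 0.5 means label 1.0 ("cat")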