Professor Fei-Fei Li of the Stanford Artificial Intelligence Laboratory describes three elements needed to realize artificial intelligence: syntax, semantics, and inference, applied to both language and vision. Syntax (grammatical parsing for language, 3D structural parsing for vision) and semantics (linguistic meaning, the meaning of objects and actions in vision) serve as model inputs and training data; from them the model acquires inference ability, so that what it has learned can be applied to new data and conclusions drawn from it. See "The Syntax, Semantics and Inference Mechanism in Natural Language": http://www.aaai.org/Papers/Symposia/Fall/1996/FS-96-04/FS96-04-010.pdf .
Image captioning ("look at a picture and tell its story"). Given an input image, the model produces a natural-language description of the image content, in effect translating between visual information and text. Code: https://github.com/tensorflow/models/tree/master/research/im2txt .
Principle. An encoder-decoder framework: the image is encoded into a fixed-length intermediate vector, which is then decoded into a natural-language description. The encoder is the Inception V3 image-recognition model; the decoder is an LSTM network. Let {s0, s1, …, s(n-1)} be the caption words and {We·s0, We·s1, …, We·s(n-1)} the corresponding word-embedding vectors. At each step the LSTM outputs a probability distribution over the next word of the sentence, {p1, p2, …, pn}; {log p1(s1), log p2(s2), …, log pn(sn)} are the log-likelihoods of the correct word at each step, and the negative of their sum is the objective the model minimizes.
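Written out as a formula (a sketch in LaTeX, following the notation of the Show and Tell paper, where I is the image and S = (s_1, …, s_N) the caption):

    L(I, S) = -\sum_{t=1}^{N} \log p_t(s_t)

Training minimizes this negative log-likelihood summed over all image-caption pairs; in the code below it shows up as batch_loss, the mask-weighted mean of the per-word cross-entropy losses.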
Best practice. The Microsoft COCO Caption dataset, http://mscoco.org/ , built on the Microsoft Common Objects in Context (COCO) dataset: more than 300,000 images and 2 million labeled object instances. For the roughly 330,000 images of the original COCO dataset, Amazon Mechanical Turk workers wrote at least five captions per image, for more than 1.5 million caption sentences in total. There are 2014 and 2015 releases; the 2014 release has 82,783 training images, 40,504 validation images, and 40,775 test images.
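For orientation, the caption annotations are distributed as JSON; the sketch below shows the rough shape of such a file (the field names follow the published COCO captions format as I understand it; the file name, ids and caption text are made-up examples):

# Rough shape of a COCO captions annotation file, e.g. captions_train2014.json.
# The ids, file name and caption text below are hypothetical examples.
captions_train2014 = {
    "images": [
        {"id": 12345, "file_name": "COCO_train2014_000000012345.jpg",
         "height": 480, "width": 640},
    ],
    "annotations": [
        {"id": 987001, "image_id": 12345,
         "caption": "A man riding a motorcycle down a dirt road."},
        {"id": 987002, "image_id": 12345,
         "caption": "A person on a motorbike in the countryside."},
        # ... at least five captions per image ...
    ],
}

Note that the model itself does not read this JSON directly: build_inputs in show_and_tell_model.py below consumes sharded TFRecord files of serialized SequenceExample protos (see input_ops.parse_sequence_example), which are produced from the raw images and captions in a separate preprocessing step.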
TensorFlow-Slim image classification library: https://github.com/tensorflow/models/tree/master/research/inception/inception/slim .
Building the model. show_and_tell_model.py.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from im2txt.ops import image_embedding
from im2txt.ops import image_processing
from im2txt.ops import inputs as input_ops


class ShowAndTellModel(object):
  """Image-to-text implementation based on http://arxiv.org/abs/1411.4555.

  "Show and Tell: A Neural Image Caption Generator"
  Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan
  """

  def __init__(self, config, mode, train_inception=False):
    """Basic setup.

    Args:
      config: Object containing configuration parameters.
      mode: "train", "eval" or "inference".
      train_inception: Whether the inception submodel variables are trainable.
    """
    assert mode in ["train", "eval", "inference"]
    self.config = config
    self.mode = mode
    self.train_inception = train_inception

    # Reader for the input data.
    self.reader = tf.TFRecordReader()

    # To match the "Show and Tell" paper we initialize all variables with a
    # random uniform initializer.
    self.initializer = tf.random_uniform_initializer(
        minval=-self.config.initializer_scale,
        maxval=self.config.initializer_scale)

    # A float32 Tensor with shape [batch_size, height, width, channels].
    self.images = None

    # An int32 Tensor with shape [batch_size, padded_length].
    self.input_seqs = None

    # An int32 Tensor with shape [batch_size, padded_length].
    self.target_seqs = None

    # An int32 0/1 Tensor with shape [batch_size, padded_length].
    self.input_mask = None

    # A float32 Tensor with shape [batch_size, embedding_size].
    self.image_embeddings = None

    # A float32 Tensor with shape [batch_size, padded_length, embedding_size].
    self.seq_embeddings = None

    # A float32 scalar Tensor; the total loss for the trainer to optimize.
    self.total_loss = None

    # A float32 Tensor with shape [batch_size * padded_length].
    self.target_cross_entropy_losses = None

    # A float32 Tensor with shape [batch_size * padded_length].
    self.target_cross_entropy_loss_weights = None

    # Collection of variables from the inception submodel.
    self.inception_variables = []

    # Function to restore the inception submodel from checkpoint.
    self.init_fn = None

    # Global step Tensor.
    self.global_step = None

  def is_training(self):
    """Returns true if the model is built for training mode."""
    return self.mode == "train"

  def process_image(self, encoded_image, thread_id=0):
    """Decodes and processes an image string.

    Args:
      encoded_image: A scalar string Tensor; the encoded image.
      thread_id: Preprocessing thread id used to select the ordering of color
        distortions.

    Returns:
      A float32 Tensor of shape [height, width, 3]; the processed image.
    """
    return image_processing.process_image(encoded_image,
                                          is_training=self.is_training(),
                                          height=self.config.image_height,
                                          width=self.config.image_width,
                                          thread_id=thread_id,
                                          image_format=self.config.image_format)

  def build_inputs(self):
    """Input prefetching, preprocessing and batching.

    Outputs:
      self.images
      self.input_seqs
      self.target_seqs (training and eval only)
      self.input_mask (training and eval only)
    """
    if self.mode == "inference":
      # In inference mode, images and inputs are fed via placeholders.
      image_feed = tf.placeholder(dtype=tf.string, shape=[], name="image_feed")
      input_feed = tf.placeholder(dtype=tf.int64,
                                  shape=[None],  # batch_size
                                  name="input_feed")

      # Process image and insert batch dimensions.
      images = tf.expand_dims(self.process_image(image_feed), 0)
      input_seqs = tf.expand_dims(input_feed, 1)

      # No target sequences or input mask in inference mode.
      target_seqs = None
      input_mask = None
    else:
      # Prefetch serialized SequenceExample protos.
      input_queue = input_ops.prefetch_input_data(
          self.reader,
          self.config.input_file_pattern,
          is_training=self.is_training(),
          batch_size=self.config.batch_size,
          values_per_shard=self.config.values_per_input_shard,
          input_queue_capacity_factor=self.config.input_queue_capacity_factor,
          num_reader_threads=self.config.num_input_reader_threads)

      # Image processing and random distortion. Split across multiple threads
      # with each thread applying a slightly different distortion.
      assert self.config.num_preprocess_threads % 2 == 0
      images_and_captions = []
      for thread_id in range(self.config.num_preprocess_threads):
        serialized_sequence_example = input_queue.dequeue()
        encoded_image, caption = input_ops.parse_sequence_example(
            serialized_sequence_example,
            image_feature=self.config.image_feature_name,
            caption_feature=self.config.caption_feature_name)
        image = self.process_image(encoded_image, thread_id=thread_id)
        images_and_captions.append([image, caption])

      # Batch inputs.
      queue_capacity = (2 * self.config.num_preprocess_threads *
                        self.config.batch_size)
      images, input_seqs, target_seqs, input_mask = (
          input_ops.batch_with_dynamic_pad(images_and_captions,
                                           batch_size=self.config.batch_size,
                                           queue_capacity=queue_capacity))

    self.images = images
    self.input_seqs = input_seqs
    self.target_seqs = target_seqs
    self.input_mask = input_mask

  def build_image_embeddings(self):
    """Builds the image model subgraph and generates image embeddings.

    Inputs:
      self.images

    Outputs:
      self.image_embeddings
    """
    inception_output = image_embedding.inception_v3(
        self.images,
        trainable=self.train_inception,
        is_training=self.is_training())
    self.inception_variables = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope="InceptionV3")

    # Map inception output into embedding space.
    with tf.variable_scope("image_embedding") as scope:
      image_embeddings = tf.contrib.layers.fully_connected(
          inputs=inception_output,
          num_outputs=self.config.embedding_size,
          activation_fn=None,
          weights_initializer=self.initializer,
          biases_initializer=None,
          scope=scope)

    # Save the embedding size in the graph.
    tf.constant(self.config.embedding_size, name="embedding_size")

    self.image_embeddings = image_embeddings

  def build_seq_embeddings(self):
    """Builds the input sequence embeddings.

    Inputs:
      self.input_seqs

    Outputs:
      self.seq_embeddings
    """
    with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"):
      embedding_map = tf.get_variable(
          name="map",
          shape=[self.config.vocab_size, self.config.embedding_size],
          initializer=self.initializer)
      seq_embeddings = tf.nn.embedding_lookup(embedding_map, self.input_seqs)

    self.seq_embeddings = seq_embeddings

  def build_model(self):
    """Builds the model.

    Inputs:
      self.image_embeddings
      self.seq_embeddings
      self.target_seqs (training and eval only)
      self.input_mask (training and eval only)

    Outputs:
      self.total_loss (training and eval only)
      self.target_cross_entropy_losses (training and eval only)
      self.target_cross_entropy_loss_weights (training and eval only)
    """
    # This LSTM cell has biases and outputs tanh(new_c) * sigmoid(o), but the
    # modified LSTM in the "Show and Tell" paper has no biases and outputs
    # new_c * sigmoid(o).
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(
        num_units=self.config.num_lstm_units, state_is_tuple=True)
    if self.mode == "train":
      lstm_cell = tf.contrib.rnn.DropoutWrapper(
          lstm_cell,
          input_keep_prob=self.config.lstm_dropout_keep_prob,
          output_keep_prob=self.config.lstm_dropout_keep_prob)

    with tf.variable_scope("lstm", initializer=self.initializer) as lstm_scope:
      # Feed the image embeddings to set the initial LSTM state.
      zero_state = lstm_cell.zero_state(
          batch_size=self.image_embeddings.get_shape()[0], dtype=tf.float32)
      _, initial_state = lstm_cell(self.image_embeddings, zero_state)

      # Allow the LSTM variables to be reused.
      lstm_scope.reuse_variables()

      if self.mode == "inference":
        # In inference mode, use concatenated states for convenient feeding and
        # fetching.
        tf.concat(axis=1, values=initial_state, name="initial_state")

        # Placeholder for feeding a batch of concatenated states.
        state_feed = tf.placeholder(dtype=tf.float32,
                                    shape=[None, sum(lstm_cell.state_size)],
                                    name="state_feed")
        state_tuple = tf.split(value=state_feed, num_or_size_splits=2, axis=1)

        # Run a single LSTM step.
        lstm_outputs, state_tuple = lstm_cell(
            inputs=tf.squeeze(self.seq_embeddings, axis=[1]),
            state=state_tuple)

        # Concatenate the resulting state.
        tf.concat(axis=1, values=state_tuple, name="state")
      else:
        # Run the batch of sequence embeddings through the LSTM.
        sequence_length = tf.reduce_sum(self.input_mask, 1)
        lstm_outputs, _ = tf.nn.dynamic_rnn(cell=lstm_cell,
                                            inputs=self.seq_embeddings,
                                            sequence_length=sequence_length,
                                            initial_state=initial_state,
                                            dtype=tf.float32,
                                            scope=lstm_scope)

    # Stack batches vertically.
    lstm_outputs = tf.reshape(lstm_outputs, [-1, lstm_cell.output_size])

    with tf.variable_scope("logits") as logits_scope:
      logits = tf.contrib.layers.fully_connected(
          inputs=lstm_outputs,
          num_outputs=self.config.vocab_size,
          activation_fn=None,
          weights_initializer=self.initializer,
          scope=logits_scope)

    if self.mode == "inference":
      tf.nn.softmax(logits, name="softmax")
    else:
      targets = tf.reshape(self.target_seqs, [-1])
      weights = tf.to_float(tf.reshape(self.input_mask, [-1]))

      # Compute losses.
      losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
                                                              logits=logits)
      batch_loss = tf.div(tf.reduce_sum(tf.multiply(losses, weights)),
                          tf.reduce_sum(weights),
                          name="batch_loss")
      tf.losses.add_loss(batch_loss)
      total_loss = tf.losses.get_total_loss()

      # Add summaries.
      tf.summary.scalar("losses/batch_loss", batch_loss)
      tf.summary.scalar("losses/total_loss", total_loss)
      for var in tf.trainable_variables():
        tf.summary.histogram("parameters/" + var.op.name, var)

      self.total_loss = total_loss
      self.target_cross_entropy_losses = losses  # Used in evaluation.
      self.target_cross_entropy_loss_weights = weights  # Used in evaluation.

  def setup_inception_initializer(self):
    """Sets up the function to restore inception variables from checkpoint."""
    if self.mode != "inference":
      # Restore inception variables only.
      saver = tf.train.Saver(self.inception_variables)

      def restore_fn(sess):
        tf.logging.info("Restoring Inception variables from checkpoint file %s",
                        self.config.inception_checkpoint_file)
        saver.restore(sess, self.config.inception_checkpoint_file)

      self.init_fn = restore_fn

  def setup_global_step(self):
    """Sets up the global step Tensor."""
    global_step = tf.Variable(
        initial_value=0,
        name="global_step",
        trainable=False,
        collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

    self.global_step = global_step

  def build(self):
    """Creates all ops for training and evaluation."""
    # Assemble the model:
    self.build_inputs()                 # Build the input data pipeline.
    self.build_image_embeddings()       # Build the Inception V3 image model; output image embeddings.
    self.build_seq_embeddings()         # Build the input sequence embeddings.
    self.build_model()                  # Chain the CNN and LSTM into the full model.
    self.setup_inception_initializer()  # Load the pretrained Inception V3 checkpoint.
    self.setup_global_step()            # Track the global training step.
Training the model. train.py.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from im2txt import configuration
from im2txt import show_and_tell_model

FLAGS = tf.app.flags.FLAGS

tf.flags.DEFINE_string("input_file_pattern", "",
                       "File pattern of sharded TFRecord input files.")
tf.flags.DEFINE_string("inception_checkpoint_file", "",
                       "Path to a pretrained inception_v3 model.")
tf.flags.DEFINE_string("train_dir", "",
                       "Directory for saving and loading model checkpoints.")
tf.flags.DEFINE_boolean("train_inception", False,
                        "Whether to train inception submodel variables.")
tf.flags.DEFINE_integer("number_of_steps", 1000000, "Number of training steps.")
tf.flags.DEFINE_integer("log_every_n_steps", 1,
                        "Frequency at which loss and global step are logged.")

tf.logging.set_verbosity(tf.logging.INFO)


def main(unused_argv):
  assert FLAGS.input_file_pattern, "--input_file_pattern is required"
  assert FLAGS.train_dir, "--train_dir is required"

  model_config = configuration.ModelConfig()
  model_config.input_file_pattern = FLAGS.input_file_pattern
  model_config.inception_checkpoint_file = FLAGS.inception_checkpoint_file
  training_config = configuration.TrainingConfig()

  # Create training directory.
  train_dir = FLAGS.train_dir
  if not tf.gfile.IsDirectory(train_dir):
    tf.logging.info("Creating training directory: %s", train_dir)
    tf.gfile.MakeDirs(train_dir)

  # Build the TensorFlow graph.
  g = tf.Graph()
  with g.as_default():
    # Build the model.
    model = show_and_tell_model.ShowAndTellModel(
        model_config, mode="train", train_inception=FLAGS.train_inception)
    model.build()

    # Set up the learning rate.
    learning_rate_decay_fn = None
    if FLAGS.train_inception:
      learning_rate = tf.constant(training_config.train_inception_learning_rate)
    else:
      learning_rate = tf.constant(training_config.initial_learning_rate)
      if training_config.learning_rate_decay_factor > 0:
        num_batches_per_epoch = (training_config.num_examples_per_epoch /
                                 model_config.batch_size)
        decay_steps = int(num_batches_per_epoch *
                          training_config.num_epochs_per_decay)

        def _learning_rate_decay_fn(learning_rate, global_step):
          return tf.train.exponential_decay(
              learning_rate,
              global_step,
              decay_steps=decay_steps,
              decay_rate=training_config.learning_rate_decay_factor,
              staircase=True)

        learning_rate_decay_fn = _learning_rate_decay_fn

    # Set up the training ops.
    train_op = tf.contrib.layers.optimize_loss(
        loss=model.total_loss,
        global_step=model.global_step,
        learning_rate=learning_rate,
        optimizer=training_config.optimizer,
        clip_gradients=training_config.clip_gradients,
        learning_rate_decay_fn=learning_rate_decay_fn)

    # Set up the Saver for saving and restoring model checkpoints.
    saver = tf.train.Saver(max_to_keep=training_config.max_checkpoints_to_keep)

  # Run training.
  tf.contrib.slim.learning.train(
      train_op,
      train_dir,
      log_every_n_steps=FLAGS.log_every_n_steps,
      graph=g,
      global_step=model.global_step,
      number_of_steps=FLAGS.number_of_steps,
      init_fn=model.init_fn,
      saver=saver)


if __name__ == "__main__":
  tf.app.run()
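To see what the staircase schedule configured above actually does, here is a small self-contained sketch that mirrors the formula used by tf.train.exponential_decay with staircase=True. The configuration values are hypothetical stand-ins; the real ones come from configuration.TrainingConfig.

# Sketch of the learning-rate schedule; all numeric values are assumed examples.
initial_learning_rate = 2.0
learning_rate_decay_factor = 0.5
num_examples_per_epoch = 586363   # assumed size of the training split
batch_size = 32
num_epochs_per_decay = 8.0

num_batches_per_epoch = num_examples_per_epoch / batch_size
decay_steps = int(num_batches_per_epoch * num_epochs_per_decay)

def decayed_lr(global_step):
    # staircase=True makes the exponent an integer, so the rate stays constant
    # for decay_steps steps and is then multiplied by the decay factor.
    return initial_learning_rate * learning_rate_decay_factor ** (global_step // decay_steps)

print(decay_steps)                # steps between decays
print(decayed_lr(0))              # 2.0
print(decayed_lr(decay_steps))    # 1.0 after the first decay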
Generating predictions (captions). run_inference.py.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import os

import tensorflow as tf

from im2txt import configuration
from im2txt import inference_wrapper
from im2txt.inference_utils import caption_generator
from im2txt.inference_utils import vocabulary

FLAGS = tf.flags.FLAGS

tf.flags.DEFINE_string("checkpoint_path", "",
                       "Model checkpoint file or directory containing a "
                       "model checkpoint file.")
tf.flags.DEFINE_string("vocab_file", "", "Text file containing the vocabulary.")
tf.flags.DEFINE_string("input_files", "",
                       "File pattern or comma-separated list of file patterns "
                       "of image files.")

tf.logging.set_verbosity(tf.logging.INFO)


def main(_):
  # Build the inference graph.
  g = tf.Graph()
  with g.as_default():
    model = inference_wrapper.InferenceWrapper()
    restore_fn = model.build_graph_from_config(configuration.ModelConfig(),
                                               FLAGS.checkpoint_path)
  g.finalize()

  # Create the vocabulary.
  vocab = vocabulary.Vocabulary(FLAGS.vocab_file)

  filenames = []
  for file_pattern in FLAGS.input_files.split(","):
    filenames.extend(tf.gfile.Glob(file_pattern))
  tf.logging.info("Running caption generation on %d files matching %s",
                  len(filenames), FLAGS.input_files)

  with tf.Session(graph=g) as sess:
    # Load the model from checkpoint.
    restore_fn(sess)

    # Prepare the caption generator. Here we are implicitly using the default
    # beam search parameters. See caption_generator.py for a description of the
    # available beam search parameters.
    generator = caption_generator.CaptionGenerator(model, vocab)

    for filename in filenames:
      with tf.gfile.GFile(filename, "r") as f:
        image = f.read()
      captions = generator.beam_search(sess, image)
      print("Captions for image %s:" % os.path.basename(filename))
      for i, caption in enumerate(captions):
        # Ignore begin and end words.
        sentence = [vocab.id_to_word(w) for w in caption.sentence[1:-1]]
        sentence = " ".join(sentence)
        print("  %d) %s (p=%f)" % (i, sentence, math.exp(caption.logprob)))


if __name__ == "__main__":
  tf.app.run()
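The heavy lifting at inference time happens in caption_generator.CaptionGenerator, which performs beam search over the LSTM's next-word distribution. The snippet below is a rough, self-contained illustration of the idea, not the im2txt implementation; the helper next_word_logprobs is a hypothetical stand-in for the model. A beam search keeps the beam_size best partial sentences, ranked by accumulated log probability, which is the caption.logprob printed above.

# Toy beam search over a generic next-word model (illustrative only).
def beam_search_sketch(next_word_logprobs, start_id, end_id, beam_size=3, max_len=20):
    """next_word_logprobs(prefix) -> dict of word_id -> log probability.

    Returns complete sentences sorted by total log probability (best first).
    """
    beams = [([start_id], 0.0)]   # (partial sentence, accumulated log probability)
    complete = []
    for _ in range(max_len):
        candidates = []
        for sentence, logprob in beams:
            for word_id, lp in next_word_logprobs(sentence).items():
                candidates.append((sentence + [word_id], logprob + lp))
        # Keep only the beam_size best partial sentences this step.
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for sentence, logprob in candidates[:beam_size]:
            if sentence[-1] == end_id:
                complete.append((sentence, logprob))
            else:
                beams.append((sentence, logprob))
        if not beams:
            break
    return sorted(complete or beams, key=lambda c: c[1], reverse=True)

Conceptually the real generator works the same way, except that it expands each partial sentence by feeding the LSTM state tensors exposed in build_model ("initial_state", "state_feed", "softmax") through the session.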
References:
《TensorFlow技术解析与实战》
Recommendations for machine learning positions in Shanghai are welcome; my WeChat: qingxingfengzi