• Deep Learning with Python Notes 08 -- Common Methods for Working with Text Data


    6.1 Working with text data

    6.1.1 One-hot encoding of words and characters

    (1) Word-level one-hot encoding:

        # Word-level one-hot encoding
        import numpy as np

        # Initial data: each element of the list is one sample
        # (here a sample is a single sentence, but it could be an entire document)
        samples = ['The cat sat on the mat.', 'The dog ate my homework.']

        # Build an index of all tokens in the data
        token_index = {}
        for sample in samples:
            # Tokenize the samples with the split method. In a real application you
            # would also strip punctuation and special characters from the samples.
            for word in sample.split():
                if word not in token_index:
                    # Assign a unique index to each unique word.
                    # Index 0 is not assigned to any word.
                    token_index[word] = len(token_index) + 1

        # Vectorize the samples. Only the first max_length words of each sample are considered.
        max_length = 10

        # The encoding is stored in `results`, a 3D tensor: the first axis is the sample,
        # the second axis is the position of the word within the sample, and the third
        # axis is the one-hot vector for that word.
        results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
        for i, sample in enumerate(samples):
            for j, word in list(enumerate(sample.split()))[:max_length]:
                index = token_index.get(word)
                results[i, j, index] = 1.  # mark the word occurring at this position with 1.
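
    As a quick sanity check (not part of the original listing), you can inspect the tensor built above: these two sample sentences contain 10 distinct tokens, so index 0 is left unused and each word vector has 11 entries.

        print(results.shape)        # (2, 10, 11): 2 samples, 10 word positions, 10 tokens + unused index 0
        print(token_index['cat'])   # 2 -- the column set to 1. for 'cat'
        print(results[0, 1])        # one-hot vector for the second word of the first sample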

    (2) Character-level one-hot encoding:

        # Character-level one-hot encoding
        import string
        import numpy as np

        samples = ['The cat sat on the mat.', 'The dog ate my homework.']
        characters = string.printable  # all printable ASCII characters
        token_index = dict(zip(characters, range(1, len(characters) + 1)))

        max_length = 50
        results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
        for i, sample in enumerate(samples):
            for j, character in enumerate(sample[:max_length]):
                index = token_index.get(character)
                results[i, j, index] = 1.
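
    A quick check of the character-level encoding (using the variables from the listing above): string.printable contains 100 characters, so each character vector has 101 entries, with index 0 again unused.

        print(len(characters))   # 100
        print(results.shape)     # (2, 50, 101)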

    (3) Word-level one-hot encoding with Keras:

        from keras.preprocessing.text import Tokenizer

        samples = ['The cat sat on the mat.', 'The dog ate my homework.']

        # Create a tokenizer configured to only take into account
        # the 1,000 most common words
        tokenizer = Tokenizer(num_words=1000)
        # Build the word index
        tokenizer.fit_on_texts(samples)

        # Turn the strings into lists of integer indices
        sequences = tokenizer.texts_to_sequences(samples)

        # You could also directly get the one-hot binary representations.
        # The tokenizer supports other vectorization modes besides one-hot encoding.
        one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

        # Recover the word index that was computed
        word_index = tokenizer.word_index
        print('Found %s unique tokens.' % len(word_index))
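
    For these two sample sentences, the tokenizer (which lowercases and strips punctuation by default) produces output along these lines:

        print(sequences)              # e.g. [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
        print(one_hot_results.shape)  # (2, 1000): one row of 0s and 1s per sample
        print(word_index['the'])      # 1 -- indices start at 1; 0 is reserved for padding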

    (4) Word-level one-hot encoding with the hashing trick:

        # Word-level one-hot encoding with the hashing trick
        import numpy as np

        samples = ['The cat sat on the mat.', 'The dog ate my homework.']

        # Store the words as vectors of size 1,000. If you have close to 1,000 words
        # (or more), you will see many hash collisions, which will decrease
        # the accuracy of this encoding method.
        dimensionality = 1000
        max_length = 10

        results = np.zeros((len(samples), max_length, dimensionality))
        for i, sample in enumerate(samples):
            for j, word in list(enumerate(sample.split()))[:max_length]:
                # Hash the word into a "random" integer index between 0 and 1,000
                index = abs(hash(word)) % dimensionality
                results[i, j, index] = 1.
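
    One caveat worth noting: Python's built-in hash is salted per process for strings (hash randomization), so the indices produced above are not reproducible across runs. A minimal sketch of a stable alternative using hashlib (this helper is illustrative and not part of the book's listing):

        import hashlib

        def stable_index(word, dimensionality=1000):
            # The MD5 digest of the UTF-8 bytes is identical in every process,
            # so the same word always maps to the same column.
            digest = hashlib.md5(word.encode('utf-8')).hexdigest()
            return int(digest, 16) % dimensionality

        print(stable_index('cat'))  # same value on every run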

    6.1.2 Using word embeddings

    (1) Learning word embeddings with the Embedding layer:

        from keras.layers import Embedding

        # The Embedding layer takes at least two arguments: the number of possible
        # tokens (here 1,000, i.e. maximum word index + 1) and the dimensionality
        # of the embeddings (here 64).
        embedding_layer = Embedding(1000, 64)

        from keras.datasets import imdb
        from keras import preprocessing

        # Number of words to consider as features
        max_features = 10000
        # Cut off the texts after this number of words
        # (among the max_features most common words)
        maxlen = 20

        # Load the data as lists of integers
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

        # Turn the lists of integers into a 2D integer tensor of shape (samples, maxlen)
        x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
        x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)


        from keras.models import Sequential
        from keras.layers import Flatten, Dense

        model = Sequential()
        # Specify the maximum input length of the Embedding layer
        # so the embedded inputs can be flattened later.
        model.add(Embedding(max_features, 8, input_length=maxlen))
        # After the Embedding layer, the activations have shape (samples, maxlen, 8)

        # Flatten the 3D tensor of embeddings into a 2D tensor of shape (samples, maxlen * 8)
        model.add(Flatten())

        # Add the classifier on top
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
        # model.summary()

        history = model.fit(x_train, y_train,
                            epochs=10,
                            batch_size=32,
                            validation_split=0.2)
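
    After training, the learned embeddings can be pulled out of the first layer if you want to inspect or reuse them (a small sketch, assuming the model trained above):

        # The Embedding layer stores a single weight matrix of shape (max_features, 8):
        # one 8-dimensional vector per word index.
        learned_embeddings = model.layers[0].get_weights()[0]
        print(learned_embeddings.shape)   # (10000, 8)
        print(learned_embeddings[1][:4])  # first few coordinates of the vector for word index 1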

    (2) Using pretrained word embeddings:

        import os

        imdb_dir = '/home/ubuntu/data/aclImdb'
        train_dir = os.path.join(imdb_dir, 'train')

        labels = []
        texts = []

        # Read the raw IMDB reviews and their labels (0 = negative, 1 = positive)
        for label_type in ['neg', 'pos']:
            dir_name = os.path.join(train_dir, label_type)
            for fname in os.listdir(dir_name):
                if fname[-4:] == '.txt':
                    f = open(os.path.join(dir_name, fname))
                    texts.append(f.read())
                    f.close()
                    if label_type == 'neg':
                        labels.append(0)
                    else:
                        labels.append(1)

        from keras.preprocessing.text import Tokenizer
        from keras.preprocessing.sequence import pad_sequences
        import numpy as np

        maxlen = 100  # Cut the reviews off after 100 words
        training_samples = 200  # Train on 200 samples
        validation_samples = 10000  # Validate on 10,000 samples
        max_words = 10000  # Only consider the top 10,000 words in the dataset

        tokenizer = Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)

        word_index = tokenizer.word_index
        print('Found %s unique tokens.' % len(word_index))

        data = pad_sequences(sequences, maxlen=maxlen)

        labels = np.asarray(labels)
        print('Shape of data tensor:', data.shape)
        print('Shape of label tensor:', labels.shape)

        # Split the data into a training set and a validation set.
        # But first, shuffle the data, since the samples are ordered
        # (all negative reviews first, then all positive ones).
        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]

        x_train = data[:training_samples]
        y_train = labels[:training_samples]
        x_val = data[training_samples: training_samples + validation_samples]
        y_val = labels[training_samples: training_samples + validation_samples]
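
        # (Quick shape check, not in the original listing.) With the settings above,
        # the training set is deliberately tiny: x_train has shape (200, 100) and
        # x_val has shape (10000, 100).
        print(x_train.shape, y_train.shape)
        print(x_val.shape, y_val.shape)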
        glove_dir = '/home/ubuntu/data/'

        # Parse the GloVe word-embeddings file: build a dict mapping words
        # to their 100-dimensional embedding vectors.
        embeddings_index = {}
        f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()

        print('Found %s word vectors.' % len(embeddings_index))

        embedding_dim = 100

        # Build an embedding matrix that can be loaded into an Embedding layer:
        # row i holds the GloVe vector for the word with index i in word_index.
        embedding_matrix = np.zeros((max_words, embedding_dim))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if i < max_words:
                if embedding_vector is not None:
                    # Words not found in the embedding index will be all zeros.
                    embedding_matrix[i] = embedding_vector
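
        # (Optional sanity check, not in the original listing.) Count how many of the
        # top max_words tokens actually received a pretrained GloVe vector; the rest
        # remain all-zero rows in embedding_matrix.
        hits = sum(1 for word, i in word_index.items()
                   if i < max_words and embeddings_index.get(word) is not None)
        print('Pretrained vectors found for %d of %d words' % (hits, max_words))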
        from keras.models import Sequential
        from keras.layers import Embedding, Flatten, Dense

        model = Sequential()
        model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
        model.add(Flatten())
        model.add(Dense(32, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.summary()

        # Load the pretrained GloVe matrix into the Embedding layer and freeze it,
        # so the pretrained vectors are not modified during training.
        model.layers[0].set_weights([embedding_matrix])
        model.layers[0].trainable = False

        model.compile(optimizer='rmsprop',
                      loss='binary_crossentropy',
                      metrics=['acc'])
        history = model.fit(x_train, y_train,
                            epochs=10,
                            batch_size=32,
                            validation_data=(x_val, y_val))
        model.save_weights('pre_trained_glove_model.h5')

        # Plot training and validation accuracy and loss
        import matplotlib.pyplot as plt

        acc = history.history['acc']
        val_acc = history.history['val_acc']
        loss = history.history['loss']
        val_loss = history.history['val_loss']

        epochs = range(1, len(acc) + 1)

        plt.plot(epochs, acc, 'bo', label='Training acc')
        plt.plot(epochs, val_acc, 'b', label='Validation acc')
        plt.title('Training and validation accuracy')
        plt.legend()

        plt.figure()

        plt.plot(epochs, loss, 'bo', label='Training loss')
        plt.plot(epochs, val_loss, 'b', label='Validation loss')
        plt.title('Training and validation loss')
        plt.legend()

        plt.show()

        # For comparison: train the same model without pretrained word embeddings
        # (the Embedding layer is learned from scratch).
        from keras.models import Sequential
        from keras.layers import Embedding, Flatten, Dense

        model = Sequential()
        model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
        model.add(Flatten())
        model.add(Dense(32, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.summary()

        model.compile(optimizer='rmsprop',
                      loss='binary_crossentropy',
                      metrics=['acc'])
        history = model.fit(x_train, y_train,
                            epochs=10,
                            batch_size=32,
                            validation_data=(x_val, y_val))

        acc = history.history['acc']
        val_acc = history.history['val_acc']
        loss = history.history['loss']
        val_loss = history.history['val_loss']

        epochs = range(1, len(acc) + 1)

        plt.plot(epochs, acc, 'bo', label='Training acc')
        plt.plot(epochs, val_acc, 'b', label='Validation acc')
        plt.title('Training and validation accuracy')
        plt.legend()

        plt.figure()

        plt.plot(epochs, loss, 'bo', label='Training loss')
        plt.plot(epochs, val_loss, 'b', label='Validation loss')
        plt.title('Training and validation loss')
        plt.legend()

        plt.show()


        # Tokenize the data of the test set
        test_dir = os.path.join(imdb_dir, 'test')

        labels = []
        texts = []

        for label_type in ['neg', 'pos']:
            dir_name = os.path.join(test_dir, label_type)
            for fname in sorted(os.listdir(dir_name)):
                if fname[-4:] == '.txt':
                    f = open(os.path.join(dir_name, fname))
                    texts.append(f.read())
                    f.close()
                    if label_type == 'neg':
                        labels.append(0)
                    else:
                        labels.append(1)

        sequences = tokenizer.texts_to_sequences(texts)
        x_test = pad_sequences(sequences, maxlen=maxlen)
        y_test = np.asarray(labels)


        # Evaluate the saved model (with frozen GloVe embeddings) on the test set
        model.load_weights('pre_trained_glove_model.h5')
        model.evaluate(x_test, y_test)
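
    model.evaluate returns the test loss followed by the metrics passed to compile (here just accuracy), so the result can be unpacked directly (a small usage sketch using the objects defined above):

        test_loss, test_acc = model.evaluate(x_test, y_test)
        print('Test accuracy: %.3f' % test_acc)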
• Original post: https://www.cnblogs.com/asenyang/p/14325257.html