• Comparing fastText and a CNN for text classification on the Keras IMDB dataset: the CNN turns out to be about 10× slower per epoch.


    fasttext:

    '''This example demonstrates the use of fasttext for text classification
    Based on Joulin et al's paper:
    Bags of Tricks for Efficient Text Classification
    https://arxiv.org/abs/1607.01759
    Results on IMDB datasets with uni and bi-gram embeddings:
        Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on i7 CPU.
        Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on GTX 980M GPU.
    '''
    
    from __future__ import print_function
    import numpy as np
    
    from keras.preprocessing import sequence
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.layers import Embedding
    from keras.layers import GlobalAveragePooling1D
    from keras.datasets import imdb
    
    import warnings
    # Private Keras 2.x helper used by load_data() below when `maxlen` is set.
    from keras.preprocessing.sequence import _remove_long_seq
    
    
    def load_data(path='imdb.npz', num_words=None, skip_top=0,
                  maxlen=None, seed=113,
                  start_char=1, oov_char=2, index_from=3, **kwargs):
        """Loads the IMDB dataset.
        # Arguments
            path: where to cache the data (relative to `~/.keras/dataset`).
            num_words: max number of words to include. Words are ranked
                by how often they occur (in the training set) and only
                the most frequent words are kept
            skip_top: skip the top N most frequently occurring words
                (which may not be informative).
            maxlen: sequences longer than this will be filtered out.
            seed: random seed for sample shuffling.
            start_char: The start of a sequence will be marked with this character.
                Set to 1 because 0 is usually the padding character.
            oov_char: words that were cut out because of the `num_words`
                or `skip_top` limit will be replaced with this character.
            index_from: index actual words with this index and higher.
        # Returns
            Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
        # Raises
            ValueError: in case `maxlen` is so low
                that no input sequence could be kept.
        Note that the 'out of vocabulary' character is only used for
        words that were present in the training set but are not included
        because they're not making the `num_words` cut here.
        Words that were not seen in the training set but are in the test set
        have simply been skipped.
        """
        # Legacy support
        if 'nb_words' in kwargs:
            warnings.warn('The `nb_words` argument in `load_data` '
                          'has been renamed `num_words`.')
            num_words = kwargs.pop('nb_words')
        if kwargs:
            raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
    
        #path = get_file(path,
        #                origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
        #                file_hash='599dadb1135973df5b59232a0e9a887c')
        with np.load(path) as f:
            x_train, labels_train = f['x_train'], f['y_train']
            x_test, labels_test = f['x_test'], f['y_test']
    
        np.random.seed(seed)
        indices = np.arange(len(x_train))
        np.random.shuffle(indices)
        x_train = x_train[indices]
        labels_train = labels_train[indices]
    
        indices = np.arange(len(x_test))
        np.random.shuffle(indices)
        x_test = x_test[indices]
        labels_test = labels_test[indices]
    
        xs = np.concatenate([x_train, x_test])
        labels = np.concatenate([labels_train, labels_test])
    
        if start_char is not None:
            xs = [[start_char] + [w + index_from for w in x] for x in xs]
        elif index_from:
            xs = [[w + index_from for w in x] for x in xs]
    
        if maxlen:
            xs, labels = _remove_long_seq(maxlen, xs, labels)
            if not xs:
                raise ValueError('After filtering for sequences shorter than maxlen=' +
                                 str(maxlen) + ', no sequence was kept. '
                                 'Increase maxlen.')
        if not num_words:
            num_words = max([max(x) for x in xs])
    
        # by convention, use 2 as OOV word
        # reserve 'index_from' (=3 by default) characters:
        # 0 (padding), 1 (start), 2 (OOV)
        if oov_char is not None:
            xs = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs]
        else:
            xs = [[w for w in x if skip_top <= w < num_words] for x in xs]
    
        idx = len(x_train)
        x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
        x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
        return (x_train, y_train), (x_test, y_test)
    
    def create_ngram_set(input_list, ngram_value=2):
        """
        Extract a set of n-grams from a list of integers.
        >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
        {(4, 9), (4, 1), (1, 4), (9, 4)}
        >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
        {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
        """
        return set(zip(*[input_list[i:] for i in range(ngram_value)]))
    
    
    def add_ngram(sequences, token_indice, ngram_range=2):
        """
        Augment the input list of list (sequences) by appending n-grams values.
        Example: adding bi-gram
        >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
        >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
        >>> add_ngram(sequences, token_indice, ngram_range=2)
        [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
        Example: adding tri-gram
        >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
        >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
        >>> add_ngram(sequences, token_indice, ngram_range=3)
        [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
        """
        new_sequences = []
        for input_list in sequences:
            new_list = input_list[:]
            for ngram_value in range(2, ngram_range + 1):
                for i in range(len(new_list) - ngram_value + 1):
                    ngram = tuple(new_list[i:i + ngram_value])
                    if ngram in token_indice:
                        new_list.append(token_indice[ngram])
            new_sequences.append(new_list)
    
        return new_sequences
    
    # Set parameters:
    # ngram_range = 2 will add bi-grams features
    ngram_range = 1
    max_features = 20000
    maxlen = 400
    batch_size = 32
    embedding_dims = 50
    epochs = 5
    
    print('Loading data...')
    # the data, split between train and test sets
    #(x_train, y_train), (x_test, y_test) = load_data()
    (x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))
    
    if ngram_range > 1:
        print('Adding {}-gram features'.format(ngram_range))
        # Create set of unique n-gram from the training set.
        ngram_set = set()
        for input_list in x_train:
            for i in range(2, ngram_range + 1):
                set_of_ngram = create_ngram_set(input_list, ngram_value=i)
                ngram_set.update(set_of_ngram)
    
        # Dictionary mapping n-gram token to a unique integer.
        # Integer values are greater than max_features in order
        # to avoid collision with existing features.
        start_index = max_features + 1
        token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
        indice_token = {token_indice[k]: k for k in token_indice}
    
        # max_features is the highest integer that could be found in the dataset.
        max_features = np.max(list(indice_token.keys())) + 1
    
        # Augmenting x_train and x_test with n-grams features
        x_train = add_ngram(x_train, token_indice, ngram_range)
        x_test = add_ngram(x_test, token_indice, ngram_range)
        print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
        print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))
    
    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    
    print('Build model...')
    model = Sequential()
    
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(max_features,
                        embedding_dims,
                        input_length=maxlen))
    
    # we add a GlobalAveragePooling1D, which will average the embeddings
    # of all words in the document
    model.add(GlobalAveragePooling1D())
    
    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs, validation_data=(x_test, y_test))
    

    Results:

    Train on 25000 samples, validate on 25000 samples
    Epoch 1/50
    2018-06-06 15:50:28.133461: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
    25000/25000 [==============================] - 9s 379us/step - loss: 0.6125 - acc: 0.7431 - val_loss: 0.5050 - val_acc: 0.8227
    Epoch 2/50
    25000/25000 [==============================] - 10s 402us/step - loss: 0.4059 - acc: 0.8633 - val_loss: 0.3738 - val_acc: 0.8646
    Epoch 3/50
    25000/25000 [==============================] - 11s 441us/step - loss: 0.3061 - acc: 0.8934 - val_loss: 0.3219 - val_acc: 0.8783
    Epoch 4/50
    25000/25000 [==============================] - 9s 375us/step - loss: 0.2550 - acc: 0.9110 - val_loss: 0.2970 - val_acc: 0.8853
    Epoch 5/50
    

    As the log shows, one epoch takes only about 10 seconds, which is quite fast. However, after letting training run for 50 epochs the training accuracy hit 100% while validation accuracy stayed around 86%, so the model is clearly overfitting.
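
    One simple way to curb that overfitting is early stopping: halt training once validation accuracy stops improving. A minimal sketch (not part of the original post) that could replace the `model.fit(...)` call in the fastText script above; it assumes the Keras 2.x metric name `val_acc` seen in the logs:

    from keras.callbacks import EarlyStopping

    # Stop once validation accuracy has not improved for 3 consecutive epochs.
    early_stop = EarlyStopping(monitor='val_acc', patience=3)

    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=50,
              validation_data=(x_test, y_test),
              callbacks=[early_stop])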

    Now the conventional CNN:

    '''This example demonstrates the use of Convolution1D for text classification.
    Gets to 0.89 test accuracy after 2 epochs.
    90s/epoch on Intel i5 2.4Ghz CPU.
    10s/epoch on Tesla K40 GPU.
    '''
    from __future__ import print_function
    
    from keras.preprocessing import sequence
    from keras.models import Sequential
    from keras.layers import Dense, Dropout, Activation
    from keras.layers import Embedding
    from keras.layers import Conv1D, GlobalMaxPooling1D
    from keras.datasets import imdb
    
    import numpy as np
    import warnings
    # Private Keras 2.x helper used by load_data() below when `maxlen` is set.
    from keras.preprocessing.sequence import _remove_long_seq
    
    
    def load_data(path='imdb.npz', num_words=None, skip_top=0,
                  maxlen=None, seed=113,
                  start_char=1, oov_char=2, index_from=3, **kwargs):
        """Loads the IMDB dataset.
        # Arguments
            path: where to cache the data (relative to `~/.keras/dataset`).
            num_words: max number of words to include. Words are ranked
                by how often they occur (in the training set) and only
                the most frequent words are kept
            skip_top: skip the top N most frequently occurring words
                (which may not be informative).
            maxlen: sequences longer than this will be filtered out.
            seed: random seed for sample shuffling.
            start_char: The start of a sequence will be marked with this character.
                Set to 1 because 0 is usually the padding character.
            oov_char: words that were cut out because of the `num_words`
                or `skip_top` limit will be replaced with this character.
            index_from: index actual words with this index and higher.
        # Returns
            Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
        # Raises
            ValueError: in case `maxlen` is so low
                that no input sequence could be kept.
        Note that the 'out of vocabulary' character is only used for
        words that were present in the training set but are not included
        because they're not making the `num_words` cut here.
        Words that were not seen in the training set but are in the test set
        have simply been skipped.
        """
        # Legacy support
        if 'nb_words' in kwargs:
            warnings.warn('The `nb_words` argument in `load_data` '
                          'has been renamed `num_words`.')
            num_words = kwargs.pop('nb_words')
        if kwargs:
            raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
    
        #path = get_file(path,
        #                origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
        #                file_hash='599dadb1135973df5b59232a0e9a887c')
        with np.load(path) as f:
            x_train, labels_train = f['x_train'], f['y_train']
            x_test, labels_test = f['x_test'], f['y_test']
    
        np.random.seed(seed)
        indices = np.arange(len(x_train))
        np.random.shuffle(indices)
        x_train = x_train[indices]
        labels_train = labels_train[indices]
    
        indices = np.arange(len(x_test))
        np.random.shuffle(indices)
        x_test = x_test[indices]
        labels_test = labels_test[indices]
    
        xs = np.concatenate([x_train, x_test])
        labels = np.concatenate([labels_train, labels_test])
    
        if start_char is not None:
            xs = [[start_char] + [w + index_from for w in x] for x in xs]
        elif index_from:
            xs = [[w + index_from for w in x] for x in xs]
    
        if maxlen:
            xs, labels = _remove_long_seq(maxlen, xs, labels)
            if not xs:
                raise ValueError('After filtering for sequences shorter than maxlen=' +
                                 str(maxlen) + ', no sequence was kept. '
                                 'Increase maxlen.')
        if not num_words:
            num_words = max([max(x) for x in xs])
    
        # by convention, use 2 as OOV word
        # reserve 'index_from' (=3 by default) characters:
        # 0 (padding), 1 (start), 2 (OOV)
        if oov_char is not None:
            xs = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs]
        else:
            xs = [[w for w in x if skip_top <= w < num_words] for x in xs]
    
        idx = len(x_train)
        x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
        x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
        return (x_train, y_train), (x_test, y_test)
    
    # set parameters:
    max_features = 5000
    maxlen = 400
    batch_size = 32
    embedding_dims = 50
    filters = 250
    kernel_size = 3
    hidden_dims = 250
    epochs = 5
    
    print('Loading data...')
    (x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    
    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    
    print('Build model...')
    model = Sequential()
    
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(max_features,
                        embedding_dims,
                        input_length=maxlen))
    model.add(Dropout(0.2))
    
    # we add a Convolution1D, which will learn filters
    # word group filters of size filter_length:
    model.add(Conv1D(filters,
                     kernel_size,
                     padding='valid',
                     activation='relu',
                     strides=1))
    # we use max pooling:
    model.add(GlobalMaxPooling1D())
    
    # We add a vanilla hidden layer:
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    
    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test))
    

    Results:

    Train on 25000 samples, validate on 25000 samples
    Epoch 1/5
    2018-06-06 16:10:34.733973: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
    25000/25000 [==============================] - 117s 5ms/step - loss: 0.4044 - acc: 0.8007 - val_loss: 0.3212 - val_acc: 0.8600
    Epoch 2/5
    25000/25000 [==============================] - 121s 5ms/step - loss: 0.2323 - acc: 0.9057 - val_loss: 0.2903 - val_acc: 0.8801
    Epoch 3/5
    25000/25000 [==============================] - 124s 5ms/step - loss: 0.1640 - acc: 0.9377 - val_loss: 0.2720 - val_acc: 0.8900
    Epoch 4/5
    25000/25000 [==============================] - 116s 5ms/step - loss: 0.1136 - acc: 0.9579 - val_loss: 0.3353 - val_acc: 0.8811
    Epoch 5/5
    25000/25000 [==============================] - 114s 5ms/step - loss: 0.0764 - acc: 0.9726 - val_loss: 0.3958 - val_acc: 0.8793

    As the logs show, the CNN is indeed roughly 10× slower per epoch (about 115-125 s versus roughly 10 s for the fastText model on the same CPU), in exchange for a slightly higher validation accuracy (about 0.89 versus 0.88).
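
    To measure the per-epoch gap directly instead of reading it off the progress bar, a small timing callback can be attached to either model's `fit()` call. A minimal sketch (my own helper, not from the original post) that uses only the standard Keras 2.x `Callback` hooks:

    import time
    from keras.callbacks import Callback

    class EpochTimer(Callback):
        """Records wall-clock seconds for every training epoch."""
        def on_train_begin(self, logs=None):
            self.epoch_times = []

        def on_epoch_begin(self, epoch, logs=None):
            self._start = time.time()

        def on_epoch_end(self, epoch, logs=None):
            self.epoch_times.append(time.time() - self._start)

    # Usage: pass an instance to fit() of either model, then compare the lists.
    # timer = EpochTimer()
    # model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
    #           validation_data=(x_test, y_test), callbacks=[timer])
    # print(timer.epoch_times)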

  • Original post: https://www.cnblogs.com/bonelee/p/9145776.html