• 吴裕雄--天生自然神经网络与深度学习实战Python+Keras+TensorFlow:使用神经网络实现新闻话题分类


    import pandas as pd
    # Load the news dataset; the file holds one JSON record per line.
    df = pd.read_json('/Users/chenyi/Documents/News_Category_Dataset.json', lines=True)
    df.head()

    # Report the number of distinct categories and the row count of each.
    categories = df.groupby('category')
    print("total categories: ", categories.ngroups)
    print(categories.size())

    # Fold the "THE WORLDPOST" label into "WORLDPOST" so both are counted as
    # a single category, then report the category statistics again.
    df.category = df.category.replace("THE WORLDPOST", "WORLDPOST")
    categories = df.groupby('category')
    print("total categories: ", categories.ngroups)
    print(categories.size())

    from keras.preprocessing import sequence
    from keras.preprocessing.text import Tokenizer, text_to_word_sequence, one_hot
    # The text to classify is the headline followed by the short description.
    df['text'] = df.headline + " " + df.short_description

    # Assign an integer id to every word and encode each text as a list of ids.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df.text)
    X = tokenizer.texts_to_sequences(df.text)
    df['words'] = X
    # Record the number of words in each entry.
    df['word_length'] = df.words.apply(len)
    # Drop entries with fewer than 5 words. Take an explicit copy so that
    # columns added to the filtered frame later on are written to an
    # independent DataFrame instead of a slice of the original (which would
    # trigger pandas' SettingWithCopyWarning).
    df = df[df.word_length >= 5].copy()
    df.word_length.describe()

    def word2Frequent(sequences):
        """Count how often each word occurs across all sequences.

        Args:
            sequences: An iterable of iterables of hashable words
                (here: lists of integer word ids).

        Returns:
            A plain ``dict`` mapping each word to its total number of
            occurrences. Keys appear in first-seen order.
        """
        # Local imports keep this block self-contained.
        from collections import Counter
        from itertools import chain

        # Flatten all sequences into one stream of words and let Counter tally
        # them; convert back to ``dict`` so the return type is unchanged.
        return dict(Counter(chain.from_iterable(sequences)))
    word_index = word2Frequent(df.words)

    # Number of most-frequent words to keep.
    count = 10000
    # Sort the words by frequency in DESCENDING order, as (word, frequency)
    # pairs; the sort is stable, so ties keep their first-seen order.
    s = sorted(word_index.items(), key=lambda item: item[1], reverse=True)
    print(s[0])
    # Map each of the `count` most frequent words to a position: the most
    # frequent word gets count - 1, the next one count - 2, and so on.
    # Slicing (instead of indexing 0..count-1) avoids an IndexError when the
    # vocabulary holds fewer than `count` distinct words.
    frequent_to_index = {
        word: count - 1 - rank for rank, (word, _) in enumerate(s[:count])
    }

    # Number the categories: build label -> id and id -> label lookup tables.
    categories = df.groupby('category').size().index.tolist()
    category_int = {label: idx for idx, label in enumerate(categories)}
    int_category = {idx: label for idx, label in enumerate(categories)}

    # Store the integer id of each row's category in a new column.
    df['c2id'] = df['category'].apply(lambda label: category_int[label])
    import numpy as np
    import keras.utils as utils
    from sklearn.model_selection import train_test_split
    import numpy as np
    
    def vectorize_sequences(sequences, dimension=10000):
        """Encode word sequences as fixed-length 0/1 vectors.

        Args:
            sequences: An indexable collection of word sequences.
            dimension: Length of every output vector.

        Returns:
            A float array of shape ``(len(sequences), dimension)``. Entry
            ``[i, p]`` is 1.0 when sequence ``i`` contains a word that the
            module-level ``frequent_to_index`` maps to position ``p``; words
            missing from that mapping are ignored.
        """
        n_rows = len(sequences)
        results = np.zeros((n_rows, dimension))
        for row in range(n_rows):
            for word in sequences[row]:
                pos = frequent_to_index.get(word)
                if pos is None:
                    # Word is not among the kept frequent words.
                    continue
                results[row, pos] = 1.0
        return results
    
    # Turn every row's word-id list into a fixed-length 0/1 vector.
    X = vectorize_sequences(np.array(df.words))
    print(X[0])
    # One-hot encode the integer category ids.
    Y = utils.to_categorical(df.c2id.tolist())

    # Split the data in two: 80% for training, 20% held out for validation.
    seed = 29
    x_train, x_val, y_train, y_val = train_test_split(
        X, Y, test_size=0.2, random_state=seed
    )

    from keras import models
    from keras import layers
    
    # Fully connected classifier: two 64-unit hidden layers on top of the
    # 10000-dimensional input vectors.
    model = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=(10000,)),
        layers.Dense(64, activation='relu'),
        # The output is a probability for each of several classes, so use a
        # softmax activation with one unit per category.
        layers.Dense(len(int_category), activation='softmax'),
    ])

    # categorical_crossentropy is the loss used for multi-class output.
    model.compile(
        loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )
    # Train for 20 epochs, evaluating on the held-out split after each one.
    history = model.fit(
        x_train,
        y_train,
        batch_size=512,
        epochs=20,
        validation_data=(x_val, y_val),
    )

    import matplotlib.pyplot as plt
    # Keras releases before 2.3 log the accuracy metric under 'acc'; later
    # releases use the name passed to compile() ('accuracy'). Accept both.
    metric_key = 'acc' if 'acc' in history.history else 'accuracy'
    acc = history.history[metric_key]
    val_acc = history.history['val_' + metric_key]

    # One x-axis tick per recorded epoch, numbered from 1. (The original used
    # len(loss), but no variable named `loss` is defined.)
    epochs = range(1, len(acc) + 1)

    # Dots: training accuracy; solid line: validation accuracy.
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.xlabel('Epochs')
    plt.ylabel('acc')
    plt.legend()
    plt.show()

    from keras import models
    from keras import layers
    
    # Same architecture as the previous model, except that the second hidden
    # layer has only 4 units instead of 64.
    model = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=(10000,)),
        layers.Dense(4, activation='relu'),
        # The output is a probability for each of several classes, so use a
        # softmax activation with one unit per category.
        layers.Dense(len(int_category), activation='softmax'),
    ])

    # categorical_crossentropy is the loss used for multi-class output.
    model.compile(
        loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )
    # Train without validation data this time.
    history = model.fit(x_train, y_train, batch_size=512, epochs=20)

    # Measure loss and accuracy on the held-out split.
    results = model.evaluate(x_val, y_val)
    print(results)

  • 相关阅读:
    linux 学习笔记 groupadd创建组
    linux学习笔记 4建立用户
    Linux学习笔记 3 权限篇
    Linux学习笔记 1 环境变量 2 vi命令
    指针 以及取地址
    练习题
    weblogic domain creation
    hibernate log4j 输出sql
    练习九 组函数应用
    练习八 spool导出
  • 原文地址:https://www.cnblogs.com/tszr/p/12232595.html
Copyright © 2020-2023  润新知