• sklearn——CountVectorizer详解


    关于sklearn——CountVectorizer的一篇详细讲解

    https://blog.csdn.net/weixin_38278334/article/details/82320307

    使用Keras设计全连接层进行文本分类

     # Demo: a small fully connected network for binary text classification
     # (toxic / non-toxic comments) on bag-of-words features.
     import re

     import nltk
     import numpy as np
     import pandas as pd
     from nltk.corpus import stopwords
     from sklearn.feature_extraction.text import CountVectorizer
     from sklearn.model_selection import train_test_split
     from tensorflow.keras.layers import Dense
     from tensorflow.keras.models import Sequential

     df = pd.read_csv('train_comment_small_50.csv', sep=',')


     def clean_comment(text):
         """Return ``text`` with HTML-like tags replaced by a space and double quotes removed."""
         # Inside [...] a leading ^ negates the character class; outside brackets
         # it anchors the match to the start of the string.
         text = re.sub(r'<[^<]+?>', ' ', text)
         # '\"' and '"' are the same one-character string, so a single replace suffices.
         return text.replace('"', '')


     df['cleaned_comment'] = df['comment_text'].apply(clean_comment)
     X_train, X_test, y_train, y_test = train_test_split(
         df['cleaned_comment'], df['toxic'], test_size=0.2)

     # stopwords.words() raises LookupError until the corpus has been downloaded once.
     try:
         english_stopwords = stopwords.words('english')
     except LookupError:
         nltk.download('stopwords')
         english_stopwords = stopwords.words('english')

     vectorizer = CountVectorizer(binary=True, stop_words=english_stopwords, lowercase=True,
                                  min_df=3, max_df=0.9, max_features=5000)
     X_train_onehot = vectorizer.fit_transform(X_train)
     np.set_printoptions(threshold=np.inf)
     print(X_train_onehot)

     # Hand Keras dense NumPy arrays rather than a SciPy sparse matrix / pandas Series,
     # whose support differs between Keras versions.
     X_train_dense = X_train_onehot.toarray()
     y_train_values = y_train.to_numpy()

     # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
     print(vectorizer.get_feature_names_out())

     nn = Sequential()
     # The input width is the vocabulary size, i.e. the number of columns of the matrix.
     nn.add(Dense(units=500, activation='relu', input_dim=X_train_dense.shape[1]))
     nn.add(Dense(units=1, activation='sigmoid'))
     nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
     nn.summary()

     # Hold out the last n_val training samples for validation. The training and
     # validation slices must not overlap, otherwise the validation score is
     # measured on data the model has already been fitted on.
     n_val = 20
     nn.fit(X_train_dense[:-n_val], y_train_values[:-n_val],
            epochs=5, batch_size=128, verbose=1,
            validation_data=(X_train_dense[-n_val:], y_train_values[-n_val:]))

     # evaluate() returns [loss, accuracy] because of metrics=['accuracy'] above.
     scores = nn.evaluate(vectorizer.transform(X_test).toarray(), y_test.to_numpy(), verbose=1)
     print('accuracy:', scores[1])
     # '.h5' is an extension Keras recognises for single-file HDF5 saving.
     nn.save('nn.h5')

    使用CNN对文本进行分类

    # A simple CNN applied to Reuters newswire topic classification.

    import numpy as np
    from tensorflow.keras import layers
    from tensorflow.keras.datasets import reuters
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.utils import to_categorical

    batch_size = 32
    epochs = 12
    maxlen = 10000
    embedding_dim = 128
    num_filters = 64
    kernel_size = 5

    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

    word_index = reuters.get_word_index(path="return_word_index.json")
    print(np.array(word_index))
    # Labels are integer class ids starting at 0, so the class count is max + 1.
    num_classes = int(np.max(y_train)) + 1
    # Reverse lookup (index -> word), handy for decoding a sample back to text.
    # The per-entry print was dropped: the whole mapping is already printed above.
    index_to_word = {index: word for word, index in word_index.items()}

    # Turn each sequence of word indices into a fixed-width binary vector of
    # length maxlen (1 where the word index occurs in the sample).
    tokenizer = Tokenizer(num_words=maxlen)
    x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
    print(x_train)
    x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
    # Convert integer class labels to one-hot vectors.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    model = Sequential()
    # NOTE(review): the inputs are 0/1 bag-of-words vectors, so this Embedding only
    # ever looks up rows 0 and 1. A sequence model would normally be fed padded
    # word-index sequences instead - confirm this is the intended design.
    model.add(layers.Embedding(512, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    # add() needs a layer instance; passing the class itself raises a TypeError.
    model.add(layers.GlobalAveragePooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                        verbose=1, validation_split=0.1)
    # evaluate() returns [loss, accuracy] because of metrics=['accuracy'] above.
    score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
    print("loss=", score[0])
    print("accuracy=", score[1])
  • 相关阅读:
    gcd
    Kuglarz
    三分题解
    杜教筛
    第一组dp解题报告
    dp总结1
    cf-BitwiseXor
    6.6总结
    图论总结
    CF1309总结
  • 原文地址:https://www.cnblogs.com/henuliulei/p/13742269.html
Copyright © 2020-2023  润新知