关于sklearn——CountVectorizer的一篇详细讲解
https://blog.csdn.net/weixin_38278334/article/details/82320307
使用Keras设计全连接层进行文本分类
# Demo: a fully connected neural network for text classification on binary
# bag-of-words features (the `toxic` column of the CSV is the label).
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

# Upper bound on the number of training rows held out for validation.
VAL_SIZE = 100

df = pd.read_csv('train_comment_small_50.csv', sep=',')


def clean_comment(text):
    """Return *text* with HTML-like tags replaced by a space and double quotes removed."""
    # Inside [...], a leading ^ negates the character class; outside brackets
    # it would anchor the match to the start of the string.
    text = re.sub(r'<[^<]+?>', ' ', text)
    return text.replace('"', '')


df['cleaned_comment'] = df['comment_text'].apply(clean_comment)

X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_comment'], df['toxic'], test_size=0.2
)

# stopwords.words() requires the NLTK "stopwords" corpus to be installed
# (e.g. via nltk.download('stopwords')).
vectorizer = CountVectorizer(
    binary=True,
    stop_words=stopwords.words('english'),
    lowercase=True,
    min_df=3,
    max_df=0.9,
    max_features=5000,
)
X_train_onehot = vectorizer.fit_transform(X_train)

np.set_printoptions(threshold=np.inf)
print(X_train_onehot)

# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement. tolist() keeps the printed value a plain list of str.
feature_names = vectorizer.get_feature_names_out().tolist()
print(feature_names)

nn = Sequential()
nn.add(Dense(units=500, activation='relu', input_dim=len(feature_names)))
nn.add(Dense(units=1, activation='sigmoid'))
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
nn.summary()

# Hold out the tail of the training rows for validation. The training and
# validation slices must not overlap, otherwise the validation metrics are
# computed on rows the network has already been fitted on. The hold-out is
# capped at a fifth of the rows so that small datasets keep training data.
n_train_rows = X_train_onehot.shape[0]
n_val = min(VAL_SIZE, n_train_rows // 5)
split = n_train_rows - n_val
nn.fit(
    X_train_onehot[:split],
    y_train.iloc[:split],
    epochs=5,
    batch_size=128,
    verbose=1,
    validation_data=(X_train_onehot[split:], y_train.iloc[split:]),
)

scores = nn.evaluate(vectorizer.transform(X_test), y_test, verbose=1)
print('accuracy:', scores[1])

# NOTE(review): '.hd5' is not one of the extensions Keras documents for model
# files ('.keras', '.h5', '.hdf5') -- confirm the intended save format.
nn.save('nn.hd5')
使用CNN对文本进行分类
# Simple CNN applied to Reuters newswire topic classification.
import numpy as np
import keras
from keras.datasets import reuters
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

batch_size = 32
epochs = 12
maxlen = 10000
embedding_dim = 128
num_filters = 64
kernel_size = 5

(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

word_index = reuters.get_word_index(path="return_word_index.json")
print(np.array(word_index))
num_classes = max(y_train) + 1

# Reverse lookup (index -> word); every pair is printed while it is built.
index_to_word = {}
for key, value in word_index.items():
    print(key, value)
    index_to_word[value] = key

# Encode every sample as a binary vector with `maxlen` columns.
tokenizer = Tokenizer(num_words=maxlen)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
print(x_train)
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

# Convert the integer class labels to one-hot vectors.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

model = Sequential()
# NOTE(review): the inputs built above are 0/1 indicator vectors, so this
# Embedding only ever looks up indices 0 and 1 -- confirm whether padded word
# index sequences were intended as the model input instead.
model.add(layers.Embedding(512, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
# The layer has to be instantiated: Sequential.add() expects a layer instance,
# not the layer class itself.
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print("loss=", score[0])
print("accuracy=", score[1])