import os
import jieba
import random

"""
Function: Chinese text preprocessing.
Parameters:
    folder_path - path where the text corpus is stored
    test_size - fraction of the data used for the test set (default 0.2)
Returns:
    all_words_list - training-set vocabulary sorted by descending word frequency
    train_data_list - training set
    test_data_list - test set
    train_class_list - training-set labels
    test_class_list - test-set labels
"""
def TextProcessing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)                    # list the subfolders under folder_path (one per class)
    data_list = []                                           # tokenized documents
    class_list = []                                          # class label of each document

    # Walk through every subfolder (class)
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)  # path of the subfolder
        files = os.listdir(new_folder_path)                  # txt files inside the subfolder

        j = 1
        # Read every txt file
        for file in files:
            if j > 100:                                      # keep at most 100 samples per class
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f:
                raw = f.read()

            word_cut = jieba.cut(raw, cut_all=False)         # precise mode; returns a generator
            word_list = list(word_cut)                       # convert the generator to a list

            data_list.append(word_list)
            class_list.append(folder)
            j += 1

    data_class_list = list(zip(data_list, class_list))       # pair each document with its label
    random.shuffle(data_class_list)                          # shuffle the (document, label) pairs
    index = int(len(data_class_list) * test_size) + 1        # split point between test set and training set
    train_list = data_class_list[index:]                     # training set
    test_list = data_class_list[:index]                      # test set
    train_data_list, train_class_list = zip(*train_list)     # unzip the training set
    test_data_list, test_class_list = zip(*test_list)        # unzip the test set

    all_words_dict = {}                                      # word-frequency statistics over the training set
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict:
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    # Sort the vocabulary by descending frequency
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list, all_words_nums = zip(*all_words_tuple_list)
    all_words_list = list(all_words_list)
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

if __name__ == '__main__':
    # Text preprocessing
    folder_path = './SogouC/Sample'                          # location of the corpus
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(
        folder_path, test_size=0.2)
    print(all_words_list)
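# ------------------------------------------------------------------
# Aside (a sketch, not part of the original code): the word-frequency
# counting and descending sort inside TextProcessing() could equivalently
# be written with collections.Counter; count_words below is a hypothetical
# helper shown only for illustration.
# ------------------------------------------------------------------
from collections import Counter

def count_words(train_data_list):
    counter = Counter()
    for word_list in train_data_list:
        counter.update(word_list)                            # add the counts of one document
    return [word for word, _ in counter.most_common()]       # words sorted by descending frequency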
"""
Function: read the contents of a file and de-duplicate them.
Parameters:
    words_file - path of the file
Returns:
    words_set - set of the words read from the file
"""
def MakeWordsSet(words_file):
    words_set = set()                                        # result set
    with open(words_file, 'r', encoding='utf-8') as f:       # open the file
        for line in f.readlines():                           # read it line by line
            word = line.strip()                              # strip the newline
            if len(word) > 0:                                # skip empty lines
                words_set.add(word)
    return words_set

"""
Function: text feature selection.
Parameters:
    all_words_list - training-set vocabulary sorted by frequency
    deleteN - number of most frequent words to discard
    stopwords_set - set of stopwords to exclude
Returns:
    feature_words - list of feature words
"""
def words_dict(all_words_list, deleteN, stopwords_set=set()):
    feature_words = []                                       # feature list
    n = 1
    for t in range(deleteN, len(all_words_list), 1):
        if n > 1000:                                         # keep at most 1000 feature words
            break
        # A word becomes a feature if it is not a number, not a stopword,
        # and its length is greater than 1 and smaller than 5.
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(
                all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
        n += 1
    return feature_words

if __name__ == '__main__':
    # Text preprocessing
    folder_path = './SogouC/Sample'                          # location of the corpus
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(
        folder_path, test_size=0.2)

    # Build stopwords_set
    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)
    feature_words = words_dict(all_words_list, 100, stopwords_set)
    print(feature_words)

'''
Split all documents into a training set and a test set, count the word
frequencies over the training set and sort them in descending order.
Then build a naive Bayes classifier with scikit-learn.
scikit-learn provides three naive Bayes classifier classes:
GaussianNB, MultinomialNB and BernoulliNB.
GaussianNB assumes Gaussian-distributed features,
MultinomialNB assumes multinomially distributed features,
and BernoulliNB assumes Bernoulli-distributed (binary) features.
'''
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt

"""
Function: vectorize the documents according to feature_words.
Parameters:
    train_data_list - training set
    test_data_list - test set
    feature_words - list of feature words
Returns:
    train_feature_list - vectorized training set
    test_feature_list - vectorized test set
"""
def TextFeatures(train_data_list, test_data_list, feature_words):
    def text_features(text, feature_words):                  # 1 if the feature word occurs in the document, else 0
        text_words = set(text)
        features = [1 if word in text_words else 0 for word in feature_words]
        return features
    train_feature_list = [text_features(text, feature_words) for text in train_data_list]
    test_feature_list = [text_features(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list

"""
Function: news classifier.
Parameters:
    train_feature_list - vectorized training documents
    test_feature_list - vectorized test documents
    train_class_list - training-set labels
    test_class_list - test-set labels
Returns:
    test_accuracy - accuracy of the classifier on the test set
"""
def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
    classifier = MultinomialNB().fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    return test_accuracy
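# ------------------------------------------------------------------
# Aside (a sketch, not part of the original tutorial): the comment block
# above lists three naive Bayes variants. Because TextFeatures() produces
# 0/1 presence features, BernoulliNB is also a reasonable choice and can be
# swapped in for MultinomialNB with one line; TextClassifierBernoulli is a
# hypothetical helper shown only for comparison.
# ------------------------------------------------------------------
from sklearn.naive_bayes import BernoulliNB

def TextClassifierBernoulli(train_feature_list, test_feature_list, train_class_list, test_class_list):
    classifier = BernoulliNB().fit(train_feature_list, train_class_list)  # Bernoulli likelihood over binary features
    return classifier.score(test_feature_list, test_class_list)           # accuracy on the test set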
if __name__ == '__main__':
    # Text preprocessing
    folder_path = './SogouC/Sample'                          # location of the corpus
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(
        folder_path, test_size=0.2)

    # Build stopwords_set
    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)

    test_accuracy_list = []
    deleteNs = range(0, 1000, 20)                            # 0, 20, 40, ..., 980
    for deleteN in deleteNs:
        feature_words = words_dict(all_words_list, deleteN, stopwords_set)
        train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
        test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
        test_accuracy_list.append(test_accuracy)

    plt.figure()
    plt.plot(deleteNs, test_accuracy_list)
    plt.title('Relationship of deleteNs and test_accuracy')
    plt.xlabel('deleteNs')
    plt.ylabel('test_accuracy')
    plt.show()

if __name__ == '__main__':
    # Text preprocessing
    folder_path = './SogouC/Sample'                          # location of the corpus
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(
        folder_path, test_size=0.2)

    # Build stopwords_set
    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)

    test_accuracy_list = []
    feature_words = words_dict(all_words_list, 450, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    test_accuracy_list.append(test_accuracy)
    ave = lambda c: sum(c) / len(c)
    print(ave(test_accuracy_list))                           # average accuracy with deleteN = 450
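# ------------------------------------------------------------------
# Aside (a sketch, not part of the original tutorial): the manual 0/1
# encoding done by TextFeatures() can also be expressed with scikit-learn's
# CountVectorizer. The block below assumes the train_data_list,
# test_data_list, feature_words, train_class_list and test_class_list
# variables produced by the main block above.
# ------------------------------------------------------------------
if __name__ == '__main__':
    from sklearn.feature_extraction.text import CountVectorizer

    # The documents are already tokenized (lists of words), so an identity
    # analyzer is passed; binary=True gives the same presence/absence
    # features as text_features(), restricted to the feature_words vocabulary.
    vectorizer = CountVectorizer(vocabulary=feature_words, binary=True,
                                 analyzer=lambda doc: doc)
    train_matrix = vectorizer.transform(train_data_list)
    test_matrix = vectorizer.transform(test_data_list)
    print(TextClassifier(train_matrix, test_matrix, train_class_list, test_class_list))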