# -*- coding: utf-8 -*-
"""A small multinomial naive-Bayes text classifier with a toy data set.

Trains on lists of tokenised documents: builds a vocabulary, per-document
term weights (``tf``), per-class word-probability rows (``tdm`` = P(x|yi))
and class priors (``Pcates`` = P(yi)), then classifies a mapped test vector
by the largest P(x|yi) * P(yi) score.
"""

import sys
import os
import pickle

import numpy as np

# sklearn is only needed by the (currently disabled) metrics_result helper,
# so treat it as an optional dependency instead of a hard import.
try:
    from sklearn import metrics
except ImportError:
    metrics = None


def loadDataSet():
    """Return a toy corpus (token lists) and labels (1 = abusive, 0 = not)."""
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him', 'my'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec


def readfile(path):
    """Read and return the raw bytes of the file at *path*."""
    # context manager guarantees the handle is closed even on error
    with open(path, "rb") as fp:
        return fp.read()


"""

#计算分类精度:
def metrics_result(actual,predict):
    print('精度:{0:.3f}'.format(metrics.precision_score(actual,predict)))
    print ('召回:{0:0.3f}'.format(metrics.recall_score(actual,predict)))
    print ('f1-score:{0:.3f}'.format(metrics.f1_score(actual,predict)))

"""


def readbunchobj(path):
    """Unpickle and return the Bunch object stored at *path*.

    NOTE: pickle.load must only be used on trusted files — unpickling
    untrusted data can execute arbitrary code.
    """
    with open(path, "rb") as file_obj:
        return pickle.load(file_obj)


def writebunchobj(path, bunchobj):
    """Pickle *bunchobj* to the file at *path*."""
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)


class NBayes(object):
    """Multinomial naive-Bayes classifier over a bag-of-words model."""

    def __init__(self):
        self.vocabulary = []  # list of unique words in the training set
        self.idf = 0          # 1 x vocablen document-frequency / idf vector
        self.tf = 0           # doclength x vocablen per-document term weights
        self.tdm = 0          # P(x|yi): one row of word probabilities per class
        self.Pcates = {}      # P(yi): class label -> prior probability
        self.labels = []      # class label of each training document
        self.doclength = 0    # number of training documents
        self.vocablen = 0     # vocabulary size
        self.testset = 0      # mapped 1 x vocablen test vector

    def train_set(self, trainset, classVec):
        """Train on *trainset* (list of token lists) with labels *classVec*.

        Labels must be integers 0..k-1 — they are used directly as row
        indices into the per-class matrix in build_tdm / predict.
        """
        self.cate_prob(classVec)  # class priors P(yi)
        self.doclength = len(trainset)
        # vocabulary = every distinct word across all documents
        self.vocabulary = list({word for doc in trainset for word in doc})
        self.vocablen = len(self.vocabulary)
        self.calc_wordfreq(trainset)
        # self.calc_tfidf(trainset)  # swap in for tf-idf weights instead of raw counts
        self.build_tdm()  # per-class conditionals P(x|yi)

    def calc_tfidf(self, trainset):
        """Build tf-idf weights: sets self.tf (weights) and self.idf."""
        self.idf = np.zeros([1, self.vocablen])
        self.tf = np.zeros([self.doclength, self.vocablen])
        for indx in range(self.doclength):
            for word in trainset[indx]:
                self.tf[indx, self.vocabulary.index(word)] += 1
            # normalise by document length to remove length bias
            self.tf[indx] = self.tf[indx] / float(len(trainset[indx]))
            # document frequency: each distinct word counts once per document
            for singleword in set(trainset[indx]):
                self.idf[0, self.vocabulary.index(singleword)] += 1
        self.idf = np.log(float(self.doclength) / self.idf)
        self.tf = np.multiply(self.tf, self.idf)  # elementwise tf * idf (broadcast over rows)

    def calc_wordfreq(self, trainset):
        """Build raw term counts (self.tf) and document frequencies (self.idf)."""
        self.idf = np.zeros([1, self.vocablen])              # 1 x vocabulary size
        self.tf = np.zeros([self.doclength, self.vocablen])  # documents x vocabulary size
        for indx in range(self.doclength):
            for word in trainset[indx]:
                self.tf[indx, self.vocabulary.index(word)] += 1
            for singleword in set(trainset[indx]):
                self.idf[0, self.vocabulary.index(singleword)] += 1

    def cate_prob(self, classVec):
        """Compute the prior P(yi) of every class appearing in *classVec*."""
        self.labels = classVec
        total = float(len(self.labels))
        for labeltemp in set(self.labels):
            # fraction of documents carrying this label
            self.Pcates[labeltemp] = float(self.labels.count(labeltemp)) / total

    def build_tdm(self):
        """Accumulate per-class word weights and normalise into P(x|yi)."""
        self.tdm = np.zeros([len(self.Pcates), self.vocablen])  # classes x vocab
        sumlist = np.zeros([len(self.Pcates), 1])               # per-class totals
        for indx in range(self.doclength):
            # add this document's weight vector into its class row
            self.tdm[self.labels[indx]] += self.tf[indx]
            sumlist[self.labels[indx]] = np.sum(self.tdm[self.labels[indx]])
        self.tdm = self.tdm / sumlist  # normalise each row to probabilities

    def map2vocab(self, testdata):
        """Map a token list onto the vocabulary as a 1 x vocablen count vector."""
        self.testset = np.zeros([1, self.vocablen])
        for word in testdata:
            self.testset[0, self.vocabulary.index(word)] += 1

    def predict(self, testset):
        """Return the class label maximising P(x|yi) * P(yi) for *testset*.

        Raises ValueError if the vector width does not match the vocabulary.
        (Previously this printed a message and called exit(0), silently
        terminating the whole program with a success status.)
        """
        if np.shape(testset)[1] != self.vocablen:
            raise ValueError("input width %d does not match vocabulary size %d"
                             % (np.shape(testset)[1], self.vocablen))
        predvalue = 0
        predclass = ""
        for keyclass in self.Pcates:
            # Index tdm by the label itself so rows and priors always pair up
            # (zip over the dict relied on iteration order matching row index).
            temp = np.sum(testset * self.tdm[keyclass] * self.Pcates[keyclass])
            if temp > predvalue:
                predvalue = temp
                predclass = keyclass
        return predclass
# Algorithm improvement: use tf-idf weighting instead of raw word counts (see calc_tfidf below).
# Improved weighting: tf-idf instead of raw counts.
def calc_tfidf(self, trainset):
    """Compute tf-idf weights over *trainset*, storing self.tf and self.idf."""
    self.idf = np.zeros([1, self.vocablen])
    self.tf = np.zeros([self.doclength, self.vocablen])
    for doc_idx in range(self.doclength):
        doc = trainset[doc_idx]
        # term frequency, then normalise by document length to remove length bias
        for term in doc:
            self.tf[doc_idx, self.vocabulary.index(term)] += 1
        self.tf[doc_idx] /= float(len(doc))
        # document frequency: each distinct term counts once per document
        for term in set(doc):
            self.idf[0, self.vocabulary.index(term)] += 1
    self.idf = np.log(float(self.doclength) / self.idf)
    # elementwise tf * idf; the 1 x vocablen idf row broadcasts over all documents
    self.tf = np.multiply(self.tf, self.idf)