import math

import numpy as np


def loaddata():
    """Load the built-in toy corpus used to demonstrate the classifier.

    Returns:
        A ``(postingList, classVec)`` tuple: ``postingList`` is a list of
        tokenized documents (lists of words) and ``classVec`` holds one label
        per document, where 1 means insulting and 0 means not insulting.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = insulting, 0 = not insulting
    return postingList, classVec


def createSet(dataset):
    """Build the vocabulary: every distinct word found in *dataset*.

    Args:
        dataset: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list of the unique words. The order is whatever the underlying set
        yields, so it is not guaranteed to be stable between runs.
    """
    vocabulary = set()
    for document in dataset:
        vocabulary.update(document)
    return list(vocabulary)


def setofword(vacablist, inputdata):
    """Convert a document into a 0/1 vector aligned with the vocabulary.

    Args:
        vacablist: Vocabulary list (as produced by ``createSet``).
        inputdata: Iterable of words making up one document.

    Returns:
        A list of ``len(vacablist)`` ints in which position ``k`` is 1 when
        ``vacablist[k]`` occurs in *inputdata* and 0 otherwise. Words that
        are not in the vocabulary are reported on stdout and skipped.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list twice (membership test + .index) for every input word.
    position_of = {}
    for position, vocab_word in enumerate(vacablist):
        position_of.setdefault(vocab_word, position)

    mylist = [0] * len(vacablist)
    for word in inputdata:
        position = position_of.get(word)
        if position is not None:
            mylist[position] = 1
        else:
            print('没有 {} 这个词'.format(word))
    return mylist


def P1(trainmat, labels):
    """Train the classifier: estimate per-class word probabilities.

    Word counts start at 1 and the per-class totals start at 2, so a word
    never seen in a class still receives a non-zero probability.

    Args:
        trainmat: Sequence of 0/1 word vectors (see ``setofword``), one per
            training document, all of the same length.
        labels: Sequence of class labels (0 or 1), one per row of *trainmat*.
            Rows whose label is neither 0 nor 1 are ignored.

    Returns:
        A ``(data_0, data_1, plable_1)`` tuple: ``data_0`` and ``data_1`` are
        numpy arrays with the smoothed per-word probabilities for class 0 and
        class 1, and ``plable_1`` is the fraction of documents labelled 1.

    Raises:
        ZeroDivisionError: If *labels* is empty.
        IndexError: If *trainmat* is empty while *labels* is not.
    """
    plable_1 = sum(labels) / len(labels)
    vocab_size = len(trainmat[0])

    data_0 = np.ones(vocab_size)
    count_0 = 2
    data_1 = np.ones(vocab_size)
    count_1 = 2

    for i, label in enumerate(labels):
        if label == 0:
            data_0 += trainmat[i]
            count_0 += sum(trainmat[i])
        if label == 1:
            data_1 += trainmat[i]
            count_1 += sum(trainmat[i])

    data_0 = data_0 / count_0
    data_1 = data_1 / count_1

    print('data_0:{},count:{}'.format(data_0, count_0))
    print('data_1:{},count:{}'.format(data_1, count_1))
    print('plabel_1:', plable_1)
    return data_0, data_1, plable_1


def _safe_log(value):
    """Return ``log(value)``, or negative infinity when *value* is not positive."""
    return math.log(value) if value > 0 else float('-inf')


def classfy(testset, data_0, data_1, plabel_1):
    """Classify a document vector using the trained probabilities.

    The decision is made on summed log-probabilities. Multiplying many small
    probabilities underflows to 0.0 for long documents, which would make the
    two class scores indistinguishable and always yield class 0.

    Args:
        testset: 0/1 word vector of the document to classify.
        data_0: Per-word probabilities for class 0 (see ``P1``).
        data_1: Per-word probabilities for class 1 (see ``P1``).
        plabel_1: Prior probability of class 1.

    Returns:
        1 when class 1 scores strictly higher than class 0, otherwise 0.
    """
    print('开始classfy')
    # The plain products are kept only for the diagnostic print below; they
    # may underflow to 0.0 and are not used for the decision.
    p1 = 1
    p0 = 1
    log_p1 = 0.0
    log_p0 = 0.0
    for i, present in enumerate(testset):
        if present == 1:
            p1 = p1 * data_1[i]
            p0 = p0 * data_0[i]
            log_p1 += _safe_log(data_1[i])
            log_p0 += _safe_log(data_0[i])
    p1 = p1 * plabel_1
    p0 = p0 * (1 - plabel_1)
    log_p1 += _safe_log(plabel_1)
    log_p0 += _safe_log(1 - plabel_1)
    print('p1:{},p0:{}'.format(p1, p0))

    if log_p1 > log_p0:
        print('该分类为1')
        return 1
    print('该分类为0')
    return 0


def test():
    """Run the end-to-end demo: train on the toy corpus, classify one sample."""
    dataSet, labels = loaddata()
    vacablist = createSet(dataSet)
    # The training function expects every document as a vocabulary vector.
    trainmat = [setofword(vacablist, document) for document in dataSet]
    data_0, data_1, plable_1 = P1(trainmat, labels)
    testlist = ['my', 'love', 'stupid']
    testdata = setofword(vacablist, testlist)
    classfy(testdata, data_0, data_1, plable_1)


test()