import re

import numpy as np

# One maximal run of CJK Unified Ideographs (U+4E00..U+9FFF).  The backslashes
# matter: without them the class is a set of ASCII characters and never
# matches Chinese text.
_CHINESE_RUN = re.compile(r'[\u4e00-\u9fff]+')


def file_do(filename):
    """Read a labelled corpus file and return its word lists and labels.

    The file is read as bytes, decoded as UTF-8 and processed line by line.
    From every line only the runs of Chinese characters are kept (ASCII
    letters, digits and punctuation are dropped).  The first run of a line is
    taken as the label and the remaining runs as the words of that sample.
    Lines without any Chinese characters are skipped.

    Args:
        filename: Path of the corpus file.

    Returns:
        A tuple ``(words, labels)`` where ``words[k]`` is the list of words of
        the k-th kept line and ``labels[k]`` is its label.
    """
    with open(filename, 'rb') as f:
        data = f.read().decode('utf-8')

    words = []
    labels = []
    for line in data.splitlines():
        runs = _CHINESE_RUN.findall(line)
        if runs:
            labels.append(runs[0])
            words.append(runs[1:])
    print('内words:', words)
    print('内labels:', labels)
    return words, labels


# filename = 'C:/Users/cuit/Desktop/文本测试数据/training-1000.txt'
# words, labels = file_do(filename)


def createvacablist(words):
    """Build the vocabulary of the training samples.

    Args:
        words: Iterable of word lists (one list per sample).

    Returns:
        A list holding every distinct word once.  The order is that of set
        iteration and therefore unspecified.
    """
    return list(set().union(*words))


# vacablist = createvacablist(words)
# print(vacablist.index('愤怒'))


def setdata(vacablist, inputdata):
    """Turn a word list into a count vector aligned with the vocabulary.

    Args:
        vacablist: Vocabulary list; position k of the result belongs to
            ``vacablist[k]``.
        inputdata: Words of one sample.

    Returns:
        A float ``numpy`` array of ``len(vacablist)`` entries where each entry
        is the number of times the corresponding word occurs in
        ``inputdata``.  Words missing from the vocabulary are reported with a
        ``print`` and otherwise ignored.
    """
    # Map word -> first position once instead of scanning the list per word.
    position = {}
    for pos, word in enumerate(vacablist):
        position.setdefault(word, pos)

    result = np.zeros(len(vacablist))
    for word in inputdata:
        pos = position.get(word)
        if pos is None:
            print('词汇表里没有 :', word)
        else:
            result[pos] += 1
    return result


# setdata(vacablist, words[0])


def P1(words, labels, vacablist):
    """Train the classifier: per-word weights for both classes.

    A sample counts as positive when its label equals ``'好评'``; every other
    label is treated as the negative class.

    Args:
        words: List of word lists, one per training sample.
        labels: Labels aligned with ``words``.
        vacablist: Vocabulary list used to index the result vectors.

    Returns:
        A tuple ``(data_1, data_0)`` of float arrays of ``len(vacablist)``
        entries.  ``data_1[k]`` is the smoothed relative frequency of word k
        among positive samples multiplied by the positive prior; ``data_0[k]``
        is the same for the negative class with prior ``1 - positive prior``.
        Note that the prior is folded into every single entry.

    Raises:
        ZeroDivisionError: If ``labels`` is empty.
    """
    # Prior probability of the positive class.
    labels_1count = sum(1 for label in labels if label == '好评')
    plabels_1 = labels_1count / len(labels)
    print('plabels_1:', plabels_1)

    # Express every training sample as a count vector over the vocabulary.
    setwords = [setdata(vacablist, sample) for sample in words]

    # Smoothing: word counts start at 1 and the totals start at 2, so no
    # word ends up with a zero weight.
    count_1 = 2
    count_0 = 2
    data_1 = np.ones(len(vacablist))
    data_0 = np.ones(len(vacablist))
    for vector, label in zip(setwords, labels):
        if label == '好评':
            data_1 = data_1 + vector
            count_1 += vector.sum()
        else:
            data_0 = data_0 + vector
            count_0 += vector.sum()
    print('count_1:{},count_0:{}'.format(count_1, count_0))

    data_1 = (data_1 / count_1) * plabels_1
    data_0 = (data_0 / count_0) * (1 - plabels_1)
    print('data_1:', data_1)
    print('data_0:', data_0)
    return data_1, data_0


# data_1, data_0 = P1(words, labels, vacablist)
# print('data_1:',data_1)
# print('data_0:',data_0)


def classfy(data_1, data_0, vacablist, test_data):
    """Classify one sample as positive or negative.

    Only the vocabulary words that occur in ``test_data`` take part.  The
    per-word weights of those words are combined for each class and the class
    with the larger combined weight wins.

    The combination is done as a sum of logarithms rather than a product:
    multiplying many small weights underflows to 0.0 for long samples, which
    would make both classes compare equal.  The occurrence count of a word
    would scale both sides by the same positive factor, so it does not affect
    the comparison and is left out.

    Args:
        data_1: Per-word weights of the positive class (non-negative),
            aligned with ``vacablist``.
        data_0: Per-word weights of the negative class (non-negative),
            aligned with ``vacablist``.
        vacablist: Vocabulary list.
        test_data: Words of the sample to classify.

    Returns:
        ``'好评'`` if the positive class scores higher, ``'差评'`` if the
        negative class scores higher, and ``-1`` on a tie (this includes a
        sample that contains no vocabulary word).
    """
    present = setdata(vacablist, test_data) != 0

    # A weight of exactly 0 maps to -inf, which still compares correctly.
    with np.errstate(divide='ignore'):
        log_p1 = np.log(np.asarray(data_1, dtype=float)[present]).sum()
        log_p0 = np.log(np.asarray(data_0, dtype=float)[present]).sum()

    if log_p1 > log_p0:
        return '好评'
    if log_p1 < log_p0:
        return '差评'
    return -1


# mydata = ['这个','酒店','马马虎虎']
# classfy(data_1,data_0,vacablist,mydata)


def test(train_filename='C:/Users/cuit/Desktop/文本测试数据/training-1000.txt',
         test_filename='C:/Users/cuit/Desktop/文本测试数据/test-1000.txt'):
    """Train on one corpus file, classify another and print the accuracy.

    Args:
        train_filename: Path of the labelled training corpus.
        test_filename: Path of the labelled test corpus.
    """
    # Training data.
    words, labels = file_do(train_filename)
    print('len(words):', len(words))
    vacablist = createvacablist(words)
    data_1, data_0 = P1(words, labels, vacablist)

    # Test data.
    print()
    test_words, test_labels = file_do(test_filename)

    # A prediction is '好评', '差评' or -1; -1 never equals a label.
    yes_count = sum(
        1
        for sample, label in zip(test_words, test_labels)
        if classfy(data_1, data_0, vacablist, sample) == label
    )
    total = len(test_labels)
    # Guard against an empty test set instead of dividing by zero.
    corect = yes_count / total if total else 0.0
    print('正确了{}个,总共有{}个,准确率为{}'.format(yes_count, total, corect))


test()