from pandas import read_csv
import numpy as np
from sklearn.datasets.base import Bunch   # on newer scikit-learn versions Bunch lives in sklearn.utils
import pickle   # persistence (cPickle-style pickle module)
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
import xlwt
import operator   # used for sorting
from sklearn import metrics

Straindata = []
Strainlabel = []
Sart_train = []
Stestdata = []
Stestlabel = []
Sart_test = []
Slast = []
Snew = []


class obj:
    def __init__(self):
        self.key = 0
        self.weight = 0.0


def importSmallContentdata(file, data, art, label, f):
    dataset = read_csv(file)
    Sdata = dataset.values[:, :]
    print(type(Sdata))

    if f == 1:
        # Keep the four small-class label columns (14-17) of every record for later comparison
        for line in Sdata:
            ls = []
            ls.append(line[14])
            ls.append(line[15])
            ls.append(line[16])
            ls.append(line[17])
            Slast.append(ls)
        #print(len(Slast))
        #print("Reference small-class data is ready")

    '''Every non-zero small-class column of a record becomes one sample in data/label'''
    for smalli in range(14, 18):
        #print(smalli)
        count = 0
        for line in Sdata:
            count = count + 1
            if line[smalli] != '0' and line[smalli] != 0:
                k = 1
                ls = []
                for i in line:
                    if k == 1:
                        art.append(i)   # line[0] is the free-text description
                        k = k + 1
                        continue
                    if k == 11:   # stop after the numeric features; k is 1-based while line is 0-indexed
                        break
                    ls.append(float(i))
                    k = k + 1
                data.append(ls)
                label.append(line[smalli])
                if f == 1:
                    Snew.append(count)   # remember the 1-based record number of each test sample
    #print("why do these overflow", len(Snew))


def getKvector(train_set, vec, n):
    # For every document, collect its non-zero TF-IDF entries as (key, weight) pairs,
    # sort them by weight, keep only the keys, and pad each list with -1 up to n entries.
    nonzero = train_set.tdm.nonzero()
    k = 0
    lis = []
    gather = []
    p = -1
    for i in nonzero[0]:
        p = p + 1
        if k == i:
            a = obj()
            a.key = nonzero[1][p]
            a.weight = train_set.tdm[i, nonzero[1][p]]
            lis.append(a)
        else:
            lis.sort(key=lambda o: o.weight, reverse=True)   # sort the objects in the list by weight
            gather.append(lis)
            while k < i:
                k = k + 1
            lis = []
            a = obj()
            a.key = nonzero[1][p]
            a.weight = train_set.tdm[i, nonzero[1][p]]
            lis.append(a)
    gather.append(lis)   # gather holds, per record's description, its features sorted by weight (key and weight together)

    # Only the keys are needed from here on, not the weights
    sj = 1
    for i in gather:
        ls = []
        for j in i:
            sj = sj + 1
            ls.append(float(j.key))
        while sj <= n:
            sj = sj + 1
            ls.append(-1)
        sj = 1
        vec.append(ls)


'''Read the stop-word file'''
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content


'''Read a bunch object'''
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch


'''Write a bunch object'''
def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)


def buildtrainbunch(bunch_path, art_train, trainlabel):
    bunch = Bunch(label=[], contents=[])
    for item1 in trainlabel:
        bunch.label.append(item1)

    #trainContentdatasave=[]   # would hold every segmented token of the training data
    for item2 in art_train:
        item2 = str(item2)
        item2 = item2.replace(" ", "")   # strip whitespace before segmentation
        content_seg = jieba.cut(item2)
        save2 = ''
        for item3 in content_seg:
            if len(item3) > 1 and item3 != ' ':
                #trainContentdatasave.append(item3)
                save2 = save2 + "," + item3
        bunch.contents.append(save2)
    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the training-data text object!")


def buildtestbunch(bunch_path, art_test, testlabel):
    bunch = Bunch(label=[], contents=[])
    for item1 in testlabel:
        bunch.label.append(item1)

    #testContentdatasave=[]   # would hold every segmented token of the test data
    for item2 in art_test:
        item2 = str(item2)
        item2 = item2.replace(" ", "")   # strip whitespace before segmentation
        content_seg = jieba.cut(item2)
        save2 = ''
        for item3 in content_seg:
            if len(item3) > 1 and item3 != ' ':
                #testContentdatasave.append(item3)
                save2 = save2 + "," + item3
        bunch.contents.append(save2)
    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the test-data text object!")


def vector_space(stopword_path, bunch_path, space_path):
    stpwrdlst = _readfile(stopword_path).splitlines()   # read the stop words
    bunch = _readbunchobj(bunch_path)   # load the segmented-text bunch object

    # Build the TF-IDF vector space object.
    # tdm is a 2-D weight matrix: tdm[i][j] is the TF-IDF value of word j (its index in the vocabulary) in document i.
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})

    # Initialize the vector space model with TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,
                                 min_df=0.0001, use_idf=True, max_features=15000)
    #print(vectorizer)

    # Turn the texts into a term-weight matrix and keep the vocabulary separately
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_

    # Persist the bag-of-words space
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")


def testvector_space(stopword_path, bunch_path, space_path, train_tfidf_path):
    stpwrdlst = _readfile(stopword_path).splitlines()   # turn the stop words into a list
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})

    # Load the training set's TF-IDF vector space so the test set reuses its vocabulary
    trainbunch = _readbunchobj(train_tfidf_path)
    tfidfspace.vocabulary = trainbunch.vocabulary

    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.7,
                                 vocabulary=trainbunch.vocabulary, min_df=0.001)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")


if __name__ == "__main__":

    '''============================ Load the data ============================'''
    file_train = 'F:/goverment/exceloperating/all_tocai_train.csv'
    file_test = 'F:/goverment/exceloperating/all_tocai_test.csv'

    importSmallContentdata(file_train, Straindata, Sart_train, Strainlabel, 0)
    importSmallContentdata(file_test, Stestdata, Sart_test, Stestlabel, 1)
    #print("Stestlabel", len(Stestlabel))
    #print("Small-class data loaded")
    #print("Large-class labels loaded")   # 1329*4 in total

    '''==================== TF-IDF text feature extraction ===================='''
    # Paths for the segmented-text bunch objects and the TF-IDF spaces
    train_bunch_path = "F:/goverment/exceloperating/trainbunch.bat"   # bunch save path
    train_space_path = "F:/goverment/exceloperating/traintfdifspace.dat"
    test_bunch_path = "F:/goverment/exceloperating/testbunch.bat"
    test_space_path = "F:/goverment/exceloperating/testtfdifspace.dat"
    stopword_path = "F:/goverment/exceloperating/hlt_stop_words.txt"

    '''============ TF-IDF feature extraction on Sart (the text descriptions) ============'''
    buildtrainbunch(train_bunch_path, Sart_train, Strainlabel)
    buildtestbunch(test_bunch_path, Sart_test, Stestlabel)

    vector_space(stopword_path, train_bunch_path, train_space_path)
    testvector_space(stopword_path, test_bunch_path, test_space_path, train_space_path)
    train_set = _readbunchobj(train_space_path)
    test_set = _readbunchobj(test_space_path)

    '''Training data'''
    S_vec_train = []
    getKvector(train_set, S_vec_train, 76)

    '''Test data'''
    S_vec_test = []
    getKvector(test_set, S_vec_test, 76)

    '''========== Merge the extracted text-feature keys with the earlier numeric features =========='''
    '''Small-class training data'''
    S_vec_train = np.array(S_vec_train)
    #print(type(S_vec_train))
    #print(S_vec_train.shape)
    Straindata = np.array(Straindata)
    #print(type(Straindata))
    #print(Straindata.shape)
    Straindata = np.hstack((S_vec_train, Straindata))
    #print(Straindata)

    '''Small-class test data'''
    S_vec_test = np.array(S_vec_test)
    Stestdata = np.array(Stestdata)
    Stestdata = np.hstack((S_vec_test, Stestdata))

    print("Small-class classification accuracy")
    Strainlabel = np.array(Strainlabel)

    from xgboost import XGBClassifier
    clf = XGBClassifier(learning_rate=0.1,
                        n_estimators=1150,
                        max_depth=2,
                        min_child_weight=1,
                        gamma=0,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        objective='binary:logistic',
                        nthread=4,            # has no effect here
                        scale_pos_weight=1,   # has no effect here
                        seed=27)

    clf.fit(Straindata, Strainlabel)
    predict = clf.predict(Stestdata)
    aa = metrics.accuracy_score(Stestlabel, predict)
    print(aa)   # about 40.09% in the author's run

    '''==================== Output the technical issues and their probabilities ===================='''
    class attri:
        def __init__(self):
            self.key = 0
            self.weight = 0.0
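    # --- Added sketch (not part of the original script): the loop below extracts the four
    # highest-probability classes per test record by repeatedly zeroing out the running
    # maximum. A shorter, equivalent numpy formulation is shown here for comparison; it
    # assumes clf is the fitted XGBClassifier from above and that clf.classes_ maps
    # predict_proba column indices back to the original label values.
    proba_sketch = clf.predict_proba(Stestdata)
    top4_idx = np.argsort(proba_sketch, axis=1)[:, ::-1][:, :4]        # column indices of the 4 largest probabilities
    top4_labels = clf.classes_[top4_idx]                               # original label values for those columns
    top4_proba = np.take_along_axis(proba_sketch, top4_idx, axis=1)    # the probabilities themselves
    # The column index is 0-based and refers to clf.classes_, which answers the
    # "does it start from 1?" question raised in the loop below.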
    '''==================== Small classes ===================='''
    attribute_proba = clf.predict_proba(Stestdata)

    label = []
    for i in attribute_proba:
        lis = []
        k = 0
        while k < 4:   # take the four most probable classes for this sample
            k = k + 1
            p = 1
            mm = 0
            sj = -1
            for j in i:
                sj = sj + 1
                if j > mm:
                    mm = j
                    p = sj
            i[p] = 0   # zero out the current maximum so the next pass finds the next-largest; p is a 0-based column index
            a = attri()
            a.key = p
            a.weight = mm
            lis.append(a)
            #lis.append(p)
        label.append(lis)

    # Next, attach the record numbers in Snew to label, then sort and deduplicate so the result can be compared with Slast
    #print("why do these overflow", len(Snew))
    print("label", len(label))
    count = 0
    for lis in label:
        lis.append(Snew[count])
        count = count + 1
    print("Merge done, ready to deduplicate!")   # here label and Snew both have length 1439 (author's data)

    bol = np.zeros(len(label) + 1)
    Snew = []
    for lis in label:
        if bol[lis[4]] == 0:   # lis[4] is the record number; keep only the first prediction per record
            Snew.append(lis)
            bol[lis[4]] = 1
    #print(len(Snew))   # 1162 after deduplication (author's data)

    # Records that never produced a prediction get a placeholder entry (key=0, weight=1)
    for i in range(len(Slast) + 1):
        if i == 0:
            continue
        if bol[i] == 0:
            ls = []
            a = attri()
            a.weight = 1
            a.key = 0
            ls.append(a)
            ls.append(a)
            ls.append(a)
            ls.append(a)
            ls.append(i)
            Snew.append(ls)
    #print("Snew", len(Snew))   # 1329 (author's data)
    print("Deduplication done, ready to sort!")
    Snew.sort(key=operator.itemgetter(4))
    print("Sorted, ready to compare!")

    # Write the top-4 keys and their probabilities (in percent) to an Excel sheet,
    # two rows per record: keys on the first row, probabilities on the second.
    myexcel = xlwt.Workbook()
    sheet = myexcel.add_sheet('sheet')
    si = -2
    sj = -1
    #cys = 1
    #print(Snew)
    for i in Snew:
        si = si + 2
        #print(si)
        #print("For record %d:" % cys)
        #cys = cys + 1
        for j in range(len(i)):
            if j == len(i) - 1:   # skip the record number stored in the last slot
                continue
            sj = sj + 1
            #sheet.write(si, sj, str(j))
            sheet.write(si, sj, str(i[j].key))
            sheet.write(si + 1, sj, str(i[j].weight * 100))
            #print("Probability of technical issue %d: %.2f %%" % (i[j].key, i[j].weight * 100))
        sj = -1
    myexcel.save("Snew.xls")
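    # --- Added sketch (not part of the original script): the comments above say the sorted,
    # deduplicated Snew is "ready to compare" with Slast, but the comparison itself is never
    # performed. A minimal version might look like the following. Assumptions (mine, not the
    # author's): entry[4] is the 1-based record number, Slast[record - 1] holds that record's
    # four raw small-class labels, the predicted .key values are predict_proba column indices
    # that map to labels via clf.classes_, and the str() conversion matches how the labels are
    # stored in the CSV.
    hits = 0
    for entry in Snew:
        record = entry[4]                                   # 1-based record number appended earlier
        truth = {str(v) for v in Slast[record - 1] if v not in (0, '0')}
        predicted = {str(clf.classes_[a.key]) for a in entry[:4]}
        if truth & predicted:                               # at least one true label among the top 4
            hits += 1
    print("records with at least one top-4 hit:", hits, "/", len(Snew))
    # Note: records without a prediction were padded above with key=0/weight=1 placeholders,
    # so they can register spurious hits; filter them out if that matters.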