第二部分是基于属性词典的评论分类,分类原则是:只要评论中出现了某个属性词典中的词语,该评论就被分到对应的属性类中去。
"""Dictionary-based classification of review texts by product attribute.

A review is assigned to an attribute class as soon as one of its feature
words (nouns found by LTP part-of-speech tagging) occurs in that attribute's
dictionary file.  A review may therefore belong to several classes.
"""
import os
import re
import time

import jieba
import numpy as np
import pandas as pd
import pyltp
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


class TextClass():
    """Segment reviews, extract feature words and group reviews by attribute."""

    # Resource locations.  Windows paths are raw strings so that sequences
    # such as "\f" are not interpreted as escape characters, and they carry no
    # trailing separator because os.path.join() is used to build file names.
    STOPWORD_PATH = 'D:/论文文件/学习文件/情感分析/dict词典/哈工大stopword .txt'
    LTP_DATA_DIR = r'D:\LTP\ltp_data_v3.4.0'
    DICT_DIR = r'D:\论文文件\阅读论文\写论文准备\字典构建\手机属性词典\dictionary_0_3'

    # (result key, dictionary file name) pairs, in the order used for output.
    ATTRIBUTE_FILES = (
        ('camera', '相机.txt'),
        ('processor', '处理器.txt'),
        ('price', '价格.txt'),
        ('performance', '性能.txt'),
        ('endurance', '续航.txt'),
        ('appearance', '外观.txt'),
        ('serve', '售后.txt'),
    )

    # Part-of-speech tags whose words are kept as feature words.
    FEATURE_TAGS = frozenset(('n', 'nz', 'j'))

    # ASCII letters and digits are stripped from a review before segmentation.
    _ALNUM_RE = re.compile(r'[a-zA-Z0-9]+')

    def __init__(self):
        self.stopWord = []
        self._stopword_set = None   # set view of stopWord, built on first use
        self._segmentor = None      # pyltp.Segmentor, loaded on first use
        self._postagger = None      # pyltp.Postagger, loaded on first use

    def _load_stopwords(self):
        """Read the stop-word file once and return the stop words as a set."""
        if self._stopword_set is None:
            with open(self.STOPWORD_PATH, 'r', encoding='utf-8') as fr:
                self.stopWord.extend(word.strip() for word in fr)
            self._stopword_set = set(self.stopWord)
        return self._stopword_set

    def _load_models(self):
        """Load the LTP segmentation and POS models once and return them."""
        if self._segmentor is None:
            segmentor = pyltp.Segmentor()
            segmentor.load(os.path.join(self.LTP_DATA_DIR, 'cws.model'))
            self._segmentor = segmentor
        if self._postagger is None:
            postagger = pyltp.Postagger()
            postagger.load(os.path.join(self.LTP_DATA_DIR, 'pos.model'))
            self._postagger = postagger
        return self._segmentor, self._postagger

    def release(self):
        """Release the LTP models held by this instance, if any are loaded."""
        if self._segmentor is not None:
            self._segmentor.release()
            self._segmentor = None
        if self._postagger is not None:
            self._postagger.release()
            self._postagger = None

    def seg_pos(self, sent):
        """Segment *sent* and tag each remaining word with its part of speech.

        ASCII letters and digits are removed first, then the text is
        segmented, stop words are dropped and the rest is POS-tagged.

        The stop-word list and the LTP models are loaded on the first call
        and reused afterwards; call release() to free the models.

        Returns:
            A list of ``(word, tag)`` tuples.
        """
        stop_words = self._load_stopwords()
        segmentor, postagger = self._load_models()
        line = self._ALNUM_RE.sub('', sent)
        words = [w for w in segmentor.segment(line) if w not in stop_words]
        if not words:
            return []
        return list(zip(words, postagger.postag(words)))

    def feature_extraction(self, pos):
        """Return the words of *pos* whose tag is one of FEATURE_TAGS.

        Args:
            pos: iterable of ``(word, tag)`` pairs as returned by seg_pos().
        """
        return [item[0] for item in pos if item[1] in self.FEATURE_TAGS]

    def openFile(self, path):
        """Yield every line of the UTF-8 text file *path*, stripped."""
        with open(path, 'r', encoding='utf-8') as f:
            for word in f:
                yield word.strip()

    def text2feature(self, text_list):
        """Convert each review in *text_list* into its list of feature words.

        A review without any feature word is represented by ``['none']`` so
        that it can be recognised later on.
        """
        feature_list = []
        for line in text_list:
            features = self.feature_extraction(self.seg_pos(line))
            feature_list.append(features or ['none'])
        return feature_list

    def classify(self, feature_list, dict_dir=None):
        """Group review indices by the attribute dictionaries they match.

        Args:
            feature_list: one list of feature words per review, as returned
                by text2feature().
            dict_dir: directory holding the attribute dictionary files;
                defaults to DICT_DIR.

        Returns:
            A dict mapping each attribute key (plus ``'none'``) to the set of
            indices into *feature_list* of the reviews in that class.
        """
        if dict_dir is None:
            dict_dir = self.DICT_DIR
        # Sets give constant-time membership tests for the dictionary words.
        vocabularies = {
            name: set(self.openFile(os.path.join(dict_dir, file_name)))
            for name, file_name in self.ATTRIBUTE_FILES
        }
        result = {name: set() for name in vocabularies}
        result['none'] = set()
        for index, features in enumerate(feature_list):
            words = set(features)
            for name, vocabulary in vocabularies.items():
                if not vocabulary.isdisjoint(words):
                    result[name].add(index)
            if 'none' in words:
                result['none'].add(index)
        # TODO: reviews whose feature words occur in none of the seven
        # dictionaries are currently not placed in any class.  The author's
        # first idea is to build one dictionary containing all features and
        # put reviews that match nothing into the "no attribute" set.
        return result

    def classify_save(self, index_dict, abs_path, ori_file):
        """Split *ori_file* according to *index_dict* and save one CSV per class.

        Args:
            index_dict: classification result mapping a class name to a
                collection of row positions.
            abs_path: directory in which the CSV files are written.
            ori_file: DataFrame holding the reviews that were classified.

        Returns nothing; each class is written to ``<abs_path>/<name>.csv``
        with GBK encoding and without the index column.
        """
        keys = index_dict.keys()
        print(keys)
        for name in keys:
            # The indices are positions in the review list, hence .iloc
            # (DataFrame.ix no longer exists in current pandas).  Sorting
            # keeps the rows in their original order.
            rows = ori_file.iloc[sorted(index_dict[name])]
            rows.to_csv(os.path.join(abs_path, name + '.csv'),
                        encoding='gbk', index=False)

    def all(self, pre_data_path, save_abs_path):
        """Run the whole pipeline on one CSV file of reviews.

        Args:
            pre_data_path: GBK-encoded CSV file with a ``comment`` column.
            save_abs_path: directory that receives the per-class CSV files.

        Prints the share of all reviews that falls into each class.
        """
        comment_file = pd.read_csv(pre_data_path, sep=',', encoding='GBK')
        # Missing comments become empty strings so that positions stay
        # aligned with the rows of comment_file.
        comments = comment_file['comment'].fillna('').astype(str).tolist()
        try:
            text_feature = self.text2feature(comments)
        finally:
            self.release()
        result = self.classify(text_feature)
        self.classify_save(result, save_abs_path, comment_file)
        total = len(comments)
        for name, indices in result.items():
            ratio = len(indices) / total if total else 0.0
            print(name + '的评论比例:', ratio)


if __name__ == '__main__':
    s = time.time()
    # Classify the reviews of a prediction data set by attribute.
    # Xiaomi prediction data
    path_xiaomi = 'D:/machinelearning data/crawlerData/xiaomi6X_pre_JD100.csv'
    # Huawei prediction data
    path_huawei = 'D:/machinelearning data/crawlerData/huaweiP20_pre_JD100.csv'
    abs_path_xiaomi = r'D:\machinelearning data\crawlerData\cluster_data\feature_phone_xiaomi'
    abs_path_huawei = r'D:\machinelearning data\crawlerData\cluster_data\feature_phone_huawei'
    demo = TextClass()
    # To process the Xiaomi data instead: demo.all(path_xiaomi, abs_path_xiaomi)
    demo.all(path_huawei, abs_path_huawei)
    e = time.time()
    print('耗时:', e - s)