def cut_word(text):
    """Segment Chinese *text* with jieba and return the tokens space-joined.

    scikit-learn's vectorizers tokenize documents on whitespace, so joining
    the jieba tokens with spaces makes Chinese text compatible with them.
    """
    # jieba.cut returns a generator; str.join consumes it directly,
    # no intermediate list() needed.
    return " ".join(jieba.cut(text))


def _demo_corpus():
    """Return the shared sample corpus used by both feature-extraction demos."""
    # NOTE: adjacent string literals concatenate, so this list holds ONE document.
    return [
        "10艘中俄军舰穿过津轻海峡,这一举措合乎国际法,无可指摘,却引起日本国内“异样反应”。"
        "19日,日本内阁官房副长官矶崎仁彦称,日方对此“高度关注”,"
        "“将对我国周边海空域进行警戒和监视,采取万全的应对姿态”。"
    ]


def count_chinese_dome():
    """Demo: bag-of-words (count) feature extraction on segmented Chinese text."""
    # Segment every document so CountVectorizer can split on whitespace.
    data_new = [cut_word(sent) for sent in _demo_corpus()]
    # 1. Instantiate the transformer (empty stop-word list).
    transfer = CountVectorizer(stop_words=[])
    # 2. Fit the vocabulary and transform to a sparse term-count matrix.
    data_new_2 = transfer.fit_transform(data_new)
    print(data_new_2.toarray())
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the supported replacement.
    print(transfer.get_feature_names_out())
    print(data_new_2)


def tfidf_demo():
    """Demo: TF-IDF feature extraction on segmented Chinese text."""
    # Segment every document so TfidfVectorizer can split on whitespace.
    data_new = [cut_word(sent) for sent in _demo_corpus()]
    # 1. Instantiate the transformer (empty stop-word list).
    transfer = TfidfVectorizer(stop_words=[])
    # 2. Fit the vocabulary/IDF weights and transform to a sparse TF-IDF matrix.
    data_new_2 = transfer.fit_transform(data_new)
    print(data_new_2.toarray())
    # Replacement for the removed get_feature_names() (scikit-learn >= 1.2).
    print(transfer.get_feature_names_out())