1. jieba.analyse.extract_tags(text) 的参数 text 必须是一个完整的字符串(不能是分词后的词语列表)
第一步:进行语料库的读取
第二步:进行分词操作
第三步:载入停用词,同时对分词后的语料库进行停用词的去除
第四步:选取一篇文本的分词列表,拼接成一个字符串,再使用 jieba.analyse.extract_tags 提取关键词
import pandas as pd
import numpy as np
import jieba
import jieba.analyse

# 1. Load the news corpus: one document per row, four unnamed columns in the
#    file that are labelled here.
df_data = pd.read_table('data/val.txt', names=['category', 'theme', 'URL', 'content'], encoding='utf-8')

# 2. Tokenize every document with jieba. The result is a list of token lists.
df_contents = df_data.content.values.tolist()
Jie_content = []
for df_content in df_contents:
    # Skip missing cells (pandas yields NaN floats, which jieba cannot cut)
    # and whitespace-only documents. The original compared the token *list*
    # against the string ' ', which is always unequal and filtered nothing.
    if not isinstance(df_content, str) or not df_content.strip():
        continue
    split_content = jieba.lcut(df_content)
    if len(split_content) > 1:
        Jie_content.append(split_content)

# 3. Load the stop-word list. quoting=3 (csv.QUOTE_NONE) keeps quote characters
#    as literal text; index_col=False stops pandas from using the first column
#    as the row index.
stopwords = pd.read_csv('stopwords.txt', sep=' ', quoting=3, names=['stopwords'], index_col=False, encoding='utf-8')
print(stopwords.head())


def drop_stops(Jie_content, stopwords):
    """Remove stop words from tokenized documents.

    Args:
        Jie_content: Iterable of documents, each an iterable of tokens.
        stopwords: Iterable of tokens to drop (hashable items, e.g. strings).

    Returns:
        A tuple ``(clean_content, all_words)``: ``clean_content`` holds one
        filtered token list per input document (order preserved), and
        ``all_words`` is every kept token from all documents in one flat list.
    """
    # Build the lookup set once: membership tests become O(1) instead of a
    # linear scan over the stop-word list for every single token.
    stop_set = set(stopwords)
    clean_content = []
    all_words = []
    for j_content in Jie_content:
        line_clean = [word for word in j_content if word not in stop_set]
        clean_content.append(line_clean)
        all_words.extend(line_clean)
    return clean_content, all_words


# Convert the stop-word DataFrame column into a plain list before filtering.
stopwords = stopwords.stopwords.values.tolist()
clean_content, all_words = drop_stops(Jie_content, stopwords)
print(clean_content[0])

# 4. Extract keywords with jieba. extract_tags expects a single string, so the
#    chosen document's tokens are joined back together first.
index = 2000
content_word = ''.join(clean_content[index])
content_text = ' '.join(jieba.analyse.extract_tags(content_word, topK=5, withWeight=False))
print(content_word)
print(content_text)