git: https://github.com/linyi0604/MachineLearning
Word embedding technique Word2Vec: every contiguous span of words places some constraint on the words that can follow it, which is called its context. Learning from these contexts lets the model find semantic-level connections between sentences.
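Before the full script, a minimal sketch of what a context window actually is (the helper name, window size, and sample sentence are illustrative assumptions, not part of the training code below): skip-gram style training pairs each center word with its neighbors inside the window, and those pairs are the training signal.

def context_pairs(tokens, window=2):
    # Pair every center word with each neighbor within the window
    pairs = []
    for i, center in enumerate(tokens):
        for j in range(max(0, i - window), min(len(tokens), i + window + 1)):
            if j != i:
                pairs.append((center, tokens[j]))
    return pairs

print(context_pairs(["students", "attend", "college", "in", "madison"]))
# [('students', 'attend'), ('students', 'college'), ('attend', 'students'), ...]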
from sklearn.datasets import fetch_20newsgroups
from bs4 import BeautifulSoup
import nltk, re
from gensim.models import word2vec

# nltk.download('punkt')


'''
Word2Vec word embeddings:
every contiguous span of words constrains the words that can follow it,
known as its context.

The goal is to find semantic-level connections between sentences.
'''

# Download the 20 Newsgroups data (requires a network connection)
news = fetch_20newsgroups(subset="all")
x, y = news.data, news.target

# Load the Punkt sentence tokenizer once, rather than once per article
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

# Split one news article into sentences; return a list of token lists
def news_to_sentences(news):
    news_text = BeautifulSoup(news, "html.parser").get_text()
    raw_sentences = tokenizer.tokenize(news_text)
    sentences = []
    for sent in raw_sentences:
        # Keep letters only, lowercase, and split into tokens
        sentences.append(re.sub("[^a-zA-Z]", " ", sent.lower().strip()).split())
    return sentences

# Strip the sentences out of the long articles for training
sentences = []
for i in x:
    sentences += news_to_sentences(i)


# Dimensionality of the word vectors
num_features = 300
# Minimum frequency for a word to be kept in the vocabulary
min_word_count = 20
# Number of CPU cores to use for parallel training
num_workers = 2
# Context window size used when training the word vectors
context = 5
# Downsampling threshold for very frequent words
downsampling = 1e-3

# Train the word2vec model
# (vector_size is the gensim >= 4.0 name; in older gensim it was size)
model = word2vec.Word2Vec(sentences,
                          workers=num_workers,
                          vector_size=num_features,
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)

# In gensim < 4.0, model.init_sims(replace=True) marked the trained vectors
# as final and saved memory; in gensim >= 4.0 it is deprecated and unneeded.

# Use the trained model to find the ten words most related to "college"
# (model.most_similar was removed in gensim 4.0; query model.wv instead)
print(model.wv.most_similar("college"))
'''
[('wisconsin', 0.7664438486099243),
 ('osteopathic', 0.7474539279937744),
 ('madison', 0.7433826923370361),
 ('univ', 0.7296794652938843),
 ('melbourne', 0.7212647199630737),
 ('walla', 0.7068545818328857),
 ('maryland', 0.7038443088531494),
 ('carnegie', 0.7038302421569824),
 ('institute', 0.7003713846206665),
 ('informatics', 0.6968873143196106)]
'''
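Once training finishes, the vectors support a few more kinds of queries. A short sketch assuming the same model object as above (exact scores and neighbors vary from run to run, and both query words must have survived the min_count cutoff):

# Access the raw 300-dimensional vector for a word
vec = model.wv["college"]
print(vec.shape)    # (300,)

# Cosine similarity between two in-vocabulary words
print(model.wv.similarity("college", "university"))

# Multi-word query: words close to the combination of both terms
print(model.wv.most_similar(positive=["college", "engineering"], topn=5))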