根据语料训练词向量。word2vec 有两种训练模式:CBOW 和 skip-gram(下面的代码通过 sg=1 使用 skip-gram):
# -*- coding:utf-8 -*-
"""Train word2vec embeddings over a directory of pre-tokenised text files.

Each corpus file is read line by line and every line is split on whitespace
into tokens. The first file creates the model; each later file reloads the
saved model and trains it further. A few example queries are printed at the
end.
"""
import os


class MySentences(object):
    """Stream the tokenised lines of one corpus file.

    Despite its name, ``dirname`` is the path of a single text file. The
    object can be iterated any number of times; each pass re-reads the file,
    so the whole corpus never has to be held in memory.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        """Yield each line of the file as a list of whitespace-split tokens."""
        # The ``with`` block guarantees the handle is closed when iteration
        # finishes or is abandoned; the bare ``open()`` used before leaked
        # one file handle per pass over the corpus.
        # NOTE: the file is opened with the platform default encoding.
        with open(self.dirname) as corpus:
            for line in corpus:
                yield line.split()


def main():
    """Train (or continue training) the model on every file in ``data_dir``.

    Raises:
        ValueError: if ``data_dir`` contains no files, so no model exists.
    """
    # Imported here rather than at module level so that MySentences can be
    # imported and reused without gensim being installed.
    from gensim.models import word2vec

    data_dir = ''
    model_path = ''

    model = None
    for index, data_path in enumerate(os.listdir(data_dir)):
        print('index: %s   %s' % (index, data_path))
        # os.path.join inserts the separator that plain string concatenation
        # silently dropped when data_dir had no trailing slash.
        sentences = MySentences(os.path.join(data_dir, data_path))  # a memory-friendly iterator
        if index == 0:
            # NOTE(review): ``size=`` is the pre-4.0 gensim keyword; confirm
            # against the installed gensim version.
            model = word2vec.Word2Vec(sentences, size=50, min_count=5, sg=1)
        else:
            # Continue training the model saved by the previous iteration.
            model = word2vec.Word2Vec.load(model_path)
            # NOTE(review): no build_vocab(..., update=True) is done here, so
            # presumably only words already seen in the first file are
            # trained; newer gensim versions also require total_examples and
            # epochs arguments -- confirm.
            model.train(sentences)
        print('load success')
        model.save(model_path)
        print('save success')

    if model is None:
        # Without this guard an empty directory ended in an opaque NameError
        # at the first query below.
        raise ValueError('no corpus files found in %r' % (data_dir,))

    # Print the vector of one word.
    print(model['安踏'])

    # Check the similarity of word pairs.
    print(model.similarity('直播', '电商'))
    print(model.similarity('淘宝', '电商'))

    # Print the most similar words.
    for i in model.most_similar(positive=['微博'], topn=10):
        print('%s %s' % (i[0], i[1]))

    # Print the "opposite" words (the query word is used as a negative example).
    for i in model.most_similar(negative=['微博'], topn=3):
        print('%s %s' % (i[0], i[1]))

    # Find the word that fits least with the others.
    print(model.doesnt_match(['马云', '京东', '阿里', '小米', '百度', '美团']))


if __name__ == '__main__':
    main()
载入词向量:
def load_word_vec_model(word_vec_path='word2vec.txt'):
    """Load word vectors stored in the textual word2vec format.

    Args:
        word_vec_path: path of the text-format vector file; defaults to
            ``'word2vec.txt'``, the path that was previously hard-coded.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    # Function-scope import: nothing visible in this file brings
    # KeyedVectors into scope, so the call below would otherwise raise
    # NameError.
    from gensim.models import KeyedVectors

    # binary=False selects the plain-text variant of the format.
    return KeyedVectors.load_word2vec_format(word_vec_path, binary=False)
txt内容格式:
89299 50
广义 6.7723665 14.601548 20.063915 13.727134 -11.497403 -9.687737 -13.661188 13.636487 12.514348 -11.927621 9.849327 3.869883 -4.835537 21.264105 -0.27862522 -1.8299553 -6.370595 16.223785 -8.902656 -6.1665072 14.767804 -13.545085 -0.26700944 18.797802 3.4140692 -23.615307 5.3606462 -9.613785 -14.123712 -8.143979 -2.0690963 2.955524 4.1582117 0.92726874 -4.3396864 -10.7997 -2.9653497 -11.553318 -3.0220852 20.548243 -5.2833705 25.26876 -6.0394297 -1.6494333 3.4560573 12.670779 -13.85315 -8.514223 18.071764 -7.490371
钟爱 17.984484 -4.8768287 16.716238 16.658224 -27.738024 -25.891703 -19.179977 6.6909623 37.56464 -13.521651 13.267926 10.216028 -0.19054835 35.493042 29.336407 18.562439 -7.4809074 17.904173 -13.844719 3.022259 14.995911 -22.58654 -9.87084 15.710427 -14.876169 14.388888 -14.6048155 -5.1577635 -5.2825193 -10.078579 -5.086235 -22.363726 2.9529414 0.7049978 -10.118969 -22.133059 27.744198 -22.186438 -3.2051985 37.520164 10.439255 20.471209 -23.874033 -35.268066 -4.6956215 32.274727 24.359287 -8.854247 1.094503 -25.306633
说明:第一行 `89299 50` 依次表示词条数目和向量维度;其后每一行是一个词及其对应的 50 维向量,各项以空格分隔。