1.词向量训练demo
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
import jieba
import tqdm

# Destination file for the trained word vectors (saved via ``model.wv.save``).
word2vec_path = './resources/word2vec.model'


def word_vector_gener():
    """Train a Word2Vec model on jieba-segmented queries and save its vectors.

    Reads ``DATA_PATH`` line by line, takes the last space-separated field of
    each stripped line as the query, segments it with ``jieba.lcut`` and
    trains a Word2Vec model on the resulting token lists. Only the word
    vectors (``model.wv``) are written to ``word2vec_path``. The vector of
    '老师' is printed afterwards as a quick check.

    :return: None
    :raises KeyError: if '老师' does not occur in the training data.
    """
    # Source corpus: one record per line, the query is the last field.
    DATA_PATH = './data/seo_search_word_copy.txt'

    sentences = []
    with open(DATA_PATH, 'r', encoding='utf8') as file:
        # readlines() is kept so tqdm knows the total and can show progress.
        for each_line in tqdm.tqdm(file.readlines()):
            query = each_line.strip().split(' ')[-1]
            # Segment the query into words; each query is one "sentence".
            sentences.append(jieba.lcut(query))

    # Train the model (sg=1 selects skip-gram).
    # NOTE(review): ``size`` is the gensim 3.x keyword; gensim >= 4 renamed it
    # to ``vector_size`` -- adjust if the installed gensim is upgraded.
    model = Word2Vec(sentences, sg=1, size=10, window=2, min_count=1,
                     negative=1, sample=0.001, workers=4)

    # Persist only the word vectors rather than the full model.
    model.wv.save(word2vec_path)

    # Look the word up through ``model.wv``: indexing the Word2Vec model
    # directly is deprecated in gensim 3.x and removed in gensim 4.
    print(model.wv['老师'])


if __name__ == '__main__':
    word_vector_gener()
2.词向量加载demo(用这种方式获取词向量速度最快)
# KeyedVectors must be imported for this snippet to run; nothing else in the
# visible file brings it into scope.
from gensim.models import KeyedVectors

# File holding the word vectors previously saved with ``model.wv.save``.
word2vec_path = './resources/word2vec.model'

# mmap='r' asks gensim to memory-map the stored arrays read-only instead of
# reading them fully into memory.
wv = KeyedVectors.load(word2vec_path, mmap='r')

# Look up the vector of a single word.
vector = wv['主管']

# Retrieve the 30 words most similar to '主管' with their similarity scores.
word = wv.most_similar(['主管'], topn=30)
print(word)
输出:
[('组长', 0.8488447070121765), ('经理', 0.8272342085838318), ('总监', 0.816636323928833), ('副经理', 0.8071938753128052), ('部长', 0.8019827604293823), ('专员', 0.7792257070541382), ('高级专员', 0.7695066332817078), ('主任', 0.7676611542701721), ('负责人', 0.761403501033783), ('部副', 0.7570186853408813), ('及', 0.7355248928070068), ('业务主管', 0.732032299041748), ('岗', 0.7316986322402954), ('副总', 0.7278518676757812), ('科长', 0.72648024559021), ('兼', 0.7262977957725525), ('助理', 0.7255839705467224), ('资深', 0.7252861261367798), ('组', 0.7167786955833435), ('储干', 0.7150581479072571), ('班长', 0.7146369218826294), ('职员', 0.7104721665382385), ('实习生', 0.707991898059845), ('支持', 0.7070707082748413), ('高级', 0.7055947184562683), ('管理人员', 0.7054109573364258), ('初级', 0.7042156457901001), ('副理', 0.7038965821266174), ('小组长', 0.7035383582115173), ('技术主管', 0.7024495601654053)]