• gensim示例


    安装

    !pip install gensim
    

    训练

    from gensim.models import word2vec
    import logging

    # Main program: train a word2vec model on a corpus, then query word similarity.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # Text8Corpus requires the path to the corpus file — the original called it
    # with no argument, which raises TypeError. Replace with your own corpus.
    corpus_path = 'text8'
    sentences = word2vec.Text8Corpus(corpus_path)
    model = word2vec.Word2Vec(sentences, size=200)  # window defaults to 5

    # Similarity / relatedness between two words.
    y1 = model.similarity(u"不错", u"好")
    print(u"【不错】和【好】的相似度为:", y1)
    # Original had an unterminated string literal split across two lines.
    print("--------")
    

    加载词向量文件

    from gensim.models import KeyedVectors

    # Path to the text-format Tencent AI Lab Chinese embedding file.
    embedding_file = '/home/xuehp/data/Tencent_AILab_ChineseEmbedding.txt'
    # Load the pre-trained vectors (text format, hence binary=False).
    wv_from_text = KeyedVectors.load_word2vec_format(embedding_file, binary=False)
    # Replace the raw vectors with their L2-normalized form in place to save memory.
    wv_from_text.init_sims(replace=True)
    

    获取单词向量

    import numpy as np
    
    # 未知词、短语向量补齐
    # Vector synthesis for unknown words / phrases.
    def compute_ngrams(word, min_n, max_n):
        """Return the unique character n-grams of *word*.

        All substrings whose length lies in [min_n, min(len(word), max_n)]
        are collected; the result is a list in unspecified order (it comes
        from a set, as in the FastText n-gram scheme, minus the '<'/'>'
        boundary markers FastText attaches).
        """
        longest = min(len(word), max_n)
        grams = {word[start:start + size]
                 for size in range(min_n, longest + 1)
                 for start in range(len(word) - size + 1)}
        return list(grams)
    
    def _char_ngrams(word, min_n, max_n):
        # All unique character n-grams of `word` with lengths in [min_n, max_n].
        return list({word[i:i + n]
                     for n in range(min_n, min(len(word), max_n) + 1)
                     for i in range(len(word) - n + 1)})

    def wordVec(word, wv_from_text, min_n = 1, max_n = 3):
        """Return a vector for *word*, synthesizing one for OOV words.

        If the word is in the vocabulary its stored vector is returned
        directly.  Otherwise the vectors of its in-vocabulary character
        n-grams of length >= 2 are averaged; single characters are used
        only as a last resort, because single-character vectors are noisy
        for Chinese OOV words.

        Parameters
        ----------
        word : str
        wv_from_text : KeyedVectors-like model exposing ``wv.vocab``,
            ``wv.syn0`` and ``__getitem__`` (gensim 3.x API).
        min_n, max_n : int — n-gram length bounds.

        Raises
        ------
        KeyError
            If the word is OOV and none of its n-grams are in the model.
        """
        vocab = wv_from_text.wv.vocab
        # In-vocabulary: return the stored vector without any n-gram work
        # (the original computed the n-grams first, wastefully).
        if word in vocab:
            return wv_from_text[word]

        # Embedding dimensionality, taken from the first stored vector.
        word_size = wv_from_text.wv.syn0[0].shape[0]
        ngrams = _char_ngrams(word, min_n, max_n)

        word_vec = np.zeros(word_size, dtype=np.float32)
        matched = 0
        # Prefer n-grams of two or more characters.
        for ngram in ngrams:
            if len(ngram) > 1 and ngram in vocab:
                word_vec += wv_from_text[ngram]
                matched += 1
        # Fall back to single characters only if nothing longer matched.
        # Membership is checked here too — the original indexed the model
        # unconditionally and crashed with a raw KeyError on the first
        # unknown character, making the informative error below unreachable.
        if matched == 0:
            for ngram in ngrams:
                if len(ngram) == 1 and ngram in vocab:
                    word_vec += wv_from_text[ngram]
                    matched += 1
        if word_vec.any():
            return word_vec / max(1, matched)
        raise KeyError('all ngrams for word %s absent from model' % word)
    

    例子1

    # Example 1: in-vocabulary word — wordVec returns the stored vector directly.
    vec = wordVec('苹果', wv_from_text, min_n = 1, max_n = 3)
    wv_from_text.most_similar(positive=[vec], topn=20)
    

    输出:

    [('苹果', 1.0),
     ('苹果公司', 0.8514505624771118),
     ('以及苹果', 0.8457839488983154),
     ('比如苹果', 0.7890200018882751),
     ('苹果新', 0.7845828533172607),
     ('其他苹果', 0.7817449569702148),
     ('iphone', 0.7793817520141602),
     ('苹果iphone', 0.7790712714195251),
     ('苹果的iphone', 0.7720062136650085),
     ('apple', 0.7679361701011658),
     ('苹果产品', 0.7623019814491272),
     ('像苹果', 0.7533938884735107),
     ('小米', 0.7517136335372925),
     ('关于苹果', 0.7515844106674194),
     ('iphone产品', 0.7507627606391907),
     ('iphonex', 0.7488199472427368),
     ('新款iphone', 0.747662365436554),
     ('苹果10', 0.7474119067192078),
     ('iphone系列', 0.7470223307609558),
     ('新iphone', 0.7435163855552673)]
    

    例子2

    # Example 2: product name — vector synthesized from n-grams if OOV.
    vec = wordVec('iuap', wv_from_text, min_n = 1, max_n = 3)
    wv_from_text.most_similar(positive=[vec], topn=20)
    

    输出:

    [('iuap', 1.0),
     ('用友云平台', 0.8234802484512329),
     ('paas平台', 0.8118030428886414),
     ('用友云', 0.7954781651496887),
     ('云操作系统', 0.7548810839653015),
     ('iaas平台', 0.7546966075897217),
     ('appcenter', 0.7538243532180786),
     ('u8cloud', 0.7484996914863586),
     ('paas', 0.7466067671775818),
     ('社会化商业', 0.7457333207130432),
     ('云erp', 0.7428735494613647),
     ('协同云', 0.7421062588691711),
     ('海云捷迅', 0.7403150200843811),
     ('采购云', 0.7385496497154236),
     ('paas+saas', 0.7368173599243164),
     ('云管理平台', 0.7367190718650818),
     ('escloud', 0.736686646938324),
     ('私有云平台', 0.7358618974685669),
     ('mopaas', 0.7325429916381836),
     ('云应用', 0.7322961688041687)]
    

    例子3

    # Example 3: OOV product name — averaged from in-vocabulary n-gram vectors.
    vec = wordVec('友云采', wv_from_text, min_n = 1, max_n = 3)
    wv_from_text.most_similar(positive=[vec], topn=20)
    

    输出:

    [('友云采', 1.0000001192092896),
     ('供应商协同平台', 0.7404446601867676),
     ('伙伴门户', 0.7326363325119019),
     ('企业交易平台', 0.7278861999511719),
     ('供应商门户', 0.7263870239257812),
     ('移动云分销', 0.7180557250976562),
     ('电商管理系统', 0.7153645157814026),
     ('求购大厅', 0.7131102085113525),
     ('百卓优采', 0.7128005027770996),
     ('o2o方案', 0.7122943997383118),
     ('农鲜生', 0.7077293992042542),
     ('会员资料库', 0.7064912915229797),
     ('企业管理云平台', 0.7042117118835449),
     ('56linked', 0.7034884691238403),
     ('网上订单系统', 0.7033181190490723),
     ('协同门户', 0.7029898762702942),
     ('电商建站', 0.7025145292282104),
     ('管理商机', 0.7013753056526184),
     ('直销通', 0.7007359862327576),
     ('erpbuilder', 0.6993728876113892)]
    

    例子4

    # Example 4: domain term — neighbors are related cloud-finance products.
    vec = wordVec('财务云', wv_from_text, min_n = 1, max_n = 3)
    wv_from_text.most_similar(positive=[vec], topn=20)
    

    输出:

    [('财务云', 1.0),
     ('财务共享服务', 0.7762293815612793),
     ('金蝶云', 0.7745106220245361),
     ('浪潮云', 0.7651669383049011),
     ('财务共享中心', 0.7502492070198059),
     ('畅捷通', 0.7385521531105042),
     ('协同云', 0.7370111346244812),
     ('企业云服务', 0.7364829182624817),
     ('用友云', 0.7306167483329773),
     ('采购云', 0.729377031326294),
     ('云erp', 0.7251084446907043),
     ('共享服务中心', 0.7224213480949402),
     ('人力云', 0.721336305141449),
     ('金蝶', 0.7165836095809937),
     ('用友', 0.7122166752815247),
     ('企业云', 0.7093378305435181),
     ('erp云', 0.7075839638710022),
     ('致远协同', 0.706666886806488),
     ('企业金融', 0.7049797773361206),
     ('移动信息化', 0.7018118500709534)]
    

    例子5

    # Example 5: OOV word built from two n-grams — note the top similarity < 1.0,
    # since the returned vector is an average, not a stored vector.
    vec = wordVec('友报账', wv_from_text, min_n = 1, max_n = 3)
    wv_from_text.most_similar(positive=[vec], topn=20)
    

    输出:

    [('报账', 0.7958753705024719),
     ('友报', 0.7958752512931824),
     ('报帐', 0.7087380886077881),
     ('报销业务', 0.7015117406845093),
     ('财务报账', 0.6572694778442383),
     ('审核报销', 0.6517125964164734),
     ('报销单', 0.6511596441268921),
     ('费用报销', 0.6456758975982666),
     ('报销单据', 0.642286479473114),
     ('原始票据', 0.6387859582901001),
     ('报销审核', 0.6324885487556458),
     ('发票报销', 0.6296700835227966),
     ('做账', 0.6251322031021118),
     ('员工报销', 0.6216662526130676),
     ('财务报销', 0.6187087297439575),
     ('原始单据', 0.6172932386398315),
     ('对账', 0.6172742247581482),
     ('费用报销单', 0.6142060160636902),
     ('审批报销', 0.6136212348937988),
     ('核账', 0.6098783016204834)]
    

    这个例子中,训练时候和测试时候的分词结果不一致。

    本文仅供学习使用

  • 相关阅读:
    LinQ&EF任我行(一)LinQ to SQL (转)
    WPF数据模板和控件模板
    Sql优化
    SQL锁表语句
    js动态创建dom
    js实现等待n秒后按钮可用
    js关于事件冒泡
    工作流学习(个人总结)
    sql常用函数
    将Datatable序列化为Json对象返回到客户端
  • 原文地址:https://www.cnblogs.com/xuehuiping/p/15185962.html
Copyright © 2020-2023  润新知