NLP (23): Getting sentence vectors with TF-IDF and computing similarity


    I. Based on gensim

    1. Model class

    import os
    import jieba
    import heapq
    from gensim import corpora, models, similarities
    import utils.word_process as word_process
    from root_path import root
    from pathlib import Path
    
    class TfIdf(object):
        """Compute sentence similarity with a TF-IDF model."""
        def __init__(self):
            root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
            if not Path(root_path).is_dir():
                os.mkdir(root_path)
            self.dic_path = os.path.join(root_path, "bow.model")
            self.tfidf_model_path = os.path.join(root_path, "tfidf_model.model")
            self.tfidf_index_path = os.path.join(root_path, "tfidf_index.model")
            self.stop_list = word_process.get_stop_list()
    
            self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")
    
    
        def del_stopwords(self, words):
            """Remove stop words from a tokenized sentence."""
            word_list = []
    
            for word in words:
                if word not in self.stop_list:
                    word_list.append(word)
            return word_list
    
        def _seg_word(self, words_list, jieba_flag=True, del_stopword=True):
            """Tokenize multiple sentences into words (jieba) or single characters."""
            word_list = []
            if jieba_flag:
                if del_stopword:
                    for words in words_list:
                        word_list.append(self.del_stopwords(list(jieba.cut(words))))
                else:
                    for words in words_list:
                        word_list.append(list(jieba.cut(words)))
            else:
                if del_stopword:
                    for words in words_list:
                        word_list.append(self.del_stopwords(words))
                else:
                    for words in words_list:
                        word_list.append([word for word in words])
            return word_list
    
        def train(self, sentence_list):
            """Train the dictionary, the TF-IDF model, and the similarity index."""
            # Build and save the corpus dictionary
            word_list = self._seg_word(sentence_list)
            dic = corpora.Dictionary(word_list, prune_at=2000000)
            dic.save(self.dic_path)
    
            # Build the TF-IDF model
            tfidf_model_path = self.tfidf_model_path
            corpus_model = [dic.doc2bow(word) for word in word_list]
            tfidf_model = models.TfidfModel(corpus_model)
            tfidf_model.save(tfidf_model_path)
    
            # Build the retrieval (similarity) index
            tfidf_index_path = self.tfidf_index_path
            corpus_tfidf = tfidf_model[corpus_model]
            tfidf_index = similarities.MatrixSimilarity(corpus_tfidf)
            tfidf_index.save(tfidf_index_path)
    
        def predict(self, sentence):
            # Turn the sentence into a TF-IDF vector and query the similarity
            # index directly (retrieval runs over the tokenized training corpus).
            dic = corpora.Dictionary.load(self.dic_path)
            word_bow = dic.doc2bow(self._seg_word([sentence])[0])
            word_tfidf = models.TfidfModel.load(self.tfidf_model_path)[word_bow]
            tfidf_index = similarities.MatrixSimilarity.load(self.tfidf_index_path)
            score = tfidf_index[word_tfidf]
            return score
    
        def get_train_data(self):
            """Read the labels and sentences from the training file."""
            labels = []
            sentences = []
            with open(self.data_path, "r", encoding="utf8") as f:
                for line in f.readlines():
                    data_tuple = line.split("  ")
                    label = data_tuple[0]
                    labels.append(label)
                    sentence = data_tuple[1].replace("\n", "").replace("\r", "")
                    sentences.append(sentence)
            return labels, sentences
    
        def main(self):
            labels, sentences = self.get_train_data()
            print(sentences)
            self.train(sentences)
            score_list = self.predict("我有困难还不了")
    
            # Indices of the top-30 scores, highest first
            print(heapq.nlargest(30, range(len(score_list)), score_list.__getitem__))

            # The top-30 score values themselves
            print(heapq.nlargest(30, score_list))
    
    
    
    if __name__ == '__main__':
        TfIdf().main()
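
    The scores returned by predict line up one-to-one with the training sentences, so the top indices can be mapped straight back to the original text. A minimal usage sketch under that assumption (the cut-off of 5 is arbitrary; heapq is already imported above):

    model = TfIdf()
    labels, sentences = model.get_train_data()
    model.train(sentences)
    scores = model.predict("我有困难还不了")
    # Indices of the five highest scores, then the matching sentences.
    top_k = heapq.nlargest(5, range(len(scores)), scores.__getitem__)
    for idx in top_k:
        print(scores[idx], sentences[idx])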

    2. Utility class

    import os
    from root_path import root
    import tqdm
    
    
    stop = os.path.join(root, "confusion_detection", "data", "raw_data", "ChineseStopWords.txt")
    
    def get_stop_list():
        """Load the stop-word list."""
        stop_word_list = []
        with open(stop, "r", encoding="utf8") as f:
            data_lines = tqdm.tqdm(f.readlines(), smoothing=0, mininterval=0.1)
            data_lines.set_description('Loading stop words...')
            for line in data_lines:
                line = line.replace(" ", "").replace("\n", "").replace("\r", "")
                # Keep only single-character entries; longer lines are skipped
                if len(line) == 1:
                    stop_word_list.append(line)
        return stop_word_list
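
    A quick sanity check for the stop list is to run it against a jieba-tokenized sentence, mirroring what del_stopwords does in the model class above. A minimal, hypothetical usage sketch (the test sentence is the one used in main above):

    import jieba

    # Filter stop words out of a tokenized sentence.
    stop_words = set(get_stop_list())
    tokens = list(jieba.cut("我有困难还不了"))
    print([w for w in tokens if w not in stop_words])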

    II. Based on sklearn

    import os
    import jieba
    import pickle
    from root_path import root
    from pathlib import Path
    
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    class TfIdf(object):
        """Compute sentence similarity with a TF-IDF model (sklearn version)."""
        def __init__(self):
            root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
            if not Path(root_path).is_dir():
                os.mkdir(root_path)
            self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")
            self.model_path = os.path.join(root_path, "tfidf.model")
    
        def get_train_data(self):
            """Read the labels and sentences from the training file."""
            labels = []
            sentences = []
            with open(self.data_path, "r", encoding="utf8") as f:
                for line in f.readlines():
                    data_tuple = line.split("  ")
                    label = data_tuple[0]
                    labels.append(label)
                    sentence = data_tuple[1].replace("\n", "").replace("\r", "")
                    sentences.append(sentence)
            return labels, sentences
    
        def train(self):
            labels, sentences = self.get_train_data()
            sent_words = [list(jieba.cut(sent0)) for sent0 in sentences]
            document = [" ".join(sent0) for sent0 in sent_words]
            tfidf_vectorizer = TfidfVectorizer()
            tfidf_vectorizer.fit(document)
            # Save the fitted vectorizer
            with open(self.model_path, 'wb') as f:
                pickle.dump(tfidf_vectorizer, f)
    
        def predict(self, sentence):
            # Load the saved vectorizer
            with open(self.model_path, 'rb') as f:
                tfidf_vectorizer = pickle.load(f)
            sentence = list(jieba.cut(sentence))
            sen = " ".join(sentence)
            res = tfidf_vectorizer.transform([sen]).toarray()
            return res[0]
    
        def main(self):
            sentence = "是的,我知道那就十五号没办法,因为这个,也可能是十二十号发工资的,因为遇见了超过了一点点。"
            print(self.predict(sentence))
    
    if __name__ == '__main__':
        TfIdf().main()
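
    The sklearn version returns a TF-IDF vector for a single sentence, but computing similarity still needs a comparison step. A minimal sketch using sklearn's cosine_similarity (an assumption, not part of the original code; the two test sentences are reused from the examples above, and the model is trained first):

    from sklearn.metrics.pairwise import cosine_similarity

    model = TfIdf()
    model.train()
    vec_a = model.predict("我有困难还不了")
    vec_b = model.predict("是的,我知道那就十五号没办法,因为这个,也可能是十二十号发工资的,因为遇见了超过了一点点。")
    # cosine_similarity expects 2D input of shape (n_samples, n_features).
    print(cosine_similarity([vec_a], [vec_b])[0][0])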
Original article: https://www.cnblogs.com/zhangxianrong/p/14899304.html