• tf-idf算法


    import numpy as np
    from collections import Counter
    import itertools
    import matplotlib.pyplot as plt
    # Toy corpus: every string is one document.
    docs = [
        "it is a good day, I like to stay here",
        "I am happy to be here",
        "I am bob",
        "it is sunny today",
        "I have a party today",
        "it is a dog and that is a cat",
        "there are dog and cat on the tree",
        "I study hard this morning",
        "today is a good day",
        "tomorrow will be a good day",
        "I like coffee, I like book and I like apple",
        "I do not like it",
        "I am kitty, I like bob",
        "I do not care who like bob, but I like kitty",
        "It is coffee time, bring your cup",
    ]
    # Tokenise each document: drop commas, then split on single spaces.
    docs_words = [d.replace(",", "").split(" ") for d in docs]
    # itertools.chain(*iterables) concatenates any number of sequences;
    # set() then keeps each distinct word of the corpus exactly once.
    vocab = set(itertools.chain(*docs_words))
    # Word -> row index. The indices are assigned in sorted order rather than
    # in raw set-iteration order, because the latter depends on string hash
    # randomisation and would give a different mapping on every run.
    v2i = {v: i for i, v in enumerate(sorted(vocab))}
    # Row index -> word (the inverse of v2i).
    i2v = {i: v for v, i in v2i.items()}
    
    def safe_log(x):
        """Return the natural log of the non-zero entries of ``x``; zeros stay 0.

        The input is never modified: the result is a new float64 array. The
        previous in-place version overwrote the caller's data, so an
        expression such as ``safe_log(x) ... np.mean(x)`` computed the mean of
        already log-transformed values.

        Args:
            x: array-like of numbers.

        Returns:
            A float64 ndarray with the same shape as ``x``.
        """
        # np.array(...) copies, and the float dtype keeps integer inputs from
        # truncating the logarithms when they are written back.
        out = np.array(x, dtype=np.float64)
        nonzero = out != 0
        out[nonzero] = np.log(out[nonzero])
        return out
    
    # Notes on lambda:
    # - A lambda is an anonymous function, i.e. a function without a name.
    # - Its input is the argument list and its output is the value of its
    #   single expression, e.g. ``lambda x, y: x*y`` returns the product.
    # - A lambda body is limited to one expression, so it suits very simple
    #   functions. It can still read names from the enclosing and global
    #   scopes.
    #
    # Notes on axis / keepdims:
    # - axis=0 applies the reduction down the rows (one result per column).
    # - axis=1 applies the reduction across the columns (one result per row).
    # - keepdims=True keeps the reduced axis with length 1 so the result stays
    #   two-dimensional, e.g. np.sum(a, axis=1, keepdims=True).
    #
    # Term-frequency weighting schemes, keyed by name. Each takes the raw tf
    # matrix and returns the weighted matrix.
    tf_methods = {}
    # log(1 + tf)
    tf_methods["log"] = lambda tf: np.log(1+tf)
    # 0.5 + 0.5 * tf / (row-wise maximum of tf)
    tf_methods["augmented"] = (
        lambda tf: 0.5 + 0.5 * tf / np.max(tf, axis=1, keepdims=True)
    )
    # clip every entry at 1
    tf_methods["boolean"] = lambda tf: np.minimum(tf, 1)
    # (1 + log tf) / (1 + log of the row-wise mean of tf)
    tf_methods["log_avg"] = (
        lambda tf: (1 + safe_log(tf)) / (1 + safe_log(np.mean(tf, axis=1, keepdims=True)))
    )
    
    
    # Inverse-document-frequency weighting schemes, keyed by name. Each takes
    # the document-frequency array and returns the idf weights.
    idf_methods = {}
    # 1 + log(N / (df + 1)), where N is the number of documents
    idf_methods["log"] = lambda df: 1 + np.log(len(docs) / (df+1))
    # max(0, log((N - df) / (df + 1)))
    idf_methods["prob"] = (
        lambda df: np.maximum(0, np.log((len(docs) - df) / (df+1)))
    )
    # df / (sum of squared df + 1)
    idf_methods["len_norm"] = lambda df: df / (np.sum(np.square(df))+1)
    # Counter reminder: Counter(words).most_common(3) returns the three most
    # frequent items together with their counts, for example
    # [('eyes', 8), ('the', 5), ('look', 4)].
    def get_tf(method="log"):
        """Build the weighted term-frequency matrix for the corpus.

        The raw term frequency of a word in a document is its count divided
        by the count of the most frequent word in that same document. The raw
        matrix is then passed through the weighting function registered in
        ``tf_methods`` under ``method``.

        Args:
            method: key into ``tf_methods``.

        Returns:
            ndarray of shape [n_vocab, n_doc].

        Raises:
            ValueError: if ``method`` is not a key of ``tf_methods``.
        """
        # Validate first so a bad method name fails before any work is done.
        weighted_tf = tf_methods.get(method)
        if weighted_tf is None:
            raise ValueError(
                "unknown tf method {!r}; expected one of {}".format(
                    method, sorted(tf_methods)))

        _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64)    # [n_vocab, n_doc]
        for i, d in enumerate(docs_words):
            counter = Counter(d)
            # The highest count in this document is the same for every word
            # of the document, so compute it once rather than per word.
            max_count = max(counter.values())
            for v, count in counter.items():
                _tf[v2i[v], i] = count / max_count

        return weighted_tf(_tf)
    
    
    def get_idf(method="log"):
        """Build the inverse-document-frequency column for the vocabulary.

        The document frequency of a word is the number of documents that
        contain it. The df column is passed through the weighting function
        registered in ``idf_methods`` under ``method``. The fewer documents
        contain a word, the larger its idf, i.e. the better the word
        discriminates between documents.

        Args:
            method: key into ``idf_methods``.

        Returns:
            ndarray of shape [len(i2v), 1].

        Raises:
            ValueError: if ``method`` is not a key of ``idf_methods``.
        """
        idf_fn = idf_methods.get(method)
        if idf_fn is None:
            raise ValueError(
                "unknown idf method {!r}; expected one of {}".format(
                    method, sorted(idf_methods)))

        # Turn each document into a set once, so every membership test below
        # is a hash lookup instead of a scan over the token list.
        doc_sets = [set(d) for d in docs_words]
        df = np.zeros((len(i2v), 1))
        for i in range(len(i2v)):
            word = i2v[i]
            df[i, 0] = sum(1 for words in doc_sets if word in words)

        return idf_fn(df)
    
    
    def cosine_similarity(q, _tf_idf):
        """Cosine similarity between a query vector and every document column.

        Both inputs are L2-normalised along axis 0 before the dot product. A
        vector whose norm is zero (for instance a query made only of unknown
        words) is left as all zeros, so its similarity is 0 instead of NaN.

        Args:
            q: query column vector, one row per vocabulary entry.
            _tf_idf: matrix with one row per vocabulary entry and one column
                per document.

        Returns:
            1-D ndarray with one similarity value per document column.
        """
        q_norm = np.sqrt(np.sum(np.square(q), axis=0, keepdims=True))
        ds_norm = np.sqrt(np.sum(np.square(_tf_idf), axis=0, keepdims=True))
        # Dividing by 1 instead of 0 keeps zero vectors at zero and avoids
        # the 0/0 -> NaN that would otherwise poison the ranking.
        unit_q = q / np.where(q_norm == 0, 1.0, q_norm)
        unit_ds = _tf_idf / np.where(ds_norm == 0, 1.0, ds_norm)
        similarity = unit_ds.T.dot(unit_q).ravel()
        return similarity
    
    
    def docs_score(q, len_norm=False):
        """Score every document in the corpus against the query string ``q``.

        The query is tokenised the same way as the corpus (commas removed,
        split on single spaces). Words that are not in the vocabulary get
        temporary extra rows with zero idf and zero tf-idf. The global
        ``v2i`` / ``i2v`` tables are NOT modified, so the function can be
        called repeatedly: the earlier version registered unknown words
        globally, and a later query reusing one of them indexed past the end
        of the (unpadded) matrices.

        Args:
            q: query text.
            len_norm: if true, divide each score by the token count of the
                corresponding document.

        Returns:
            1-D ndarray with one cosine-similarity score per document.
        """
        q_words = q.replace(",", "").split(" ")

        # Unknown words get rows appended after the existing ones; sorting
        # only makes the (score-neutral) row assignment deterministic.
        n_known = idf.shape[0]
        unknown_rows = {
            v: n_known + k
            for k, v in enumerate(sorted(set(q_words) - set(v2i)))
        }
        unknown_v = len(unknown_rows)
        if unknown_v > 0:
            # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
            _idf = np.concatenate(
                (idf, np.zeros((unknown_v, 1), dtype=np.float64)), axis=0)
            _tf_idf = np.concatenate(
                (tf_idf, np.zeros((unknown_v, tf_idf.shape[1]), dtype=np.float64)),
                axis=0)
        else:
            _idf, _tf_idf = idf, tf_idf

        q_tf = np.zeros((len(_idf), 1), dtype=np.float64)     # [n_vocab, 1]
        for v, count in Counter(q_words).items():
            row = v2i[v] if v in v2i else unknown_rows[v]
            q_tf[row, 0] = count

        q_vec = q_tf * _idf            # [n_vocab, 1]
        # Shape printouts kept from the original script's output.
        print(q_vec.shape)
        print(_tf_idf.shape)

        q_scores = cosine_similarity(q_vec, _tf_idf)
        if len_norm:
            len_docs = [len(d) for d in docs_words]
            q_scores = q_scores / np.array(len_docs)
        print(q_scores.shape)
        return q_scores
    
    
    def get_keywords(n=2, n_docs=3):
        """Print the ``n`` highest tf-idf words of the first ``n_docs`` documents.

        Args:
            n: number of keywords printed per document.
            n_docs: number of leading documents to report. Defaults to 3, the
                value that used to be hard-coded; it is clipped to the number
                of columns in ``tf_idf``.

        Returns:
            None. One line per document is written to stdout.
        """
        for c in range(min(n_docs, tf_idf.shape[1])):
            col = tf_idf[:, c]
            # argsort is ascending, so the last n indices are the n largest
            # scores (listed from lower to higher).
            idx = np.argsort(col)[-n:]
            print("doc{}, top{} keywords {}".format(c, n, [i2v[i] for i in idx]))
    
    
    # Build the corpus matrices once; the functions above read them as globals.
    tf = get_tf()           # [n_vocab, n_doc]
    idf = get_idf()         # [n_vocab, 1]
    tf_idf = tf * idf       # [n_vocab, n_doc]
    # Optional debug output. The "\n" escapes are restored here: they had been
    # expanded into real line breaks, which left uncommented fragments behind.
    # print("tf shape(vecb in each docs): ", tf.shape)
    # print("\ntf samples:\n", tf[:2])
    # print("\nidf shape(vecb in all docs): ", idf.shape)
    # print("\nidf samples:\n", idf[:2])
    # print("\ntf_idf shape: ", tf_idf.shape)
    # print("\ntf_idf sample:\n", tf_idf[:2])
    
    
    # test
    get_keywords()
    q = "I get a coffee cup"
    scores = docs_score(q)
    print(scores)
    # argsort sorts ascending, so take the last three indices and reverse
    # them to list the best-matching documents first.
    d_ids = scores.argsort()[-3:][::-1]
    # The "\n" escapes are restored; the string literal had been split over
    # several physical lines, which is a syntax error.
    print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in d_ids]))
    

      用tf-idf算法找到与一个文档相似的其他文档。首先要统计出这些文档中出现的所有词,计算每一个文档中每个词的tf值:tf是词w在一个文档中出现的次数除以该文档的总词数,除以总词数是为了进行归一化处理。之后计算idf值:用文档的总数除以包含该词的文档数,再对得到的商取对数;包含该词的文档越少,idf值就越大,说明该词有很好的区分能力。

  • 相关阅读:
    快速提取某一文件夹下所有文件名称
    CFileFind类的使用总结
    FILE文件流的中fopen、fread、fseek、fclose的使用
    经典损失函数:交叉熵(附tensorflow)
    tensorboard使用
    Windows下 tensorboard出现ValueError:Invalid format string
    新建全色或者resize(毫无价值,只是做记录)
    创建一个任意大小的全色矩阵 python
    转移图片位置
    getpatch
  • 原文地址:https://www.cnblogs.com/zhang12345/p/15322051.html
Copyright © 2020-2023  润新知