• python 相似语句匹配(非机器学习)


    #coding=utf-8
    
    import math
    from collections import Counter

    import distance
    import numpy as np
    import xlrd
    from scipy.linalg import norm
    from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
    
    # Collect the first-column value of every row after row 0 from every
    # sheet of the workbook (row 0 is skipped, presumably a header row).
    workbook = xlrd.open_workbook(u'工程师问答.xls')
    sheet_names = workbook.sheet_names()

    ls = []
    for sheet_name in sheet_names:
        sheet1 = workbook.sheet_by_name(sheet_name)
        # Iterate up to the sheet's real row count instead of a fixed 3858:
        # a shorter sheet no longer raises IndexError and a longer one is
        # no longer silently truncated.
        for i in range(1, sheet1.nrows):
            row = sheet1.row_values(i)
            ls.append(row[0])

    # Sentence that every entry of ls is compared against below.
    target = u'D90的发动机热效率是多少?'
    # Single-argument parenthesised print behaves the same on Python 2
    # and is also valid on Python 3.
    print(u'目标语句:' + target)
    
    
    # Edit-distance computation
    def edit_distance(s1, s2):
        """Return distance.levenshtein(s1, s2), the edit distance of the two strings."""
        edits = distance.levenshtein(s1, s2)
        return edits
    
    # Keep the sentences whose edit distance to the target is at most 5.
    results = [sentence for sentence in ls
               if edit_distance(sentence, target) <= 5]
    # Single-argument parenthesised print behaves the same on Python 2
    # and is also valid on Python 3.
    print(u'1)编辑距离计算,阈值为5')
    for i in results:
        print(i)
    
    # Jaccard coefficient computation
    def jaccard_similarity(s1, s2):
        """Return the character-level multiset Jaccard coefficient of two strings.

        Each string is lower-cased and broken into its individual
        non-whitespace characters. The coefficient is the sum of the
        per-character minimum counts divided by the sum of the per-character
        maximum counts.

        Args:
            s1: first sentence (text string).
            s2: second sentence (text string).

        Returns:
            A float between 0.0 and 1.0. 0.0 is returned when neither
            string contains a non-whitespace character.
        """
        def char_counts(s):
            # Put a space between characters, lower-case, then split on
            # whitespace: one token per non-whitespace character.
            return Counter(' '.join(s).lower().split())

        counts1, counts2 = char_counts(s1), char_counts(s2)
        # Counter & Counter keeps per-key minima (multiset intersection);
        # Counter | Counter keeps per-key maxima (multiset union).
        numerator = sum((counts1 & counts2).values())
        denominator = sum((counts1 | counts2).values())
        if denominator == 0:
            # Both inputs are empty/blank: nothing to compare, and the
            # division below would fail.
            return 0.0
        # 1.0 * forces true division on Python 2 as well.
        return 1.0 * numerator / denominator
    
    # Keep the sentences whose Jaccard coefficient with the target exceeds 0.6.
    results = [sentence for sentence in ls
               if jaccard_similarity(sentence, target) > 0.6]
    # Single-argument parenthesised print behaves the same on Python 2
    # and is also valid on Python 3.
    print(u'2)杰卡德系数计算,阈值为0.6')
    for i in results:
        print(i)
    
    
    # TF (term-frequency) similarity computation
    def tf_similarity(s1, s2):
        """Return the cosine similarity of the character-count vectors of two strings.

        Each string is lower-cased and broken into its individual
        non-whitespace characters; the per-character counts form the
        term-frequency vector of that string.

        Args:
            s1: first sentence (text string).
            s2: second sentence (text string).

        Returns:
            A float between 0.0 and 1.0 (up to rounding). 0.0 is returned
            when either string has no non-whitespace character, instead of
            dividing by a zero norm.
        """
        def char_counts(s):
            # Put a space between characters, lower-case, then split on
            # whitespace: one token per non-whitespace character.
            return Counter(' '.join(s).lower().split())

        counts1, counts2 = char_counts(s1), char_counts(s2)
        norm1 = math.sqrt(sum(c * c for c in counts1.values()))
        norm2 = math.sqrt(sum(c * c for c in counts2.values()))
        if norm1 == 0 or norm2 == 0:
            # An empty/blank sentence has a zero vector; cosine is undefined.
            return 0.0
        # A Counter yields 0 for characters it has not seen, so only the
        # characters of s1 need to be visited.
        dot = sum(count * counts2[ch] for ch, count in counts1.items())
        return dot / (norm1 * norm2)
    
    # Keep the sentences whose TF cosine similarity with the target exceeds 0.7.
    results = [sentence for sentence in ls
               if tf_similarity(sentence, target) > 0.7]
    # Single-argument parenthesised print behaves the same on Python 2
    # and is also valid on Python 3.
    print(u'3)TF 计算,阈值为0.7')
    for i in results:
        print(i)
    
    
    # TF-IDF coefficient
    def tfidf_similarity(s1, s2):
        """Return the cosine similarity of the character-level TF-IDF vectors of two strings.

        The two strings are space-separated character by character and
        fitted together as a two-document corpus with TfidfVectorizer, using
        whitespace splitting as the tokenizer.

        Args:
            s1: first sentence (text string).
            s2: second sentence (text string).

        Returns:
            The cosine similarity of the two TF-IDF rows. 0.0 is returned
            when either string has no non-whitespace character.
        """
        def add_space(s):
            return ' '.join(list(s))

        # Insert a space between every character so each one is a token.
        s1, s2 = add_space(s1), add_space(s2)
        # A sentence without tokens has an all-zero row, which makes the
        # cosine below 0/0 (nan plus a RuntimeWarning); two such sentences
        # leave the vectorizer with an empty vocabulary, which it rejects
        # with ValueError. Treat both cases as "no similarity".
        if not s1.split() or not s2.split():
            return 0.0
        # Build the TF-IDF matrix of the two-sentence corpus.
        cv = TfidfVectorizer(tokenizer=lambda s: s.split())
        corpus = [s1, s2]
        vectors = cv.fit_transform(corpus).toarray()
        # Cosine similarity of the two TF-IDF rows.
        return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))
    
    # Keep the sentences whose TF-IDF cosine similarity with the target exceeds 0.6.
    results = [sentence for sentence in ls
               if tfidf_similarity(sentence, target) > 0.6]
    # Single-argument parenthesised print behaves the same on Python 2
    # and is also valid on Python 3.
    print(u'4)TFIDF 系数,阈值为0.6')
    for i in results:
        print(i)
  • 相关阅读:
    java复习计划
    超过16位的字符串装16进制
    《将博客搬至CSDN》
    android设置中文字体样式
    布局文件View和ViewGroup
    创建线程的两种方法,继承Thread,继承Runnable
    本地文件的copy复制
    字节流和字符流完成URL下载,并存入本地
    文本过滤器的用法,FileFilter()和FilenameFilter()
    JavaSE笔记
  • 原文地址:https://www.cnblogs.com/zhangtianyuan/p/9989451.html
Copyright © 2020-2023  润新知