• python编辑距离


    import numpy as np
    import json
    import codecs
    
    # Compute the edit (Levenshtein) distance
    def edit_distance(word1, word2):
        """Return the edit distance between ``word1`` and ``word2``.

        The distance is the minimum number of single-element insertions,
        deletions and substitutions needed to turn ``word1`` into ``word2``.
        Both arguments must support ``len()`` and iteration (e.g. strings).

        The table is a NumPy array created by ``np.zeros``, so the result is
        returned as a NumPy floating-point scalar rather than a Python int.
        """
        rows = len(word1) + 1
        cols = len(word2) + 1
        table = np.zeros((rows, cols))

        # Borders: converting to/from an empty prefix costs one edit per element.
        table[:, 0] = np.arange(rows)
        table[0, :] = np.arange(cols)

        for r, left in enumerate(word1, start=1):
            for c, right in enumerate(word2, start=1):
                mismatch = 0 if left == right else 1
                substitute = table[r - 1][c - 1] + mismatch
                delete = table[r - 1][c] + 1
                insert = table[r][c - 1] + 1
                table[r][c] = min(substitute, delete, insert)
        return table[rows - 1][cols - 1]
    
    
    # 190801
    # Compute a similarity score from the edit distance
    def simility(word1, word2):
        """Return a similarity score derived from the edit distance.

        The score is ``1 - edit_distance / max(len(word1), len(word2))``:
        1.0 for identical inputs, decreasing towards 0.0 as more edits are
        needed relative to the longer input.

        Two empty inputs are treated as identical and score 1.0.
        """
        maxLen = max(len(word1), len(word2))
        # With two empty inputs the normalising length is zero; return the
        # "identical" score directly instead of dividing by zero.
        if maxLen == 0:
            return 1.0
        res = edit_distance(word1, word2)
        return 1 - res * 1.0 / maxLen
    
    # Load the sentences to cluster, one per line of the input file.
    # Lines are stored unmodified, i.e. including their line terminator.
    # NOTE(review): the path below contains no separators -- it looks like the
    # backslashes were lost somewhere; confirm the real location before running.
    bianhaos = []
    sub_sens = []
    with codecs.open(r'C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29yiwoqucodexianbingshi_write_sub.txt','r','utf8') as f:
        # Disabled parsing of '<->'-separated records, kept for reference:
        # bianhao,sub_sen = line.split('<->')
        # sub_sen = sub_sen.strip().strip('<b>').strip('<e>')
        # bianhaos.append(bianhao)
        sub_sens.extend(f)

    # Greedy clustering: leibie[k] is the cluster id assigned to line k,
    # with -1 meaning "not assigned yet".
    count = len(sub_sens)
    leibie = [-1] * count
    cla = 0
    print(count)
    for i, sub1 in enumerate(sub_sens):
        if leibie[i] == -1:
            # Line i seeds a new cluster; every still-unassigned line whose
            # similarity to the seed is at least 0.5 joins it.
            leibie[i] = cla
            for j, sub2 in enumerate(sub_sens):
                if leibie[j] == -1 and simility(sub1, sub2) >= 0.5:
                    leibie[j] = cla
            cla += 1
            print(i)
    print(leibie)

    # Persist the per-line cluster ids as a JSON list.
    with open('leibie05.json','w') as f:
        json.dump(leibie,f)
  • 相关阅读:
    C#如何通过NCO3.0来连接SAP并调用SAP中的RFC
    .Net连接到SAP【转载】
    将博客搬至CSDN
    Apache Flume 简介
    日志收集以及分析:Splunk
    《淘宝技术这十年》读后感
    python参考手册--第9章
    Hadoop将过时了?
    pythn BeautifulSoup
    Python性能鸡汤
  • 原文地址:https://www.cnblogs.com/yiwoqu/p/11542074.html
Copyright © 2020-2023  润新知