"""Greedy clustering of text lines by normalized edit-distance similarity.

Run as a script, this reads one sentence per line from ``INPUT_PATH``,
assigns every line a cluster label and writes the list of labels (one integer
per input line, in input order) to ``OUTPUT_PATH`` as JSON.
"""
import codecs
import json

import numpy as np

# NOTE(review): this literal contains no directory separators; the backslashes
# of the Windows path look like they were lost at some point. It is kept
# byte-for-byte here -- confirm the intended location before running.
INPUT_PATH = r'C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29yiwoqucodexianbingshi_write_sub.txt'
OUTPUT_PATH = 'leibie05.json'

# Minimum similarity (inclusive) for a line to join a cluster seed.
SIMILARITY_THRESHOLD = 0.5


# Compute the edit distance.
def edit_distance(word1, word2):
    """Return the Levenshtein edit distance between two sequences.

    Insertions, deletions and substitutions each cost 1.

    Args:
        word1: First sequence (anything supporting ``len`` and indexing).
        word2: Second sequence.

    Returns:
        The distance as a plain ``int``.
    """
    len1 = len(word1)
    len2 = len(word2)
    # dp[i, j] is the distance between word1[:i] and word2[:j]. An integer
    # matrix keeps the counts exact instead of storing them as floats.
    dp = np.zeros((len1 + 1, len2 + 1), dtype=int)
    # Turning a prefix into the empty sequence costs one deletion per item.
    dp[:, 0] = np.arange(len1 + 1)
    dp[0, :] = np.arange(len2 + 1)
    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            cost = 0 if word1[i - 1] == word2[j - 1] else 1
            dp[i, j] = min(
                dp[i - 1, j - 1] + cost,  # substitute / match
                dp[i - 1, j] + 1,         # delete from word1
                dp[i, j - 1] + 1,         # insert into word1
            )
    return int(dp[len1, len2])


# 190801
# Compute a similarity score from the edit distance.
def simility(word1, word2):
    """Return the similarity of two sequences, derived from edit distance.

    The score is ``1 - edit_distance / max(len(word1), len(word2))``:
    1.0 for identical inputs, 0.0 when the distance equals the longer length.

    Args:
        word1: First sequence.
        word2: Second sequence.

    Returns:
        The similarity as a ``float``. Two empty inputs are identical and
        yield 1.0 (the formula itself would divide by zero).
    """
    max_len = max(len(word1), len(word2))
    if max_len == 0:
        return 1.0
    return 1 - edit_distance(word1, word2) * 1.0 / max_len


def cluster_sentences(sentences, threshold=SIMILARITY_THRESHOLD, verbose=False):
    """Assign a cluster label to every sentence using greedy seeding.

    Sentences are scanned in order. The first unlabeled sentence becomes the
    seed of a new cluster, and every later unlabeled sentence whose similarity
    to that seed is at least ``threshold`` joins it. Members are compared
    against the seed only, never against each other.

    Args:
        sentences: Sequence of strings to cluster.
        threshold: Inclusive similarity cut-off for joining a seed's cluster.
        verbose: When true, print the index of each seed once its cluster has
            been filled (progress output).

    Returns:
        A list of integer labels, parallel to ``sentences``, numbered from 0
        in order of seed appearance.
    """
    count = len(sentences)
    labels = [-1] * count  # -1 marks "not yet assigned"
    next_label = 0
    for i, seed in enumerate(sentences):
        if labels[i] != -1:
            continue
        labels[i] = next_label
        # Every index <= i is already labeled at this point, so only the
        # sentences after the seed can still be unassigned.
        for j in range(i + 1, count):
            if labels[j] != -1:
                continue
            if simility(seed, sentences[j]) >= threshold:
                labels[j] = next_label
        next_label += 1
        if verbose:
            print(i)
    return labels


def main():
    """Read the input lines, cluster them and dump the labels as JSON."""
    # An earlier, disabled variant split each line on '<->' into an id and a
    # sentence and stripped '<b>'/'<e>' markers; the raw line is used instead.
    # NOTE(review): lines keep their terminators, so the newline characters
    # take part in the similarity computation -- confirm this is intended.
    with codecs.open(INPUT_PATH, 'r', 'utf8') as f:
        sub_sens = list(f)

    print(len(sub_sens))
    leibie = cluster_sentences(sub_sens, verbose=True)
    print(leibie)

    with open(OUTPUT_PATH, 'w') as f:
        json.dump(leibie, f)


if __name__ == '__main__':
    main()