"""Tag vocabulary terms in a text file with category placeholders.

Each ``word_replace*`` function reads a vocabulary file (one term per line),
replaces every occurrence of those terms in a text file with a bracketed
category tag such as ``[hospital]`` and writes the result back, one record
per line.  Longer terms are always replaced before shorter ones so that a
short term cannot break up a longer one that contains it.
"""
import codecs
import re

import pandas as pd

try:
    import jieba.analyse
except ImportError:
    # jieba is not referenced by any code visible in this file, so a missing
    # installation must not prevent the tagging functions from being used.
    jieba = None

_ENCODING = 'utf8'

# Default destination of the first (hospital) pass; later passes rewrite
# this file in place.
# NOTE(review): this path (like the ones in main()) looks as if its
# separators were lost at some point -- confirm before running.
_DEFAULT_SUB_PATH = r'C:UsersAdministrator.SC-201812211013PycharmProjects词表工作代码yiwoqucodexianbingshi_write_sub.txt'


def _is_multi_char(term):
    """Return True for terms longer than one character."""
    return len(term) > 1


def _is_single_char(term):
    """Return True for terms that are exactly one character long."""
    return len(term) == 1


def _load_terms(vocab_path, keep=None):
    """Return the distinct, non-empty terms of *vocab_path*, longest first.

    Args:
        vocab_path: UTF-8 file holding one term per line; surrounding
            whitespace is stripped from every term.
        keep: optional predicate; only terms for which it returns a true
            value are kept.  ``None`` keeps every term.

    Returns:
        A list of terms sorted by decreasing length.  Terms of equal length
        keep the order of their first appearance in the file.
    """
    seen = set()
    terms = []
    with codecs.open(vocab_path, 'r', _ENCODING) as f:
        for raw in f:
            term = raw.strip()
            # A blank line would yield the empty term, and
            # ``str.replace('', tag)`` inserts the tag between every
            # character of the text, so empty terms are always dropped.
            if not term or term in seen:
                continue
            seen.add(term)
            if keep is None or keep(term):
                terms.append(term)
    # Longest first: a short term must not split a longer one containing it.
    terms.sort(key=len, reverse=True)
    return terms


def _tag_file(text_path, terms, tag, output_path=None):
    """Replace every term in *text_path* with *tag* and write the result.

    Args:
        text_path: UTF-8 text file to read, one record per line.
        terms: terms to replace, applied in the given order.
        tag: replacement text, e.g. ``'[disease]'``.
        output_path: file to write; defaults to *text_path* (in-place
            rewrite).  The input is read completely before the output is
            opened, so both may be the same file.
    """
    if output_path is None:
        output_path = text_path
    tagged = []
    with codecs.open(text_path, 'r', _ENCODING) as f:
        for line in f:
            for term in terms:
                line = line.replace(term, tag)
            line = line.strip()
            tagged.append(line)
            print(line)
    with codecs.open(output_path, 'w', _ENCODING) as f:
        # One record per line, so that the next pass (which iterates the
        # file line by line) sees the same record boundaries.
        f.writelines(line + '\n' for line in tagged)


def word_replace(xianbingshi, hospital1, output_path=None):
    """Tag hospital names with ``[hospital]``.

    Unlike the other passes this one does not rewrite its input: the result
    goes to *output_path*.

    Args:
        xianbingshi: UTF-8 text file to read.
        hospital1: vocabulary file of hospital names.
        output_path: file to write; ``None`` selects the default
            destination used by the rest of the pipeline.
    """
    if output_path is None:
        output_path = _DEFAULT_SUB_PATH
    _tag_file(xianbingshi, _load_terms(hospital1), '[hospital]', output_path)


def word_replace3(xianbingshi2, operation1):
    """Tag operation terms in *xianbingshi2* with ``[operation]``, in place."""
    _tag_file(xianbingshi2, _load_terms(operation1), '[operation]')


def word_replace1(xianbingshi2, disease1):
    """Tag disease terms longer than one character with ``[disease]``, in place."""
    _tag_file(xianbingshi2, _load_terms(disease1, _is_multi_char), '[disease]')


def word_replace2(xianbingshi2, symptom1):
    """Tag symptom terms longer than one character with ``[symptom]``, in place."""
    _tag_file(xianbingshi2, _load_terms(symptom1, _is_multi_char), '[symptom]')


def word_replace4(xianbingshi2, test1):
    """Tag test/examination terms in *xianbingshi2* with ``[test]``, in place."""
    _tag_file(xianbingshi2, _load_terms(test1), '[test]')


def word_replace5(xianbingshi2, time1):
    """Tag time expressions in *xianbingshi2* with ``[time]``, in place."""
    _tag_file(xianbingshi2, _load_terms(time1), '[time]')


def word_replace6(xianbingshi2, organ1):
    """Tag organ terms longer than one character with ``[organ]``, in place."""
    _tag_file(xianbingshi2, _load_terms(organ1, _is_multi_char), '[organ]')


def word_replace7(xianbingshi2, symptom1):
    """Tag single-character symptom terms with ``[symptom]``, in place.

    The selected terms are printed before the text is processed.
    """
    terms = _load_terms(symptom1, _is_single_char)
    for term in terms:
        print(term)
    _tag_file(xianbingshi2, terms, '[symptom]')


def word_replace8(xianbingshi2, disease1):
    """Tag single-character disease terms with ``[disease]``, in place."""
    # The previous filter compared the term itself with the integer 1, which
    # is never true, so this pass silently replaced nothing.
    _tag_file(xianbingshi2, _load_terms(disease1, _is_single_char), '[disease]')


def word_replace9(xianbingshi2, organ1):
    """Tag single-character organ terms with ``[organ]``, in place."""
    # Same length filter as word_replace8 (previously an always-false test).
    _tag_file(xianbingshi2, _load_terms(organ1, _is_single_char), '[organ]')


def main():
    """Run the tagging passes over the configured files.

    Order of the passes: hospital, operation, disease, symptom, test, organ
    (multi-character terms), then single-character symptom, disease and
    organ terms.
    NOTE(review): the original inline note gave the priority as hospital,
    operation, test, symptom, disease, body part, time, which differs from
    the call order below -- confirm which one is intended.
    """
    # NOTE(review): these literals are reproduced exactly as found; they
    # appear to be missing their path separators -- confirm before running.
    disease1 = r'C:UsersAdministrator.SC-201812211013PycharmProjects词表工作代码yiwoquTXTdisease_0903.txt'
    organ1 = r"C:UsersAdministrator.SC-201812211013PycharmProjects词表工作代码yiwoquTXTorgan_0903.txt"
    test1 = r"C:UsersAdministrator.SC-201812211013PycharmProjects词表工作代码yiwoquTXT est_0903.txt"
    time1 = r"C:UsersAdministrator.SC-201812211013PycharmProjects词表工作代码yiwoqu ime1.txt"
    operation1 = r"C:UsersAdministrator.SC-201812211013PycharmProjects词表工作代码yiwoquTXToperation_0903.txt"
    symptom1 = r"C:UsersAdministrator.SC-201812211013PycharmProjects词表工作代码yiwoqucode症状.txt"
    xianbingshi = r'C:UsersAdministrator.SC-201812211013PycharmProjects词表工作代码yiwoqucodexianbingshi_write.txt'
    xianbingshi2 = r'C:UsersAdministrator.SC-201812211013PycharmProjects词表工作代码yiwoqucodexianbingshi_write_sub.txt'
    hospital1 = r'C:UsersAdministrator.SC-201812211013PycharmProjects词表工作代码yiwoquTXThospital_0903.txt'

    word_replace(xianbingshi, hospital1)
    word_replace3(xianbingshi2, operation1)
    word_replace1(xianbingshi2, disease1)
    word_replace2(xianbingshi2, symptom1)
    word_replace4(xianbingshi2, test1)
    # The time pass is currently disabled:
    # word_replace5(xianbingshi2, time1)
    word_replace6(xianbingshi2, organ1)
    word_replace7(xianbingshi2, symptom1)
    word_replace8(xianbingshi2, disease1)
    word_replace9(xianbingshi2, organ1)


if __name__ == '__main__':
    main()