# 将BMES标记的NER语料库转换为BIO标记的语料库 (convert a BMES-tagged NER corpus into a BIO-tagged corpus)
# 读取 (read the BMES corpus)
def read_bmes_sentences(path):
    """Read a tagged corpus file into a list of sentences.

    Each non-blank line is split on single spaces; the first field is taken
    as the token and the last field as the label. A blank line ends the
    current sentence.

    Args:
        path: Location of the UTF-8 encoded corpus file.

    Returns:
        A ``(sentences, label_set, cnt_line)`` tuple. ``sentences`` is a list
        of sentences, each a list of ``[token, label]`` pairs; ``label_set``
        is the set of distinct labels seen; ``cnt_line`` is the number of
        lines read, blank lines included.
    """
    sentences = []
    sentence = []
    label_set = set()
    cnt_line = 0
    # ``with`` guarantees the handle is closed even if a line fails to parse.
    with open(path, encoding="utf-8") as f:
        for cnt_line, line in enumerate(f, start=1):
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would truncate the label of a final line that
            # has no trailing newline.
            text = line.rstrip("\n")
            if not text:
                # Blank line: close the current sentence, if there is one.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            splits = text.split(" ")
            token, label = splits[0], splits[-1]
            sentence.append([token, label])
            # Store the cleaned label so the set matches what is kept in
            # ``sentences`` (no trailing newline attached).
            label_set.add(label)
            if not line.endswith("\n"):
                # Diagnostic kept from the original script: report a line
                # that is not newline-terminated, with its line number.
                print(token, label)
                print(cnt_line)
    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
    return sentences, label_set, cnt_line


if __name__ == "__main__":
    sentences, label_set, cnt_line = read_bmes_sentences("./data/msra/dev.char.bmes")
# 转换 (convert the labels to BIO and write them out)
def bmes_label_to_bio(label):
    """Map one BMES-scheme label onto the BIO scheme.

    A leading ``S`` is replaced by ``B``; a leading ``M`` or ``E`` is replaced
    by ``I``. Any other label, including an empty one, is returned unchanged.

    Args:
        label: The label string to convert.

    Returns:
        The converted label string.
    """
    # Slice rather than index so an empty label does not raise IndexError.
    prefix = label[:1]
    if prefix == "S":
        return "B" + label[1:]
    if prefix in ("M", "E"):
        return "I" + label[1:]
    return label


def write_bio_sentences(sentences, path):
    """Write sentences to ``path`` with their labels converted to BIO.

    Each ``[token, label]`` pair is written as ``token label`` on its own
    line, and every sentence is followed by one blank line.

    Args:
        sentences: Iterable of sentences, each an iterable of records whose
            first item is the token and whose second item is the label.
        path: Destination file; it is created or truncated.
    """
    # ``with`` closes (and flushes) the file even if a write fails midway.
    with open(path, "w", encoding="utf-8") as f:
        for sen in sentences:
            for word in sen:
                char, label = word[0], word[1]
                f.write(f"{char} {bmes_label_to_bio(label)}\n")
            f.write("\n")


if __name__ == "__main__":
    write_bio_sentences(sentences, "./output/msra-bio/dev.char.bmes")