将xml文件改写成想要的txt文件。
原xml文件:
1 <?xml version="1.0" encoding="UTF-8"?> 2 3 -<ANNOTATION_DOCUMENT xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv2.6.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" VERSION="2.6" FORMAT="2.6" DATE="" AUTHOR="QNZS"> 4 5 6 -<HEADER TIME_UNITS="milliseconds" MEDIA_FILE=""> 7 8 <MEDIA_DESCRIPTOR MIME_TYPE="audio/mpeg" MEDIA_URL=""/> 9 10 </HEADER> 11 12 13 -<TIME_ORDER> 14 15 <TIME_SLOT TIME_VALUE="0" TIME_SLOT_ID="TS0"/> 16 17 <TIME_SLOT TIME_VALUE="1740" TIME_SLOT_ID="TS0001"/> 18 19 <TIME_SLOT TIME_VALUE="0" TIME_SLOT_ID="TS1"/> 20 21 <TIME_SLOT TIME_VALUE="0" TIME_SLOT_ID="TS1001"/> 22 23 </TIME_ORDER> 24 25 26 -<TIER TIER_ID="EDU" LINGUISTIC_TYPE_REF="EDU" DEFAULT_LOCALE="ru"> 27 28 29 -<ANNOTATION> 30 31 32 -<ALIGNABLE_ANNOTATION TIME_SLOT_REF2="TS0001" TIME_SLOT_REF1="TS0" ANNOTATION_ID="EDU0"> 33 34 <ANNOTATION_VALUE>0</ANNOTATION_VALUE> 35 36 </ALIGNABLE_ANNOTATION> 37 38 </ANNOTATION> 39 40 41 -<ANNOTATION> 42 43 44 -<ALIGNABLE_ANNOTATION TIME_SLOT_REF2="TS1001" TIME_SLOT_REF1="TS1" ANNOTATION_ID="EDU1"> 45 46 <ANNOTATION_VALUE>1</ANNOTATION_VALUE> 47 48 </ALIGNABLE_ANNOTATION> 49 50 </ANNOTATION> 51 52 </TIER> 53 54 55 -<TIER TIER_ID="角色" LINGUISTIC_TYPE_REF="EDUProp" DEFAULT_LOCALE="ru" PARENT_REF="EDU"> 56 57 58 +<ANNOTATION> 59 60 61 62 63 64 65 66 67 -<ANNOTATION> 68 69 70 -<REF_ANNOTATION ANNOTATION_ID="People1" ANNOTATION_REF="EDU1"> 71 72 <ANNOTATION_VALUE>客户</ANNOTATION_VALUE> 73 74 </REF_ANNOTATION> 75 76 </ANNOTATION> 77 78 </TIER> 79 80 81 -<TIER TIER_ID="文本" LINGUISTIC_TYPE_REF="EDUProp" DEFAULT_LOCALE="ru" PARENT_REF="EDU"> 82 83 84 -<ANNOTATION> 85 86 87 -<REF_ANNOTATION ANNOTATION_ID="Text0" ANNOTATION_REF="EDU0"> 88 89 <ANNOTATION_VALUE>人家不愿意愤怒的您没办法</ANNOTATION_VALUE> 90 91 </REF_ANNOTATION> 92 93 </ANNOTATION> 94 95 96 -<ANNOTATION> 97 98 99 -<REF_ANNOTATION ANNOTATION_ID="Text1" ANNOTATION_REF="EDU1"> 100 101 <ANNOTATION_VALUE>_end_</ANNOTATION_VALUE> 102 103 </REF_ANNOTATION> 104 105 </ANNOTATION> 106 107 
</TIER> 108 109 110 -<TIER TIER_ID="规则信息" LINGUISTIC_TYPE_REF="EDUProp" DEFAULT_LOCALE="ru" PARENT_REF="EDU"> 111 112 113 -<ANNOTATION> 114 115 116 -<REF_ANNOTATION ANNOTATION_ID="Comment1" ANNOTATION_REF="EDU1"> 117 118 <ANNOTATION_VALUE/> 119 120 </REF_ANNOTATION> 121 122 </ANNOTATION> 123 124 </TIER> 125 126 <LINGUISTIC_TYPE TIME_ALIGNABLE="true" LINGUISTIC_TYPE_ID="EDU" GRAPHIC_REFERENCES="true"/> 127 128 <LINGUISTIC_TYPE TIME_ALIGNABLE="false" LINGUISTIC_TYPE_ID="EDUProp" GRAPHIC_REFERENCES="false" CONSTRAINTS="Symbolic_Association"/> 129 130 </ANNOTATION_DOCUMENT>
生成后的TXT文件:
坐席 : 人家不愿意愤怒的您没办法
客户 : _end_
可以通过以下代码实现。
# Convert ELAN-style annotation XML files into plain-text transcripts with
# one "role : utterance" record per line.
import codecs
import xml.etree.ElementTree as ET
import sys
import re
import csv
import os

# Directory scanned for the ``.xml`` annotation files.
file_dir = 'D:/untitled/test/fcc'
# Directory that receives one ``.txt`` transcript per converted file.
OUTPUT_DIR = './csv'
# TIER_ID values of the tiers holding the speaker role and the utterance text.
ROLE_TIER_ID = '角色'
TEXT_TIER_ID = '文本'


# Get the names of all files in the given directory.
def file_name(file_dir):
    """Return the sorted names of the files directly inside *file_dir*.

    Only the top level is listed; sub-directories are not descended into.
    An empty list is returned when the directory is empty or does not exist.
    """
    for _root, _dirs, files in os.walk(file_dir):
        # The first tuple yielded by os.walk describes *file_dir* itself.
        return sorted(files)
    return []


def _tier_annotations(root, tier_id):
    """Return ``(ANNOTATION_REF, value)`` pairs of one tier in document order."""
    pairs = []
    for tier in root.iter('TIER'):
        if tier.get('TIER_ID') != tier_id:
            continue
        for ref in tier.iter('REF_ANNOTATION'):
            # findtext gives None for a missing element and '' for an empty one.
            value = (ref.findtext('ANNOTATION_VALUE') or '').strip()
            pairs.append((ref.get('ANNOTATION_REF'), value))
    return pairs


def extract_dialogue(xml_path, role_tier=ROLE_TIER_ID, text_tier=TEXT_TIER_ID):
    """Return the ``"role : text"`` records found in the file *xml_path*.

    Role and text are matched through the ``ANNOTATION_REF`` attribute that
    both REF_ANNOTATION elements share, so the result does not depend on how
    many annotations other tiers contain, on the XML line layout, or on
    whether an utterance happens to consist of digits only. Records follow
    the document order of the text tier; an utterance without a role
    annotation gets an empty role.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        OSError: if the file cannot be read.
    """
    root = ET.parse(xml_path).getroot()
    roles = dict(_tier_annotations(root, role_tier))
    return [
        '{} : {}'.format(roles.get(ref, ''), text)
        for ref, text in _tier_annotations(root, text_tier)
    ]


def convert_file(xml_path, txt_path):
    """Write the transcript of *xml_path* to *txt_path*; return the record count."""
    # Parse before opening the output so a broken input leaves no empty file.
    records = extract_dialogue(xml_path)
    # 'w' rather than 'a+': re-running the conversion must not duplicate lines.
    with open(txt_path, 'w', encoding='utf-8', newline='') as txtfile:
        txtfile.writelines(record + '\n' for record in records)
    return len(records)


def convert_directory(xml_dir, txt_dir):
    """Convert every ``.xml`` file in *xml_dir* into ``<stem>.txt`` in *txt_dir*.

    Files with another extension are ignored. A file that is not well-formed
    XML is reported on stderr and skipped, so one bad input does not abort
    the batch. Returns the list of stems that were converted.
    """
    os.makedirs(txt_dir, exist_ok=True)
    converted = []
    for name in file_name(xml_dir):
        stem, ext = os.path.splitext(name)
        if ext.lower() != '.xml':
            continue
        try:
            convert_file(os.path.join(xml_dir, name),
                         os.path.join(txt_dir, stem + '.txt'))
        except ET.ParseError as err:
            sys.stderr.write('skipping {}: {}\n'.format(name, err))
            continue
        converted.append(stem)
    return converted


if __name__ == '__main__':
    print(convert_directory(file_dir, OUTPUT_DIR))