#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Extract <article> records from a DBLP XML dump into a flat text file.

Usage: script.py [dblp.xml]

Every <article> element is reduced to one string (author, title, year, pages,
journal and ee concatenated, with all spaces removed) and the collected
strings are appended to the file ``out``.
"""
import xml.sax
import io
import sys

paper_tags = ('article', 'inproceedings', 'proceedings', 'book',
              'incollection', 'phdthesis', 'mastersthesis', 'www')
sub_tags = ('publisher', 'journal', 'booktitle')

# Child elements whose text is captured.  Each tag name is also the name of
# the DBLPHandler attribute that receives the captured text.
_FIELD_TAGS = frozenset((
    'title', 'author', 'year', 'ee', 'journal',
    'pages', 'url', 'number', 'volume',
))

# Rows produced by DBLPHandler.record_row(); flushed by write_to_file().
ret = []


class DBLPHandler(xml.sax.ContentHandler):
    """SAX handler that turns each <article> element into one row of ``ret``."""

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        self.id = 1
        self.reset()

    def reset(self):
        """Clear all per-article state so the next <article> starts empty."""
        self.dup_article = 0
        self.curtag = None
        # Text chunks of the field element currently being read.
        self._chunks = []
        self.author = ''
        self.title = ''
        self.pages = ''
        self.year = ''
        self.volume = ''
        self.journal = ''
        self.number = ''
        self.url = ''
        self.ee = ''

    def write_to_file(self, filename):
        """Append every collected row to *filename*, encoded as UTF-8."""
        # io.open behaves the same on Python 2 and 3 and does the encoding,
        # and the context manager closes the file even if a write fails.
        with io.open(filename, 'a', encoding='utf8') as file_object:
            file_object.writelines(ret)

    def record_row(self):
        """Append the current article to ``ret`` as one space-free string."""
        fields = (self.author, self.title, self.year,
                  self.pages, self.journal, self.ee)
        # NOTE(review): the previous code also appended a single ' ' that this
        # same replace() removed again, so rows carry no terminator and end up
        # back-to-back in the output file.  Presumably a newline terminator
        # was intended; confirm before changing the output format.
        ret.append(u''.join(fields).replace(' ', ''))

    def startElement(self, tag, attributes):
        """Remember the element being entered and start a fresh text buffer."""
        if not tag or not tag.strip():
            return
        if tag == 'article':
            self.dup_article += 1
        self.curtag = tag
        if tag in _FIELD_TAGS:
            # A repeated field (e.g. several <author> elements) starts over,
            # so the last occurrence is the one that is kept.
            self._chunks = []

    def endElement(self, tag):
        """Store the finished field, or emit the row when </article> closes."""
        if not tag or not tag.strip():
            return
        if tag in _FIELD_TAGS:
            setattr(self, tag, u''.join(self._chunks).strip())
            self._chunks = []
        # Forget the current tag so that whitespace between elements is not
        # mistaken for the content of the element that just closed.
        self.curtag = None
        if tag == 'article':
            self.record_row()
            self.reset()

    def characters(self, content):
        """Buffer text that belongs to the field element currently open.

        The parser may deliver the text of one element in several calls, so
        the chunks are accumulated here and joined in endElement().  Text that
        follows a nested child element is not captured, because ``curtag`` no
        longer names the field at that point.
        """
        if self.curtag in _FIELD_TAGS:
            self._chunks.append(content)


if __name__ == "__main__":
    filename = 'dblp.xml'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    # Create an XMLReader.
    parser = xml.sax.make_parser()
    # Turn off namespace processing.
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)

    # Install our ContentHandler.
    Handler = DBLPHandler()
    parser.setContentHandler(Handler)

    parser.parse(filename)
    print('Parser Complete!')
    Handler.write_to_file('out')
# Additionally, the script below processes the DNA data:
# Target record length for each output file, indexed by file id (slot 0 is
# unused because file ids start at 1).
lens_DNA = [0, 1000, 2000, 2500, 500, 1000, 1500, 2000, 2500]
lens_DBLP = [0, 40, 120, 200, 40, 80, 120, 160, 200]
file_id = 1
# Number of records written to one output file before moving to the next.
LINE_MAX = 100


class _RecordSplitter(object):
    """Shared logic: cut an input file into fixed-length records."""

    def __init__(self):
        self.strn = ''

    def write_to_file(self, filename):
        """Append the current record (``self.strn``) to *filename*."""
        with open(filename, 'a+') as file_object:
            file_object.write(self.strn)

    def _split(self, filename, lengths, output_name):
        """Read *filename* and emit records of the lengths given in *lengths*.

        Input lines are accumulated (with spaces removed) until the buffer is
        longer than ``lengths[file_id]``; the buffer is then truncated to that
        length, printed and appended to ``output_name(file_id)``.  After
        LINE_MAX records the next file id is used.  Reading stops at end of
        input or once every entry of *lengths* has been used.
        """
        self.strn = ''
        file_id = 1
        cnt_lines = 0
        # The context manager closes the input even if a write fails.
        with open(filename, 'r') as fo:
            for line in fo:
                if file_id >= len(lengths):
                    break
                # NOTE(review): only spaces are removed, so the line's own
                # newline stays in the record, and records are terminated by
                # ' ' below.  Both literals may originally have been '\n';
                # confirm before changing the output format.
                self.strn += line.replace(' ', '')
                if len(self.strn) > lengths[file_id]:
                    self.strn = self.strn[0:lengths[file_id]] + ' '
                    print(self.strn)
                    self.write_to_file(output_name(file_id))
                    self.strn = ''
                    # NOTE(review): assumes the counter advances once per
                    # emitted record (not per input line) - confirm.
                    cnt_lines += 1
                    if cnt_lines >= LINE_MAX:
                        file_id += 1
                        cnt_lines = 0
        print('read_finished!')


class DNA_Handler(_RecordSplitter):
    """Split a DNA text file into the DNA_N1..3 and DNA_M1..5 record files."""

    def read_file(self, filename):
        """Cut *filename* into records using the lengths in ``lens_DNA``."""
        def output_name(current_id):
            # File ids 1-3 go to the "N" series, 4-8 to the "M" series.
            if current_id <= 3:
                return 'DNA_N' + str(current_id)
            return 'DNA_M' + str(current_id - 3)

        self._split(filename, lens_DNA, output_name)


class DBLP_Handler(_RecordSplitter):
    """Split a DBLP text file into the DBLP_1..8 record files."""

    def read_file(self, filename):
        """Cut *filename* into records using the lengths in ``lens_DBLP``."""
        self._split(filename, lens_DBLP,
                    lambda current_id: 'DBLP_' + str(current_id))


if __name__ == '__main__':
    dh = DNA_Handler()
    dh.read_file('human_dna.fa')
    # To split the DBLP data instead:
    #     bblp_h = DBLP_Handler()
    #     bblp_h.read_file('DBLP_data')