• python获取DBLP数据集


    #!/usr/bin/python
    # -*- coding: UTF-8 -*-
    
    import xml.sax
    import io, sys
    
    # DBLP record element names. Not referenced by the visible part of this
    # script.
    paper_tags = (
        'article',
        'inproceedings',
        'proceedings',
        'book',
        'incollection',
        'phdthesis',
        'mastersthesis',
        'www',
    )

    # Further DBLP child element names. Not referenced by the visible part of
    # this script either.
    sub_tags = (
        'publisher',
        'journal',
        'booktitle',
    )

    # Rows appended by DBLPHandler.record_row() and later flushed to disk by
    # DBLPHandler.write_to_file().
    ret = list()
    
    class DBLPHandler(xml.sax.ContentHandler):
        """SAX content handler that turns each ``<article>`` into one text row.

        While parsing, the text of the child elements listed in ``_FIELDS`` is
        stored in the instance attribute of the same name. When an
        ``</article>`` end tag is seen, ``record_row()`` appends one row to the
        module-level ``ret`` list and the per-record state is cleared.

        Notes:
            * If an element occurs several times inside one record (for
              example several ``<author>`` elements), only the last occurrence
              is kept.
            * Text located inside or after nested markup within a field (for
              example ``<i>`` inside ``<title>``) is not captured.
        """

        # Child elements whose text is captured; each maps to the attribute of
        # the same name.
        _FIELDS = ('author', 'title', 'pages', 'year', 'volume', 'journal',
                   'number', 'url', 'ee')

        def __init__(self):
            # Explicit base call (rather than super()) also works on Python 2.
            xml.sax.ContentHandler.__init__(self)
            self.id = 1
            self.reset()

        def reset(self):
            """Clear all per-record state (captured fields and current tag)."""
            self.dup_article = 0
            self.curtag = None
            for field in self._FIELDS:
                setattr(self, field, '')

        def write_to_file(self, filename):
            """Append every row collected in ``ret`` to ``filename`` as UTF-8.

            Rows already end with a newline (added by ``record_row``), so they
            are written back to back.
            """
            # io.open with an explicit encoding behaves the same on Python 2
            # and 3, and the context manager closes the file even on error.
            with io.open(filename, 'a', encoding='utf8') as file_object:
                for line in ret:
                    file_object.write(line)

        def record_row(self):
            """Append the current record to ``ret`` as a single line.

            The row is author, title, year, pages, journal and ee concatenated
            without separators, with every space character removed, followed
            by a newline.
            """
            row = u''.join((self.author, self.title, self.year, self.pages,
                            self.journal, self.ee, u'\n'))
            ret.append(row.replace(' ', ''))

        def startElement(self, tag, attributes):
            """Remember which element is open so ``characters`` can route text."""
            if tag is None or not tag.strip():
                return
            if tag == 'article':
                # Start every article from a clean slate so values captured
                # from preceding non-article records cannot leak into its row.
                self.reset()
                self.dup_article += 1
            self.curtag = tag
            if tag in self._FIELDS:
                # A repeated element replaces the previous value.
                setattr(self, tag, '')

        def endElement(self, tag):
            """Finish a field, and emit a row when an article closes."""
            if tag is None or not tag.strip():
                return
            if tag in self._FIELDS:
                setattr(self, tag, getattr(self, tag).strip())
            # Text that follows an end tag (typically inter-element whitespace)
            # belongs to no field.
            self.curtag = None
            if tag == 'article':
                self.record_row()
                self.reset()

        def characters(self, content):
            """Accumulate text for the currently open field.

            The parser may deliver the text of one element in several calls
            (for instance around entity references), so chunks are appended
            rather than assigned; surrounding whitespace is stripped once the
            element ends.
            """
            if self.curtag in self._FIELDS:
                setattr(self, self.curtag, getattr(self, self.curtag) + content)
    
    if __name__ == "__main__":
        # The input defaults to dblp.xml in the working directory; a single
        # command-line argument overrides it.
        filename = 'dblp.xml'
        if len(sys.argv) == 2:
            filename = sys.argv[1]
        # Create an XMLReader.
        parser = xml.sax.make_parser()
        # Turn off namespace processing.
        parser.setFeature(xml.sax.handler.feature_namespaces, 0)

        # Install the custom ContentHandler.
        Handler = DBLPHandler()
        parser.setContentHandler(Handler)

        parser.parse(filename)
        # The parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print('Parser Complete!')
        # Rows are appended to the file named 'out' in the working directory.
        Handler.write_to_file('out')

    另外附处理DNA数据的脚本程序:

    # Row length for each output slot, indexed by the local file_id used in
    # the read_file() methods. Slot 0 is a placeholder because file_id starts
    # at 1.
    lens_DNA = [
        0,
        1000, 2000, 2500,             # slots 1-3 -> DNA_N1 .. DNA_N3
        500, 1000, 1500, 2000, 2500,  # slots 4-8 -> DNA_M1 .. DNA_M5
    ]
    lens_DBLP = [
        0,
        40, 120, 200,
        40, 80, 120, 160, 200,        # slots 1-8 -> DBLP_1 .. DBLP_8
    ]

    # Module-level counter; the read_file() methods use their own local
    # file_id and never read this one.
    file_id = 1
    # Number of rows written for one slot before moving on to the next slot.
    LINE_MAX = 100
    
    class DNA_Handler:
        """Cut a DNA text file into fixed-length rows spread over sample files.

        Input lines are joined (newlines removed) until the buffer is longer
        than the row length configured in ``lens_DNA`` for the current slot.
        The buffer is then truncated to that length and appended as one row to
        the slot's output file. After ``LINE_MAX`` rows the next slot is used.
        """

        def __init__(self):
            # Row buffer: input text is accumulated here until it is long
            # enough to be emitted.
            self.strn = ''

        def write_to_file(self, filename):
            """Append the current buffer ``self.strn`` to ``filename``."""
            # The context manager closes the file even if the write fails.
            with open(filename, 'a') as file_object:
                file_object.write(self.strn)

        def read_file(self, filename):
            """Read ``filename`` and write the sample rows to the output files.

            Slots 1-3 go to ``DNA_N1`` .. ``DNA_N3`` and the remaining slots to
            ``DNA_M1`` onwards, all relative to the working directory. Each
            emitted row is also printed. Characters beyond the row length, and
            any partial row left at end of input, are not written.

            NOTE(review): every input line is treated as sequence data; if the
            input is FASTA, its '>' header lines end up in the rows as well.
            Confirm whether that is intended.
            """
            self.strn = ''
            file_id = 1
            cnt_lines = 0
            with open(filename, 'r') as fo:
                for line in fo:
                    # Stop once every configured slot has been filled.
                    if file_id >= len(lens_DNA):
                        break
                    self.strn += line.replace('\n', '')
                    row_len = lens_DNA[file_id]
                    if len(self.strn) <= row_len:
                        continue
                    self.strn = self.strn[:row_len] + '\n'
                    print(self.strn)
                    if file_id <= 3:
                        self.write_to_file('DNA_N' + str(file_id))
                    else:
                        self.write_to_file('DNA_M' + str(file_id - 3))
                    self.strn = ''
                    cnt_lines += 1
                    if cnt_lines >= LINE_MAX:
                        file_id += 1
                        cnt_lines = 0
            print('read_finished!')
    
    
    
    class DBLP_Handler:
        """Cut a DBLP text file into fixed-length rows spread over sample files.

        Input lines are joined (newlines removed) until the buffer is longer
        than the row length configured in ``lens_DBLP`` for the current slot.
        The buffer is then truncated to that length and appended as one row to
        ``DBLP_<slot>``. After ``LINE_MAX`` rows the next slot is used.
        """

        def __init__(self):
            # Row buffer: input text is accumulated here until it is long
            # enough to be emitted.
            self.strn = ''

        def write_to_file(self, filename):
            """Append the current buffer ``self.strn`` to ``filename``."""
            # The context manager closes the file even if the write fails.
            with open(filename, 'a') as file_object:
                file_object.write(self.strn)

        def read_file(self, filename):
            """Read ``filename`` and write the sample rows to ``DBLP_<slot>``.

            Output files are created relative to the working directory. Each
            emitted row is also printed. Characters beyond the row length, and
            any partial row left at end of input, are not written.
            """
            self.strn = ''
            file_id = 1
            cnt_lines = 0
            with open(filename, 'r') as fo:
                for line in fo:
                    # Stop once every configured slot has been filled.
                    if file_id >= len(lens_DBLP):
                        break
                    self.strn += line.replace('\n', '')
                    row_len = lens_DBLP[file_id]
                    if len(self.strn) <= row_len:
                        continue
                    self.strn = self.strn[:row_len] + '\n'
                    print(self.strn)
                    self.write_to_file('DBLP_' + str(file_id))
                    self.strn = ''
                    cnt_lines += 1
                    if cnt_lines >= LINE_MAX:
                        file_id += 1
                        cnt_lines = 0
            print('read_finished!')
    
    
    if __name__ == '__main__':
        # Split the DNA input file into the DNA_N*/DNA_M* sample files.
        dh = DNA_Handler()
        dh.read_file('human_dna.fa')
        # The DBLP pass below is disabled: it sits inside a bare string
        # literal, which is evaluated and discarded without running anything.
        '''
        bblp_h = DBLP_Handler()
        bblp_h.read_file('DBLP_data')
        '''

       

  • 相关阅读:
    HDU 6103 Kirinriki【尺取法】【思维题】【好题】
    HDU 6103 Kirinriki【尺取法】【思维题】【好题】
    HDU 6095 Rikka with Competition【阅读题】【水题】
    HDU 6095 Rikka with Competition【阅读题】【水题】
    HDU 2844 Coins[【经典题】【模板题】
    HDU 2844 Coins[【经典题】【模板题】
    HDU 6090 Rikka with Graph【思维题】
    HDU 6090 Rikka with Graph【思维题】
    Codeforces Round #318(Div. 1) 573 D. Bear and Cavalry【dp+矩阵+线段树优化】
    Codeforces Round #318(Div. 1) 573 D. Bear and Cavalry【dp+矩阵+线段树优化】
  • 原文地址:https://www.cnblogs.com/luntai/p/6206833.html
Copyright © 2020-2023  润新知