• 将搜狗词库(.scel格式)转化为txt格式


    参考:http://blog.csdn.net/zhangzhenhu/article/details/7014271

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import struct
    import os
    import sys
    
    #拼音表偏移,
    startPy = 0x1540
    #汉语词组表偏移
    startChinese = 0x2628
    #全局拼音表
    GPy_Table = {}
    
    #解析结果
    #元组(词频,拼音,中文词组)的列表
    GTable = []
    
    def byte2str(data):
        '''将原始字节码转为字符串'''
        i = 0
        length = len(data)
        ret = u''
        while i < length:
            x = data[i] + data[i+1]
            t = unichr(struct.unpack('H',x)[0])
            if t == u'
    ':
                ret += u'
    '
            elif t != u' ':
                ret += t
            i += 2
        return ret
    #获取拼音表
    def getPyTable(data):
    
        if data[0:4] != "x9Dx01x00x00":
            return None
        data = data[4:]
        pos = 0
        length = len(data)
        while pos < length:
            index = struct.unpack('H',data[pos]+data[pos+1])[0]
            #print index,
            pos += 2
            l = struct.unpack('H',data[pos]+data[pos+1])[0]
            #print l,
            pos += 2
            py = byte2str(data[pos:pos+l])
            #print py
            GPy_Table[index]=py
            pos += l
    
    #获取一个词组的拼音
    def getWordPy(data):
        pos = 0
        length = len(data)
        ret = u''
        while pos < length:
    
            index = struct.unpack('H',data[pos]+data[pos+1])[0]
            ret += GPy_Table[index]
            pos += 2
        return ret
    
    #获取一个词组
    def getWord(data):
        pos = 0
        length = len(data)
        ret = u''
        while pos < length:
    
            index = struct.unpack('H',data[pos]+data[pos+1])[0]
            ret += GPy_Table[index]
            pos += 2
        return ret
    
    #读取中文表
    def getChinese(data):
        #import pdb
        #pdb.set_trace()
    
        pos = 0
        length = len(data)
        while pos < length:
            #同音词数量
            same = struct.unpack('H',data[pos]+data[pos+1])[0]
            #print '[same]:',same,
    
            #拼音索引表长度
            pos += 2
            py_table_len = struct.unpack('H',data[pos]+data[pos+1])[0]
            #拼音索引表
            pos += 2
            # py = getWordPy(data[pos: pos+py_table_len])
    
            #中文词组
            pos += py_table_len
            for i in xrange(same):
                #中文词组长度
                c_len = struct.unpack('H',data[pos]+data[pos+1])[0]
                #中文词组
                pos += 2
                word = byte2str(data[pos: pos + c_len])
                #扩展数据长度
                pos += c_len
                ext_len = struct.unpack('H',data[pos]+data[pos+1])[0]
                #词频
                pos += 2
                count  = struct.unpack('H',data[pos]+data[pos+1])[0]
    
                # 保存
                # GTable.append((count,py,word))
                GTable.append(word)
                # 到下个词的偏移位置
                pos += ext_len
    
    
    def deal(file_name):
        print '-'*60
        f = open(file_name, 'rb')
        data = f.read()
        f.close()
    
    
        if data[0:12] != "x40x15x00x00x44x43x53x01x01x00x00x00":
            print "确认你选择的是搜狗(.scel)词库?"
            sys.exit(0)
        #pdb.set_trace()
    
        print u"词库名:" ,byte2str(data[0x130:0x338])#.encode('GB18030')
        print u"词库类型:" ,byte2str(data[0x338:0x540])#.encode('GB18030')
        print u"描述信息:" ,byte2str(data[0x540:0xd40])#.encode('GB18030')
        # print u"词库示例:",byte2str(data[0xd40:startPy])#.encode('GB18030')
        #
        getPyTable(data[startPy:startChinese])
        getChinese(data[startChinese:])
    
    if __name__ == '__main__':
        path1 = r'D:myProject	extClassifierinputfile1/'.replace('\', '/')
        path2 = r'D:myProject	extClassifieroutputfile/'.replace('\', '/')
        for filename in os.listdir(path1):
            print filename
            deal(path1 + filename)
    
        # 保存结果
        f = open(path2 + 'sougou.txt', 'w')
        # 删除相同元素
        GTable_filter = sorted(set(GTable), key=GTable.index)
    
        for word in GTable_filter:
            f.write(unicode(word).encode('GB18030'))
            f.write('
    ')
        f.close()

    有个问题没解决,就是在调用deal函数的时候,如果我这样调用的话,deal(filename)的话,就会显示IOError: [Errno 2] No such file or directory: 'ITxbcxc6xcbxe3xbbxfa.scel'这个错误,比如输入全路径才正确,很困惑。

  • 相关阅读:
    js的深拷贝特别注意this的深拷贝
    快速的熟悉一个angular的项目从run看起
    关于angular路由中的#
    AngularJS的Provider, Value, Constant, Service, Factory, Decorator的区别与详解
    css页面缩放
    jquery自定义window事件
    js自定义事件
    git分支
    webpack知识小结--require.context方法
    Vue 创建组件的两种方法
  • 原文地址:https://www.cnblogs.com/mothe123/p/4895020.html
Copyright © 2020-2023  润新知