• python – Parse a PCFG parse tree and extract its grammar rules (Probabilistic Context-Free Grammar Parser)


    http://www.voidcn.com/article/p-qnsvatou-byu.html

    string = '''
        (ROOT
            (S
                (NP (NN Carnac) (DT the) (NN Magnificent))
                (VP (VBD gave) (NP (DT a) (NN talk)))
            )
        )
    '''
    
    def is_symbol_char(character):
        '''
        Predicate to test if a character is valid
        for use in a symbol, extend as needed.
        '''
    
        return character.isalpha() or character in '-=$!?.'
    
    def tokenize(characters):
        '''
        Process characters into a nested structure.  The original string
        '(DT the)' is passed in as ['(', 'D', 'T', ' ', 't', 'h', 'e', ')']
        '''
    
        tokens = []
    
        while characters:
            character = characters.pop(0)
    
            if character.isspace():
                pass  # nothing to do, ignore it
    
            elif character == '(':  # signals start of recursive analysis (push)
                characters, result = tokenize(characters)
                tokens.append(result)
    
            elif character == ')':  # signals end of recursive analysis (pop)
                break
    
            elif is_symbol_char(character):
                # if it looks like a symbol, collect all
                # subsequent symbol characters
                symbol = ''
    
                while is_symbol_char(character):
                    symbol += character
                    character = characters.pop(0)
    
                # push unused non-symbol character back onto characters
                characters.insert(0, character)
    
                tokens.append(symbol)
    
        # Return whatever tokens we collected and any characters left over
        return characters, tokens
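
    # A quick sanity check (my addition, not part of the original script):
    # tracing the docstring example '(DT the)' through tokenize by hand,
    # the input should be fully consumed and the parenthesised group
    # should come back as a single nested list.
    assert tokenize(list('(DT the)')) == ([], [['DT', 'the']])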
    
    def extract_rules(tokens):
        ''' Recursively walk tokenized data extracting rules. '''
    
        head, *tail = tokens
    
        print(head, '-->', *[x[0] if isinstance(x, list) else x for x in tail])
    
        for token in tail:  # recurse
            if isinstance(token, list):
                extract_rules(token)
    
    characters, tokens = tokenize(list(string))
    
    # After a successful tokenization, all the characters should be consumed
    assert not characters, "Didn't consume all the input!"
    
    print('Tokens:', tokens[0], 'Rules:', sep='\n\n', end='\n\n')
    
    extract_rules(tokens[0])
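
    For reference, tracing the script by hand on the sample tree above, the
    print call and extract_rules should produce output along these lines
    (reconstructed from the code, not copied from the original article, so
    treat it as a sketch):

        Tokens:

        ['ROOT', ['S', ['NP', ['NN', 'Carnac'], ['DT', 'the'], ['NN', 'Magnificent']], ['VP', ['VBD', 'gave'], ['NP', ['DT', 'a'], ['NN', 'talk']]]]]

        Rules:

        ROOT --> S
        S --> NP VP
        NP --> NN DT NN
        NN --> Carnac
        DT --> the
        NN --> Magnificent
        VP --> VBD NP
        VBD --> gave
        NP --> DT NN
        DT --> a
        NN --> talk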

    https://github.com/usami/pcfg

  • Original post: https://www.cnblogs.com/cupleo/p/13908416.html