string = '''
(ROOT
  (S
    (NP (NN Carnac) (DT the) (NN Magnificent))
    (VP (VBD gave) (NP (DT a) (NN talk)))
  )
)
'''


def is_symbol_char(character):
    '''
    Predicate to test if a character is valid for use in a symbol;
    extend as needed.
    '''
    return character.isalpha() or character in '-=$!?.'


def tokenize(characters):
    '''
    Process characters into a nested structure.  The original string
    '(DT the)' is passed in as ['(', 'D', 'T', ' ', 't', 'h', 'e', ')'].
    '''
    tokens = []
    while characters:
        character = characters.pop(0)
        if character.isspace():
            pass  # nothing to do, ignore it
        elif character == '(':  # signals start of recursive analysis (push)
            characters, result = tokenize(characters)
            tokens.append(result)
        elif character == ')':  # signals end of recursive analysis (pop)
            break
        elif is_symbol_char(character):
            # if it looks like a symbol, collect all subsequent symbol
            # characters (assumes the input never ends mid-symbol, i.e.
            # every symbol is followed by whitespace or a parenthesis)
            symbol = ''
            while is_symbol_char(character):
                symbol += character
                character = characters.pop(0)
            # push the unused non-symbol character back onto characters
            characters.insert(0, character)
            tokens.append(symbol)
    # Return whatever tokens we collected and any characters left over
    return characters, tokens


def extract_rules(tokens):
    '''
    Recursively walk tokenized data, extracting rules.
    '''
    head, *tail = tokens
    print(head, '-->', *[x[0] if isinstance(x, list) else x for x in tail])
    for token in tail:  # recurse into subtrees
        if isinstance(token, list):
            extract_rules(token)


characters, tokens = tokenize(list(string))

# After a successful tokenization, all the characters should be consumed
assert not characters, "Didn't consume all the input!"

print('Tokens:', tokens[0])
print('Rules:')
extract_rules(tokens[0])
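Hand-tracing the script, the output should look roughly like this (illustrative, not captured from a run):

    Tokens: ['ROOT', ['S', ['NP', ['NN', 'Carnac'], ['DT', 'the'], ['NN', 'Magnificent']], ['VP', ['VBD', 'gave'], ['NP', ['DT', 'a'], ['NN', 'talk']]]]]
    Rules:
    ROOT --> S
    S --> NP VP
    NP --> NN DT NN
    NN --> Carnac
    DT --> the
    NN --> Magnificent
    VP --> VBD NP
    VBD --> gave
    NP --> DT NN
    DT --> a
    NN --> talk

One design note: tokenize drives everything through characters.pop(0) and characters.insert(0, ...), which are O(n) operations on a Python list; for long inputs a collections.deque (popleft/appendleft) would be the idiomatic choice.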
A related, more complete PCFG implementation: https://github.com/usami/pcfg
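A PCFG takes the step the script above stops short of: counting each extracted rule and normalizing by its left-hand side to get relative-frequency probabilities. Here is a minimal sketch of that counting step, reusing tokens[0] from the script above (collect_rules is a hypothetical helper written for this sketch, not code taken from the linked repo):

    from collections import Counter

    def collect_rules(tokens, rules):
        '''Like extract_rules, but accumulate (lhs, rhs) pairs instead of printing.'''
        head, *tail = tokens
        rhs = tuple(x[0] if isinstance(x, list) else x for x in tail)
        rules.append((head, rhs))
        for token in tail:
            if isinstance(token, list):
                collect_rules(token, rules)

    all_rules = []
    collect_rules(tokens[0], all_rules)
    rule_counts = Counter(all_rules)

    # Total count per left-hand side, for normalization
    lhs_totals = Counter()
    for (lhs, rhs), count in rule_counts.items():
        lhs_totals[lhs] += count

    # Relative-frequency estimate: P(lhs -> rhs) = count(lhs -> rhs) / count(lhs)
    for (lhs, rhs), count in rule_counts.items():
        print(lhs, '-->', ' '.join(rhs), count / lhs_totals[lhs])

With only one tree this gives, for example, DT --> the and DT --> a a probability of 0.5 each; over a whole treebank the same counts yield the standard maximum-likelihood PCFG estimates.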