哈工大LTP进阶使用-三元组事件抽取



    import os
    from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
    class LtpParser:
        """Wrapper around the pyltp pipeline used for triple extraction.

        Loads the segmenter, POS tagger, dependency parser, named-entity
        recogniser and semantic-role labeller once, and exposes helpers that
        turn their results into plain Python lists and dicts.
        """

        def __init__(self, ltp_dir="../model/ltp_data_v3.4.0/"):
            """Load every pyltp model found under *ltp_dir*.

            Args:
                ltp_dir: Directory holding ``cws.model``, ``pos.model``,
                    ``parser.model``, ``ner.model``, ``pisrl.model`` and the
                    ``user_dict.txt`` lexicon. Defaults to the path the
                    original listing hard-coded.
            """
            user_dict = os.path.join(ltp_dir, "user_dict.txt")
            self.segmentor = Segmentor()
            self.segmentor.load_with_lexicon(os.path.join(ltp_dir, "cws.model"), user_dict)
            self.postagger = Postagger()
            self.postagger.load_with_lexicon(os.path.join(ltp_dir, "pos.model"), user_dict)
            self.parser = Parser()
            self.parser.load(os.path.join(ltp_dir, "parser.model"))
            self.recognizer = NamedEntityRecognizer()
            self.recognizer.load(os.path.join(ltp_dir, "ner.model"))
            self.labeller = SementicRoleLabeller()
            self.labeller.load(os.path.join(ltp_dir, 'pisrl.model'))

        def format_labelrole(self, words, postags, arcs=None):
            """Run semantic-role labelling and return it as nested dicts.

            Args:
                words: Segmented tokens.
                postags: POS tag for each token.
                arcs: Dependency arcs for the sentence. When omitted the
                    sentence is parsed here, as the original method did.

            Returns:
                ``{predicate_index: {arg_name: [arg_name, start, end]}}``,
                e.g. ``{1: {'A0': ['A0', 0, 0], 'A1': ['A1', 2, 7]}}``.
            """
            if arcs is None:
                arcs = self.parser.parse(words, postags)
            roles = self.labeller.label(words, postags, arcs)
            return {
                role.index: {
                    arg.name: [arg.name, arg.range.start, arg.range.end]
                    for arg in role.arguments
                }
                for role in roles
            }

        def build_parse_child_dict(self, words, postags, arcs):
            """Index the dependency arcs by governor and flatten them to rows.

            Args:
                words: Segmented tokens.
                postags: POS tag for each token.
                arcs: One arc per token, each exposing ``head`` (1-based
                    index of the governor, 0 for the root) and ``relation``.

            Returns:
                A pair ``(child_dict_list, format_parse_list)``:

                * ``child_dict_list[i]`` maps each relation name to the
                  0-based indices of the tokens that depend on token ``i``
                  through that relation, e.g. ``{'SBV': [0], 'VOB': [7]}``.
                * ``format_parse_list[i]`` is ``[relation, word, i, postag,
                  head_word, head_index, head_postag]``.
            """
            child_dict_list = [{} for _ in words]
            for arc_index, arc in enumerate(arcs):
                # arc.head is 1-based; 0 denotes the virtual root, which has
                # no slot in child_dict_list.
                if 0 < arc.head <= len(words):
                    children = child_dict_list[arc.head - 1]
                    children.setdefault(arc.relation, []).append(arc_index)

            format_parse_list = []
            for i, (word, arc) in enumerate(zip(words, arcs)):
                head_index = arc.head - 1
                head_word = 'Root' if arc.head == 0 else words[head_index]
                # For the root, head_index is -1, so the head postag slot
                # carries the last token's tag; kept because the published
                # sample output shows exactly that.
                format_parse_list.append([
                    arc.relation, word, i, postags[i],
                    head_word, head_index, postags[head_index],
                ])
            return child_dict_list, format_parse_list

        def parser_main(self, sentence):
            """Segment, tag, parse and role-label a single sentence.

            Returns:
                ``(words, postags, child_dict_list, roles_dict,
                format_parse_list)``.
            """
            words = list(self.segmentor.segment(sentence))
            postags = list(self.postagger.postag(words))
            arcs = self.parser.parse(words, postags)
            child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
            # Reuse the arcs computed above instead of parsing a second time.
            roles_dict = self.format_labelrole(words, postags, arcs)
            return words, postags, child_dict_list, roles_dict, format_parse_list
    if __name__ == '__main__':
        # Demo: run the full pipeline on one sentence and dump every result.
        parse = LtpParser()
        sentence = '中国是一个自由、和平的国家'
        words, postags, child_dict_list, roles_dict, format_parse_list = parse.parser_main(sentence)
        print(words, len(words))
        print(postags, len(postags))
        print(child_dict_list, len(child_dict_list))
        # roles_dict was computed but never shown, although the sample output
        # lists it between the child dicts and the formatted arcs.
        print(roles_dict)
        print(format_parse_list, len(format_parse_list))


    ['中国', '是', '一个', '自由', '、', '和平', '的', '国家'] 8
    ['ns', 'v', 'm', 'a', 'wp', 'a', 'u', 'n'] 8
    [{}, {'SBV': [0], 'VOB': [7]}, {}, {'COO': [5], 'RAD': [6]}, {}, {'WP': [4]}, {}, {'ATT': [2, 3]}] 8
    {1: {'A0': ['A0', 0, 0], 'A1': ['A1', 2, 7]}}
    [['SBV', '中国', 0, 'ns', '是', 1, 'v'], ['HED', '是', 1, 'v', 'Root', -1, 'n'], ['ATT', '一个', 2, 'm', '国家', 7, 'n'], ['ATT', '自由', 3, 'a', '国家', 7, 'n'], ['WP', '、', 4, 'wp', '和平', 5, 'a'], ['COO', '和平', 5, 'a', '自由', 3, 'a'], ['RAD', '的', 6, 'u', '自由', 3, 'a'], ['VOB', '国家', 7, 'n', '是', 1, 'v']] 8

    ['中国', '是', '一个', '自由', '、', '和平', '的', '国家']
    ['ns', 'v', 'm', 'a', 'wp', 'a', 'u', 'n']
    [{}, {'SBV': [0], 'VOB': [7]}, {}, {'COO': [5], 'RAD': [6]}, {}, {'WP': [4]}, {}, {'ATT': [2, 3]}]
    2:SBV 0:HED 8:ATT 8:ATT 6:WP 4:COO 4:RAD 2:VOB
    {1: {'A0': ['A0', 0, 0], 'A1': ['A1', 2, 7]}}
    [['SBV', '中国', 0, 'ns', '是', 1, 'v'], ['HED', '是', 1, 'v', 'Root', -1, 'n'], ['ATT', '一个', 2, 'm', '国家', 7, 'n'], ['ATT', '自由', 3, 'a', '国家', 7, 'n'], ['WP', '、', 4, 'wp', '和平', 5, 'a'], ['COO', '和平', 5, 'a', '自由', 3, 'a'], ['RAD', '的', 6, 'u', '自由', 3, 'a'], ['VOB', '国家', 7, 'n', '是', 1, 'v']]


    from sentence_parser import *
    import re
    import os
    from time import time
    from pprint import pprint
    from  pyltp import SentenceSplitter, Segmentor, Postagger, Parser
    from utils import clean_text
    from collections import Counter
    class TripleExtractor:
        """Extract (subject, predicate, object) triples from Chinese text.

        Semantic-role labels are used first; where a token has no usable
        A0/A1 pair, dependency-parse patterns are tried instead.
        """

        def __init__(self):
            self.parser = LtpParser()

        def split_sents(self, content):
            """Split an article into sentences.

            Sentence-ending punctuation, colons, semicolons and line breaks
            (full-width and ASCII forms) act as delimiters. Only non-empty
            fragments shorter than 300 characters that mention '北京银行' are
            kept.
            """
            # The listing had this character class broken across two lines
            # (the \n\r escapes had turned into a real line break), which is
            # a syntax error inside a single-quoted raw string.
            fragments = re.split(r'[？?！!。；;：:\n\r]', content)
            return [sentence for sentence in fragments
                    if sentence and '北京银行' in sentence and len(sentence) < 300]

        def ruler1(self, words, postags, roles_dict, role_index):
            """Build a triple from the semantic roles of one predicate.

            Args:
                words: Tokens, e.g. ``['中国', '是', '一个', ...]``.
                postags: POS tags aligned with *words*.
                roles_dict: ``{predicate_index: {arg: [arg, start, end]}}``.
                role_index: Index of the predicate to inspect.

            Returns:
                ``('1', [subject, verb, object])`` when the predicate has
                both A0 and A1 and both spans are non-empty after filtering,
                otherwise ``('4', [])``.
            """
            v = words[role_index]
            role_info = roles_dict[role_index]

            def span_text(arg_name):
                # Join the span's tokens, skipping punctuation ('w...'),
                # auxiliaries ('u...') and non-lexemes ('x...').
                _, start, end = role_info[arg_name]
                return ''.join(
                    words[i] for i in range(start, end + 1)
                    if postags[i][0] not in ('w', 'u', 'x') and words[i]
                )

            if 'A0' in role_info and 'A1' in role_info:
                s = span_text('A0')
                o = span_text('A1')
                if s and o:
                    return '1', [s, v, o]
            return '4', []

        def ruler2(self, words, postags, child_dict_list, roles_dict, arcs):
            """Extract all triples of one sentence.

            Args:
                words: Tokens of the sentence.
                postags: POS tags aligned with *words*.
                child_dict_list: Per-token ``{relation: [child indices]}``.
                roles_dict: Semantic roles keyed by predicate index.
                arcs: Rows ``[relation, word, index, postag, head_word,
                    head_index, head_postag]``, one per token.

            Returns:
                A list of ``[e1, r, e2]`` triples.
            """
            svos = []
            for index, postag in enumerate(postags):
                # Prefer the semantic-role result when it yields a full triple.
                if index in roles_dict:
                    flag, triple = self.ruler1(words, postags, roles_dict, index)
                    if flag == '1':
                        # The listing computed the triple but never stored it.
                        svos.append(triple)
                        continue
                # Otherwise fall back to dependency patterns around this token.
                if not postag:
                    continue
                child_dict = child_dict_list[index]

                # Subject-verb-object: SBV child and VOB child of one token.
                if 'SBV' in child_dict and 'VOB' in child_dict:
                    r = words[index]
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    svos.append([e1, r, e2])

                # Verb phrase used as an attribute (ATT) of a noun: the
                # modified noun is the subject of the verb-object pair.
                relation = arcs[index][0]
                if relation == 'ATT' and 'VOB' in child_dict:
                    # Column 5 is the governor's 0-based index. The listing
                    # read column 2 (the token's own index) and subtracted
                    # one, which selects the previous token instead.
                    head = arcs[index][5]
                    e1 = self.complete_e(words, postags, child_dict_list, head)
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    temp_string = r + e2
                    # The completed noun starts with its own modifier; strip
                    # it so e1 is just the noun phrase.
                    if e1.startswith(temp_string):
                        e1 = e1[len(temp_string):]
                    if temp_string not in e1:
                        svos.append([e1, r, e2])

                # Subject + verb + complement (CMP) whose complement governs
                # a prepositional object (POB).
                if 'SBV' in child_dict and 'CMP' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if 'POB' in child_dict_list[cmp_index]:
                        e2 = self.complete_e(words, postags, child_dict_list,
                                             child_dict_list[cmp_index]['POB'][0])
                        svos.append([e1, r, e2])
            return svos

        def complete_e(self, words, postags, child_dict_list, word_index):
            """Expand a token into its full phrase.

            All ATT children are expanded recursively and prepended. For a
            verb, its first VOB child is appended and its first SBV child is
            put in front of everything.
            """
            child_dict = child_dict_list[word_index]
            prefix = ''.join(
                self.complete_e(words, postags, child_dict_list, att_index)
                for att_index in child_dict.get('ATT', [])
            )
            postfix = ''
            if postags[word_index] == 'v':
                if 'VOB' in child_dict:
                    postfix = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                if 'SBV' in child_dict:
                    prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix
            return prefix + words[word_index] + postfix

        def triples_main(self, content):
            """Parse *content* as one sentence and return its triples.

            The text is not split into sentences first; the whole string is
            handed to the parser. Empty input yields an empty list.
            """
            if not content:
                return []
            words, postags, child_dict_list, roles_dict, arcs = self.parser.parser_main(content)
            return list(self.ruler2(words, postags, child_dict_list, roles_dict, arcs))
    def test():
        """Run the extractor over a list of sample sentences.

        NOTE(review): the sentence list and the loop body are cut off in this
        listing, so what is done per sentence is not visible here. Presumably
        each sentence is passed to ``triples_main`` and the result printed -
        TODO confirm against the original source.
        """
        extractor = TripleExtractor()
        contents = [
        for content in contents:


    [['中国', '是', '一个自由和平国家']]
    [['他', '完成', '交易']]
    [['我', '送', '一朵花']]
    [['我', '做', '作业']]
