• 中文文本生成_实战


    http://ltp.ai/demo.html

    from pyltp import *
    import os
    import re

    # Directory holding the LTP v3.4.0 model files used by most analyzers below.
    d_dir = '/usr/local/ltp_data_v3.4.0/'

    # Each analyzer is created once at module level and reused by every call,
    # so the models are loaded a single time.

    # Word segmentation model.
    s = os.path.join(d_dir, "cws.model")
    segmentor = Segmentor()
    segmentor.load(s)

    # Part-of-speech tagging model.
    s = os.path.join(d_dir, "pos.model")
    postagger = Postagger()
    postagger.load(s)

    # Dependency parsing model.
    s = os.path.join(d_dir, "parser.model")
    parser = Parser()
    parser.load(s)

    # Named-entity recognition model.
    s = os.path.join(d_dir, "ner.model")
    recognizer = NamedEntityRecognizer()
    recognizer.load(s)

    # Semantic role labelling is loaded from a separate v3.3.0 directory
    # (not d_dir); the path passed to load() is the directory itself.
    s = os.path.join('/usr/local/ltp_data_v3.3.0/ltp_data/srl/', '')
    labeller = SementicRoleLabeller()
    labeller.load(s)


    def gen_all(paragraph, split_join_tag='\t'):
        """Analyse the first sentence of *paragraph* with the loaded LTP models.

        Only the first sentence returned by ``SentenceSplitter.split`` is
        processed. The module-level ``segmentor``, ``postagger``, ``parser``,
        ``recognizer`` and ``labeller`` objects are used, so they must already
        be loaded.

        Args:
            paragraph: Text to analyse.
            split_join_tag: Separator used to join the per-word results into
                one string (tab by default).

        Returns:
            A dict with these keys:
              * ``'words'``   - segmented words joined by ``split_join_tag``.
              * ``'postags'`` - part-of-speech tags joined the same way.
              * ``'parser'``  - one ``"head:relation"`` item per dependency
                arc, joined the same way.
              * ``'netags'``  - named-entity tags joined the same way.
              * ``'role'``    - a list with one single-entry dict per semantic
                role, mapping ``role.index`` to its joined
                ``"name:(start,end)"`` arguments.
        """
        join = split_join_tag.join

        # Word segmentation; every later analysis step depends on it.
        first_sentence = SentenceSplitter.split(paragraph)[0]
        tokens = segmentor.segment(first_sentence)

        # Part-of-speech tagging.
        pos_tags = postagger.postag(tokens)

        # Dependency parsing.
        dep_arcs = parser.parse(tokens, pos_tags)

        # Named-entity recognition.
        entity_tags = recognizer.recognize(tokens, pos_tags)

        # Semantic role labelling.
        srl_roles = labeller.label(tokens, pos_tags, entity_tags, dep_arcs)

        return {
            'words': join(tokens),
            'postags': join(pos_tags),
            'parser': join(
                ["%d:%s" % (arc.head, arc.relation) for arc in dep_arcs]),
            'netags': join(entity_tags),
            'role': [
                {
                    role.index: join(
                        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                         for arg in role.arguments])
                }
                for role in srl_roles
            ],
        }


    # Input file: one title per line.
    ori_f = 'list_b_only_title.txt'
    r_f = '%s%s' % (ori_f, '.del_ns.txt')
    # res maps every analysed title to its gen_all() result; select_r keeps
    # only the titles whose dependency string matches a pattern in reg_l.
    res, select_r = {}, {}
    # Raw string so that "\d" is not treated as an (invalid) string escape;
    # the pattern text is unchanged. It matches four consecutive tab-separated
    # "head:relation" items whose relations are ATT, SBV, HED, VOB, followed
    # by the head number of the next item.
    reg_l = [r'ATT\t\d+:SBV\t\d+:HED\t\d+:VOB\t\d+']
    # Compile once up front instead of once per input line.
    _reg_compiled = [re.compile(pattern) for pattern in reg_l]
    c = 0
    try:
        with open(ori_f, 'r', encoding='utf8') as fo:
            for line in fo:
                title = line.replace('\n', '').replace('"', '')
                try:
                    analysis = gen_all(title)
                except Exception as e:
                    # Best effort: report the failing title and move on.
                    print(title, ' ', e)
                    continue
                res[title] = analysis
                dependency_str = analysis['parser']
                if any(rx.search(dependency_str) is not None
                       for rx in _reg_compiled):
                    select_r[title] = analysis
                # NOTE(review): the counter is assumed to count successfully
                # analysed lines and to cap the run at 9988 of them - confirm
                # against the intended loop nesting.
                c += 1
                if c == 9988:
                    break
    finally:
        # Release the models even when reading or analysing fails.
        segmentor.release()
        postagger.release()
        parser.release()
        recognizer.release()
        labeller.release()



    feature_list

    话术表

    [
    哪个地方做什么的哪家靠谱?
    地名词库
    行业、业务词库
    ]
    苏州做网络推广的公司哪家靠谱?
    苏州镭射机维修哪家最专业?
    昆山做账的公司哪家比较好
    广州称重灌装机生产厂家哪家口碑比较好



    [
    含有专家知识
    ]
    郑州律师哪个好,如何判断合同是否有效?


    [
    哪个地方有做什么的?
    ]
    广东哪里有专业的全铝书柜定制?
    苏州吴中越溪哪里有通过率较高的会计培训班?

    [
    2-gram
    ]

    行业属性通过“2-gram”提取,模式为“动词+名词”,例如:

    昆山注册公司哪家专业?
    注册公司




    {'words': '大型\t雕铣机\t哪个\t牌子\t好\t?', 'postags': 'b\tn\tr\tn\ta\twp', 'parser': '2:ATT\t4:ATT\t4:ATT\t5:SBV\t0:HED\t5:WP', 'netags': 'O\tO\tO\tO\tO\tO', 'role': [{4: 'A0:(0,3)'}]}
    feature:ATT、SBV、HED 三种依存关系相邻出现

    {'words': '深圳市\t东荣\t纯水\t设备\t有限公司\t有\t什么\t产品\t,\t电话\t是\t多少\t?', 'postags': 'ns\tnz\tn\tn\tn\tv\tr\tn\twp\tn\tv\tr\twp', 'parser': '5:ATT\t3:ATT\t4:ATT\t5:ATT\t6:SBV\t0:HED\t8:ATT\t6:VOB\t6:WP\t11:SBV\t6:COO\t11:VOB\t6:WP', 'netags': 'B-Ni\tI-Ni\tI-Ni\tI-Ni\tE-Ni\tO\tO\tO\tO\tO\tO\tO\tO', 'role': [{5: 'A0:(0,4)\tA1:(6,7)'}, {10: 'A0:(9,9)\tA1:(11,11)'}]}

    feature














  • 相关阅读:
    自定义注解标签验证
    redis-服务器配置-主从
    POJ-2195 Going Home(最小费用最大流模板)
    POJ-1087 A Plug for UNIX
    HDU-3625 Examining the Rooms (第一类斯特林数)
    网络流入门
    CodeForces-1082G Increasing Frequency
    python学习之模块-模块(三)
    python学习之模块-模块(二)
    python学习之模块-模块(一)
  • 原文地址:https://www.cnblogs.com/rsapaper/p/6185949.html
Copyright © 2020-2023  润新知