• Inputs and outputs of the common Chinese pre-trained models in NLP


    BERT

    from transformers import (
      BertTokenizer,
      BertModel,
    )
    # Chinese BERT checkpoint: bert-base-chinese
    bertTokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    bertModel = BertModel.from_pretrained('bert-base-chinese')
    sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
    inputs = bertTokenizer(sen, return_tensors='pt')
    tokens = bertTokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    print(inputs)
    print(tokens)
    outputs = bertModel(**inputs)
    # print(len(outputs))
    print(outputs[0].shape, outputs[1].shape)
    {'input_ids': tensor([[  101,   100,  2990,   897,   749,   100,  7566,  1818,  1920,  7030,
             10223,   118,  8205,   118,  9143,  4638,  7564,  6378,  5298,  6427,
              6241,  3563,  1798,  5310,  3354,  4638,  3563,  1798,  1469,  6444,
              4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
    ['[CLS]', '[UNK]', '提', '供', '了', '[UNK]', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
    torch.Size([1, 35, 768]) torch.Size([1, 768])
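
    outputs[0] is the token-level last_hidden_state (one 768-dim vector per token, 35 tokens here) and outputs[1] is the pooler_output derived from the [CLS] position. Note that the bert-base-chinese WordPiece vocabulary cannot cover "Transformers" and "NLP" as written, so they collapse to [UNK]. A minimal sketch of reading the same outputs by attribute name (assuming transformers v4+, where the model returns a ModelOutput):

    # Same tensors as above, accessed by name rather than by index
    last_hidden_state = outputs.last_hidden_state   # torch.Size([1, 35, 768]), one vector per token
    pooled = outputs.pooler_output                  # torch.Size([1, 768]), tanh-projected [CLS] vector
    cls_vector = last_hidden_state[:, 0]            # raw [CLS] hidden state, a common sentence feature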
    

    RoBERTa

    from transformers import (
      BertTokenizer,
      BertModel,
    )
    # hfl/chinese-roberta-wwm-ext keeps the BERT architecture, so the BERT classes are used
    robertTokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
    robertModel = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')
    sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
    inputs = robertTokenizer(sen, return_tensors='pt')
    tokens = robertTokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    print(inputs)
    print(tokens)
    outputs = robertModel(**inputs)
    # print(outputs)
    print(outputs[0].shape, outputs[1].shape)
    {'input_ids': tensor([[  101,   162, 10477,  8118, 12725,  8755,  2990,   897,   749,   156,
             10986,  7566,  1818,  1920,  7030, 10223,   118,  8205,   118,  9143,
              4638,  7564,  6378,  5298,  6427,  6241,  3563,  1798,  5310,  3354,
              4638,  3563,  1798,  1469,  6444,  4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
    ['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
    torch.Size([1, 40, 768]) torch.Size([1, 768])
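
    The Chinese RoBERTa-wwm checkpoints keep the BERT architecture and a BERT-style WordPiece vocabulary, which is why they are loaded with BertTokenizer/BertModel rather than the RoBERTa classes. Unlike bert-base-chinese, this tokenizer lower-cases and splits "Transformers" and "NLP" into WordPieces instead of emitting [UNK], so the sequence is 40 tokens long. A small loading sketch via the Auto classes, which should resolve to the same BERT classes because the checkpoint's config declares model_type "bert":

    from transformers import AutoTokenizer, AutoModel

    # Should be equivalent to the explicit BertTokenizer/BertModel loading above
    autoTokenizer = AutoTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
    autoModel = AutoModel.from_pretrained('hfl/chinese-roberta-wwm-ext')
    print(autoModel.config.model_type)  # expected: bert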
    

    ALBERT

    from transformers import (  
      BertTokenizer,
      AlbertModel,
    )
    # clue/albert_chinese_tiny uses a BERT-style vocabulary, so BertTokenizer is used here
    albertTokenizer = BertTokenizer.from_pretrained('clue/albert_chinese_tiny')
    albertModel = AlbertModel.from_pretrained('clue/albert_chinese_tiny')
    sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
    inputs = albertTokenizer(sen, return_tensors='pt')
    tokens = albertTokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    print(inputs)
    print(tokens)
    outputs = albertModel(**inputs)
    # print(len(outputs))
    print(outputs[0].shape, outputs[1].shape)
    {'input_ids': tensor([[  101,   162, 10477,  8118, 12725,  8755,  2990,   897,   749,   156,
             10986,  7566,  1818,  1920,  7030, 10223,   118,  8205,   118,  9143,
              4638,  7564,  6378,  5298,  6427,  6241,  3563,  1798,  5310,  3354,
              4638,  3563,  1798,  1469,  6444,  4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
    ['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
    torch.Size([1, 40, 312]) torch.Size([1, 312])
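
    clue/albert_chinese_tiny returns the same (sequence_output, pooled_output) pair as BERT, only with the tiny checkpoint's smaller hidden size of 312. A short sketch checking this against the model config:

    # Both outputs share the checkpoint's hidden size (312 for the tiny model)
    seq_out, pooled_out = outputs[0], outputs[1]
    assert seq_out.shape[-1] == albertModel.config.hidden_size     # 312
    assert pooled_out.shape[-1] == albertModel.config.hidden_size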
    

    XLNet

    from transformers import AutoTokenizer, AutoModel
      
    xlnettokenizer = AutoTokenizer.from_pretrained("hfl/chinese-xlnet-base")
    xlnetModel = AutoModel.from_pretrained('hfl/chinese-xlnet-base')
    sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
    inputs = xlnettokenizer(sen, return_tensors='pt')
    tokens = xlnettokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    print(inputs)
    print(tokens)
    outputs = xlnetModel(**inputs)
    # print(outputs)
    print(outputs[0].shape, len(outputs[1]))
    {'input_ids': tensor([[   19, 13932,  9560,  4127,  3810,   603,   602,   412,  3336,  1144,
              3025,  4402,    13, 16636,    13,  7717,    20,    19,  3712,  3620,
              1723,  2280,  1301,    20,  2280,    24, 16338,  7921,    18,     4,
                 3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
             1, 1, 1, 1, 1, 1, 1]])}
    ['▁', 'Trans', 'form', 'ers', '提供了', 'N', 'L', 'P', '领域', '大量', 'st', 'ate', '-', 'of', '-', 'art', '的', '▁', '预', '训练', '语言', '模型', '结构', '的', '模型', '和', '调用', '框架', '。', '<sep>', '<cls>']
    torch.Size([1, 31, 768]) 12
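
    hfl/chinese-xlnet-base uses a SentencePiece tokenizer (the "▁" marks whitespace) and places <sep> and <cls> at the end of the sequence instead of a leading [CLS]. XLNetModel has no pooler: outputs[0] is last_hidden_state, and outputs[1] here is mems, a tuple of 12 cached hidden-state tensors (one per layer), not a sentence vector. A small sketch, assuming this transformers version returns mems by default as the printout above shows:

    # last_hidden_state plus the per-layer memory cache; a sentence vector is usually
    # taken from the <cls> position, which XLNet puts at the end of the sequence
    last_hidden_state, mems = outputs[0], outputs[1]
    cls_vector = last_hidden_state[:, -1]   # torch.Size([1, 768]), hidden state at <cls>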
    

    ELECTRA

    from transformers import AutoTokenizer, AutoModel
      
    
    electratokenizer = AutoTokenizer.from_pretrained("hfl/chinese-electra-180g-base-discriminator")
    electraModel = AutoModel.from_pretrained("hfl/chinese-electra-180g-base-discriminator")
    sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
    inputs = electratokenizer(sen, return_tensors='pt')
    tokens = electratokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    print(inputs)
    print(tokens)
    outputs = electraModel(**inputs)
    # print(outputs)
    print(outputs[0].shape)
    {'input_ids': tensor([[  101,   162, 10477,  8118, 12725,  8755,  2990,   897,   749,   156,
             10986,  7566,  1818,  1920,  7030, 10223,   118,  8205,   118,  9143,
              4638,  7564,  6378,  5298,  6427,  6241,  3563,  1798,  5310,  3354,
              4638,  3563,  1798,  1469,  6444,  4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
    ['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
    torch.Size([1, 40, 768])
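
    The ELECTRA discriminator has no pooling layer, so the model returns only last_hidden_state (which is why only outputs[0].shape is printed). For sentence-level tasks the [CLS] position is typically taken by hand; a minimal sketch:

    # No pooler_output for ElectraModel; take the [CLS] hidden state directly
    cls_vector = outputs[0][:, 0]   # torch.Size([1, 768])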
    

    MacBERT

    from transformers import AutoTokenizer, AutoModel
      
    
    mactokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
    macModel = AutoModel.from_pretrained("hfl/chinese-macbert-base")
    sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
    inputs = mactokenizer(sen, return_tensors='pt')
    tokens = mactokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    print(inputs)
    print(tokens)
    outputs = macModel(**inputs)
    # print(outputs)
    print(outputs[0].shape)
    {'input_ids': tensor([[  101,   162, 10477,  8118, 12725,  8755,  2990,   897,   749,   156,
             10986,  7566,  1818,  1920,  7030, 10223,   118,  8205,   118,  9143,
              4638,  7564,  6378,  5298,  6427,  6241,  3563,  1798,  5310,  3354,
              4638,  3563,  1798,  1469,  6444,  4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
    ['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
    torch.Size([1, 40, 768])
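
    MacBERT also keeps the BERT architecture (its changes are in the pre-training objective, which replaces [MASK] tokens with similar words), so the Auto classes resolve to the BERT classes and the outputs look exactly like BERT's. An explicit-loading sketch that should be equivalent to the Auto-class loading above:

    from transformers import BertTokenizer, BertModel

    # Equivalent to AutoTokenizer/AutoModel for this checkpoint
    mactokenizer = BertTokenizer.from_pretrained('hfl/chinese-macbert-base')
    macModel = BertModel.from_pretrained('hfl/chinese-macbert-base')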
    
  • Original post: https://www.cnblogs.com/xiximayou/p/15311104.html