• Pruning a pre-trained BERT model with PyTorch


    Overall process

    Pruning the number of layers

    1. Load the pre-trained model;
    2. Extract the weights of the layers you want to keep and rename them. For example, to keep layer 0 and layer 11, we save layer 11's weights but rename them with layer 1's prefix (a minimal rename sketch follows this list);
    3. Update the model config file (num_hidden_layers becomes the number of kept layers), so that layer 11's weights are loaded as layer 1;
    4. Save the result as pytorch_model.bin.
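    Step 2 boils down to a key rename in the state dict; a minimal sketch with a toy dict (illustration only, not the real weights):

    renamed = {k.replace('encoder.layer.11.', 'encoder.layer.1.'): v
               for k, v in {'encoder.layer.11.attention.self.query.weight': 'w'}.items()}
    print(renamed)  # {'encoder.layer.1.attention.self.query.weight': 'w'}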
    First, let's take a look at exactly which weights BERT contains:

    import torch
    from transformers import BertTokenizer, BertModel
    
    bertModel = BertModel.from_pretrained('bert-base-chinese', output_hidden_states=True, output_attentions=True)
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    for name, param in bertModel.named_parameters():
        print(name, param.shape)
    
    embeddings.word_embeddings.weight torch.Size([21128, 768])
    embeddings.position_embeddings.weight torch.Size([512, 768])
    embeddings.token_type_embeddings.weight torch.Size([2, 768])
    embeddings.LayerNorm.weight torch.Size([768])
    embeddings.LayerNorm.bias torch.Size([768])
    encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
    encoder.layer.0.attention.self.query.bias torch.Size([768])
    encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
    encoder.layer.0.attention.self.key.bias torch.Size([768])
    encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
    encoder.layer.0.attention.self.value.bias torch.Size([768])
    encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
    encoder.layer.0.attention.output.dense.bias torch.Size([768])
    encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
    encoder.layer.0.attention.output.LayerNorm.bias torch.Size([768])
    encoder.layer.0.intermediate.dense.weight torch.Size([3072, 768])
    encoder.layer.0.intermediate.dense.bias torch.Size([3072])
    encoder.layer.0.output.dense.weight torch.Size([768, 3072])
    encoder.layer.0.output.dense.bias torch.Size([768])
    encoder.layer.0.output.LayerNorm.weight torch.Size([768])
    encoder.layer.0.output.LayerNorm.bias torch.Size([768])
    encoder.layer.1.attention.self.query.weight torch.Size([768, 768])
    encoder.layer.1.attention.self.query.bias torch.Size([768])
    encoder.layer.1.attention.self.key.weight torch.Size([768, 768])
    encoder.layer.1.attention.self.key.bias torch.Size([768])
    encoder.layer.1.attention.self.value.weight torch.Size([768, 768])
    encoder.layer.1.attention.self.value.bias torch.Size([768])
    encoder.layer.1.attention.output.dense.weight torch.Size([768, 768])
    encoder.layer.1.attention.output.dense.bias torch.Size([768])
    encoder.layer.1.attention.output.LayerNorm.weight torch.Size([768])
    encoder.layer.1.attention.output.LayerNorm.bias torch.Size([768])
    encoder.layer.1.intermediate.dense.weight torch.Size([3072, 768])
    encoder.layer.1.intermediate.dense.bias torch.Size([3072])
    encoder.layer.1.output.dense.weight torch.Size([768, 3072])
    encoder.layer.1.output.dense.bias torch.Size([768])
    encoder.layer.1.output.LayerNorm.weight torch.Size([768])
    encoder.layer.1.output.LayerNorm.bias torch.Size([768])
    encoder.layer.2.attention.self.query.weight torch.Size([768, 768])
    encoder.layer.2.attention.self.query.bias torch.Size([768])
    encoder.layer.2.attention.self.key.weight torch.Size([768, 768])
    encoder.layer.2.attention.self.key.bias torch.Size([768])
    encoder.layer.2.attention.self.value.weight torch.Size([768, 768])
    encoder.layer.2.attention.self.value.bias torch.Size([768])
    encoder.layer.2.attention.output.dense.weight torch.Size([768, 768])
    encoder.layer.2.attention.output.dense.bias torch.Size([768])
    encoder.layer.2.attention.output.LayerNorm.weight torch.Size([768])
    encoder.layer.2.attention.output.LayerNorm.bias torch.Size([768])
    encoder.layer.2.intermediate.dense.weight torch.Size([3072, 768])
    encoder.layer.2.intermediate.dense.bias torch.Size([3072])
    encoder.layer.2.output.dense.weight torch.Size([768, 3072])
    encoder.layer.2.output.dense.bias torch.Size([768])
    encoder.layer.2.output.LayerNorm.weight torch.Size([768])
    encoder.layer.2.output.LayerNorm.bias torch.Size([768])
    encoder.layer.3.attention.self.query.weight torch.Size([768, 768])
    encoder.layer.3.attention.self.query.bias torch.Size([768])
    encoder.layer.3.attention.self.key.weight torch.Size([768, 768])
    encoder.layer.3.attention.self.key.bias torch.Size([768])
    encoder.layer.3.attention.self.value.weight torch.Size([768, 768])
    encoder.layer.3.attention.self.value.bias torch.Size([768])
    encoder.layer.3.attention.output.dense.weight torch.Size([768, 768])
    encoder.layer.3.attention.output.dense.bias torch.Size([768])
    encoder.layer.3.attention.output.LayerNorm.weight torch.Size([768])
    encoder.layer.3.attention.output.LayerNorm.bias torch.Size([768])
    encoder.layer.3.intermediate.dense.weight torch.Size([3072, 768])
    encoder.layer.3.intermediate.dense.bias torch.Size([3072])
    encoder.layer.3.output.dense.weight torch.Size([768, 3072])
    encoder.layer.3.output.dense.bias torch.Size([768])
    encoder.layer.3.output.LayerNorm.weight torch.Size([768])
    encoder.layer.3.output.LayerNorm.bias torch.Size([768])
    encoder.layer.4.attention.self.query.weight torch.Size([768, 768])
    encoder.layer.4.attention.self.query.bias torch.Size([768])
    encoder.layer.4.attention.self.key.weight torch.Size([768, 768])
    encoder.layer.4.attention.self.key.bias torch.Size([768])
    encoder.layer.4.attention.self.value.weight torch.Size([768, 768])
    encoder.layer.4.attention.self.value.bias torch.Size([768])
    encoder.layer.4.attention.output.dense.weight torch.Size([768, 768])
    encoder.layer.4.attention.output.dense.bias torch.Size([768])
    encoder.layer.4.attention.output.LayerNorm.weight torch.Size([768])
    encoder.layer.4.attention.output.LayerNorm.bias torch.Size([768])
    encoder.layer.4.intermediate.dense.weight torch.Size([3072, 768])
    encoder.layer.4.intermediate.dense.bias torch.Size([3072])
    encoder.layer.4.output.dense.weight torch.Size([768, 3072])
    encoder.layer.4.output.dense.bias torch.Size([768])
    encoder.layer.4.output.LayerNorm.weight torch.Size([768])
    encoder.layer.4.output.LayerNorm.bias torch.Size([768])
    encoder.layer.5.attention.self.query.weight torch.Size([768, 768])
    encoder.layer.5.attention.self.query.bias torch.Size([768])
    encoder.layer.5.attention.self.key.weight torch.Size([768, 768])
    encoder.layer.5.attention.self.key.bias torch.Size([768])
    encoder.layer.5.attention.self.value.weight torch.Size([768, 768])
    encoder.layer.5.attention.self.value.bias torch.Size([768])
    encoder.layer.5.attention.output.dense.weight torch.Size([768, 768])
    encoder.layer.5.attention.output.dense.bias torch.Size([768])
    encoder.layer.5.attention.output.LayerNorm.weight torch.Size([768])
    encoder.layer.5.attention.output.LayerNorm.bias torch.Size([768])
    encoder.layer.5.intermediate.dense.weight torch.Size([3072, 768])
    encoder.layer.5.intermediate.dense.bias torch.Size([3072])
    encoder.layer.5.output.dense.weight torch.Size([768, 3072])
    encoder.layer.5.output.dense.bias torch.Size([768])
    encoder.layer.5.output.LayerNorm.weight torch.Size([768])
    encoder.layer.5.output.LayerNorm.bias torch.Size([768])
    encoder.layer.6.attention.self.query.weight torch.Size([768, 768])
    encoder.layer.6.attention.self.query.bias torch.Size([768])
    encoder.layer.6.attention.self.key.weight torch.Size([768, 768])
    encoder.layer.6.attention.self.key.bias torch.Size([768])
    encoder.layer.6.attention.self.value.weight torch.Size([768, 768])
    encoder.layer.6.attention.self.value.bias torch.Size([768])
    encoder.layer.6.attention.output.dense.weight torch.Size([768, 768])
    encoder.layer.6.attention.output.dense.bias torch.Size([768])
    encoder.layer.6.attention.output.LayerNorm.weight torch.Size([768])
    encoder.layer.6.attention.output.LayerNorm.bias torch.Size([768])
    encoder.layer.6.intermediate.dense.weight torch.Size([3072, 768])
    encoder.layer.6.intermediate.dense.bias torch.Size([3072])
    encoder.layer.6.output.dense.weight torch.Size([768, 3072])
    encoder.layer.6.output.dense.bias torch.Size([768])
    encoder.layer.6.output.LayerNorm.weight torch.Size([768])
    encoder.layer.6.output.LayerNorm.bias torch.Size([768])
    encoder.layer.7.attention.self.query.weight torch.Size([768, 768])
    encoder.layer.7.attention.self.query.bias torch.Size([768])
    encoder.layer.7.attention.self.key.weight torch.Size([768, 768])
    encoder.layer.7.attention.self.key.bias torch.Size([768])
    encoder.layer.7.attention.self.value.weight torch.Size([768, 768])
    encoder.layer.7.attention.self.value.bias torch.Size([768])
    encoder.layer.7.attention.output.dense.weight torch.Size([768, 768])
    encoder.layer.7.attention.output.dense.bias torch.Size([768])
    encoder.layer.7.attention.output.LayerNorm.weight torch.Size([768])
    encoder.layer.7.attention.output.LayerNorm.bias torch.Size([768])
    encoder.layer.7.intermediate.dense.weight torch.Size([3072, 768])
    encoder.layer.7.intermediate.dense.bias torch.Size([3072])
    encoder.layer.7.output.dense.weight torch.Size([768, 3072])
    encoder.layer.7.output.dense.bias torch.Size([768])
    encoder.layer.7.output.LayerNorm.weight torch.Size([768])
    encoder.layer.7.output.LayerNorm.bias torch.Size([768])
    encoder.layer.8.attention.self.query.weight torch.Size([768, 768])
    encoder.layer.8.attention.self.query.bias torch.Size([768])
    encoder.layer.8.attention.self.key.weight torch.Size([768, 768])
    encoder.layer.8.attention.self.key.bias torch.Size([768])
    encoder.layer.8.attention.self.value.weight torch.Size([768, 768])
    encoder.layer.8.attention.self.value.bias torch.Size([768])
    encoder.layer.8.attention.output.dense.weight torch.Size([768, 768])
    encoder.layer.8.attention.output.dense.bias torch.Size([768])
    encoder.layer.8.attention.output.LayerNorm.weight torch.Size([768])
    encoder.layer.8.attention.output.LayerNorm.bias torch.Size([768])
    encoder.layer.8.intermediate.dense.weight torch.Size([3072, 768])
    encoder.layer.8.intermediate.dense.bias torch.Size([3072])
    encoder.layer.8.output.dense.weight torch.Size([768, 3072])
    encoder.layer.8.output.dense.bias torch.Size([768])
    encoder.layer.8.output.LayerNorm.weight torch.Size([768])
    encoder.layer.8.output.LayerNorm.bias torch.Size([768])
    encoder.layer.9.attention.self.query.weight torch.Size([768, 768])
    encoder.layer.9.attention.self.query.bias torch.Size([768])
    encoder.layer.9.attention.self.key.weight torch.Size([768, 768])
    encoder.layer.9.attention.self.key.bias torch.Size([768])
    encoder.layer.9.attention.self.value.weight torch.Size([768, 768])
    encoder.layer.9.attention.self.value.bias torch.Size([768])
    encoder.layer.9.attention.output.dense.weight torch.Size([768, 768])
    encoder.layer.9.attention.output.dense.bias torch.Size([768])
    encoder.layer.9.attention.output.LayerNorm.weight torch.Size([768])
    encoder.layer.9.attention.output.LayerNorm.bias torch.Size([768])
    encoder.layer.9.intermediate.dense.weight torch.Size([3072, 768])
    encoder.layer.9.intermediate.dense.bias torch.Size([3072])
    encoder.layer.9.output.dense.weight torch.Size([768, 3072])
    encoder.layer.9.output.dense.bias torch.Size([768])
    encoder.layer.9.output.LayerNorm.weight torch.Size([768])
    encoder.layer.9.output.LayerNorm.bias torch.Size([768])
    encoder.layer.10.attention.self.query.weight torch.Size([768, 768])
    encoder.layer.10.attention.self.query.bias torch.Size([768])
    encoder.layer.10.attention.self.key.weight torch.Size([768, 768])
    encoder.layer.10.attention.self.key.bias torch.Size([768])
    encoder.layer.10.attention.self.value.weight torch.Size([768, 768])
    encoder.layer.10.attention.self.value.bias torch.Size([768])
    encoder.layer.10.attention.output.dense.weight torch.Size([768, 768])
    encoder.layer.10.attention.output.dense.bias torch.Size([768])
    encoder.layer.10.attention.output.LayerNorm.weight torch.Size([768])
    encoder.layer.10.attention.output.LayerNorm.bias torch.Size([768])
    encoder.layer.10.intermediate.dense.weight torch.Size([3072, 768])
    encoder.layer.10.intermediate.dense.bias torch.Size([3072])
    encoder.layer.10.output.dense.weight torch.Size([768, 3072])
    encoder.layer.10.output.dense.bias torch.Size([768])
    encoder.layer.10.output.LayerNorm.weight torch.Size([768])
    encoder.layer.10.output.LayerNorm.bias torch.Size([768])
    encoder.layer.11.attention.self.query.weight torch.Size([768, 768])
    encoder.layer.11.attention.self.query.bias torch.Size([768])
    encoder.layer.11.attention.self.key.weight torch.Size([768, 768])
    encoder.layer.11.attention.self.key.bias torch.Size([768])
    encoder.layer.11.attention.self.value.weight torch.Size([768, 768])
    encoder.layer.11.attention.self.value.bias torch.Size([768])
    encoder.layer.11.attention.output.dense.weight torch.Size([768, 768])
    encoder.layer.11.attention.output.dense.bias torch.Size([768])
    encoder.layer.11.attention.output.LayerNorm.weight torch.Size([768])
    encoder.layer.11.attention.output.LayerNorm.bias torch.Size([768])
    encoder.layer.11.intermediate.dense.weight torch.Size([3072, 768])
    encoder.layer.11.intermediate.dense.bias torch.Size([3072])
    encoder.layer.11.output.dense.weight torch.Size([768, 3072])
    encoder.layer.11.output.dense.bias torch.Size([768])
    encoder.layer.11.output.LayerNorm.weight torch.Size([768])
    encoder.layer.11.output.LayerNorm.bias torch.Size([768])
    pooler.dense.weight torch.Size([768, 768])
    pooler.dense.bias torch.Size([768])
    

    Each encoder layer contributes the same 16 tensors with identical shapes, so pruning layers reduces to copying and renaming state-dict entries. The full code:

    import os
    import json
    import torch
    import time
    from transformers import BertModel, BertTokenizer
    
    # Extract the weights of the layers we want and rename them
    def get_prune_parameters(model):
        prune_parameters = {}
        for name, param in model.named_parameters():
            if 'embeddings' in name:
                prune_parameters[name] = param
            elif name.startswith('encoder.layer.0.'):
                prune_parameters[name] = param
            elif name.startswith('encoder.layer.11.'):
                # rename the layer-11 prefix to layer 1 so it matches the 2-layer config
                pro_name = name.split('encoder.layer.11.')
                prune_parameters['encoder.layer.1.' + pro_name[1]] = param
            elif 'pooler' in name:
                prune_parameters[name] = param
        return prune_parameters
    
    # Modify the config file
    def get_prune_config(config):
        prune_config = config
        prune_config['num_hidden_layers'] = 2
        return prune_config
    
    # Shrink the model to the kept layers and reassign the corresponding weights
    def get_prune_model(model, prune_parameters):
        prune_model = model.state_dict()
        for name in list(prune_model.keys()):
            # embeddings.position_ids is a buffer, not a parameter, so it is
            # absent from prune_parameters; keep the original one
            if 'embeddings.position_ids' == name:
                continue
            if 'embeddings' in name:
                prune_model[name] = prune_parameters[name]
            elif name.startswith('encoder.layer.0.'):
                prune_model[name] = prune_parameters[name]
            elif name.startswith('encoder.layer.1.'):
                prune_model[name] = prune_parameters[name]
            elif 'pooler' in name:
                prune_model[name] = prune_parameters[name]
            else:
                del prune_model[name]
        return prune_model
    
    def prune_main():
        model_path = '/data02/gob/project/simpleNLP/model_hub/chinese-bert-wwm-ext/'
        tokenizer = BertTokenizer.from_pretrained(model_path + 'vocab.txt')
        config = json.loads(open(model_path + 'config.json', 'r').read())
        model = BertModel.from_pretrained(model_path)
        text = '我喜欢吃鱼'
        inputs = tokenizer(text, return_tensors='pt')
        # print(model(**inputs))
    
        out_path = '/data02/gob/project/simpleNLP/model_hub/prune-chinese-bert-wwm-ext/'
        if not os.path.exists(out_path):
            os.makedirs(out_path)
        prune_parameters = get_prune_parameters(model)
        prune_config = get_prune_config(config)
        prune_model = get_prune_model(model, prune_parameters)
        """
        for name,param in model.named_parameters():
            print(name)
        print("===================================")
        for k,v in model.state_dict().items():
            print(k)
        """
        torch.save(prune_model, out_path + 'pytorch_model.bin')
        with open(out_path + 'config.json', 'w') as fp:
            fp.write(json.dumps(prune_config))
        with open(out_path + 'vocab.txt', 'w') as fp:
            fp.write(open(model_path + 'vocab.txt').read())
    
    if __name__ == '__main__':
        # run prune_main() once first to produce the pruned checkpoint
        # prune_main()
        start_time = time.time()
        # From here on we can load the pruned model just like a normal BERT model
        model_path = '/data02/gob/project/simpleNLP/model_hub/prune-chinese-bert-wwm-ext/'
        tokenizer = BertTokenizer.from_pretrained(model_path + 'vocab.txt')
        config = json.loads(open(model_path + 'config.json', 'r').read())
        model = BertModel.from_pretrained(model_path)
        text = '我喜欢吃鱼'
        inputs = tokenizer(text, return_tensors='pt')
        for name, param in model.named_parameters():
            print(name, param.shape)
        # actually run a forward pass, otherwise the timing below measures nothing
        with torch.no_grad():
            outputs = model(**inputs)
        end_time = time.time()
        print('Inference took {}s'.format(end_time - start_time))
    
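    A quick sanity check (my own sketch, not from the original post; it assumes prune_main() has already produced the pruned checkpoint at the path above):

    import torch
    from transformers import BertModel

    full = BertModel.from_pretrained('/data02/gob/project/simpleNLP/model_hub/chinese-bert-wwm-ext/')
    pruned = BertModel.from_pretrained('/data02/gob/project/simpleNLP/model_hub/prune-chinese-bert-wwm-ext/')

    # 12 encoder layers vs. 2: roughly 102M vs. 31M parameters
    print(sum(p.numel() for p in full.parameters()), sum(p.numel() for p in pruned.parameters()))
    # layer 1 of the pruned model should hold the original layer-11 weights
    w11 = full.state_dict()['encoder.layer.11.attention.self.query.weight']
    w1 = pruned.state_dict()['encoder.layer.1.attention.self.query.weight']
    print(torch.equal(w11, w1))  # expect True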

    Pruning the FFN (intermediate) dimension

    1. Load the pre-trained model;
    2. Extract the relevant weights, keep only the top-k values along the intermediate dimension, and assign them back to those parameters (a toy shape example follows this list);
    3. Update the model config file (mainly intermediate_size);
    4. Save the result as pytorch_model.bin.
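    A toy example (my own sketch) of how torch.topk shrinks the two FFN matrices to the new intermediate size of 384:

    import torch

    w1 = torch.randn(3072, 768)           # intermediate.dense.weight
    w2 = torch.randn(768, 3072)           # output.dense.weight
    print(w1.T.topk(384).values.T.shape)  # torch.Size([384, 768])
    print(w2.topk(384).values.shape)      # torch.Size([768, 384])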
    The full code:

    import os
    import json
    import torch
    import time
    from pprint import pprint
    from transformers import BertModel, BertTokenizer
    
    
    def get_prune_ffn_parameters(model):
        prune_parameters = {}
        for name, param in model.named_parameters():
            if 'intermediate.dense.weight' in name:
                # [3072, 768] -> [384, 768]: keep the 384 largest entries of each column
                param = param.T.topk(384).values.T.detach().clone().requires_grad_()
                prune_parameters[name] = param
            elif 'intermediate.dense.bias' in name:
                # [3072] -> [384]
                param = param.topk(384).values.detach().clone().requires_grad_()
                prune_parameters[name] = param
            elif 'output.dense.weight' in name and 'attention.output.dense.weight' not in name:
                # [768, 3072] -> [768, 384]: keep the 384 largest entries of each row
                param = param.topk(384).values.detach().clone().requires_grad_()
                prune_parameters[name] = param
        return prune_parameters
    
    
    def get_prune_ffn_config(config):
        prune_config = config
        prune_config['intermediate_size'] = 384
        return prune_config
    
    def get_prune_model(model, prune_parameters):
        prune_model = model.state_dict()
        for name in list(prune_model.keys()):
            if name in prune_parameters:
                prune_model[name] = prune_parameters[name]
        return prune_model
    
    
    def prune_main():
        model_path = '/data02/gob/project/simpleNLP/model_hub/prune-chinese-bert-wwm-ext/'
        tokenizer = BertTokenizer.from_pretrained(model_path + 'vocab.txt')
        config = json.loads(open(model_path + 'config.json', 'r').read())
        model = BertModel.from_pretrained(model_path)
    
        out_path = '/data02/gob/project/simpleNLP/model_hub/prune-ffn-chinese-bert-wwm-ext/'
        if not os.path.exists(out_path):
            os.makedirs(out_path)
        prune_parameters = get_prune_ffn_parameters(model)
        prune_config = get_prune_ffn_config(config)
        prune_model = get_prune_model(model, prune_parameters)
        torch.save(prune_model, out_path + 'pytorch_model.bin')
        with open(out_path + 'config.json', 'w') as fp:
            fp.write(json.dumps(prune_config))
        with open(out_path + 'vocab.txt', 'w') as fp:
            fp.write(open(model_path + 'vocab.txt').read())
    
    if __name__ == '__main__':
        # run prune_main() once first to produce the pruned checkpoint
        # prune_main()
        if torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        model_path = '/data02/gob/project/simpleNLP/model_hub/prune-chinese-bert-wwm-ext/'
        # model_path = '/data02/gob/project/simpleNLP/model_hub/bert-base-chinese/'
        tokenizer = BertTokenizer.from_pretrained(model_path + 'vocab.txt')
        config = json.loads(open(model_path + 'config.json', 'r').read())
        model = BertModel.from_pretrained(model_path)
        model.to(device)
        start_time = time.time()
        texts = ['我喜欢吃鱼,我也喜欢打篮球,你知不知道呀。在这个阳光明媚的日子里,我们一起去放风筝'] * 5000
        for text in texts:
            inputs = tokenizer(text, return_tensors='pt')
            for k in inputs.keys():
                inputs[k] = inputs[k].to(device)
            # actually run the forward pass; without it the loop only times tokenization
            with torch.no_grad():
                outputs = model(**inputs)
        end_time = time.time()
        print('Inference over {} inputs took {}s'.format(len(texts), end_time - start_time))
    
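    One caveat worth flagging (my own observation, not from the original post): topk picks its 384 values independently for intermediate.dense.weight, its bias, and output.dense.weight, so the three pruned tensors no longer describe the same 384 neurons. A more principled sketch selects one index set per layer (scored here by the norm of each neuron's incoming weights, an arbitrary choice) and slices all three tensors with it:

    import torch

    def prune_ffn_consistent(state_dict, num_layers=2, keep=384):
        # hypothetical helper, assuming the BERT state-dict key names listed earlier
        for i in range(num_layers):
            prefix = 'encoder.layer.{}.'.format(i)
            w1 = state_dict[prefix + 'intermediate.dense.weight']  # [3072, 768]
            b1 = state_dict[prefix + 'intermediate.dense.bias']    # [3072]
            w2 = state_dict[prefix + 'output.dense.weight']        # [768, 3072]
            # score each intermediate neuron, then keep one consistent index set
            idx = w1.norm(dim=1).topk(keep).indices.sort().values
            state_dict[prefix + 'intermediate.dense.weight'] = w1[idx, :]
            state_dict[prefix + 'intermediate.dense.bias'] = b1[idx]
            state_dict[prefix + 'output.dense.weight'] = w2[:, idx]
        return state_dict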

    Pruning attention heads and the hidden dimension

    These are comparatively involved, so we won't cover them here; in most cases pruning whole layers is simple and convenient. For head pruning, though, transformers has built-in support; see the sketch below.
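    A minimal head-pruning sketch using the built-in prune_heads helper (the layer/head choices are arbitrary, purely for illustration):

    from transformers import BertModel

    model = BertModel.from_pretrained('bert-base-chinese')
    # drop heads 0 and 1 in layer 0 and head 2 in layer 11; this slices the
    # Q/K/V and attention-output matrices of those layers in place
    model.prune_heads({0: [0, 1], 11: [2]})
    for name, param in model.named_parameters():
        if name.startswith('encoder.layer.0.attention.self'):
            print(name, param.shape)  # weights are now [640, 768] instead of [768, 768]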

• Original post: https://www.cnblogs.com/xiximayou/p/15193655.html