• Self-Attention example in PyTorch


    BERT模型入门系列(三):Self-Attention详解 - 知乎 (zhihu.com)

    After reading quite a few books and watching plenty of videos, I find this article the best introduction to the topic.

    It is concise and clear, and its worked example is easy to follow.

    Why is a self-attention model needed? Two shortcomings of earlier sequence models motivate it: 1. training speed is limited; 2. they handle long text poorly.

     

     

     The Illustrated Transformer – Jay Alammar – Visualizing machine learning one concept at a time. (jalammar.github.io)

    Computation steps

    1. Compute Q (the query vectors), K (the key vectors), and V (the value vectors).

    2. Compute the attention weights, using the dot product as the attention scoring function.

    3. Compute the output vector sequence (the three steps combine into the formula sketched below).
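
    In matrix form the three steps reduce to a single expression; here is a sketch of that combined formula (note that, like the code below, it omits the 1/sqrt(d_k) scaling factor the original Transformer applies to the scores):

    $$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(Q K^{\top}\right) V$$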

    For the detailed derivation, see the original article: BERT模型入门系列(三):Self-Attention详解 - 知乎 (zhihu.com)

    The original article's code appears to be written in TensorFlow, so here it is rewritten in PyTorch.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    
    
    # https://blog.csdn.net/weixin_53598445/article/details/125009686
    # https://zhuanlan.zhihu.com/p/345280272
    
    
    class selfAttention(nn.Module):
        def __init__(self, input_size, hidden_size):
            super(selfAttention, self).__init__()
            self.key_layer = nn.Linear(input_size, hidden_size)
            self.query_layer = nn.Linear(input_size, hidden_size)
            self.value_layer = nn.Linear(input_size, hidden_size)
    
        def forward(self, x, w_k, w_q, w_v):
            # Overwrite the (normally learned) projection weights with the hand-picked
            # values from the article; nn.Linear stores its weight as
            # (out_features, in_features), hence the .mT transpose.
            self.key_layer.weight.data = w_k.mT
            self.key_layer.bias.data.zero_()
            key = self.key_layer(x)
            self.query_layer.weight.data = w_q.mT
            self.query_layer.bias.data.zero_()
            query = self.query_layer(x)
            self.value_layer.weight.data = w_v.mT
            self.value_layer.bias.data.zero_()
            value = self.value_layer(x)
            print('key:\n', key)
            print('query:\n', query)
            print('value:\n', value)
    
            attention_scores = torch.matmul(query, key.mT)  # query @ key^T (dot-product scores)
            print('attention scores (query @ key^T):\n', attention_scores)

            attention_softmax = F.softmax(attention_scores, dim=-1)  # softmax over the last dim, so each row sums to 1
            torch.set_printoptions(precision=2, sci_mode=False)  # limit printed decimal places
            print('attention weights:\n', attention_softmax)
    
            # Explicit weighted sums of the value vectors, one per query position
            # (equivalent to torch.matmul(attention_softmax, value)).
            h1 = value[0][0] * attention_softmax[0][0][0] \
                 + value[0][1] * attention_softmax[0][0][1] \
                 + value[0][2] * attention_softmax[0][0][2]
    
            h2 = value[0][0] * attention_softmax[0][1][0] \
                 + value[0][1] * attention_softmax[0][1][1] \
                 + value[0][2] * attention_softmax[0][1][2]
    
            h3 = value[0][0] * attention_softmax[0][2][0] \
                 + value[0][1] * attention_softmax[0][2][1] \
                 + value[0][2] * attention_softmax[0][2][2]
    
            print('output vectors:')
            print(h1)
            print(h2)
            print(h3)
    
            return 0
    
    
    features = torch.tensor([[[1, 0, 1, 0],
                              [0, 2, 0, 2],
                              [1, 1, 1, 1]]], dtype=torch.float)
    
    wk = torch.tensor([[0, 0, 1],
                       [1, 1, 0],
                       [0, 1, 0],
                       [1, 1, 0]], dtype=torch.float)
    
    wq = torch.tensor([[1, 0, 1],
                       [1, 0, 0],
                       [0, 0, 1],
                       [0, 1, 1]], dtype=torch.float)
    wv = torch.tensor([[0, 2, 0],
                       [0, 3, 0],
                       [1, 0, 3],
                       [1, 1, 0]], dtype=torch.float)
    
    attention = selfAttention(4, 3)
    attention.forward(features, wk, wq, wv)

    Running it produces the results below, which essentially match the hand-worked numbers in the article's figures (minor differences come from rounding/print precision).

    key:
     tensor([[[0., 1., 1.],
             [4., 4., 0.],
             [2., 3., 1.]]], grad_fn=<AddBackward0>)
    query:
     tensor([[[1., 0., 2.],
             [2., 2., 2.],
             [2., 1., 3.]]], grad_fn=<AddBackward0>)
    value:
     tensor([[[1., 2., 3.],
             [2., 8., 0.],
             [2., 6., 3.]]], grad_fn=<AddBackward0>)
    attention scores (query @ key^T):
     tensor([[[ 2.,  4.,  4.],
             [ 4., 16., 12.],
             [ 4., 12., 10.]]], grad_fn=<UnsafeViewBackward0>)
    attention weights:
     tensor([[[    0.06,     0.47,     0.47],
             [    0.00,     0.98,     0.02],
             [    0.00,     0.88,     0.12]]], grad_fn=<SoftmaxBackward0>)
    output vectors:
    tensor([1.94, 6.68, 1.60], grad_fn=<AddBackward0>)
    tensor([2.00, 7.96, 0.05], grad_fn=<AddBackward0>)
    tensor([2.00, 7.76, 0.36], grad_fn=<AddBackward0>)
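
    The three hand-computed weighted sums above collapse into a single matrix product. A minimal, self-contained check of the same numbers using plain matrix multiplications (my own verification sketch, not part of the original article):

    import torch
    import torch.nn.functional as F

    x = torch.tensor([[1., 0., 1., 0.],
                      [0., 2., 0., 2.],
                      [1., 1., 1., 1.]])
    wk = torch.tensor([[0., 0., 1.], [1., 1., 0.], [0., 1., 0.], [1., 1., 0.]])
    wq = torch.tensor([[1., 0., 1.], [1., 0., 0.], [0., 0., 1.], [0., 1., 1.]])
    wv = torch.tensor([[0., 2., 0.], [0., 3., 0.], [1., 0., 3.], [1., 1., 0.]])

    q, k, v = x @ wq, x @ wk, x @ wv       # queries, keys, values
    weights = F.softmax(q @ k.T, dim=-1)   # attention weights, each row sums to 1
    out = weights @ v                      # one matmul instead of h1/h2/h3
    print(out)  # roughly [[1.94, 6.68, 1.60], [2.00, 7.96, 0.05], [2.00, 7.76, 0.36]]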

    Without the hand initialization, wq, wk and wv would instead be learned during training; the source code is as follows:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    
    
    # https://blog.csdn.net/weixin_53598445/article/details/125009686
    # https://zhuanlan.zhihu.com/p/345280272
    
    
    class selfAttention(nn.Module):
        def __init__(self, input_size, hidden_size):
            super(selfAttention, self).__init__()
            self.key_layer = nn.Linear(input_size, hidden_size)
            self.query_layer = nn.Linear(input_size, hidden_size)
            self.value_layer = nn.Linear(input_size, hidden_size)
    
        def forward(self, x):
            key = self.key_layer(x)
            query = self.query_layer(x)
            value = self.value_layer(x)
            print('key:\n', key)
            print('query:\n', query)
            print('value:\n', value)
    
            attention_scores = torch.matmul(query, key.mT)  # query @ key^T (dot-product scores)
            print('attention scores (query @ key^T):\n', attention_scores)

            attention_softmax = F.softmax(attention_scores, dim=-1)  # softmax over the last dim, so each row sums to 1
            torch.set_printoptions(precision=2, sci_mode=False)  # limit printed decimal places
            print('attention weights:\n', attention_softmax)
    
            h1 = value[0][0] * attention_softmax[0][0][0] \
                 + value[0][1] * attention_softmax[0][0][1] \
                 + value[0][2] * attention_softmax[0][0][2]
    
            h2 = value[0][0] * attention_softmax[0][1][0] \
                 + value[0][1] * attention_softmax[0][1][1] \
                 + value[0][2] * attention_softmax[0][1][2]
    
            h3 = value[0][0] * attention_softmax[0][2][0] \
                 + value[0][1] * attention_softmax[0][2][1] \
                 + value[0][2] * attention_softmax[0][2][2]
    
            print('output vectors:')
            print(h1)
            print(h2)
            print(h3)
    
            return 0
    
    
    features = torch.tensor([[[1, 0, 1, 0],
                              [0, 2, 0, 2],
                              [1, 1, 1, 1]]], dtype=torch.float)
    
    attention = selfAttention(4, 3)
    attention.forward(features)

    Output:

    key:
     tensor([[[ 0.0680,  0.2645,  0.0556],
             [-1.2327, -0.1178,  0.3482],
             [-0.6597,  0.1382,  0.1653]]], grad_fn=<AddBackward0>)
    query:
     tensor([[[-0.0121, -0.0466, -0.2353],
             [-0.2424,  0.3289, -0.2127],
             [-0.1471,  0.0114, -0.2089]]], grad_fn=<AddBackward0>)
    value:
     tensor([[[-0.3317, -0.3424, -0.6434],
             [ 0.4560,  1.2522, -1.7553],
             [-0.0401,  0.1366, -1.2749]]], grad_fn=<AddBackward0>)
    attention scores (query @ key^T):
     tensor([[[-0.0262, -0.0615, -0.0373],
             [ 0.0587,  0.1860,  0.1702],
             [-0.0186,  0.1072,  0.0641]]], grad_fn=<UnsafeViewBackward0>)
    attention weights:
     tensor([[[0.34, 0.33, 0.33],
             [0.31, 0.35, 0.34],
             [0.31, 0.35, 0.34]]], grad_fn=<SoftmaxBackward0>)
    output vectors:
    tensor([ 0.02,  0.34, -1.22], grad_fn=<AddBackward0>)
    tensor([ 0.04,  0.38, -1.25], grad_fn=<AddBackward0>)
    tensor([ 0.04,  0.38, -1.25], grad_fn=<AddBackward0>)

    Process finished with exit code 0
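
    In this second version the three nn.Linear projections keep their random default initialization and nothing in the script updates them, which is why the attention weights come out close to uniform. A minimal sketch of how the query/key/value projections would actually be learned, assuming some downstream loss (the zero regression target here is made up purely for illustration):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class SelfAttention(nn.Module):
        def __init__(self, input_size, hidden_size):
            super().__init__()
            self.key_layer = nn.Linear(input_size, hidden_size)
            self.query_layer = nn.Linear(input_size, hidden_size)
            self.value_layer = nn.Linear(input_size, hidden_size)

        def forward(self, x):
            query = self.query_layer(x)
            key = self.key_layer(x)
            value = self.value_layer(x)
            weights = F.softmax(torch.matmul(query, key.mT), dim=-1)
            return torch.matmul(weights, value)  # return the output so a loss can be attached

    features = torch.tensor([[[1., 0., 1., 0.],
                              [0., 2., 0., 2.],
                              [1., 1., 1., 1.]]])
    model = SelfAttention(4, 3)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    target = torch.zeros(1, 3, 3)  # hypothetical target, for illustration only

    for step in range(10):  # a few gradient steps update the query/key/value projections
        optimizer.zero_grad()
        loss = F.mse_loss(model(features), target)
        loss.backward()
        optimizer.step()

    In a real Transformer the scores would additionally be scaled by 1/sqrt(hidden_size) (scaled dot-product attention) and several such heads would be stacked, as described in The Illustrated Transformer linked above.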
