General attention mechanism
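Scaled dot-product attention computes a weighted sum of value vectors, where the weights come from the similarity between queries and keys:

\[
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V
\]

The block below builds exactly this from scratch with NumPy for a toy four-word sequence.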
import numpy as np
from scipy.special import softmax

# One-hot-style embeddings for a toy four-word sequence
word_1 = np.array([1, 0, 0])
word_2 = np.array([0, 1, 0])
word_3 = np.array([1, 1, 0])
word_4 = np.array([0, 0, 1])
words = np.array([word_1, word_2, word_3, word_4])

# Random integer projection matrices for queries, keys and values
np.random.seed(42)
W_Q = np.random.randint(3, size=(3, 3))
W_K = np.random.randint(3, size=(3, 3))
W_V = np.random.randint(3, size=(3, 3))

# Project the embeddings into query, key and value spaces
Q = words @ W_Q
K = words @ W_K
V = words @ W_V

# Raw attention scores: dot product of every query with every key
scores = Q @ K.T
print(scores)

# Scale by sqrt(d_k) and normalize each row with a softmax
weights = softmax(scores / np.sqrt(K.shape[1]), axis=1)
print(weights)

# The attention output is a weighted sum of the value vectors
attention = weights @ V
print(attention)
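As a quick sanity check (a minimal sketch using only the arrays defined above), each row of `weights` should sum to 1, since the softmax turns every query's scores into a probability distribution over the keys:

# Each query's attention weights form a probability distribution
print(weights.sum(axis=1))  # expected: [1. 1. 1. 1.]
assert np.allclose(weights.sum(axis=1), 1.0)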
Self-attention
import torch
import torch.nn.functional as F

# A toy sequence of three token embeddings, each of dimension 3
input_sequence = torch.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]])

# Random projection matrices for keys, queries and values
random_weights_key = torch.randn(input_sequence.size(-1), input_sequence.size(-1))
random_weights_query = torch.randn(input_sequence.size(-1), input_sequence.size(-1))
random_weights_value = torch.randn(input_sequence.size(-1), input_sequence.size(-1))

# In self-attention, queries, keys and values are all projections of the same input
key = torch.matmul(input_sequence, random_weights_key)
query = torch.matmul(input_sequence, random_weights_query)
value = torch.matmul(input_sequence, random_weights_value)

# Scaled dot-product scores, softmax over the keys, weighted sum of the values
attention_scores = torch.matmul(query, key.T) / torch.sqrt(torch.tensor(query.size(-1), dtype=torch.float32))
attention_weights = F.softmax(attention_scores, dim=-1)
output = torch.matmul(attention_weights, value)

print("Output after self-attention:")
print(output)
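The same computation is also available as a built-in; this is a sketch assuming PyTorch 2.0+, where torch.nn.functional.scaled_dot_product_attention exists. Adding a batch dimension lets it reproduce the manual result above:

# Built-in fused scaled dot-product attention (assumes PyTorch 2.0+)
builtin_output = F.scaled_dot_product_attention(
    query.unsqueeze(0), key.unsqueeze(0), value.unsqueeze(0)
).squeeze(0)
print(torch.allclose(output, builtin_output, atol=1e-6))  # expected: True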
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self, dim, dk, dv):
        super(SelfAttention, self).__init__()
        # Scale factor 1/sqrt(dk) applied to the dot-product scores
        self.scale = dk ** (-0.5)
        self.q = nn.Linear(dim, dk)
        self.k = nn.Linear(dim, dk)
        self.v = nn.Linear(dim, dv)

    def forward(self, x):
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)
        # Scaled dot-product attention over the sequence dimension
        attention = (q @ k.transpose(-2, -1)) * self.scale
        attention = attention.softmax(dim=-1)
        x = attention @ v
        return x

attention = SelfAttention(dim=2, dk=2, dv=2)
x = torch.rand(1, 4, 2)  # batch of 1 sequence with 4 tokens of dimension 2
output = attention(x)
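A quick shape check on the sketch above: with dim=2, dk=2, dv=2 and an input of shape (1, 4, 2), the module returns one 2-dimensional vector per token.

print(output.shape)  # expected: torch.Size([1, 4, 2])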