General attention mechanism
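Scaled dot-product attention computes a weighted sum of value vectors, where the weights come from the similarity between queries and keys:

\[
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V
\]

The block below builds exactly this from scratch with NumPy for a toy four-word sequence.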
import numpy as np
from scipy.special import softmax

# One-hot-style embeddings for a toy four-word sequence
word_1 = np.array([1, 0, 0])
word_2 = np.array([0, 1, 0])
word_3 = np.array([1, 1, 0])
word_4 = np.array([0, 0, 1])
words = np.array([word_1, word_2, word_3, word_4])

# Random integer projection matrices for queries, keys and values
np.random.seed(42)
W_Q = np.random.randint(3, size=(3, 3))
W_K = np.random.randint(3, size=(3, 3))
W_V = np.random.randint(3, size=(3, 3))

# Project the embeddings into query, key and value spaces
Q = words @ W_Q
K = words @ W_K
V = words @ W_V

# Raw attention scores: dot product of every query with every key
scores = Q @ K.T
print(scores)

# Scale by sqrt(d_k) and normalize each row with a softmax
weights = softmax(scores / np.sqrt(K.shape[1]), axis=1)
print(weights)

# The attention output is a weighted sum of the value vectors
attention = weights @ V
print(attention)
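As a quick sanity check (a minimal sketch using only the arrays defined above), each row of `weights` should sum to 1, since the softmax turns every query's scores into a probability distribution over the keys:

# Each query's attention weights form a probability distribution
print(weights.sum(axis=1))  # expected: [1. 1. 1. 1.]
assert np.allclose(weights.sum(axis=1), 1.0)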
Self-attention
import torch
import torch.nn.functional as F

# A toy sequence of three token embeddings, each of dimension 3
input_sequence = torch.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]])

# Random projection matrices for keys, queries and values
random_weights_key = torch.randn(input_sequence.size(-1), input_sequence.size(-1))
random_weights_query = torch.randn(input_sequence.size(-1), input_sequence.size(-1))
random_weights_value = torch.randn(input_sequence.size(-1), input_sequence.size(-1))

# In self-attention, queries, keys and values are all projections of the same input
key = torch.matmul(input_sequence, random_weights_key)
query = torch.matmul(input_sequence, random_weights_query)
value = torch.matmul(input_sequence, random_weights_value)

# Scaled dot-product scores, softmax over the keys, weighted sum of the values
attention_scores = torch.matmul(query, key.T) / torch.sqrt(torch.tensor(query.size(-1), dtype=torch.float32))
attention_weights = F.softmax(attention_scores, dim=-1)
output = torch.matmul(attention_weights, value)

print("Output after self-attention:")
print(output)
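The same computation is also available as a built-in; this is a sketch assuming PyTorch 2.0+, where torch.nn.functional.scaled_dot_product_attention exists. Adding a batch dimension lets it reproduce the manual result above:

# Built-in fused scaled dot-product attention (assumes PyTorch 2.0+)
builtin_output = F.scaled_dot_product_attention(
    query.unsqueeze(0), key.unsqueeze(0), value.unsqueeze(0)
).squeeze(0)
print(torch.allclose(output, builtin_output, atol=1e-6))  # expected: True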
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self, dim, dk, dv):
        super(SelfAttention, self).__init__()
        # Scale factor 1/sqrt(dk) applied to the dot-product scores
        self.scale = dk ** (-0.5)
        self.q = nn.Linear(dim, dk)
        self.k = nn.Linear(dim, dk)
        self.v = nn.Linear(dim, dv)

    def forward(self, x):
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)
        # Scaled dot-product attention over the sequence dimension
        attention = (q @ k.transpose(-2, -1)) * self.scale
        attention = attention.softmax(dim=-1)
        x = attention @ v
        return x

attention = SelfAttention(dim=2, dk=2, dv=2)
x = torch.rand(1, 4, 2)  # batch of 1 sequence with 4 tokens of dimension 2
output = attention(x)
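A quick shape check on the sketch above: with dim=2, dk=2, dv=2 and an input of shape (1, 4, 2), the module returns one 2-dimensional vector per token.

print(output.shape)  # expected: torch.Size([1, 4, 2])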