
# Attention Mechanism Theory for Sequence Modeling in Deep Learning

## 1. Technical Analysis

### 1.1 Overview of Attention Mechanisms

Attention mechanisms allow a model to focus on different parts of its input.

Types of attention:

- Self-attention: dependencies within a single sequence
- Multi-head attention: several attention heads in parallel
- Cross-attention: interaction between two different sequences

Core idea: compute attention weights with softmax(QKᵀ/√d_k), then take the weighted sum of the values V as the output.

### 1.2 Comparison of Attention Mechanisms

| Type | Complexity | Capability | Typical use |
|---|---|---|---|
| Dot-product attention | O(n²·d) | Standard | General purpose |
| Multi-head attention | O(n²·d·h) | Multiple views | Transformer |
| Linear attention | O(n·d) | Long sequences | Long documents |
| Sparse attention | O(n·log n·d) | Long sequences | Efficient computation |

### 1.3 Transformer Architecture

The Transformer consists of:

- Encoder: processes the input sequence
- Decoder: generates the output sequence
- Attention: the core component

Key innovations:

- Self-attention
- Positional encoding
- Residual connections

## 2. Core Implementation

### 2.1 Self-Attention

```python
import numpy as np


class ScaledDotProductAttention:
    """Attention(Q, K, V) = softmax(Q Kᵀ / sqrt(d_k)) V."""

    def forward(self, Q, K, V, mask=None):
        d_k = Q.shape[-1]
        # Similarity scores, scaled by sqrt(d_k) to stabilize gradients
        scores = np.matmul(Q, np.swapaxes(K, -2, -1)) / np.sqrt(d_k)
        if mask is not None:
            # Masked positions receive a large negative score before softmax
            scores = np.where(mask == 0, -1e9, scores)
        attn_weights = self._softmax(scores, axis=-1)
        output = np.matmul(attn_weights, V)
        return output, attn_weights

    def _softmax(self, x, axis=-1):
        exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
        return exp_x / np.sum(exp_x, axis=axis, keepdims=True)


class MultiHeadAttention:
    def __init__(self, d_model, num_heads):
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = np.random.randn(d_model, d_model)
        self.W_k = np.random.randn(d_model, d_model)
        self.W_v = np.random.randn(d_model, d_model)
        self.W_o = np.random.randn(d_model, d_model)

    def split_heads(self, x):
        # (batch, seq, d_model) -> (batch, heads, seq, d_k)
        batch_size = x.shape[0]
        return x.reshape(batch_size, -1, self.num_heads, self.d_k).transpose(0, 2, 1, 3)

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.shape[0]
        Q = self.split_heads(Q @ self.W_q)
        K = self.split_heads(K @ self.W_k)
        V = self.split_heads(V @ self.W_v)
        output, attn_weights = ScaledDotProductAttention().forward(Q, K, V, mask)
        # Merge the heads back into d_model and apply the output projection
        output = output.transpose(0, 2, 1, 3).reshape(batch_size, -1, self.d_model)
        output = output @ self.W_o
        return output, attn_weights
```

### 2.2 Transformer Encoder

```python
class TransformerEncoderLayer:
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFFN(d_model, d_ff)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout = dropout

    def forward(self, x, mask=None):
        # Self-attention sub-layer with residual connection and layer norm
        attn_output, _ = self.self_attn.forward(x, x, x, mask)
        x = self.norm1.forward(x + self._dropout(attn_output))
        # Position-wise feed-forward sub-layer with residual connection
        ff_output = self.feed_forward.forward(x)
        x = self.norm2.forward(x + self._dropout(ff_output))
        return x

    def _dropout(self, x):
        if self.dropout > 0:
            # Inverted dropout: zero out a fraction of activations and rescale
            keep = np.random.rand(*x.shape) > self.dropout
            return x * keep / (1 - self.dropout)
        return x


class PositionWiseFFN:
    def __init__(self, d_model, d_ff):
        self.fc1 = np.random.randn(d_model, d_ff)
        self.fc2 = np.random.randn(d_ff, d_model)

    def forward(self, x):
        return self._gelu(x @ self.fc1) @ self.fc2

    def _gelu(self, x):
        # tanh approximation of GELU
        return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x ** 3)))


class LayerNorm:
    def __init__(self, d_model, eps=1e-5):
        self.gamma = np.ones(d_model)
        self.beta = np.zeros(d_model)
        self.eps = eps

    def forward(self, x):
        mean = np.mean(x, axis=-1, keepdims=True)
        var = np.var(x, axis=-1, keepdims=True)
        x_normalized = (x - mean) / np.sqrt(var + self.eps)
        return self.gamma * x_normalized + self.beta


class PositionalEncoding:
    def __init__(self, d_model, max_len=5000):
        self.encoding = self._compute_positional_encoding(d_model, max_len)

    def _compute_positional_encoding(self, d_model, max_len):
        position = np.arange(max_len).reshape(-1, 1)
        div_term = np.exp(np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        # Stored as (1, max_len, d_model) so it broadcasts over the batch dimension
        pe = np.zeros((1, max_len, d_model))
        pe[0, :, 0::2] = np.sin(position * div_term)
        pe[0, :, 1::2] = np.cos(position * div_term)
        return pe

    def forward(self, x):
        # x: (batch, seq_len, d_model)
        return x + self.encoding[:, :x.shape[1]]
```
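To show how these pieces fit together, here is a minimal usage sketch; the model sizes, the two-layer stack, and the toy input below are illustrative assumptions, not taken from the text above.

```python
# Minimal sketch: positional encoding followed by a small stack of encoder layers.
# The dimensions (d_model=64, 2 layers, batch of 2, length 10) are illustrative.
np.random.seed(0)
d_model, num_heads, d_ff = 64, 8, 256
x = np.random.randn(2, 10, d_model)              # (batch, seq_len, d_model)

pos_enc = PositionalEncoding(d_model)
encoder = [TransformerEncoderLayer(d_model, num_heads, d_ff) for _ in range(2)]

h = pos_enc.forward(x)                           # inject order information
for layer in encoder:                            # residual attention + FFN blocks
    h = layer.forward(h)

print(h.shape)                                   # (2, 10, 64)
```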
### 2.3 Efficient Attention Mechanisms

```python
class LinearAttention:
    """Linear-complexity attention: aggregate keys and values once instead of
    forming the full n x n score matrix."""

    def forward(self, Q, K, V):
        # Simple softmax feature maps over the feature dimension
        Q = self._softmax(Q, axis=-1)
        K = self._softmax(K, axis=-1)
        # context = Kᵀ V; Z = column sum of K, used for normalization
        context = np.matmul(np.swapaxes(K, -2, -1), V)
        Z = np.sum(K, axis=-2, keepdims=True)
        output = np.matmul(Q, context) / np.matmul(Q, np.swapaxes(Z, -2, -1))
        return output

    def _softmax(self, x, axis=-1):
        exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
        return exp_x / np.sum(exp_x, axis=axis, keepdims=True)


class SparseAttention:
    """Local windowed attention: each position attends only to its neighbours."""

    def __init__(self, window_size=5):
        self.window_size = window_size

    def forward(self, Q, K, V):
        scores = np.matmul(Q, np.swapaxes(K, -2, -1))
        mask = self._create_sparse_mask(Q.shape[1], self.window_size)
        # Positions outside the local window get a large negative score
        scores = scores * mask + (1 - mask) * (-1e9)
        attn_weights = self._softmax(scores, axis=-1)
        output = np.matmul(attn_weights, V)
        return output

    def _create_sparse_mask(self, seq_len, window_size):
        mask = np.zeros((seq_len, seq_len))
        for i in range(seq_len):
            start = max(0, i - window_size)
            end = min(seq_len, i + window_size + 1)
            mask[i, start:end] = 1
        return mask

    def _softmax(self, x, axis=-1):
        exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
        return exp_x / np.sum(exp_x, axis=axis, keepdims=True)
```

## 3. Performance Comparison

### 3.1 Attention Mechanisms

| Type | Time complexity | Space complexity | Practical sequence length |
|---|---|---|---|
| Standard attention | O(n²·d) | O(n²) | up to ~1,000 |
| Linear attention | O(n·d) | O(n·d) | 10,000+ |
| Sparse attention | O(n·log n·d) | O(n·log n) | 1,000+ |

### 3.2 Transformer Variants

| Model | Sequence length | Performance | Compute cost |
|---|---|---|---|
| Vanilla Transformer | 512 | baseline | baseline |
| Longformer | 4,096 | high | medium |
| Reformer | 16,384 | medium | high |
| Linformer | 100,000 | medium | low |

### 3.3 Effect of the Number of Attention Heads

| Heads | Model capacity | Training speed | Quality |
|---|---|---|---|
| 4 | low | fast | medium |
| 8 | medium | medium | high |
| 16 | high | slow | very high |

## 4. Best Practices

### 4.1 Choosing an Attention Mechanism

```python
def choose_attention_mechanism(seq_len, task_type):
    # Heuristic: the longer the sequence, the cheaper the mechanism should be
    if seq_len > 10000:
        return "linear"
    elif seq_len > 2000:
        return "sparse"
    else:
        return "standard"


class AttentionMechanismSelector:
    @staticmethod
    def select(config):
        mechanisms = {
            "standard": MultiHeadAttention,
            "linear": LinearAttention,
            "sparse": SparseAttention,
        }
        return mechanisms[config["type"]](**config.get("params", {}))
```

### 4.2 Transformer Configuration

```python
class TransformerConfigGenerator:
    @staticmethod
    def from_task(task_type):
        configs = {
            "nlp":      {"d_model": 768, "num_heads": 12, "d_ff": 3072, "layers": 12},
            "vision":   {"d_model": 512, "num_heads": 8,  "d_ff": 2048, "layers": 6},
            "long_seq": {"d_model": 512, "num_heads": 8,  "d_ff": 2048, "layers": 12,
                         "attention": "sparse"},
        }
        return configs.get(task_type, configs["nlp"])
```

## 5. Summary

- Attention is the core of the Transformer.
- Self-attention captures dependencies within a sequence.
- Multi-head attention learns features from multiple perspectives.
- Efficient attention variants make long sequences tractable.
- Positional encoding injects order information.

Key takeaways from the comparisons:

- Linear attention suits very long sequences (10,000+ tokens).
- Sparse attention balances quality and efficiency on long sequences.
- 12 heads is the standard configuration for NLP tasks.
- Choose the attention mechanism according to the sequence length.
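As a closing illustration, here is a small sketch (with made-up sequence lengths and sizes) of how the selection heuristic from Section 4.1 can drive the implementations from Section 2:

```python
# Hypothetical example: pick a mechanism by sequence length, then run it.
for seq_len in (512, 4096, 20000):
    print(seq_len, "->", choose_attention_mechanism(seq_len, task_type="nlp"))
# 512 -> standard, 4096 -> sparse, 20000 -> linear

# Instantiate the chosen variant through the selector and run a forward pass.
attn = AttentionMechanismSelector.select({"type": "sparse",
                                          "params": {"window_size": 5}})
q = k = v = np.random.randn(1, 32, 16)           # (batch, seq_len, d)
out = attn.forward(q, k, v)
print(out.shape)                                 # (1, 32, 16)
```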