
# Transformer架构实战：从零实现一个简易版ChatGPT聊天机器人

当你在社交媒体上看到那些能流畅对话的AI助手时，是否好奇它们是如何工作的？Transformer架构正是这些智能对话系统的核心引擎。本文将带你从零开始，用Python和PyTorch实现一个简易版的ChatGPT聊天机器人，深入理解Transformer如何通过注意力机制理解人类语言。

## 1. 环境准备与数据加载

在开始构建聊天机器人之前，我们需要搭建开发环境并准备训练数据。现代NLP项目通常依赖PyTorch或TensorFlow这样的深度学习框架，它们提供了高效的GPU加速计算和自动微分功能。

```bash
# 安装必要库
pip install torch torchtext spacy transformers
python -m spacy download en_core_web_sm
```

对话系统的训练数据通常采用问答对的形式。我们可以使用Cornell Movie Dialogs Corpus这样的公开数据集，它包含数千条电影对话：

```python
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

# 定义字段处理
SRC = Field(tokenize='spacy', init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize='spacy', init_token='<sos>', eos_token='<eos>', lower=True)

# 加载数据集
train_data, valid_data, test_data = Multi30k.splits(exts=('.en', '.de'), fields=(SRC, TRG))

# 构建词汇表
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# 创建数据迭代器
BATCH_SIZE = 128
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE)
```

> 提示：在实际项目中，建议使用更大的对话数据集，如OpenSubtitles或Reddit对话数据，它们能提供更丰富的语言模式。

## 2. Transformer核心组件实现

### 2.1 位置编码与词嵌入

Transformer抛弃了RNN的循环结构，因此需要位置编码来保留单词的顺序信息。位置编码使用正弦和余弦函数的组合，为每个位置生成独特的编码：

```python
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
```

词嵌入层将离散的单词索引转换为连续的向量表示。在实践中，我们可以使用预训练的词向量（如GloVe）来初始化嵌入层：

```python
class Embeddings(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.embed(x) * math.sqrt(self.d_model)
```

### 2.2 多头注意力机制实现

多头注意力是Transformer的核心创新，它允许模型同时关注输入序列的不同位置：

```python
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # 线性变换并分头
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # 转置以获得维度 bs * h * sl * d_model
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        # 计算注意力分数
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, -1e9)
        scores = F.softmax(scores, dim=-1)
        scores = self.dropout(scores)

        # 应用注意力权重到V
        output = torch.matmul(scores, v)

        # 拼接多头输出并通过线性层
        output = output.transpose(1,2).contiguous().view(bs, -1, self.d_model)
        output = self.out(output)
        return output
```

> 注意：解码器中的掩码多头注意力需要额外的掩码机制，防止当前位置关注到未来的信息。

## 3. 构建完整Transformer模型

### 3.1 编码器实现

编码器由多个相同的层堆叠而成，每层包含自注意力机制和前馈神经网络：

```python
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model*4),
            nn.ReLU(),
            nn.Linear(d_model*4, d_model),
            nn.Dropout(dropout)
        )
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x

class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embeddings(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model, dropout=dropout)
        self.layers = nn.ModuleList([EncoderLayer(d_model, heads, dropout) for _ in range(N)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)
```

### 3.2 解码器实现

解码器同样由多个相同的层组成，但增加了掩码多头注意力和编码器-解码器注意力：

```python
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        self.norm_3 = nn.LayerNorm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model*4),
            nn.ReLU(),
            nn.Linear(d_model*4, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x
```
```python
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embeddings(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model, dropout=dropout)
        self.layers = nn.ModuleList([DecoderLayer(d_model, heads, dropout) for _ in range(N)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)
```

### 3.3 组合完整Transformer

将编码器和解码器组合起来，并添加最终的线性层和softmax：

```python
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        output = self.out(d_output)
        return output
```

## 4. 训练与优化策略

### 4.1 模型初始化与掩码生成

训练前需要初始化模型参数，并定义生成注意力掩码的函数：

```python
def init_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

def create_masks(src, trg):
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)

    if trg is not None:
        trg_mask = (trg != TRG.vocab.stoi['<pad>']).unsqueeze(-2)
        size = trg.size(1)
        nopeak_mask = torch.triu(torch.ones(1, size, size), diagonal=1).bool()
        trg_mask = trg_mask & ~nopeak_mask
    else:
        trg_mask = None
    return src_mask, trg_mask
```

### 4.2 训练循环与学习率调度

使用Adam优化器配合学习率预热策略，可以显著提高Transformer的训练效果：

```python
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

def get_optimizer(model, warmup_steps=4000, lr=0.0001, betas=(0.9, 0.98), eps=1e-9):
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=betas, eps=eps)

    def lr_lambda(step):
        step += 1  # 避免除零
        return min(step ** (-0.5), step * warmup_steps ** (-1.5))

    scheduler = LambdaLR(optimizer, lr_lambda)
    return optimizer, scheduler

def train_model(model, iterator, optimizer, scheduler, criterion, clip):
    model.train()
    epoch_loss = 0

    for i, batch in enumerate(iterator):
        src = batch.src.transpose(0, 1)
        trg = batch.trg.transpose(0, 1)
        trg_input = trg[:, :-1]
        trg_output = trg[:, 1:]

        src_mask, trg_mask = create_masks(src, trg_input)

        optimizer.zero_grad()
        output = model(src, trg_input, src_mask, trg_mask)
        output_dim = output.shape[-1]

        output = output.contiguous().view(-1, output_dim)
        trg_output = trg_output.contiguous().view(-1)

        loss = criterion(output, trg_output)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)
```

### 4.3 评估与推理

训练完成后，我们可以使用贪心搜索或束搜索来生成对话响应：

```python
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch.src.transpose(0, 1)
            trg = batch.trg.transpose(0, 1)
            trg_input = trg[:, :-1]
            trg_output = trg[:, 1:]

            src_mask, trg_mask = create_masks(src, trg_input)

            output = model(src, trg_input, src_mask, trg_mask)
            output_dim = output.shape[-1]

            output = output.contiguous().view(-1, output_dim)
            trg_output = trg_output.contiguous().view(-1)

            loss = criterion(output, trg_output)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def translate_sentence(model, sentence, max_len=50):
    model.eval()

    tokens = [token.lower() for token in sentence.split()]
    tokens = ['<sos>'] + tokens + ['<eos>']
    src_indexes = [SRC.vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    src_mask = (src_tensor != SRC.vocab.stoi['<pad>']).unsqueeze(-2).to(device)

    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [TRG.vocab.stoi['<sos>']]

    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        trg_mask = create_masks(src_tensor, trg_tensor)[1]

        with torch.no_grad():
            output = model.decoder(trg_tensor, enc_src, src_mask, trg_mask)
            output = model.out(output)

        pred_token = output.argmax(2)[:,-1].item()
        trg_indexes.append(pred_token)

        if pred_token == TRG.vocab.stoi['<eos>']:
            break

    trg_tokens = [TRG.vocab.itos[i] for i in trg_indexes]
    return trg_tokens[1:]
```

## 5. 模型部署与对话测试

训练完成后，我们可以将模型保存并部署为简单的聊天服务：

```python
def save_model(model, path):
    torch.save(model.state_dict(), path)

def load_model(path, src_vocab, trg_vocab, d_model, N, heads, dropout):
    model = Transformer(src_vocab, trg_vocab, d_model, N, heads, dropout)
    model.load_state_dict(torch.load(path))
    return model

def chat_with_bot(model):
    print('开始对话(输入quit退出):')
    while True:
        user_input = input('你: ')
        if user_input.lower() == 'quit':
            break
        response = translate_sentence(model, user_input)
        print('Bot:', ' '.join(response))
```

在实际项目中，为了提高对话质量，可以考虑以下优化策略：

- 使用更大的预训练语言模型作为基础
- 采用更精细的束搜索策略
- 添加对话历史上下文
- 实现多轮对话管理
- 加入情感分析和个性化响应生成