
# 多语言搜索：构建跨语言搜索系统

## 前言

随着全球化发展，多语言搜索成为很多应用的刚需。构建一个能处理多种语言的搜索系统，能极大提升用户体验。我在多个项目中实践过多语言搜索，今天分享一些经验和实现。

## 多语言表示

### 多语言 Embedding

```python
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Optional


class MultilingualEmbedder:
    """Encode texts into dense vectors with a SentenceTransformer model."""

    # NOTE(review): the default checkpoint below is, per the
    # sentence-transformers model list, trained on English data. For real
    # cross-lingual retrieval pass a multilingual checkpoint such as
    # "paraphrase-multilingual-mpnet-base-v2" -- confirm before relying on
    # the default.
    def __init__(self, model_name: str = "all-mpnet-base-v2"):
        """Load the model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def encode(
        self, texts: List[str], show_progress_bar: bool = False
    ) -> List[List[float]]:
        """Return one embedding (a plain list of floats) per input text."""
        embeddings = self.model.encode(
            texts, show_progress_bar=show_progress_bar
        )
        return embeddings.tolist()

    def encode_query(
        self, query: str, language: Optional[str] = None
    ) -> List[float]:
        """Return the embedding of a single query.

        ``language`` is accepted for interface symmetry but is not used:
        the query is encoded directly, exactly like a document.
        """
        return self.model.encode([query])[0].tolist()
```

### 语言检测

```python
from langdetect import detect, detect_langs
from langdetect.lang_detect_exception import LangDetectException
from typing import Dict, List


class LanguageDetector:
    """Thin wrapper around ``langdetect`` that never raises on odd input."""

    UNKNOWN = "unknown"

    def __init__(self):
        pass

    def detect_language(self, text: str) -> str:
        """Return the most likely language code, or ``"unknown"``."""
        try:
            return detect(text)
        except LangDetectException:
            # Raised when the text has no usable features (empty string,
            # digits or punctuation only, ...).
            return self.UNKNOWN

    def detect_language_probs(self, text: str) -> List[Dict]:
        """Return candidate languages as ``{"lang": ..., "prob": ...}`` dicts.

        Falls back to a single ``"unknown"`` entry with probability 1.0
        when detection fails.
        """
        try:
            candidates = detect_langs(text)
        except LangDetectException:
            return [{"lang": self.UNKNOWN, "prob": 1.0}]
        return [{"lang": c.lang, "prob": c.prob} for c in candidates]
```

## 多语言搜索策略

### 查询翻译

```python
import logging
from typing import Dict, List, Optional

from transformers import MarianMTModel, MarianTokenizer

logger = logging.getLogger(__name__)


class QueryTranslator:
    """Translate queries with Helsinki-NLP OPUS-MT (Marian) models."""

    def __init__(self, language_detector: Optional["LanguageDetector"] = None):
        """Create an empty model cache.

        Args:
            language_detector: Detector used by ``translate_multiple`` to
                find the source language. Defaults to a new
                ``LanguageDetector`` (defined in the previous section).
        """
        self.models = {}
        self.tokenizers = {}
        self.language_detector = language_detector or LanguageDetector()

    def _get_model_name(self, source_lang: str, target_lang: str) -> str:
        """Build the model id for a language pair."""
        return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"

    def load_model(self, source_lang: str, target_lang: str):
        """Return ``(model, tokenizer)`` for the pair, loading them once."""
        model_name = self._get_model_name(source_lang, target_lang)
        if model_name not in self.models:
            # Load the tokenizer first so a failure leaves no half-filled
            # cache entry in ``self.models``.
            self.tokenizers[model_name] = MarianTokenizer.from_pretrained(
                model_name
            )
            self.models[model_name] = MarianMTModel.from_pretrained(model_name)
        return self.models[model_name], self.tokenizers[model_name]

    def translate(self, text: str, source_lang: str, target_lang: str) -> str:
        """Translate ``text``; return it unchanged if translation fails."""
        try:
            model, tokenizer = self.load_model(source_lang, target_lang)
            inputs = tokenizer(
                text, return_tensors="pt", padding=True, truncation=True
            )
            outputs = model.generate(**inputs)
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception:
            # Best effort by design: a missing language pair or a download
            # error must not break search. Log it so it is not silent.
            logger.warning(
                "Translation %s -> %s failed; returning the original text",
                source_lang,
                target_lang,
                exc_info=True,
            )
            return text

    def translate_multiple(self, text: str, targets: List[str]) -> Dict[str, str]:
        """Translate ``text`` into every language in ``targets``.

        Returns a dict mapping each target language to the translation.
        Targets equal to the detected source language, and all targets when
        the source language cannot be detected, map to the original text.
        """
        detected = self.language_detector.detect_language(text)
        if detected == "unknown":
            return {target: text for target in targets}

        # langdetect reports regional codes such as "zh-cn"; the model ids
        # built above are assumed to use the bare language code ("zh").
        source_lang = detected.split("-")[0]
        return {
            target: (
                text
                if target == source_lang
                else self.translate(text, source_lang, target)
            )
            for target in targets
        }
```

### 跨语言搜索

```python
from typing import List, Dict, Optional, Tuple

import numpy as np


class MultilingualSearch:
    """In-memory cross-lingual search over embedded documents."""

    def __init__(self, embedder: MultilingualEmbedder):
        self.embedder = embedder
        self.document_embeddings = {}  # doc_id -> embedding
        self.documents = {}            # doc_id -> text / language / metadata
        self.languages = set()         # every language seen so far

    def add_document(
        self,
        doc_id: str,
        text: str,
        language: str,
        metadata: Optional[Dict] = None,
    ):
        """Embed ``text`` and store it under ``doc_id``.

        Adding an existing ``doc_id`` again overwrites the previous entry.
        """
        self.document_embeddings[doc_id] = self.embedder.encode([text])[0]
        self.documents[doc_id] = {
            "text": text,
            "language": language,
            "metadata": metadata or {},
        }
        self.languages.add(language)

    def search(
        self,
        query: str,
        query_language: Optional[str] = None,
        target_languages: Optional[List[str]] = None,
        top_k: int = 10,
    ) -> List[Dict]:
        """Return the ``top_k`` most similar documents, best first.

        Args:
            query: Query text.
            query_language: Forwarded to the embedder's ``encode_query``.
            target_languages: Restrict results to these document languages;
                ``None`` or an empty list searches every language.
            top_k: Maximum number of results.

        Each result holds ``doc_id``, ``text``, ``language``, ``metadata``
        and ``similarity``.
        """
        # A set gives O(1) membership tests; None means "no filter".
        allowed = set(target_languages) if target_languages else None
        query_embedding = self.embedder.encode_query(query, query_language)

        results = []
        for doc_id, doc_embedding in self.document_embeddings.items():
            doc = self.documents[doc_id]
            if allowed is not None and doc["language"] not in allowed:
                continue
            results.append({
                "doc_id": doc_id,
                "text": doc["text"],
                "language": doc["language"],
                "metadata": doc["metadata"],
                "similarity": self._compute_similarity(
                    query_embedding, doc_embedding
                ),
            })

        results.sort(key=lambda item: item["similarity"], reverse=True)
        return results[:top_k]

    def _compute_similarity(
        self, q_emb: List[float], d_emb: List[float]
    ) -> float:
        """Return the cosine similarity, or 0.0 if either vector is zero."""
        q = np.asarray(q_emb)
        d = np.asarray(d_emb)
        norm_q = np.linalg.norm(q)
        norm_d = np.linalg.norm(d)
        if norm_q == 0 or norm_d == 0:
            return 0.0
        # Cast so callers get a plain float rather than np.float64.
        return float(np.dot(q, d) / (norm_q * norm_d))
```

## 语言特定处理

### 分词与预处理

```python
import re
from typing import List

import jieba
import spacy

# Anything that is neither a word character nor whitespace counts as
# punctuation. With str patterns \w is Unicode-aware, so CJK text is kept.
_PUNCTUATION_RE = re.compile(r"[^\w\s]")


class MultilingualPreprocessor:
    """Language-aware tokenisation and light text normalisation."""

    def __init__(self):
        self.nlp = {}
        self._load_models()

    def _load_models(self):
        """Load the optional spaCy pipelines.

        Chinese needs nothing here: ``tokenize`` calls jieba directly.
        """
        try:
            self.nlp["en"] = spacy.load("en_core_web_sm")
        except OSError:
            # Model not installed: tokenize() falls back to whitespace
            # splitting for English.
            pass

    def tokenize(self, text: str, language: str) -> List[str]:
        """Split ``text`` into tokens using a language-specific tokenizer."""
        # Language detectors report regional variants such as "zh-cn" or
        # "zh-tw"; compare on the base language code.
        base = (language or "").replace("_", "-").split("-")[0].lower()
        if base == "zh":
            return list(jieba.cut(text))
        if base == "en" and "en" in self.nlp:
            return [token.text for token in self.nlp["en"](text)]
        return text.split()

    def preprocess(self, text: str, language: str) -> str:
        """Lower-case ``text`` and strip punctuation.

        ``language`` is currently unused; the same normalisation is applied
        to every language.
        """
        return _PUNCTUATION_RE.sub("", text.lower())
```

## 完整多语言搜索系统

```python
class CompleteMultilingualSearch:
    """End-to-end pipeline: detect language, preprocess, embed, search."""

    def __init__(self):
        self.embedder = MultilingualEmbedder()
        self.language_detector = LanguageDetector()
        self.preprocessor = MultilingualPreprocessor()
        self.search_engine = MultilingualSearch(self.embedder)

    def index_document(
        self,
        doc_id: str,
        text: str,
        language: Optional[str] = None,
        metadata: Optional[Dict] = None,
    ):
        """Index a document, detecting its language when none is given.

        The preprocessed text (lower-cased, punctuation removed) is what
        gets embedded and stored, so search results return that form.
        """
        if not language:
            language = self.language_detector.detect_language(text)
        processed = self.preprocessor.preprocess(text, language)
        self.search_engine.add_document(doc_id, processed, language, metadata)

    def search(
        self,
        query: str,
        target_languages: Optional[List[str]] = None,
        top_k: int = 10,
    ) -> List[Dict]:
        """Search with the same preprocessing that was applied at index time."""
        query_lang = self.language_detector.detect_language(query)
        processed_query = self.preprocessor.preprocess(query, query_lang)
        return self.search_engine.search(
            processed_query, query_lang, target_languages, top_k
        )
```

## 总结

多语言搜索要点：

1. 多语言表示：使用统一的多语言 Embedding
2. 语言检测：自动识别查询和文档语言
3. 查询翻译：可选择翻译查询到文档语言
4. 语言特定处理：分词、预处理等
5. 结果融合：综合多语言结果

实践建议：

- 使用预训练的多语言模型
- 保持查询和文档表示一致
- 考虑语言特定的优化
- 支持跨语言和单语言模式