
# 长文本处理技术综述：突破上下文限制

## 前言

大模型的上下文窗口是有限的，但很多应用场景需要处理超长文本。如何高效处理长文本是大模型应用开发中的重要挑战。

我在项目中处理过各种长文本场景，从法律文档分析到代码仓库理解。今天分享一些常用的长文本处理技术。

## 文本分块技术

### 基于长度的分块

```python
class FixedSizeChunker:
    """固定大小分块"""

    def __init__(self, chunk_size: int = 512, overlap: int = 50):
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk(self, text: str) -> list:
        """分块"""
        words = text.split()
        chunks = []

        for i in range(0, len(words), self.chunk_size - self.overlap):
            chunk_words = words[i:i + self.chunk_size]
            chunks.append(' '.join(chunk_words))

        return chunks
```

### 基于语义的分块

```python
import re


class SemanticChunker:
    """语义分块"""

    def __init__(self, max_tokens: int = 512):
        self.max_tokens = max_tokens

    def chunk(self, text: str) -> list:
        """按语义边界分块"""
        # 按段落分割
        paragraphs = re.split(r'\n\n', text)

        chunks = []
        current_chunk = []
        current_size = 0

        for para in paragraphs:
            para_size = self._count_tokens(para)

            if current_size + para_size > self.max_tokens:
                if current_chunk:
                    chunks.append('\n\n'.join(current_chunk))
                current_chunk = [para]
                current_size = para_size
            else:
                current_chunk.append(para)
                current_size += para_size

        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))

        return chunks

    def _count_tokens(self, text: str) -> int:
        """估算 token 数量"""
        return len(text) // 4
```

## Map-Reduce 策略

```python
class MapReduceProcessor:
    """Map-Reduce 处理长文本"""

    def __init__(self, llm, chunk_size: int = 512):
        self.llm = llm
        self.chunker = FixedSizeChunker(chunk_size=chunk_size)

    def process(self, task: str, text: str) -> str:
        """处理长文本"""
        # Map 阶段：处理每个块
        chunks = self.chunker.chunk(text)
        summaries = []

        for i, chunk in enumerate(chunks):
            prompt = f"""任务：{task}

文本片段 {i+1}/{len(chunks)}：
{chunk}

请提取与任务相关的信息："""
            summary = self.llm.generate(prompt)
            summaries.append(summary)

        # Reduce 阶段：合并结果
        combined = '\n\n'.join(summaries)
        prompt = f"""任务：{task}

各部分分析结果：
{combined}

请综合以上信息，给出最终回答："""
        return self.llm.generate(prompt)
```

## 滑动窗口技术

```python
class SlidingWindowProcessor:
    """滑动窗口处理"""

    def __init__(self, llm, window_size: int = 512, step: int = 256):
        self.llm = llm
        self.window_size = window_size
        self.step = step

    def process(self, task: str, text: str) -> str:
        """滑动窗口处理"""
        words = text.split()
        results = []

        for i in range(0, len(words), self.step):
            window_words = words[i:i + self.window_size]
            window_text = ' '.join(window_words)

            prompt = f"""任务：{task}

文本：{window_text}

分析："""
            result = self.llm.generate(prompt)
            results.append(result)

        # 综合结果
        return self._synthesize(results, task)

    def _synthesize(self, results: list, task: str) -> str:
        """综合结果"""
        combined = '\n\n'.join(results)
        prompt = f"""基于以下分析结果，给出综合回答：

{combined}

任务：{task}

综合回答："""
        return self.llm.generate(prompt)
```

## 递归总结

```python
class RecursiveSummarizer:
    """递归总结"""

    def __init__(self, llm, target_length: int = 500):
        self.llm = llm
        self.target_length = target_length

    def summarize(self, text: str) -> str:
        """递归总结"""
        current_length = self._count_tokens(text)

        if current_length <= self.target_length:
            return text

        # 分割为两部分
        words = text.split()
        mid = len(words) // 2

        left = ' '.join(words[:mid])
        right = ' '.join(words[mid:])

        # 递归总结
        left_summary = self.summarize(left)
        right_summary = self.summarize(right)

        # 合并
        combined = f"{left_summary}\n\n{right_summary}"
        prompt = f"""请总结以下内容：

{combined}

总结："""
        return self.llm.generate(prompt)

    def _count_tokens(self, text: str) -> int:
        return len(text) // 4
```

## 实际应用

```python
class LongTextAnalyzer:
    """长文本分析器"""

    def __init__(self, llm):
        self.llm = llm
        self.processor = MapReduceProcessor(llm)

    def analyze_document(self, document_path: str, task: str) -> str:
        """分析文档"""
        with open(document_path, 'r') as f:
            text = f.read()

        return self.processor.process(task, text)

    def answer_question(self, document_path: str, question: str) -> str:
        """基于文档回答问题"""
        with open(document_path, 'r') as f:
            text = f.read()

        # 使用 RAG 风格
        chunks = FixedSizeChunker().chunk(text)

        # 找到最相关的块
        relevant_chunks = self._find_relevant(chunks, question)

        # 基于相关块回答
        context = '\n\n'.join(relevant_chunks)
        prompt = f"""基于以下内容回答问题：

{context}

问题：{question}

回答："""
        return self.llm.generate(prompt)

    def _find_relevant(self, chunks: list, query: str) -> list:
        """找到相关的块"""
        # 简化实现：返回前 3 个块
        return chunks[:3]
```

## 总结

长文本处理技术：

- 分块策略：固定大小 vs 语义分块
- Map-Reduce：分而治之的经典方法
- 滑动窗口：处理连续文本
- 递归总结：层次化压缩

关键要点：

- 根据任务选择合适的分块策略
- 保持一定的重叠，避免信息丢失
- 对于问答任务，优先检索相关部分
- 考虑使用向量数据库进行语义检索