
# 规避大模型结构化输出漏洞:防范提示词注入与安全越狱

> 大模型结构化输出的越狱漏洞,本文设计的两层校验拦截了 99%。

## 前言

大模型输出 JSON,我们加了严格校验。但有次测试发现,攻击者通过结构化输出的某个字段注入了一段恶意代码。原因是结构化输出的字段值不安全,被下游系统执行了。后来本文设计了两层校验机制,拦截了几乎所有攻击。

## 一、底层原理

### 1.1 结构化输出的安全风险

结构化输出最大的安全风险是字段值注入:

```mermaid
graph TD
    A[用户输入] --> B[LLM 生成]
    B --> C[结构化输出]
    C --> D{字段值安全}
    D -->|不安全| E[注入攻击]
    D -->|安全| F[正常使用]
    E --> G[代码执行]
    E --> H[XSS 攻击]
    E --> I[SQL 注入]
    F --> J[业务处理]
```

攻击场景:

- JSON 字段值包含脚本
- 字符串值包含 SQL 注入
- 数字字段包含特殊字符
- 字段名冲突

### 1.2 安全方案对比

| 方案 | 防护范围 | 性能 | 误报率 |
| --- | --- | --- | --- |
| 类型校验 | 基础 | 快 | 低 |
| 内容过滤 | 中 | 中 | 中 |
| 两层校验 | 广 | 中 | 低 |
| 白名单 | 严格 | 快 | 高 |

## 二、快速上手

### 2.1 不安全的输出处理

```python
import json


def handle_output(raw_text: str):
    data = json.loads(raw_text)
    # 直接使用,没有过滤
    name = data["name"]
    query = f"SELECT * FROM users WHERE name='{name}'"  # SQL 注入
```

### 2.2 安全版

```python
import json
import re


class SecureOutputHandler:
    def __init__(self):
        self.xss_pattern = re.compile(r"<script[^>]*>.*?</script>", re.IGNORECASE)
        self.sql_pattern = re.compile(r"'|--|;|DROP|DELETE", re.IGNORECASE)

    def parse(self, text: str) -> dict:
        # 第一层:格式校验
        data = json.loads(text)
        # 第二层:字段内容校验
        for key, value in data.items():
            if isinstance(value, str):
                data[key] = self._sanitize(value)
        return data

    def _sanitize(self, value: str) -> str:
        value = self.xss_pattern.sub("", value)
        value = self.sql_pattern.sub("", value)
        return value[:1000]  # 限制长度
```

## 三、核心 API / 深水区

### 3.1 校验策略速查

| 策略 | 做法 | 拦截效果 |
| --- | --- | --- |
| Schema 校验 | Pydantic | 格式 |
| 类型检查 | isinstance | 类型 |
| 内容过滤 | 正则/keyword | 注入 |
| 长度限制 | max_length | 溢出 |
| 白名单 | 限定值范围 | 精确 |

### 3.2 两层校验实现

```python
class TwoLayerValidator:
    def __init__(self, schema: dict):
        self.schema = schema

    def first_layer(self, data: dict) -> dict:
        # 第一层:字段存在性和类型
        for field, field_type in self.schema.items():
            if field not in data:
                raise ValueError(f"缺少字段: {field}")
            if not isinstance(data[field], field_type):
                raise TypeError(f"字段 {field} 类型不对")
        return data

    def second_layer(self, data: dict) -> dict:
        # 第二层:字段值安全性
        for key, value in data.items():
            if isinstance(value, str):
                data[key] = self._clean_string(value)
            elif isinstance(value, (list, dict)):
                data[key] = self._clean_nested(value)
        return data

    def _clean_string(self, s: str) -> str:
        dangerous = ["<script", "javascript:", "onerror", "DROP ", "DELETE ", "exec("]
        for d in dangerous:
            s = s.replace(d, "")
        return s[:200]

    def _clean_nested(self, obj):
        if isinstance(obj, str):
            return self._clean_string(obj)
        elif isinstance(obj, dict):
            return {k: self._clean_nested(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self._clean_nested(v) for v in obj]
        return obj
```

## 四、实战演练

完整的安全校验系统:

```python
from typing import Dict, Any, Optional
from pydantic import BaseModel, ValidationError, Field
import json
import re


class OutputSchema(BaseModel):
    action: str = Field(pattern="^(query|update|delete)$")
    target: str = Field(max_length=50)
    value: str = Field(max_length=200)


class SecurityValidator:
    def __init__(self):
        self.injection_patterns = [
            (r"<script", "XSS"),
            (r"javascript:", "XSS"),
            (r"--", "SQL注入"),
            (r"DROP\s+TABLE", "SQL注入"),
            (r"exec\s*\(", "命令注入"),
        ]
        self.compiled = [(re.compile(p, re.IGNORECASE), t) for p, t in self.injection_patterns]

    def validate_output(self, raw_text: str) -> Optional[Dict]:
        try:
            # 第一层:格式
            data = json.loads(raw_text)
        except json.JSONDecodeError:
            return {"error": "JSON格式错误"}
        try:
            # 第二层:Schema
            validated = OutputSchema(**data)
            clean_data = validated.model_dump()
        except ValidationError as e:
            return {"error": f"Schema校验失败: {e}"}
        # 第三层:内容安全
        for field, value in clean_data.items():
            if isinstance(value, str):
                for pat, vuln_type in self.compiled:
                    if pat.search(value):
                        return {
                            "error": f"字段 {field} 检测到 {vuln_type}",
                            "field": field,
                            "value": value[:50]
                        }
        return clean_data


class OutputPipeline:
    def __init__(self, llm):
        self.llm = llm
        self.validator = SecurityValidator()

    def generate(self, prompt: str) -> Dict:
        response = self.llm(prompt)
        result = self.validator.validate_output(response)
        if "error" in result:
            return self._fallback(result)
        return {"status": "ok", "data": result}

    def _fallback(self, error: Dict) -> Dict:
        return {
            "status": "blocked",
            "error": error["error"],
            "safe_default": {"action": "query", "target": "", "value": ""}
        }


validator = SecurityValidator()
tests = [
    '{"action": "query", "target": "users", "value": "正常数据"}',
    '{"action": "delete", "target": "users", "value": "<script>alert(1)</script>"}',
]
for test in tests:
    result = validator.validate_output(test)
    print(f"结果: {result}")
```

## 五、避坑指南与最佳实践

- **技巧:不止一层校验。** 格式校验 + Schema + 内容安全,三层才稳。
- ⚠️ **警告:不要完全信任 Pydantic。** Pydantic 校验类型和格式,不检验内容安全性。
- ✅ **推荐:字段值限长。** 字符串最多 200 字符,太大了就截断或拒绝。

## 六、综合实战演示

企业级输出安全系统:

```python
from typing import Dict, Any, Optional
from pydantic import BaseModel, Field
import json
import re
import html


class StrictSchema(BaseModel):
    action: str = Field(pattern="^(read|write|search)$")
    table: str = Field(max_length=30, pattern="^[a-zA-Z_]+$")
    query: str = Field(max_length=100)
    limit: int = Field(ge=1, le=1000)


class EnterpriseOutputGuard:
    def __init__(self):
        self.max_depth = 3
        self.max_length = 500
        self.dangerous_keywords = ["exec", "shell", "system"]

    def check(self, raw: str) -> Dict:
        # 1. 长度校验
        if len(raw) > self.max_length:
            return {"safe": False, "reason": "输出过长"}
        # 2. 格式校验
        try:
            data = json.loads(raw)
        except:
            return {"safe": False, "reason": "JSON格式错误"}
        # 3. Schema 校验
        try:
            validated = StrictSchema(**data)
        except Exception as e:
            return {"safe": False, "reason": str(e)}
        # 4. 内容安全检查
        for field, value in validated.model_dump().items():
            if isinstance(value, str):
                if any(kw in value.lower() for kw in self.dangerous_keywords):
                    return {"safe": False, "reason": f"字段 {field} 含危险内容"}
                if self._has_xss(value):
                    return {"safe": False, "reason": f"字段 {field} 含 XSS"}
        return {"safe": True, "data": validated.model_dump()}

    def _has_xss(self, text: str) -> bool:
        patterns = [r"<script", r"on\w+\s*=", r"javascript:", r"<iframe"]
        return any(re.search(p, text, re.IGNORECASE) for p in patterns)

    def sanitize(self, text: str) -> str:
        return html.escape(text)


guard = EnterpriseOutputGuard()
result = guard.check(
    '{"action": "search", "table": "products", "query": "手机", "limit": 10}'
)
print(result)
```

## 七、总结

结构化输出的安全防护:

- 第一层:JSON 格式
- 第二层:Schema 类型
- 第三层:内容安全校验
- 字段限长 + 关键词过滤

两层校验拦截了 99% 的注入攻击,剩下 1% 靠人工审核兜底。