# Title fragment (its beginning lies outside the visible text):
#   "... and Applications -- Large Model Evaluation and Benchmarks)"
#
# Large-model evaluation and benchmarks: MMLU, HELM, HumanEval, ChatBot Arena.
#
# Tutorial-style script: every section defines one demo function that prints a
# short summary and shows a matplotlib figure, and then calls it immediately.
# All user-facing strings are intentionally kept in Chinese.

# ---------------------------------------------------------------------------
# 1. Overview of large-model evaluation
# 1.1 Why dedicated evaluation is needed
# ---------------------------------------------------------------------------
import warnings

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle, FancyBboxPatch

warnings.filterwarnings('ignore')

print("=" * 60)
print("大模型评估衡量模型真实能力")
print("=" * 60)

# Hub-and-spoke diagram: one central node surrounded by the evaluation
# dimensions.
fig, ax = plt.subplots(figsize=(12, 8))
ax.axis('off')

# Central node.
center = plt.Circle((0.5, 0.5), 0.12, color='lightcoral', ec='black')
ax.add_patch(center)
ax.text(0.5, 0.5, '大模型\n评估', ha='center', va='center',
        fontsize=10, fontweight='bold')

# Evaluation dimensions, mapped to their (x, y) position in axes coordinates.
dimensions = {
    '知识理解': (0.15, 0.75),
    '推理能力': (0.85, 0.75),
    '代码生成': (0.15, 0.25),
    '数学能力': (0.85, 0.25),
    '多语言': (0.5, 0.85),
    '安全性': (0.5, 0.15),
}
for dim, (x, y) in dimensions.items():
    circle = plt.Circle((x, y), 0.1, color='lightblue', ec='black')
    ax.add_patch(circle)
    ax.text(x, y, dim, ha='center', va='center', fontsize=8)
    # Connect the dimension node to the centre.
    # NOTE(review): drawn as a plain line ('-'); confirm whether an arrow
    # head ('->') was intended.
    ax.annotate('', xy=(x, y), xytext=(0.5, 0.5),
                arrowprops=dict(arrowstyle='-', color='gray', lw=1, alpha=0.5))

ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_title('大模型评估维度', fontsize=14)
plt.tight_layout()
plt.show()

print("\n 大模型评估挑战:")
print("  1. 任务多样性需要覆盖各种能力")
print("  2. 避免数据污染防止训练数据泄露")
print("  3. 评估成本人工评估昂贵")
print("  4. 快速迭代模型更新频繁")


# ---------------------------------------------------------------------------
# 2. MMLU: Massive Multitask Language Understanding
# 2.1 MMLU overview
# ---------------------------------------------------------------------------
def mmlu_overview():
    """Print an MMLU summary and show its task distribution plus a sample item.

    Takes no arguments and returns None; the only effects are console output
    and one matplotlib figure (a pie chart next to a text panel).
    """
    print("\n" + "=" * 60)
    print("MMLU: Massive Multitask Language Understanding")
    print("=" * 60)

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: task distribution by category.
    ax1 = axes[0]
    categories = {'人文': 15, '社科': 15, 'STEM': 15, '其他': 12}
    colors = ['lightblue', 'lightgreen', 'lightcoral', 'lightyellow']
    # Dict views are materialised as lists so NumPy can coerce them to an
    # array inside ``pie``.
    ax1.pie(list(categories.values()), labels=list(categories.keys()),
            colors=colors, autopct='%1.0f%%')
    ax1.set_title('MMLU任务类别分布', fontsize=10)

    # Right panel: a sample question and dataset statistics as plain text.
    ax2 = axes[1]
    ax2.axis('off')
    ax2.set_title('MMLU示例', fontsize=11)
    example = '''
问题示例:

以下哪个是牛顿第一定律的描述

A. F = ma
B. 作用力与反作用力
C. 物体保持静止或匀速直线运动
D. 能量守恒

答案: C

---

数据统计:
• 57个任务
• 约14,000个问题
• 覆盖人文、社科、STEM等领域
• 4选1选择题格式
'''
    ax2.text(0.05, 0.95, example, transform=ax2.transAxes, fontsize=9,
             verticalalignment='top', fontfamily='monospace')

    plt.suptitle('MMLU多任务语言理解基准', fontsize=12)
    plt.tight_layout()
    plt.show()

    print("\n MMLU特点:")
    print("  - 覆盖57个学科领域")
    print("  - 约14,000道选择题")
    print("  - 测试模型的知识广度")
    print("  - 5-shot评估标准")


mmlu_overview()


# ---------------------------------------------------------------------------
# 2.2 MMLU evaluation
# ---------------------------------------------------------------------------
def mmlu_evaluation():
    """Show a bar chart of MMLU accuracies and print a sample evaluation script.

    The scores are the hard-coded illustrative values listed below. The
    evaluation script is only printed as text; it is never executed here.
    Returns None.
    """
    print("\n" + "=" * 60)
    print("MMLU评估结果")
    print("=" * 60)

    fig, ax = plt.subplots(figsize=(12, 6))

    models = ['GPT-3.5', 'GPT-4', 'Claude-2', 'LLaMA-2-70B', 'Gemini-Pro']
    scores = [70.0, 86.4, 75.0, 68.9, 71.8]
    colors = ['lightblue', 'lightgreen', 'lightcoral', 'lightyellow',
              'lightpink']

    bars = ax.bar(models, scores, color=colors)
    ax.set_ylabel('准确率 (%)')
    ax.set_title('MMLU基准测试结果')
    ax.set_ylim(60, 90)

    # Label every bar with its value, centred just above the bar top.
    for bar, score in zip(bars, scores):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
                f'{score}%', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

    print("\n MMLU评估代码:")
    code = '''
from lm_eval import evaluator
from lm_eval.tasks import initialize_tasks

# 加载MMLU任务
tasks = initialize_tasks(["mmlu"])

# 评估模型
results = evaluator.simple_evaluate(
    model="gpt-3.5-turbo",
    tasks=["mmlu"],
    num_fewshot=5,  # 5-shot
    limit=100  # 限制样本数
)

print(f"MMLU准确率: {results['results']['mmlu']['acc']:.2%}")
'''
    print(code)


mmlu_evaluation()


# ---------------------------------------------------------------------------
# 3. HELM: holistic evaluation
# 3.1 The HELM framework
# ---------------------------------------------------------------------------
def helm_overview():
    """Print a HELM summary and show its dimensions and scenarios.

    Left panel: one labelled circle per evaluation dimension.
    Right panel: a bulleted list of evaluation scenarios.
    Returns None.
    """
    print("\n" + "=" * 60)
    print("HELM: Holistic Evaluation of Language Models")
    print("=" * 60)

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: evaluation dimensions as (label, x, y) in axes coordinates.
    ax1 = axes[0]
    ax1.axis('off')
    ax1.set_title('HELM评估维度', fontsize=11)
    dimensions = [
        ('准确性', 0.2, 0.8),
        ('鲁棒性', 0.5, 0.8),
        ('公平性', 0.8, 0.8),
        ('效率', 0.2, 0.5),
        ('毒性', 0.5, 0.5),
        ('不确定性', 0.8, 0.5),
    ]
    for dim, x, y in dimensions:
        circle = plt.Circle((x, y), 0.08, color='lightblue', ec='black')
        ax1.add_patch(circle)
        ax1.text(x, y, dim, ha='center', va='center', fontsize=8)

    # Right panel: evaluation scenarios listed top to bottom.
    ax2 = axes[1]
    ax2.axis('off')
    ax2.set_title('HELM评估场景', fontsize=11)
    scenarios = [
        '• 问答 (Question Answering)',
        '• 信息检索 (Information Retrieval)',
        '• 摘要生成 (Summarization)',
        '• 情感分析 (Sentiment Analysis)',
        '• 毒性检测 (Toxicity Detection)',
        '• 代码生成 (Code Generation)',
    ]
    y_pos = 0.75
    for scenario in scenarios:
        ax2.text(0.1, y_pos, scenario, fontsize=8)
        y_pos -= 0.1

    plt.suptitle('HELM多维度综合评估', fontsize=12)
    plt.tight_layout()
    plt.show()

    print("\n HELM特点:")
    print("  - 多维度评估准确性、鲁棒性、公平性等")
    print("  - 多场景覆盖")
    print("  - 标准化评估流程")
    print("  - 开源评估框架")


helm_overview()


# ---------------------------------------------------------------------------
# 4. HumanEval: code generation
# 4.1 HumanEval overview
# ---------------------------------------------------------------------------
def humaneval_overview():
    """Print a HumanEval summary and show a sample task plus Pass@1 scores.

    Left panel: the text of a sample programming task.
    Right panel: bar chart of the hard-coded illustrative Pass@1 values.
    Returns None.
    """
    print("\n" + "=" * 60)
    print("HumanEval代码生成评估")
    print("=" * 60)

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: sample task, rendered as monospace text.
    ax1 = axes[0]
    ax1.axis('off')
    ax1.set_title('HumanEval示例', fontsize=11)
    example = '''
任务: 实现一个函数

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """
    检查列表中是否有两个元素之差小于阈值

    示例:
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
'''
    ax1.text(0.05, 0.95, example, transform=ax1.transAxes, fontsize=8,
             verticalalignment='top', fontfamily='monospace')

    # Right panel: Pass@1 results.
    ax2 = axes[1]
    models = ['GPT-3.5', 'GPT-4', 'Claude-2', 'CodeLLaMA-34B', 'StarCoder-15B']
    scores = [48.1, 82.0, 70.0, 48.8, 33.6]
    colors = ['lightblue', 'lightgreen', 'lightcoral', 'lightyellow',
              'lightpink']

    bars = ax2.bar(models, scores, color=colors)
    ax2.set_ylabel('Pass@1 (%)')
    ax2.set_title('HumanEval代码生成结果')
    ax2.set_ylim(0, 90)

    # Label every bar with its value, centred just above the bar top.
    for bar, score in zip(bars, scores):
        ax2.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                 f'{score}%', ha='center', va='bottom', fontsize=8)

    plt.suptitle('HumanEval代码生成能力评估', fontsize=12)
    plt.tight_layout()
    plt.show()

    print("\n HumanEval特点:")
    print("  - 164个手写编程问题")
    print("  - 测试函数正确性")
    print("  - Pass@k评估指标")
    print("  - 评估代码生成能力")


humaneval_overview()


# ---------------------------------------------------------------------------
# 4.2 The Pass@k metric
# ---------------------------------------------------------------------------
def pass_at_k():
    """Show the Pass@k formula and worked examples as a text-only figure.

    Nothing is computed here: the formula and the example values are static
    text. Returns None.
    """
    print("\n" + "=" * 60)
    print("Pass@k评估指标")
    print("=" * 60)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.axis('off')

    formula = '''
Pass@k 公式:

Pass@k = E[1 - C(n - c, k) / C(n, k)]

其中:
• n: 生成的样本总数
• c: 通过测试的样本数
• k: 考虑的前k个样本

解读:
• Pass@1: 第一个答案正确
• Pass@10: 前10个答案中有正确
• Pass@100: 前100个答案中有正确

评估示例:
n=100, c=50, k=1: Pass@1 = 0.5
n=100, c=50, k=10: Pass@10 ≈ 0.999
'''
    ax.text(0.05, 0.95, formula, transform=ax.transAxes, fontsize=10,
            verticalalignment='top', fontfamily='monospace')

    plt.tight_layout()
    plt.show()


pass_at_k()


# ---------------------------------------------------------------------------
# 5. ChatBot Arena
# 5.1 Arena-style evaluation
# ---------------------------------------------------------------------------
def chatbot_arena():
    """Print a ChatBot Arena summary and show its workflow plus an ELO chart.

    Left panel: the pairwise-comparison workflow drawn as labelled circles
    joined by arrows. Right panel: bar chart of the hard-coded illustrative
    ELO scores. Returns None.
    """
    print("\n" + "=" * 60)
    print("ChatBot Arena人类偏好评估")
    print("=" * 60)

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: workflow steps as (label, x, y) in axes coordinates.
    ax1 = axes[0]
    ax1.axis('off')
    ax1.set_title('ChatBot Arena评估流程', fontsize=11)
    steps = [
        ('用户提问', 0.2, 0.7),
        ('模型A回答', 0.5, 0.7),
        ('模型B回答', 0.8, 0.7),
        ('投票选择', 0.5, 0.4),
    ]
    for label, x, y in steps:
        circle = plt.Circle((x, y), 0.08, color='lightblue', ec='black')
        ax1.add_patch(circle)
        ax1.text(x, y, label, ha='center', va='center', fontsize=7)
        # Arrow towards the next node on the right.
        # NOTE(review): the vote node (x = 0.5) also satisfies this test, so
        # it gets a rightward arrow too -- confirm whether that is intended.
        if x < 0.8:
            ax1.annotate('', xy=(x + 0.28, y), xytext=(x + 0.1, y),
                         arrowprops=dict(arrowstyle='->', lw=1))
        # Arrow from the right-most node down towards the vote node.
        if x == 0.8:
            ax1.annotate('', xy=(0.5, 0.55), xytext=(0.8, 0.62),
                         arrowprops=dict(arrowstyle='->', lw=1))

    # Right panel: ELO leaderboard.
    ax2 = axes[1]
    models = ['GPT-4', 'Claude-3', 'Gemini-Pro', 'LLaMA-3-70B', 'GPT-3.5']
    elo_scores = [1250, 1220, 1150, 1120, 1080]
    colors = ['lightgreen', 'lightblue', 'lightcoral', 'lightyellow',
              'lightgray']

    bars = ax2.bar(models, elo_scores, color=colors)
    ax2.set_ylabel('ELO评分')
    ax2.set_title('ChatBot Arena排行榜')
    ax2.set_ylim(1000, 1300)

    # Label every bar with its score, centred just above the bar top.
    for bar, score in zip(bars, elo_scores):
        ax2.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5,
                 f'{score}', ha='center', va='bottom', fontsize=9)

    plt.suptitle('ChatBot Arena人类偏好排名', fontsize=12)
    plt.tight_layout()
    plt.show()

    print("\n ChatBot Arena特点:")
    print("  - 随机匿名对战")
    print("  - 人类投票选择")
    print("  - ELO评分系统")
    print("  - 反映真实用户体验")


chatbot_arena()


# ---------------------------------------------------------------------------
# 6. Evaluation tools
# 6.1 Commonly used evaluation frameworks
# ---------------------------------------------------------------------------
def evaluation_tools():
    """Print usage sketches for several evaluation frameworks.

    The snippets are only printed as text and never executed here, so none of
    the packages they mention needs to be installed. Returns None.
    """
    print("\n" + "=" * 60)
    print("大模型评估工具")
    print("=" * 60)

    code = '''
# 1. LM Evaluation Harness
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf-causal",
    model_args="pretrained=meta-llama/Llama-2-7b-hf",
    tasks=["mmlu", "hellaswag", "truthfulqa"],
    num_fewshot=5
)

# 2. OpenAI Evals
import evals

# 定义评估任务
eval = evals.Eval(
    name="my_eval",
    model="gpt-3.5-turbo",
    dataset="evals/registry/data/mmlu.jsonl"
)
results = eval.run()

# 3. HELM
from helm import HelmRunner

runner = HelmRunner(
    model="openai/gpt-3.5-turbo",
    scenario="mmlu"
)
results = runner.run()

# 4. AlpacaEval
from alpaca_eval import evaluate

results = evaluate(
    model_outputs="model_outputs.json",
    reference_outputs="reference_outputs.json"
)
'''
    print(code)


evaluation_tools()


# ---------------------------------------------------------------------------
# 7. Summary
#
#   Benchmark     | Task type         | Evaluation method | Focus
#   --------------+-------------------+-------------------+----------------------
#   MMLU          | Knowledge QA      | 5-shot            | Breadth of knowledge
#   HELM          | Multi-dimensional | Standardized      | Comprehensive review
#   HumanEval     | Code generation   | Pass@k            | Programming ability
#   ChatBot Arena | Dialogue          | Human voting      | User experience
#
# Choosing a benchmark:
#   knowledge               -> MMLU
#   overall capability      -> HELM
#   programming ability     -> HumanEval
#   conversation experience -> ChatBot Arena
# ---------------------------------------------------------------------------