"LLM-as-a-judge", simply put, means using an LLM to evaluate another LLM's responses against criteria you define; in other words, using an LLM to carry out evaluation of LLMs (or LLM systems). The paper "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena" proposed this approach as an alternative to human evaluation, which is both expensive and slow. LLM-as-a-judge comes in three main types:

- Single-output scoring without a reference: the judge scores one response against your criteria alone.
- Single-output scoring with a reference: the judge is also given an expected or ideal answer to compare against.
- Pairwise comparison: the judge is shown two responses and picks the better one.
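To make this concrete, here is a minimal sketch of the reference-free single-output case. The prompt template, the `judge_output` helper, and the `call_llm` callable are illustrative placeholders, not any particular library's API:

```python
JUDGE_PROMPT = """You are an impartial judge. Score the response below against
the criteria on a scale of 1-5, then justify the score in one sentence.

Criteria: {criteria}
Response: {response}

Answer in the form: "Score: <1-5>. Reason: <one sentence>"."""

def judge_output(response: str, criteria: str, call_llm) -> str:
    # call_llm is any function that sends a prompt to the judge LLM
    # and returns its completion as a string
    return call_llm(JUDGE_PROMPT.format(criteria=criteria, response=response))
```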
One way to implement this with off-the-shelf tooling is G-Eval, available in DeepEval. Install the package first:

```bash
pip install deepeval
```

Then define a test case and a G-Eval metric, and run the judge:

```python
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval

test_case = LLMTestCase(input="input to your LLM", actual_output="your LLM output")

coherence_metric = GEval(
    name="Coherence",
    criteria="Coherence - the collective quality of all sentences in the actual output",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)

# run the judge on the test case
coherence_metric.measure(test_case)
print(coherence_metric.score, coherence_metric.reason)
```
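Under the hood, G-Eval follows the G-Eval paper: it first uses chain-of-thought to expand your `criteria` into concrete evaluation steps, then scores the test case with those steps. If you would rather pin the steps down than let them be auto-generated, DeepEval's `GEval` also accepts `evaluation_steps` in place of `criteria`; a sketch, with steps invented for illustration:

```python
coherence_metric = GEval(
    name="Coherence",
    # supply explicit steps instead of letting G-Eval derive them
    evaluation_steps=[
        "Check whether the sentences in 'actual output' flow logically",
        "Penalize abrupt topic changes or contradictions between sentences",
    ],
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)
```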
G-Eval leaves the scoring rubric to the judge model. When the judgement can instead be broken into explicit steps, DeepEval's DAGMetric builds the judge as a directed acyclic graph of task, judgement, and verdict nodes. The example below scores whether a summary uses the headings 'intro', 'body', and 'conclusion' in the right order:

```python
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics.dag import (
    DeepAcyclicGraph,
    TaskNode,
    BinaryJudgementNode,
    NonBinaryJudgementNode,
    VerdictNode,
)
from deepeval.metrics import DAGMetric

# leaf judgement: are the headings in the expected order?
correct_order_node = NonBinaryJudgementNode(
    criteria="Are the summary headings in the correct order: 'intro' => 'body' => 'conclusion'?",
    children=[
        VerdictNode(verdict="Yes", score=10),
        VerdictNode(verdict="Two are out of order", score=4),
        VerdictNode(verdict="All out of order", score=2),
    ],
)

# gate: are all three headings present at all?
correct_headings_node = BinaryJudgementNode(
    criteria="Does the summary headings contain all three: 'intro', 'body', and 'conclusion'?",
    children=[
        VerdictNode(verdict=False, score=0),
        VerdictNode(verdict=True, child=correct_order_node),
    ],
)

# root task: extract the headings that the judgement nodes inspect
extract_headings_node = TaskNode(
    instructions="Extract all headings in `actual_output`",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    output_label="Summary headings",
    children=[correct_headings_node, correct_order_node],
)

# create the DAG
dag = DeepAcyclicGraph(root_nodes=[extract_headings_node])

# create the metric
format_correctness = DAGMetric(name="Format Correctness", dag=dag)

# create a test case and run the judge
test_case = LLMTestCase(input="your-original-text", actual_output="your-summary")
format_correctness.measure(test_case)
print(format_correctness.score, format_correctness.reason)
```
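Because every branch and leaf score is fixed in the graph, the LLM is only consulted for the narrow node-level tasks and judgements; given those verdicts, the final score is deterministic. In practice this tends to make a DAG judge easier to debug and audit than a single free-form judge prompt.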
The two approaches also compose: a VerdictNode can hand off to a G-Eval metric instead of returning a hard-coded score, so the fuzzier G-Eval judgement only runs once the format checks pass. First, define the G-Eval metric and build the graph:

```python
from deepeval.test_case import LLMTestCaseParams
from deepeval.metrics.dag import (
    DeepAcyclicGraph,
    TaskNode,
    BinaryJudgementNode,
    NonBinaryJudgementNode,
    VerdictNode,
)
from deepeval.metrics import DAGMetric, GEval

g_eval_summarization = GEval(
    name="Summarization",
    criteria="Determine how good a summary the 'actual output' is to the 'input'",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

correct_order_node = NonBinaryJudgementNode(
    criteria="Are the summary headings in the correct order: 'intro' => 'body' => 'conclusion'?",
    children=[
        # only correctly ordered summaries reach the G-Eval judgement
        VerdictNode(verdict="Yes", g_eval=g_eval_summarization),
        VerdictNode(verdict="Two are out of order", score=0),
        VerdictNode(verdict="All out of order", score=0),
    ],
)

correct_headings_node = BinaryJudgementNode(
    criteria="Does the summary headings contain all three: 'intro', 'body', and 'conclusion'?",
    children=[
        VerdictNode(verdict=False, score=0),
        VerdictNode(verdict=True, child=correct_order_node),
    ],
)

extract_headings_node = TaskNode(
    instructions="Extract all headings in `actual_output`",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    output_label="Summary headings",
    children=[correct_headings_node, correct_order_node],
)

# create the DAG
dag = DeepAcyclicGraph(root_nodes=[extract_headings_node])
```
Then create the DAG metric from this graph and run the evaluation:
```python
from deepeval.test_case import LLMTestCase
...

# create the metric
summarization = DAGMetric(name="Summarization", dag=dag)

# create a test case for summarization
test_case = LLMTestCase(input="your-original-text", actual_output="your-summary")

# run the judge
summarization.measure(test_case)
print(summarization.score, summarization.reason)
```
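In this combined setup the DAG acts as a deterministic gate: summaries that are missing headings, or have them out of order, score 0 without ever invoking G-Eval, while well-formed summaries receive G-Eval's finer-grained quality score.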