自动化LLM测试:GPT-4评估与Google Sheets跟踪
这是一个工程(Engineering)与 AI 摘要(AI Summarization)领域的自动化工作流,包含 17 个节点,主要使用 Set、Limit、Merge、Webhook、HttpRequest 等节点,用于自动化 LLM 测试:由 GPT-4 评估模型输出,并通过 Google Sheets 跟踪结果。
- HTTP Webhook 端点(n8n 会自动生成)
- 可能需要目标 API 的认证凭证
- Google Sheets API 凭证
- OpenRouter API 凭证(用于调用评判模型)
使用的节点 (17)
{
"meta": {
"instanceId": "45e293393b5dd8437fb351e5b1ef5511ef67e6e0826a1c10b9b68be850b67593"
},
"nodes": [
{
"id": "2dbc4a8a-4fb6-4679-9d96-2724f79fbac1",
"name": "合并",
"type": "n8n-nodes-base.merge",
"position": [
1980,
600
],
"parameters": {
"mode": "combine",
"options": {},
"combineBy": "combineByPosition"
},
"typeVersion": 3.1
},
{
"id": "146a6af3-58ec-4555-9202-3ce87a83af28",
"name": "结构化输出解析器",
"type": "@n8n/n8n-nodes-langchain.outputParserStructured",
"position": [
1540,
520
],
"parameters": {
"jsonSchemaExample": "{\n \"reasoning\": \"The Assistant fabricated a $1 million figure and a 12-month provision that are not found in the source. This breaches factual correctness and completeness. The output would mislead business stakeholders if used without correction.\",\n \"decision\": \"Fail\"\n}"
},
"typeVersion": 1.2
},
{
"id": "83da8236-e5fb-4847-8033-6559f575c7ff",
"name": "更新结果",
"type": "n8n-nodes-base.googleSheets",
"position": [
960,
200
],
"parameters": {
"columns": {
"value": {
"ID": "={{ $json.ID }}",
"Input": "={{ $json.Input }}",
"Output": "={{ $json.Output }}",
"Decision": "={{ $json.output.decision }}",
"Test No.": "={{ $json[\"Test No.\"] }}",
"Reasoning": "={{ $json.output.reasoning }}",
"AI Platform": "={{ $json[\"AI Platform\"] }}",
"Reference Answer": "={{ $json[\"Reference Answer\"] }}"
},
"schema": [
{
"id": "ID",
"type": "string",
"display": true,
"removed": false,
"required": false,
"displayName": "ID",
"defaultMatch": false,
"canBeUsedToMatch": true
},
{
"id": "Test No.",
"type": "string",
"display": true,
"required": false,
"displayName": "Test No.",
"defaultMatch": false,
"canBeUsedToMatch": true
},
{
"id": "AI Platform",
"type": "string",
"display": true,
"required": false,
"displayName": "AI Platform",
"defaultMatch": false,
"canBeUsedToMatch": true
},
{
"id": "Input",
"type": "string",
"display": true,
"required": false,
"displayName": "Input",
"defaultMatch": false,
"canBeUsedToMatch": true
},
{
"id": "Output",
"type": "string",
"display": true,
"required": false,
"displayName": "Output",
"defaultMatch": false,
"canBeUsedToMatch": true
},
{
"id": "Reference Answer",
"type": "string",
"display": true,
"required": false,
"displayName": "Reference Answer",
"defaultMatch": false,
"canBeUsedToMatch": true
},
{
"id": "Decision",
"type": "string",
"display": true,
"required": false,
"displayName": "Decision",
"defaultMatch": false,
"canBeUsedToMatch": true
},
{
"id": "Reasoning",
"type": "string",
"display": true,
"required": false,
"displayName": "Reasoning",
"defaultMatch": false,
"canBeUsedToMatch": true
}
],
"mappingMode": "defineBelow",
"matchingColumns": [
"ID"
],
"attemptToConvertTypes": false,
"convertFieldsToString": false
},
"options": {},
"operation": "appendOrUpdate",
"sheetName": {
"__rl": true,
"mode": "list",
"value": 537199982,
"cachedResultUrl": "https://docs.google.com/spreadsheets/d/1c73be3fHkKr0DVJYIt9qlNfJcfuUV6DTShp93fa55Ig/edit#gid=537199982",
"cachedResultName": "Results"
},
"documentId": {
"__rl": true,
"mode": "url",
"value": "https://docs.google.com/spreadsheets/d/1c73be3fHkKr0DVJYIt9qlNfJcfuUV6DTShp93fa55Ig/edit?usp=sharing"
}
},
"credentials": {
"googleSheetsOAuth2Api": {
"id": "04iXS2lwUVyzn6F2",
"name": "Google Sheets account"
}
},
"typeVersion": 4.5
},
{
"id": "824c06fb-9104-4c65-a77f-33db0167c0f6",
"name": "便签4",
"type": "n8n-nodes-base.stickyNote",
"position": [
560,
-20
],
"parameters": {
"color": 4,
"height": 720,
"content": "## 2. 执行子工作流"
},
"typeVersion": 1
},
{
"id": "3a20e99f-b183-4362-b909-2fffdd48d0d2",
"name": "便签8",
"type": "n8n-nodes-base.stickyNote",
"position": [
-680,
160
],
"parameters": {
"width": 460,
"height": 280,
"content": "## 数据格式"
},
"typeVersion": 1
},
{
"id": "16fe7cb7-ca24-40f1-855b-e1867bf29b56",
"name": "便签 9",
"type": "n8n-nodes-base.stickyNote",
"position": [
0,
-20
],
"parameters": {
"color": 6,
"width": 360,
"height": 180,
"content": "## 1. 获取测试用例"
},
"typeVersion": 1
},
{
"id": "86f611e8-ca94-4b9f-a858-45d0fcbdfcfa",
"name": "便签15",
"type": "n8n-nodes-base.stickyNote",
"position": [
900,
-20
],
"parameters": {
"color": 6,
"width": 260,
"height": 180,
"content": "## 4. 更新结果"
},
"typeVersion": 1
},
{
"id": "caa54653-920b-4d4f-abb6-bab54c64350b",
"name": "便签16",
"type": "n8n-nodes-base.stickyNote",
"position": [
1320,
-20
],
"parameters": {
"color": 4,
"width": 360,
"height": 340,
"content": "## 3. 评判LLM输出"
},
"typeVersion": 1
},
{
"id": "9b22fb78-d6fa-4dad-a543-1b02828d2f2e",
"name": "限制",
"type": "n8n-nodes-base.limit",
"disabled": true,
"position": [
360,
220
],
"parameters": {
"maxItems": 3
},
"typeVersion": 1
},
{
"id": "faad2c18-defc-4644-b9a3-3650c26f5891",
"name": "提取数据",
"type": "n8n-nodes-base.set",
"position": [
1000,
400
],
"parameters": {
"mode": "raw",
"options": {},
"jsonOutput": "={{ $json.body }}"
},
"typeVersion": 3.4
},
{
"id": "ec8629e4-7715-410c-aa6d-560fd284a1ca",
"name": "获取测试",
"type": "n8n-nodes-base.googleSheets",
"position": [
140,
220
],
"parameters": {
"options": {},
"sheetName": {
"__rl": true,
"mode": "list",
"value": "gid=0",
"cachedResultUrl": "https://docs.google.com/spreadsheets/d/1c73be3fHkKr0DVJYIt9qlNfJcfuUV6DTShp93fa55Ig/edit#gid=0",
"cachedResultName": "Tests"
},
"documentId": {
"__rl": true,
"mode": "url",
"value": "https://docs.google.com/spreadsheets/d/1c73be3fHkKr0DVJYIt9qlNfJcfuUV6DTShp93fa55Ig/edit?usp=sharing"
}
},
"credentials": {
"googleSheetsOAuth2Api": {
"id": "04iXS2lwUVyzn6F2",
"name": "Google Sheets account"
}
},
"typeVersion": 4.5
},
{
"id": "d7160cac-8bea-4464-bfea-00c785b8ac7e",
"name": "执行子工作流",
"type": "n8n-nodes-base.httpRequest",
"onError": "continueErrorOutput",
"maxTries": 2,
"position": [
620,
220
],
"parameters": {
"url": "https://webhook-processor-production-48f8.up.railway.app/webhook/llm-as-a-judge",
"method": "POST",
"options": {
"batching": {
"batch": {
"batchSize": 1,
"batchInterval": 500
}
}
},
"jsonBody": "={{ $json }}",
"sendBody": true,
"specifyBody": "json"
},
"retryOnFail": false,
"typeVersion": 4.2
},
{
"id": "6920a43b-bdbf-47c0-a644-1f75375e1127",
"name": "Webhook",
"type": "n8n-nodes-base.webhook",
"position": [
620,
480
],
"webhookId": "1cbce320-d28e-4e97-8663-bf2c6a36a358",
"parameters": {
"path": "llm-as-a-judge",
"options": {},
"httpMethod": "POST",
"responseData": "allEntries",
"responseMode": "lastNode"
},
"typeVersion": 2
},
{
"id": "70cc9edd-f481-420e-bfcc-02b25f4353db",
"name": "基础LLM链",
"type": "@n8n/n8n-nodes-langchain.chainLlm",
"onError": "continueErrorOutput",
"position": [
1380,
340
],
"parameters": {
"text": "=INPUT:\n\n{\n \"task\": {{ $('提取数据').item.json['Input'] }},\n \"answer_key\": {{ $('提取数据').item.json['Reference Answer'] }},\n \"output\": {{ $('提取数据').item.json['Output'] }}\n}\n\nOUTPUT:",
"messages": {
"messageValues": [
{
"message": "=## Context\n\nYou are an evaluator of LLMs in the legal domain.\n\n## Inputs Provided for Each Task\n\n- task: The legal question or instruction.\n- answer_key: The correct answer for this task, found in the answer key column of the same Google Sheet.\n- output: The answer generated by the AI Assistant.\n\n\n## Evaluation Rules\n\nGrade the AI Assistant's output as Pass or Fail by comparing it ONLY to the answer_key for that task.\n\nDo not use or reference the original source material or any other information.\n\n## Criteria for Pass\n\n1. Factual Correctness\n- The output must accurately reflect the information in the answer_key.\n- Minor differences in paraphrasing, wording, or formatting (including clause numbering, references, or synonyms) are acceptable if the substantive information matches the answer_key.\n- If the answer key provides multiple possible correct answers (e.g., separated by \"OR\"), any output that matches any one of the alternatives is acceptable.\n\n\n2. Relevance to the Query\n- The output must directly answer the task as covered in the answer_key.\n- Do not introduce unrelated or off-topic information.\n\n\n3. Completeness\n- If the output contains extra information that does not contradict or misrepresent the answer key, it is acceptable.\n- Omitting any critical point present in the answer_key = Fail.\n\n\n## Key Rule\n- If the output materially fails any one of the three requirements compared to the answer_key, grade as Fail.\n- Minor paraphrasing or stylistic differences are acceptable if the substantive meaning is identical.\n\n\n## Required Output Format\n\nYour evaluation must be provided in JSON with two keys only:\n\n- decision: Pass or Fail\n- reasoning: A brief explanation, strictly comparing the output to the answer_key.\n\n\n### Example Input 1\n\n{\n \"task\": \"Extract the liability cap and time-based provisions from a limitation of liability clause.\",\n \"answer_key\": \"The liability cap is $1 million with a 12-month limit.\",\n \"output\": \"The liability cap is $1 million with a 12-month limit.\"\n}\n\n### Example Output 1\n\n{\n \"output\": {\n \"decision\": \"Pass\",\n \"reasoning\": \"The output exactly matches the answer key, so it is factually correct, relevant, and complete.\"\n }\n}\n\n### Example Input 2\n\n{\n \"task\": \"Extract the liability cap and time-based provisions from a limitation of liability clause.\",\n \"answer_key\": \"The liability cap is $1 million with a 12-month limit.\",\n \"output\": \"The liability cap is $2 million and there is no time limit.\"\n}\n\n### Example Output 2\n\n{\n \"output\": {\n \"decision\": \"Fail\",\n \"reasoning\": \"The output gives a $2 million cap and omits the 12-month limit from the answer key. This fails both factual correctness and completeness.\"\n }\n}\n\n### Example Input 3\n\n{\n \"task\": \"State the governing law.\",\n \"answer_key\": \"Singapore law.\",\n \"output\": \"This agreement is governed by Singapore law. All disputes will be subject to the exclusive jurisdiction of Singapore courts.\"\n}\n\n### Example Output 3\n\n{\n \"output\": {\n \"reasoning\": \"All required information from the answer_key is present. The extra information does not contradict or misrepresent the answer_key.\",\n \"decision\": \"Pass\"\n }\n}\n\n### Example Input 4\n\n{\n \"task\": \"Identify the relevant clause.\",\n \"answer_key\": \"Clause 5\",\n \"output\": \"clause 5\"\n}\n\n### Example Output 4\n\n{\n \"output\": {\n \"reasoning\": \"The output matches the answer key despite minor formatting differences.\",\n \"decision\": \"Pass\"\n }\n}\n\n### Example Input 5\n\n{\n \"task\": \"Extract the parties to the contract.\",\n \"answer_key\": \"Company A and Company B OR The Buyer and the Seller\",\n \"output\": \"The Buyer and the Seller\"\n}\n\n### Example Output 5\n\n{\n \"output\": {\n \"reasoning\": \"The output matches one of the acceptable answer_key alternatives.\",\n \"decision\": \"Pass\"\n }\n}\n\n## Reminder\nAlways grade solely by comparison to the answer_key column for each task in the input data."
}
]
},
"promptType": "define",
"hasOutputParser": true
},
"typeVersion": 1.4
},
{
"id": "f4ddb551-cbaa-4c2d-96ca-3769a199ce1a",
"name": "OpenRouter 聊天模型",
"type": "@n8n/n8n-nodes-langchain.lmChatOpenRouter",
"position": [
1380,
520
],
"parameters": {
"model": "openai/gpt-4.1",
"options": {}
},
"credentials": {
"openRouterApi": {
"id": "ipzDVYsZqbum9bX4",
"name": "OpenRouter account 2"
}
},
"typeVersion": 1
},
{
"id": "b8eedf4a-eb85-4b4a-ad4b-61d9d31984c1",
"name": "保留原始数据",
"type": "n8n-nodes-base.set",
"position": [
1480,
820
],
"parameters": {
"mode": "raw",
"options": {},
"jsonOutput": "={{ $json.body }}"
},
"typeVersion": 3.4
},
{
"id": "69c41be1-ff93-4098-8b9d-cd5cc88d9271",
"name": "手动触发器",
"type": "n8n-nodes-base.manualTrigger",
"position": [
-80,
220
],
"parameters": {},
"typeVersion": 1
}
],
"pinData": {},
"connections": {
"限制": {
"main": [
[
{
"node": "执行子工作流",
"type": "main",
"index": 0
}
]
]
},
"Webhook": {
"main": [
[
{
"node": "保留原始数据",
"type": "main",
"index": 0
},
{
"node": "提取数据",
"type": "main",
"index": 0
}
]
]
},
"获取测试": {
"main": [
[
{
"node": "限制",
"type": "main",
"index": 0
}
]
]
},
"提取数据": {
"main": [
[
{
"node": "基础LLM链",
"type": "main",
"index": 0
}
]
]
},
"手动触发器": {
"main": [
[
{
"node": "获取测试",
"type": "main",
"index": 0
}
]
]
},
"基础LLM链": {
"main": [
[
{
"node": "合并",
"type": "main",
"index": 0
}
]
]
},
"保留原始数据": {
"main": [
[
{
"node": "合并",
"type": "main",
"index": 1
}
]
]
},
"执行子工作流": {
"main": [
[
{
"node": "更新结果",
"type": "main",
"index": 0
}
]
]
},
"OpenRouter 聊天模型": {
"ai_languageModel": [
[
{
"node": "基础LLM链",
"type": "ai_languageModel",
"index": 0
}
]
]
},
"结构化输出解析器": {
"ai_outputParser": [
[
{
"node": "基础LLM链",
"type": "ai_outputParser",
"index": 0
}
]
]
}
}
}
如何使用这个工作流?
复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。
这个工作流适合什么场景?
高级 - 工程, AI 摘要总结
需要付费吗?
本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenRouter / OpenAI API、Google Sheets API)可能需要您自行付费。
相关工作流推荐
Adam Janes
@adamjanes
I am a product-minded technologist with hacker DNA building things in AI automation. I have a broad and varied background - having worked in Product, Design, and Sales - combined with deep technical experience as a Senior Developer and Fractional CTO. I am also a best-selling Udemy instructor (with 25K+ students), and founder of WOOFCODE - a free coding camp for fullstack developers. I practice non-violent communication, motivational interviewing, and Tibetan Buddhist meditation.
分享此工作流