8
n8n 中文网amn8n.com

自动化LLM测试:GPT-4评估与Google Sheets跟踪

高级

这是一个Engineering, AI Summarization领域的自动化工作流,包含 17 个节点。主要使用 Set, Limit, Merge, Webhook, HttpRequest 等节点。 自动化LLM测试:GPT-4评估与Google Sheets跟踪

前置要求
  • HTTP Webhook 端点(n8n 会自动生成)
  • 可能需要目标 API 的认证凭证
  • Google Sheets API 凭证
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "meta": {
    "instanceId": "45e293393b5dd8437fb351e5b1ef5511ef67e6e0826a1c10b9b68be850b67593"
  },
  "nodes": [
    {
      "id": "2dbc4a8a-4fb6-4679-9d96-2724f79fbac1",
      "name": "合并",
      "type": "n8n-nodes-base.merge",
      "position": [
        1980,
        600
      ],
      "parameters": {
        "mode": "combine",
        "options": {},
        "combineBy": "combineByPosition"
      },
      "typeVersion": 3.1
    },
    {
      "id": "146a6af3-58ec-4555-9202-3ce87a83af28",
      "name": "结构化输出解析器",
      "type": "@n8n/n8n-nodes-langchain.outputParserStructured",
      "position": [
        1540,
        520
      ],
      "parameters": {
        "jsonSchemaExample": "{\n  \"reasoning\": \"The Assistant fabricated a $1 million figure and a 12-month provision that are not found in the source. This breaches factual correctness and completeness. The output would mislead business stakeholders if used without correction.\",\n  \"decision\": \"Fail\"\n}"
      },
      "typeVersion": 1.2
    },
    {
      "id": "83da8236-e5fb-4847-8033-6559f575c7ff",
      "name": "更新结果",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        960,
        200
      ],
      "parameters": {
        "columns": {
          "value": {
            "ID": "={{ $json.ID }}",
            "Input": "={{ $json.Input }}",
            "Output": "={{ $json.Output }}",
            "Decision": "={{ $json.output.decision }}",
            "Test No.": "={{ $json[\"Test No\"][\"\"] }}",
            "Reasoning": "={{ $json.output.reasoning }}",
            "AI Platform": "={{ $json[\"AI Platform\"] }}",
            "Reference Answer": "={{ $json[\"Reference Answer\"] }}"
          },
          "schema": [
            {
              "id": "ID",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "ID",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Test No.",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "Test No.",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "AI Platform",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "AI Platform",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Input",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "Input",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Output",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "Output",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Reference Answer",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "Reference Answer",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Decision",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "Decision",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Reasoning",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "Reasoning",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            }
          ],
          "mappingMode": "defineBelow",
          "matchingColumns": [
            "ID"
          ],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {},
        "operation": "appendOrUpdate",
        "sheetName": {
          "__rl": true,
          "mode": "list",
          "value": 537199982,
          "cachedResultUrl": "https://docs.google.com/spreadsheets/d/1c73be3fHkKr0DVJYIt9qlNfJcfuUV6DTShp93fa55Ig/edit#gid=537199982",
          "cachedResultName": "Results"
        },
        "documentId": {
          "__rl": true,
          "mode": "url",
          "value": "https://docs.google.com/spreadsheets/d/1c73be3fHkKr0DVJYIt9qlNfJcfuUV6DTShp93fa55Ig/edit?usp=sharing"
        }
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "04iXS2lwUVyzn6F2",
          "name": "Google Sheets account"
        }
      },
      "typeVersion": 4.5
    },
    {
      "id": "824c06fb-9104-4c65-a77f-33db0167c0f6",
      "name": "便签4",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        560,
        -20
      ],
      "parameters": {
        "color": 4,
        "height": 720,
        "content": "## 2. 执行子工作流"
      },
      "typeVersion": 1
    },
    {
      "id": "3a20e99f-b183-4362-b909-2fffdd48d0d2",
      "name": "便签8",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -680,
        160
      ],
      "parameters": {
        "width": 460,
        "height": 280,
        "content": "## 数据格式"
      },
      "typeVersion": 1
    },
    {
      "id": "16fe7cb7-ca24-40f1-855b-e1867bf29b56",
      "name": "便签 9",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        0,
        -20
      ],
      "parameters": {
        "color": 6,
        "width": 360,
        "height": 180,
        "content": "## 1. 获取测试用例"
      },
      "typeVersion": 1
    },
    {
      "id": "86f611e8-ca94-4b9f-a858-45d0fcbdfcfa",
      "name": "便签15",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        900,
        -20
      ],
      "parameters": {
        "color": 6,
        "width": 260,
        "height": 180,
        "content": "## 4. 更新结果"
      },
      "typeVersion": 1
    },
    {
      "id": "caa54653-920b-4d4f-abb6-bab54c64350b",
      "name": "便签16",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        1320,
        -20
      ],
      "parameters": {
        "color": 4,
        "width": 360,
        "height": 340,
        "content": "## 3. 评判LLM输出"
      },
      "typeVersion": 1
    },
    {
      "id": "9b22fb78-d6fa-4dad-a543-1b02828d2f2e",
      "name": "限制",
      "type": "n8n-nodes-base.limit",
      "disabled": true,
      "position": [
        360,
        220
      ],
      "parameters": {
        "maxItems": 3
      },
      "typeVersion": 1
    },
    {
      "id": "faad2c18-defc-4644-b9a3-3650c26f5891",
      "name": "提取数据",
      "type": "n8n-nodes-base.set",
      "position": [
        1000,
        400
      ],
      "parameters": {
        "mode": "raw",
        "options": {},
        "jsonOutput": "={{ $json.body }}"
      },
      "typeVersion": 3.4
    },
    {
      "id": "ec8629e4-7715-410c-aa6d-560fd284a1ca",
      "name": "获取测试",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        140,
        220
      ],
      "parameters": {
        "options": {},
        "sheetName": {
          "__rl": true,
          "mode": "list",
          "value": "gid=0",
          "cachedResultUrl": "https://docs.google.com/spreadsheets/d/1c73be3fHkKr0DVJYIt9qlNfJcfuUV6DTShp93fa55Ig/edit#gid=0",
          "cachedResultName": "Tests"
        },
        "documentId": {
          "__rl": true,
          "mode": "url",
          "value": "https://docs.google.com/spreadsheets/d/1c73be3fHkKr0DVJYIt9qlNfJcfuUV6DTShp93fa55Ig/edit?usp=sharing"
        }
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "04iXS2lwUVyzn6F2",
          "name": "Google Sheets account"
        }
      },
      "typeVersion": 4.5
    },
    {
      "id": "d7160cac-8bea-4464-bfea-00c785b8ac7e",
      "name": "执行子工作流",
      "type": "n8n-nodes-base.httpRequest",
      "onError": "continueErrorOutput",
      "maxTries": 2,
      "position": [
        620,
        220
      ],
      "parameters": {
        "url": "https://webhook-processor-production-48f8.up.railway.app/webhook/llm-as-a-judge",
        "method": "POST",
        "options": {
          "batching": {
            "batch": {
              "batchSize": 1,
              "batchInterval": 500
            }
          }
        },
        "jsonBody": "={{ $json }}",
        "sendBody": true,
        "specifyBody": "json"
      },
      "retryOnFail": false,
      "typeVersion": 4.2
    },
    {
      "id": "6920a43b-bdbf-47c0-a644-1f75375e1127",
      "name": "Webhook",
      "type": "n8n-nodes-base.webhook",
      "position": [
        620,
        480
      ],
      "webhookId": "1cbce320-d28e-4e97-8663-bf2c6a36a358",
      "parameters": {
        "path": "llm-as-a-judge",
        "options": {},
        "httpMethod": "POST",
        "responseData": "allEntries",
        "responseMode": "lastNode"
      },
      "typeVersion": 2
    },
    {
      "id": "70cc9edd-f481-420e-bfcc-02b25f4353db",
      "name": "基础LLM链",
      "type": "@n8n/n8n-nodes-langchain.chainLlm",
      "onError": "continueErrorOutput",
      "position": [
        1380,
        340
      ],
      "parameters": {
        "text": "=INPUT:\n\n{\n  \"task\": {{ $('Extract Data').item.json['Input'] }},\n  \"answer_key\": {{ $('Extract Data').item.json['Reference Answer'] }},\n  \"output\": {{ $('Extract Data').item.json['Output'] }}\n}\n\nOUTPUT:",
        "messages": {
          "messageValues": [
            {
              "message": "=## Context\n\nYou are an evaluator of LLMs in the legal domain.\n\n## Inputs Provided for Each Task\n\n- task: The legal question or instruction.\n- answer_key: The correct answer for this task, found in the answer key column of the same Google Sheet.\n- output: The answer generated by the AI Assistant.\n\n\n## Evaluation Rules\n\nGrade the AI Assistant's output as Pass or Fail by comparing it ONLY to the answer_key for that task.\n\nDo not use or reference the original source material or any other information.\n\n## Criteria for Pass\n\n1. Factual Correctness\n- The output must accurately reflect the information in the answer_key.\n- Minor differences in paraphrasing, wording, or formatting (including clause numbering, references, or synonyms) are acceptable if the substantive information matches the answer_key.\n- If the answer key provides multiple possible correct answers (e.g., separated by \"OR\"), any output that matches any one of the alternatives is acceptable.\n\n\n2. Relevance to the Query\n- The output must directly answer the task as covered in the answer_key.\n- Do not introduce unrelated or off-topic information.\n\n\n3. Completeness\n- If the output contains extra information that does not contradict or misrepresent the answer key, it is acceptable.\n- Omitting any critical point present in the answer_key = Fail.\n\n\n## Key Rule\n- If the output materially fails any one of the three requirements compared to the answer_key, grade as Fail.\n- Minor paraphrasing or stylistic differences are acceptable if the substantive meaning is identical.\n\n\n## Required Output Format\n\nYour evaluation must be provided in JSON with two keys only:\n\n- decision: Pass or Fail\n- reasoning: A brief explanation, strictly comparing the output to the answer_key.\n\n\n### Example Input 1\n\n{\n \"task\": \"Extract the liability cap and time-based provisions from a limitation of liability clause.\",\n \"answer_key\": \"The liability cap is $1 million with a 12-month limit.\",\n \"output\": \"The liability cap is $1 million with a 12-month limit.\"\n}\n\n### Example Output 1\n\n{\n  \"output\": {\n    {\n     \"decision\": \"Pass\",\n     \"reasoning\": \"The output exactly matches the answer key, so it is factually correct, relevant, and complete.\"\n     }\n  }\n}\n\n### Example Input 2\n\n{\n \"task\": \"Extract the liability cap and time-based provisions from a limitation of liability clause.\",\n \"answer_key\": \"The liability cap is $1 million with a 12-month limit.\",\n \"output\": \"The liability cap is $2 million and there is no time limit.\"\n}\n\n### Example Output 2\n\n{\n  \"output\": {\n    {\n     \"decision\": \"Fail\",\n     \"reasoning\": \"The output gives a $2 million cap and omits the 12-month limit from the answer key. This fails both factual correctness and completeness.\"\n    }\n}\n\n### Example Input 3\n\n{\n \"task\": \"State the governing law.\",\n \"answer_key\": \"Singapore law.\",\n \"output\": \"This agreement is governed by Singapore law. All disputes will be subject to the exclusive jurisdiction of Singapore courts.\"\n}\n\n### Example Output 3\n\n{\n  \"output\": {\n    \"reasoning\": \"All required information from the answer_key is present. The extra information does not contradict or misrepresent the answer_key.\"\n \"decision\": \"Pass\",\n  }\n}\n\n### Example Input 4\n\n{\n \"task\": \"Identify the relevant clause.\",\n \"answer_key\": \"Clause 5\",\n \"output\": \"clause 5\"\n}\n\n### Example Output 4\n\n{\n  \"output\": {\n    \"reasoning\": \"The output matches the answer key despite minor formatting differences.\"\n    \"decision\": \"Pass\",\n  }\n}\n\n### Example Input 5\n\n{\n \"task\": \"Extract the parties to the contract.\",\n \"answer_key\": \"Company A and Company B OR The Buyer and the Seller\",\n \"output\": \"The Buyer and the Seller\"\n}\n\n### Example Output 5\n\n{\n  \"output\": {\n    \"reasoning\": \"The output matches one of the acceptable answer_key alternatives.\"\n    \"decision\": \"Pass\",\n  }\n}\n\n## Reminder\nAlways grade solely by comparison to the answer_key column for each task in the input data."
            }
          ]
        },
        "promptType": "define",
        "hasOutputParser": true
      },
      "typeVersion": 1.4
    },
    {
      "id": "f4ddb551-cbaa-4c2d-96ca-3769a199ce1a",
      "name": "OpenRouter 聊天模型",
      "type": "@n8n/n8n-nodes-langchain.lmChatOpenRouter",
      "position": [
        1380,
        520
      ],
      "parameters": {
        "model": "openai/gpt-4.1",
        "options": {}
      },
      "credentials": {
        "openRouterApi": {
          "id": "ipzDVYsZqbum9bX4",
          "name": "OpenRouter account 2"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "b8eedf4a-eb85-4b4a-ad4b-61d9d31984c1",
      "name": "保留原始数据",
      "type": "n8n-nodes-base.set",
      "position": [
        1480,
        820
      ],
      "parameters": {
        "mode": "raw",
        "options": {},
        "jsonOutput": "={{ $json.body }}"
      },
      "typeVersion": 3.4
    },
    {
      "id": "69c41be1-ff93-4098-8b9d-cd5cc88d9271",
      "name": "手动触发器",
      "type": "n8n-nodes-base.manualTrigger",
      "position": [
        -80,
        220
      ],
      "parameters": {},
      "typeVersion": 1
    }
  ],
  "pinData": {},
  "connections": {
    "Limit": {
      "main": [
        [
          {
            "node": "Execute Subworkflow",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Webhook": {
      "main": [
        [
          {
            "node": "Keep Original Data",
            "type": "main",
            "index": 0
          },
          {
            "node": "Extract Data",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Get Tests": {
      "main": [
        [
          {
            "node": "Limit",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract Data": {
      "main": [
        [
          {
            "node": "Basic LLM Chain",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Manual Trigger": {
      "main": [
        [
          {
            "node": "Get Tests",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Basic LLM Chain": {
      "main": [
        [
          {
            "node": "Merge",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Keep Original Data": {
      "main": [
        [
          {
            "node": "Merge",
            "type": "main",
            "index": 1
          }
        ]
      ]
    },
    "Execute Subworkflow": {
      "main": [
        [
          {
            "node": "Update Results",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "OpenRouter Chat Model": {
      "ai_languageModel": [
        [
          {
            "node": "Basic LLM Chain",
            "type": "ai_languageModel",
            "index": 0
          }
        ]
      ]
    },
    "Structured Output Parser": {
      "ai_outputParser": [
        [
          {
            "node": "Basic LLM Chain",
            "type": "ai_outputParser",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。

这个工作流适合什么场景?

高级 - 工程, AI 摘要总结

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。

工作流信息
难度等级
高级
节点数量17
分类2
节点类型11
难度说明

适合高级用户,包含 16+ 个节点的复杂工作流

作者
Adam Janes

Adam Janes

@adamjanes

I am a product-minded technologist with hacker DNA building things in AI automation. I have a broad and varied background - having worked in Product, Design, and Sales - combined with deep technical experience as a Senior Developer and Fractional CTO. I am also a best-selling Udemy instructor (with 25K+ students), and founder of WOOFCODE - a free coding camp for fullstack developers. I practice non-violent communication, motivational interviewing, and Tibetan Buddhist meditation.

外部链接
在 n8n.io 查看

分享此工作流