8
n8n 中文网amn8n.com

使用 Gemini AI 和 Chunkr.ai 按目录分段 PDF

高级

这是一个AI, IT Ops领域的自动化工作流,包含 36 个节点。主要使用 Set, Code, Html, Wait, Merge 等节点,结合人工智能技术实现智能自动化。 使用 Gemini AI 和 Chunkr.ai 按目录分段 PDF

前置要求
  • Google Drive API 凭证
  • 可能需要目标 API 的认证凭证
  • Google Gemini API Key
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "meta": {
    "instanceId": "4970c3bf8a443f2d4c35b388ef96894f5ae52e32fa83fc8da098b824d939fbd9",
    "templateCredsSetupCompleted": true
  },
  "nodes": [
    {
      "id": "08ae2ea6-5ad1-4fdf-ac75-4e22811437cc",
      "name": "当点击\"执行工作流\"时",
      "type": "n8n-nodes-base.manualTrigger",
      "disabled": true,
      "position": [
        -1860,
        -220
      ],
      "parameters": {},
      "typeVersion": 1
    },
    {
      "id": "f81acfdb-2eae-4824-a4ec-2540ff15fa12",
      "name": "状态是:",
      "type": "n8n-nodes-base.switch",
      "position": [
        -40,
        20
      ],
      "parameters": {
        "rules": {
          "values": [
            {
              "outputKey": "Succeeded",
              "conditions": {
                "options": {
                  "version": 2,
                  "leftValue": "",
                  "caseSensitive": true,
                  "typeValidation": "strict"
                },
                "combinator": "and",
                "conditions": [
                  {
                    "id": "a11576d1-4bfa-46ce-abce-25be2bc75a20",
                    "operator": {
                      "type": "string",
                      "operation": "equals"
                    },
                    "leftValue": "={{ $json.status }}",
                    "rightValue": "Succeeded"
                  }
                ]
              },
              "renameOutput": true
            },
            {
              "outputKey": "Processing",
              "conditions": {
                "options": {
                  "version": 2,
                  "leftValue": "",
                  "caseSensitive": true,
                  "typeValidation": "strict"
                },
                "combinator": "and",
                "conditions": [
                  {
                    "id": "19b80bb6-63f5-47f7-9d58-321de4f6893c",
                    "operator": {
                      "name": "filter.operator.equals",
                      "type": "string",
                      "operation": "equals"
                    },
                    "leftValue": "={{ $json.status }}",
                    "rightValue": "Processing"
                  }
                ]
              },
              "renameOutput": true
            },
            {
              "outputKey": "Failed",
              "conditions": {
                "options": {
                  "version": 2,
                  "leftValue": "",
                  "caseSensitive": true,
                  "typeValidation": "strict"
                },
                "combinator": "and",
                "conditions": [
                  {
                    "id": "b8822d01-57d4-47fd-95e9-5255ac5059f4",
                    "operator": {
                      "name": "filter.operator.equals",
                      "type": "string",
                      "operation": "equals"
                    },
                    "leftValue": "={{ $json.status }}",
                    "rightValue": "Failed"
                  }
                ]
              },
              "renameOutput": true
            }
          ]
        },
        "options": {}
      },
      "typeVersion": 3.2
    },
    {
      "id": "77949b9c-b3a1-4cd9-b643-d7f49dc64726",
      "name": "Google Gemini 聊天模型",
      "type": "@n8n/n8n-nodes-langchain.lmChatGoogleGemini",
      "position": [
        1360,
        80
      ],
      "parameters": {
        "options": {},
        "modelName": "models/gemini-2.5-pro-preview-05-06"
      },
      "credentials": {
        "googlePalmApi": {
          "id": "srw1vvQUiWhObvtc",
          "name": "Google Gemini(PaLM) Api account"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "1e53fcc8-4697-48c4-90cb-c07dee049949",
      "name": "结构化输出解析器",
      "type": "@n8n/n8n-nodes-langchain.outputParserStructured",
      "position": [
        1620,
        280
      ],
      "parameters": {
        "schemaType": "manual",
        "inputSchema": "{\n  \"$schema\": \"http://json-schema.org/draft-07/schema#\",\n  \"title\": \"TableOfContents\",\n  \"description\": \"A JSON schema for representing a hierarchical table of contents with manually inlined nesting (up to 3 levels) to avoid $ref or $defs, and simplified keywords.\",\n  \"type\": \"object\",\n  \"properties\": {\n    \"tableOfContents\": {\n      \"type\": \"array\",\n      \"description\": \"The root list of top-level headings (Level 1).\",\n      \"items\": {\n        \"type\": \"object\",\n        \"description\": \"Represents a Level 1 heading.\",\n        \"properties\": {\n          \"text\": {\n            \"type\": \"string\",\n            \"description\": \"The text content of the Level 1 heading.\"\n          },\n          \"level\": {\n            \"type\": \"integer\",\n            \"description\": \"The hierarchical level of the heading (e.g., 1 for H1, 2 for H2, etc.).\"\n          },\n          \"children\": {\n            \"type\": \"array\",\n            \"description\": \"An array of Level 2 subheadings.\",\n            \"items\": {\n              \"type\": \"object\",\n              \"description\": \"Represents a Level 2 heading.\",\n              \"properties\": {\n                \"text\": {\n                  \"type\": \"string\",\n                  \"description\": \"The text content of the Level 2 heading.\"\n                },\n                \"level\": {\n                  \"type\": \"integer\",\n                  \"description\": \"The hierarchical level of the heading.\"\n                },\n                \"children\": {\n                  \"type\": \"array\",\n                  \"description\": \"An array of Level 3 subheadings.\",\n                  \"items\": {\n                    \"type\": \"object\",\n                    \"description\": \"Represents a Level 3 heading.\",\n                    \"properties\": {\n                      \"text\": {\n                        \"type\": 
\"string\",\n                        \"description\": \"The text content of the Level 3 heading.\"\n                      },\n                      \"level\": {\n                        \"type\": \"integer\",\n                        \"description\": \"The hierarchical level of the heading.\"\n                      },\n                      \"children\": {\n                        \"type\": \"array\",\n                        \"description\": \"Level 3 headings typically have no children in this model, but the array must exist.\",\n                        \"items\": {\n                          \"type\": \"object\",\n                          \"description\": \"Schema for items (if any, typically none) in the children array of a Level 3 heading. These items must be objects.\",\n                          \"additionalProperties\": false\n                        }\n                      }\n                    },\n                    \"required\": [\n                      \"text\",\n                      \"level\",\n                      \"children\"\n                    ],\n                    \"additionalProperties\": false\n                  }\n                }\n              },\n              \"required\": [\n                \"text\",\n                \"level\",\n                \"children\"\n              ],\n              \"additionalProperties\": false\n            }\n          }\n        },\n        \"required\": [\n          \"text\",\n          \"level\",\n          \"children\"\n        ],\n        \"additionalProperties\": false\n      }\n    }\n  },\n  \"required\": [\n    \"tableOfContents\"\n  ],\n  \"additionalProperties\": false\n}\n"
      },
      "typeVersion": 1.2
    },
    {
      "id": "ecccb5bf-6625-476e-b010-e50e3b89a80b",
      "name": "自动修复输出解析器",
      "type": "@n8n/n8n-nodes-langchain.outputParserAutofixing",
      "position": [
        1480,
        80
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 1
    },
    {
      "id": "97b36f68-cd64-437b-8af0-dada28b40ea8",
      "name": "设置文件名",
      "type": "n8n-nodes-base.set",
      "position": [
        -980,
        20
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "49f89680-ba63-43f1-af72-7e5afd8ecb0e",
              "name": "fileName",
              "type": "string",
              "value": "={{ $('Merge').item.binary.data.fileName.replaceAll('.pdf','') }}"
            },
            {
              "id": "646e8985-5587-41f6-b4b2-d781ecff9e7c",
              "name": "fileNameSnake",
              "type": "string",
              "value": "={{ $('Merge').item.binary.data.fileName.replaceAll('.pdf','').toSnakeCase() }}"
            },
            {
              "id": "a7eafca2-13e6-46d1-b281-2ef4ea4d6a93",
              "name": "createdAt",
              "type": "string",
              "value": "={{ $now }}"
            }
          ]
        }
      },
      "typeVersion": 3.4
    },
    {
      "id": "46c53d9b-9387-4415-9b8c-b01a12e391a3",
      "name": "当由另一个工作流执行时",
      "type": "n8n-nodes-base.executeWorkflowTrigger",
      "position": [
        -1860,
        300
      ],
      "parameters": {
        "workflowInputs": {
          "values": [
            {
              "name": "URL"
            }
          ]
        }
      },
      "typeVersion": 1.1
    },
    {
      "id": "f2145258-4cb1-4339-81d4-f9dfe524b972",
      "name": "提取章节标题作为备用",
      "type": "n8n-nodes-base.code",
      "position": [
        880,
        -180
      ],
      "parameters": {
        "jsCode": "// Input: Full JSON from Chunkr task\nconst chunks = $(\"Status is:\").first().json.output.chunks;\nconst headings = [];\n\nfor (const chunk of chunks) {\n  for (const segment of chunk.segments) {\n    if (segment.segment_type === 'SectionHeader') {\n      // We store the content of the heading\n      headings.push(segment.content.trim());\n    }\n  }\n}\n\n// Remove duplicates that might span chunks\nconst uniqueHeadings = [...new Set(headings)];\n\n// Return the ordered list of unique headings\nreturn [{ json: { headings: uniqueHeadings } }];"
      },
      "typeVersion": 2
    },
    {
      "id": "47b749ec-4832-45a0-826e-13ef23fd4647",
      "name": "获取文档开头以查找目录",
      "type": "n8n-nodes-base.code",
      "position": [
        280,
        -180
      ],
      "parameters": {
        "jsCode": "\nconst taskResult = $input.first().json;\n\n\n\n// Define how many chunks you want to extract\nconst numberOfChunksToExtract = 10;\n\n// An array to hold the text content from the selected chunks\nlet textContentArray = [];\n\n// Check if the output and chunks exist in the task result\nif (taskResult.output && taskResult.output.chunks) {\n  const allChunks = taskResult.output.chunks;\n\n  // Loop through the first 'numberOfChunksToExtract' chunks,\n  // or fewer if the document doesn't have that many.\n  for (let i = 0; i < Math.min(numberOfChunksToExtract, allChunks.length); i++) {\n    const chunk = allChunks[i];\n    // Ensure the chunk and its 'embed' field exist and are not empty\n    if (chunk && chunk.embed && chunk.embed.trim() !== '') {\n      textContentArray.push(chunk.embed);\n    }\n  }\n}\n\n// Join all the collected text content with a double newline for separation\nconst concatenatedText = textContentArray.join('\\n\\n');\n\n// Return a single JSON object with the specified key and the concatenated text\nreturn [{\n  json: {\n    \"firstXNumberOfChunks\": concatenatedText\n  }\n}];"
      },
      "typeVersion": 2
    },
    {
      "id": "7f136d22-2195-4d56-803a-a9f6384f3557",
      "name": "停止与错误处理",
      "type": "n8n-nodes-base.stopAndError",
      "position": [
        200,
        220
      ],
      "parameters": {
        "errorMessage": "The chunkr Task failed!"
      },
      "typeVersion": 1
    },
    {
      "id": "228fed4c-c2a9-4dde-a270-e674ae61b9da",
      "name": "Google Gemini 聊天模型1",
      "type": "@n8n/n8n-nodes-langchain.lmChatGoogleGemini",
      "position": [
        1500,
        280
      ],
      "parameters": {
        "options": {},
        "modelName": "models/gemini-2.5-flash-preview-05-20"
      },
      "credentials": {
        "googlePalmApi": {
          "id": "srw1vvQUiWhObvtc",
          "name": "Google Gemini(PaLM) Api account"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "9a21bf6c-208f-45a7-bb78-f27935b53b5d",
      "name": "便签",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -2640,
        -240
      ],
      "parameters": {
        "width": 580,
        "height": 260,
        "content": "### 欢迎使用文档处理工作流!"
      },
      "typeVersion": 1
    },
    {
      "id": "447c7ec5-a094-4034-afc4-fcd7aae5f4de",
      "name": "将 PDF 转换为 base64",
      "type": "n8n-nodes-base.extractFromFile",
      "position": [
        -1180,
        20
      ],
      "parameters": {
        "options": {},
        "operation": "binaryToPropery",
        "binaryPropertyName": "=data"
      },
      "typeVersion": 1
    },
    {
      "id": "dfc84ad3-4a85-4641-bc16-1f89c54b1c3a",
      "name": "便签1",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1940,
        -460
      ],
      "parameters": {
        "width": 480,
        "height": 440,
        "content": "### 节点:当点击\"执行工作流\"时(手动触发器)"
      },
      "typeVersion": 1
    },
    {
      "id": "b61a3d2e-9773-4e03-ad49-380fca4bb04d",
      "name": "便签 2",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1940,
        20
      ],
      "parameters": {
        "width": 480,
        "height": 460,
        "content": "### 节点:由另一个工作流执行时(Webhook 触发器)"
      },
      "typeVersion": 1
    },
    {
      "id": "0f1ddb9a-dc65-44b7-b5ed-b4f9fa64e743",
      "name": "从 URL 下载 PDF",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        -1620,
        300
      ],
      "parameters": {
        "url": "={{ $json.URL }}",
        "options": {}
      },
      "typeVersion": 4.2
    },
    {
      "id": "b1095619-41f5-4381-9516-3c221ef98388",
      "name": "从 Google Drive 下载 PDF",
      "type": "n8n-nodes-base.googleDrive",
      "disabled": true,
      "position": [
        -1640,
        -220
      ],
      "parameters": {
        "fileId": {
          "__rl": true,
          "mode": "list",
          "value": "11ReFvvEnHKiFegKJ5tjm-MMfJn96nBng",
          "cachedResultUrl": "https://drive.google.com/file/d/11ReFvvEnHKiFegKJ5tjm-MMfJn96nBng/view?usp=drivesdk",
          "cachedResultName": "S1-Handlungsempfehlung: Diagnostik und Therapie der Loiasis (Afrikanischer Augenwurm).pdf"
        },
        "options": {},
        "operation": "download"
      },
      "credentials": {
        "googleDriveOAuth2Api": {
          "id": "RHXEETyEk6E7K4gH",
          "name": "Google Drive account"
        }
      },
      "typeVersion": 3
    },
    {
      "id": "ceb2c044-a55c-4c9d-9736-6769cce0ed12",
      "name": "便签 3",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1100,
        -500
      ],
      "parameters": {
        "width": 1000,
        "height": 400,
        "content": "### 节点:POST Chunkr 任务(HTTP 请求)和 GET Chunkr 任务"
      },
      "typeVersion": 1
    },
    {
      "id": "45dd3735-1c1a-4b46-ad10-d79234f01b7a",
      "name": "在轮询 Chunkr 结果前等待",
      "type": "n8n-nodes-base.wait",
      "position": [
        -480,
        20
      ],
      "webhookId": "23fb1820-b060-4e25-87c3-3c49f7ffb916",
      "parameters": {
        "amount": 10
      },
      "typeVersion": 1.1
    },
    {
      "id": "572c6ae1-cf9a-43ff-9c09-3bc650875d70",
      "name": "便签 4",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        680,
        -600
      ],
      "parameters": {
        "width": 480,
        "height": 380,
        "content": "### 节点:提取章节标题作为备用(代码)"
      },
      "typeVersion": 1
    },
    {
      "id": "56cbaf07-965c-4718-b282-dd1b471ffa90",
      "name": "便签 5",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        120,
        -600
      ],
      "parameters": {
        "width": 500,
        "height": 380,
        "content": "### 节点:获取文档开头以查找目录(代码)"
      },
      "typeVersion": 1
    },
    {
      "id": "2b103146-6917-4745-adff-cb790dbdd7a6",
      "name": "单独返回每个章节",
      "type": "n8n-nodes-base.code",
      "position": [
        2040,
        -400
      ],
      "parameters": {
        "jsCode": "// --- Two-Pass Heading Mapping with Levels, Rich Content, Simplified Boundaries & Multi-Item Output ---\n\n// Get data from the two specified previous nodes\nconst goldenTocContainer = $(\"Table of Content Agent\").first().json.output;\nconst chunkrOutputContainer = $('GET Chunkr Task').first().json.output;\n\nconsole.log(\"Script Start: Levels, Rich Content, Simplified Boundaries & Multi-Item Output\");\n\nconst goldenTocArray = goldenTocContainer?.tableOfContents || [];\nconst chunkrChunks = chunkrOutputContainer?.chunks || [];\n\nif (!goldenTocArray.length) {\n    console.error(\"STOP: Golden ToC array is empty or not found.\");\n    return [{ json: { error: \"Golden ToC array is empty.\" }}];\n}\nif (!chunkrChunks.length) {\n    console.error(\"STOP: Chunkr chunks array is empty or not found.\");\n    return [{ json: { error: \"Chunkr chunks array is empty.\" }}];\n}\nconsole.log(`Inputs loaded: Golden ToC items: ${goldenTocArray.length}, Chunkr Chunks: ${chunkrChunks.length}`);\n\n// --- Helper Functions ---\nfunction normalizeText(text) {\n    if (typeof text !== 'string') return '';\n    return text.toLowerCase().replace(/\\s+/g, ' ').trim();\n}\n\nconst flatGoldenHeadings = [];\nlet uniqueIdCounter = 0;\n\nfunction flattenGoldenToC(nodes, defaultParentLevel = 0) {\n    for (const node of nodes) {\n        const currentItemLevel = node.level !== undefined ? 
node.level : defaultParentLevel + 1;\n        if (node.text) {\n            flatGoldenHeadings.push({\n                id: uniqueIdCounter++,\n                originalText: node.text.trim(),\n                normalizedText: normalizeText(node.text.trim()),\n                level: currentItemLevel,\n                isMapped: false,\n                matchDetails: { // Initialize with fields that will hold extracted content\n                    type: \"Unmatched\", chunkIndex: -1, segmentIndex: -1, chunkId: null, // chunkId kept for internal ref if needed\n                    matchedSegmentContent: \"\", sourceSegmentType: null,\n                    sectionText: \"\", sectionHTML: \"\", sectionMarkdown: \"\"\n                    // Boundary indices for output are removed here\n                }\n            });\n        }\n        if (node.children && node.children.length > 0) {\n            flattenGoldenToC(node.children, currentItemLevel);\n        }\n    }\n}\nflattenGoldenToC(goldenTocArray, 0);\nconsole.log(`Golden ToC flattened: ${flatGoldenHeadings.length} total headings with levels.`);\n\nconst usedChunkIndices = new Set();\nlet internalProcessedToc = [];\nlet identifiedPdfToCItem = null;\n\n// --- Step 1: Identify and Create Entry for the PDF's Own Table of Contents Chunk ---\nlet pdfTocChunkIndex = -1;\nlet maxTocItemsInAChunk = 0;\nconst minThreshold = 3;\nconst percentageThreshold = Math.floor(flatGoldenHeadings.length * 0.10);\nconst TOC_CHUNK_THRESHOLD_COUNT = Math.min(Math.max(minThreshold, percentageThreshold), 15);\n\nconsole.log(`Pre-computation: Identifying PDF's own ToC Chunk (threshold: > ${TOC_CHUNK_THRESHOLD_COUNT} distinct golden items)`);\nif (flatGoldenHeadings.length > 0) {\n    for (let chunkIdx = 0; chunkIdx < chunkrChunks.length; chunkIdx++) {\n        const chunk = chunkrChunks[chunkIdx];\n        if (!chunk.segments || chunk.segments.length === 0) continue;\n        let combinedChunkText = chunk.segments.reduce((acc, seg) => acc + 
(seg.content ? normalizeText(seg.content) + \" \" : \"\"), \"\").trim();\n        if (!combinedChunkText) continue;\n        const foundIds = new Set(flatGoldenHeadings.filter(gh => gh.normalizedText && combinedChunkText.includes(gh.normalizedText)).map(gh => gh.id));\n        const distinctItemsCount = foundIds.size;\n        if (distinctItemsCount > TOC_CHUNK_THRESHOLD_COUNT && distinctItemsCount > maxTocItemsInAChunk) {\n            maxTocItemsInAChunk = distinctItemsCount;\n            pdfTocChunkIndex = chunkIdx;\n        }\n    }\n}\n\nif (pdfTocChunkIndex !== -1) {\n    const tocChunk = chunkrChunks[pdfTocChunkIndex];\n    let tocContentSample = (tocChunk.segments && tocChunk.segments.length > 0)\n        ? tocChunk.segments.map(s => s.content || \"\").join(' ').substring(0, 150) + \"...\"\n        : \"[No segment content for sample]\";\n    identifiedPdfToCItem = {\n        id: \"pdf_toc_\" + pdfTocChunkIndex,\n        originalText: \"Document Table of Contents (Auto-Detected)\",\n        level: 1,\n        isMapped: true,\n        matchDetails: {\n            type: \"Auto-Detected PDF ToC\",\n            chunkIndex: pdfTocChunkIndex, segmentIndex: 0, chunkId: tocChunk.chunk_id,\n            matchedSegmentContent: `[Chunk ${pdfTocChunkIndex} (ID: ${tocChunk.chunk_id || 'N/A'}) auto-identified as PDF ToC, ~${maxTocItemsInAChunk} golden items. 
Sample: ${normalizeText(tocContentSample)}]`,\n            sourceSegmentType: \"AggregatedChunkAsPDFToC\",\n            sectionText: \"\", sectionHTML: \"\", sectionMarkdown: \"\"\n        }\n    };\n    usedChunkIndices.add(pdfTocChunkIndex);\n    console.log(`PDF ToC Chunk identified: Index ${pdfTocChunkIndex}.`);\n} else {\n    console.log(\"No single dominant PDF ToC Chunk identified.\");\n}\n\n// --- Pass 1: Anchor Golden Headings with 'SectionHeader' ---\nconsole.log(\"\\n--- Starting Pass 1 for Golden ToC: Matching 'SectionHeader' Segments ---\");\nflatGoldenHeadings.forEach(goldenHeading => {\n    if (!goldenHeading.normalizedText || goldenHeading.isMapped) return;\n    for (let chunkIdx = 0; chunkIdx < chunkrChunks.length; chunkIdx++) {\n        if (usedChunkIndices.has(chunkIdx)) continue;\n        const chunk = chunkrChunks[chunkIdx];\n        for (let segIdx = 0; segIdx < chunk.segments.length; segIdx++) {\n            const segment = chunk.segments[segIdx];\n            if (segment.segment_type === 'SectionHeader' && segment.content && normalizeText(segment.content) === goldenHeading.normalizedText) {\n                console.log(`  ✅ PASS 1: \"${goldenHeading.originalText}\" (L${goldenHeading.level}) -> SectionHeader in Chunk ${chunkIdx}`);\n                goldenHeading.isMapped = true;\n                goldenHeading.matchDetails = { // Overwrite initial matchDetails\n                    type: \"SectionHeader Exact\", chunkIndex: chunkIdx, segmentIndex: segIdx, chunkId: chunk.chunk_id,\n                    matchedSegmentContent: segment.content, sourceSegmentType: segment.segment_type,\n                    sectionText: \"\", sectionHTML: \"\", sectionMarkdown: \"\"\n                };\n                usedChunkIndices.add(chunkIdx);\n                break;\n            }\n        }\n        if (goldenHeading.isMapped) break;\n    }\n});\nconsole.log(`--- Pass 1 Complete: ${flatGoldenHeadings.filter(h => h.isMapped).length} golden headings mapped 
initially.`);\n\n// --- Pass 2: Find Remaining Golden Headings (Content Includes) ---\nconsole.log(\"\\n--- Starting Pass 2 for Golden ToC: Content Search (Simplified) ---\");\nlet searchAfterChunkIndex_Pass2 = identifiedPdfToCItem ? identifiedPdfToCItem.matchDetails.chunkIndex : -1;\n\nflatGoldenHeadings.forEach(goldenHeading => {\n    if (goldenHeading.isMapped) {\n        if (goldenHeading.matchDetails.chunkIndex > searchAfterChunkIndex_Pass2) {\n            searchAfterChunkIndex_Pass2 = goldenHeading.matchDetails.chunkIndex;\n        }\n        return;\n    }\n    if (!goldenHeading.normalizedText) return;\n    console.log(`Pass 2 - Seeking: \"${goldenHeading.originalText}\" (L${goldenHeading.level}) (after chunk ${searchAfterChunkIndex_Pass2})`);\n    let potentialMatches = [];\n    for (let chunkIdx = searchAfterChunkIndex_Pass2 + 1; chunkIdx < chunkrChunks.length; chunkIdx++) {\n        if (usedChunkIndices.has(chunkIdx)) continue;\n        const chunk = chunkrChunks[chunkIdx];\n        for (let segIdx = 0; segIdx < chunk.segments.length; segIdx++) {\n            const segment = chunk.segments[segIdx];\n            if (segment.content && normalizeText(segment.content).includes(goldenHeading.normalizedText)) {\n                potentialMatches.push({\n                    chunkIndex: chunkIdx, segmentIndex: segIdx, chunkId: chunk.chunk_id,\n                    matchedSegmentContent: segment.content, segmentType: segment.segment_type\n                });\n            }\n        }\n    }\n    if (!potentialMatches.length) {\n        console.log(`  ❌ PASS 2: NO RAW MATCHES for \"${goldenHeading.originalText}\" (L${goldenHeading.level}).`);\n        return;\n    }\n    potentialMatches.sort((a, b) => (a.chunkIndex !== b.chunkIndex ? 
a.chunkIndex - b.chunkIndex : a.segmentIndex - b.segmentIndex));\n    let chosenMatch = potentialMatches.find(match => !usedChunkIndices.has(match.chunkIndex));\n\n    if (chosenMatch) {\n        console.log(`  ✅ PASS 2: \"${goldenHeading.originalText}\" (L${goldenHeading.level}) -> Chunk ${chosenMatch.chunkIndex}, Seg ${chosenMatch.segmentIndex}`);\n        goldenHeading.isMapped = true;\n        goldenHeading.matchDetails = { // Overwrite initial matchDetails\n            type: \"Content Includes\", chunkIndex: chosenMatch.chunkIndex, segmentIndex: chosenMatch.segmentIndex, chunkId: chosenMatch.chunkId,\n            matchedSegmentContent: chosenMatch.matchedSegmentContent, sourceSegmentType: chosenMatch.segmentType,\n            sectionText: \"\", sectionHTML: \"\", sectionMarkdown: \"\"\n        };\n        usedChunkIndices.add(chosenMatch.chunkIndex);\n        searchAfterChunkIndex_Pass2 = chosenMatch.chunkIndex;\n    } else {\n        console.log(`  ❌ PASS 2: NO SUITABLE UNUSED CHUNK for \"${goldenHeading.originalText}\" (L${goldenHeading.level}).`);\n    }\n});\n\n// --- Consolidate, Sort, and Extract Section Content (Text, HTML, Markdown) ---\nif (identifiedPdfToCItem) {\n    internalProcessedToc.push(identifiedPdfToCItem);\n}\nflatGoldenHeadings.forEach(gh => internalProcessedToc.push(gh));\n\ninternalProcessedToc.sort((a, b) => {\n    const aChunk = a.matchDetails.chunkIndex;\n    const bChunk = b.matchDetails.chunkIndex;\n    const aSeg = a.matchDetails.segmentIndex;\n    const bSeg = b.matchDetails.segmentIndex;\n    if (aChunk !== bChunk) return aChunk - bChunk;\n    if (aSeg !== bSeg) return aSeg - bSeg;\n    return (a.id && b.id) ? 
String(a.id).localeCompare(String(b.id)) : 0;\n});\n\nconsole.log(\"\\n--- Extracting Section Content (Text, HTML, Markdown) for Mapped Items ---\");\n\nfunction getContentStartPoint(headingChunkIdx, headingSegIdx, allChunkrChunks) {\n    let contentStartChunkIdx = headingChunkIdx;\n    let contentStartSegmentIdx = headingSegIdx + 1;\n    if (headingChunkIdx >= allChunkrChunks.length || !allChunkrChunks[headingChunkIdx] || !allChunkrChunks[headingChunkIdx].segments) {\n        return { chunkIdx: headingChunkIdx, segmentIdx: headingSegIdx };\n    }\n    const headingChunk = allChunkrChunks[headingChunkIdx];\n    if (contentStartSegmentIdx >= headingChunk.segments.length) {\n        contentStartChunkIdx++;\n        contentStartSegmentIdx = 0;\n    }\n    return { chunkIdx: contentStartChunkIdx, segmentIdx: contentStartSegmentIdx };\n}\n\nfunction extractSectionContents(contentStartChunkIdx, contentStartSegmentIdx, nextSectionStartChunkIdx, nextSectionStartSegIdx, allChunkrChunks) {\n    let accumulatedText = \"\";\n    let accumulatedHtml = \"\";\n    let accumulatedMarkdown = \"\";\n    for (let cIdx = contentStartChunkIdx; cIdx < allChunkrChunks.length; cIdx++) {\n        const chunk = allChunkrChunks[cIdx];\n        if (!chunk || !chunk.segments) continue;\n        const sStart = (cIdx === contentStartChunkIdx) ? 
contentStartSegmentIdx : 0;\n        let sEnd = chunk.segments.length;\n        if (cIdx === nextSectionStartChunkIdx) sEnd = nextSectionStartSegIdx;\n        for (let sIdx = sStart; sIdx < sEnd; sIdx++) {\n            const segment = chunk.segments[sIdx];\n            if (segment) {\n                if (segment.content) accumulatedText += segment.content + \"\\n\";\n                accumulatedHtml += (segment.html || \"\") + \"\\n\";\n                accumulatedMarkdown += (segment.markdown || \"\") + \"\\n\";\n            }\n        }\n        if (cIdx >= nextSectionStartChunkIdx && nextSectionStartChunkIdx < allChunkrChunks.length) break;\n    }\n    return {\n        text: accumulatedText.trim(),\n        html: accumulatedHtml.trim(),\n        markdown: accumulatedMarkdown.trim()\n    };\n}\n\nfor (let i = 0; i < internalProcessedToc.length; i++) {\n    const currentItem = internalProcessedToc[i];\n    if (!currentItem.isMapped || !currentItem.matchDetails || currentItem.matchDetails.chunkIndex === -1) {\n        continue;\n    }\n\n    const headingChunkIdx = currentItem.matchDetails.chunkIndex;\n    const headingSegIdx = currentItem.matchDetails.segmentIndex;\n\n    // Determine end of current section (start of next *mapped* section)\n    let nextSectionStartChunkIdx = chunkrChunks.length;\n    let nextSectionStartSegIdx = 0;\n\n    for (let j = i + 1; j < internalProcessedToc.length; j++) {\n        if (internalProcessedToc[j].isMapped && internalProcessedToc[j].matchDetails && internalProcessedToc[j].matchDetails.chunkIndex !== -1) {\n            nextSectionStartChunkIdx = internalProcessedToc[j].matchDetails.chunkIndex;\n            nextSectionStartSegIdx = internalProcessedToc[j].matchDetails.segmentIndex;\n            break;\n        }\n    }\n    // These end boundaries are used for extraction, but not stored in matchDetails for output\n    \n    let contentExtractionStartChunk = headingChunkIdx;\n    let contentExtractionStartSegment = 
headingSegIdx;\n\n    if (currentItem.id && String(currentItem.id).startsWith(\"pdf_toc_\")) {\n        contentExtractionStartChunk = headingChunkIdx;\n        contentExtractionStartSegment = 0; \n    } else {\n        const contentStartPoint = getContentStartPoint(headingChunkIdx, headingSegIdx, chunkrChunks);\n        contentExtractionStartChunk = contentStartPoint.chunkIdx;\n        contentExtractionStartSegment = contentStartPoint.segmentIdx;\n    }\n    \n    const sectionContents = extractSectionContents(\n        contentExtractionStartChunk, contentExtractionStartSegment,\n        nextSectionStartChunkIdx, nextSectionStartSegIdx,\n        chunkrChunks\n    );\n\n    currentItem.matchDetails.sectionText = sectionContents.text;\n    currentItem.matchDetails.sectionHTML = sectionContents.html;\n    currentItem.matchDetails.sectionMarkdown = sectionContents.markdown;\n}\n\n// --- Final Output Formatting to Individual n8n Items (Simplified) ---\nconst outputN8nItems = [];\ninternalProcessedToc.forEach(item => {\n    if (item.isMapped && item.matchDetails && item.matchDetails.chunkIndex !== -1) {\n        const detail = item.matchDetails;\n        outputN8nItems.push({\n            json: {\n                heading: item.originalText,\n                headingLevel: item.level !== undefined ? item.level : 1,\n                sectionText: detail.sectionText || \"\",\n                sectionHTML: detail.sectionHTML || \"\",\n                sectionMarkdown: detail.sectionMarkdown || \"\"\n                // Removed: sourceChunkId, sectionStartChunkIndex, sectionStartSegmentIndex,\n                // sectionEndChunkIndex, sectionEndSegmentIndex\n            }\n        });\n    }\n});\n\nconsole.log(`--- Processing Complete. Returning ${outputN8nItems.length} mapped sections as individual items. 
---`);\nif (outputN8nItems.length > 0) {\n    console.log(\"\\n--- Sample of First Output Item (JSON content) ---\");\n    console.log(JSON.stringify(outputN8nItems[0].json, null, 2));\n}\n\nif (outputN8nItems.length === 0 && (goldenTocArray.length > 0 || chunkrChunks.length > 0)) {\n    console.warn(\"No sections were successfully mapped to output.\");\n    return [{ json: { warning: \"No sections mapped.\" } }];\n}\n\nreturn outputN8nItems;"
      },
      "typeVersion": 2
    },
    {
      "id": "8622fab4-edcc-41d4-8456-0c652e8f6eb2",
      "name": "目录代理",
      "type": "@n8n/n8n-nodes-langchain.agent",
      "position": [
        1420,
        -160
      ],
      "parameters": {
        "text": "=You are an expert at understanding document structure. Based on the following ordered list of section headings from a document, please analyze the numbering (e.g., 1., 1.1, 1.1.1, A., B.) and semantic content to create a nested JSON object representing the document's hierarchy.\n\nEach node in the JSON must have a \"title\" (the heading text) and a \"children\" (an array of child nodes).\n\n**Example:**\nInput List: [\"1. Introduction\", \"1.1 Background\", \"1.2 Scope\", \"2. Methodology\", \"2.1 Data Collection\"]\nDesired JSON Output:\n[\n  {\n    \"title\": \"1. Introduction\",\n    \"children\": [\n      { \"title\": \"1.1 Background\", \"children\": [] },\n      { \"title\": \"1.2 Scope\", \"children\": [] }\n    ]\n  },\n  {\n    \"title\": \"2. Methodology\",\n    \"children\": [\n      { \"title\": \"2.1 Data Collection\", \"children\": [] }\n    ]\n  }\n]\n\n**These Headings were extracted programatically:**\n(these are generated using OCR and should only be used as fallback. this might include too many headings, or may be missing some. only use if no TOC is found in the beginning of the document)\n{{ $('Extract Sections headers as fallback').first().json.headings }}\n\n\n**This is the first pages of the document**\n{{ $('Take beginning of Document to look for Table of contents').first().json.firstXNumberOfChunks }}\n(may or may not contain a table of contents. if it does, use as primary source of truth)\n\nyour task is to use the above sources to construct a table of contents.\noutput in json using the required format.",
        "options": {},
        "promptType": "define",
        "hasOutputParser": true
      },
      "typeVersion": 2
    },
    {
      "id": "b627fe7d-f342-40f6-912f-c090a619c96c",
      "name": "返回整个文档",
      "type": "n8n-nodes-base.code",
      "position": [
        2120,
        340
      ],
      "parameters": {
        "jsCode": "const goldenTocContainer = $(\"Table of Content Agent\").first().json.output;\nconst chunkrOutputContainer = $('GET Chunkr Task').first().json.output;\n\n\nconsole.log(\"Script Start: Adding Heading Levels\");\n\nconst goldenTocArray = goldenTocContainer?.tableOfContents || [];\nconst chunkrChunks = chunkrOutputContainer?.chunks || [];\n\nif (!goldenTocArray.length) {\n    console.error(\"STOP: Golden ToC array is empty or not found.\");\n    return [{ json: { error: \"Golden ToC array is empty.\", processedSections: [] }}];\n}\nif (!chunkrChunks.length) {\n    console.error(\"STOP: Chunkr chunks array is empty or not found.\");\n    return [{ json: { error: \"Chunkr chunks array is empty.\", processedSections: [] }}];\n}\nconsole.log(`Inputs loaded: Golden ToC items: ${goldenTocArray.length}, Chunkr Chunks: ${chunkrChunks.length}`);\n\n// --- Helper Functions ---\nfunction normalizeText(text) {\n    if (typeof text !== 'string') return '';\n    return text.toLowerCase().replace(/\\s+/g, ' ').trim();\n}\n\nconst flatGoldenHeadings = [];\nlet uniqueIdCounter = 0;\n\n// Modified flattenGoldenToC to capture the 'level' property\n// Assumes 'node.level' from your input goldenTocArray is the authoritative source.\n// Adds a defaultLevel for robustness if 'node.level' is unexpectedly missing.\nfunction flattenGoldenToC(nodes, defaultParentLevel = 0) {\n    for (const node of nodes) {\n        // Determine the current item's level: use explicit if present, else infer from parent.\n        const currentItemLevel = node.level !== undefined ? 
node.level : defaultParentLevel + 1;\n\n        if (node.text) {\n            flatGoldenHeadings.push({\n                id: uniqueIdCounter++,\n                originalText: node.text.trim(),\n                normalizedText: normalizeText(node.text.trim()),\n                level: currentItemLevel, // Store the heading level\n                isMapped: false,\n                matchDetails: { // Initialize all expected matchDetail fields\n                    type: \"Unmatched\", chunkIndex: -1, segmentIndex: -1, chunkId: null,\n                    matchedSegmentContent: \"\", sourceSegmentType: null,\n                    sectionText: \"\", sectionHTML: \"\", sectionMarkdown: \"\",\n                    sectionStartChunkIndex: -1, sectionStartSegmentIndex: -1,\n                    sectionEndChunkIndex: -1, sectionEndSegmentIndex: -1\n                }\n            });\n        }\n        if (node.children && node.children.length > 0) {\n            // Children's level is determined by their own 'level' or inferred from current item's level\n            flattenGoldenToC(node.children, currentItemLevel);\n        }\n    }\n}\nflattenGoldenToC(goldenTocArray, 0); // Initial call, top-level items will use their own 'level' or default to 1\nconsole.log(`Golden ToC flattened: ${flatGoldenHeadings.length} total headings with levels.`);\n\nconst usedChunkIndices = new Set();\nlet internalProcessedToc = [];\nlet identifiedPdfToCItem = null;\n\n// --- Step 1: Identify and Create Entry for the PDF's Own Table of Contents Chunk ---\nlet pdfTocChunkIndex = -1;\nlet maxTocItemsInAChunk = 0;\nconst minThreshold = 3;\nconst percentageThreshold = Math.floor(flatGoldenHeadings.length * 0.10);\nconst TOC_CHUNK_THRESHOLD_COUNT = Math.min(Math.max(minThreshold, percentageThreshold), 15);\n\nconsole.log(`Pre-computation: Identifying PDF's own ToC Chunk (threshold: > ${TOC_CHUNK_THRESHOLD_COUNT} distinct golden items)`);\nif (flatGoldenHeadings.length > 0) {\n    for (let chunkIdx = 0; 
chunkIdx < chunkrChunks.length; chunkIdx++) {\n        const chunk = chunkrChunks[chunkIdx];\n        if (!chunk.segments || chunk.segments.length === 0) continue;\n        let combinedChunkText = chunk.segments.reduce((acc, seg) => acc + (seg.content ? normalizeText(seg.content) + \" \" : \"\"), \"\").trim();\n        if (!combinedChunkText) continue;\n        const foundIds = new Set(flatGoldenHeadings.filter(gh => gh.normalizedText && combinedChunkText.includes(gh.normalizedText)).map(gh => gh.id));\n        const distinctItemsCount = foundIds.size;\n        if (distinctItemsCount > TOC_CHUNK_THRESHOLD_COUNT && distinctItemsCount > maxTocItemsInAChunk) {\n            maxTocItemsInAChunk = distinctItemsCount;\n            pdfTocChunkIndex = chunkIdx;\n        }\n    }\n}\n\nif (pdfTocChunkIndex !== -1) {\n    const tocChunk = chunkrChunks[pdfTocChunkIndex];\n    let tocContentSample = (tocChunk.segments && tocChunk.segments.length > 0)\n        ? tocChunk.segments.map(s => s.content || \"\").join(' ').substring(0, 150) + \"...\"\n        : \"[No segment content for sample]\";\n    identifiedPdfToCItem = {\n        id: \"pdf_toc_\" + pdfTocChunkIndex,\n        originalText: \"Document Table of Contents (Auto-Detected)\",\n        level: 1, // Assign level 1 to the auto-detected ToC\n        isMapped: true,\n        matchDetails: {\n            type: \"Auto-Detected PDF ToC\",\n            chunkIndex: pdfTocChunkIndex, segmentIndex: 0, chunkId: tocChunk.chunk_id,\n            matchedSegmentContent: `[Chunk ${pdfTocChunkIndex} (ID: ${tocChunk.chunk_id || 'N/A'}) auto-identified as PDF ToC, ~${maxTocItemsInAChunk} golden items. 
Sample: ${normalizeText(tocContentSample)}]`,\n            sourceSegmentType: \"AggregatedChunkAsPDFToC\",\n            sectionText: \"\", sectionHTML: \"\", sectionMarkdown: \"\",\n            sectionStartChunkIndex: pdfTocChunkIndex, sectionStartSegmentIndex: 0,\n            sectionEndChunkIndex: -1, sectionEndSegmentIndex: -1\n        }\n    };\n    usedChunkIndices.add(pdfTocChunkIndex);\n    console.log(`PDF ToC Chunk identified: Index ${pdfTocChunkIndex}. Level set to 1.`);\n} else {\n    console.log(\"No single dominant PDF ToC Chunk identified.\");\n}\n\n// --- Pass 1: Anchor Golden Headings with 'SectionHeader' ---\nconsole.log(\"\\n--- Starting Pass 1 for Golden ToC: Matching 'SectionHeader' Segments ---\");\nflatGoldenHeadings.forEach(goldenHeading => {\n    if (!goldenHeading.normalizedText || goldenHeading.isMapped) return;\n    for (let chunkIdx = 0; chunkIdx < chunkrChunks.length; chunkIdx++) {\n        if (usedChunkIndices.has(chunkIdx)) continue;\n        const chunk = chunkrChunks[chunkIdx];\n        for (let segIdx = 0; segIdx < chunk.segments.length; segIdx++) {\n            const segment = chunk.segments[segIdx];\n            if (segment.segment_type === 'SectionHeader' && segment.content && normalizeText(segment.content) === goldenHeading.normalizedText) {\n                console.log(`  ✅ PASS 1: \"${goldenHeading.originalText}\" (L${goldenHeading.level}) -> SectionHeader in Chunk ${chunkIdx}`);\n                goldenHeading.isMapped = true;\n                goldenHeading.matchDetails = {\n                    type: \"SectionHeader Exact\", chunkIndex: chunkIdx, segmentIndex: segIdx, chunkId: chunk.chunk_id,\n                    matchedSegmentContent: segment.content, sourceSegmentType: segment.segment_type,\n                    sectionText: \"\", sectionHTML: \"\", sectionMarkdown: \"\",\n                    sectionStartChunkIndex: chunkIdx, sectionStartSegmentIndex: segIdx,\n                    sectionEndChunkIndex: -1, 
sectionEndSegmentIndex: -1\n                };\n                usedChunkIndices.add(chunkIdx);\n                break;\n            }\n        }\n        if (goldenHeading.isMapped) break;\n    }\n});\nconsole.log(`--- Pass 1 Complete: ${flatGoldenHeadings.filter(h => h.isMapped).length} golden headings mapped initially.`);\n\n// --- Pass 2: Find Remaining Golden Headings (Content Includes) ---\nconsole.log(\"\\n--- Starting Pass 2 for Golden ToC: Content Search (Simplified) ---\");\nlet searchAfterChunkIndex_Pass2 = identifiedPdfToCItem ? identifiedPdfToCItem.matchDetails.chunkIndex : -1;\n\nflatGoldenHeadings.forEach(goldenHeading => {\n    if (goldenHeading.isMapped) {\n        if (goldenHeading.matchDetails.chunkIndex > searchAfterChunkIndex_Pass2) {\n            searchAfterChunkIndex_Pass2 = goldenHeading.matchDetails.chunkIndex;\n        }\n        return;\n    }\n    if (!goldenHeading.normalizedText) return;\n\n    console.log(`Pass 2 - Seeking: \"${goldenHeading.originalText}\" (L${goldenHeading.level}) (after chunk ${searchAfterChunkIndex_Pass2})`);\n    let potentialMatches = [];\n    for (let chunkIdx = searchAfterChunkIndex_Pass2 + 1; chunkIdx < chunkrChunks.length; chunkIdx++) {\n        if (usedChunkIndices.has(chunkIdx)) continue;\n        const chunk = chunkrChunks[chunkIdx];\n        for (let segIdx = 0; segIdx < chunk.segments.length; segIdx++) {\n            const segment = chunk.segments[segIdx];\n            if (segment.content && normalizeText(segment.content).includes(goldenHeading.normalizedText)) {\n                potentialMatches.push({\n                    chunkIndex: chunkIdx, segmentIndex: segIdx, chunkId: chunk.chunk_id,\n                    matchedSegmentContent: segment.content, segmentType: segment.segment_type\n                });\n            }\n        }\n    }\n    if (!potentialMatches.length) {\n        console.log(`  ❌ PASS 2: NO RAW MATCHES for \"${goldenHeading.originalText}\" (L${goldenHeading.level}).`);\n        
return;\n    }\n    potentialMatches.sort((a, b) => (a.chunkIndex !== b.chunkIndex ? a.chunkIndex - b.chunkIndex : a.segmentIndex - b.segmentIndex));\n    let chosenMatch = potentialMatches.find(match => !usedChunkIndices.has(match.chunkIndex));\n\n    if (chosenMatch) {\n        console.log(`  ✅ PASS 2: \"${goldenHeading.originalText}\" (L${goldenHeading.level}) -> Chunk ${chosenMatch.chunkIndex}, Seg ${chosenMatch.segmentIndex}`);\n        goldenHeading.isMapped = true;\n        goldenHeading.matchDetails = {\n            type: \"Content Includes\", chunkIndex: chosenMatch.chunkIndex, segmentIndex: chosenMatch.segmentIndex, chunkId: chosenMatch.chunkId,\n            matchedSegmentContent: chosenMatch.matchedSegmentContent, sourceSegmentType: chosenMatch.segmentType,\n            sectionText: \"\", sectionHTML: \"\", sectionMarkdown: \"\",\n            sectionStartChunkIndex: chosenMatch.chunkIndex, sectionStartSegmentIndex: chosenMatch.segmentIndex,\n            sectionEndChunkIndex: -1, sectionEndSegmentIndex: -1\n        };\n        usedChunkIndices.add(chosenMatch.chunkIndex);\n        searchAfterChunkIndex_Pass2 = chosenMatch.chunkIndex;\n    } else {\n        console.log(`  ❌ PASS 2: NO SUITABLE UNUSED CHUNK for \"${goldenHeading.originalText}\" (L${goldenHeading.level}).`);\n    }\n});\n\n// --- Consolidate, Sort, and Extract Section Content (Text, HTML, Markdown) ---\nif (identifiedPdfToCItem) {\n    internalProcessedToc.push(identifiedPdfToCItem);\n}\nflatGoldenHeadings.forEach(gh => internalProcessedToc.push(gh));\n\ninternalProcessedToc.sort((a, b) => {\n    const aChunk = a.matchDetails.chunkIndex;\n    const bChunk = b.matchDetails.chunkIndex;\n    const aSeg = a.matchDetails.segmentIndex;\n    const bSeg = b.matchDetails.segmentIndex;\n    if (aChunk !== bChunk) return aChunk - bChunk;\n    if (aSeg !== bSeg) return aSeg - bSeg;\n    return (a.id && b.id) ? 
String(a.id).localeCompare(String(b.id)) : 0;\n});\n\nconsole.log(\"\\n--- Extracting Section Content (Text, HTML, Markdown) for Mapped Items ---\");\n\nfunction getContentStartPoint(headingChunkIdx, headingSegIdx, allChunkrChunks) {\n    let contentStartChunkIdx = headingChunkIdx;\n    let contentStartSegmentIdx = headingSegIdx + 1;\n    if (headingChunkIdx >= allChunkrChunks.length || !allChunkrChunks[headingChunkIdx] || !allChunkrChunks[headingChunkIdx].segments) {\n        return { chunkIdx: headingChunkIdx, segmentIdx: headingSegIdx };\n    }\n    const headingChunk = allChunkrChunks[headingChunkIdx];\n    if (contentStartSegmentIdx >= headingChunk.segments.length) {\n        contentStartChunkIdx++;\n        contentStartSegmentIdx = 0;\n    }\n    return { chunkIdx: contentStartChunkIdx, segmentIdx: contentStartSegmentIdx };\n}\n\nfunction extractSectionContents(contentStartChunkIdx, contentStartSegmentIdx, nextSectionStartChunkIdx, nextSectionStartSegIdx, allChunkrChunks) {\n    let accumulatedText = \"\";\n    let accumulatedHtml = \"\";\n    let accumulatedMarkdown = \"\";\n    for (let cIdx = contentStartChunkIdx; cIdx < allChunkrChunks.length; cIdx++) {\n        const chunk = allChunkrChunks[cIdx];\n        if (!chunk || !chunk.segments) continue;\n        const sStart = (cIdx === contentStartChunkIdx) ? 
contentStartSegmentIdx : 0;\n        let sEnd = chunk.segments.length;\n        if (cIdx === nextSectionStartChunkIdx) sEnd = nextSectionStartSegIdx;\n        for (let sIdx = sStart; sIdx < sEnd; sIdx++) {\n            const segment = chunk.segments[sIdx];\n            if (segment) {\n                if (segment.content) accumulatedText += segment.content + \"\\n\";\n                accumulatedHtml += (segment.html || \"\") + \"\\n\";\n                accumulatedMarkdown += (segment.markdown || \"\") + \"\\n\";\n            }\n        }\n        if (cIdx >= nextSectionStartChunkIdx && nextSectionStartChunkIdx < allChunkrChunks.length) break;\n    }\n    return {\n        text: accumulatedText.trim(),\n        html: accumulatedHtml.trim(),\n        markdown: accumulatedMarkdown.trim()\n    };\n}\n\nfor (let i = 0; i < internalProcessedToc.length; i++) {\n    const currentItem = internalProcessedToc[i];\n    if (!currentItem.isMapped || !currentItem.matchDetails || currentItem.matchDetails.chunkIndex === -1) continue;\n\n    const headingChunkIdx = currentItem.matchDetails.chunkIndex;\n    const headingSegIdx = currentItem.matchDetails.segmentIndex;\n\n    let nextSectionStartChunkIdx = chunkrChunks.length;\n    let nextSectionStartSegIdx = 0;\n\n    for (let j = i + 1; j < internalProcessedToc.length; j++) {\n        if (internalProcessedToc[j].isMapped && internalProcessedToc[j].matchDetails && internalProcessedToc[j].matchDetails.chunkIndex !== -1) {\n            nextSectionStartChunkIdx = internalProcessedToc[j].matchDetails.chunkIndex;\n            nextSectionStartSegIdx = internalProcessedToc[j].matchDetails.segmentIndex;\n            break;\n        }\n    }\n    currentItem.matchDetails.sectionEndChunkIndex = nextSectionStartChunkIdx;\n    currentItem.matchDetails.sectionEndSegmentIndex = nextSectionStartSegIdx;\n    \n    let contentExtractionStartChunk = headingChunkIdx;\n    let contentExtractionStartSegment = headingSegIdx;\n\n    if (currentItem.id && 
String(currentItem.id).startsWith(\"pdf_toc_\")) {\n        contentExtractionStartChunk = headingChunkIdx; // Start of its own chunk\n        contentExtractionStartSegment = 0;             // From the very first segment\n    } else {\n        // For regular headings, content starts *after* the heading's segment\n        const contentStartPoint = getContentStartPoint(headingChunkIdx, headingSegIdx, chunkrChunks);\n        contentExtractionStartChunk = contentStartPoint.chunkIdx;\n        contentExtractionStartSegment = contentStartPoint.segmentIdx;\n    }\n    \n    const sectionContents = extractSectionContents(\n        contentExtractionStartChunk, contentExtractionStartSegment,\n        nextSectionStartChunkIdx, nextSectionStartSegIdx,\n        chunkrChunks\n    );\n\n    currentItem.matchDetails.sectionText = sectionContents.text;\n    currentItem.matchDetails.sectionHTML = sectionContents.html;\n    currentItem.matchDetails.sectionMarkdown = sectionContents.markdown;\n}\n\n// --- Final Output Formatting to a Single n8n Item with an Array of Sections ---\nconst outputSectionsArray = [];\ninternalProcessedToc.forEach(item => {\n    if (item.isMapped && item.matchDetails && item.matchDetails.chunkIndex !== -1) {\n        const detail = item.matchDetails;\n        outputSectionsArray.push({\n            heading: item.originalText,\n            headingLevel: item.level !== undefined ? 
item.level : 1, // Add headingLevel, default to 1 if missing\n            sectionText: detail.sectionText || \"\",\n            sectionHTML: detail.sectionHTML || \"\",\n            sectionMarkdown: detail.sectionMarkdown || \"\",\n            sourceChunkId: detail.chunkId || null,\n            sectionStartChunkIndex: detail.sectionStartChunkIndex,\n            sectionStartSegmentIndex: detail.sectionStartSegmentIndex,\n            sectionEndChunkIndex: detail.sectionEndChunkIndex,\n            sectionEndSegmentIndex: detail.sectionEndSegmentIndex,\n        });\n    }\n});\n\nconsole.log(`--- Processing Complete. Returning 1 item with ${outputSectionsArray.length} mapped sections. ---`);\nif (outputSectionsArray.length > 0) {\n    console.log(\"\\n--- Sample of First Section in Output Array (showing heading and level) ---\");\n    const sample = outputSectionsArray[0];\n    console.log(`Heading: \"${sample.heading}\", Level: ${sample.headingLevel}, TextLen: ${sample.sectionText.length}, HTMLLen: ${sample.sectionHTML.length}, MDLen: ${sample.sectionMarkdown.length}`);\n}\n\nif (outputSectionsArray.length === 0 && (goldenTocArray.length > 0 || chunkrChunks.length > 0)) {\n    console.warn(\"No sections were successfully mapped to output array.\");\n    return [{ json: { warning: \"No sections mapped.\", processedSections: [] } }];\n}\n\nreturn [{ json: { processedSections: outputSectionsArray } }];"
      },
      "typeVersion": 2
    },
    {
      "id": "14257369-a732-45cc-a45d-16408f9408d7",
      "name": "创建 HTML 文档",
      "type": "n8n-nodes-base.code",
      "position": [
        2580,
        140
      ],
      "parameters": {
        "jsCode": "// n8n Code Node: Generate Full HTML Document\n\n// Assuming the input from the previous node is items[0]\nconst inputData = $('Return the whole document').first().json;\nconst processedSections = inputData.processedSections;\n\nif (!processedSections || !Array.isArray(processedSections)) {\n  console.error(\"Error: processedSections array not found in input or is not an array.\");\n  // Return an error or an empty HTML string to prevent workflow failure\n  return [{ json: { error: \"Input data is not in the expected format.\", html_output: \"\", fileName: \"error.html\" } }];\n}\n\nlet fullHtmlContent = \"\";\n\n// Start HTML Document\nfullHtmlContent += \"<!DOCTYPE html>\\n\";\nfullHtmlContent += '<html lang=\"en\">\\n';\nfullHtmlContent += \"<head>\\n\";\nfullHtmlContent += '  <meta charset=\"UTF-8\">\\n';\nfullHtmlContent += '  <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\\n';\n\n// Attempt to use the first H1 heading as the page title\nlet pageTitle = \"Generated Document\";\nif (processedSections.length > 0 && processedSections[0].heading) {\n    // A simple way to find a prominent heading for the title\n    const firstRealHeading = processedSections.find(s => s.headingLevel > 0 && s.heading !== \"Document Table of Contents (Auto-Detected)\");\n    pageTitle = firstRealHeading ? 
firstRealHeading.heading : (processedSections[0].heading || pageTitle);\n}\nfullHtmlContent += `  <title>${pageTitle.replace(/</g, \"&lt;\").replace(/>/g, \"&gt;\")}</title>\\n`;\n\n// Optional: Add some basic styling\nfullHtmlContent += `  <style>\n    body { font-family: -apple-system, BlinkMacSystemFont, \"Segoe UI\", Roboto, Helvetica, Arial, sans-serif; line-height: 1.6; padding: 20px; max-width: 900px; margin: 0 auto; color: #333; }\n    h1, h2, h3, h4, h5, h6 { margin-top: 1.8em; margin-bottom: 0.6em; line-height: 1.2; color: #111; }\n    h1 { font-size: 2.2em; }\n    h2 { font-size: 1.8em; }\n    h3 { font-size: 1.5em; }\n    p { margin-bottom: 1em; }\n    pre, code { font-family: monospace; background-color: #f4f4f4; padding: 2px 4px; border-radius: 3px;}\n    pre { padding: 10px; overflow-x: auto; }\n    table { border-collapse: collapse; width: 100%; margin-bottom: 1em; }\n    th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }\n    th { background-color: #f2f2f2; }\n    img { max-width: 100%; height: auto; display: block; margin: 1em 0; }\n  </style>\\n`;\n\nfullHtmlContent += \"</head>\\n\";\nfullHtmlContent += \"<body>\\n\\n\";\n\nprocessedSections.forEach(section => {\n  // Ensure heading and headingLevel are present and valid\n  if (section.heading && typeof section.headingLevel === 'number' && section.headingLevel > 0 && section.headingLevel <= 6) {\n    // Basic sanitization for heading text to prevent HTML injection if source is untrusted\n    const safeHeading = section.heading.replace(/</g, \"&lt;\").replace(/>/g, \"&gt;\");\n    fullHtmlContent += `<h${section.headingLevel}>${safeHeading}</h${section.headingLevel}>\\n`;\n  }\n\n  // Append the section's HTML content\n  // It's assumed section.sectionHTML is already valid HTML for the body of the section\n  fullHtmlContent += (section.sectionHTML || \"\") + \"\\n\\n\";\n});\n\nfullHtmlContent += \"</body>\\n\";\nfullHtmlContent += \"</html>\";\n\n// Output the generated HTML 
string. This can then be used by a \"Write Binary File\" node\n// or other nodes that can handle file content.\nreturn [{ json: { fullHtmlContent} }];"
      },
      "typeVersion": 2
    },
    {
      "id": "3c472dcc-492b-4eb1-8c4d-bef5565047ba",
      "name": "HTML",
      "type": "n8n-nodes-base.html",
      "position": [
        2780,
        140
      ],
      "parameters": {
        "html": "{{ $json.fullHtmlContent }}"
      },
      "typeVersion": 1.2
    },
    {
      "id": "b912a32d-ae45-40f7-a5a4-e180cf306c6e",
      "name": "移动二进制数据",
      "type": "n8n-nodes-base.moveBinaryData",
      "position": [
        3000,
        140
      ],
      "parameters": {
        "mode": "jsonToBinary",
        "options": {
          "fileName": "={{ $('Set File Name').item.json.fileNameSnake }}.html",
          "mimeType": "text/html",
          "useRawData": true
        },
        "sourceKey": "html",
        "convertAllData": false
      },
      "typeVersion": 1
    },
    {
      "id": "10b96111-14e2-4061-82ae-643fb243894d",
      "name": "创建 Markdown 文档",
      "type": "n8n-nodes-base.code",
      "position": [
        2600,
        500
      ],
      "parameters": {
        "jsCode": "// n8n Code Node: Generate Full Markdown Document\n\n// Assuming the input from the previous node is items[0]\nconst inputData = $('Return the whole document').first().json;\nconst processedSections = inputData.processedSections;\n\nif (!processedSections || !Array.isArray(processedSections)) {\n  console.error(\"Error: processedSections array not found in input or is not an array.\");\n  return [{ json: { error: \"Input data is not in the expected format.\", markdown_output: \"\", fileName: \"error.md\" } }];\n}\n\nlet fullMarkdownContent = \"\";\n\nprocessedSections.forEach(section => {\n  // Ensure heading and headingLevel are present and valid\n  if (section.heading && typeof section.headingLevel === 'number' && section.headingLevel > 0) {\n    // Repeat '#' for the heading level\n    const markdownHeaderPrefix = '#'.repeat(section.headingLevel);\n    fullMarkdownContent += `${markdownHeaderPrefix} ${section.heading}\\n\\n`;\n  }\n\n  // Append the section's Markdown content\n  // It's assumed section.sectionMarkdown is already valid Markdown for the body of the section\n  fullMarkdownContent += (section.sectionMarkdown || \"\") + \"\\n\\n\"; // Add extra newline for spacing between sections\n});\n\n// Trim any excessive newlines at the very end\nfullMarkdownContent = fullMarkdownContent.trim();\n\n// Output the generated Markdown string.\nreturn [{ json: { fullMarkdownContent} }];"
      },
      "typeVersion": 2
    },
    {
      "id": "8eeb8129-e5e5-434f-b33a-35c7ea465a6d",
      "name": "转换为文件",
      "type": "n8n-nodes-base.convertToFile",
      "position": [
        2780,
        500
      ],
      "parameters": {
        "options": {
          "fileName": "={{ $('Set File Name').first().json.fileNameSnake }}.md"
        },
        "operation": "toText",
        "sourceProperty": "fullMarkdownContent"
      },
      "typeVersion": 1.1
    },
    {
      "id": "f50391c5-8ebc-48a6-a9d1-4c0d835ff5ea",
      "name": "便签6",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        1260,
        -600
      ],
      "parameters": {
        "width": 560,
        "height": 400,
        "content": "### 节点:AI Agent(AI Agent)"
      },
      "typeVersion": 1
    },
    {
      "id": "c03c377b-6204-40e6-8079-56c726e8f8a8",
      "name": "便签7",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        1940,
        -640
      ],
      "parameters": {
        "width": 400,
        "height": 400,
        "content": "## 单独返回每个章节"
      },
      "typeVersion": 1
    },
    {
      "id": "2b6e50ac-702a-4ff9-86cf-7fbde7a58dba",
      "name": "便签8",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        1980,
        60
      ],
      "parameters": {
        "width": 400,
        "height": 400,
        "content": "## ... 或者返回整个文档"
      },
      "typeVersion": 1
    },
    {
      "id": "c4ec2654-ed9d-4bc3-a1a4-3e8f44aa115d",
      "name": "### 替换 Airtable 连接",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -2600,
        -820
      ],
      "parameters": {
        "width": 1540,
        "height": 260,
        "content": "# 将 PDF 转换为具有正确子标题层次结构的结构化 JSON"
      },
      "typeVersion": 1
    },
    {
      "id": "a887cdad-a2c3-4477-a6ed-72d007f560a2",
      "name": "GET Chunkr 任务",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        -260,
        -60
      ],
      "parameters": {
        "url": "=https://api.chunkr.ai/api/v1/task/{{ $('POST Chunkr Task').item.json.task_id }}",
        "options": {},
        "sendHeaders": true,
        "headerParameters": {
          "parameters": [
            {
              "name": "Authorization",
              "value": "<YOUR_CHUNKR_API_KEY>"
            }
          ]
        }
      },
      "typeVersion": 4.2
    },
    {
      "id": "cd52c24f-1698-4887-8d5d-248eb0a904fd",
      "name": "POST Chunkr 任务",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        -700,
        20
      ],
      "parameters": {
        "url": "https://api.chunkr.ai/api/v1/task/parse",
        "method": "POST",
        "options": {},
        "jsonBody": "={\n  \"chunk_processing\": null,\n  \"error_handling\": null,\n  \"expires_in\": 123,\n  \"file\": \"{{ $('Convert the PDF to base64').item.json.data}}\",\n  \"file_name\": \"{{ $json.fileNameSnake }}\",\n  \"high_resolution\": false,\n  \"llm_processing\": null,\n  \"ocr_strategy\": null,\n  \"pipeline\": null,\n  \"segment_processing\": null,\n  \"segmentation_strategy\": null\n}",
        "sendBody": true,
        "sendHeaders": true,
        "specifyBody": "json",
        "headerParameters": {
          "parameters": [
            {
              "name": "Authorization",
              "value": "=<YOUR_CHUNKR_API_KEY>"
            }
          ]
        }
      },
      "typeVersion": 4.2
    },
    {
      "id": "5350ae70-85e1-44d2-bdb1-47b7a02708e5",
      "name": "合并",
      "type": "n8n-nodes-base.merge",
      "position": [
        -1380,
        20
      ],
      "parameters": {},
      "typeVersion": 3.2
    }
  ],
  "pinData": {
    "When Executed by Another Workflow": [
      {
        "URL": "https://register.awmf.org/assets/guidelines/001-036l_S1_Management-des-erwartet-schwierigen-Atemwegs-beim-Kind_2021-05.pdf"
      }
    ],
    "When clicking ‘Execute workflow’": [
      {}
    ]
  },
  "connections": {
    "HTML": {
      "main": [
        [
          {
            "node": "Move Binary Data",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Merge": {
      "main": [
        [
          {
            "node": "Convert the PDF to base64",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Status is:": {
      "main": [
        [
          {
            "node": "Take beginning of Document to look for Table of contents",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Wait Before Polling the Chunkr Result",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Stop and Error",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Set File Name": {
      "main": [
        [
          {
            "node": "POST Chunkr Task",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "GET Chunkr Task": {
      "main": [
        [
          {
            "node": "Status is:",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "POST Chunkr Task": {
      "main": [
        [
          {
            "node": "Wait Before Polling the Chunkr Result",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Create HTML document": {
      "main": [
        [
          {
            "node": "HTML",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Download PDF from URL": {
      "main": [
        [
          {
            "node": "Merge",
            "type": "main",
            "index": 1
          }
        ]
      ]
    },
    "Table of Content Agent": {
      "main": [
        [
          {
            "node": "Return each section individually",
            "type": "main",
            "index": 0
          },
          {
            "node": "Return the whole document",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Create Markdown Document": {
      "main": [
        [
          {
            "node": "Convert to File",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Google Gemini Chat Model": {
      "ai_languageModel": [
        [
          {
            "node": "Table of Content Agent",
            "type": "ai_languageModel",
            "index": 0
          }
        ]
      ]
    },
    "Structured Output Parser": {
      "ai_outputParser": [
        [
          {
            "node": "Auto-fixing Output Parser",
            "type": "ai_outputParser",
            "index": 0
          }
        ]
      ]
    },
    "Auto-fixing Output Parser": {
      "ai_outputParser": [
        [
          {
            "node": "Table of Content Agent",
            "type": "ai_outputParser",
            "index": 0
          }
        ]
      ]
    },
    "Convert the PDF to base64": {
      "main": [
        [
          {
            "node": "Set File Name",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Google Gemini Chat Model1": {
      "ai_languageModel": [
        [
          {
            "node": "Auto-fixing Output Parser",
            "type": "ai_languageModel",
            "index": 0
          }
        ]
      ]
    },
    "Return the whole document": {
      "main": [
        [
          {
            "node": "Create HTML document",
            "type": "main",
            "index": 0
          },
          {
            "node": "Create Markdown Document",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Download PDF from Google Drive": {
      "main": [
        [
          {
            "node": "Merge",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Return each section individually": {
      "main": [
        []
      ]
    },
    "When Executed by Another Workflow": {
      "main": [
        [
          {
            "node": "Download PDF from URL",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract Sections headers as fallback": {
      "main": [
        [
          {
            "node": "Table of Content Agent",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "当点击\"执行工作流\"时": {
      "main": [
        [
          {
            "node": "Download PDF from Google Drive",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Wait Before Polling the Chunkr Result": {
      "main": [
        [
          {
            "node": "GET Chunkr Task",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Take beginning of Document to look for Table of contents": {
      "main": [
        [
          {
            "node": "Extract Sections headers as fallback",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。

这个工作流适合什么场景?

高级 - 人工智能, IT 运维

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。

工作流信息
难度等级
高级
节点数量36
分类2
节点类型19
难度说明

适合高级用户,包含 16+ 个节点的复杂工作流

作者
Lukas Kunhardt

Lukas Kunhardt

@lukaskunhardt

Automation Expert with a focus on structured data extraction from messy inputs.

外部链接
在 n8n.io 查看

分享此工作流