{
  "meta": {
    "instanceId": "9a562c06a632241f66aadd52a495ad98e76b760ef5cfce9c319a4759c47cd94e"
  },
  "nodes": [
    {
      "id": "ed429607-b22c-494c-b767-7dc2eca5a561",
      "name": "便签",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -2160,
        -112
      ],
      "parameters": {
        "width": 720,
        "height": 592,
        "content": "# n8n 工作流说明：网络爬虫"
      },
      "typeVersion": 1
    },
    {
      "id": "26230b6f-528a-41fa-b9f0-9597659e2f23",
      "name": "便签1",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1376,
        -112
      ],
      "parameters": {
        "width": 800,
        "height": 1136,
        "content": "## 逐步详细分解"
      },
      "typeVersion": 1
    },
    {
      "id": "c3ea4128-8963-4000-af38-e7f2be48bb7e",
      "name": "Webhook",
      "type": "n8n-nodes-base.webhook",
      "position": [
        -2128,
        -336
      ],
      "webhookId": "603a09ed-516c-4c7d-bad3-b05b030503a2",
      "parameters": {
        "path": "603a09ed-516c-4c7d-bad3-b05b030503a2",
        "options": {
          "rawBody": false
        },
        "httpMethod": "POST",
        "responseMode": "responseNode"
      },
      "typeVersion": 2.1
    },
    {
      "id": "a35808cb-d2ea-4797-86a6-a36670377560",
      "name": "Loop Links (Batches)",
      "type": "n8n-nodes-base.splitInBatches",
      "notes": "Iterates through the queue of links to be crawled one at a time.",
      "position": [
        48,
        -480
      ],
      "parameters": {
        "options": {
          "reset": false
        },
        "batchSize": 1
      },
      "executeOnce": false,
      "typeVersion": 1
    },
    {
      "id": "798444a5-0df4-4727-818f-657901ad60a1",
      "name": "IF Crawl Depth OK?",
      "type": "n8n-nodes-base.if",
      "notes": "Validates whether the current depth is below the maximum depth allowed.",
      "onError": "continueRegularOutput",
      "position": [
        -352,
        -464
      ],
      "parameters": {
        "conditions": {
          "number": [
            {
              "value1": "={{ $json.depth }}",
              "value2": "={{ $json.maxDepth }}",
              "operation": "smallerEqual"
            }
          ],
          "string": [
            {
              "value1": "={{ $json.type }}",
              "value2": "link"
            }
          ]
        }
      },
      "typeVersion": 1
    },
    {
      "id": "ecc2707f-0605-4c88-98eb-8c8ea234e9ff",
      "name": "Extract Body & Links",
      "type": "n8n-nodes-base.html",
      "notes": "Parses HTML content and extracts body text and anchor href links.",
      "position": [
        -784,
        -464
      ],
      "parameters": {
        "options": {
          "trimValues": true,
          "cleanUpText": true
        },
        "operation": "extractHtmlContent",
        "extractionValues": {
          "values": [
            {
              "key": "links",
              "attribute": "href",
              "cssSelector": "a[href]",
              "returnArray": true,
              "returnValue": "attribute"
            },
            {
              "key": "content",
              "cssSelector": "body"
            }
          ]
        }
      },
      "typeVersion": 1
    },
    {
      "id": "d4dfda4a-e20a-4014-b024-c0fde8f41aed",
      "name": "Attach URL/Depth to HTML",
      "type": "n8n-nodes-base.code",
      "position": [
        -976,
        -464
      ],
      "parameters": {
        "mode": "runOnceForEachItem",
        "jsCode": "  return {\n    json: {\n      url:$('Seed Root Crawl Item').item.json.url,\n      depth: $('Seed Root Crawl Item').item.json.depth,\n     ...item.json // Preserve original HTML response (optional)\n    }\n  };\n"
      },
      "typeVersion": 2
    },
    {
      "id": "239040b9-3c08-47d9-a188-18776817df23",
      "name": "Fetch HTML Page",
      "type": "n8n-nodes-base.httpRequest",
      "notes": "Makes HTTP request to fetch the content of the current URL.",
      "onError": "continueRegularOutput",
      "position": [
        -1200,
        -464
      ],
      "parameters": {
        "url": "={{ $json.url }}",
        "options": {
          "timeout": 5000,
          "response": {
            "response": {}
          }
        }
      },
      "typeVersion": 4.2
    },
    {
      "id": "3d960fb8-2224-4f50-becf-b2f03bd7de6e",
      "name": "Seed Root Crawl Item",
      "type": "n8n-nodes-base.merge",
      "position": [
        -1408,
        -464
      ],
      "parameters": {
        "mode": "combine",
        "options": {
          "clashHandling": {
            "values": {
              "resolveClash": "preferLast",
              "overrideEmpty": true
            }
          },
          "includeUnpaired": true
        },
        "combineBy": "combineByPosition"
      },
      "typeVersion": 3.2
    },
    {
      "id": "3e02f965-84f5-40da-90d4-ae91bbf0434e",
      "name": "Collect Pages & Emit When Done",
      "type": "n8n-nodes-base.code",
      "position": [
        32,
        -288
      ],
      "parameters": {
        "jsCode": "const s = $getWorkflowStaticData('global');\nif (!s.pages) s.pages = [];\ns.pages.push({\n  url: $json.url,\n  depth: $json.depth,\n  content: $json.content\n});\nconsole.log(s.pending)\nif (s.pending <= 0) {\n  const pages = s.pages || [];\n  let combinedContent = pages.map(page => `URL: ${page.url}\\nDepth: ${page.depth}\\nContent: ${page.content}\\n`).join('\\n-----------------\\n');\n  return { json: { content: combinedContent } };\n} else {\n  return [];\n}"
      },
      "typeVersion": 2
    },
    {
      "id": "63f581a0-4794-4908-be22-dda1136e7593",
      "name": "Store Page Data",
      "type": "n8n-nodes-base.set",
      "notes": "Captures the URL, page content, and depth for storage or export.",
      "position": [
        -128,
        -304
      ],
      "parameters": {
        "values": {
          "number": [
            {
              "name": "depth",
              "value": "={{ $json.depth || 0 }}"
            }
          ],
          "string": [
            {
              "name": "url",
              "value": "={{ $json.url || '' }}"
            },
            {
              "name": "content",
              "value": "={{ $json.content || '' }}"
            }
          ]
        },
        "options": {},
        "keepOnlySet": true
      },
      "typeVersion": 2
    },
    {
      "id": "c3cf4541-c31f-4257-8729-44f8ed211bcd",
      "name": "Merge Web Pages",
      "type": "n8n-nodes-base.merge",
      "position": [
        208,
        -176
      ],
      "parameters": {},
      "typeVersion": 3.2
    },
    {
      "id": "a7d480bc-ef4b-4cad-989f-0eda36a26a00",
      "name": "Combine & Chunk",
      "type": "n8n-nodes-base.code",
      "position": [
        400,
        -176
      ],
      "parameters": {
        "jsCode": "/* Combine static pages + extra JSON, then chunk pages for model calls */\nconst s = $getWorkflowStaticData('global');\nif (!s.pages) s.pages = [];\n\nfunction normPage(p = {}) {\n  return {\n    url: p.url || '',\n    depth: p.depth ?? null,\n    content: typeof p.content === 'string' ? p.content : ''\n  };\n}\n\nconst incomingPageItems = items\n  .filter(i => typeof i.json.content === 'string')\n  .map(i => normPage(i.json));\n\nconst storedPages = (s.pages || []).map(normPage);\nconst pages = storedPages.length ? storedPages : incomingPageItems;\n\nconst extraJson = items\n  .filter(i => typeof i.json.content !== 'string')\n  .map(i => i.json);\n\nlet combinedContent = pages\n  .map(p => `URL: ${p.url}\\nDepth: ${p.depth}\\nContent:\\n${p.content}\\n`)\n  .join('\\n-----------------\\n');\n\nif (extraJson.length) {\n  combinedContent += `\\n\\nLINKEDIN_DATA::\\n\\n${JSON.stringify(extraJson)}`;\n}\n\nconst CHUNK_SIZE = 5;\nconst MAX_CHARS_PER_BATCH = 12000;\n\nfunction chunkByChars(arr, maxChars) {\n  const batches = [];\n  let current = [];\n  let chars = 0;\n  for (const it of arr) {\n    const len = (it.content || '').length;\n    if (current.length && chars + len > maxChars) {\n      batches.push(current);\n      current = [];\n      chars = 0;\n    }\n    current.push(it);\n    chars += len;\n  }\n  if (current.length) batches.push(current);\n  return batches;\n}\n\nconst charBatches = chunkByChars(pages, MAX_CHARS_PER_BATCH);\nconst groups = [];\nfor (const batch of charBatches) {\n  for (let i = 0; i < batch.length; i += CHUNK_SIZE) {\n    groups.push(batch.slice(i, i + CHUNK_SIZE));\n  }\n}\n\nreturn groups.length\n  ? groups.map((g, idx) => ({ json: { batchIndex: idx, pages: g, combinedContent,accId:s.accountId } }))\n  : [{ json: { batchIndex: 0, pages: [], combinedContent } }];\n"
      },
      "typeVersion": 2
    },
    {
      "id": "1e36bc72-2db7-4ce7-a42e-51609a0c9065",
      "name": "Respond to Webhook",
      "type": "n8n-nodes-base.respondToWebhook",
      "position": [
        608,
        -176
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 1.4
    },
    {
      "id": "99f16b20-3398-45a9-a652-7b51351283b2",
      "name": "Init Globals",
      "type": "n8n-nodes-base.code",
      "notes": "Initializes the pending count in static data for crawl completion tracking.",
      "position": [
        -1632,
        -336
      ],
      "parameters": {
        "mode": "runOnceForEachItem",
        "jsCode": "const s = $getWorkflowStaticData('global');\ns.pending = 1;\ns.visited = [];\ns.queued = {};\ns.pages = [];\n\n// Ensure url has a scheme so URL() won't throw\nconst ensureUrl = u => (/^https?:\\/\\//i.test(u) ? u : `https://${u}`);\n\ntry {\n  $json.domain = new URL(ensureUrl($json.url)).hostname; // => \"www.crmaiinsight.com\"\n} catch (e) {\n  // Fallback if url is malformed\n  $json.domain = String($json.url || '')\n    .replace(/^[a-z]+:\\/\\//i, '')\n    .replace(/\\/.*$/, '')\n    .replace(/:\\d+$/, '');\n}\n\nreturn $json;\n"
      },
      "typeVersion": 2
    },
    {
      "id": "e56c711e-c7eb-4024-bd31-66680514d62c",
      "name": "Init Crawl Params",
      "type": "n8n-nodes-base.set",
      "notes": "Defines the root URL, domain name, and max crawl depth.",
      "position": [
        -1856,
        -336
      ],
      "parameters": {
        "values": {
          "number": [
            {
              "name": "maxDepth",
              "value": 3
            },
            {
              "name": "depth"
            }
          ],
          "string": [
            {
              "name": "url",
              "value": "={{ $json.body.url }}"
            },
            {
              "name": "domain",
              "value": "={{ $json.body.url }}"
            }
          ]
        },
        "options": {},
        "keepOnlySet": true
      },
      "typeVersion": 2
    },
    {
      "id": "29bf5f0a-97dc-4631-a485-f7ef9bcfd852",
      "name": "Requeue Link Item",
      "type": "n8n-nodes-base.code",
      "notes": "Removes internal 'type' field and re-enqueues the link for next crawl.",
      "position": [
        -144,
        -480
      ],
      "parameters": {
        "mode": "runOnceForEachItem",
        "jsCode": "const s = $getWorkflowStaticData('global');\n\ndelete $json.type\nreturn item;"
      },
      "typeVersion": 2
    },
    {
      "id": "3f81f588-a041-4ae9-92b5-2f79ae855355",
      "name": "Queue & Dedup Links",
      "type": "n8n-nodes-base.code",
      "notes": "Cleans and deduplicates links. Tracks visited URLs. Prepares next crawl queue.",
      "onError": "continueRegularOutput",
      "position": [
        -560,
        -464
      ],
      "parameters": {
        "jsCode": "const staticData = $getWorkflowStaticData('global');\nif (!Array.isArray(staticData.visited)) staticData.visited = [];\nif (typeof staticData.pending !== 'number') staticData.pending = 0;\nif (!staticData.queued || typeof staticData.queued !== 'object') staticData.queued = {};\n\nconst currentUrl = $('Attach URL/Depth to HTML').item.json.url.replace(/\\/+$/, '');\nconst currentDepth = $('Attach URL/Depth to HTML').item.json.depth || 0;\nconst maxDepth = $('Seed Root Crawl Item').first().json.maxDepth;\nconst domainParamRaw = ($('Init Crawl Params').first().json.domain || '').toString();\nconst content = typeof $json.content === 'string' ? $json.content : '';\n\nconst PROTO_RE = /^[a-zA-Z][a-zA-Z0-9+.-]*:\\/\\//;\n\n// Normalize a host string: strip protocol, path, and leading \"www.\"\nfunction hostOf(u) {\n  if (!u) return '';\n  let s = u.toString();\n  if (PROTO_RE.test(s)) s = s.replace(PROTO_RE, '');\n  const i = s.indexOf('/');\n  if (i !== -1) s = s.slice(0, i);\n  return s.toLowerCase().replace(/^www\\./, '');\n}\n\n// Build absolute URL from href + base without using URL()\nfunction toAbsolute(href, base) {\n  if (!href) return '';\n  const h = href.trim();\n  if (PROTO_RE.test(h)) return h.replace(/\\/+$/, '');\n  if (h.startsWith('//')) {\n    const proto = (base.match(PROTO_RE) || ['https://'])[0];\n    return (proto + h.slice(2)).replace(/\\/+$/, '');\n  }\n  if (h.startsWith('/')) {\n    const baseHost = base.replace(PROTO_RE, '').split('/')[0];\n    const proto = (base.match(PROTO_RE) || ['https://'])[0];\n    return (proto + baseHost + h).replace(/\\/+$/, '');\n  }\n  // relative path\n  let dir = base;\n  if (!dir.endsWith('/')) {\n    const cut = dir.lastIndexOf('/');\n    dir = cut > (dir.indexOf('://') + 2) ? dir.slice(0, cut + 1) : (dir + '/');\n  }\n  return (dir + h).replace(/\\/+$/, '');\n}\n\nfunction extractHostname(abs) {\n  let s = abs.replace(PROTO_RE, '');\n  const i = s.indexOf('/');\n  const host = (i === -1 ? s : s.slice(0, i)).toLowerCase();\n  return host.replace(/^www\\./, '');\n}\n\nconst allowedHost = hostOf(domainParamRaw) || hostOf(currentUrl);\nconst currentHost  = hostOf(currentUrl);\n\n// mark current as visited & dequeue\nif (!staticData.visited.includes(currentUrl)) staticData.visited.push(currentUrl);\ndelete staticData.queued[currentUrl];\n\nconst links = Array.isArray($json.links) ? $json.links : [];\nconst newLinks = [];\nconst queuedLocal = new Set();\n\nfor (const link of links) {\n  if (!link) continue;\n  const l = String(link).trim();\n  if (!l || l.startsWith('mailto:') || l.startsWith('tel:') || l.startsWith('javascript:')) continue;\n  if (l.includes('#')) continue;\n  if (/\\.(pdf|docx?|xlsx?|pptx?)($|\\?)/i.test(l)) continue;\n\n  const absolute = toAbsolute(l, currentUrl);\n  const host = extractHostname(absolute);\n\n  // treat apex and www as same-site\n  const sameSite = (host === allowedHost) || (host === currentHost);\n\n  if (\n    sameSite &&\n    !staticData.visited.includes(absolute) &&\n    !staticData.queued[absolute] &&\n    !queuedLocal.has(absolute) &&\n    currentDepth < maxDepth\n  ) {\n    newLinks.push({\n      json: { url: absolute, depth: currentDepth + 1, type: 'link', maxDepth }\n    });\n    queuedLocal.add(absolute);\n    staticData.queued[absolute] = true;\n  }\n}\n\nstaticData.pending += newLinks.length;\nstaticData.pending--; // this page done\n\nreturn newLinks.concat({\n  json: { url: currentUrl, depth: currentDepth, content, type: 'page', maxDepth }\n});\n"
      },
      "typeVersion": 2
    }
  ],
  "pinData": {},
  "connections": {
    "Webhook": {
      "main": [
        [
          {
            "node": "Init Crawl Params",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Init Globals": {
      "main": [
        [
          {
            "node": "Seed Root Crawl Item",
            "type": "main",
            "index": 0
          },
          {
            "node": "Merge Web Pages",
            "type": "main",
            "index": 1
          }
        ]
      ]
    },
    "Combine & Chunk": {
      "main": [
        [
          {
            "node": "Respond to Webhook",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Fetch HTML Page": {
      "main": [
        [
          {
            "node": "Attach URL/Depth to HTML",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Merge Web Pages": {
      "main": [
        [
          {
            "node": "Combine & Chunk",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Store Page Data": {
      "main": [
        [
          {
            "node": "Collect Pages & Emit When Done",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Init Crawl Params": {
      "main": [
        [
          {
            "node": "Init Globals",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Requeue Link Item": {
      "main": [
        [
          {
            "node": "Loop Links (Batches)",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "IF Crawl Depth OK?": {
      "main": [
        [
          {
            "node": "Requeue Link Item",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Store Page Data",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Queue & Dedup Links": {
      "main": [
        [
          {
            "node": "IF Crawl Depth OK?",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract Body & Links": {
      "main": [
        [
          {
            "node": "Queue & Dedup Links",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Loop Links (Batches)": {
      "main": [
        [
          {
            "node": "Seed Root Crawl Item",
            "type": "main",
            "index": 1
          }
        ]
      ]
    },
    "Seed Root Crawl Item": {
      "main": [
        [
          {
            "node": "Fetch HTML Page",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Attach URL/Depth to HTML": {
      "main": [
        [
          {
            "node": "Extract Body & Links",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Collect Pages & Emit When Done": {
      "main": [
        [
          {
            "node": "Merge Web Pages",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题
如何使用这个工作流？

复制上方的 JSON 配置代码，在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」，粘贴配置后根据需要修改凭证设置即可。
这个工作流适合什么场景？

本工作流属于高级难度，适用于内容创作、多模态 AI 等场景。
需要付费吗？

本工作流完全免费，您可以直接导入使用。但请注意，工作流中使用的第三方服务（如 OpenAI API）可能需要您自行付费。
特定域名网页内容爬虫，带深度控制和文本提取

使用的节点 (18)

分类

如何使用这个工作流？

这个工作流适合什么场景？

需要付费吗？

相关工作流推荐