8
n8n 中文网amn8n.com

特定域名网页内容爬虫,带深度控制和文本提取

高级

这是一个 Content Creation、Multimodal AI 领域的自动化工作流,包含 18 个节点,主要使用 If、Set、Code、Html、Merge 等节点。功能:特定域名网页内容爬虫,带深度控制和文本提取。

前置要求
  • HTTP Webhook 端点(n8n 会自动生成)
  • 可能需要目标 API 的认证凭证
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "meta": {
    "instanceId": "9a562c06a632241f66aadd52a495ad98e76b760ef5cfce9c319a4759c47cd94e"
  },
  "nodes": [
    {
      "id": "ed429607-b22c-494c-b767-7dc2eca5a561",
      "name": "便签",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -2160,
        -112
      ],
      "parameters": {
        "width": 720,
        "height": 592,
        "content": "# n8n 工作流说明:网络爬虫"
      },
      "typeVersion": 1
    },
    {
      "id": "26230b6f-528a-41fa-b9f0-9597659e2f23",
      "name": "便签1",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1376,
        -112
      ],
      "parameters": {
        "width": 800,
        "height": 1136,
        "content": "## 逐步详细分解"
      },
      "typeVersion": 1
    },
    {
      "id": "c3ea4128-8963-4000-af38-e7f2be48bb7e",
      "name": "Webhook",
      "type": "n8n-nodes-base.webhook",
      "position": [
        -2128,
        -336
      ],
      "webhookId": "603a09ed-516c-4c7d-bad3-b05b030503a2",
      "parameters": {
        "path": "603a09ed-516c-4c7d-bad3-b05b030503a2",
        "options": {
          "rawBody": false
        },
        "httpMethod": "POST",
        "responseMode": "responseNode"
      },
      "typeVersion": 2.1
    },
    {
      "id": "a35808cb-d2ea-4797-86a6-a36670377560",
      "name": "循环链接(批次)",
      "type": "n8n-nodes-base.splitInBatches",
      "notes": "Iterates through the queue of links to be crawled one at a time.",
      "position": [
        48,
        -480
      ],
      "parameters": {
        "options": {
          "reset": false
        },
        "batchSize": 1
      },
      "executeOnce": false,
      "typeVersion": 1
    },
    {
      "id": "798444a5-0df4-4727-818f-657901ad60a1",
      "name": "IF 抓取深度是否合适?",
      "type": "n8n-nodes-base.if",
      "notes": "Validates whether the current depth is below the maximum depth allowed.",
      "onError": "continueRegularOutput",
      "position": [
        -352,
        -464
      ],
      "parameters": {
        "conditions": {
          "number": [
            {
              "value1": "={{ $json.depth }}",
              "value2": "={{ $json.maxDepth }}",
              "operation": "smallerEqual"
            }
          ],
          "string": [
            {
              "value1": "={{ $json.type }}",
              "value2": "link"
            }
          ]
        }
      },
      "typeVersion": 1
    },
    {
      "id": "ecc2707f-0605-4c88-98eb-8c8ea234e9ff",
      "name": "提取正文和链接",
      "type": "n8n-nodes-base.html",
      "notes": "Parses HTML content and extracts body text and anchor href links.",
      "position": [
        -784,
        -464
      ],
      "parameters": {
        "options": {
          "trimValues": true,
          "cleanUpText": true
        },
        "operation": "extractHtmlContent",
        "extractionValues": {
          "values": [
            {
              "key": "links",
              "attribute": "href",
              "cssSelector": "a[href]",
              "returnArray": true,
              "returnValue": "attribute"
            },
            {
              "key": "content",
              "cssSelector": "body"
            }
          ]
        }
      },
      "typeVersion": 1
    },
    {
      "id": "d4dfda4a-e20a-4014-b024-c0fde8f41aed",
      "name": "将 URL/深度附加到 HTML",
      "type": "n8n-nodes-base.code",
      "position": [
        -976,
        -464
      ],
      "parameters": {
        "mode": "runOnceForEachItem",
        "jsCode": "// Re-attach the crawl URL and depth of the item emitted by the '种子根抓取项' node\n// to the HTTP response, so downstream nodes know which page this HTML belongs to.\n// The node is looked up by the name it actually has in this workflow; a name that\n// does not exist in the workflow makes the lookup fail at runtime.\nconst seed = $('种子根抓取项').item.json;\n\nreturn {\n  json: {\n    url: seed.url,\n    depth: seed.depth,\n    ...item.json // Preserve original HTML response (optional); its keys win on clash\n  }\n};\n"
      },
      "typeVersion": 2
    },
    {
      "id": "239040b9-3c08-47d9-a188-18776817df23",
      "name": "获取 HTML 页面",
      "type": "n8n-nodes-base.httpRequest",
      "notes": "Makes HTTP request to fetch the content of the current URL.",
      "onError": "continueRegularOutput",
      "position": [
        -1200,
        -464
      ],
      "parameters": {
        "url": "={{ $json.url }}",
        "options": {
          "timeout": 5000,
          "response": {
            "response": {}
          }
        }
      },
      "typeVersion": 4.2
    },
    {
      "id": "3d960fb8-2224-4f50-becf-b2f03bd7de6e",
      "name": "种子根抓取项",
      "type": "n8n-nodes-base.merge",
      "position": [
        -1408,
        -464
      ],
      "parameters": {
        "mode": "combine",
        "options": {
          "clashHandling": {
            "values": {
              "resolveClash": "preferLast",
              "overrideEmpty": true
            }
          },
          "includeUnpaired": true
        },
        "combineBy": "combineByPosition"
      },
      "typeVersion": 3.2
    },
    {
      "id": "3e02f965-84f5-40da-90d4-ae91bbf0434e",
      "name": "收集页面并在完成时发出",
      "type": "n8n-nodes-base.code",
      "position": [
        32,
        -288
      ],
      "parameters": {
        "jsCode": "const s = $getWorkflowStaticData('global');\nif (!s.pages) s.pages = [];\ns.pages.push({\n  url: $json.url,\n  depth: $json.depth,\n  content: $json.content\n});\nconsole.log(s.pending)\nif (s.pending <= 0) {\n  const pages = s.pages || [];\n  let combinedContent = pages.map(page => `URL: ${page.url}\\nDepth: ${page.depth}\\nContent: ${page.content}\\n`).join('\\n-----------------\\n');\n  return { json: { content: combinedContent } };\n} else {\n  return [];\n}"
      },
      "typeVersion": 2
    },
    {
      "id": "63f581a0-4794-4908-be22-dda1136e7593",
      "name": "存储页面数据",
      "type": "n8n-nodes-base.set",
      "notes": "Captures the URL, page content, and depth for storage or export.",
      "position": [
        -128,
        -304
      ],
      "parameters": {
        "values": {
          "number": [
            {
              "name": "depth",
              "value": "={{ $json.depth || 0 }}"
            }
          ],
          "string": [
            {
              "name": "url",
              "value": "={{ $json.url || '' }}"
            },
            {
              "name": "content",
              "value": "={{ $json.content || '' }}"
            }
          ]
        },
        "options": {},
        "keepOnlySet": true
      },
      "typeVersion": 2
    },
    {
      "id": "c3cf4541-c31f-4257-8729-44f8ed211bcd",
      "name": "合并网页",
      "type": "n8n-nodes-base.merge",
      "position": [
        208,
        -176
      ],
      "parameters": {},
      "typeVersion": 3.2
    },
    {
      "id": "a7d480bc-ef4b-4cad-989f-0eda36a26a00",
      "name": "组合和分块",
      "type": "n8n-nodes-base.code",
      "position": [
        400,
        -176
      ],
      "parameters": {
        "jsCode": "/* Combine static pages + extra JSON, then chunk pages for model calls */\nconst s = $getWorkflowStaticData('global');\nif (!s.pages) s.pages = [];\n\nfunction normPage(p = {}) {\n  return {\n    url: p.url || '',\n    depth: p.depth ?? null,\n    content: typeof p.content === 'string' ? p.content : ''\n  };\n}\n\nconst incomingPageItems = items\n  .filter(i => typeof i.json.content === 'string')\n  .map(i => normPage(i.json));\n\nconst storedPages = (s.pages || []).map(normPage);\nconst pages = storedPages.length ? storedPages : incomingPageItems;\n\nconst extraJson = items\n  .filter(i => typeof i.json.content !== 'string')\n  .map(i => i.json);\n\nlet combinedContent = pages\n  .map(p => `URL: ${p.url}\\nDepth: ${p.depth}\\nContent:\\n${p.content}\\n`)\n  .join('\\n-----------------\\n');\n\nif (extraJson.length) {\n  combinedContent += `\\n\\nLINKEDIN_DATA::\\n\\n${JSON.stringify(extraJson)}`;\n}\n\nconst CHUNK_SIZE = 5;\nconst MAX_CHARS_PER_BATCH = 12000;\n\nfunction chunkByChars(arr, maxChars) {\n  const batches = [];\n  let current = [];\n  let chars = 0;\n  for (const it of arr) {\n    const len = (it.content || '').length;\n    if (current.length && chars + len > maxChars) {\n      batches.push(current);\n      current = [];\n      chars = 0;\n    }\n    current.push(it);\n    chars += len;\n  }\n  if (current.length) batches.push(current);\n  return batches;\n}\n\nconst charBatches = chunkByChars(pages, MAX_CHARS_PER_BATCH);\nconst groups = [];\nfor (const batch of charBatches) {\n  for (let i = 0; i < batch.length; i += CHUNK_SIZE) {\n    groups.push(batch.slice(i, i + CHUNK_SIZE));\n  }\n}\n\nreturn groups.length\n  ? groups.map((g, idx) => ({ json: { batchIndex: idx, pages: g, combinedContent,accId:s.accountId } }))\n  : [{ json: { batchIndex: 0, pages: [], combinedContent } }];\n"
      },
      "typeVersion": 2
    },
    {
      "id": "1e36bc72-2db7-4ce7-a42e-51609a0c9065",
      "name": "响应 Webhook",
      "type": "n8n-nodes-base.respondToWebhook",
      "position": [
        608,
        -176
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 1.4
    },
    {
      "id": "99f16b20-3398-45a9-a652-7b51351283b2",
      "name": "初始化全局变量",
      "type": "n8n-nodes-base.code",
      "notes": "Initializes the pending count in static data for crawl completion tracking.",
      "position": [
        -1632,
        -336
      ],
      "parameters": {
        "mode": "runOnceForEachItem",
        "jsCode": "const s = $getWorkflowStaticData('global');\ns.pending = 1;\ns.visited = [];\ns.queued = {};\ns.pages = [];\n\n// Ensure url has a scheme so URL() won't throw\nconst ensureUrl = u => (/^https?:\\/\\//i.test(u) ? u : `https://${u}`);\n\ntry {\n  $json.domain = new URL(ensureUrl($json.url)).hostname; // => \"www.crmaiinsight.com\"\n} catch (e) {\n  // Fallback if url is malformed\n  $json.domain = String($json.url || '')\n    .replace(/^[a-z]+:\\/\\//i, '')\n    .replace(/\\/.*$/, '')\n    .replace(/:\\d+$/, '');\n}\n\nreturn $json;\n"
      },
      "typeVersion": 2
    },
    {
      "id": "e56c711e-c7eb-4024-bd31-66680514d62c",
      "name": "初始化抓取参数",
      "type": "n8n-nodes-base.set",
      "notes": "Defines the root URL, domain name, and max crawl depth.",
      "position": [
        -1856,
        -336
      ],
      "parameters": {
        "values": {
          "number": [
            {
              "name": "maxDepth",
              "value": 3
            },
            {
              "name": "depth"
            }
          ],
          "string": [
            {
              "name": "url",
              "value": "={{ $json.body.url }}"
            },
            {
              "name": "domain",
              "value": "={{ $json.body.url }}"
            }
          ]
        },
        "options": {},
        "keepOnlySet": true
      },
      "typeVersion": 2
    },
    {
      "id": "29bf5f0a-97dc-4631-a485-f7ef9bcfd852",
      "name": "重新排队链接项",
      "type": "n8n-nodes-base.code",
      "notes": "Removes internal 'type' field and re-enqueues the link for next crawl.",
      "position": [
        -144,
        -480
      ],
      "parameters": {
        "mode": "runOnceForEachItem",
        "jsCode": "const s = $getWorkflowStaticData('global');\n\ndelete $json.type\nreturn item;"
      },
      "typeVersion": 2
    },
    {
      "id": "3f81f588-a041-4ae9-92b5-2f79ae855355",
      "name": "排队和去重链接",
      "type": "n8n-nodes-base.code",
      "notes": "Cleans and deduplicates links. Tracks visited URLs. Prepares next crawl queue.",
      "onError": "continueRegularOutput",
      "position": [
        -560,
        -464
      ],
      "parameters": {
        "jsCode": "// Queue & dedup: marks the current page as visited, resolves its links to absolute URLs,\n// keeps unseen same-site links while currentDepth < maxDepth, and updates the pending counter.\n// Emits one 'link' item per newly queued URL plus one 'page' item for the current page.\n// Upstream nodes are referenced by the names they actually have in this workflow.\nconst staticData = $getWorkflowStaticData('global');\nif (!Array.isArray(staticData.visited)) staticData.visited = [];\nif (typeof staticData.pending !== 'number') staticData.pending = 0;\nif (!staticData.queued || typeof staticData.queued !== 'object') staticData.queued = {};\n\nconst currentUrl = $('将 URL/深度附加到 HTML').item.json.url.replace(/\\/+$/, '');\nconst currentDepth = $('将 URL/深度附加到 HTML').item.json.depth || 0;\nconst maxDepth = $('种子根抓取项').first().json.maxDepth;\nconst domainParamRaw = ($('初始化抓取参数').first().json.domain || '').toString();\nconst content = typeof $json.content === 'string' ? $json.content : '';\n\nconst PROTO_RE = /^[a-zA-Z][a-zA-Z0-9+.-]*:\\/\\//;\n\n// Normalize a host string: strip protocol, path, and leading \"www.\"\nfunction hostOf(u) {\n  if (!u) return '';\n  let s = u.toString();\n  if (PROTO_RE.test(s)) s = s.replace(PROTO_RE, '');\n  const i = s.indexOf('/');\n  if (i !== -1) s = s.slice(0, i);\n  return s.toLowerCase().replace(/^www\\./, '');\n}\n\n// Build absolute URL from href + base without using URL()\nfunction toAbsolute(href, base) {\n  if (!href) return '';\n  const h = href.trim();\n  if (PROTO_RE.test(h)) return h.replace(/\\/+$/, '');\n  if (h.startsWith('//')) {\n    const proto = (base.match(PROTO_RE) || ['https://'])[0];\n    return (proto + h.slice(2)).replace(/\\/+$/, '');\n  }\n  if (h.startsWith('/')) {\n    const baseHost = base.replace(PROTO_RE, '').split('/')[0];\n    const proto = (base.match(PROTO_RE) || ['https://'])[0];\n    return (proto + baseHost + h).replace(/\\/+$/, '');\n  }\n  // relative path\n  let dir = base;\n  if (!dir.endsWith('/')) {\n    const cut = dir.lastIndexOf('/');\n    dir = cut > (dir.indexOf('://') + 2) ? dir.slice(0, cut + 1) : (dir + '/');\n  }\n  return (dir + h).replace(/\\/+$/, '');\n}\n\nfunction extractHostname(abs) {\n  let s = abs.replace(PROTO_RE, '');\n  const i = s.indexOf('/');\n  const host = (i === -1 ? s : s.slice(0, i)).toLowerCase();\n  return host.replace(/^www\\./, '');\n}\n\nconst allowedHost = hostOf(domainParamRaw) || hostOf(currentUrl);\nconst currentHost  = hostOf(currentUrl);\n\n// mark current as visited & dequeue\nif (!staticData.visited.includes(currentUrl)) staticData.visited.push(currentUrl);\ndelete staticData.queued[currentUrl];\n\nconst links = Array.isArray($json.links) ? $json.links : [];\nconst newLinks = [];\nconst queuedLocal = new Set();\n\nfor (const link of links) {\n  if (!link) continue;\n  const l = String(link).trim();\n  if (!l || l.startsWith('mailto:') || l.startsWith('tel:') || l.startsWith('javascript:')) continue;\n  if (l.includes('#')) continue;\n  if (/\\.(pdf|docx?|xlsx?|pptx?)($|\\?)/i.test(l)) continue;\n\n  const absolute = toAbsolute(l, currentUrl);\n  const host = extractHostname(absolute);\n\n  // treat apex and www as same-site\n  const sameSite = (host === allowedHost) || (host === currentHost);\n\n  if (\n    sameSite &&\n    !staticData.visited.includes(absolute) &&\n    !staticData.queued[absolute] &&\n    !queuedLocal.has(absolute) &&\n    currentDepth < maxDepth\n  ) {\n    newLinks.push({\n      json: { url: absolute, depth: currentDepth + 1, type: 'link', maxDepth }\n    });\n    queuedLocal.add(absolute);\n    staticData.queued[absolute] = true;\n  }\n}\n\nstaticData.pending += newLinks.length;\nstaticData.pending--; // this page done\n\nreturn newLinks.concat({\n  json: { url: currentUrl, depth: currentDepth, content, type: 'page', maxDepth }\n});\n"
      },
      "typeVersion": 2
    }
  ],
  "pinData": {},
  "connections": {
    "Webhook": {
      "main": [
        [
          {
            "node": "初始化抓取参数",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "初始化全局变量": {
      "main": [
        [
          {
            "node": "种子根抓取项",
            "type": "main",
            "index": 0
          },
          {
            "node": "合并网页",
            "type": "main",
            "index": 1
          }
        ]
      ]
    },
    "组合和分块": {
      "main": [
        [
          {
            "node": "响应 Webhook",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "获取 HTML 页面": {
      "main": [
        [
          {
            "node": "将 URL/深度附加到 HTML",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "合并网页": {
      "main": [
        [
          {
            "node": "组合和分块",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "存储页面数据": {
      "main": [
        [
          {
            "node": "收集页面并在完成时发出",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "初始化抓取参数": {
      "main": [
        [
          {
            "node": "初始化全局变量",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "重新排队链接项": {
      "main": [
        [
          {
            "node": "循环链接(批次)",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "IF 抓取深度是否合适?": {
      "main": [
        [
          {
            "node": "重新排队链接项",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "存储页面数据",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "排队和去重链接": {
      "main": [
        [
          {
            "node": "IF 抓取深度是否合适?",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "提取正文和链接": {
      "main": [
        [
          {
            "node": "排队和去重链接",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "循环链接(批次)": {
      "main": [
        [
          {
            "node": "种子根抓取项",
            "type": "main",
            "index": 1
          }
        ]
      ]
    },
    "种子根抓取项": {
      "main": [
        [
          {
            "node": "获取 HTML 页面",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "将 URL/深度附加到 HTML": {
      "main": [
        [
          {
            "node": "提取正文和链接",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "收集页面并在完成时发出": {
      "main": [
        [
          {
            "node": "合并网页",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。

这个工作流适合什么场景?

高级 - 内容创作, 多模态 AI

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。

工作流信息
难度等级
高级
节点数量:18
分类:2
节点类型:10
难度说明

适合高级用户,包含 16+ 个节点的复杂工作流

作者
Le Nguyen

Le Nguyen

@leeseifer

Salesforce Architect with 10+ years of experience in CRM, integrations, and automation. Skilled in Apex, LWC, REST APIs, and full-stack dev (JavaScript, .NET). I build secure, scalable workflows in n8n—connecting Salesforce, Stripe, and more. Passionate about lead scoring, data sync, and secure field masking. Certified Application Architect with deep expertise in platform, integration, and data architecture.

外部链接
在 n8n.io 查看

分享此工作流