特定域名网页内容爬虫,带深度控制和文本提取
高级
这是一个 Content Creation、Multimodal AI 领域的自动化工作流,包含 18 个节点,主要使用 If, Set, Code, Html, Merge 等节点。
前置要求
- HTTP Webhook 端点(n8n 会自动生成)
- 可能需要目标 API 的认证凭证
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
"meta": {
"instanceId": "9a562c06a632241f66aadd52a495ad98e76b760ef5cfce9c319a4759c47cd94e"
},
"nodes": [
{
"id": "ed429607-b22c-494c-b767-7dc2eca5a561",
"name": "便签",
"type": "n8n-nodes-base.stickyNote",
"position": [
-2160,
-112
],
"parameters": {
"width": 720,
"height": 592,
"content": "# n8n 工作流说明:网络爬虫"
},
"typeVersion": 1
},
{
"id": "26230b6f-528a-41fa-b9f0-9597659e2f23",
"name": "便签1",
"type": "n8n-nodes-base.stickyNote",
"position": [
-1376,
-112
],
"parameters": {
"width": 800,
"height": 1136,
"content": "## 逐步详细分解"
},
"typeVersion": 1
},
{
"id": "c3ea4128-8963-4000-af38-e7f2be48bb7e",
"name": "Webhook",
"type": "n8n-nodes-base.webhook",
"position": [
-2128,
-336
],
"webhookId": "603a09ed-516c-4c7d-bad3-b05b030503a2",
"parameters": {
"path": "603a09ed-516c-4c7d-bad3-b05b030503a2",
"options": {
"rawBody": false
},
"httpMethod": "POST",
"responseMode": "responseNode"
},
"typeVersion": 2.1
},
{
"id": "a35808cb-d2ea-4797-86a6-a36670377560",
"name": "Loop Links (Batches)",
"type": "n8n-nodes-base.splitInBatches",
"notes": "Iterates through the queue of links to be crawled one at a time.",
"position": [
48,
-480
],
"parameters": {
"options": {
"reset": false
},
"batchSize": 1
},
"executeOnce": false,
"typeVersion": 1
},
{
"id": "798444a5-0df4-4727-818f-657901ad60a1",
"name": "IF Crawl Depth OK?",
"type": "n8n-nodes-base.if",
"notes": "Validates whether the current depth is below the maximum depth allowed.",
"onError": "continueRegularOutput",
"position": [
-352,
-464
],
"parameters": {
"conditions": {
"number": [
{
"value1": "={{ $json.depth }}",
"value2": "={{ $json.maxDepth }}",
"operation": "smallerEqual"
}
],
"string": [
{
"value1": "={{ $json.type }}",
"value2": "link"
}
]
}
},
"typeVersion": 1
},
{
"id": "ecc2707f-0605-4c88-98eb-8c8ea234e9ff",
"name": "Extract Body & Links",
"type": "n8n-nodes-base.html",
"notes": "Parses HTML content and extracts body text and anchor href links.",
"position": [
-784,
-464
],
"parameters": {
"options": {
"trimValues": true,
"cleanUpText": true
},
"operation": "extractHtmlContent",
"extractionValues": {
"values": [
{
"key": "links",
"attribute": "href",
"cssSelector": "a[href]",
"returnArray": true,
"returnValue": "attribute"
},
{
"key": "content",
"cssSelector": "body"
}
]
}
},
"typeVersion": 1
},
{
"id": "d4dfda4a-e20a-4014-b024-c0fde8f41aed",
"name": "Attach URL/Depth to HTML",
"type": "n8n-nodes-base.code",
"position": [
-976,
-464
],
"parameters": {
"mode": "runOnceForEachItem",
"jsCode": " return {\n json: {\n url:$('Seed Root Crawl Item').item.json.url,\n depth: $('Seed Root Crawl Item').item.json.depth,\n ...item.json // Preserve original HTML response (optional)\n }\n };\n"
},
"typeVersion": 2
},
{
"id": "239040b9-3c08-47d9-a188-18776817df23",
"name": "Fetch HTML Page",
"type": "n8n-nodes-base.httpRequest",
"notes": "Makes HTTP request to fetch the content of the current URL.",
"onError": "continueRegularOutput",
"position": [
-1200,
-464
],
"parameters": {
"url": "={{ $json.url }}",
"options": {
"timeout": 5000,
"response": {
"response": {}
}
}
},
"typeVersion": 4.2
},
{
"id": "3d960fb8-2224-4f50-becf-b2f03bd7de6e",
"name": "Seed Root Crawl Item",
"type": "n8n-nodes-base.merge",
"position": [
-1408,
-464
],
"parameters": {
"mode": "combine",
"options": {
"clashHandling": {
"values": {
"resolveClash": "preferLast",
"overrideEmpty": true
}
},
"includeUnpaired": true
},
"combineBy": "combineByPosition"
},
"typeVersion": 3.2
},
{
"id": "3e02f965-84f5-40da-90d4-ae91bbf0434e",
"name": "Collect Pages & Emit When Done",
"type": "n8n-nodes-base.code",
"position": [
32,
-288
],
"parameters": {
"jsCode": "const s = $getWorkflowStaticData('global');\nif (!s.pages) s.pages = [];\ns.pages.push({\n url: $json.url,\n depth: $json.depth,\n content: $json.content\n});\nconsole.log(s.pending)\nif (s.pending <= 0) {\n const pages = s.pages || [];\n let combinedContent = pages.map(page => `URL: ${page.url}\\nDepth: ${page.depth}\\nContent: ${page.content}\\n`).join('\\n-----------------\\n');\n return { json: { content: combinedContent } };\n} else {\n return [];\n}"
},
"typeVersion": 2
},
{
"id": "63f581a0-4794-4908-be22-dda1136e7593",
"name": "Store Page Data",
"type": "n8n-nodes-base.set",
"notes": "Captures the URL, page content, and depth for storage or export.",
"position": [
-128,
-304
],
"parameters": {
"values": {
"number": [
{
"name": "depth",
"value": "={{ $json.depth || 0 }}"
}
],
"string": [
{
"name": "url",
"value": "={{ $json.url || '' }}"
},
{
"name": "content",
"value": "={{ $json.content || '' }}"
}
]
},
"options": {},
"keepOnlySet": true
},
"typeVersion": 2
},
{
"id": "c3cf4541-c31f-4257-8729-44f8ed211bcd",
"name": "Merge Web Pages",
"type": "n8n-nodes-base.merge",
"position": [
208,
-176
],
"parameters": {},
"typeVersion": 3.2
},
{
"id": "a7d480bc-ef4b-4cad-989f-0eda36a26a00",
"name": "Combine & Chunk",
"type": "n8n-nodes-base.code",
"position": [
400,
-176
],
"parameters": {
"jsCode": "/* Combine static pages + extra JSON, then chunk pages for model calls */\nconst s = $getWorkflowStaticData('global');\nif (!s.pages) s.pages = [];\n\nfunction normPage(p = {}) {\n return {\n url: p.url || '',\n depth: p.depth ?? null,\n content: typeof p.content === 'string' ? p.content : ''\n };\n}\n\nconst incomingPageItems = items\n .filter(i => typeof i.json.content === 'string')\n .map(i => normPage(i.json));\n\nconst storedPages = (s.pages || []).map(normPage);\nconst pages = storedPages.length ? storedPages : incomingPageItems;\n\nconst extraJson = items\n .filter(i => typeof i.json.content !== 'string')\n .map(i => i.json);\n\nlet combinedContent = pages\n .map(p => `URL: ${p.url}\\nDepth: ${p.depth}\\nContent:\\n${p.content}\\n`)\n .join('\\n-----------------\\n');\n\nif (extraJson.length) {\n combinedContent += `\\n\\nLINKEDIN_DATA::\\n\\n${JSON.stringify(extraJson)}`;\n}\n\nconst CHUNK_SIZE = 5;\nconst MAX_CHARS_PER_BATCH = 12000;\n\nfunction chunkByChars(arr, maxChars) {\n const batches = [];\n let current = [];\n let chars = 0;\n for (const it of arr) {\n const len = (it.content || '').length;\n if (current.length && chars + len > maxChars) {\n batches.push(current);\n current = [];\n chars = 0;\n }\n current.push(it);\n chars += len;\n }\n if (current.length) batches.push(current);\n return batches;\n}\n\nconst charBatches = chunkByChars(pages, MAX_CHARS_PER_BATCH);\nconst groups = [];\nfor (const batch of charBatches) {\n for (let i = 0; i < batch.length; i += CHUNK_SIZE) {\n groups.push(batch.slice(i, i + CHUNK_SIZE));\n }\n}\n\nreturn groups.length\n ? groups.map((g, idx) => ({ json: { batchIndex: idx, pages: g, combinedContent,accId:s.accountId } }))\n : [{ json: { batchIndex: 0, pages: [], combinedContent } }];\n"
},
"typeVersion": 2
},
{
"id": "1e36bc72-2db7-4ce7-a42e-51609a0c9065",
"name": "Respond to Webhook",
"type": "n8n-nodes-base.respondToWebhook",
"position": [
608,
-176
],
"parameters": {
"options": {}
},
"typeVersion": 1.4
},
{
"id": "99f16b20-3398-45a9-a652-7b51351283b2",
"name": "Init Globals",
"type": "n8n-nodes-base.code",
"notes": "Initializes the pending count in static data for crawl completion tracking.",
"position": [
-1632,
-336
],
"parameters": {
"mode": "runOnceForEachItem",
"jsCode": "const s = $getWorkflowStaticData('global');\ns.pending = 1;\ns.visited = [];\ns.queued = {};\ns.pages = [];\n\n// Ensure url has a scheme so URL() won't throw\nconst ensureUrl = u => (/^https?:\\/\\//i.test(u) ? u : `https://${u}`);\n\ntry {\n $json.domain = new URL(ensureUrl($json.url)).hostname; // => \"www.crmaiinsight.com\"\n} catch (e) {\n // Fallback if url is malformed\n $json.domain = String($json.url || '')\n .replace(/^[a-z]+:\\/\\//i, '')\n .replace(/\\/.*$/, '')\n .replace(/:\\d+$/, '');\n}\n\nreturn $json;\n"
},
"typeVersion": 2
},
{
"id": "e56c711e-c7eb-4024-bd31-66680514d62c",
"name": "Init Crawl Params",
"type": "n8n-nodes-base.set",
"notes": "Defines the root URL, domain name, and max crawl depth.",
"position": [
-1856,
-336
],
"parameters": {
"values": {
"number": [
{
"name": "maxDepth",
"value": 3
},
{
"name": "depth"
}
],
"string": [
{
"name": "url",
"value": "={{ $json.body.url }}"
},
{
"name": "domain",
"value": "={{ $json.body.url }}"
}
]
},
"options": {},
"keepOnlySet": true
},
"typeVersion": 2
},
{
"id": "29bf5f0a-97dc-4631-a485-f7ef9bcfd852",
"name": "重新排队链接项",
"type": "n8n-nodes-base.code",
"notes": "Removes internal 'type' field and re-enqueues the link for next crawl.",
"position": [
-144,
-480
],
"parameters": {
"mode": "runOnceForEachItem",
"jsCode": "const s = $getWorkflowStaticData('global');\n\ndelete $json.type\nreturn item;"
},
"typeVersion": 2
},
{
"id": "3f81f588-a041-4ae9-92b5-2f79ae855355",
"name": "Queue & Dedup Links",
"type": "n8n-nodes-base.code",
"notes": "Cleans and deduplicates links. Tracks visited URLs. Prepares next crawl queue.",
"onError": "continueRegularOutput",
"position": [
-560,
-464
],
"parameters": {
"jsCode": "const staticData = $getWorkflowStaticData('global');\nif (!Array.isArray(staticData.visited)) staticData.visited = [];\nif (typeof staticData.pending !== 'number') staticData.pending = 0;\nif (!staticData.queued || typeof staticData.queued !== 'object') staticData.queued = {};\n\nconst currentUrl = $('Attach URL/Depth to HTML').item.json.url.replace(/\\/+$/, '');\nconst currentDepth = $('Attach URL/Depth to HTML').item.json.depth || 0;\nconst maxDepth = $('Seed Root Crawl Item').first().json.maxDepth;\nconst domainParamRaw = ($('Init Crawl Params').first().json.domain || '').toString();\nconst content = typeof $json.content === 'string' ? $json.content : '';\n\nconst PROTO_RE = /^[a-zA-Z][a-zA-Z0-9+.-]*:\\/\\//;\n\n// Normalize a host string: strip protocol, path, and leading \"www.\"\nfunction hostOf(u) {\n if (!u) return '';\n let s = u.toString();\n if (PROTO_RE.test(s)) s = s.replace(PROTO_RE, '');\n const i = s.indexOf('/');\n if (i !== -1) s = s.slice(0, i);\n return s.toLowerCase().replace(/^www\\./, '');\n}\n\n// Build absolute URL from href + base without using URL()\nfunction toAbsolute(href, base) {\n if (!href) return '';\n const h = href.trim();\n if (PROTO_RE.test(h)) return h.replace(/\\/+$/, '');\n if (h.startsWith('//')) {\n const proto = (base.match(PROTO_RE) || ['https://'])[0];\n return (proto + h.slice(2)).replace(/\\/+$/, '');\n }\n if (h.startsWith('/')) {\n const baseHost = base.replace(PROTO_RE, '').split('/')[0];\n const proto = (base.match(PROTO_RE) || ['https://'])[0];\n return (proto + baseHost + h).replace(/\\/+$/, '');\n }\n // relative path\n let dir = base;\n if (!dir.endsWith('/')) {\n const cut = dir.lastIndexOf('/');\n dir = cut > (dir.indexOf('://') + 2) ? dir.slice(0, cut + 1) : (dir + '/');\n }\n return (dir + h).replace(/\\/+$/, '');\n}\n\nfunction extractHostname(abs) {\n let s = abs.replace(PROTO_RE, '');\n const i = s.indexOf('/');\n const host = (i === -1 ? s : s.slice(0, i)).toLowerCase();\n return host.replace(/^www\\./, '');\n}\n\nconst allowedHost = hostOf(domainParamRaw) || hostOf(currentUrl);\nconst currentHost = hostOf(currentUrl);\n\n// mark current as visited & dequeue\nif (!staticData.visited.includes(currentUrl)) staticData.visited.push(currentUrl);\ndelete staticData.queued[currentUrl];\n\nconst links = Array.isArray($json.links) ? $json.links : [];\nconst newLinks = [];\nconst queuedLocal = new Set();\n\nfor (const link of links) {\n if (!link) continue;\n const l = String(link).trim();\n if (!l || l.startsWith('mailto:') || l.startsWith('tel:') || l.startsWith('javascript:')) continue;\n if (l.includes('#')) continue;\n if (/\\.(pdf|docx?|xlsx?|pptx?)($|\\?)/i.test(l)) continue;\n\n const absolute = toAbsolute(l, currentUrl);\n const host = extractHostname(absolute);\n\n // treat apex and www as same-site\n const sameSite = (host === allowedHost) || (host === currentHost);\n\n if (\n sameSite &&\n !staticData.visited.includes(absolute) &&\n !staticData.queued[absolute] &&\n !queuedLocal.has(absolute) &&\n currentDepth < maxDepth\n ) {\n newLinks.push({\n json: { url: absolute, depth: currentDepth + 1, type: 'link', maxDepth }\n });\n queuedLocal.add(absolute);\n staticData.queued[absolute] = true;\n }\n}\n\nstaticData.pending += newLinks.length;\nstaticData.pending--; // this page done\n\nreturn newLinks.concat({\n json: { url: currentUrl, depth: currentDepth, content, type: 'page', maxDepth }\n});\n"
},
"typeVersion": 2
}
],
"pinData": {},
"connections": {
"Webhook": {
"main": [
[
{
"node": "Init Crawl Params",
"type": "main",
"index": 0
}
]
]
},
"Init Globals": {
"main": [
[
{
"node": "Seed Root Crawl Item",
"type": "main",
"index": 0
},
{
"node": "Merge Web Pages",
"type": "main",
"index": 1
}
]
]
},
"Combine & Chunk": {
"main": [
[
{
"node": "Respond to Webhook",
"type": "main",
"index": 0
}
]
]
},
"Fetch HTML Page": {
"main": [
[
{
"node": "Attach URL/Depth to HTML",
"type": "main",
"index": 0
}
]
]
},
"Merge Web Pages": {
"main": [
[
{
"node": "Combine & Chunk",
"type": "main",
"index": 0
}
]
]
},
"Store Page Data": {
"main": [
[
{
"node": "Collect Pages & Emit When Done",
"type": "main",
"index": 0
}
]
]
},
"Init Crawl Params": {
"main": [
[
{
"node": "Init Globals",
"type": "main",
"index": 0
}
]
]
},
"Requeue Link Item": {
"main": [
[
{
"node": "Loop Links (Batches)",
"type": "main",
"index": 0
}
]
]
},
"IF Crawl Depth OK?": {
"main": [
[
{
"node": "Requeue Link Item",
"type": "main",
"index": 0
}
],
[
{
"node": "Store Page Data",
"type": "main",
"index": 0
}
]
]
},
"Queue & Dedup Links": {
"main": [
[
{
"node": "IF Crawl Depth OK?",
"type": "main",
"index": 0
}
]
]
},
"Extract Body & Links": {
"main": [
[
{
"node": "Queue & Dedup Links",
"type": "main",
"index": 0
}
]
]
},
"Loop Links (Batches)": {
"main": [
[
{
"node": "Seed Root Crawl Item",
"type": "main",
"index": 1
}
]
]
},
"Seed Root Crawl Item": {
"main": [
[
{
"node": "Fetch HTML Page",
"type": "main",
"index": 0
}
]
]
},
"Attach URL/Depth to HTML": {
"main": [
[
{
"node": "Extract Body & Links",
"type": "main",
"index": 0
}
]
]
},
"Collect Pages & Emit When Done": {
"main": [
[
{
"node": "Merge Web Pages",
"type": "main",
"index": 0
}
]
]
}
}
}
常见问题
如何使用这个工作流?
复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。
这个工作流适合什么场景?
高级 - 内容创作, 多模态 AI
需要付费吗?
本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。
相关工作流推荐
GitHub 同步仪表板 - V2
具有提交历史和回滚功能的 GitHub 工作流版本控制仪表板
If
N8n
Set
+20
94 节点Eduard
开发运维
使用 OpenAI、LangChain 和 API 集成的工作流自动化初学者指南
使用 OpenAI、LangChain 和 API 集成的工作流自动化初学者指南
If
Set
Code
+13
33 节点Meelioo
内容创作
通过关键词搜索和Markdown格式化提取和过滤Reddit帖子和评论
通过关键词搜索和Markdown格式化提取和过滤Reddit帖子和评论
If
Set
Code
+10
28 节点Muhammad Asadullah
内容创作
使用GPT-4o-mini的技术SEO审计与多格式报告(Sheets-Email)
使用GPT-4o-mini的技术SEO审计与多格式报告(Sheets/Email)
Set
Xml
Code
+14
45 节点Oriol Seguí
内容创作
来自多个招聘网站的求职自动化
使用 5 个招聘平台和 AI 简历生成器自动化求职与申请
If
Set
Code
+14
34 节点Gerald Denor
个人效率
我的工作流12
通过网络爬取、LinkedIn数据和GPT-4o自动丰富Salesforce账户信息
If
Set
Code
+8
30 节点Le Nguyen
客户关系管理
工作流信息
难度等级
高级
节点数量18
分类2
节点类型10
作者
Le Nguyen
@leeseiferSalesforce Architect with 10+ years of experience in CRM, integrations, and automation. Skilled in Apex, LWC, REST APIs, and full-stack dev (JavaScript, .NET). I build secure, scalable workflows in n8n—connecting Salesforce, Stripe, and more. Passionate about lead scoring, data sync, and secure field masking. Certified Application Architect with deep expertise in platform, integration, and data architecture.
外部链接
在 n8n.io 查看 →
分享此工作流