抓取网站所有页面内容,并以 Gemini 嵌入存储至 Pinecone
高级
这是一个文档提取(Document Extraction)与 AI RAG 领域的自动化工作流,包含 16 个节点,主要使用 Xml、Code、Html、Wait、Merge 等节点。抓取网站所有页面内容,并以 Gemini 嵌入存储至 Pinecone。
前置要求
- 可能需要目标 API 的认证凭证
- Pinecone API Key
使用的节点 (16)
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
"nodes": [
{
"id": "5ad6a510-3c4a-47e4-b8ff-c0e565e25d25",
"name": "便签",
"type": "n8n-nodes-base.stickyNote",
"position": [
368,
944
],
"parameters": {
"width": 832,
"height": 816,
"content": "此 n8n 工作流从网站内容构建 Pinecone 知识库,同时处理站点地图和直接 URL 输入。"
},
"typeVersion": 1
},
{
"id": "3ff777b7-24bd-420c-af38-62a395f52a1a",
"name": "提取页面 URL",
"type": "n8n-nodes-base.code",
"position": [
1936,
1392
],
"parameters": {
"jsCode": "const items = []\nfor (const item of $input.first().json.urlset.url) {\n items.push({ url: item.loc })\n}\n\nreturn items;"
},
"typeVersion": 2
},
{
"id": "6176e651-cef5-44e8-abed-0f6f6b81517b",
"name": "XML 转换",
"type": "n8n-nodes-base.xml",
"position": [
1792,
1392
],
"parameters": {
"options": {}
},
"typeVersion": 1
},
{
"id": "cca1e7e7-32f6-42fd-b23c-3c2586344a50",
"name": "获取站点地图",
"type": "n8n-nodes-base.httpRequest",
"position": [
1632,
1392
],
"parameters": {
"url": "={{ $json['Sitemap URL'] }}",
"options": {}
},
"typeVersion": 4.2
},
{
"id": "520e131d-b5f2-4857-aebd-5724da2a8083",
"name": "拆分页面 URL",
"type": "n8n-nodes-base.code",
"position": [
1792,
1216
],
"parameters": {
"jsCode": "function addTrailingSlash(str) {\n if (typeof str !== 'string') {\n return str; // Or throw an error, handle non-string inputs\n }\n if (!str.endsWith('/')) {\n return str + '/';\n }\n return str;\n}\n\nconst urls = []\nfor (const item of $input.first().json['Page URLs'].split(',')) {\n urls.push({ url: addTrailingSlash(item).trim()})\n}\n\nreturn urls;"
},
"typeVersion": 2
},
{
"id": "7e7fe528-8748-470b-b627-a0c79b5aface",
"name": "合并 URL",
"type": "n8n-nodes-base.merge",
"position": [
2128,
1232
],
"parameters": {},
"typeVersion": 3.2
},
{
"id": "a0517aaf-6ccd-481d-b97e-b183d305451b",
"name": "移除重复 URL",
"type": "n8n-nodes-base.removeDuplicates",
"position": [
2272,
1232
],
"parameters": {
"options": {}
},
"typeVersion": 2
},
{
"id": "72c85ccf-a9d6-42b1-85a7-76800ba831e5",
"name": "循环处理页面 URL",
"type": "n8n-nodes-base.splitInBatches",
"position": [
2480,
1232
],
"parameters": {
"options": {}
},
"typeVersion": 3
},
{
"id": "73aebd19-60ae-40d1-a747-0b9537d9d67c",
"name": "提取内容",
"type": "n8n-nodes-base.html",
"position": [
2672,
1136
],
"parameters": {
"options": {
"cleanUpText": true
},
"operation": "extractHtmlContent",
"extractionValues": {
"values": [
{
"key": "content",
"cssSelector": "body",
"skipSelectors": "img"
}
]
}
},
"typeVersion": 1.2
},
{
"id": "0dbf70c1-cb57-4691-916f-2a2aa9a4cec0",
"name": "获取页面 HTML 内容",
"type": "n8n-nodes-base.httpRequest",
"position": [
2672,
1328
],
"parameters": {
"url": "={{ $json.url }}",
"options": {}
},
"typeVersion": 4.2
},
{
"id": "fa1c18c6-6c29-4e71-905e-0945909af99b",
"name": "等待 5 秒",
"type": "n8n-nodes-base.wait",
"position": [
2832,
1328
],
"webhookId": "9d87e60f-9df8-4a13-9c22-e3e5a5bb9c0e",
"parameters": {},
"typeVersion": 1.1
},
{
"id": "2bf3ad7f-a2fd-44f9-b6af-5a500ef80591",
"name": "数据加载器",
"type": "@n8n/n8n-nodes-langchain.documentDefaultDataLoader",
"position": [
3264,
1344
],
"parameters": {
"options": {}
},
"typeVersion": 1.1
},
{
"id": "a86d4c2e-559c-4942-ac0d-2ddcc7eb7f39",
"name": "Gemini 嵌入",
"type": "@n8n/n8n-nodes-langchain.embeddingsGoogleGemini",
"position": [
3072,
1344
],
"parameters": {
"modelName": "models/gemini-embedding-001"
},
"typeVersion": 1
},
{
"id": "f46188bd-c0a2-4d49-9b67-0937f891ae36",
"name": "Pinecone 知识库",
"type": "@n8n/n8n-nodes-langchain.vectorStorePinecone",
"position": [
3072,
1136
],
"parameters": {
"mode": "insert",
"options": {
"clearNamespace": true
}
},
"typeVersion": 1.3
},
{
"id": "4f5dc6e3-8f75-46ab-b3e1-49deb7695469",
"name": "输入站点地图或页面 URL",
"type": "n8n-nodes-base.formTrigger",
"position": [
1296,
1376
],
"webhookId": "ab54a2cd-2eda-4cf7-b822-8fb49ecb257e",
"parameters": {
"options": {},
"formTitle": "Agent Knowledge Base Input",
"formFields": {
"values": [
{
"fieldLabel": "Sitemap URL",
"placeholder": "https://website.com/page-sitemap.xml"
},
{
"fieldType": "textarea",
"fieldLabel": "Page URLs",
"placeholder": "https://website.com/about, https://website.com/contact"
}
]
},
"formDescription": "This form is to input the page sitemap or pages of your website"
},
"typeVersion": 2.2
},
{
"id": "67f6e98a-946c-4460-93d4-707511deb4f5",
"name": "条件分支",
"type": "n8n-nodes-base.switch",
"position": [
1440,
1376
],
"parameters": {
"rules": {
"values": [
{
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "2af7e15b-2e56-40e5-addc-74bd0b4de214",
"operator": {
"type": "string",
"operation": "notEmpty",
"singleValue": true
},
"leftValue": "={{ $json['Page URLs'] }}",
"rightValue": ""
}
]
}
},
{
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "02899ab6-0c0b-4c0f-89ad-ec5787da36eb",
"operator": {
"type": "string",
"operation": "endsWith"
},
"leftValue": "={{ $json['Sitemap URL'] }}",
"rightValue": "xml"
}
]
}
}
]
},
"options": {
"allMatchingOutputs": true
}
},
"typeVersion": 3.2
}
],
"connections": {
"条件分支": {
"main": [
[
{
"node": "拆分页面 URL",
"type": "main",
"index": 0
}
],
[
{
"node": "获取站点地图",
"type": "main",
"index": 0
}
]
]
},
"合并 URL": {
"main": [
[
{
"node": "移除重复 URL",
"type": "main",
"index": 0
}
]
]
},
"等待 5 秒": {
"main": [
[
{
"node": "循环处理页面 URL",
"type": "main",
"index": 0
}
]
]
},
"数据加载器": {
"ai_document": [
[
{
"node": "Pinecone 知识库",
"type": "ai_document",
"index": 0
}
]
]
},
"获取站点地图": {
"main": [
[
{
"node": "XML 转换",
"type": "main",
"index": 0
}
]
]
},
"XML 转换": {
"main": [
[
{
"node": "提取页面 URL",
"type": "main",
"index": 0
}
]
]
},
"提取内容": {
"main": [
[
{
"node": "Pinecone 知识库",
"type": "main",
"index": 0
}
]
]
},
"拆分页面 URL": {
"main": [
[
{
"node": "合并 URL",
"type": "main",
"index": 0
}
]
]
},
"提取页面 URL": {
"main": [
[
{
"node": "合并 URL",
"type": "main",
"index": 1
}
]
]
},
"Gemini 嵌入": {
"ai_embedding": [
[
{
"node": "Pinecone 知识库",
"type": "ai_embedding",
"index": 0
}
]
]
},
"循环处理页面 URL": {
"main": [
[
{
"node": "提取内容",
"type": "main",
"index": 0
}
],
[
{
"node": "获取页面 HTML 内容",
"type": "main",
"index": 0
}
]
]
},
"移除重复 URL": {
"main": [
[
{
"node": "循环处理页面 URL",
"type": "main",
"index": 0
}
]
]
},
"输入站点地图或页面 URL": {
"main": [
[
{
"node": "条件分支",
"type": "main",
"index": 0
}
]
]
},
"获取页面 HTML 内容": {
"main": [
[
{
"node": "等待 5 秒",
"type": "main",
"index": 0
}
]
]
}
}
}
常见问题
如何使用这个工作流?
复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。
这个工作流适合什么场景?
高级 - 文档提取, AI RAG 检索增强
需要付费吗?
本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 Google Gemini API、Pinecone)可能需要您自行付费。
相关工作流推荐
基于AI的WhatsApp支持机器人(Google Sheets工单创建)
基于AI的WhatsApp支持机器人(Google Sheets工单创建)
Set
Xml
Code
+20
35 节点Zain Khan
客服机器人
在可视化参考库中探索n8n节点
在可视化参考库中探索n8n节点
If
Ftp
Set
+93
113 节点I versus AI
其他
PDF 转订单
使用AI将PDF采购订单自动化转换为Adobe Commerce销售订单
If
Set
Code
+19
96 节点JKingma
文档提取
API架构提取器
API架构提取器
If
Set
Code
+22
88 节点Polina Medvedieva
工程
使用GPT-4.1、Outlook和Mem.ai自动化Microsoft Teams会议分析
使用GPT-4.1、Outlook和Mem.ai自动化Microsoft Teams会议分析
If
Set
Code
+19
61 节点Wayne Simpson
人力资源
WordPress博客自动化专业版(深度研究)v2.1市场
使用GPT-4o、Perplexity AI和多语言支持自动化SEO优化的博客创建
If
Set
Xml
+27
125 节点Daniel Ng
内容创作
工作流信息
难度等级
高级
节点数量:16
分类:2
节点类型:14
作者
Zain Khan
@zain
I partner with businesses to streamline processes and accelerate growth through intelligent AI automation and Web/mobile Development. Leveraging deep expertise in GPT-4, LangChain, and n8n, I develop AI-powered agents and sophisticated LLM pipelines.
外部链接
在 n8n.io 查看 →
分享此工作流