8
n8n 中文网amn8n.com

抓取网站全部页面内容,并使用 Gemini 嵌入存储至 Pinecone

高级

这是一个文档提取(Document Extraction)与 AI RAG 领域的自动化工作流,包含 16 个节点,主要使用 Xml、Code、Html、Wait、Merge 等节点。它抓取网站的全部页面内容,并使用 Gemini 嵌入将其存储至 Pinecone。

前置要求
  • 可能需要目标 API 的认证凭证
  • Pinecone API Key
  • Google Gemini API Key(用于生成嵌入向量)
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "nodes": [
    {
      "id": "5ad6a510-3c4a-47e4-b8ff-c0e565e25d25",
      "name": "便签",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        368,
        944
      ],
      "parameters": {
        "width": 832,
        "height": 816,
        "content": "此 n8n 工作流从网站内容构建 Pinecone 知识库,同时处理站点地图和直接 URL 输入。"
      },
      "typeVersion": 1
    },
    {
      "id": "3ff777b7-24bd-420c-af38-62a395f52a1a",
      "name": "提取页面 URL",
      "type": "n8n-nodes-base.code",
      "position": [
        1936,
        1392
      ],
      "parameters": {
        "jsCode": "// The XML node returns a single object instead of an array when the sitemap\n// contains exactly one <url> entry, and nothing when <urlset> is empty or absent.\n// Normalise every case to an array before iterating.\nconst entries = [].concat($input.first().json.urlset?.url ?? []);\n\n// Emit one item per sitemap entry that actually carries a <loc> value.\nreturn entries\n  .filter((entry) => entry && entry.loc)\n  .map((entry) => ({ url: entry.loc }));"
      },
      "typeVersion": 2
    },
    {
      "id": "6176e651-cef5-44e8-abed-0f6f6b81517b",
      "name": "XML 转换",
      "type": "n8n-nodes-base.xml",
      "position": [
        1792,
        1392
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 1
    },
    {
      "id": "cca1e7e7-32f6-42fd-b23c-3c2586344a50",
      "name": "获取站点地图",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        1632,
        1392
      ],
      "parameters": {
        "url": "={{ $json['Sitemap URL'] }}",
        "options": {}
      },
      "typeVersion": 4.2
    },
    {
      "id": "520e131d-b5f2-4857-aebd-5724da2a8083",
      "name": "拆分页面 URL",
      "type": "n8n-nodes-base.code",
      "position": [
        1792,
        1216
      ],
      "parameters": {
        "jsCode": "// Normalise a single URL: strip surrounding whitespace first, then append a\n// slash if it is missing. Non-string input is returned unchanged.\nfunction addTrailingSlash(str) {\n  if (typeof str !== 'string') {\n    return str;\n  }\n  const trimmed = str.trim();\n  return trimmed.endsWith('/') ? trimmed : trimmed + '/';\n}\n\nconst raw = $input.first().json['Page URLs'] ?? '';\nconst urls = [];\n// The form field is a textarea, so accept line breaks as well as commas.\nfor (const part of String(raw).split(/[,\\r\\n]+/)) {\n  const candidate = part.trim();\n  // Skip blanks left by trailing separators so no bare '/' item is emitted.\n  if (candidate) {\n    urls.push({ url: addTrailingSlash(candidate) });\n  }\n}\n\nreturn urls;"
      },
      "typeVersion": 2
    },
    {
      "id": "7e7fe528-8748-470b-b627-a0c79b5aface",
      "name": "合并 URL",
      "type": "n8n-nodes-base.merge",
      "position": [
        2128,
        1232
      ],
      "parameters": {},
      "typeVersion": 3.2
    },
    {
      "id": "a0517aaf-6ccd-481d-b97e-b183d305451b",
      "name": "移除重复 URL",
      "type": "n8n-nodes-base.removeDuplicates",
      "position": [
        2272,
        1232
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 2
    },
    {
      "id": "72c85ccf-a9d6-42b1-85a7-76800ba831e5",
      "name": "循环处理页面 URL",
      "type": "n8n-nodes-base.splitInBatches",
      "position": [
        2480,
        1232
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 3
    },
    {
      "id": "73aebd19-60ae-40d1-a747-0b9537d9d67c",
      "name": "提取内容",
      "type": "n8n-nodes-base.html",
      "position": [
        2672,
        1136
      ],
      "parameters": {
        "options": {
          "cleanUpText": true
        },
        "operation": "extractHtmlContent",
        "extractionValues": {
          "values": [
            {
              "key": "content",
              "cssSelector": "body",
              "skipSelectors": "img"
            }
          ]
        }
      },
      "typeVersion": 1.2
    },
    {
      "id": "0dbf70c1-cb57-4691-916f-2a2aa9a4cec0",
      "name": "获取页面 HTML 内容",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        2672,
        1328
      ],
      "parameters": {
        "url": "={{ $json.url }}",
        "options": {}
      },
      "typeVersion": 4.2
    },
    {
      "id": "fa1c18c6-6c29-4e71-905e-0945909af99b",
      "name": "等待 5 秒",
      "type": "n8n-nodes-base.wait",
      "position": [
        2832,
        1328
      ],
      "webhookId": "9d87e60f-9df8-4a13-9c22-e3e5a5bb9c0e",
      "parameters": {},
      "typeVersion": 1.1
    },
    {
      "id": "2bf3ad7f-a2fd-44f9-b6af-5a500ef80591",
      "name": "数据加载器",
      "type": "@n8n/n8n-nodes-langchain.documentDefaultDataLoader",
      "position": [
        3264,
        1344
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 1.1
    },
    {
      "id": "a86d4c2e-559c-4942-ac0d-2ddcc7eb7f39",
      "name": "Gemini 嵌入",
      "type": "@n8n/n8n-nodes-langchain.embeddingsGoogleGemini",
      "position": [
        3072,
        1344
      ],
      "parameters": {
        "modelName": "models/gemini-embedding-001"
      },
      "typeVersion": 1
    },
    {
      "id": "f46188bd-c0a2-4d49-9b67-0937f891ae36",
      "name": "Pinecone 知识库",
      "type": "@n8n/n8n-nodes-langchain.vectorStorePinecone",
      "position": [
        3072,
        1136
      ],
      "parameters": {
        "mode": "insert",
        "options": {
          "clearNamespace": true
        }
      },
      "typeVersion": 1.3
    },
    {
      "id": "4f5dc6e3-8f75-46ab-b3e1-49deb7695469",
      "name": "输入站点地图或页面 URL",
      "type": "n8n-nodes-base.formTrigger",
      "position": [
        1296,
        1376
      ],
      "webhookId": "ab54a2cd-2eda-4cf7-b822-8fb49ecb257e",
      "parameters": {
        "options": {},
        "formTitle": "Agent Knowledge Base Input",
        "formFields": {
          "values": [
            {
              "fieldLabel": "Sitemap URL",
              "placeholder": "https://website.com/page-sitemap.xml"
            },
            {
              "fieldType": "textarea",
              "fieldLabel": "Page URLs",
              "placeholder": "https://website.com/about, https://website.com/contact"
            }
          ]
        },
        "formDescription": "This form is to input the page sitemap or pages of your website"
      },
      "typeVersion": 2.2
    },
    {
      "id": "67f6e98a-946c-4460-93d4-707511deb4f5",
      "name": "条件分支",
      "type": "n8n-nodes-base.switch",
      "position": [
        1440,
        1376
      ],
      "parameters": {
        "rules": {
          "values": [
            {
              "conditions": {
                "options": {
                  "version": 2,
                  "leftValue": "",
                  "caseSensitive": true,
                  "typeValidation": "strict"
                },
                "combinator": "and",
                "conditions": [
                  {
                    "id": "2af7e15b-2e56-40e5-addc-74bd0b4de214",
                    "operator": {
                      "type": "string",
                      "operation": "notEmpty",
                      "singleValue": true
                    },
                    "leftValue": "={{ $json['Page URLs'] }}",
                    "rightValue": ""
                  }
                ]
              }
            },
            {
              "conditions": {
                "options": {
                  "version": 2,
                  "leftValue": "",
                  "caseSensitive": true,
                  "typeValidation": "strict"
                },
                "combinator": "and",
                "conditions": [
                  {
                    "id": "02899ab6-0c0b-4c0f-89ad-ec5787da36eb",
                    "operator": {
                      "type": "string",
                      "operation": "endsWith"
                    },
                    "leftValue": "={{ $json['Sitemap URL'] }}",
                    "rightValue": "xml"
                  }
                ]
              }
            }
          ]
        },
        "options": {
          "allMatchingOutputs": true
        }
      },
      "typeVersion": 3.2
    }
  ],
  "connections": {
    "条件分支": {
      "main": [
        [
          {
            "node": "拆分页面 URL",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "获取站点地图",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "合并 URL": {
      "main": [
        [
          {
            "node": "移除重复 URL",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "等待 5 秒": {
      "main": [
        [
          {
            "node": "循环处理页面 URL",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "数据加载器": {
      "ai_document": [
        [
          {
            "node": "Pinecone 知识库",
            "type": "ai_document",
            "index": 0
          }
        ]
      ]
    },
    "获取站点地图": {
      "main": [
        [
          {
            "node": "XML 转换",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "XML 转换": {
      "main": [
        [
          {
            "node": "提取页面 URL",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "提取内容": {
      "main": [
        [
          {
            "node": "Pinecone 知识库",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "拆分页面 URL": {
      "main": [
        [
          {
            "node": "合并 URL",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "提取页面 URL": {
      "main": [
        [
          {
            "node": "合并 URL",
            "type": "main",
            "index": 1
          }
        ]
      ]
    },
    "Gemini 嵌入": {
      "ai_embedding": [
        [
          {
            "node": "Pinecone 知识库",
            "type": "ai_embedding",
            "index": 0
          }
        ]
      ]
    },
    "循环处理页面 URL": {
      "main": [
        [
          {
            "node": "提取内容",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "获取页面 HTML 内容",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "移除重复 URL": {
      "main": [
        [
          {
            "node": "循环处理页面 URL",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "输入站点地图或页面 URL": {
      "main": [
        [
          {
            "node": "条件分支",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "获取页面 HTML 内容": {
      "main": [
        [
          {
            "node": "等待 5 秒",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。

这个工作流适合什么场景?

高级 - 文档提取, AI RAG 检索增强

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 Google Gemini API、Pinecone)可能需要您自行付费。

工作流信息
难度等级
高级
节点数量:16
分类:2
节点类型:14
难度说明

适合高级用户,包含 16+ 个节点的复杂工作流

作者
Zain Khan

Zain Khan

@zain

I partner with businesses to streamline processes and accelerate growth through intelligent AI automation and Web/mobile Development. Leveraging deep expertise in GPT-4, LangChain, and n8n, I develop AI-powered agents and sophisticated LLM pipelines.

外部链接
在 n8n.io 查看

分享此工作流