8
n8n 中文网amn8n.com

解析、规范化、提取 PDF 内容并存储到 Pinecone 以用于 RAG

高级

这是一个 AI RAG、多模态 AI 领域的自动化工作流,包含 18 个节点,主要使用 If、Code、Wait、GoogleDrive、HttpRequest 等节点。该工作流使用 LlamaIndex、OpenAI 嵌入和 Pinecone 向量数据库构建 PDF 问答系统。

前置要求
  • Google Drive API 凭证
  • LlamaIndex Cloud(Llama Cloud)API Key(以 HTTP Bearer 凭证方式配置)
  • OpenAI API Key
  • Pinecone API Key
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "id": "xDiuqZUZnShKpPzX",
  "meta": {
    "instanceId": "70273a2379644db63ce659827cfd8abac2d0b189210eafa02dd5376e3a62cd1d",
    "templateCredsSetupCompleted": true
  },
  "name": "解析、规范化、提取 PDF 内容并存储到 Pinecone 以用于 RAG",
  "tags": [],
  "nodes": [
    {
      "id": "19b009db-a418-458c-a216-bdcc9af6fd2f",
      "name": "Google Drive触发器",
      "type": "n8n-nodes-base.googleDriveTrigger",
      "position": [
        -1504,
        2080
      ],
      "parameters": {
        "event": "fileCreated",
        "options": {},
        "pollTimes": {
          "item": [
            {
              "mode": "everyMinute"
            }
          ]
        },
        "triggerOn": "specificFolder",
        "folderToWatch": {
          "__rl": true,
          "mode": "list",
          "value": ""
        }
      },
      "credentials": {
        "googleDriveOAuth2Api": {
          "id": "aU33fzddE6s3ZQw6",
          "name": "LearnBy-Google-Drive"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "ff933f76-d719-40b5-b193-8a29e5fa2197",
      "name": "下载文件",
      "type": "n8n-nodes-base.googleDrive",
      "position": [
        -1248,
        2096
      ],
      "parameters": {
        "fileId": {
          "__rl": true,
          "mode": "id",
          "value": "={{ $json.id }}"
        },
        "options": {},
        "operation": "download"
      },
      "credentials": {
        "googleDriveOAuth2Api": {
          "id": "aU33fzddE6s3ZQw6",
          "name": "LearnBy-Google-Drive"
        }
      },
      "typeVersion": 3
    },
    {
      "id": "127b41ed-ad45-4234-b87f-4f3c2b6ea531",
      "name": "默认数据加载器",
      "type": "@n8n/n8n-nodes-langchain.documentDefaultDataLoader",
      "position": [
        528,
        2192
      ],
      "parameters": {
        "options": {},
        "textSplittingMode": "custom"
      },
      "typeVersion": 1.1
    },
    {
      "id": "0316c7d4-449f-4275-a9b1-8848545beba8",
      "name": "便签",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        336,
        1712
      ],
      "parameters": {
        "width": 736,
        "height": 832,
        "content": "## 保存到向量数据库"
      },
      "typeVersion": 1
    },
    {
      "id": "b06702b5-c322-4a5a-949a-855c8b97dadc",
      "name": "便签1",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1088,
        1720
      ],
      "parameters": {
        "color": 4,
        "width": 1392,
        "height": 656,
        "content": "## 准备数据 - 解析和规范化"
      },
      "typeVersion": 1
    },
    {
      "id": "05034e35-f6bf-45a6-860e-94f4da566daf",
      "name": "等待",
      "type": "n8n-nodes-base.wait",
      "position": [
        -720,
        2088
      ],
      "webhookId": "a0518843-31f8-44f9-bd8e-1189e16de0f1",
      "parameters": {
        "amount": 30
      },
      "typeVersion": 1.1
    },
    {
      "id": "9bb49bf6-a02e-4cf7-a1d1-ca4addff2bc6",
      "name": "条件判断",
      "type": "n8n-nodes-base.if",
      "position": [
        -272,
        2016
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "and",
          "conditions": [
            {
              "id": "7a07aec1-fc5f-4b76-94d9-6fa8f509ac8e",
              "operator": {
                "name": "filter.operator.equals",
                "type": "string",
                "operation": "equals"
              },
              "leftValue": "={{ $json.status }}",
              "rightValue": "SUCCESS"
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "28654aff-e603-4080-9e7a-e706aaee47c4",
      "name": "等待 2",
      "type": "n8n-nodes-base.wait",
      "position": [
        -48,
        2184
      ],
      "webhookId": "8da5da31-1ebd-4c82-8c6a-476d5d277cdd",
      "parameters": {
        "amount": 60
      },
      "typeVersion": 1.1
    },
    {
      "id": "ba935542-d2c0-4781-b6f9-5e1e007a9740",
      "name": "便签2",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        0,
        1584
      ],
      "parameters": {
        "width": 288,
        "height": 352,
        "content": "## 规范化内容"
      },
      "typeVersion": 1
    },
    {
      "id": "45efcdf9-89a9-4638-a9ed-cac39506270f",
      "name": "便签3",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -2080,
        1472
      ],
      "parameters": {
        "width": 464,
        "height": 1200,
        "content": "## 试试看!"
      },
      "typeVersion": 1
    },
    {
      "id": "34f5ba5e-7f4e-4c94-a4e8-41bfbaf163a1",
      "name": "上传到 Llama Cloud",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        -944,
        2088
      ],
      "parameters": {
        "url": "https://api.cloud.llamaindex.ai/api/v1/parsing/upload",
        "method": "POST",
        "options": {},
        "sendBody": true,
        "contentType": "multipart-form-data",
        "sendHeaders": true,
        "authentication": "genericCredentialType",
        "bodyParameters": {
          "parameters": [
            {
              "name": "file",
              "parameterType": "formBinaryData",
              "inputDataFieldName": "data"
            }
          ]
        },
        "genericAuthType": "httpBearerAuth",
        "headerParameters": {
          "parameters": [
            {
              "name": "accept",
              "value": "application/json"
            },
            {
              "name": "Content-Type",
              "value": "multipart/form-data"
            }
          ]
        }
      },
      "credentials": {
        "httpBearerAuth": {
          "id": "FlAAm17M7G6as02l",
          "name": "learnby_llama_cloud"
        }
      },
      "executeOnce": false,
      "retryOnFail": true,
      "typeVersion": 4.2,
      "alwaysOutputData": false
    },
    {
      "id": "1199e4ff-1952-4225-b655-1f63875f8903",
      "name": "检查解析状态",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        -496,
        2088
      ],
      "parameters": {
        "url": "=https://api.cloud.llamaindex.ai/api/parsing/job/{{ $('Upload to Llama Cloud').item.json.id }}",
        "options": {},
        "sendHeaders": true,
        "authentication": "genericCredentialType",
        "genericAuthType": "httpBearerAuth",
        "headerParameters": {
          "parameters": [
            {
              "name": "accept",
              "value": "application/json"
            }
          ]
        }
      },
      "credentials": {
        "httpBearerAuth": {
          "id": "FlAAm17M7G6as02l",
          "name": "learnby_llama_cloud"
        }
      },
      "retryOnFail": true,
      "typeVersion": 4.2
    },
    {
      "id": "cfaf9e10-0297-423a-b3f4-c25561c92078",
      "name": "从 Llama Cloud 提取 Markdown",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        -48,
        1968
      ],
      "parameters": {
        "url": "=https://api.cloud.llamaindex.ai/api/v1/parsing/job/{{ $json.id }}/result/markdown",
        "options": {},
        "sendHeaders": true,
        "authentication": "genericCredentialType",
        "genericAuthType": "httpBearerAuth",
        "headerParameters": {
          "parameters": [
            {
              "name": "accept",
              "value": "application/json"
            }
          ]
        }
      },
      "credentials": {
        "httpBearerAuth": {
          "id": "FlAAm17M7G6as02l",
          "name": "learnby_llama_cloud"
        }
      },
      "retryOnFail": true,
      "typeVersion": 4.2
    },
    {
      "id": "564a0930-e80b-4db6-a62a-5224248e5cd9",
      "name": "规范化文本",
      "type": "n8n-nodes-base.code",
      "position": [
        176,
        1968
      ],
      "parameters": {
        "mode": "runOnceForEachItem",
        "jsCode": "// Get the input text from the previous node\nconst input = $json.markdown || $json.text || \"\";\n\nlet text = input.replace(/Car Insurance Policy\\s*\\d+/gi, \"\");\n\n// Remove \"Page X\" markers\ntext = text.replace(/Page\\s*\\d+/gi, \"\");\n\n// Replace --- dividers with a single newline\ntext = text.replace(/-{3,}/g, \"\\n\");\n\n// Decode & cleanup artifacts\ntext = text.replace(/&/g, \"&\");   // fix HTML entities\ntext = text.replace(/[ⓤ]/g, \"-\");      // replace bullet symbols with dashes\n\n// Collapse whitespace\ntext = text.replace(/\\n{2,}/g, \"\\n\\n\"); // keep paragraph breaks\ntext = text.replace(/[ \\t]+/g, \" \");    // collapse spaces\n\n// Step 5: Trim\ntext = text.trim();\n\n// Output for next node\nreturn { json: { normalizedText: text } };\n"
      },
      "typeVersion": 2
    },
    {
      "id": "fe32694a-2cbc-4ad4-88aa-4eb3dba0256c",
      "name": "分块文本",
      "type": "@n8n/n8n-nodes-langchain.textSplitterRecursiveCharacterTextSplitter",
      "position": [
        608,
        2400
      ],
      "parameters": {
        "options": {
          "splitCode": "markdown"
        },
        "chunkSize": 1200,
        "chunkOverlap": 150
      },
      "typeVersion": 1
    },
    {
      "id": "b399c9fa-03d7-4126-9026-674d091b9ddf",
      "name": "生成嵌入",
      "type": "@n8n/n8n-nodes-langchain.embeddingsOpenAi",
      "position": [
        400,
        2192
      ],
      "parameters": {
        "options": {}
      },
      "credentials": {
        "openAiApi": {
          "id": "Yj4Rt75fspowAEru",
          "name": "nextweb-openai"
        }
      },
      "typeVersion": 1.2
    },
    {
      "id": "1da27207-b77e-41d0-a249-5096ec8ac259",
      "name": "存储到 Pinecone",
      "type": "@n8n/n8n-nodes-langchain.vectorStorePinecone",
      "position": [
        432,
        1968
      ],
      "parameters": {
        "mode": "insert",
        "options": {
          "pineconeNamespace": "rag"
        },
        "pineconeIndex": {
          "__rl": true,
          "mode": "id",
          "value": "demo"
        }
      },
      "credentials": {
        "pineconeApi": {
          "id": "uo1lZDPNWTsMAeOC",
          "name": "learnby-PineconeApi-account"
        }
      },
      "notesInFlow": false,
      "typeVersion": 1.3
    },
    {
      "id": "4f65ec8b-f936-41cb-b05c-0cc710df1c9e",
      "name": "便签说明4",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1568,
        1728
      ],
      "parameters": {
        "color": 6,
        "width": 464,
        "height": 640,
        "content": "## 提取数据"
      },
      "typeVersion": 1
    }
  ],
  "active": false,
  "pinData": {},
  "settings": {
    "executionOrder": "v1"
  },
  "versionId": "5ec0ee83-34cd-423d-8bd5-41400bde4a4a",
  "connections": {
    "If": {
      "main": [
        [
          {
            "node": "Extract Markdown from Llama Cloud",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Wait2",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Wait": {
      "main": [
        [
          {
            "node": "Check Parsing Status",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Wait2": {
      "main": [
        [
          {
            "node": "Check Parsing Status",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Chunk Text": {
      "ai_textSplitter": [
        [
          {
            "node": "Default Data Loader",
            "type": "ai_textSplitter",
            "index": 0
          }
        ]
      ]
    },
    "Download file": {
      "main": [
        [
          {
            "node": "Upload to Llama Cloud",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Normalize Text": {
      "main": [
        [
          {
            "node": "Store in Pinecone",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Store in Pinecone": {
      "main": [
        []
      ]
    },
    "Default Data Loader": {
      "ai_document": [
        [
          {
            "node": "Store in Pinecone",
            "type": "ai_document",
            "index": 0
          }
        ]
      ]
    },
    "Generate Embeddings": {
      "ai_embedding": [
        [
          {
            "node": "Store in Pinecone",
            "type": "ai_embedding",
            "index": 0
          }
        ]
      ]
    },
    "Check Parsing Status": {
      "main": [
        [
          {
            "node": "If",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Google Drive Trigger": {
      "main": [
        [
          {
            "node": "Download file",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Upload to Llama Cloud": {
      "main": [
        [
          {
            "node": "Wait",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract Markdown from Llama Cloud": {
      "main": [
        [
          {
            "node": "Normalize Text",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。

这个工作流适合什么场景?

适合需要构建 AI RAG(检索增强生成)或多模态 AI 应用的高级用户,例如将 PDF 解析、规范化后存入 Pinecone,用于搭建文档问答系统。

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。

工作流信息
难度等级
高级
节点数量:18
分类:2
节点类型:11
难度说明

适合高级用户,包含 16+ 个节点的复杂工作流

作者
Alok Kumar

Alok Kumar

@alokkumar

I am a Principal Software Engineer based in Ireland with a deep passion for AI and emerging technologies. With extensive experience in designing and implementing scalable software solutions, I focus on leveraging artificial intelligence to solve real-world problems. I enjoy exploring innovative applications of AI, from intelligent automation to data-driven insights, and I’m dedicated to building systems that are both efficient and impactful.

外部链接
在 n8n.io 查看

分享此工作流