解析、规范化、提取 PDF 内容并存储到 Pinecone 以用于 RAG
这是一个 AI RAG、多模态 AI 领域的自动化工作流,包含 18 个节点,主要使用 If、Code、Wait、GoogleDrive、HttpRequest 等节点。使用 LlamaIndex、OpenAI 嵌入和 Pinecone 向量数据库构建 PDF 问答系统。
- Google Drive API 凭证
- Llama Cloud API Key(HTTP Bearer 认证凭证)
- OpenAI API Key
- Pinecone API Key
使用的节点 (18)
{
"id": "xDiuqZUZnShKpPzX",
"meta": {
"instanceId": "70273a2379644db63ce659827cfd8abac2d0b189210eafa02dd5376e3a62cd1d",
"templateCredsSetupCompleted": true
},
"name": "解析、规范化、提取 PDF 内容并存储到 Pinecone 以用于 RAG",
"tags": [],
"nodes": [
{
"id": "19b009db-a418-458c-a216-bdcc9af6fd2f",
"name": "Google Drive触发器",
"type": "n8n-nodes-base.googleDriveTrigger",
"position": [
-1504,
2080
],
"parameters": {
"event": "fileCreated",
"options": {},
"pollTimes": {
"item": [
{
"mode": "everyMinute"
}
]
},
"triggerOn": "specificFolder",
"folderToWatch": {
"__rl": true,
"mode": "list",
"value": ""
}
},
"credentials": {
"googleDriveOAuth2Api": {
"id": "aU33fzddE6s3ZQw6",
"name": "LearnBy-Google-Drive"
}
},
"typeVersion": 1
},
{
"id": "ff933f76-d719-40b5-b193-8a29e5fa2197",
"name": "下载文件",
"type": "n8n-nodes-base.googleDrive",
"position": [
-1248,
2096
],
"parameters": {
"fileId": {
"__rl": true,
"mode": "id",
"value": "={{ $json.id }}"
},
"options": {},
"operation": "download"
},
"credentials": {
"googleDriveOAuth2Api": {
"id": "aU33fzddE6s3ZQw6",
"name": "LearnBy-Google-Drive"
}
},
"typeVersion": 3
},
{
"id": "127b41ed-ad45-4234-b87f-4f3c2b6ea531",
"name": "默认数据加载器",
"type": "@n8n/n8n-nodes-langchain.documentDefaultDataLoader",
"position": [
528,
2192
],
"parameters": {
"options": {},
"textSplittingMode": "custom"
},
"typeVersion": 1.1
},
{
"id": "0316c7d4-449f-4275-a9b1-8848545beba8",
"name": "便签",
"type": "n8n-nodes-base.stickyNote",
"position": [
336,
1712
],
"parameters": {
"width": 736,
"height": 832,
"content": "## 保存到向量数据库"
},
"typeVersion": 1
},
{
"id": "b06702b5-c322-4a5a-949a-855c8b97dadc",
"name": "便签1",
"type": "n8n-nodes-base.stickyNote",
"position": [
-1088,
1720
],
"parameters": {
"color": 4,
"width": 1392,
"height": 656,
"content": "## 准备数据 - 解析和规范化"
},
"typeVersion": 1
},
{
"id": "05034e35-f6bf-45a6-860e-94f4da566daf",
"name": "等待",
"type": "n8n-nodes-base.wait",
"position": [
-720,
2088
],
"webhookId": "a0518843-31f8-44f9-bd8e-1189e16de0f1",
"parameters": {
"amount": 30
},
"typeVersion": 1.1
},
{
"id": "9bb49bf6-a02e-4cf7-a1d1-ca4addff2bc6",
"name": "条件判断",
"type": "n8n-nodes-base.if",
"position": [
-272,
2016
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "7a07aec1-fc5f-4b76-94d9-6fa8f509ac8e",
"operator": {
"name": "filter.operator.equals",
"type": "string",
"operation": "equals"
},
"leftValue": "={{ $json.status }}",
"rightValue": "SUCCESS"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "28654aff-e603-4080-9e7a-e706aaee47c4",
"name": "等待 2",
"type": "n8n-nodes-base.wait",
"position": [
-48,
2184
],
"webhookId": "8da5da31-1ebd-4c82-8c6a-476d5d277cdd",
"parameters": {
"amount": 60
},
"typeVersion": 1.1
},
{
"id": "ba935542-d2c0-4781-b6f9-5e1e007a9740",
"name": "便签2",
"type": "n8n-nodes-base.stickyNote",
"position": [
0,
1584
],
"parameters": {
"width": 288,
"height": 352,
"content": "## 规范化内容"
},
"typeVersion": 1
},
{
"id": "45efcdf9-89a9-4638-a9ed-cac39506270f",
"name": "便签3",
"type": "n8n-nodes-base.stickyNote",
"position": [
-2080,
1472
],
"parameters": {
"width": 464,
"height": 1200,
"content": "## 试试看!"
},
"typeVersion": 1
},
{
"id": "34f5ba5e-7f4e-4c94-a4e8-41bfbaf163a1",
"name": "上传到 Llama Cloud",
"type": "n8n-nodes-base.httpRequest",
"position": [
-944,
2088
],
"parameters": {
"url": "https://api.cloud.llamaindex.ai/api/v1/parsing/upload",
"method": "POST",
"options": {},
"sendBody": true,
"contentType": "multipart-form-data",
"sendHeaders": true,
"authentication": "genericCredentialType",
"bodyParameters": {
"parameters": [
{
"name": "file",
"parameterType": "formBinaryData",
"inputDataFieldName": "data"
}
]
},
"genericAuthType": "httpBearerAuth",
"headerParameters": {
"parameters": [
{
"name": "accept",
"value": "application/json"
},
{
"name": "Content-Type",
"value": "multipart/form-data"
}
]
}
},
"credentials": {
"httpBearerAuth": {
"id": "FlAAm17M7G6as02l",
"name": "learnby_llama_cloud"
}
},
"executeOnce": false,
"retryOnFail": true,
"typeVersion": 4.2,
"alwaysOutputData": false
},
{
"id": "1199e4ff-1952-4225-b655-1f63875f8903",
"name": "检查解析状态",
"type": "n8n-nodes-base.httpRequest",
"position": [
-496,
2088
],
"parameters": {
"url": "=https://api.cloud.llamaindex.ai/api/parsing/job/{{ $('上传到 Llama Cloud').item.json.id }}",
"options": {},
"sendHeaders": true,
"authentication": "genericCredentialType",
"genericAuthType": "httpBearerAuth",
"headerParameters": {
"parameters": [
{
"name": "accept",
"value": "application/json"
}
]
}
},
"credentials": {
"httpBearerAuth": {
"id": "FlAAm17M7G6as02l",
"name": "learnby_llama_cloud"
}
},
"retryOnFail": true,
"typeVersion": 4.2
},
{
"id": "cfaf9e10-0297-423a-b3f4-c25561c92078",
"name": "从 Llama Cloud 提取 Markdown",
"type": "n8n-nodes-base.httpRequest",
"position": [
-48,
1968
],
"parameters": {
"url": "=https://api.cloud.llamaindex.ai/api/v1/parsing/job/{{ $json.id }}/result/markdown",
"options": {},
"sendHeaders": true,
"authentication": "genericCredentialType",
"genericAuthType": "httpBearerAuth",
"headerParameters": {
"parameters": [
{
"name": "accept",
"value": "application/json"
}
]
}
},
"credentials": {
"httpBearerAuth": {
"id": "FlAAm17M7G6as02l",
"name": "learnby_llama_cloud"
}
},
"retryOnFail": true,
"typeVersion": 4.2
},
{
"id": "564a0930-e80b-4db6-a62a-5224248e5cd9",
"name": "规范化文本",
"type": "n8n-nodes-base.code",
"position": [
176,
1968
],
"parameters": {
"mode": "runOnceForEachItem",
"jsCode": "// Get the input text from the previous node\nconst input = $json.markdown || $json.text || \"\";\n\nlet text = input.replace(/Car Insurance Policy\\s*\\d+/gi, \"\");\n\n// Remove \"Page X\" markers\ntext = text.replace(/Page\\s*\\d+/gi, \"\");\n\n// Replace --- dividers with a single newline\ntext = text.replace(/-{3,}/g, \"\\n\");\n\n// Decode & cleanup artifacts\ntext = text.replace(/&amp;/g, \"&\"); // fix HTML entities\ntext = text.replace(/[ⓤ]/g, \"-\"); // replace bullet symbols with dashes\n\n// Collapse whitespace\ntext = text.replace(/\\n{2,}/g, \"\\n\\n\"); // keep paragraph breaks\ntext = text.replace(/[ \\t]+/g, \" \"); // collapse spaces\n\n// Step 5: Trim\ntext = text.trim();\n\n// Output for next node\nreturn { json: { normalizedText: text } };\n"
},
"typeVersion": 2
},
{
"id": "fe32694a-2cbc-4ad4-88aa-4eb3dba0256c",
"name": "分块文本",
"type": "@n8n/n8n-nodes-langchain.textSplitterRecursiveCharacterTextSplitter",
"position": [
608,
2400
],
"parameters": {
"options": {
"splitCode": "markdown"
},
"chunkSize": 1200,
"chunkOverlap": 150
},
"typeVersion": 1
},
{
"id": "b399c9fa-03d7-4126-9026-674d091b9ddf",
"name": "生成嵌入",
"type": "@n8n/n8n-nodes-langchain.embeddingsOpenAi",
"position": [
400,
2192
],
"parameters": {
"options": {}
},
"credentials": {
"openAiApi": {
"id": "Yj4Rt75fspowAEru",
"name": "nextweb-openai"
}
},
"typeVersion": 1.2
},
{
"id": "1da27207-b77e-41d0-a249-5096ec8ac259",
"name": "存储到 Pinecone",
"type": "@n8n/n8n-nodes-langchain.vectorStorePinecone",
"position": [
432,
1968
],
"parameters": {
"mode": "insert",
"options": {
"pineconeNamespace": "rag"
},
"pineconeIndex": {
"__rl": true,
"mode": "id",
"value": "demo"
}
},
"credentials": {
"pineconeApi": {
"id": "uo1lZDPNWTsMAeOC",
"name": "learnby-PineconeApi-account"
}
},
"notesInFlow": false,
"typeVersion": 1.3
},
{
"id": "4f65ec8b-f936-41cb-b05c-0cc710df1c9e",
"name": "便签说明4",
"type": "n8n-nodes-base.stickyNote",
"position": [
-1568,
1728
],
"parameters": {
"color": 6,
"width": 464,
"height": 640,
"content": "## 提取数据"
},
"typeVersion": 1
}
],
"active": false,
"pinData": {},
"settings": {
"executionOrder": "v1"
},
"versionId": "5ec0ee83-34cd-423d-8bd5-41400bde4a4a",
"connections": {
"条件判断": {
"main": [
[
{
"node": "从 Llama Cloud 提取 Markdown",
"type": "main",
"index": 0
}
],
[
{
"node": "等待 2",
"type": "main",
"index": 0
}
]
]
},
"等待": {
"main": [
[
{
"node": "检查解析状态",
"type": "main",
"index": 0
}
]
]
},
"等待 2": {
"main": [
[
{
"node": "检查解析状态",
"type": "main",
"index": 0
}
]
]
},
"分块文本": {
"ai_textSplitter": [
[
{
"node": "默认数据加载器",
"type": "ai_textSplitter",
"index": 0
}
]
]
},
"下载文件": {
"main": [
[
{
"node": "上传到 Llama Cloud",
"type": "main",
"index": 0
}
]
]
},
"规范化文本": {
"main": [
[
{
"node": "存储到 Pinecone",
"type": "main",
"index": 0
}
]
]
},
"存储到 Pinecone": {
"main": [
[]
]
},
"默认数据加载器": {
"ai_document": [
[
{
"node": "存储到 Pinecone",
"type": "ai_document",
"index": 0
}
]
]
},
"生成嵌入": {
"ai_embedding": [
[
{
"node": "存储到 Pinecone",
"type": "ai_embedding",
"index": 0
}
]
]
},
"检查解析状态": {
"main": [
[
{
"node": "条件判断",
"type": "main",
"index": 0
}
]
]
},
"Google Drive触发器": {
"main": [
[
{
"node": "下载文件",
"type": "main",
"index": 0
}
]
]
},
"上传到 Llama Cloud": {
"main": [
[
{
"node": "等待",
"type": "main",
"index": 0
}
]
]
},
"从 Llama Cloud 提取 Markdown": {
"main": [
[
{
"node": "规范化文本",
"type": "main",
"index": 0
}
]
]
}
}
}
如何使用这个工作流?
复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。
这个工作流适合什么场景?
高级 - AI RAG 检索增强, 多模态 AI
需要付费吗?
本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。
相关工作流推荐
Alok Kumar
@alokkumar
I am a Principal Software Engineer based in Ireland with a deep passion for AI and emerging technologies. With extensive experience in designing and implementing scalable software solutions, I focus on leveraging artificial intelligence to solve real-world problems. I enjoy exploring innovative applications of AI, from intelligent automation to data-driven insights, and I’m dedicated to building systems that are both efficient and impactful.
分享此工作流