使用PDF Vector进行OCR、分析和Google Drive的文档处理
中级
这是一个Document Extraction, AI Summarization, Multimodal AI领域的自动化工作流,包含 13 个节点。主要使用 Set, Code, SplitOut, Aggregate, GoogleDrive 等节点。 使用PDF Vector进行OCR、分析和Google Drive的文档处理
前置要求
- Google Drive API 凭证
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
"meta": {
"instanceId": "placeholder"
},
"nodes": [
{
"id": "overview-note",
"name": "分析概览",
"type": "n8n-nodes-base.stickyNote",
"position": [
50,
50
],
"parameters": {
"color": 5,
"width": 350,
"height": 180,
"content": "## 📊 实时分析"
},
"typeVersion": 1
},
{
"id": "metrics-note",
"name": "跟踪的指标",
"type": "n8n-nodes-base.stickyNote",
"position": [
450,
550
],
"parameters": {
"width": 260,
"height": 160,
"content": "## 📈 关键指标"
},
"typeVersion": 1
},
{
"id": "output-note",
"name": "仪表板输出",
"type": "n8n-nodes-base.stickyNote",
"position": [
750,
550
],
"parameters": {
"color": 6,
"width": 250,
"height": 150,
"content": "## 📊 可视化"
},
"typeVersion": 1
},
{
"id": "manual-trigger",
"name": "Manual Trigger",
"type": "n8n-nodes-base.manualTrigger",
"notes": "Start batch processing",
"position": [
250,
300
],
"parameters": {},
"typeVersion": 1
},
{
"id": "list-documents",
"name": "List Documents",
"type": "n8n-nodes-base.googleDrive",
"notes": "Replace FOLDER_ID_HERE with your Google Drive folder ID",
"position": [
450,
300
],
"parameters": {
"limit": 100,
"fields": [
"id",
"name",
"mimeType",
"size",
"webViewLink",
"createdTime"
],
"operation": "list",
"queryString": "'FOLDER_ID_HERE' in parents and trashed=false"
},
"typeVersion": 3
},
{
"id": "validate-files",
"name": "Validate & Queue Files",
"type": "n8n-nodes-base.code",
"notes": "Validate and prioritize files",
"position": [
650,
300
],
"parameters": {
"jsCode": "// Validate and categorize documents\nconst files = $input.all().map(item => item.json);\nconst processingQueue = {\n valid: [],\n invalid: [],\n stats: {\n totalFiles: files.length,\n pdfCount: 0,\n wordCount: 0,\n imageCount: 0,\n otherCount: 0,\n totalSizeMB: 0\n }\n};\n\n// Define supported formats\nconst supportedFormats = {\n pdf: ['application/pdf'],\n word: [\n 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',\n 'application/msword'\n ],\n image: ['image/jpeg', 'image/png', 'image/gif']\n};\n\nfiles.forEach(file => {\n const mimeType = file.mimeType;\n const sizeMB = (parseInt(file.size) || 0) / (1024 * 1024);\n \n // Check if supported\n let fileType = 'other';\n let isValid = false;\n \n if (supportedFormats.pdf.includes(mimeType)) {\n fileType = 'pdf';\n isValid = true;\n processingQueue.stats.pdfCount++;\n } else if (supportedFormats.word.includes(mimeType)) {\n fileType = 'word';\n isValid = true;\n processingQueue.stats.wordCount++;\n } else if (supportedFormats.image.includes(mimeType)) {\n fileType = 'image';\n isValid = true;\n processingQueue.stats.imageCount++;\n } else {\n processingQueue.stats.otherCount++;\n }\n \n // Check file size (max 50MB)\n if (sizeMB > 50) {\n isValid = false;\n }\n \n const fileInfo = {\n ...file,\n fileType,\n sizeMB: Math.round(sizeMB * 100) / 100,\n processingPriority: sizeMB < 5 ? 'high' : sizeMB < 20 ? 'medium' : 'low',\n estimatedCredits: fileType === 'pdf' ? Math.ceil(sizeMB * 2) : 1\n };\n \n if (isValid) {\n processingQueue.valid.push(fileInfo);\n } else {\n processingQueue.invalid.push({\n ...fileInfo,\n reason: sizeMB > 50 ? 'File too large' : 'Unsupported format'\n });\n }\n \n processingQueue.stats.totalSizeMB += sizeMB;\n});\n\n// Sort by priority\nprocessingQueue.valid.sort((a, b) => {\n const priority = { high: 1, medium: 2, low: 3 };\n return priority[a.processingPriority] - priority[b.processingPriority];\n});\n\nreturn [{\n json: processingQueue\n}];"
},
"typeVersion": 2
},
{
"id": "batch-processor",
"name": "Process in Batches",
"type": "n8n-nodes-base.splitInBatches",
"notes": "Process 5 files at a time",
"position": [
850,
300
],
"parameters": {
"options": {},
"batchSize": 5
},
"typeVersion": 3
},
{
"id": "split-files",
"name": "Split Out Files",
"type": "n8n-nodes-base.set",
"notes": "Prepare individual files",
"position": [
1050,
300
],
"parameters": {
"options": {},
"assignments": {
"assignments": [
{
"id": "assignment1",
"name": "processingBatch",
"type": "object",
"value": "={{ $json }}"
}
]
}
},
"typeVersion": 3
},
{
"id": "split-items",
"name": "Split Items",
"type": "n8n-nodes-base.splitOut",
"position": [
1250,
300
],
"parameters": {
"options": {},
"fieldToSplitOut": "processingBatch.valid"
},
"typeVersion": 1
},
{
"id": "pdfvector-process",
"name": "PDF Vector - Process Document/Image",
"type": "n8n-nodes-pdfvector.pdfVector",
"notes": "Process document or image",
"position": [
1450,
250
],
"parameters": {
"url": "={{ $json.webViewLink }}",
"useLLM": "auto",
"resource": "document",
"inputType": "url",
"operation": "parse"
},
"typeVersion": 1,
"continueOnFail": true
},
{
"id": "track-results",
"name": "Track Processing Results",
"type": "n8n-nodes-base.code",
"notes": "Analyze results",
"position": [
1650,
300
],
"parameters": {
"jsCode": "// Track processing results\nconst result = $input.first().json;\nconst originalFile = $node['Split Items'].json;\nconst startTime = new Date($node['Split Items'].context.executionTime);\nconst endTime = new Date();\nconst processingTime = (endTime - startTime) / 1000;\n\nconst processedFile = {\n // Original file info\n fileName: originalFile.name,\n fileType: originalFile.fileType,\n sizeMB: originalFile.sizeMB,\n \n // Processing results\n success: !result.error,\n processingTime: Math.round(processingTime * 100) / 100,\n creditsUsed: result.creditsUsed || originalFile.estimatedCredits,\n \n // Content info\n contentLength: result.content?.length || 0,\n wordCount: result.content?.split(' ').length || 0,\n \n // Error tracking\n error: result.error ? {\n message: result.error.message || 'Unknown error',\n code: result.error.code\n } : null,\n \n // Timestamps\n processedAt: new Date().toISOString()\n};\n\n// Quality checks\nif (processedFile.success) {\n processedFile.qualityChecks = {\n hasContent: processedFile.contentLength > 100,\n reasonableLength: processedFile.wordCount > 10 && processedFile.wordCount < 100000,\n properEncoding: !result.content?.includes('�'),\n creditsEfficiency: processedFile.creditsUsed / processedFile.sizeMB < 5\n };\n \n // Overall quality score\n const checks = Object.values(processedFile.qualityChecks);\n processedFile.qualityScore = (checks.filter(c => c).length / checks.length) * 100;\n}\n\nreturn [{ json: processedFile }];"
},
"typeVersion": 2
},
{
"id": "collect-batch",
"name": "Collect Batch Results",
"type": "n8n-nodes-base.aggregate",
"notes": "Aggregate batch results",
"position": [
1850,
300
],
"parameters": {
"options": {},
"aggregate": "aggregateAllItemData"
},
"typeVersion": 1
},
{
"id": "generate-analytics",
"name": "Generate Analytics Report",
"type": "n8n-nodes-base.code",
"notes": "Create analytics dashboard",
"position": [
2050,
300
],
"parameters": {
"jsCode": "// Generate comprehensive analytics report\nconst allResults = $input.all().map(item => item.json);\nconst initialStats = $node['Validate & Queue Files'].json.stats;\n\n// Calculate processing analytics\nconst analytics = {\n overview: {\n totalFilesFound: initialStats.totalFiles,\n filesProcessed: allResults.length,\n successfulProcessing: allResults.filter(r => r.success).length,\n failedProcessing: allResults.filter(r => !r.success).length,\n successRate: 0,\n totalProcessingTime: 0,\n totalCreditsUsed: 0,\n averageQualityScore: 0\n },\n \n byFileType: {\n pdf: { processed: 0, successful: 0, failed: 0, avgTime: 0, creditsUsed: 0 },\n word: { processed: 0, successful: 0, failed: 0, avgTime: 0, creditsUsed: 0 },\n image: { processed: 0, successful: 0, failed: 0, avgTime: 0, creditsUsed: 0 }\n },\n \n errors: {},\n \n performance: {\n fastestFile: null,\n slowestFile: null,\n mostEfficientCredit: null,\n leastEfficientCredit: null\n },\n \n quality: {\n highQuality: [],\n lowQuality: [],\n averageWordCount: 0\n }\n};\n\n// Process results\nlet totalQualityScore = 0;\nlet qualityCount = 0;\n\nallResults.forEach(result => {\n // Update overview\n analytics.overview.totalProcessingTime += result.processingTime || 0;\n analytics.overview.totalCreditsUsed += result.creditsUsed || 0;\n \n // Update by file type\n const type = result.fileType;\n if (analytics.byFileType[type]) {\n analytics.byFileType[type].processed++;\n if (result.success) {\n analytics.byFileType[type].successful++;\n } else {\n analytics.byFileType[type].failed++;\n }\n analytics.byFileType[type].avgTime += result.processingTime || 0;\n analytics.byFileType[type].creditsUsed += result.creditsUsed || 0;\n }\n \n // Track errors\n if (result.error) {\n const errorType = result.error.message || 'Unknown';\n analytics.errors[errorType] = (analytics.errors[errorType] || 0) + 1;\n }\n \n // Track performance\n if (!analytics.performance.fastestFile || result.processingTime < analytics.performance.fastestFile.time) {\n analytics.performance.fastestFile = {\n name: result.fileName,\n time: result.processingTime\n };\n }\n if (!analytics.performance.slowestFile || result.processingTime > analytics.performance.slowestFile.time) {\n analytics.performance.slowestFile = {\n name: result.fileName,\n time: result.processingTime\n };\n }\n \n // Track quality\n if (result.qualityScore !== undefined) {\n totalQualityScore += result.qualityScore;\n qualityCount++;\n \n if (result.qualityScore >= 75) {\n analytics.quality.highQuality.push(result.fileName);\n } else if (result.qualityScore < 50) {\n analytics.quality.lowQuality.push(result.fileName);\n }\n }\n \n analytics.quality.averageWordCount += result.wordCount || 0;\n});\n\n// Calculate averages\nanalytics.overview.successRate = Math.round((analytics.overview.successfulProcessing / analytics.overview.filesProcessed) * 100);\nanalytics.overview.averageQualityScore = qualityCount > 0 ? Math.round(totalQualityScore / qualityCount) : 0;\nanalytics.quality.averageWordCount = Math.round(analytics.quality.averageWordCount / allResults.length);\n\n// Calculate file type averages\nObject.keys(analytics.byFileType).forEach(type => {\n const typeData = analytics.byFileType[type];\n if (typeData.processed > 0) {\n typeData.avgTime = Math.round((typeData.avgTime / typeData.processed) * 100) / 100;\n typeData.successRate = Math.round((typeData.successful / typeData.processed) * 100);\n }\n});\n\n// Generate report\nlet report = `# Batch Processing Analytics Report\\n\\n`;\nreport += `**Generated:** ${new Date().toLocaleString()}\\n\\n`;\n\nreport += `## Overview\\n`;\nreport += `- **Files Processed:** ${analytics.overview.filesProcessed} of ${analytics.overview.totalFilesFound}\\n`;\nreport += `- **Success Rate:** ${analytics.overview.successRate}%\\n`;\nreport += `- **Total Processing Time:** ${Math.round(analytics.overview.totalProcessingTime)}s\\n`;\nreport += `- **Credits Used:** ${analytics.overview.totalCreditsUsed}\\n`;\nreport += `- **Average Quality Score:** ${analytics.overview.averageQualityScore}%\\n\\n`;\n\nreport += `## Performance by File Type\\n`;\nObject.entries(analytics.byFileType).forEach(([type, data]) => {\n if (data.processed > 0) {\n report += `### ${type.toUpperCase()}\\n`;\n report += `- Processed: ${data.processed}\\n`;\n report += `- Success Rate: ${data.successRate}%\\n`;\n report += `- Avg Time: ${data.avgTime}s\\n`;\n report += `- Credits: ${data.creditsUsed}\\n\\n`;\n }\n});\n\nif (Object.keys(analytics.errors).length > 0) {\n report += `## Errors Encountered\\n`;\n Object.entries(analytics.errors).forEach(([error, count]) => {\n report += `- ${error}: ${count} occurrences\\n`;\n });\n report += `\\n`;\n}\n\nreport += `## Recommendations\\n`;\nif (analytics.overview.successRate < 90) {\n report += `- Success rate is below 90%. Review error logs for common issues.\\n`;\n}\nif (analytics.overview.averageQualityScore < 70) {\n report += `- Quality scores are low. Consider using 'always' LLM mode for better results.\\n`;\n}\nif (analytics.quality.lowQuality.length > 0) {\n report += `- ${analytics.quality.lowQuality.length} files had low quality scores. Manual review recommended.\\n`;\n}\n\nreturn [{\n json: {\n analytics,\n report,\n processedFiles: allResults,\n timestamp: new Date().toISOString()\n }\n}];"
},
"typeVersion": 2
}
],
"connections": {
"Split Items": {
"main": [
[
{
"node": "PDF Vector - Process Document/Image",
"type": "main",
"index": 0
}
]
]
},
"List Documents": {
"main": [
[
{
"node": "Validate & Queue Files",
"type": "main",
"index": 0
}
]
]
},
"Manual Trigger": {
"main": [
[
{
"node": "List Documents",
"type": "main",
"index": 0
}
]
]
},
"Split Out Files": {
"main": [
[
{
"node": "Split Items",
"type": "main",
"index": 0
}
]
]
},
"Process in Batches": {
"main": [
[
{
"node": "Split Out Files",
"type": "main",
"index": 0
}
],
[
{
"node": "Generate Analytics Report",
"type": "main",
"index": 0
}
]
]
},
"Collect Batch Results": {
"main": [
[
{
"node": "Generate Analytics Report",
"type": "main",
"index": 0
}
]
]
},
"Validate & Queue Files": {
"main": [
[
{
"node": "Process in Batches",
"type": "main",
"index": 0
}
]
]
},
"Track Processing Results": {
"main": [
[
{
"node": "Collect Batch Results",
"type": "main",
"index": 0
}
]
]
},
"PDF Vector - Process Document/Image": {
"main": [
[
{
"node": "Track Processing Results",
"type": "main",
"index": 0
}
]
]
}
}
}
常见问题
如何使用这个工作流?
复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。
这个工作流适合什么场景?
中级 - 文档提取, AI 摘要总结, 多模态 AI
需要付费吗?
本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。
相关工作流推荐
使用PDF Vector和HIPAA合规从医疗文档提取临床数据
使用PDF Vector和HIPAA合规从医疗文档提取临床数据
If
Code
Postgres
+4
9 节点PDF Vector
文档提取
使用PDF向量AI解析评分简历
使用PDF向量AI解析评分简历
Set
Code
Google Drive
+3
11 节点PDF Vector
人力资源
使用GPT-4和多数据库搜索自动化学术文献综述
使用GPT-4和多数据库搜索自动化学术文献综述
If
Set
Code
+4
13 节点PDF Vector
文档提取
使用 PDF Vector 和 Google Drive 的自动化收据处理与税务分类
使用 PDF Vector 和 Google Drive 的自动化收据处理与税务分类
Code
Google Drive
Google Sheets
+3
9 节点PDF Vector
发票处理
使用 PDF 向量、OCR、GPT-4 和 Google Drive 的研究论文分析系统
使用 PDF 向量、OCR、GPT-4 和 Google Drive 的研究论文分析系统
Code
Open Ai
Postgres
+4
11 节点PDF Vector
文档提取
自动翻译Google Slides演示文稿
基于Gemini的AI驱动自动翻译Google Slides演示文稿至任意语言
Set
Code
Wait
+12
18 节点Davide
文档提取
工作流信息
难度等级
中级
节点数量13
分类3
节点类型9
作者
PDF Vector
@pdfvectorA fully featured PDF APIs for developers - Parse any PDF or Word document, extract structured data, and access millions of academic papers - all through simple APIs.
外部链接
在 n8n.io 查看 →
分享此工作流