8
n8n 中文网 amn8n.com

使用PDF Vector进行OCR、分析和Google Drive的文档处理

中级

这是一个Document Extraction, AI Summarization, Multimodal AI领域的自动化工作流,包含 13 个节点。主要使用 Set, Code, SplitOut, Aggregate, GoogleDrive 等节点。 使用PDF Vector进行OCR、分析和Google Drive的文档处理

前置要求
  • Google Drive API 凭证
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "meta": {
    "instanceId": "placeholder"
  },
  "nodes": [
    {
      "id": "overview-note",
      "name": "分析概览",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        50,
        50
      ],
      "parameters": {
        "color": 5,
        "width": 350,
        "height": 180,
        "content": "## 📊 实时分析"
      },
      "typeVersion": 1
    },
    {
      "id": "metrics-note",
      "name": "跟踪的指标",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        450,
        550
      ],
      "parameters": {
        "width": 260,
        "height": 160,
        "content": "## 📈 关键指标"
      },
      "typeVersion": 1
    },
    {
      "id": "output-note",
      "name": "仪表板输出",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        750,
        550
      ],
      "parameters": {
        "color": 6,
        "width": 250,
        "height": 150,
        "content": "## 📊 可视化"
      },
      "typeVersion": 1
    },
    {
      "id": "manual-trigger",
      "name": "手动触发器",
      "type": "n8n-nodes-base.manualTrigger",
      "notes": "Start batch processing",
      "position": [
        250,
        300
      ],
      "parameters": {},
      "typeVersion": 1
    },
    {
      "id": "list-documents",
      "name": "列出文档",
      "type": "n8n-nodes-base.googleDrive",
      "notes": "Replace FOLDER_ID_HERE with your Google Drive folder ID",
      "position": [
        450,
        300
      ],
      "parameters": {
        "limit": 100,
        "fields": [
          "id",
          "name",
          "mimeType",
          "size",
          "webViewLink",
          "createdTime"
        ],
        "operation": "list",
        "queryString": "'FOLDER_ID_HERE' in parents and trashed=false"
      },
      "typeVersion": 3
    },
    {
      "id": "validate-files",
      "name": "验证并排队文件",
      "type": "n8n-nodes-base.code",
      "notes": "Validate and prioritize files",
      "position": [
        650,
        300
      ],
      "parameters": {
        "jsCode": "// Validate and categorize documents\nconst files = $input.all().map(item => item.json);\nconst processingQueue = {\n  valid: [],\n  invalid: [],\n  stats: {\n    totalFiles: files.length,\n    pdfCount: 0,\n    wordCount: 0,\n    imageCount: 0,\n    otherCount: 0,\n    totalSizeMB: 0\n  }\n};\n\n// Define supported formats\nconst supportedFormats = {\n  pdf: ['application/pdf'],\n  word: [\n    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',\n    'application/msword'\n  ],\n  image: ['image/jpeg', 'image/png', 'image/gif']\n};\n\nfiles.forEach(file => {\n  const mimeType = file.mimeType;\n  const sizeMB = (parseInt(file.size) || 0) / (1024 * 1024);\n  \n  // Check if supported\n  let fileType = 'other';\n  let isValid = false;\n  \n  if (supportedFormats.pdf.includes(mimeType)) {\n    fileType = 'pdf';\n    isValid = true;\n    processingQueue.stats.pdfCount++;\n  } else if (supportedFormats.word.includes(mimeType)) {\n    fileType = 'word';\n    isValid = true;\n    processingQueue.stats.wordCount++;\n  } else if (supportedFormats.image.includes(mimeType)) {\n    fileType = 'image';\n    isValid = true;\n    processingQueue.stats.imageCount++;\n  } else {\n    processingQueue.stats.otherCount++;\n  }\n  \n  // Check file size (max 50MB)\n  if (sizeMB > 50) {\n    isValid = false;\n  }\n  \n  const fileInfo = {\n    ...file,\n    fileType,\n    sizeMB: Math.round(sizeMB * 100) / 100,\n    processingPriority: sizeMB < 5 ? 'high' : sizeMB < 20 ? 'medium' : 'low',\n    estimatedCredits: fileType === 'pdf' ? Math.ceil(sizeMB * 2) : 1\n  };\n  \n  if (isValid) {\n    processingQueue.valid.push(fileInfo);\n  } else {\n    processingQueue.invalid.push({\n      ...fileInfo,\n      reason: sizeMB > 50 ? 
'File too large' : 'Unsupported format'\n    });\n  }\n  \n  processingQueue.stats.totalSizeMB += sizeMB;\n});\n\n// Sort by priority\nprocessingQueue.valid.sort((a, b) => {\n  const priority = { high: 1, medium: 2, low: 3 };\n  return priority[a.processingPriority] - priority[b.processingPriority];\n});\n\nreturn [{\n  json: processingQueue\n}];"
      },
      "typeVersion": 2
    },
    {
      "id": "batch-processor",
      "name": "批量处理",
      "type": "n8n-nodes-base.splitInBatches",
      "notes": "Process 5 files at a time",
      "position": [
        850,
        300
      ],
      "parameters": {
        "options": {},
        "batchSize": 5
      },
      "typeVersion": 3
    },
    {
      "id": "split-files",
      "name": "拆分输出文件",
      "type": "n8n-nodes-base.set",
      "notes": "Prepare individual files",
      "position": [
        1050,
        300
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "assignment1",
              "name": "processingBatch",
              "type": "object",
              "value": "={{ $json }}"
            }
          ]
        }
      },
      "typeVersion": 3
    },
    {
      "id": "split-items",
      "name": "拆分项目",
      "type": "n8n-nodes-base.splitOut",
      "position": [
        1250,
        300
      ],
      "parameters": {
        "options": {},
        "fieldToSplitOut": "processingBatch.valid"
      },
      "typeVersion": 1
    },
    {
      "id": "pdfvector-process",
      "name": "PDF 向量 - 处理文档/图像",
      "type": "n8n-nodes-pdfvector.pdfVector",
      "notes": "Process document or image",
      "position": [
        1450,
        250
      ],
      "parameters": {
        "url": "={{ $json.webViewLink }}",
        "useLLM": "auto",
        "resource": "document",
        "inputType": "url",
        "operation": "parse"
      },
      "typeVersion": 1,
      "continueOnFail": true
    },
    {
      "id": "track-results",
      "name": "跟踪处理结果",
      "type": "n8n-nodes-base.code",
      "notes": "Analyze results",
      "position": [
        1650,
        300
      ],
      "parameters": {
        "jsCode": "// Track processing results\nconst result = $input.first().json;\nconst originalFile = $node['Split Items'].json;\nconst startTime = new Date($node['Split Items'].context.executionTime);\nconst endTime = new Date();\nconst processingTime = (endTime - startTime) / 1000;\n\nconst processedFile = {\n  // Original file info\n  fileName: originalFile.name,\n  fileType: originalFile.fileType,\n  sizeMB: originalFile.sizeMB,\n  \n  // Processing results\n  success: !result.error,\n  processingTime: Math.round(processingTime * 100) / 100,\n  creditsUsed: result.creditsUsed || originalFile.estimatedCredits,\n  \n  // Content info\n  contentLength: result.content?.length || 0,\n  wordCount: result.content?.split(' ').length || 0,\n  \n  // Error tracking\n  error: result.error ? {\n    message: result.error.message || 'Unknown error',\n    code: result.error.code\n  } : null,\n  \n  // Timestamps\n  processedAt: new Date().toISOString()\n};\n\n// Quality checks\nif (processedFile.success) {\n  processedFile.qualityChecks = {\n    hasContent: processedFile.contentLength > 100,\n    reasonableLength: processedFile.wordCount > 10 && processedFile.wordCount < 100000,\n    properEncoding: !result.content?.includes('�'),\n    creditsEfficiency: processedFile.creditsUsed / processedFile.sizeMB < 5\n  };\n  \n  // Overall quality score\n  const checks = Object.values(processedFile.qualityChecks);\n  processedFile.qualityScore = (checks.filter(c => c).length / checks.length) * 100;\n}\n\nreturn [{ json: processedFile }];"
      },
      "typeVersion": 2
    },
    {
      "id": "collect-batch",
      "name": "收集批次结果",
      "type": "n8n-nodes-base.aggregate",
      "notes": "Aggregate batch results",
      "position": [
        1850,
        300
      ],
      "parameters": {
        "options": {},
        "aggregate": "aggregateAllItemData"
      },
      "typeVersion": 1
    },
    {
      "id": "generate-analytics",
      "name": "生成分析报告",
      "type": "n8n-nodes-base.code",
      "notes": "Create analytics dashboard",
      "position": [
        2050,
        300
      ],
      "parameters": {
        "jsCode": "// Generate comprehensive analytics report\nconst allResults = $input.all().map(item => item.json);\nconst initialStats = $node['Validate & Queue Files'].json.stats;\n\n// Calculate processing analytics\nconst analytics = {\n  overview: {\n    totalFilesFound: initialStats.totalFiles,\n    filesProcessed: allResults.length,\n    successfulProcessing: allResults.filter(r => r.success).length,\n    failedProcessing: allResults.filter(r => !r.success).length,\n    successRate: 0,\n    totalProcessingTime: 0,\n    totalCreditsUsed: 0,\n    averageQualityScore: 0\n  },\n  \n  byFileType: {\n    pdf: { processed: 0, successful: 0, failed: 0, avgTime: 0, creditsUsed: 0 },\n    word: { processed: 0, successful: 0, failed: 0, avgTime: 0, creditsUsed: 0 },\n    image: { processed: 0, successful: 0, failed: 0, avgTime: 0, creditsUsed: 0 }\n  },\n  \n  errors: {},\n  \n  performance: {\n    fastestFile: null,\n    slowestFile: null,\n    mostEfficientCredit: null,\n    leastEfficientCredit: null\n  },\n  \n  quality: {\n    highQuality: [],\n    lowQuality: [],\n    averageWordCount: 0\n  }\n};\n\n// Process results\nlet totalQualityScore = 0;\nlet qualityCount = 0;\n\nallResults.forEach(result => {\n  // Update overview\n  analytics.overview.totalProcessingTime += result.processingTime || 0;\n  analytics.overview.totalCreditsUsed += result.creditsUsed || 0;\n  \n  // Update by file type\n  const type = result.fileType;\n  if (analytics.byFileType[type]) {\n    analytics.byFileType[type].processed++;\n    if (result.success) {\n      analytics.byFileType[type].successful++;\n    } else {\n      analytics.byFileType[type].failed++;\n    }\n    analytics.byFileType[type].avgTime += result.processingTime || 0;\n    analytics.byFileType[type].creditsUsed += result.creditsUsed || 0;\n  }\n  \n  // Track errors\n  if (result.error) {\n    const errorType = result.error.message || 'Unknown';\n    analytics.errors[errorType] = (analytics.errors[errorType] || 0) + 
1;\n  }\n  \n  // Track performance\n  if (!analytics.performance.fastestFile || result.processingTime < analytics.performance.fastestFile.time) {\n    analytics.performance.fastestFile = {\n      name: result.fileName,\n      time: result.processingTime\n    };\n  }\n  if (!analytics.performance.slowestFile || result.processingTime > analytics.performance.slowestFile.time) {\n    analytics.performance.slowestFile = {\n      name: result.fileName,\n      time: result.processingTime\n    };\n  }\n  \n  // Track quality\n  if (result.qualityScore !== undefined) {\n    totalQualityScore += result.qualityScore;\n    qualityCount++;\n    \n    if (result.qualityScore >= 75) {\n      analytics.quality.highQuality.push(result.fileName);\n    } else if (result.qualityScore < 50) {\n      analytics.quality.lowQuality.push(result.fileName);\n    }\n  }\n  \n  analytics.quality.averageWordCount += result.wordCount || 0;\n});\n\n// Calculate averages\nanalytics.overview.successRate = Math.round((analytics.overview.successfulProcessing / analytics.overview.filesProcessed) * 100);\nanalytics.overview.averageQualityScore = qualityCount > 0 ? 
Math.round(totalQualityScore / qualityCount) : 0;\nanalytics.quality.averageWordCount = Math.round(analytics.quality.averageWordCount / allResults.length);\n\n// Calculate file type averages\nObject.keys(analytics.byFileType).forEach(type => {\n  const typeData = analytics.byFileType[type];\n  if (typeData.processed > 0) {\n    typeData.avgTime = Math.round((typeData.avgTime / typeData.processed) * 100) / 100;\n    typeData.successRate = Math.round((typeData.successful / typeData.processed) * 100);\n  }\n});\n\n// Generate report\nlet report = `# Batch Processing Analytics Report\\n\\n`;\nreport += `**Generated:** ${new Date().toLocaleString()}\\n\\n`;\n\nreport += `## Overview\\n`;\nreport += `- **Files Processed:** ${analytics.overview.filesProcessed} of ${analytics.overview.totalFilesFound}\\n`;\nreport += `- **Success Rate:** ${analytics.overview.successRate}%\\n`;\nreport += `- **Total Processing Time:** ${Math.round(analytics.overview.totalProcessingTime)}s\\n`;\nreport += `- **Credits Used:** ${analytics.overview.totalCreditsUsed}\\n`;\nreport += `- **Average Quality Score:** ${analytics.overview.averageQualityScore}%\\n\\n`;\n\nreport += `## Performance by File Type\\n`;\nObject.entries(analytics.byFileType).forEach(([type, data]) => {\n  if (data.processed > 0) {\n    report += `### ${type.toUpperCase()}\\n`;\n    report += `- Processed: ${data.processed}\\n`;\n    report += `- Success Rate: ${data.successRate}%\\n`;\n    report += `- Avg Time: ${data.avgTime}s\\n`;\n    report += `- Credits: ${data.creditsUsed}\\n\\n`;\n  }\n});\n\nif (Object.keys(analytics.errors).length > 0) {\n  report += `## Errors Encountered\\n`;\n  Object.entries(analytics.errors).forEach(([error, count]) => {\n    report += `- ${error}: ${count} occurrences\\n`;\n  });\n  report += `\\n`;\n}\n\nreport += `## Recommendations\\n`;\nif (analytics.overview.successRate < 90) {\n  report += `- Success rate is below 90%. 
Review error logs for common issues.\\n`;\n}\nif (analytics.overview.averageQualityScore < 70) {\n  report += `- Quality scores are low. Consider using 'always' LLM mode for better results.\\n`;\n}\nif (analytics.quality.lowQuality.length > 0) {\n  report += `- ${analytics.quality.lowQuality.length} files had low quality scores. Manual review recommended.\\n`;\n}\n\nreturn [{\n  json: {\n    analytics,\n    report,\n    processedFiles: allResults,\n    timestamp: new Date().toISOString()\n  }\n}];"
      },
      "typeVersion": 2
    }
  ],
  "connections": {
    "Split Items": {
      "main": [
        [
          {
            "node": "PDF Vector - Process Document/Image",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "List Documents": {
      "main": [
        [
          {
            "node": "Validate & Queue Files",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Manual Trigger": {
      "main": [
        [
          {
            "node": "List Documents",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Split Out Files": {
      "main": [
        [
          {
            "node": "Split Items",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Process in Batches": {
      "main": [
        [
          {
            "node": "Split Out Files",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Generate Analytics Report",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Collect Batch Results": {
      "main": [
        [
          {
            "node": "Generate Analytics Report",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Validate & Queue Files": {
      "main": [
        [
          {
            "node": "Process in Batches",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Track Processing Results": {
      "main": [
        [
          {
            "node": "Collect Batch Results",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "PDF Vector - Process Document/Image": {
      "main": [
        [
          {
            "node": "Track Processing Results",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。

这个工作流适合什么场景?

中级 - 文档提取, AI 摘要总结, 多模态 AI

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。

工作流信息
难度等级
中级
节点数量13
分类3
节点类型9
难度说明

适合有一定经验的用户,包含 6-15 个节点的中等复杂度工作流

作者
PDF Vector

PDF Vector

@pdfvector

A fully featured PDF APIs for developers - Parse any PDF or Word document, extract structured data, and access millions of academic papers - all through simple APIs.

外部链接
在 n8n.io 查看

分享此工作流