8
n8n 中文网 amn8n.com

使用PDF Vector进行OCR、分析和Google Drive的文档处理

中级

这是一个Document Extraction, AI Summarization, Multimodal AI领域的自动化工作流,包含 13 个节点。主要使用 Set, Code, SplitOut, Aggregate, GoogleDrive 等节点。 使用PDF Vector进行OCR、分析和Google Drive的文档处理

前置要求
  • Google Drive API 凭证
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "meta": {
    "instanceId": "placeholder"
  },
  "nodes": [
    {
      "id": "overview-note",
      "name": "分析概览",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        50,
        50
      ],
      "parameters": {
        "color": 5,
        "width": 350,
        "height": 180,
        "content": "## 📊 实时分析"
      },
      "typeVersion": 1
    },
    {
      "id": "metrics-note",
      "name": "跟踪的指标",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        450,
        550
      ],
      "parameters": {
        "width": 260,
        "height": 160,
        "content": "## 📈 关键指标"
      },
      "typeVersion": 1
    },
    {
      "id": "output-note",
      "name": "仪表板输出",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        750,
        550
      ],
      "parameters": {
        "color": 6,
        "width": 250,
        "height": 150,
        "content": "## 📊 可视化"
      },
      "typeVersion": 1
    },
    {
      "id": "manual-trigger",
      "name": "手动触发器",
      "type": "n8n-nodes-base.manualTrigger",
      "notes": "Start batch processing",
      "position": [
        250,
        300
      ],
      "parameters": {},
      "typeVersion": 1
    },
    {
      "id": "list-documents",
      "name": "列出文档",
      "type": "n8n-nodes-base.googleDrive",
      "notes": "Replace FOLDER_ID_HERE with your Google Drive folder ID",
      "position": [
        450,
        300
      ],
      "parameters": {
        "limit": 100,
        "fields": [
          "id",
          "name",
          "mimeType",
          "size",
          "webViewLink",
          "createdTime"
        ],
        "operation": "list",
        "queryString": "'FOLDER_ID_HERE' in parents and trashed=false"
      },
      "typeVersion": 3
    },
    {
      "id": "validate-files",
      "name": "验证并排队文件",
      "type": "n8n-nodes-base.code",
      "notes": "Validate and prioritize files",
      "position": [
        650,
        300
      ],
      "parameters": {
        "jsCode": "// Validate and categorize documents\nconst files = $input.all().map(item => item.json);\nconst processingQueue = {\n  valid: [],\n  invalid: [],\n  stats: {\n    totalFiles: files.length,\n    pdfCount: 0,\n    wordCount: 0,\n    imageCount: 0,\n    otherCount: 0,\n    totalSizeMB: 0\n  }\n};\n\n// Define supported formats\nconst supportedFormats = {\n  pdf: ['application/pdf'],\n  word: [\n    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',\n    'application/msword'\n  ],\n  image: ['image/jpeg', 'image/png', 'image/gif']\n};\n\nfiles.forEach(file => {\n  const mimeType = file.mimeType;\n  const sizeMB = (parseInt(file.size) || 0) / (1024 * 1024);\n  \n  // Check if supported\n  let fileType = 'other';\n  let isValid = false;\n  \n  if (supportedFormats.pdf.includes(mimeType)) {\n    fileType = 'pdf';\n    isValid = true;\n    processingQueue.stats.pdfCount++;\n  } else if (supportedFormats.word.includes(mimeType)) {\n    fileType = 'word';\n    isValid = true;\n    processingQueue.stats.wordCount++;\n  } else if (supportedFormats.image.includes(mimeType)) {\n    fileType = 'image';\n    isValid = true;\n    processingQueue.stats.imageCount++;\n  } else {\n    processingQueue.stats.otherCount++;\n  }\n  \n  // Check file size (max 50MB)\n  if (sizeMB > 50) {\n    isValid = false;\n  }\n  \n  const fileInfo = {\n    ...file,\n    fileType,\n    sizeMB: Math.round(sizeMB * 100) / 100,\n    processingPriority: sizeMB < 5 ? 'high' : sizeMB < 20 ? 'medium' : 'low',\n    estimatedCredits: fileType === 'pdf' ? Math.ceil(sizeMB * 2) : 1\n  };\n  \n  if (isValid) {\n    processingQueue.valid.push(fileInfo);\n  } else {\n    processingQueue.invalid.push({\n      ...fileInfo,\n      reason: sizeMB > 50 ? 
'File too large' : 'Unsupported format'\n    });\n  }\n  \n  processingQueue.stats.totalSizeMB += sizeMB;\n});\n\n// Sort by priority\nprocessingQueue.valid.sort((a, b) => {\n  const priority = { high: 1, medium: 2, low: 3 };\n  return priority[a.processingPriority] - priority[b.processingPriority];\n});\n\nreturn [{\n  json: processingQueue\n}];"
      },
      "typeVersion": 2
    },
    {
      "id": "batch-processor",
      "name": "批量处理",
      "type": "n8n-nodes-base.splitInBatches",
      "notes": "Process 5 files at a time",
      "position": [
        850,
        300
      ],
      "parameters": {
        "options": {},
        "batchSize": 5
      },
      "typeVersion": 3
    },
    {
      "id": "split-files",
      "name": "拆分输出文件",
      "type": "n8n-nodes-base.set",
      "notes": "Prepare individual files",
      "position": [
        1050,
        300
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "assignment1",
              "name": "processingBatch",
              "type": "object",
              "value": "={{ $json }}"
            }
          ]
        }
      },
      "typeVersion": 3
    },
    {
      "id": "split-items",
      "name": "拆分项目",
      "type": "n8n-nodes-base.splitOut",
      "position": [
        1250,
        300
      ],
      "parameters": {
        "options": {},
        "fieldToSplitOut": "processingBatch.valid"
      },
      "typeVersion": 1
    },
    {
      "id": "pdfvector-process",
      "name": "PDF 向量 - 处理文档/图像",
      "type": "n8n-nodes-pdfvector.pdfVector",
      "notes": "Process document or image",
      "position": [
        1450,
        250
      ],
      "parameters": {
        "url": "={{ $json.webViewLink }}",
        "useLLM": "auto",
        "resource": "document",
        "inputType": "url",
        "operation": "parse"
      },
      "typeVersion": 1,
      "continueOnFail": true
    },
    {
      "id": "track-results",
      "name": "跟踪处理结果",
      "type": "n8n-nodes-base.code",
      "notes": "Analyze results",
      "position": [
        1650,
        300
      ],
      "parameters": {
        "jsCode": "// Track processing results\nconst result = $input.first().json;\nconst originalFile = $node['Split Items'].json;\nconst startTime = new Date($node['Split Items'].context.executionTime);\nconst endTime = new Date();\nconst processingTime = (endTime - startTime) / 1000;\n\nconst processedFile = {\n  // Original file info\n  fileName: originalFile.name,\n  fileType: originalFile.fileType,\n  sizeMB: originalFile.sizeMB,\n  \n  // Processing results\n  success: !result.error,\n  processingTime: Math.round(processingTime * 100) / 100,\n  creditsUsed: result.creditsUsed || originalFile.estimatedCredits,\n  \n  // Content info\n  contentLength: result.content?.length || 0,\n  wordCount: result.content?.split(' ').length || 0,\n  \n  // Error tracking\n  error: result.error ? {\n    message: result.error.message || 'Unknown error',\n    code: result.error.code\n  } : null,\n  \n  // Timestamps\n  processedAt: new Date().toISOString()\n};\n\n// Quality checks\nif (processedFile.success) {\n  processedFile.qualityChecks = {\n    hasContent: processedFile.contentLength > 100,\n    reasonableLength: processedFile.wordCount > 10 && processedFile.wordCount < 100000,\n    properEncoding: !result.content?.includes('�'),\n    creditsEfficiency: processedFile.creditsUsed / processedFile.sizeMB < 5\n  };\n  \n  // Overall quality score\n  const checks = Object.values(processedFile.qualityChecks);\n  processedFile.qualityScore = (checks.filter(c => c).length / checks.length) * 100;\n}\n\nreturn [{ json: processedFile }];"
      },
      "typeVersion": 2
    },
    {
      "id": "collect-batch",
      "name": "收集批次结果",
      "type": "n8n-nodes-base.aggregate",
      "notes": "Aggregate batch results",
      "position": [
        1850,
        300
      ],
      "parameters": {
        "options": {},
        "aggregate": "aggregateAllItemData"
      },
      "typeVersion": 1
    },
    {
      "id": "generate-analytics",
      "name": "生成分析报告",
      "type": "n8n-nodes-base.code",
      "notes": "Create analytics dashboard",
      "position": [
        2050,
        300
      ],
      "parameters": {
        "jsCode": "// Generate comprehensive analytics report\nconst allResults = $input.all().map(item => item.json);\nconst initialStats = $node['Validate & Queue Files'].json.stats;\n\n// Calculate processing analytics\nconst analytics = {\n  overview: {\n    totalFilesFound: initialStats.totalFiles,\n    filesProcessed: allResults.length,\n    successfulProcessing: allResults.filter(r => r.success).length,\n    failedProcessing: allResults.filter(r => !r.success).length,\n    successRate: 0,\n    totalProcessingTime: 0,\n    totalCreditsUsed: 0,\n    averageQualityScore: 0\n  },\n  \n  byFileType: {\n    pdf: { processed: 0, successful: 0, failed: 0, avgTime: 0, creditsUsed: 0 },\n    word: { processed: 0, successful: 0, failed: 0, avgTime: 0, creditsUsed: 0 },\n    image: { processed: 0, successful: 0, failed: 0, avgTime: 0, creditsUsed: 0 }\n  },\n  \n  errors: {},\n  \n  performance: {\n    fastestFile: null,\n    slowestFile: null,\n    mostEfficientCredit: null,\n    leastEfficientCredit: null\n  },\n  \n  quality: {\n    highQuality: [],\n    lowQuality: [],\n    averageWordCount: 0\n  }\n};\n\n// Process results\nlet totalQualityScore = 0;\nlet qualityCount = 0;\n\nallResults.forEach(result => {\n  // Update overview\n  analytics.overview.totalProcessingTime += result.processingTime || 0;\n  analytics.overview.totalCreditsUsed += result.creditsUsed || 0;\n  \n  // Update by file type\n  const type = result.fileType;\n  if (analytics.byFileType[type]) {\n    analytics.byFileType[type].processed++;\n    if (result.success) {\n      analytics.byFileType[type].successful++;\n    } else {\n      analytics.byFileType[type].failed++;\n    }\n    analytics.byFileType[type].avgTime += result.processingTime || 0;\n    analytics.byFileType[type].creditsUsed += result.creditsUsed || 0;\n  }\n  \n  // Track errors\n  if (result.error) {\n    const errorType = result.error.message || 'Unknown';\n    analytics.errors[errorType] = (analytics.errors[errorType] || 0) + 
1;\n  }\n  \n  // Track performance\n  if (!analytics.performance.fastestFile || result.processingTime < analytics.performance.fastestFile.time) {\n    analytics.performance.fastestFile = {\n      name: result.fileName,\n      time: result.processingTime\n    };\n  }\n  if (!analytics.performance.slowestFile || result.processingTime > analytics.performance.slowestFile.time) {\n    analytics.performance.slowestFile = {\n      name: result.fileName,\n      time: result.processingTime\n    };\n  }\n  \n  // Track quality\n  if (result.qualityScore !== undefined) {\n    totalQualityScore += result.qualityScore;\n    qualityCount++;\n    \n    if (result.qualityScore >= 75) {\n      analytics.quality.highQuality.push(result.fileName);\n    } else if (result.qualityScore < 50) {\n      analytics.quality.lowQuality.push(result.fileName);\n    }\n  }\n  \n  analytics.quality.averageWordCount += result.wordCount || 0;\n});\n\n// Calculate averages\nanalytics.overview.successRate = Math.round((analytics.overview.successfulProcessing / analytics.overview.filesProcessed) * 100);\nanalytics.overview.averageQualityScore = qualityCount > 0 ? 
Math.round(totalQualityScore / qualityCount) : 0;\nanalytics.quality.averageWordCount = Math.round(analytics.quality.averageWordCount / allResults.length);\n\n// Calculate file type averages\nObject.keys(analytics.byFileType).forEach(type => {\n  const typeData = analytics.byFileType[type];\n  if (typeData.processed > 0) {\n    typeData.avgTime = Math.round((typeData.avgTime / typeData.processed) * 100) / 100;\n    typeData.successRate = Math.round((typeData.successful / typeData.processed) * 100);\n  }\n});\n\n// Generate report\nlet report = `# Batch Processing Analytics Report\\n\\n`;\nreport += `**Generated:** ${new Date().toLocaleString()}\\n\\n`;\n\nreport += `## Overview\\n`;\nreport += `- **Files Processed:** ${analytics.overview.filesProcessed} of ${analytics.overview.totalFilesFound}\\n`;\nreport += `- **Success Rate:** ${analytics.overview.successRate}%\\n`;\nreport += `- **Total Processing Time:** ${Math.round(analytics.overview.totalProcessingTime)}s\\n`;\nreport += `- **Credits Used:** ${analytics.overview.totalCreditsUsed}\\n`;\nreport += `- **Average Quality Score:** ${analytics.overview.averageQualityScore}%\\n\\n`;\n\nreport += `## Performance by File Type\\n`;\nObject.entries(analytics.byFileType).forEach(([type, data]) => {\n  if (data.processed > 0) {\n    report += `### ${type.toUpperCase()}\\n`;\n    report += `- Processed: ${data.processed}\\n`;\n    report += `- Success Rate: ${data.successRate}%\\n`;\n    report += `- Avg Time: ${data.avgTime}s\\n`;\n    report += `- Credits: ${data.creditsUsed}\\n\\n`;\n  }\n});\n\nif (Object.keys(analytics.errors).length > 0) {\n  report += `## Errors Encountered\\n`;\n  Object.entries(analytics.errors).forEach(([error, count]) => {\n    report += `- ${error}: ${count} occurrences\\n`;\n  });\n  report += `\\n`;\n}\n\nreport += `## Recommendations\\n`;\nif (analytics.overview.successRate < 90) {\n  report += `- Success rate is below 90%. 
Review error logs for common issues.\\n`;\n}\nif (analytics.overview.averageQualityScore < 70) {\n  report += `- Quality scores are low. Consider using 'always' LLM mode for better results.\\n`;\n}\nif (analytics.quality.lowQuality.length > 0) {\n  report += `- ${analytics.quality.lowQuality.length} files had low quality scores. Manual review recommended.\\n`;\n}\n\nreturn [{\n  json: {\n    analytics,\n    report,\n    processedFiles: allResults,\n    timestamp: new Date().toISOString()\n  }\n}];"
      },
      "typeVersion": 2
    }
  ],
  "connections": {
    "Split Items": {
      "main": [
        [
          {
            "node": "PDF Vector - Process Document/Image",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "List Documents": {
      "main": [
        [
          {
            "node": "Validate & Queue Files",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Manual Trigger": {
      "main": [
        [
          {
            "node": "List Documents",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Split Out Files": {
      "main": [
        [
          {
            "node": "Split Items",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Process in Batches": {
      "main": [
        [
          {
            "node": "Split Out Files",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Generate Analytics Report",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Collect Batch Results": {
      "main": [
        [
          {
            "node": "Generate Analytics Report",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Validate & Queue Files": {
      "main": [
        [
          {
            "node": "Process in Batches",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Track Processing Results": {
      "main": [
        [
          {
            "node": "Collect Batch Results",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "PDF Vector - Process Document/Image": {
      "main": [
        [
          {
            "node": "Track Processing Results",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。

这个工作流适合什么场景?

中级 - 文档提取, AI 摘要总结, 多模态 AI

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。

工作流信息
难度等级
中级
节点数量13
分类3
节点类型9
难度说明

适合有一定经验的用户,包含 6-15 个节点的中等复杂度工作流

作者
PDF Vector

PDF Vector

@pdfvector

A fully featured PDF APIs for developers - Parse any PDF or Word document, extract structured data, and access millions of academic papers - all through simple APIs.

外部链接
在 n8n.io 查看

分享此工作流