8
n8n 中文网 amn8n.com

使用 GPT-4、PDFVector 和 PostgreSQL 导出功能从文档中提取数据

中级

这是一个文档提取 (Document Extraction) 与多模态 AI (Multimodal AI) 领域的自动化工作流,包含 9 个节点,主要使用 Code、OpenAi、Switch、Postgres、PdfVector 等节点,可借助 GPT-4、PDFVector 和 PostgreSQL 导出功能从文档中提取数据。

前置要求
  • OpenAI API Key
  • PostgreSQL 数据库连接信息
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "meta": {
    "instanceId": "placeholder"
  },
  "nodes": [
    {
      "id": "workflow-info",
      "name": "流水线信息",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        250,
        150
      ],
      "parameters": {
        "content": "## 文档提取流水线"
      },
      "typeVersion": 1
    },
    {
      "id": "file-trigger",
      "name": "监控文件夹",
      "type": "n8n-nodes-base.localFileTrigger",
      "notes": "Triggers when new documents arrive",
      "position": [
        450,
        300
      ],
      "parameters": {
        "path": "/documents/incoming",
        "events": [
          "file:created"
        ]
      },
      "typeVersion": 1
    },
    {
      "id": "pdfvector-parse",
      "name": "PDF 向量 - 解析文档",
      "type": "n8n-nodes-pdfvector.pdfVector",
      "notes": "Parse with LLM for better extraction",
      "position": [
        650,
        300
      ],
      "parameters": {
        "useLlm": "always",
        "resource": "document",
        "operation": "parse",
        "documentUrl": "={{ $json.filePath }}"
      },
      "typeVersion": 1
    },
    {
      "id": "extract-data",
      "name": "提取结构化数据",
      "type": "n8n-nodes-base.openAi",
      "position": [
        850,
        300
      ],
      "parameters": {
        "model": "gpt-4",
        "options": {
          "responseFormat": {
            "type": "json_object"
          }
        },
        "messages": {
          "values": [
            {
              "content": "=Extract the following information from this document:\n\n1. Document Type (invoice, contract, report, etc.)\n2. Date/Dates mentioned\n3. Parties involved (names, companies)\n4. Key amounts/values\n5. Important terms or conditions\n6. Reference numbers\n7. Addresses\n8. Contact information\n\nDocument content:\n{{ $json.content }}\n\nReturn as structured JSON with the keys documentType (string), date (string), amounts (array of strings) and emails (array of strings), plus any other extracted fields."
            }
          ]
        }
      },
      "typeVersion": 1
    },
    {
      "id": "validate-data",
      "name": "验证与清理数据",
      "type": "n8n-nodes-base.code",
      "position": [
        1050,
        300
      ],
      "parameters": {
        "functionCode": "// Validate and clean extracted data\nconst extracted = JSON.parse($json.content);\nconst validated = {};\n\n// Validate document type\nvalidated.documentType = extracted.documentType || 'unknown';\n\n// Parse and validate dates\nif (extracted.date) {\n  const date = new Date(extracted.date);\n  validated.date = isNaN(date) ? null : date.toISOString();\n}\n\n// Clean monetary values\nif (extracted.amounts) {\n  validated.amounts = extracted.amounts.map(amt => {\n    const cleaned = amt.replace(/[^0-9.-]/g, '');\n    return parseFloat(cleaned) || 0;\n  });\n}\n\n// Validate email addresses\nif (extracted.emails) {\n  validated.emails = extracted.emails.filter(email => \n    /^[^\\s@]+@[^\\s@]+\\.[^\\s@]+$/.test(email)\n  );\n}\n\nvalidated.raw = extracted;\nvalidated.fileName = $node['监控文件夹'].json.fileName;\nvalidated.processedAt = new Date().toISOString();\n\nreturn validated;"
      },
      "typeVersion": 1
    },
    {
      "id": "route-by-type",
      "name": "按文档类型路由",
      "type": "n8n-nodes-base.switch",
      "position": [
        1250,
        300
      ],
      "parameters": {
        "conditions": {
          "string": [
            {
              "value1": "={{ $json.documentType }}",
              "value2": "invoice",
              "operation": "equals"
            }
          ]
        }
      },
      "typeVersion": 1
    },
    {
      "id": "store-invoice",
      "name": "存储发票数据",
      "type": "n8n-nodes-base.postgres",
      "position": [
        1450,
        250
      ],
      "parameters": {
        "table": "invoices",
        "columns": "invoice_number,vendor,amount,date,raw_data",
        "operation": "insert"
      },
      "typeVersion": 1
    },
    {
      "id": "store-other",
      "name": "存储其他文档",
      "type": "n8n-nodes-base.postgres",
      "position": [
        1450,
        350
      ],
      "parameters": {
        "table": "documents",
        "columns": "type,content,metadata,processed_at",
        "operation": "insert"
      },
      "typeVersion": 1
    },
    {
      "id": "export-csv",
      "name": "导出到 CSV",
      "type": "n8n-nodes-base.writeBinaryFile",
      "position": [
        1650,
        300
      ],
      "parameters": {
        "fileName": "=extracted_data_{{ $now.format('yyyy-MM-dd') }}.csv",
        "fileContent": "={{ $items().map(item => item.json).toCsv() }}"
      },
      "typeVersion": 1
    }
  ],
  "connections": {
    "监控文件夹": {
      "main": [
        [
          {
            "node": "PDF 向量 - 解析文档",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "存储发票数据": {
      "main": [
        [
          {
            "node": "导出到 CSV",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "存储其他文档": {
      "main": [
        [
          {
            "node": "导出到 CSV",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "验证与清理数据": {
      "main": [
        [
          {
            "node": "按文档类型路由",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "按文档类型路由": {
      "main": [
        [
          {
            "node": "存储发票数据",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "存储其他文档",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "提取结构化数据": {
      "main": [
        [
          {
            "node": "验证与清理数据",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "PDF 向量 - 解析文档": {
      "main": [
        [
          {
            "node": "提取结构化数据",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。

这个工作流适合什么场景?

适合文档提取、多模态 AI 相关场景,难度为中级。

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。

工作流信息
难度等级
中级
节点数量: 9
分类: 2
节点类型: 8
难度说明

适合有一定经验的用户,包含 6-15 个节点的中等复杂度工作流

作者
PDF Vector

PDF Vector

@pdfvector

A fully featured set of PDF APIs for developers - parse any PDF or Word document, extract structured data, and access millions of academic papers - all through simple APIs.

外部链接
在 n8n.io 查看

分享此工作流