8
n8n 中文网amn8n.com

网站抓取器

高级

这是一个Marketing, IT Ops领域的自动化工作流,包含 24 个节点。主要使用 If, Set, Code, Airtop, GoogleDocs 等节点。 基于Airtop的LLM网站抓取器

前置要求
  • Google Sheets API 凭证
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "id": "3SNBO1RAF0ZIyido",
  "meta": {
    "instanceId": "28a947b92b197fc2524eaba16e57560338657b2b0b5796300b2f1cedc1d0d355",
    "templateCredsSetupCompleted": true
  },
  "name": "网站抓取器",
  "tags": [],
  "nodes": [
    {
      "id": "aaa5bf43-bf76-4433-93c7-7e168f2e140c",
      "name": "表单提交时",
      "type": "n8n-nodes-base.formTrigger",
      "position": [
        -400,
        -200
      ],
      "webhookId": "6c508326-84d3-4155-9f2e-fe7ddb50f14a",
      "parameters": {
        "options": {},
        "formTitle": "Website scraper",
        "formFields": {
          "values": [
            {
              "fieldLabel": "Seed url",
              "requiredField": true
            },
            {
              "fieldLabel": "Links must contain",
              "requiredField": true
            },
            {
              "fieldType": "number",
              "fieldLabel": "Depth",
              "requiredField": true
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "5f862a6f-8079-4d24-a2ae-7442c45c8f04",
      "name": "要上传到电子表格的信息",
      "type": "n8n-nodes-base.set",
      "position": [
        260,
        -100
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "5970bab1-0bfb-4212-a8f2-df6b2e003800",
              "name": "URL",
              "type": "string",
              "value": "={{ $('Unify params').item.json[\"Seed url\"] }}"
            },
            {
              "id": "5c941ad3-70bf-4fba-8cf3-1f79c1b20d3a",
              "name": "Scraped",
              "type": "string",
              "value": ""
            }
          ]
        }
      },
      "typeVersion": 3.4
    },
    {
      "id": "556e5da4-4454-46cd-b17d-bd8695515670",
      "name": "将信息加载到电子表格",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        480,
        -100
      ],
      "parameters": {
        "columns": {
          "value": {},
          "schema": [
            {
              "id": "URL",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "URL",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            }
          ],
          "mappingMode": "autoMapInputData",
          "matchingColumns": [
            "URL"
          ],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {},
        "operation": "append",
        "sheetName": {
          "__rl": true,
          "mode": "name",
          "value": "={{ $('Create Spreadsheet').item.json.sheets[0].properties.title }}"
        },
        "documentId": {
          "__rl": true,
          "mode": "id",
          "value": "={{ $('Create Spreadsheet').item.json.spreadsheetId }}"
        }
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "DdxyhUJLGcclD2YO",
          "name": "Google sheets Andres"
        }
      },
      "typeVersion": 4.5
    },
    {
      "id": "4592ada8-8f21-4117-a0ee-8906922f5685",
      "name": "抓取网页",
      "type": "n8n-nodes-base.airtop",
      "position": [
        700,
        -100
      ],
      "parameters": {
        "url": "={{ $json.URL }}",
        "resource": "extraction",
        "operation": "scrape",
        "profileName": "=",
        "sessionMode": "new"
      },
      "credentials": {
        "airtopApi": {
          "id": "Yi4YPNnovLVUjFn5",
          "name": "Airtop Official Org"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "74a1399d-a335-455c-a3dd-167624b4a5f2",
      "name": "创建 Google Docs",
      "type": "n8n-nodes-base.googleDocs",
      "position": [
        920,
        -100
      ],
      "parameters": {
        "title": "=Site to File - {{ $('Unify params').item.json[\"Seed url\"] }}",
        "folderId": "default"
      },
      "credentials": {
        "googleDocsOAuth2Api": {
          "id": "pDMGILToKi9P4taJ",
          "name": "Andres Google Docs"
        }
      },
      "typeVersion": 2
    },
    {
      "id": "1e69e40f-0465-430b-85ec-c6a71e1cb4a4",
      "name": "写入抓取的内容",
      "type": "n8n-nodes-base.googleDocs",
      "position": [
        1140,
        -100
      ],
      "parameters": {
        "actionsUi": {
          "actionFields": [
            {
              "text": "=The entire content from {{ $('Info to upload into spreadsheet').item.json.URL }} up to {{ $('Unify params').item.json.Depth }} levels deep.\n---------------------------------------------  \n{{ $('Scrape webpage').item.json.data.modelResponse.scrapedContent.text }}\n---------------------------------------------",
              "action": "insert"
            }
          ]
        },
        "operation": "update",
        "documentURL": "={{ $json.id }}"
      },
      "credentials": {
        "googleDocsOAuth2Api": {
          "id": "pDMGILToKi9P4taJ",
          "name": "Andres Google Docs"
        }
      },
      "typeVersion": 2
    },
    {
      "id": "49e9b38c-c2d4-49f3-bf3f-531b10257db4",
      "name": "是否抓取更多?",
      "type": "n8n-nodes-base.if",
      "position": [
        1380,
        -100
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "and",
          "conditions": [
            {
              "id": "707d2bae-e834-478b-a4f6-9e6bbfa47530",
              "operator": {
                "type": "number",
                "operation": "lt"
              },
              "leftValue": "={{ $runIndex }}",
              "rightValue": "={{ $('Unify params').first().json.Depth - 1 }}"
            },
            {
              "id": "cc763610-f775-4dcb-ae29-a3235b011b75",
              "operator": {
                "type": "number",
                "operation": "gt"
              },
              "leftValue": "={{ Number($('Unify params').first().json.Depth) }}",
              "rightValue": 1
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "239b2a02-bd66-4805-8ad3-b4ef6daa5e60",
      "name": "读取抓取的网页",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        1580,
        -175
      ],
      "parameters": {
        "options": {},
        "sheetName": {
          "__rl": true,
          "mode": "name",
          "value": "={{ $('Create Spreadsheet').first().json.sheets[0].properties.title }}"
        },
        "documentId": {
          "__rl": true,
          "mode": "id",
          "value": "={{ $('Create Spreadsheet').first().json.spreadsheetId }}"
        }
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "DdxyhUJLGcclD2YO",
          "name": "Google sheets Andres"
        }
      },
      "typeVersion": 4.5
    },
    {
      "id": "029d7de1-9043-424e-9c22-aed436436e6a",
      "name": "检索要抓取的链接",
      "type": "n8n-nodes-base.airtop",
      "position": [
        1800,
        -175
      ],
      "parameters": {
        "url": "={{ $json.URL }}",
        "prompt": "Extract the list of links that lead to other pages in the same domain",
        "resource": "extraction",
        "operation": "query",
        "sessionMode": "new",
        "additionalFields": {
          "outputSchema": "{\n  \"type\": \"object\",\n  \"properties\": {\n    \"internal_links\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"string\",\n        \"description\": \"A URL that leads to another page within the airtop.ai domain\"\n      },\n      \"description\": \"A list of URLs that lead to other pages within the airtop.ai domain\"\n    }\n  },\n  \"required\": [\n    \"internal_links\"\n  ],\n  \"additionalProperties\": false,\n  \"$schema\": \"http://json-schema.org/draft-07/schema#\"\n}"
        }
      },
      "credentials": {
        "airtopApi": {
          "id": "Yi4YPNnovLVUjFn5",
          "name": "Airtop Official Org"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "57f2a3ee-40c9-4e72-99f4-739fff04667a",
      "name": "筛选要插入到 Sheets 的链接",
      "type": "n8n-nodes-base.code",
      "position": [
        2020,
        -180
      ],
      "parameters": {
        "jsCode": "const modelResponse = $('Retrieve links to scrape').first().json.data.modelResponse;\nconst containsString = $('Unify params').first().json[\"Links must contain\"];\nconst parsed = JSON.parse(modelResponse);\nconst links = [...new Set(parsed.internal_links)]; \nconst sheetsLinks = $input.all().map(item => item.json.URL)\nlet response; \n\nif(containsString === \"\"){\n  response = links\n    .map(item =>(\n        { json: { link: item.split('?')[0] } }\n    ))\n} else {\n  response = links\n    .map(item =>\n      item.includes(containsString)\n        ? { json: { link: item.split('?')[0] } }\n        : null\n    )\n    .filter(item => item !== null);\n}\nconst dedupeBetweenSheetsAndModel = response.filter(item => !sheetsLinks.includes(item.json.link));\nconst deduped = [...new Map(dedupeBetweenSheetsAndModel.map(item => [item.json.link, item])).values()]\nreturn deduped\n"
      },
      "typeVersion": 2
    },
    {
      "id": "6a0f20e8-df45-4799-8162-21d427e19e49",
      "name": "插入新链接",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        2240,
        -175
      ],
      "parameters": {
        "columns": {
          "value": {
            "URL": "={{ $json.link }}"
          },
          "schema": [
            {
              "id": "URL",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "URL",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Scraped",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "Scraped",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            }
          ],
          "mappingMode": "defineBelow",
          "matchingColumns": [],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {},
        "operation": "append",
        "sheetName": {
          "__rl": true,
          "mode": "name",
          "value": "={{ $('Create Spreadsheet').first().json.sheets[0].properties.title }}"
        },
        "documentId": {
          "__rl": true,
          "mode": "id",
          "value": "={{ $('Create Spreadsheet').first().json.spreadsheetId }}"
        }
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "DdxyhUJLGcclD2YO",
          "name": "Google sheets Andres"
        }
      },
      "typeVersion": 4.5
    },
    {
      "id": "9433ab9e-b562-4aa3-a311-ae2a355ce774",
      "name": "抓取网页1",
      "type": "n8n-nodes-base.airtop",
      "position": [
        2460,
        -175
      ],
      "parameters": {
        "url": "={{ $json.URL }}",
        "resource": "extraction",
        "operation": "scrape",
        "sessionMode": "new"
      },
      "credentials": {
        "airtopApi": {
          "id": "Yi4YPNnovLVUjFn5",
          "name": "Airtop Official Org"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "704c25cf-b690-492d-a759-7b24a870edf4",
      "name": "用新抓取的内容更新",
      "type": "n8n-nodes-base.googleDocs",
      "position": [
        2680,
        -175
      ],
      "parameters": {
        "actionsUi": {
          "actionFields": [
            {
              "text": "=-------------------------  \n{{ $json.data.modelResponse.scrapedContent.text }}\n-------------------------",
              "action": "insert"
            }
          ]
        },
        "operation": "update",
        "documentURL": "={{ $('Create Google Docs').first().json.id }}"
      },
      "credentials": {
        "googleDocsOAuth2Api": {
          "id": "pDMGILToKi9P4taJ",
          "name": "Andres Google Docs"
        }
      },
      "typeVersion": 2
    },
    {
      "id": "14f1465d-3d7d-4b7a-87d2-2552b9514e37",
      "name": "标记已抓取的链接",
      "type": "n8n-nodes-base.set",
      "position": [
        2900,
        -175
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "72c9cd92-fd01-4339-b9fe-52477b691df3",
              "name": "URL",
              "type": "string",
              "value": "={{ $('Insert new links').item.json.URL }}"
            },
            {
              "id": "a817116e-6ae5-4e0a-b210-4fd27f5a455a",
              "name": "Scraped",
              "type": "string",
              "value": "={{ $runIndex }}"
            }
          ]
        }
      },
      "typeVersion": 3.4
    },
    {
      "id": "364513e7-39c0-47af-83bd-475ffb0ae2a0",
      "name": "插入标记",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        3120,
        -100
      ],
      "parameters": {
        "columns": {
          "value": {
            "URL": "={{ $('Insert new links').item.json.URL }}"
          },
          "schema": [
            {
              "id": "URL",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "URL",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Scraped",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "Scraped",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "row_number",
              "type": "string",
              "display": true,
              "removed": false,
              "readOnly": true,
              "required": false,
              "displayName": "row_number",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            }
          ],
          "mappingMode": "autoMapInputData",
          "matchingColumns": [
            "URL"
          ],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {},
        "operation": "update",
        "sheetName": {
          "__rl": true,
          "mode": "name",
          "value": "={{ $('Create Spreadsheet').first().json.sheets[0].properties.title }}"
        },
        "documentId": {
          "__rl": true,
          "mode": "id",
          "value": "={{ $('Create Spreadsheet').first().json.spreadsheetId }}"
        }
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "DdxyhUJLGcclD2YO",
          "name": "Google sheets Andres"
        }
      },
      "typeVersion": 4.5
    },
    {
      "id": "bd799398-e6c6-4cd4-a8a9-d189acabb194",
      "name": "便签",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -20,
        -220
      ],
      "parameters": {
        "width": 660,
        "height": 280,
        "content": "## 创建电子表格"
      },
      "typeVersion": 1
    },
    {
      "id": "7a5aeddd-9c0f-4ea2-8452-35dd14e6963a",
      "name": "创建电子表格",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        40,
        -100
      ],
      "parameters": {
        "title": "=Site map -  {{ $json[\"Seed url\"] }} (Depth - {{ $json.Depth }})",
        "options": {},
        "resource": "spreadsheet"
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "DdxyhUJLGcclD2YO",
          "name": "Google sheets Andres"
        }
      },
      "typeVersion": 4.5
    },
    {
      "id": "19a6c42b-8be5-4599-bc60-99f0b09e3623",
      "name": "便签1",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        660,
        -220
      ],
      "parameters": {
        "color": 4,
        "width": 180,
        "height": 280,
        "content": "## 抓取网页"
      },
      "typeVersion": 1
    },
    {
      "id": "5de141f9-9e1e-4159-8611-06d23ee7b476",
      "name": "便签2",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        860,
        -220
      ],
      "parameters": {
        "width": 440,
        "height": 280,
        "content": "## 创建文档"
      },
      "typeVersion": 1
    },
    {
      "id": "a9108190-01d9-4c9a-a7b0-9ea582026acb",
      "name": "便签3",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        1320,
        -280
      ],
      "parameters": {
        "width": 1980,
        "height": 380,
        "content": "## 递归抓取过程"
      },
      "typeVersion": 1
    },
    {
      "id": "bec8b412-51e1-45df-95b5-ae4e4aeb1fc2",
      "name": "便签4",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1160,
        -540
      ],
      "parameters": {
        "color": 7,
        "width": 660,
        "height": 940,
        "content": "README"
      },
      "typeVersion": 1
    },
    {
      "id": "ef3cfdbf-bd61-4452-bad0-d0154bbd893b",
      "name": "当被另一个工作流执行时",
      "type": "n8n-nodes-base.executeWorkflowTrigger",
      "position": [
        -400,
        0
      ],
      "parameters": {
        "workflowInputs": {
          "values": [
            {
              "name": "Seed url"
            },
            {
              "name": "Links must contain"
            },
            {
              "name": "Depth",
              "type": "number"
            }
          ]
        }
      },
      "typeVersion": 1.1
    },
    {
      "id": "16d54958-18b6-497f-af9e-5953a39ae0bb",
      "name": "便签5",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -440,
        -300
      ],
      "parameters": {
        "width": 400,
        "height": 460,
        "content": "## 输入参数"
      },
      "typeVersion": 1
    },
    {
      "id": "99cc34ec-37eb-423a-9cc5-7c1b7736d352",
      "name": "统一参数",
      "type": "n8n-nodes-base.set",
      "position": [
        -180,
        -100
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "18e3200c-ab2a-44da-b401-eace47a3ccc0",
              "name": "Seed url",
              "type": "string",
              "value": "={{ $json[\"Seed url\"] }}"
            },
            {
              "id": "cb21aa86-bf75-427f-bf07-70d4f2d83894",
              "name": "Links must contain",
              "type": "string",
              "value": "={{ $json[\"Links must contain\"] }}"
            },
            {
              "id": "80bb934f-816d-43f0-9432-170047fa02a3",
              "name": "Depth",
              "type": "number",
              "value": "={{ $json.Depth }}"
            }
          ]
        }
      },
      "typeVersion": 3.4
    }
  ],
  "active": false,
  "pinData": {},
  "settings": {
    "executionOrder": "v1"
  },
  "versionId": "dffadb4b-7c72-42ec-802c-26d4f96b5ec7",
  "connections": {
    "Insert flag": {
      "main": [
        [
          {
            "node": "Should scrape more?",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Unify params": {
      "main": [
        [
          {
            "node": "Create Spreadsheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Scrape webpage": {
      "main": [
        [
          {
            "node": "Create Google Docs",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Scrape webpage1": {
      "main": [
        [
          {
            "node": "Update with new scraped content",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Insert new links": {
      "main": [
        [
          {
            "node": "Scrape webpage1",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Flag scraped link": {
      "main": [
        [
          {
            "node": "Insert flag",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Create Google Docs": {
      "main": [
        [
          {
            "node": "Write scraped content",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Create Spreadsheet": {
      "main": [
        [
          {
            "node": "Info to upload into spreadsheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "On form submission": {
      "main": [
        [
          {
            "node": "Unify params",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Should scrape more?": {
      "main": [
        [
          {
            "node": "Read scraped webpages",
            "type": "main",
            "index": 0
          }
        ],
        []
      ]
    },
    "Read scraped webpages": {
      "main": [
        [
          {
            "node": "Retrieve links to scrape",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Write scraped content": {
      "main": [
        [
          {
            "node": "Should scrape more?",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Load info to spreadsheet": {
      "main": [
        [
          {
            "node": "Scrape webpage",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Retrieve links to scrape": {
      "main": [
        [
          {
            "node": "Filter links to insert to Sheets",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Info to upload into spreadsheet": {
      "main": [
        [
          {
            "node": "Load info to spreadsheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Update with new scraped content": {
      "main": [
        [
          {
            "node": "Flag scraped link",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Filter links to insert to Sheets": {
      "main": [
        [
          {
            "node": "Insert new links",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "When Executed by Another Workflow": {
      "main": [
        [
          {
            "node": "Unify params",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。

这个工作流适合什么场景?

高级 - 营销, IT 运维

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。

工作流信息
难度等级
高级
节点数量24
分类2
节点类型9
难度说明

适合高级用户,包含 16+ 个节点的复杂工作流

作者

Airtop provides an intelligent browser automation API for AI agents, enabling seamless web interaction, including login, navigation, and data extraction from any site, even those with complex authentication - all with natural language instructions.In simple terms, we allow you to automate anything humans can do online, on any site with just words

外部链接
在 n8n.io 查看

分享此工作流