8
n8n 中文网amn8n.com

研究论文爬虫到Google Sheets

中级

这是一个AI领域的自动化工作流,包含 12 个节点(其中 6 个为便签说明节点)。主要使用 Set、Code、Html、HttpRequest、GoogleSheets 等节点:通过 Bright Data 抓取 Google Scholar 搜索结果,用 n8n 自动提取论文的标题、作者、摘要和 PDF 链接,并保存到 Google Sheets。

前置要求
  • 可能需要目标 API 的认证凭证
  • Google Sheets API 凭证
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "id": "giq3zqaP4QbY6LgC",
  "meta": {
    "instanceId": "60046904b104f0f72b2629a9d88fe9f676be4035769f1f08dad1dd38a76b9480"
  },
  "name": "研究论文爬虫到Google Sheets",
  "tags": [],
  "nodes": [
    {
      "id": "7d81edf3-6f00-4634-b79f-dbda3f9958e5",
      "name": "开始爬取(手动触发器)",
      "type": "n8n-nodes-base.manualTrigger",
      "position": [
        -1080,
        580
      ],
      "parameters": {},
      "typeVersion": 1
    },
    {
      "id": "6e172db5-7483-4079-bf8a-785602526bdc",
      "name": "设置研究主题",
      "type": "n8n-nodes-base.set",
      "position": [
        -860,
        580
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "b530a847-0bb2-4039-9ad0-cbc9cc4d909e",
              "name": "Topic",
              "type": "string",
              "value": "machine+learning"
            }
          ]
        }
      },
      "typeVersion": 3.4
    },
    {
      "id": "e65d092a-6854-478c-b33e-2fc309f71ae8",
      "name": "发送请求到Bright Data API",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        -600,
        580
      ],
      "parameters": {
        "url": "https://api.brightdata.com/request",
        "method": "POST",
        "options": {},
        "sendBody": true,
        "sendHeaders": true,
        "bodyParameters": {
          "parameters": [
            {
              "name": "zone",
              "value": "n8n_unblocker"
            },
            {
              "name": "url",
              "value": "=https://scholar.google.com/scholar?q={{ $json.Topic }}"
            },
            {
              "name": "country",
              "value": "us"
            },
            {
              "name": "format",
              "value": "raw"
            }
          ]
        },
        "headerParameters": {
          "parameters": [
            {
              "name": "Authorization",
              "value": "Bearer 40127ac3c2b4861572c8ad4c6d2273a0ce0472cb3ea7d3ac85a74a34629067aa"
            }
          ]
        }
      },
      "typeVersion": 4.2
    },
    {
      "id": "211bae33-32c5-44e8-b306-a5e0d520a4a0",
      "name": "从HTML提取数据(标题、作者等)",
      "type": "n8n-nodes-base.html",
      "position": [
        -400,
        580
      ],
      "parameters": {
        "options": {},
        "operation": "extractHtmlContent",
        "extractionValues": {
          "values": [
            {
              "key": "Title",
              "cssSelector": "h3.gs_rt, a.gs_rt",
              "returnArray": true
            },
            {
              "key": "Author",
              "cssSelector": "div.gs_a",
              "returnArray": true
            },
            {
              "key": "Abstract",
              "cssSelector": "div.gs_rs",
              "returnArray": true
            },
            {
              "key": "PDF Link\t",
              "cssSelector": "a[href*='pdf']",
              "returnArray": true,
              "returnValue": "attribute"
            }
          ]
        }
      },
      "typeVersion": 1.2
    },
    {
      "id": "9ab7ba20-8614-46c5-b57a-3749d6ae04c4",
      "name": "清理和结构化提取的数据",
      "type": "n8n-nodes-base.code",
      "position": [
        -200,
        580
      ],
      "parameters": {
        "jsCode": "const titles = items[0].json.Title || [];\nconst authors = items[0].json.Author || [];\nconst abstracts = items[0].json.Abstract || [];\nconst pdfLinks = items[0].json[\"PDF Link\\t\"] || [];\n\nconst output = [];\n\nfor (let i = 0; i < titles.length; i++) {\n  // Clean title (remove tags like [PDF][B])\n  let title = titles[i].replace(/\\[.*?\\]/g, '').trim();\n\n  // Clean author (remove any trailing dashes or HTML leftovers)\n  let author = authors[i] ? authors[i].replace(/\\s*-\\s*.*/, '').trim() : '';\n\n  // Abstract fallback\n  let abstract = abstracts[i] || '';\n\n  // Get PDF link — from either a single object or array of duplicates\n  let linkObj = pdfLinks[i];\n  let pdfLink = '';\n\n  if (Array.isArray(linkObj)) {\n    // If multiple objects per item\n    pdfLink = linkObj.find(obj => obj.href)?.href || '';\n  } else if (linkObj?.href) {\n    pdfLink = linkObj.href;\n  }\n\n  // Push cleaned object\n  output.push({\n    json: {\n      title,\n      author,\n      abstract,\n      pdfLink\n    }\n  });\n}\n\nreturn output;\n"
      },
      "typeVersion": 2
    },
    {
      "id": "a246f20c-2bb9-4319-8812-e296c87a7df0",
      "name": "保存结果到Google Sheet",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        120,
        580
      ],
      "parameters": {
        "columns": {
          "value": {
            "Topic": "={{ $('Set Research topic').item.json.Topic }}",
            "title": "={{ $json.title }}",
            "author": "={{ $json.author }}",
            "abstract": "={{ $json.abstract }}",
            "pdf link": "={{ $json.pdfLink }}"
          },
          "schema": [
            {
              "id": "Topic",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "Topic",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "title",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "title",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "author",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "author",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "abstract",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "abstract",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "pdf link",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "pdf link",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            }
          ],
          "mappingMode": "defineBelow",
          "matchingColumns": [],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {},
        "operation": "append",
        "sheetName": {
          "__rl": true,
          "mode": "list",
          "value": "gid=0",
          "cachedResultUrl": "https://docs.google.com/spreadsheets/d/1sOfCFsvHS9-BeE_PQ6J_jtQofCRcOv02XS7hrmFmpxQ/edit#gid=0",
          "cachedResultName": "Sheet1"
        },
        "documentId": {
          "__rl": true,
          "mode": "list",
          "value": "1sOfCFsvHS9-BeE_PQ6J_jtQofCRcOv02XS7hrmFmpxQ",
          "cachedResultUrl": "https://docs.google.com/spreadsheets/d/1sOfCFsvHS9-BeE_PQ6J_jtQofCRcOv02XS7hrmFmpxQ/edit?usp=drivesdk",
          "cachedResultName": "Research papers from Google Scholar"
        }
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "r2mDaisH6e9VkwHl",
          "name": "Google Sheets account"
        }
      },
      "typeVersion": 4.6
    },
    {
      "id": "1b4a1504-4a4a-4a0d-892b-d0c3e205ed85",
      "name": "便签",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1140,
        60
      ],
      "parameters": {
        "color": 5,
        "width": 420,
        "height": 720,
        "content": "## 🔹 **第一部分:用户输入与触发器**"
      },
      "typeVersion": 1
    },
    {
      "id": "bc56f528-6d18-4e05-942f-c06bb6e10b27",
      "name": "便签1",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -660,
        80
      ],
      "parameters": {
        "color": 6,
        "width": 600,
        "height": 700,
        "content": "## 🔸 **第二部分:爬取与解析网站**"
      },
      "typeVersion": 1
    },
    {
      "id": "2c54e5e6-011a-4562-98ac-9cc9834bc284",
      "name": "便签2",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        0,
        0
      ],
      "parameters": {
        "color": 3,
        "width": 340,
        "height": 780,
        "content": "## 🟢 **第三部分:保存到Google Sheets**"
      },
      "typeVersion": 1
    },
    {
      "id": "4ce90703-961e-4070-9356-c9dffc23a6c5",
      "name": "便签 9",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -2980,
        80
      ],
      "parameters": {
        "color": 4,
        "width": 1300,
        "height": 320,
        "content": "======================================="
      },
      "typeVersion": 1
    },
    {
      "id": "069ddb89-f7a1-4c4b-b65d-212be3252750",
      "name": "便签说明4",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -2980,
        420
      ],
      "parameters": {
        "color": 4,
        "width": 1289,
        "height": 1878,
        "content": "## 🌟 研究论文爬虫到Google Sheets"
      },
      "typeVersion": 1
    },
    {
      "id": "a1a5e609-756a-4757-a026-1349cf388e61",
      "name": "便签说明5",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        400,
        0
      ],
      "parameters": {
        "color": 7,
        "width": 380,
        "height": 240,
        "content": "## 如果您通过此链接加入 Bright Data,我将获得少量佣金 — 感谢您支持更多免费内容!"
      },
      "typeVersion": 1
    }
  ],
  "active": false,
  "pinData": {},
  "settings": {
    "executionOrder": "v1"
  },
  "versionId": "f931202a-3c22-495d-b775-71665bdf6c27",
  "connections": {
    "Set Research topic": {
      "main": [
        [
          {
            "node": "Send Request to Bright Data API",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Send Request to Bright Data API": {
      "main": [
        [
          {
            "node": "Extract Data from HTML (Title, Author, etc.)",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Start Scraping (Manual Trigger)": {
      "main": [
        [
          {
            "node": "Set Research topic",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Clean & Structure Extracted Data": {
      "main": [
        [
          {
            "node": "Save Results to Google Sheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract Data from HTML (Title, Author, etc.)": {
      "main": [
        [
          {
            "node": "Clean & Structure Extracted Data",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。

这个工作流适合什么场景?

适合需要按主题从 Google Scholar 批量收集研究论文信息(标题、作者、摘要、PDF 链接)并整理到 Google Sheets 的场景。难度:中级;分类:人工智能。

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 Bright Data API)可能需要您自行付费。

工作流信息
难度等级
中级
节点数量:12
分类:1
节点类型:7
难度说明

适合有一定经验的用户,包含 6-15 个节点的中等复杂度工作流

作者
Yaron Been

Yaron Been

@yaron-nofluff

Building AI Agents and Automations | Growth Marketer | Entrepreneur | Book Author & Podcast Host If you need any help with Automations, feel free to reach out via linkedin: https://www.linkedin.com/in/yaronbeen/ And check out my Youtube channel: https://www.youtube.com/@YaronBeen/videos

外部链接
在 n8n.io 查看

分享此工作流