Research Paper Scraper to Google Sheets
This is an automation workflow in the AI domain containing 12 nodes. It primarily uses Set, Code, HTML, HTTP Request, and Google Sheets nodes, combining Bright Data and n8n to automate the collection of research papers from Google Scholar.
- May require authentication credentials for the target API (Bright Data)
- Google Sheets API credentials
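For reference, the call the workflow's HTTP Request node makes can be reproduced outside n8n. Below is a minimal sketch in Node.js (18+, for the global `fetch`), assuming you supply your own Bright Data API token via an environment variable; the endpoint, zone name, and body fields mirror the node parameters in the JSON below.

```javascript
// Minimal sketch of the Bright Data call made by the workflow's HTTP Request node.
// Assumes Node.js 18+ (global fetch) and a BRIGHT_DATA_TOKEN environment variable;
// the zone name "n8n_unblocker" mirrors the workflow configuration.
const BRIGHT_DATA_TOKEN = process.env.BRIGHT_DATA_TOKEN;

async function fetchScholarHtml(topic) {
  const response = await fetch('https://api.brightdata.com/request', {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${BRIGHT_DATA_TOKEN}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      zone: 'n8n_unblocker',
      url: `https://scholar.google.com/scholar?q=${encodeURIComponent(topic)}`,
      country: 'us',
      format: 'raw', // return the raw HTML of the target page
    }),
  });
  if (!response.ok) {
    throw new Error(`Bright Data request failed: ${response.status}`);
  }
  return response.text();
}

// Usage: fetchScholarHtml('machine learning').then(html => console.log(html.length));
```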
Nodes used (12)
Category
{
"id": "giq3zqaP4QbY6LgC",
"meta": {
"instanceId": "60046904b104f0f72b2629a9d88fe9f676be4035769f1f08dad1dd38a76b9480"
},
"name": "Research_Paper_Scraper_to_Google_Sheets",
"tags": [],
"nodes": [
{
"id": "7d81edf3-6f00-4634-b79f-dbda3f9958e5",
"name": "Démarrer le scraping (Déclencheur Manuel)",
"type": "n8n-nodes-base.manualTrigger",
"position": [
-1080,
580
],
"parameters": {},
"typeVersion": 1
},
{
"id": "6e172db5-7483-4079-bf8a-785602526bdc",
"name": "Définir le sujet de recherche",
"type": "n8n-nodes-base.set",
"position": [
-860,
580
],
"parameters": {
"options": {},
"assignments": {
"assignments": [
{
"id": "b530a847-0bb2-4039-9ad0-cbc9cc4d909e",
"name": "Topic",
"type": "string",
"value": "machine+learning"
}
]
}
},
"typeVersion": 3.4
},
{
"id": "e65d092a-6854-478c-b33e-2fc309f71ae8",
"name": "Envoyer une requête à l'API Bright Data",
"type": "n8n-nodes-base.httpRequest",
"position": [
-600,
580
],
"parameters": {
"url": "https://api.brightdata.com/request",
"method": "POST",
"options": {},
"sendBody": true,
"sendHeaders": true,
"bodyParameters": {
"parameters": [
{
"name": "zone",
"value": "n8n_unblocker"
},
{
"name": "url",
"value": "=https://scholar.google.com/scholar?q={{ $json.Topic }}"
},
{
"name": "country",
"value": "us"
},
{
"name": "format",
"value": "raw"
}
]
},
"headerParameters": {
"parameters": [
{
"name": "Authorization",
"value": "Bearer 40127ac3c2b4861572c8ad4c6d2273a0ce0472cb3ea7d3ac85a74a34629067aa"
}
]
}
},
"typeVersion": 4.2
},
{
"id": "211bae33-32c5-44e8-b306-a5e0d520a4a0",
"name": "Extraire les données du HTML (Titre, Auteur, etc.)",
"type": "n8n-nodes-base.html",
"position": [
-400,
580
],
"parameters": {
"options": {},
"operation": "extractHtmlContent",
"extractionValues": {
"values": [
{
"key": "Title",
"cssSelector": "h3.gs_rt, a.gs_rt",
"returnArray": true
},
{
"key": "Author",
"cssSelector": "div.gs_a",
"returnArray": true
},
{
"key": "Abstract",
"cssSelector": "div.gs_rs",
"returnArray": true
},
{
"key": "PDF Link\t",
"cssSelector": "a[href*='pdf']",
"returnArray": true,
"returnValue": "attribute"
}
]
}
},
"typeVersion": 1.2
},
{
"id": "9ab7ba20-8614-46c5-b57a-3749d6ae04c4",
"name": "Nettoyer et structurer les données extraites",
"type": "n8n-nodes-base.code",
"position": [
-200,
580
],
"parameters": {
"jsCode": "const titles = items[0].json.Title || [];\nconst authors = items[0].json.Author || [];\nconst abstracts = items[0].json.Abstract || [];\nconst pdfLinks = items[0].json[\"PDF Link\\t\"] || [];\n\nconst output = [];\n\nfor (let i = 0; i < titles.length; i++) {\n // Clean title (remove tags like [PDF][B])\n let title = titles[i].replace(/\\[.*?\\]/g, '').trim();\n\n // Clean author (remove any trailing dashes or HTML leftovers)\n let author = authors[i] ? authors[i].replace(/\\s*-\\s*.*/, '').trim() : '';\n\n // Abstract fallback\n let abstract = abstracts[i] || '';\n\n // Get PDF link — from either a single object or array of duplicates\n let linkObj = pdfLinks[i];\n let pdfLink = '';\n\n if (Array.isArray(linkObj)) {\n // If multiple objects per item\n pdfLink = linkObj.find(obj => obj.href)?.href || '';\n } else if (linkObj?.href) {\n pdfLink = linkObj.href;\n }\n\n // Push cleaned object\n output.push({\n json: {\n title,\n author,\n abstract,\n pdfLink\n }\n });\n}\n\nreturn output;\n"
},
"typeVersion": 2
},
{
"id": "a246f20c-2bb9-4319-8812-e296c87a7df0",
"name": "Sauvegarder les résultats dans Google Sheet",
"type": "n8n-nodes-base.googleSheets",
"position": [
120,
580
],
"parameters": {
"columns": {
"value": {
"Topic": "={{ $('Set Research topic').item.json.Topic }}",
"title": "={{ $json.title }}",
"author": "={{ $json.author }}",
"abstract": "={{ $json.abstract }}",
"pdf link": "={{ $json.pdfLink }}"
},
"schema": [
{
"id": "Topic",
"type": "string",
"display": true,
"required": false,
"displayName": "Topic",
"defaultMatch": false,
"canBeUsedToMatch": true
},
{
"id": "title",
"type": "string",
"display": true,
"required": false,
"displayName": "title",
"defaultMatch": false,
"canBeUsedToMatch": true
},
{
"id": "author",
"type": "string",
"display": true,
"required": false,
"displayName": "author",
"defaultMatch": false,
"canBeUsedToMatch": true
},
{
"id": "abstract",
"type": "string",
"display": true,
"required": false,
"displayName": "abstract",
"defaultMatch": false,
"canBeUsedToMatch": true
},
{
"id": "pdf link",
"type": "string",
"display": true,
"required": false,
"displayName": "pdf link",
"defaultMatch": false,
"canBeUsedToMatch": true
}
],
"mappingMode": "defineBelow",
"matchingColumns": [],
"attemptToConvertTypes": false,
"convertFieldsToString": false
},
"options": {},
"operation": "append",
"sheetName": {
"__rl": true,
"mode": "list",
"value": "gid=0",
"cachedResultUrl": "https://docs.google.com/spreadsheets/d/1sOfCFsvHS9-BeE_PQ6J_jtQofCRcOv02XS7hrmFmpxQ/edit#gid=0",
"cachedResultName": "Sheet1"
},
"documentId": {
"__rl": true,
"mode": "list",
"value": "1sOfCFsvHS9-BeE_PQ6J_jtQofCRcOv02XS7hrmFmpxQ",
"cachedResultUrl": "https://docs.google.com/spreadsheets/d/1sOfCFsvHS9-BeE_PQ6J_jtQofCRcOv02XS7hrmFmpxQ/edit?usp=drivesdk",
"cachedResultName": "Research papers from Google Scholar"
}
},
"credentials": {
"googleSheetsOAuth2Api": {
"id": "r2mDaisH6e9VkwHl",
"name": "Google Sheets account"
}
},
"typeVersion": 4.6
},
{
"id": "1b4a1504-4a4a-4a0d-892b-d0c3e205ed85",
"name": "Note adhésive",
"type": "n8n-nodes-base.stickyNote",
"position": [
-1140,
60
],
"parameters": {
"color": 5,
"width": 420,
"height": 720,
"content": "## 🔹 **Section 1: User Input & Trigger**\n\n**🧩 Nodes: Start Scraping | Set Topic**\n📍 **Purpose:** Let users easily input the topic they want to scrape — no need to deal with complex URLs.\n\n| 🧱 Node | ✅ New Name | 💡 Description |\n| --------- | ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| ⚡ Trigger | **Start Scraping (Manual)** | This node starts the workflow when you click “Execute Workflow.” It's the entry point. |\n| ✏️ Set | **Set Topic (Manual Input)** | Instead of requiring a URL, the user will enter a topic (like \"machine learning\" or \"digital marketing\"). This topic will be used to automatically generate the URL behind the scenes. |\n\n### 🧠 How it helps:\n\n* Great for beginners: Just type the topic, hit run.\n* Keeps the interface clean and user-friendly.\n* Avoids confusion around URLs and formats.\n\n---\n\n"
},
"typeVersion": 1
},
{
"id": "bc56f528-6d18-4e05-942f-c06bb6e10b27",
"name": "Note adhésive1",
"type": "n8n-nodes-base.stickyNote",
"position": [
-660,
80
],
"parameters": {
"color": 6,
"width": 600,
"height": 700,
"content": "## 🔸 **Section 2: Scrape & Parse Website**\n\n**🧩 Nodes: Send Request | Extract HTML | Clean Data**\n📍 **Purpose:** Uses the Bright Data proxy to access the webpage, extract raw HTML content, and clean it up into a readable format (title, author, abstract, etc.).\n\n| 🧱 Node | ✅ New Name | 💡 Description |\n| --------------- | ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| 🌐 HTTP Request | **Send Topic Request to Bright Data** | This sends a request to the Bright Data API using the topic you set earlier. It uses Bright Data’s network to safely load the actual website and return HTML content. |\n| 🧱 HTML Extract | **Extract Data from Webpage** | Parses the returned HTML to find relevant data like titles, authors, abstracts, and links. |\n| 🔣 Code | **Clean and Format Scraped Data** | A custom code block that organizes the messy data into neat records. For example: title → column A, abstract → column B, etc. |\n\n### 🧠 How it helps:\n\n* Makes web scraping safe and reliable by using proxies.\n* Converts unreadable HTML into structured information.\n* Beginner-friendly: No need to write a parser yourself.\n\n---\n\n"
},
"typeVersion": 1
},
{
"id": "2c54e5e6-011a-4562-98ac-9cc9834bc284",
"name": "Note adhésive2",
"type": "n8n-nodes-base.stickyNote",
"position": [
0,
0
],
"parameters": {
"color": 3,
"width": 340,
"height": 780,
"content": "## 🟢 **Section 3: Save to Google Sheets**\n\n**🧩 Node: Append to Google Sheets**\n📍 **Purpose:** Automatically sends the clean data into a Google Sheet for easy access, filtering, or sharing.\n\n| 🧱 Node | ✅ New Name | 💡 Description |\n| ---------------- | ------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |\n| 📄 Google Sheets | **Store Scraped Data in Spreadsheet** | Takes the structured output and appends it to the connected Google Sheet. Each result gets a row with title, author, abstract, etc. |\n\n### 🧠 How it helps:\n\n* No manual copy-pasting ever again!\n* Shareable and searchable format.\n* Updates automatically as you scrape more topics.\n\n---\n\n"
},
"typeVersion": 1
},
{
"id": "4ce90703-961e-4070-9356-c9dffc23a6c5",
"name": "Note adhésive9",
"type": "n8n-nodes-base.stickyNote",
"position": [
-2980,
80
],
"parameters": {
"color": 4,
"width": 1300,
"height": 320,
"content": "=======================================\n WORKFLOW ASSISTANCE\n=======================================\nFor any questions or support, please contact:\n Yaron@nofluff.online\n\nExplore more tips and tutorials here:\n - YouTube: https://www.youtube.com/@YaronBeen/videos\n - LinkedIn: https://www.linkedin.com/in/yaronbeen/\n=======================================\n"
},
"typeVersion": 1
},
{
"id": "069ddb89-f7a1-4c4b-b65d-212be3252750",
"name": "Note adhésive4",
"type": "n8n-nodes-base.stickyNote",
"position": [
-2980,
420
],
"parameters": {
"color": 4,
"width": 1289,
"height": 1878,
"content": "## 🌟 Research Paper Scraper to Google Sheets\n\n**Automate extraction of data from any website based on a topic — no coding needed!**\n\n---\n\n## 🔹 **Section 1: User Input & Trigger**\n\n**🧩 Nodes: Start Scraping | Set Topic**\n📍 **Purpose:** Let users easily input the topic they want to scrape — no need to deal with complex URLs.\n\n| 🧱 Node | ✅ New Name | 💡 Description |\n| --------- | ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| ⚡ Trigger | **Start Scraping (Manual)** | This node starts the workflow when you click “Execute Workflow.” It's the entry point. |\n| ✏️ Set | **Set Topic (Manual Input)** | Instead of requiring a URL, the user will enter a topic (like \"machine learning\" or \"digital marketing\"). This topic will be used to automatically generate the URL behind the scenes. |\n\n### 🧠 How it helps:\n\n* Great for beginners: Just type the topic, hit run.\n* Keeps the interface clean and user-friendly.\n* Avoids confusion around URLs and formats.\n\n---\n\n## 🔸 **Section 2: Scrape & Parse Website**\n\n**🧩 Nodes: Send Request | Extract HTML | Clean Data**\n📍 **Purpose:** Uses the Bright Data proxy to access the webpage, extract raw HTML content, and clean it up into a readable format (title, author, abstract, etc.).\n\n| 🧱 Node | ✅ New Name | 💡 Description |\n| --------------- | ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| 🌐 HTTP Request | **Send Topic Request to Bright Data** | This sends a request to the Bright Data API using the topic you set earlier. It uses Bright Data’s network to safely load the actual website and return HTML content. |\n| 🧱 HTML Extract | **Extract Data from Webpage** | Parses the returned HTML to find relevant data like titles, authors, abstracts, and links. |\n| 🔣 Code | **Clean and Format Scraped Data** | A custom code block that organizes the messy data into neat records. For example: title → column A, abstract → column B, etc. |\n\n### 🧠 How it helps:\n\n* Makes web scraping safe and reliable by using proxies.\n* Converts unreadable HTML into structured information.\n* Beginner-friendly: No need to write a parser yourself.\n\n---\n\n## 🟢 **Section 3: Save to Google Sheets**\n\n**🧩 Node: Append to Google Sheets**\n📍 **Purpose:** Automatically sends the clean data into a Google Sheet for easy access, filtering, or sharing.\n\n| 🧱 Node | ✅ New Name | 💡 Description |\n| ---------------- | ------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |\n| 📄 Google Sheets | **Store Scraped Data in Spreadsheet** | Takes the structured output and appends it to the connected Google Sheet. Each result gets a row with title, author, abstract, etc. |\n\n### 🧠 How it helps:\n\n* No manual copy-pasting ever again!\n* Shareable and searchable format.\n* Updates automatically as you scrape more topics.\n\n---\n\n## ✅ What a Beginner Gains from This Workflow\n\n| 💡 Feature | 🚀 Benefit |\n| --------------------------- | --------------------------------------------------------------------------------- |\n| Topic-based input | You don’t need to find or understand complex URLs. 
Just type “AI” or “marketing.” |\n| Fully automated scraping | You don’t need to open browsers or inspect elements. |\n| Ready-to-use Google Sheet | The final data is clean and saved into a sheet you can use anywhere. |\n| Beautiful, modular workflow | Each step is visual, editable, and reusable without coding skills. |\n\n---\n\n## 🎯 Final Result:\n\nYou type a **topic** → Bright Data scrapes the web → It extracts content → Cleans it → Saves it into **Google Sheets**.\nEverything happens automatically. **No code. No hassle. Just data.**\n\n---\n\n"
},
"typeVersion": 1
},
{
"id": "a1a5e609-756a-4757-a026-1349cf388e61",
"name": "Note adhésive5",
"type": "n8n-nodes-base.stickyNote",
"position": [
400,
0
],
"parameters": {
"color": 7,
"width": 380,
"height": 240,
"content": "## I’ll receive a tiny commission if you join Bright Data through this link—thanks for fueling more free content!\n\n### https://get.brightdata.com/1tndi4600b25"
},
"typeVersion": 1
}
],
"active": false,
"pinData": {},
"settings": {
"executionOrder": "v1"
},
"versionId": "f931202a-3c22-495d-b775-71665bdf6c27",
"connections": {
"6e172db5-7483-4079-bf8a-785602526bdc": {
"main": [
[
{
"node": "e65d092a-6854-478c-b33e-2fc309f71ae8",
"type": "main",
"index": 0
}
]
]
},
"e65d092a-6854-478c-b33e-2fc309f71ae8": {
"main": [
[
{
"node": "211bae33-32c5-44e8-b306-a5e0d520a4a0",
"type": "main",
"index": 0
}
]
]
},
"7d81edf3-6f00-4634-b79f-dbda3f9958e5": {
"main": [
[
{
"node": "6e172db5-7483-4079-bf8a-785602526bdc",
"type": "main",
"index": 0
}
]
]
},
"9ab7ba20-8614-46c5-b57a-3749d6ae04c4": {
"main": [
[
{
"node": "a246f20c-2bb9-4319-8812-e296c87a7df0",
"type": "main",
"index": 0
}
]
]
},
"211bae33-32c5-44e8-b306-a5e0d520a4a0": {
"main": [
[
{
"node": "9ab7ba20-8614-46c5-b57a-3749d6ae04c4",
"type": "main",
"index": 0
}
]
]
}
}
}
How to use this workflow?
Copy the JSON configuration above, create a new workflow in your n8n instance, select "Import from JSON", paste the configuration, and adjust the credential settings as needed.
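Google Scholar's markup changes from time to time, so it can be worth sanity-checking the CSS selectors used by the HTML node (`h3.gs_rt`, `div.gs_a`, `div.gs_rs`, `a[href*='pdf']`) before a full run. Here is a rough standalone sketch of the extract-and-clean steps, assuming the `cheerio` package is installed and that each result sits in a `div.gs_ri` container (an assumption about Scholar's markup, not something taken from the workflow itself):

```javascript
// Rough standalone equivalent of the workflow's HTML-extract and Code nodes.
// Requires: npm install cheerio
const cheerio = require('cheerio');

function extractPapers(html) {
  const $ = cheerio.load(html);
  const papers = [];
  // Assumption: each Scholar result is wrapped in a div.gs_ri block.
  $('div.gs_ri').each((_, el) => {
    papers.push({
      // Same cleanup as the workflow's Code node: strip tags like [PDF][B]
      title: $(el).find('h3.gs_rt').text().replace(/\[.*?\]/g, '').trim(),
      // Keep only the author list before the first dash
      author: $(el).find('div.gs_a').text().replace(/\s*-\s*.*/, '').trim(),
      abstract: $(el).find('div.gs_rs').text().trim(),
      pdfLink: $(el).find("a[href*='pdf']").attr('href') || '',
    });
  });
  return papers;
}

// Usage with the fetch sketch shown earlier:
// fetchScholarHtml('machine learning').then(html => console.log(extractPapers(html)));
```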
Which scenarios is this workflow suited for?
Intermediate - Artificial Intelligence
Is it paid?
This workflow is completely free and can be used directly. Note that the third-party services it relies on (such as the Bright Data API) may require payment on your side.
Yaron Been
@yaron-nofluff Building AI Agents and Automations | Growth Marketer | Entrepreneur | Book Author & Podcast Host. If you need any help with automations, feel free to reach out via LinkedIn: https://www.linkedin.com/in/yaronbeen/ And check out my YouTube channel: https://www.youtube.com/@YaronBeen/videos