De l'extraction du sitemap au stockage vectoriel : création de workflows RAG efficaces
Ceci est un workflow d'automatisation du domaine Content Creation, Multimodal AI, contenant 40 nœuds. Il utilise principalement des nœuds comme If, Set, Xml, Code, Wait. Du crawling du sitemap au stockage vectoriel : création d'un workflow RAG efficace.
- Informations de connexion à la base de données PostgreSQL
- URL et clé API Supabase
- Peut nécessiter les informations d'identification d'authentification de l'API cible
- Clé API OpenAI
Nœuds utilisés (40)
Catégorie
{
"meta": {
"instanceId": "0862f70dc42e115052f6a2d4c2b6537665b4361a614cec7cd17d1c45c8868621",
"templateCredsSetupCompleted": true
},
"nodes": [
{
"id": "ab180eb3-c086-4f9f-b9d0-f3f56056a416",
"name": "Lors du clic sur 'Tester le flux'",
"type": "n8n-nodes-base.manualTrigger",
"position": [
-6816,
-304
],
"parameters": {},
"typeVersion": 1
},
{
"id": "20e77374-c3ce-457f-945c-d6f6dc928de1",
"name": "HTTP Request",
"type": "n8n-nodes-base.httpRequest",
"position": [
-6624,
-304
],
"parameters": {
"url": "https://www.kiekens.com/sitemap.xml",
"options": {}
},
"typeVersion": 4.2
},
{
"id": "b23dd724-1bd7-4eef-9e22-8bef987b2128",
"name": "XML",
"type": "n8n-nodes-base.xml",
"position": [
-6432,
-304
],
"parameters": {
"options": {}
},
"typeVersion": 1
},
{
"id": "4715b380-f386-4926-892e-2c133a1155c1",
"name": "Split Out",
"type": "n8n-nodes-base.splitOut",
"position": [
-6224,
-304
],
"parameters": {
"options": {},
"fieldToSplitOut": "urlset.url"
},
"typeVersion": 1
},
{
"id": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"name": "Boucler sur les éléments",
"type": "n8n-nodes-base.splitInBatches",
"position": [
-5152,
-592
],
"parameters": {
"options": {}
},
"typeVersion": 3
},
{
"id": "f52b3e19-7d64-4f3d-848d-81cf2b65bb15",
"name": "Attendre",
"type": "n8n-nodes-base.wait",
"position": [
-4192,
-608
],
"webhookId": "9af87c5e-b07f-48dc-9ca8-61b471a24cad",
"parameters": {
"amount": 30
},
"typeVersion": 1.1
},
{
"id": "961143cf-c387-4e2d-a477-0988c0b0f512",
"name": "Si",
"type": "n8n-nodes-base.if",
"position": [
-3728,
-608
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "9d90c1ce-590e-40a5-ae8c-d92326032975",
"operator": {
"type": "string",
"operation": "equals"
},
"leftValue": "={{ $json.status }}",
"rightValue": "completed"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "991580c5-10ed-4bab-811e-2ec50d4050fd",
"name": "Default Data Loader",
"notes": "Loads $json.cleanedText as the document text and tags each document with the source page URL. The upstream Code nodes emit the URL as top-level `url`; there is no `result` object on the item at this point.",
"type": "@n8n/n8n-nodes-langchain.documentDefaultDataLoader",
"position": [
-2384,
-496
],
"parameters": {
"options": {
"metadata": {
"metadataValues": [
{
"name": "page",
"value": "={{ $json.url }}"
}
]
}
},
"jsonData": "={{ $json.cleanedText }}",
"jsonMode": "expressionData"
},
"typeVersion": 1
},
{
"id": "0fc79f0d-8ebd-4d61-ac29-7ba65284af52",
"name": "Character Text Splitter",
"type": "@n8n/n8n-nodes-langchain.textSplitterCharacterTextSplitter",
"position": [
-2368,
-352
],
"parameters": {
"chunkSize": 5000
},
"typeVersion": 1
},
{
"id": "bc5aac68-bb66-4c9c-abd7-9a913b0a56fa",
"name": "Embeddings OpenAI",
"type": "@n8n/n8n-nodes-langchain.embeddingsOpenAi",
"position": [
-2528,
-464
],
"parameters": {
"model": "text-embedding-ada-002",
"options": {}
},
"credentials": {
"openAiApi": {
"id": "OwpPpcltPaXyVklS",
"name": "OpenAi_Mariela.b.d."
}
},
"typeVersion": 1.1
},
{
"id": "e3b525eb-7a3f-456d-a476-b013293c85e0",
"name": "Éditer les champs",
"notes": "Carries the task_id returned by the Crawl4ai scrape request forward so the status-polling loop can query it.",
"type": "n8n-nodes-base.set",
"position": [
-4064,
-288
],
"parameters": {
"options": {},
"assignments": {
"assignments": [
{
"id": "f2bcdb54-e1fe-4670-99aa-6eec973bf5f1",
"name": "task_id",
"type": "string",
"value": "={{ $('Scraping de page web Crawl4ai').item.json.task_id }}"
}
]
}
},
"typeVersion": 3.4
},
{
"id": "bdbed5ea-d1a1-4922-a7b7-759466709fcb",
"name": "Statut de tâche Crawl4AI",
"type": "n8n-nodes-base.httpRequest",
"onError": "continueErrorOutput",
"position": [
-3968,
-608
],
"parameters": {
"url": "=https://crawl4ai-app-nrcsv.ondigitalocean.app/task/{{ $json.task_id }}",
"options": {
"timeout": 5000
},
"authentication": "genericCredentialType",
"genericAuthType": "httpHeaderAuth"
},
"credentials": {
"httpHeaderAuth": {
"id": "De808MMiUFOFLbNm",
"name": "Crawl4ai_marinextai"
}
},
"retryOnFail": true,
"typeVersion": 4.2,
"waitBetweenTries": 5000
},
{
"id": "f0da6b36-885a-4e86-b044-f3b490bf3829",
"name": "Boucler sur les éléments1",
"type": "n8n-nodes-base.splitInBatches",
"position": [
-5824,
144
],
"parameters": {
"options": {}
},
"typeVersion": 3
},
{
"id": "f78a39bd-183c-4985-b1b1-f3142dfe31f3",
"name": "Si2",
"type": "n8n-nodes-base.if",
"position": [
-4736,
-592
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "or",
"conditions": [
{
"id": "fbc89427-990b-45d0-8538-e403c1b18ddd",
"operator": {
"type": "string",
"operation": "contains"
},
"leftValue": "={{ $json.status }}",
"rightValue": "pending"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "b6dfe888-4e2e-4c74-8a66-c3db28604514",
"name": "Split Out1",
"type": "n8n-nodes-base.splitOut",
"position": [
-5392,
-384
],
"parameters": {
"include": "selectedOtherFields",
"options": {},
"fieldToSplitOut": "url",
"fieldsToInclude": "status"
},
"typeVersion": 1
},
{
"id": "78f05cb5-8b9c-4f51-b252-4ca2195b52ad",
"name": "Formater l'URL",
"type": "n8n-nodes-base.set",
"position": [
-5648,
160
],
"parameters": {
"options": {},
"assignments": {
"assignments": [
{
"id": "9038a5b3-6985-4edc-bdd1-8dc5a3e8877c",
"name": "loc",
"type": "string",
"value": "={{ $json.loc.trim().toLowerCase() }}"
}
]
}
},
"typeVersion": 3.4
},
{
"id": "805f1fea-841b-40aa-a055-de7ddbbb306f",
"name": "Vérifier si l'URL est dans la table Supabase",
"type": "n8n-nodes-base.supabase",
"onError": "continueErrorOutput",
"position": [
-5456,
160
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $json.loc }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"operation": "getAll",
"returnAll": true
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"retryOnFail": true,
"typeVersion": 1,
"alwaysOutputData": true,
"waitBetweenTries": 5000
},
{
"id": "4f6e6ccb-7757-4e9f-b50c-9acb2fe99009",
"name": "Formater la sortie du nœud Supabase",
"notes": "Outputs { url, shouldInsert }. shouldInsert is true when the Supabase lookup produced an empty item, i.e. the URL is not yet in scrape_queue.",
"type": "n8n-nodes-base.code",
"position": [
-5184,
160
],
"parameters": {
"jsCode": "const supabaseResult = $json;\n\n// Get the clean URL from the Set node (Formater l'URL)\nconst originalLoc = $(\"Formater l'URL\").item.json.loc;\nconst cleanUrl = typeof originalLoc === 'string' ? originalLoc.trim().toLowerCase() : '';\n\n// Check if URL already exists\n// Empty object {} means URL doesn't exist, so we should insert\nconst shouldInsert = Object.keys(supabaseResult).length === 0;\n\nreturn [\n {\n json: {\n url: cleanUrl,\n shouldInsert,\n }\n }\n];"
},
"typeVersion": 2
},
{
"id": "54ed36e4-e675-4bd2-a74e-aeadbe7f486c",
"name": "Si 'shouldInsert' est vrai",
"type": "n8n-nodes-base.if",
"position": [
-4992,
160
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "f3a00d98-73af-4d35-b4e5-5158c120753f",
"operator": {
"type": "boolean",
"operation": "true",
"singleValue": true
},
"leftValue": "={{ $json.shouldInsert }}",
"rightValue": "true"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "483dc0c7-da52-423a-a3bb-cc9ef6d6f1df",
"name": "URL dans une nouvelle ligne",
"type": "n8n-nodes-base.supabase",
"position": [
-4752,
272
],
"parameters": {
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "url",
"fieldValue": "={{ $json.url }}"
}
]
}
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "632752e1-138e-481f-92ad-2ac14c245c45",
"name": "Note adhésive1",
"type": "n8n-nodes-base.stickyNote",
"position": [
-5888,
64
],
"parameters": {
"width": 1280,
"height": 500,
"content": "## Put all of the website's URLs in the Supabase table - scrape_queue"
},
"typeVersion": 1
},
{
"id": "5fc57e6f-771c-4eaa-ba8e-8e233dc2a343",
"name": "CREATE TABLE scrape_queue dans Supabase",
"type": "n8n-nodes-base.postgres",
"position": [
-6816,
-688
],
"parameters": {
"query": "CREATE TABLE scrape_queue (\n id uuid DEFAULT gen_random_uuid() PRIMARY KEY,\n url text NOT NULL UNIQUE,\n status text NOT NULL DEFAULT 'pending', -- 'pending', 'completed', 'error'\n task_id text,\n result text,\n created_at timestamp with time zone DEFAULT now(),\n updated_at timestamp with time zone DEFAULT now()\n);\n\n-- Optional: Auto-update updated_at on row change\nCREATE OR REPLACE FUNCTION update_updated_at_column()\nRETURNS TRIGGER AS $$\nBEGIN\n NEW.updated_at = now();\n RETURN NEW;\nEND;\n$$ language 'plpgsql';\n\nCREATE TRIGGER update_scrape_queue_updated_at\nBEFORE UPDATE ON scrape_queue\nFOR EACH ROW\nEXECUTE PROCEDURE update_updated_at_column();",
"options": {},
"operation": "executeQuery"
},
"credentials": {
"postgres": {
"id": "k1GeBv6AjFuwp2B1",
"name": "Postgres_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 2.6
},
{
"id": "8b2666b7-0eb6-42df-9ae2-e204516dd3d1",
"name": "CREATE TABLE documents dans Supabase",
"type": "n8n-nodes-base.postgres",
"position": [
-6608,
-688
],
"parameters": {
"query": "CREATE TABLE documents (\n id SERIAL PRIMARY KEY,\n content TEXT,\n metadata JSONB,\n embedding VECTOR(1536) -- Adjust the dimension size based on your OpenAI model (e.g. ada-002 returns 1536)\n);",
"options": {},
"operation": "executeQuery"
},
"credentials": {
"postgres": {
"id": "k1GeBv6AjFuwp2B1",
"name": "Postgres_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 2.6
},
{
"id": "7c7b8f66-00f6-48db-af03-fba30dc5e6b1",
"name": "Note adhésive2",
"type": "n8n-nodes-base.stickyNote",
"position": [
-6848,
-768
],
"parameters": {
"color": 3,
"width": 500,
"height": 280,
"content": "## Execute Once"
},
"typeVersion": 1
},
{
"id": "82279582-c71b-43aa-8e60-6b8af7ce866c",
"name": "Note adhésive",
"type": "n8n-nodes-base.stickyNote",
"position": [
-4992,
-736
],
"parameters": {
"color": 4,
"width": 460,
"height": 360,
"content": "## Get the URL from Supabase and check whether it is still pending\n\n**Only URLs whose status contains `pending` are passed on to scraping**"
},
"typeVersion": 1
},
{
"id": "8b2245b2-cdc2-408a-879b-260335a10bcb",
"name": "Note adhésive3",
"type": "n8n-nodes-base.stickyNote",
"position": [
-4448,
-736
],
"parameters": {
"color": 5,
"width": 640,
"height": 360,
"content": "## Crawl4AI URL Scraping"
},
"typeVersion": 1
},
{
"id": "b42143d2-1e13-4031-996a-26af2dc26632",
"name": "Scraping de page web Crawl4ai",
"type": "n8n-nodes-base.httpRequest",
"onError": "continueErrorOutput",
"position": [
-4384,
-608
],
"parameters": {
"url": "https://crawl4ai-app-nrcsv.ondigitalocean.app/crawl",
"method": "POST",
"options": {},
"sendBody": true,
"authentication": "genericCredentialType",
"bodyParameters": {
"parameters": [
{
"name": "urls",
"value": "={{ $json.url }}"
},
{
"name": "priority",
"value": "10"
}
]
},
"genericAuthType": "httpHeaderAuth"
},
"credentials": {
"httpHeaderAuth": {
"id": "De808MMiUFOFLbNm",
"name": "Crawl4ai_marinextai"
}
},
"retryOnFail": true,
"typeVersion": 4.2,
"waitBetweenTries": 5000
},
{
"id": "6ac1fda6-8363-4cff-8810-7cb2ffa63b67",
"name": "Supprimer les données redondantes du scraping",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-3488,
-768
],
"parameters": {
"jsCode": "return items.map(item => {\n // Handle both data structures\n const raw = item.json.result?.markdown || item.json.cleanedText || item.json.html || '';\n \n // Add a safety check for null/undefined\n if (!raw) {\n return {\n json: {\n url: item.json.result?.url || item.json.url || '',\n cleanedText: '',\n error: 'No content found to process'\n }\n };\n }\n \n let cleaned = raw\n // Remove headers but keep the content structure\n .replace(/^#{1,6}\\s+(.+)$/gm, '$1') // Convert headers to plain text\n \n // Remove markdown links but keep the text\n .replace(/\\[([^\\]]+)\\]\\([^)]+\\)/g, '$1') // Keep link text, remove URL\n \n // Remove code blocks completely\n .replace(/```[\\s\\S]*?```/g, '') \n .replace(/`([^`]+)`/g, '$1') // Remove inline code backticks but keep content\n \n // Remove markdown formatting\n .replace(/\\*\\*([^*]+)\\*\\*/g, '$1') // Remove bold formatting\n .replace(/\\*([^*]+)\\*/g, '$1') // Remove italic formatting\n .replace(/_{2,}([^_]+)_{2,}/g, '$1') // Remove underline formatting\n .replace(/~~([^~]+)~~/g, '$1') // Remove strikethrough\n \n // Remove lists formatting but keep content\n .replace(/^\\s*[-*+]\\s+/gm, '') // Remove bullet points\n .replace(/^\\s*\\d+\\.\\s+/gm, '') // Remove numbered lists\n \n // Remove HTML remnants\n .replace(/<[^>]*>/g, '') // Remove any remaining HTML tags\n .replace(/&[a-zA-Z0-9#]+;/g, '') // Remove HTML entities\n \n // Remove navigation and common web elements\n .replace(/\\b(Home|About|Contact|Privacy|Terms|Login|Register|Menu|Navigation|Footer|Header|Sidebar)\\b/gi, '')\n .replace(/\\b(Click here|Read more|Learn more|Show more|View all|See all)\\b/gi, '')\n .replace(/\\b(Previous|Next|Page \\d+|Back to top)\\b/gi, '')\n \n // Remove social media and sharing text\n .replace(/\\b(Share|Tweet|Facebook|LinkedIn|Instagram|Follow us|Subscribe)\\b/gi, '')\n \n // Remove common website noise\n .replace(/\\b(Cookie|Cookies|GDPR|Accept|Decline|Consent)\\b/gi, '')\n 
.replace(/\\b(Advertisement|Ad|Sponsored|Promotion)\\b/gi, '')\n \n // Remove excessive punctuation and symbols\n .replace(/[^\\w\\s.,!?;:()\\-\"']/g, '') // Keep only essential punctuation\n .replace(/\\.{2,}/g, '.') // Replace multiple dots with single dot\n .replace(/\\?{2,}/g, '?') // Replace multiple question marks\n .replace(/!{2,}/g, '!') // Replace multiple exclamation marks\n \n // Clean up whitespace and line breaks\n .replace(/\\n{3,}/g, '\\n\\n') // Replace multiple line breaks with double\n .replace(/\\s+/g, ' ') // Normalize whitespace\n .replace(/\\s*\\n\\s*/g, '\\n') // Clean line breaks\n \n // Remove lines that are too short (likely noise)\n .split('\\n')\n .filter(line => line.trim().length > 10) // Remove very short lines\n .join('\\n')\n \n .trim();\n \n // Additional quality checks\n const wordCount = cleaned.split(/\\s+/).length;\n const hasMinimumContent = wordCount >= 50; // Minimum 50 words\n \n // Check if content is mostly meaningful (not just numbers/symbols)\n const meaningfulContent = cleaned.replace(/[^\\w\\s]/g, '').length > cleaned.length * 0.7;\n \n // Extract additional metadata for better context\n const extractedTitle = raw.match(/^#{1,3}\\s+(.+)$/m)?.[1] || '';\n const domain = (item.json.result?.url || item.json.url || '').replace(/^https?:\\/\\//, '').split('/')[0];\n \n return {\n json: {\n url: item.json.result?.url || item.json.url || '',\n cleanedText: cleaned,\n wordCount: wordCount,\n hasMinimumContent: hasMinimumContent,\n meaningfulContent: meaningfulContent,\n extractedTitle: extractedTitle,\n domain: domain,\n contentLength: cleaned.length,\n // Quality score for filtering\n qualityScore: (hasMinimumContent ? 0.5 : 0) + (meaningfulContent ? 0.5 : 0)\n }\n };\n});"
},
"typeVersion": 2
},
{
"id": "520a512f-2da8-4cb7-b834-fe6fbfa2ad02",
"name": "Supabase Vector Store_documents",
"type": "@n8n/n8n-nodes-langchain.vectorStoreSupabase",
"position": [
-2544,
-672
],
"parameters": {
"mode": "insert",
"options": {
"queryName": "match_documents"
},
"tableName": {
"__rl": true,
"mode": "list",
"value": "documents",
"cachedResultName": "documents"
}
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "edb03374-1674-4070-b8a6-7afff6118f9a",
"name": "Obtenir une ligne - Table scrape_queue",
"type": "n8n-nodes-base.supabase",
"position": [
-4912,
-592
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $json.url }}"
}
]
},
"tableId": "scrape_queue",
"operation": "get"
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "57358b66-0d48-4d53-a188-c5c550e46a9e",
"name": "Mettre à jour une ligne dans la table scrape_queue",
"notes": "Runs after the vector store insert: writes the Crawl4AI task status and task_id to the scrape_queue row of the current URL.",
"type": "n8n-nodes-base.supabase",
"position": [
-2224,
-992
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $('Obtenir une ligne - Table scrape_queue').item.json.url }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "status",
"fieldValue": "={{ $('Statut de tâche Crawl4AI').item.json.status }}"
},
{
"fieldId": "task_id",
"fieldValue": "={{ $('Scraping de page web Crawl4ai').item.json.task_id }}"
}
]
},
"operation": "update"
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "3291a358-282c-4cc2-a869-c9b4651e157e",
"name": "Mettre à jour une ligne dans la table scrape_queue1",
"notes": "Error branch of the Crawl4AI status request: records task_id and the error status on the scrape_queue row of the current URL.",
"type": "n8n-nodes-base.supabase",
"position": [
-3984,
-1072
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $('Obtenir une ligne - Table scrape_queue').first().json.url }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "task_id",
"fieldValue": "={{ $json.task_id }}"
},
{
"fieldId": "status",
"fieldValue": "={{ $json.error.status }}"
}
]
},
"operation": "update"
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "f801de82-dbe9-44c1-a6c3-ac2847e93060",
"name": "Attendre1",
"type": "n8n-nodes-base.wait",
"position": [
-4352,
-208
],
"webhookId": "32f2ac99-68dc-4afc-8ebb-f64625cc96ef",
"parameters": {
"unit": "minutes"
},
"typeVersion": 1.1
},
{
"id": "10aecbd3-6fd8-420f-b997-34d68eecde54",
"name": "Nœud de filtre de qualité",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-3264,
-768
],
"parameters": {
"jsCode": "// Filter out low-quality content\nreturn items.filter(item => {\n const quality = item.json.qualityScore || 0;\n const minWords = item.json.wordCount >= 50;\n const hasContent = item.json.cleanedText.length > 200;\n \n return quality >= 0.5 && minWords && hasContent;\n});"
},
"typeVersion": 2
},
{
"id": "9473c86c-7525-41f6-a2be-f7750d930317",
"name": "Détection du type de contenu",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-3008,
-768
],
"parameters": {
"jsCode": "// Content Type Detection - Fixed Version\nreturn items.map(item => {\n const text = item.json.cleanedText || '';\n \n // Content type detection function\n const detectContentType = (text) => {\n if (!text || text.length < 10) {\n return 'unknown';\n }\n \n const lowerText = text.toLowerCase();\n \n // Check for code content\n if (lowerText.includes('function') || lowerText.includes('class') || \n lowerText.includes('import') || lowerText.includes('def ') ||\n lowerText.includes('var ') || lowerText.includes('const ')) {\n return 'code';\n }\n \n // Check for tutorial content\n if (lowerText.includes('step 1') || lowerText.includes('tutorial') || \n lowerText.includes('how to') || lowerText.includes('guide') ||\n lowerText.includes('walkthrough')) {\n return 'tutorial';\n }\n \n // Check for FAQ content\n if (lowerText.includes('faq') || lowerText.includes('q:') || \n lowerText.includes('a:') || lowerText.includes('question') ||\n lowerText.includes('frequently asked')) {\n return 'faq';\n }\n \n // Check for documentation\n if (lowerText.includes('api') || lowerText.includes('documentation') ||\n lowerText.includes('reference') || lowerText.includes('manual')) {\n return 'documentation';\n }\n \n // Check for news/blog content\n if (lowerText.includes('published') || lowerText.includes('author') ||\n lowerText.includes('posted') || lowerText.includes('blog')) {\n return 'blog';\n }\n \n // Check for product/service pages\n if (lowerText.includes('price') || lowerText.includes('buy') ||\n lowerText.includes('purchase') || lowerText.includes('product')) {\n return 'product';\n }\n \n // Default to article\n return 'article';\n };\n \n // Detect content type\n const contentType = detectContentType(text);\n \n // Return the item with added content type\n return {\n json: {\n ...item.json, // Keep all existing data\n contentType: contentType\n }\n };\n});"
},
"typeVersion": 2
},
{
"id": "54873bf5-ecb2-44e3-9dfb-e0e6ace02917",
"name": "Extraction de métadonnées améliorée",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-2784,
-768
],
"parameters": {
"jsCode": "// Enhanced metadata extraction - Fixed Version\nreturn items.map(item => {\n const cleaned = item.json.cleanedText || '';\n const url = item.json.url || '';\n const contentType = item.json.contentType || 'article';\n \n // Extract title from the cleaned text (look for first meaningful line)\n const extractTitle = (text) => {\n if (!text) return '';\n \n const lines = text.split('\\n').filter(line => line.trim().length > 0);\n if (lines.length === 0) return '';\n \n // Find the first substantial line (likely the title)\n const titleLine = lines.find(line => \n line.trim().length > 10 && \n line.trim().length < 200 &&\n !line.includes('http') &&\n !line.includes('www.')\n );\n \n return titleLine ? titleLine.trim() : lines[0].trim();\n };\n \n // Extract domain from URL\n const extractDomain = (url) => {\n if (!url) return '';\n try {\n return url.replace(/^https?:\\/\\//, '').split('/')[0];\n } catch (e) {\n return '';\n }\n };\n \n // Count words in the text\n const countWords = (text) => {\n if (!text) return 0;\n return text.trim().split(/\\s+/).filter(word => word.length > 0).length;\n };\n \n // Calculate quality score\n const calculateQualityScore = (text, wordCount) => {\n if (!text || wordCount < 50) return 0;\n \n const meaningfulContent = text.replace(/[^\\w\\s]/g, '').length > text.length * 0.7;\n const hasMinimumContent = wordCount >= 50;\n \n return (hasMinimumContent ? 0.5 : 0) + (meaningfulContent ? 
0.5 : 0);\n };\n \n // Simple language detection (basic version)\n const detectLanguage = (text) => {\n if (!text) return 'unknown';\n \n // Simple heuristic - could be improved with a proper language detection library\n const commonEnglishWords = ['the', 'and', 'is', 'in', 'to', 'of', 'a', 'for', 'on', 'with'];\n const commonDutchWords = ['de', 'het', 'en', 'van', 'een', 'in', 'op', 'te', 'aan', 'met'];\n \n const lowerText = text.toLowerCase();\n const englishCount = commonEnglishWords.filter(word => lowerText.includes(` ${word} `)).length;\n const dutchCount = commonDutchWords.filter(word => lowerText.includes(` ${word} `)).length;\n \n if (englishCount > dutchCount) return 'en';\n if (dutchCount > englishCount) return 'nl';\n return 'unknown';\n };\n \n // Extract all metadata\n const extractedTitle = extractTitle(cleaned);\n const domain = extractDomain(url);\n const wordCount = countWords(cleaned);\n const qualityScore = calculateQualityScore(cleaned, wordCount);\n const detectedLanguage = detectLanguage(cleaned);\n \n // Enhanced metadata object\n const metadata = {\n page: url,\n title: extractedTitle,\n domain: domain,\n contentType: contentType,\n wordCount: wordCount,\n scrapedDate: new Date().toISOString(),\n language: detectedLanguage,\n qualityScore: qualityScore,\n contentLength: cleaned.length\n };\n \n return {\n json: {\n ...item.json, // Keep all existing data\n metadata: metadata,\n // Also keep individual fields for easier access\n extractedTitle: extractedTitle,\n domain: domain,\n wordCount: wordCount,\n qualityScore: qualityScore,\n detectedLanguage: detectedLanguage\n }\n };\n});"
},
"typeVersion": 2
},
{
"id": "f2d3d6a3-b48e-4b08-bf8e-f8fff06d3494",
"name": "Note adhésive4",
"type": "n8n-nodes-base.stickyNote",
"position": [
-3536,
-912
],
"parameters": {
"color": 6,
"width": 900,
"height": 340,
"content": "## Clean the HTML code"
},
"typeVersion": 1
},
{
"id": "6ddcf33d-84cb-4ee7-bf62-cb2747aff406",
"name": "Si1",
"type": "n8n-nodes-base.if",
"position": [
-3632,
-288
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "3e84e5d8-e49c-4a7b-98c3-9e115f592c10",
"operator": {
"type": "string",
"operation": "exists",
"singleValue": true
},
"leftValue": "={{ $json.task_id }}",
"rightValue": ""
},
{
"id": "c6a0525f-3224-4ad5-8d0a-e0a7a27fb5d1",
"operator": {
"type": "number",
"operation": "gte"
},
"leftValue": "={{ $json.attempt_count }}",
"rightValue": 10
}
]
}
},
"typeVersion": 2.2
},
{
"id": "ffb7b9cb-a4fb-4db2-833c-331672de42bd",
"name": "Mettre à jour une ligne dans la table scrape_queue2",
"notes": "Marks the current URL's scrape_queue row as 'error' when the preceding If node finds a task_id whose attempt_count has reached 10.",
"type": "n8n-nodes-base.supabase",
"position": [
-3376,
-176
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $('Obtenir une ligne - Table scrape_queue').first().json.url }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "task_id",
"fieldValue": "={{ $json.task_id }}"
},
{
"fieldId": "status",
"fieldValue": "=error"
}
]
},
"operation": "update"
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "44c7fe75-0e88-4114-b506-6e7850c2a038",
"name": "Compteur Task_id",
"type": "n8n-nodes-base.code",
"position": [
-3856,
-288
],
"parameters": {
"jsCode": "// Simple counter that resets for each new task ID\nif (typeof globalThis.currentTaskId === 'undefined') {\n globalThis.currentTaskId = null;\n globalThis.currentCounter = 0;\n}\n\nreturn items.map(item => {\n const taskId = item.json.task_id;\n \n // Check if this is a new task ID\n if (globalThis.currentTaskId !== taskId) {\n // New task ID detected - reset counter\n globalThis.currentTaskId = taskId;\n globalThis.currentCounter = 1;\n } else {\n // Same task ID - increment counter\n globalThis.currentCounter++;\n }\n \n return {\n json: {\n ...item.json,\n attempt_count: globalThis.currentCounter\n }\n };\n});"
},
"typeVersion": 2
}
],
"pinData": {},
"connections": {
"961143cf-c387-4e2d-a477-0988c0b0f512": {
"main": [
[
{
"node": "6ac1fda6-8363-4cff-8810-7cb2ffa63b67",
"type": "main",
"index": 0
}
],
[
{
"node": "e3b525eb-7a3f-456d-a476-b013293c85e0",
"type": "main",
"index": 0
}
]
]
},
"6ddcf33d-84cb-4ee7-bf62-cb2747aff406": {
"main": [
[
{
"node": "ffb7b9cb-a4fb-4db2-833c-331672de42bd",
"type": "main",
"index": 0
}
],
[
{
"node": "f52b3e19-7d64-4f3d-848d-81cf2b65bb15",
"type": "main",
"index": 0
}
]
]
},
"f78a39bd-183c-4985-b1b1-f3142dfe31f3": {
"main": [
[
{
"node": "b42143d2-1e13-4031-996a-26af2dc26632",
"type": "main",
"index": 0
}
],
[
{
"node": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"type": "main",
"index": 0
}
]
]
},
"b23dd724-1bd7-4eef-9e22-8bef987b2128": {
"main": [
[
{
"node": "4715b380-f386-4926-892e-2c133a1155c1",
"type": "main",
"index": 0
}
]
]
},
"f52b3e19-7d64-4f3d-848d-81cf2b65bb15": {
"main": [
[
{
"node": "bdbed5ea-d1a1-4922-a7b7-759466709fcb",
"type": "main",
"index": 0
}
]
]
},
"f801de82-dbe9-44c1-a6c3-ac2847e93060": {
"main": [
[
{
"node": "b42143d2-1e13-4031-996a-26af2dc26632",
"type": "main",
"index": 0
}
]
]
},
"4715b380-f386-4926-892e-2c133a1155c1": {
"main": [
[
{
"node": "f0da6b36-885a-4e86-b044-f3b490bf3829",
"type": "main",
"index": 0
}
]
]
},
"b6dfe888-4e2e-4c74-8a66-c3db28604514": {
"main": [
[
{
"node": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"type": "main",
"index": 0
}
]
]
},
"e3b525eb-7a3f-456d-a476-b013293c85e0": {
"main": [
[
{
"node": "44c7fe75-0e88-4114-b506-6e7850c2a038",
"type": "main",
"index": 0
}
]
]
},
"20e77374-c3ce-457f-945c-d6f6dc928de1": {
"main": [
[
{
"node": "b23dd724-1bd7-4eef-9e22-8bef987b2128",
"type": "main",
"index": 0
}
]
]
},
"78f05cb5-8b9c-4f51-b252-4ca2195b52ad": {
"main": [
[
{
"node": "805f1fea-841b-40aa-a055-de7ddbbb306f",
"type": "main",
"index": 0
}
]
]
},
"56181432-63f2-4d93-be6d-6f1489e04ca9": {
"main": [
[],
[
{
"node": "edb03374-1674-4070-b8a6-7afff6118f9a",
"type": "main",
"index": 0
}
]
]
},
"44c7fe75-0e88-4114-b506-6e7850c2a038": {
"main": [
[
{
"node": "6ddcf33d-84cb-4ee7-bf62-cb2747aff406",
"type": "main",
"index": 0
}
]
]
},
"f0da6b36-885a-4e86-b044-f3b490bf3829": {
"main": [
[
{
"node": "b6dfe888-4e2e-4c74-8a66-c3db28604514",
"type": "main",
"index": 0
}
],
[
{
"node": "78f05cb5-8b9c-4f51-b252-4ca2195b52ad",
"type": "main",
"index": 0
}
]
]
},
"483dc0c7-da52-423a-a3bb-cc9ef6d6f1df": {
"main": [
[
{
"node": "f0da6b36-885a-4e86-b044-f3b490bf3829",
"type": "main",
"index": 0
}
]
]
},
"bc5aac68-bb66-4c9c-abd7-9a913b0a56fa": {
"ai_embedding": [
[
{
"node": "520a512f-2da8-4cb7-b834-fe6fbfa2ad02",
"type": "ai_embedding",
"index": 0
}
]
]
},
"991580c5-10ed-4bab-811e-2ec50d4050fd": {
"ai_document": [
[
{
"node": "520a512f-2da8-4cb7-b834-fe6fbfa2ad02",
"type": "ai_document",
"index": 0
}
]
]
},
"10aecbd3-6fd8-420f-b997-34d68eecde54": {
"main": [
[
{
"node": "9473c86c-7525-41f6-a2be-f7750d930317",
"type": "main",
"index": 0
}
]
]
},
"bdbed5ea-d1a1-4922-a7b7-759466709fcb": {
"main": [
[
{
"node": "961143cf-c387-4e2d-a477-0988c0b0f512",
"type": "main",
"index": 0
}
],
[
{
"node": "3291a358-282c-4cc2-a869-c9b4651e157e",
"type": "main",
"index": 0
}
]
]
},
"9473c86c-7525-41f6-a2be-f7750d930317": {
"main": [
[
{
"node": "54873bf5-ecb2-44e3-9dfb-e0e6ace02917",
"type": "main",
"index": 0
}
]
]
},
"0fc79f0d-8ebd-4d61-ac29-7ba65284af52": {
"ai_textSplitter": [
[
{
"node": "991580c5-10ed-4bab-811e-2ec50d4050fd",
"type": "ai_textSplitter",
"index": 0
}
]
]
},
"b42143d2-1e13-4031-996a-26af2dc26632": {
"main": [
[
{
"node": "f52b3e19-7d64-4f3d-848d-81cf2b65bb15",
"type": "main",
"index": 0
}
],
[
{
"node": "f801de82-dbe9-44c1-a6c3-ac2847e93060",
"type": "main",
"index": 0
}
]
]
},
"54ed36e4-e675-4bd2-a74e-aeadbe7f486c": {
"main": [
[
{
"node": "483dc0c7-da52-423a-a3bb-cc9ef6d6f1df",
"type": "main",
"index": 0
}
],
[
{
"node": "f0da6b36-885a-4e86-b044-f3b490bf3829",
"type": "main",
"index": 0
}
]
]
},
"54873bf5-ecb2-44e3-9dfb-e0e6ace02917": {
"main": [
[
{
"node": "520a512f-2da8-4cb7-b834-fe6fbfa2ad02",
"type": "main",
"index": 0
}
]
]
},
"edb03374-1674-4070-b8a6-7afff6118f9a": {
"main": [
[
{
"node": "f78a39bd-183c-4985-b1b1-f3142dfe31f3",
"type": "main",
"index": 0
}
]
]
},
"520a512f-2da8-4cb7-b834-fe6fbfa2ad02": {
"main": [
[
{
"node": "57358b66-0d48-4d53-a188-c5c550e46a9e",
"type": "main",
"index": 0
}
]
]
},
"ab180eb3-c086-4f9f-b9d0-f3f56056a416": {
"main": [
[
{
"node": "20e77374-c3ce-457f-945c-d6f6dc928de1",
"type": "main",
"index": 0
}
]
]
},
"57358b66-0d48-4d53-a188-c5c550e46a9e": {
"main": [
[
{
"node": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"type": "main",
"index": 0
}
]
]
},
"3291a358-282c-4cc2-a869-c9b4651e157e": {
"main": [
[
{
"node": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"type": "main",
"index": 0
}
]
]
},
"ffb7b9cb-a4fb-4db2-833c-331672de42bd": {
"main": [
[
{
"node": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"type": "main",
"index": 0
}
]
]
},
"6ac1fda6-8363-4cff-8810-7cb2ffa63b67": {
"main": [
[
{
"node": "10aecbd3-6fd8-420f-b997-34d68eecde54",
"type": "main",
"index": 0
}
]
]
},
"4f6e6ccb-7757-4e9f-b50c-9acb2fe99009": {
"main": [
[
{
"node": "54ed36e4-e675-4bd2-a74e-aeadbe7f486c",
"type": "main",
"index": 0
}
]
]
},
"805f1fea-841b-40aa-a055-de7ddbbb306f": {
"main": [
[
{
"node": "4f6e6ccb-7757-4e9f-b50c-9acb2fe99009",
"type": "main",
"index": 0
}
]
]
}
}
}
Comment utiliser ce workflow ?
Copiez le code de configuration JSON ci-dessus, créez un nouveau workflow dans votre instance n8n et sélectionnez "Importer depuis le JSON", collez la configuration et modifiez les paramètres d'authentification selon vos besoins.
Dans quels scénarios ce workflow est-il adapté ?
Avancé - Création de contenu, IA Multimodale
Est-ce payant ?
Ce workflow est entièrement gratuit et peut être utilisé directement. Veuillez noter que les services tiers utilisés dans le workflow (comme l'API OpenAI) peuvent nécessiter un paiement de votre part.
Workflows recommandés
Mariela Slavenova
@marielabg
🚀 Fractional Head of AI Ops | COO | CTO | I diagnose, fix & ship automations that pay for themselves | The Harden Method™ - Discover→Design→Build→Break→Harden→Launch→Monitor | Founder @ MarinextAI
Partager ce workflow