De la extracción del sitemap al almacenamiento vectorial: crear un flujo de trabajo RAG eficiente
Este es un flujo de automatización del dominio Content Creation, Multimodal AI que contiene 40 nodos. Utiliza principalmente nodos como If, Set, Xml, Code, Wait. Crawl del mapa del sitio a almacenamiento vectorial: crear flujos de trabajo RAG eficientes.
- Información de conexión de la base de datos PostgreSQL
- URL y Clave de API de Supabase
- Pueden requerirse credenciales de autenticación para la API de destino
- Clave de API de OpenAI
Nodos utilizados (40)
Categoría
{
"meta": {
"instanceId": "0862f70dc42e115052f6a2d4c2b6537665b4361a614cec7cd17d1c45c8868621",
"templateCredsSetupCompleted": true
},
"nodes": [
{
"id": "ab180eb3-c086-4f9f-b9d0-f3f56056a416",
"name": "Al hacer clic en 'Probar flujo de trabajo'",
"type": "n8n-nodes-base.manualTrigger",
"position": [
-6816,
-304
],
"parameters": {},
"typeVersion": 1
},
{
"id": "20e77374-c3ce-457f-945c-d6f6dc928de1",
"name": "HTTP Request",
"type": "n8n-nodes-base.httpRequest",
"position": [
-6624,
-304
],
"parameters": {
"url": "https://www.kiekens.com/sitemap.xml",
"options": {}
},
"typeVersion": 4.2
},
{
"id": "b23dd724-1bd7-4eef-9e22-8bef987b2128",
"name": "XML",
"type": "n8n-nodes-base.xml",
"position": [
-6432,
-304
],
"parameters": {
"options": {}
},
"typeVersion": 1
},
{
"id": "4715b380-f386-4926-892e-2c133a1155c1",
"name": "Dividir salida",
"type": "n8n-nodes-base.splitOut",
"position": [
-6224,
-304
],
"parameters": {
"options": {},
"fieldToSplitOut": "urlset.url"
},
"typeVersion": 1
},
{
"id": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"name": "Iterar sobre elementos",
"type": "n8n-nodes-base.splitInBatches",
"position": [
-5152,
-592
],
"parameters": {
"options": {}
},
"typeVersion": 3
},
{
"id": "f52b3e19-7d64-4f3d-848d-81cf2b65bb15",
"name": "Esperar",
"type": "n8n-nodes-base.wait",
"position": [
-4192,
-608
],
"webhookId": "9af87c5e-b07f-48dc-9ca8-61b471a24cad",
"parameters": {
"amount": 30
},
"typeVersion": 1.1
},
{
"id": "961143cf-c387-4e2d-a477-0988c0b0f512",
"name": "Si",
"type": "n8n-nodes-base.if",
"position": [
-3728,
-608
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "9d90c1ce-590e-40a5-ae8c-d92326032975",
"operator": {
"type": "string",
"operation": "equals"
},
"leftValue": "={{ $json.status }}",
"rightValue": "completed"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "991580c5-10ed-4bab-811e-2ec50d4050fd",
"name": "Cargador de datos predeterminado",
"type": "@n8n/n8n-nodes-langchain.documentDefaultDataLoader",
"position": [
-2384,
-496
],
"parameters": {
"options": {
"metadata": {
"metadataValues": [
{
"name": "page",
"value": "={{ $json.result.url }}"
}
]
}
},
"jsonData": "={{ $json.cleanedText }}",
"jsonMode": "expressionData"
},
"typeVersion": 1
},
{
"id": "0fc79f0d-8ebd-4d61-ac29-7ba65284af52",
"name": "Separador de texto por caracteres",
"type": "@n8n/n8n-nodes-langchain.textSplitterCharacterTextSplitter",
"position": [
-2368,
-352
],
"parameters": {
"chunkSize": 5000
},
"typeVersion": 1
},
{
"id": "bc5aac68-bb66-4c9c-abd7-9a913b0a56fa",
"name": "Embeddings OpenAI",
"type": "@n8n/n8n-nodes-langchain.embeddingsOpenAi",
"position": [
-2528,
-464
],
"parameters": {
"model": "text-embedding-ada-002",
"options": {}
},
"credentials": {
"openAiApi": {
"id": "OwpPpcltPaXyVklS",
"name": "OpenAi_Mariela.b.d."
}
},
"typeVersion": 1.1
},
{
"id": "e3b525eb-7a3f-456d-a476-b013293c85e0",
"name": "Editar campos",
"type": "n8n-nodes-base.set",
"position": [
-4064,
-288
],
"parameters": {
"options": {},
"assignments": {
"assignments": [
{
"id": "f2bcdb54-e1fe-4670-99aa-6eec973bf5f1",
"name": "task_id",
"type": "string",
"value": "={{ $('Crawl4ai Web Page Scrape').item.json.task_id }}"
}
]
}
},
"typeVersion": 3.4
},
{
"id": "bdbed5ea-d1a1-4922-a7b7-759466709fcb",
"name": "Estado de tarea Crawl4AI",
"type": "n8n-nodes-base.httpRequest",
"onError": "continueErrorOutput",
"position": [
-3968,
-608
],
"parameters": {
"url": "=https://crawl4ai-app-nrcsv.ondigitalocean.app/task/{{ $json.task_id }}",
"options": {
"timeout": 5000
},
"authentication": "genericCredentialType",
"genericAuthType": "httpHeaderAuth"
},
"credentials": {
"httpHeaderAuth": {
"id": "De808MMiUFOFLbNm",
"name": "Crawl4ai_marinextai"
}
},
"retryOnFail": true,
"typeVersion": 4.2,
"waitBetweenTries": 5000
},
{
"id": "f0da6b36-885a-4e86-b044-f3b490bf3829",
"name": "Iterar sobre elementos1",
"type": "n8n-nodes-base.splitInBatches",
"position": [
-5824,
144
],
"parameters": {
"options": {}
},
"typeVersion": 3
},
{
"id": "f78a39bd-183c-4985-b1b1-f3142dfe31f3",
"name": "Si2",
"type": "n8n-nodes-base.if",
"position": [
-4736,
-592
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "or",
"conditions": [
{
"id": "fbc89427-990b-45d0-8538-e403c1b18ddd",
"operator": {
"type": "string",
"operation": "contains"
},
"leftValue": "={{ $json.status }}",
"rightValue": "pending"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "b6dfe888-4e2e-4c74-8a66-c3db28604514",
"name": "Dividir salida1",
"type": "n8n-nodes-base.splitOut",
"position": [
-5392,
-384
],
"parameters": {
"include": "selectedOtherFields",
"options": {},
"fieldToSplitOut": "url",
"fieldsToInclude": "status"
},
"typeVersion": 1
},
{
"id": "78f05cb5-8b9c-4f51-b252-4ca2195b52ad",
"name": "Formatear la URL",
"type": "n8n-nodes-base.set",
"position": [
-5648,
160
],
"parameters": {
"options": {},
"assignments": {
"assignments": [
{
"id": "9038a5b3-6985-4edc-bdd1-8dc5a3e8877c",
"name": "loc",
"type": "string",
"value": "={{ $json.loc.trim().toLowerCase() }}"
}
]
}
},
"typeVersion": 3.4
},
{
"id": "805f1fea-841b-40aa-a055-de7ddbbb306f",
"name": "Verificar si la URL está en la tabla Supabase",
"type": "n8n-nodes-base.supabase",
"onError": "continueErrorOutput",
"position": [
-5456,
160
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $json.loc }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"operation": "getAll",
"returnAll": true
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"retryOnFail": true,
"typeVersion": 1,
"alwaysOutputData": true,
"waitBetweenTries": 5000
},
{
"id": "4f6e6ccb-7757-4e9f-b50c-9acb2fe99009",
"name": "Formatear la salida del nodo Supabase",
"type": "n8n-nodes-base.code",
"position": [
-5184,
160
],
"parameters": {
"jsCode": "const supabaseResult = $json;\n\n// Get the clean URL from the Set node (Edit Fields1)\nconst originalLoc = $('Format the URL').item.json.loc;\nconst cleanUrl = typeof originalLoc === 'string' ? originalLoc.trim().toLowerCase() : '';\n\n// Check if URL already exists\n// Empty object {} means URL doesn't exist, so we should insert\nconst shouldInsert = Object.keys(supabaseResult).length === 0;\n\nreturn [\n {\n json: {\n url: cleanUrl,\n shouldInsert,\n }\n }\n];"
},
"typeVersion": 2
},
{
"id": "54ed36e4-e675-4bd2-a74e-aeadbe7f486c",
"name": "Si 'shouldInsert' es verdadero",
"type": "n8n-nodes-base.if",
"position": [
-4992,
160
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "f3a00d98-73af-4d35-b4e5-5158c120753f",
"operator": {
"type": "boolean",
"operation": "true",
"singleValue": true
},
"leftValue": "={{ $json.shouldInsert }}",
"rightValue": "true"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "483dc0c7-da52-423a-a3bb-cc9ef6d6f1df",
"name": "URL en una nueva fila",
"type": "n8n-nodes-base.supabase",
"position": [
-4752,
272
],
"parameters": {
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "url",
"fieldValue": "={{ $json.url }}"
}
]
}
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "632752e1-138e-481f-92ad-2ac14c245c45",
"name": "Nota adhesiva1",
"type": "n8n-nodes-base.stickyNote",
"position": [
-5888,
64
],
"parameters": {
"width": 1280,
"height": 500,
"content": "## Put all Website`s URLs in Supabase Table - scrape_queue"
},
"typeVersion": 1
},
{
"id": "5fc57e6f-771c-4eaa-ba8e-8e233dc2a343",
"name": "CREAR TABLA scrape_queue en Supabase",
"type": "n8n-nodes-base.postgres",
"position": [
-6816,
-688
],
"parameters": {
"query": "CREATE TABLE scrape_queue (\n id uuid DEFAULT gen_random_uuid() PRIMARY KEY,\n url text NOT NULL UNIQUE,\n status text NOT NULL DEFAULT 'pending', -- 'pending', 'completed', 'error'\n task_id text,\n result text,\n created_at timestamp with time zone DEFAULT now(),\n updated_at timestamp with time zone DEFAULT now()\n);\n\n-- Optional: Auto-update updated_at on row change\nCREATE OR REPLACE FUNCTION update_updated_at_column()\nRETURNS TRIGGER AS $$\nBEGIN\n NEW.updated_at = now();\n RETURN NEW;\nEND;\n$$ language 'plpgsql';\n\nCREATE TRIGGER update_scrape_queue_updated_at\nBEFORE UPDATE ON scrape_queue\nFOR EACH ROW\nEXECUTE PROCEDURE update_updated_at_column();",
"options": {},
"operation": "executeQuery"
},
"credentials": {
"postgres": {
"id": "k1GeBv6AjFuwp2B1",
"name": "Postgres_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 2.6
},
{
"id": "8b2666b7-0eb6-42df-9ae2-e204516dd3d1",
"name": "CREAR TABLA scrape_queue en Supabase1",
"type": "n8n-nodes-base.postgres",
"position": [
-6608,
-688
],
"parameters": {
"query": "CREATE TABLE documents (\n id SERIAL PRIMARY KEY,\n content TEXT,\n metadata JSONB,\n embedding VECTOR(1536) -- Adjust the dimension size based on your OpenAI model (e.g. ada-002 returns 1536)\n);",
"options": {},
"operation": "executeQuery"
},
"credentials": {
"postgres": {
"id": "k1GeBv6AjFuwp2B1",
"name": "Postgres_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 2.6
},
{
"id": "7c7b8f66-00f6-48db-af03-fba30dc5e6b1",
"name": "Nota adhesiva2",
"type": "n8n-nodes-base.stickyNote",
"position": [
-6848,
-768
],
"parameters": {
"color": 3,
"width": 500,
"height": 280,
"content": "## Execute Once"
},
"typeVersion": 1
},
{
"id": "82279582-c71b-43aa-8e60-6b8af7ce866c",
"name": "Nota adhesiva",
"type": "n8n-nodes-base.stickyNote",
"position": [
-4992,
-736
],
"parameters": {
"color": 4,
"width": 460,
"height": 360,
"content": "## Get the URL from Supabase and check if it is completed or not\n\n**Only the NOT completed URLs will be passed**"
},
"typeVersion": 1
},
{
"id": "8b2245b2-cdc2-408a-879b-260335a10bcb",
"name": "Nota adhesiva3",
"type": "n8n-nodes-base.stickyNote",
"position": [
-4448,
-736
],
"parameters": {
"color": 5,
"width": 640,
"height": 360,
"content": "## Crawl4AI URL Scraping"
},
"typeVersion": 1
},
{
"id": "b42143d2-1e13-4031-996a-26af2dc26632",
"name": "Rastreo de página web Crawl4ai",
"type": "n8n-nodes-base.httpRequest",
"onError": "continueErrorOutput",
"position": [
-4384,
-608
],
"parameters": {
"url": "https://crawl4ai-app-nrcsv.ondigitalocean.app/crawl",
"method": "POST",
"options": {},
"sendBody": true,
"authentication": "genericCredentialType",
"bodyParameters": {
"parameters": [
{
"name": "urls",
"value": "={{ $json.url }}"
},
{
"name": "priority",
"value": "10"
}
]
},
"genericAuthType": "httpHeaderAuth"
},
"credentials": {
"httpHeaderAuth": {
"id": "De808MMiUFOFLbNm",
"name": "Crawl4ai_marinextai"
}
},
"retryOnFail": true,
"typeVersion": 4.2,
"waitBetweenTries": 5000
},
{
"id": "6ac1fda6-8363-4cff-8810-7cb2ffa63b67",
"name": "Eliminar datos redundantes del rastreo",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-3488,
-768
],
"parameters": {
"jsCode": "return items.map(item => {\n // Handle both data structures\n const raw = item.json.result?.markdown || item.json.cleanedText || item.json.html || '';\n \n // Add a safety check for null/undefined\n if (!raw) {\n return {\n json: {\n url: item.json.result?.url || item.json.url || '',\n cleanedText: '',\n error: 'No content found to process'\n }\n };\n }\n \n let cleaned = raw\n // Remove headers but keep the content structure\n .replace(/^#{1,6}\\s+(.+)$/gm, '$1') // Convert headers to plain text\n \n // Remove markdown links but keep the text\n .replace(/\\[([^\\]]+)\\]\\([^)]+\\)/g, '$1') // Keep link text, remove URL\n \n // Remove code blocks completely\n .replace(/```[\\s\\S]*?```/g, '') \n .replace(/`([^`]+)`/g, '$1') // Remove inline code backticks but keep content\n \n // Remove markdown formatting\n .replace(/\\*\\*([^*]+)\\*\\*/g, '$1') // Remove bold formatting\n .replace(/\\*([^*]+)\\*/g, '$1') // Remove italic formatting\n .replace(/_{2,}([^_]+)_{2,}/g, '$1') // Remove underline formatting\n .replace(/~~([^~]+)~~/g, '$1') // Remove strikethrough\n \n // Remove lists formatting but keep content\n .replace(/^\\s*[-*+]\\s+/gm, '') // Remove bullet points\n .replace(/^\\s*\\d+\\.\\s+/gm, '') // Remove numbered lists\n \n // Remove HTML remnants\n .replace(/<[^>]*>/g, '') // Remove any remaining HTML tags\n .replace(/&[a-zA-Z0-9#]+;/g, '') // Remove HTML entities\n \n // Remove navigation and common web elements\n .replace(/\\b(Home|About|Contact|Privacy|Terms|Login|Register|Menu|Navigation|Footer|Header|Sidebar)\\b/gi, '')\n .replace(/\\b(Click here|Read more|Learn more|Show more|View all|See all)\\b/gi, '')\n .replace(/\\b(Previous|Next|Page \\d+|Back to top)\\b/gi, '')\n \n // Remove social media and sharing text\n .replace(/\\b(Share|Tweet|Facebook|LinkedIn|Instagram|Follow us|Subscribe)\\b/gi, '')\n \n // Remove common website noise\n .replace(/\\b(Cookie|Cookies|GDPR|Accept|Decline|Consent)\\b/gi, '')\n 
.replace(/\\b(Advertisement|Ad|Sponsored|Promotion)\\b/gi, '')\n \n // Remove excessive punctuation and symbols\n .replace(/[^\\w\\s.,!?;:()\\-\"']/g, '') // Keep only essential punctuation\n .replace(/\\.{2,}/g, '.') // Replace multiple dots with single dot\n .replace(/\\?{2,}/g, '?') // Replace multiple question marks\n .replace(/!{2,}/g, '!') // Replace multiple exclamation marks\n \n // Clean up whitespace and line breaks\n .replace(/\\n{3,}/g, '\\n\\n') // Replace multiple line breaks with double\n .replace(/\\s+/g, ' ') // Normalize whitespace\n .replace(/\\s*\\n\\s*/g, '\\n') // Clean line breaks\n \n // Remove lines that are too short (likely noise)\n .split('\\n')\n .filter(line => line.trim().length > 10) // Remove very short lines\n .join('\\n')\n \n .trim();\n \n // Additional quality checks\n const wordCount = cleaned.split(/\\s+/).length;\n const hasMinimumContent = wordCount >= 50; // Minimum 50 words\n \n // Check if content is mostly meaningful (not just numbers/symbols)\n const meaningfulContent = cleaned.replace(/[^\\w\\s]/g, '').length > cleaned.length * 0.7;\n \n // Extract additional metadata for better context\n const extractedTitle = raw.match(/^#{1,3}\\s+(.+)$/m)?.[1] || '';\n const domain = (item.json.result?.url || item.json.url || '').replace(/^https?:\\/\\//, '').split('/')[0];\n \n return {\n json: {\n url: item.json.result?.url || item.json.url || '',\n cleanedText: cleaned,\n wordCount: wordCount,\n hasMinimumContent: hasMinimumContent,\n meaningfulContent: meaningfulContent,\n extractedTitle: extractedTitle,\n domain: domain,\n contentLength: cleaned.length,\n // Quality score for filtering\n qualityScore: (hasMinimumContent ? 0.5 : 0) + (meaningfulContent ? 0.5 : 0)\n }\n };\n});"
},
"typeVersion": 2
},
{
"id": "520a512f-2da8-4cb7-b834-fe6fbfa2ad02",
"name": "Supabase Vector Store_documents",
"type": "@n8n/n8n-nodes-langchain.vectorStoreSupabase",
"position": [
-2544,
-672
],
"parameters": {
"mode": "insert",
"options": {
"queryName": "match_documents"
},
"tableName": {
"__rl": true,
"mode": "list",
"value": "documents",
"cachedResultName": "documents"
}
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "edb03374-1674-4070-b8a6-7afff6118f9a",
"name": "Obtener una fila - Tabla scrape_queue",
"type": "n8n-nodes-base.supabase",
"position": [
-4912,
-592
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $json.url }}"
}
]
},
"tableId": "scrape_queue",
"operation": "get"
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "57358b66-0d48-4d53-a188-c5c550e46a9e",
"name": "Actualizar una fila en la tabla scrape_queue",
"type": "n8n-nodes-base.supabase",
"position": [
-2224,
-992
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $('Get a row - scrape_queue Table').item.json.url }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "status",
"fieldValue": "={{ $('Crawl4AI_Task Status').item.json.status }}"
},
{
"fieldId": "task_id",
"fieldValue": "={{ $('Crawl4ai Web Page Scrape').item.json.task_id }}"
}
]
},
"operation": "update"
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "3291a358-282c-4cc2-a869-c9b4651e157e",
"name": "Actualizar una fila en la tabla scrape_queue1",
"type": "n8n-nodes-base.supabase",
"position": [
-3984,
-1072
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $('Get a row - scrape_queue Table').first().json.url }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "task_id",
"fieldValue": "={{ $json.task_id }}"
},
{
"fieldId": "status",
"fieldValue": "={{ $json.error.status }}"
}
]
},
"operation": "update"
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "f801de82-dbe9-44c1-a6c3-ac2847e93060",
"name": "Esperar1",
"type": "n8n-nodes-base.wait",
"position": [
-4352,
-208
],
"webhookId": "32f2ac99-68dc-4afc-8ebb-f64625cc96ef",
"parameters": {
"unit": "minutes"
},
"typeVersion": 1.1
},
{
"id": "10aecbd3-6fd8-420f-b997-34d68eecde54",
"name": "Nodo de filtro de calidad",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-3264,
-768
],
"parameters": {
"jsCode": "// Filter out low-quality content\nreturn items.filter(item => {\n const quality = item.json.qualityScore || 0;\n const minWords = item.json.wordCount >= 50;\n const hasContent = item.json.cleanedText.length > 200;\n \n return quality >= 0.5 && minWords && hasContent;\n});"
},
"typeVersion": 2
},
{
"id": "9473c86c-7525-41f6-a2be-f7750d930317",
"name": "Detección de tipo de contenido",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-3008,
-768
],
"parameters": {
"jsCode": "// Content Type Detection - Fixed Version\nreturn items.map(item => {\n const text = item.json.cleanedText || '';\n \n // Content type detection function\n const detectContentType = (text) => {\n if (!text || text.length < 10) {\n return 'unknown';\n }\n \n const lowerText = text.toLowerCase();\n \n // Check for code content\n if (lowerText.includes('function') || lowerText.includes('class') || \n lowerText.includes('import') || lowerText.includes('def ') ||\n lowerText.includes('var ') || lowerText.includes('const ')) {\n return 'code';\n }\n \n // Check for tutorial content\n if (lowerText.includes('step 1') || lowerText.includes('tutorial') || \n lowerText.includes('how to') || lowerText.includes('guide') ||\n lowerText.includes('walkthrough')) {\n return 'tutorial';\n }\n \n // Check for FAQ content\n if (lowerText.includes('faq') || lowerText.includes('q:') || \n lowerText.includes('a:') || lowerText.includes('question') ||\n lowerText.includes('frequently asked')) {\n return 'faq';\n }\n \n // Check for documentation\n if (lowerText.includes('api') || lowerText.includes('documentation') ||\n lowerText.includes('reference') || lowerText.includes('manual')) {\n return 'documentation';\n }\n \n // Check for news/blog content\n if (lowerText.includes('published') || lowerText.includes('author') ||\n lowerText.includes('posted') || lowerText.includes('blog')) {\n return 'blog';\n }\n \n // Check for product/service pages\n if (lowerText.includes('price') || lowerText.includes('buy') ||\n lowerText.includes('purchase') || lowerText.includes('product')) {\n return 'product';\n }\n \n // Default to article\n return 'article';\n };\n \n // Detect content type\n const contentType = detectContentType(text);\n \n // Return the item with added content type\n return {\n json: {\n ...item.json, // Keep all existing data\n contentType: contentType\n }\n };\n});"
},
"typeVersion": 2
},
{
"id": "54873bf5-ecb2-44e3-9dfb-e0e6ace02917",
"name": "Extracción mejorada de metadatos",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-2784,
-768
],
"parameters": {
"jsCode": "// Enhanced metadata extraction - Fixed Version\nreturn items.map(item => {\n const cleaned = item.json.cleanedText || '';\n const url = item.json.url || '';\n const contentType = item.json.contentType || 'article';\n \n // Extract title from the cleaned text (look for first meaningful line)\n const extractTitle = (text) => {\n if (!text) return '';\n \n const lines = text.split('\\n').filter(line => line.trim().length > 0);\n if (lines.length === 0) return '';\n \n // Find the first substantial line (likely the title)\n const titleLine = lines.find(line => \n line.trim().length > 10 && \n line.trim().length < 200 &&\n !line.includes('http') &&\n !line.includes('www.')\n );\n \n return titleLine ? titleLine.trim() : lines[0].trim();\n };\n \n // Extract domain from URL\n const extractDomain = (url) => {\n if (!url) return '';\n try {\n return url.replace(/^https?:\\/\\//, '').split('/')[0];\n } catch (e) {\n return '';\n }\n };\n \n // Count words in the text\n const countWords = (text) => {\n if (!text) return 0;\n return text.trim().split(/\\s+/).filter(word => word.length > 0).length;\n };\n \n // Calculate quality score\n const calculateQualityScore = (text, wordCount) => {\n if (!text || wordCount < 50) return 0;\n \n const meaningfulContent = text.replace(/[^\\w\\s]/g, '').length > text.length * 0.7;\n const hasMinimumContent = wordCount >= 50;\n \n return (hasMinimumContent ? 0.5 : 0) + (meaningfulContent ? 
0.5 : 0);\n };\n \n // Simple language detection (basic version)\n const detectLanguage = (text) => {\n if (!text) return 'unknown';\n \n // Simple heuristic - could be improved with a proper language detection library\n const commonEnglishWords = ['the', 'and', 'is', 'in', 'to', 'of', 'a', 'for', 'on', 'with'];\n const commonDutchWords = ['de', 'het', 'en', 'van', 'een', 'in', 'op', 'te', 'aan', 'met'];\n \n const lowerText = text.toLowerCase();\n const englishCount = commonEnglishWords.filter(word => lowerText.includes(` ${word} `)).length;\n const dutchCount = commonDutchWords.filter(word => lowerText.includes(` ${word} `)).length;\n \n if (englishCount > dutchCount) return 'en';\n if (dutchCount > englishCount) return 'nl';\n return 'unknown';\n };\n \n // Extract all metadata\n const extractedTitle = extractTitle(cleaned);\n const domain = extractDomain(url);\n const wordCount = countWords(cleaned);\n const qualityScore = calculateQualityScore(cleaned, wordCount);\n const detectedLanguage = detectLanguage(cleaned);\n \n // Enhanced metadata object\n const metadata = {\n page: url,\n title: extractedTitle,\n domain: domain,\n contentType: contentType,\n wordCount: wordCount,\n scrapedDate: new Date().toISOString(),\n language: detectedLanguage,\n qualityScore: qualityScore,\n contentLength: cleaned.length\n };\n \n return {\n json: {\n ...item.json, // Keep all existing data\n metadata: metadata,\n // Also keep individual fields for easier access\n extractedTitle: extractedTitle,\n domain: domain,\n wordCount: wordCount,\n qualityScore: qualityScore,\n detectedLanguage: detectedLanguage\n }\n };\n});"
},
"typeVersion": 2
},
{
"id": "f2d3d6a3-b48e-4b08-bf8e-f8fff06d3494",
"name": "Nota adhesiva4",
"type": "n8n-nodes-base.stickyNote",
"position": [
-3536,
-912
],
"parameters": {
"color": 6,
"width": 900,
"height": 340,
"content": "## Clean te HTML code"
},
"typeVersion": 1
},
{
"id": "6ddcf33d-84cb-4ee7-bf62-cb2747aff406",
"name": "Si1",
"type": "n8n-nodes-base.if",
"position": [
-3632,
-288
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "3e84e5d8-e49c-4a7b-98c3-9e115f592c10",
"operator": {
"type": "string",
"operation": "exists",
"singleValue": true
},
"leftValue": "={{ $json.task_id }}",
"rightValue": ""
},
{
"id": "c6a0525f-3224-4ad5-8d0a-e0a7a27fb5d1",
"operator": {
"type": "number",
"operation": "gte"
},
"leftValue": "={{ $json.attempt_count }}",
"rightValue": 10
}
]
}
},
"typeVersion": 2.2
},
{
"id": "ffb7b9cb-a4fb-4db2-833c-331672de42bd",
"name": "Actualizar una fila en la tabla scrape_queue2",
"type": "n8n-nodes-base.supabase",
"position": [
-3376,
-176
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $('Get a row - scrape_queue Table').first().json.url }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "task_id",
"fieldValue": "={{ $json.task_id }}"
},
{
"fieldId": "status",
"fieldValue": "=error"
}
]
},
"operation": "update"
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "44c7fe75-0e88-4114-b506-6e7850c2a038",
"name": "Contador de Task_id",
"type": "n8n-nodes-base.code",
"position": [
-3856,
-288
],
"parameters": {
"jsCode": "// Simple counter that resets for each new task ID\nif (typeof globalThis.currentTaskId === 'undefined') {\n globalThis.currentTaskId = null;\n globalThis.currentCounter = 0;\n}\n\nreturn items.map(item => {\n const taskId = item.json.task_id;\n \n // Check if this is a new task ID\n if (globalThis.currentTaskId !== taskId) {\n // New task ID detected - reset counter\n globalThis.currentTaskId = taskId;\n globalThis.currentCounter = 1;\n } else {\n // Same task ID - increment counter\n globalThis.currentCounter++;\n }\n \n return {\n json: {\n ...item.json,\n attempt_count: globalThis.currentCounter\n }\n };\n});"
},
"typeVersion": 2
}
],
"pinData": {},
"connections": {
"961143cf-c387-4e2d-a477-0988c0b0f512": {
"main": [
[
{
"node": "6ac1fda6-8363-4cff-8810-7cb2ffa63b67",
"type": "main",
"index": 0
}
],
[
{
"node": "e3b525eb-7a3f-456d-a476-b013293c85e0",
"type": "main",
"index": 0
}
]
]
},
"6ddcf33d-84cb-4ee7-bf62-cb2747aff406": {
"main": [
[
{
"node": "ffb7b9cb-a4fb-4db2-833c-331672de42bd",
"type": "main",
"index": 0
}
],
[
{
"node": "f52b3e19-7d64-4f3d-848d-81cf2b65bb15",
"type": "main",
"index": 0
}
]
]
},
"f78a39bd-183c-4985-b1b1-f3142dfe31f3": {
"main": [
[
{
"node": "b42143d2-1e13-4031-996a-26af2dc26632",
"type": "main",
"index": 0
}
],
[
{
"node": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"type": "main",
"index": 0
}
]
]
},
"b23dd724-1bd7-4eef-9e22-8bef987b2128": {
"main": [
[
{
"node": "4715b380-f386-4926-892e-2c133a1155c1",
"type": "main",
"index": 0
}
]
]
},
"f52b3e19-7d64-4f3d-848d-81cf2b65bb15": {
"main": [
[
{
"node": "bdbed5ea-d1a1-4922-a7b7-759466709fcb",
"type": "main",
"index": 0
}
]
]
},
"f801de82-dbe9-44c1-a6c3-ac2847e93060": {
"main": [
[
{
"node": "b42143d2-1e13-4031-996a-26af2dc26632",
"type": "main",
"index": 0
}
]
]
},
"4715b380-f386-4926-892e-2c133a1155c1": {
"main": [
[
{
"node": "f0da6b36-885a-4e86-b044-f3b490bf3829",
"type": "main",
"index": 0
}
]
]
},
"b6dfe888-4e2e-4c74-8a66-c3db28604514": {
"main": [
[
{
"node": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"type": "main",
"index": 0
}
]
]
},
"e3b525eb-7a3f-456d-a476-b013293c85e0": {
"main": [
[
{
"node": "44c7fe75-0e88-4114-b506-6e7850c2a038",
"type": "main",
"index": 0
}
]
]
},
"20e77374-c3ce-457f-945c-d6f6dc928de1": {
"main": [
[
{
"node": "b23dd724-1bd7-4eef-9e22-8bef987b2128",
"type": "main",
"index": 0
}
]
]
},
"78f05cb5-8b9c-4f51-b252-4ca2195b52ad": {
"main": [
[
{
"node": "805f1fea-841b-40aa-a055-de7ddbbb306f",
"type": "main",
"index": 0
}
]
]
},
"56181432-63f2-4d93-be6d-6f1489e04ca9": {
"main": [
[],
[
{
"node": "edb03374-1674-4070-b8a6-7afff6118f9a",
"type": "main",
"index": 0
}
]
]
},
"44c7fe75-0e88-4114-b506-6e7850c2a038": {
"main": [
[
{
"node": "6ddcf33d-84cb-4ee7-bf62-cb2747aff406",
"type": "main",
"index": 0
}
]
]
},
"f0da6b36-885a-4e86-b044-f3b490bf3829": {
"main": [
[
{
"node": "b6dfe888-4e2e-4c74-8a66-c3db28604514",
"type": "main",
"index": 0
}
],
[
{
"node": "78f05cb5-8b9c-4f51-b252-4ca2195b52ad",
"type": "main",
"index": 0
}
]
]
},
"483dc0c7-da52-423a-a3bb-cc9ef6d6f1df": {
"main": [
[
{
"node": "f0da6b36-885a-4e86-b044-f3b490bf3829",
"type": "main",
"index": 0
}
]
]
},
"bc5aac68-bb66-4c9c-abd7-9a913b0a56fa": {
"ai_embedding": [
[
{
"node": "520a512f-2da8-4cb7-b834-fe6fbfa2ad02",
"type": "ai_embedding",
"index": 0
}
]
]
},
"991580c5-10ed-4bab-811e-2ec50d4050fd": {
"ai_document": [
[
{
"node": "520a512f-2da8-4cb7-b834-fe6fbfa2ad02",
"type": "ai_document",
"index": 0
}
]
]
},
"10aecbd3-6fd8-420f-b997-34d68eecde54": {
"main": [
[
{
"node": "9473c86c-7525-41f6-a2be-f7750d930317",
"type": "main",
"index": 0
}
]
]
},
"bdbed5ea-d1a1-4922-a7b7-759466709fcb": {
"main": [
[
{
"node": "961143cf-c387-4e2d-a477-0988c0b0f512",
"type": "main",
"index": 0
}
],
[
{
"node": "3291a358-282c-4cc2-a869-c9b4651e157e",
"type": "main",
"index": 0
}
]
]
},
"9473c86c-7525-41f6-a2be-f7750d930317": {
"main": [
[
{
"node": "54873bf5-ecb2-44e3-9dfb-e0e6ace02917",
"type": "main",
"index": 0
}
]
]
},
"0fc79f0d-8ebd-4d61-ac29-7ba65284af52": {
"ai_textSplitter": [
[
{
"node": "991580c5-10ed-4bab-811e-2ec50d4050fd",
"type": "ai_textSplitter",
"index": 0
}
]
]
},
"b42143d2-1e13-4031-996a-26af2dc26632": {
"main": [
[
{
"node": "f52b3e19-7d64-4f3d-848d-81cf2b65bb15",
"type": "main",
"index": 0
}
],
[
{
"node": "f801de82-dbe9-44c1-a6c3-ac2847e93060",
"type": "main",
"index": 0
}
]
]
},
"54ed36e4-e675-4bd2-a74e-aeadbe7f486c": {
"main": [
[
{
"node": "483dc0c7-da52-423a-a3bb-cc9ef6d6f1df",
"type": "main",
"index": 0
}
],
[
{
"node": "f0da6b36-885a-4e86-b044-f3b490bf3829",
"type": "main",
"index": 0
}
]
]
},
"54873bf5-ecb2-44e3-9dfb-e0e6ace02917": {
"main": [
[
{
"node": "520a512f-2da8-4cb7-b834-fe6fbfa2ad02",
"type": "main",
"index": 0
}
]
]
},
"edb03374-1674-4070-b8a6-7afff6118f9a": {
"main": [
[
{
"node": "f78a39bd-183c-4985-b1b1-f3142dfe31f3",
"type": "main",
"index": 0
}
]
]
},
"520a512f-2da8-4cb7-b834-fe6fbfa2ad02": {
"main": [
[
{
"node": "57358b66-0d48-4d53-a188-c5c550e46a9e",
"type": "main",
"index": 0
}
]
]
},
"ab180eb3-c086-4f9f-b9d0-f3f56056a416": {
"main": [
[
{
"node": "20e77374-c3ce-457f-945c-d6f6dc928de1",
"type": "main",
"index": 0
}
]
]
},
"57358b66-0d48-4d53-a188-c5c550e46a9e": {
"main": [
[
{
"node": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"type": "main",
"index": 0
}
]
]
},
"3291a358-282c-4cc2-a869-c9b4651e157e": {
"main": [
[
{
"node": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"type": "main",
"index": 0
}
]
]
},
"ffb7b9cb-a4fb-4db2-833c-331672de42bd": {
"main": [
[
{
"node": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"type": "main",
"index": 0
}
]
]
},
"6ac1fda6-8363-4cff-8810-7cb2ffa63b67": {
"main": [
[
{
"node": "10aecbd3-6fd8-420f-b997-34d68eecde54",
"type": "main",
"index": 0
}
]
]
},
"4f6e6ccb-7757-4e9f-b50c-9acb2fe99009": {
"main": [
[
{
"node": "54ed36e4-e675-4bd2-a74e-aeadbe7f486c",
"type": "main",
"index": 0
}
]
]
},
"805f1fea-841b-40aa-a055-de7ddbbb306f": {
"main": [
[
{
"node": "4f6e6ccb-7757-4e9f-b50c-9acb2fe99009",
"type": "main",
"index": 0
}
]
]
}
}
}
¿Cómo usar este flujo de trabajo?
Copie el código de configuración JSON de arriba, cree un nuevo flujo de trabajo en su instancia de n8n y seleccione "Importar desde JSON", pegue la configuración y luego modifique la configuración de credenciales según sea necesario.
¿En qué escenarios es adecuado este flujo de trabajo?
Avanzado - Creación de contenido, IA Multimodal
¿Es de pago?
Este flujo de trabajo es completamente gratuito, puede importarlo y usarlo directamente. Sin embargo, tenga en cuenta que los servicios de terceros utilizados en el flujo de trabajo (como la API de OpenAI) pueden requerir un pago por su cuenta.
Flujos de trabajo relacionados recomendados
Mariela Slavenova
@marielabg 🚀 Fractional Head of AI Ops | COO | CTO | I diagnose, fix & ship automations that pay for themselves | The Harden Method™ - Discover→Design→Build→Break→Harden→Launch→Monitor | Founder @ MarinextAI
Compartir este flujo de trabajo