fix(extractor): blindaje contra mezcla de comentarios y deduplicación de media

D4vRAM369 · D4vRAM369 · commit 8c94568ebe1e · 2026-03-09T21:49:13.000Z
diff --git a/src/lib/utils/post-extractor.ts b/src/lib/utils/post-extractor.ts
@@ -152,24 +152,32 @@ export function extractPostSectionMedia(jinaMarkdown: string, postId: string | n
     // [Thread ------ 1.1K views](https://www.threads.com/@user/post/postId)
     // Ese enlace falso ancla la extracción al inicio del documento (solapando con el padre).
     // Omitimos esos enlaces de metadatos al buscar el ancla real.
-    const serviceLinkRe = new RegExp(`^\\[[^\\]]+\\]\\(https?://[^)]+/post/${postId}[^)]*\\)`, 'im')
+    const serviceLinkRe = new RegExp(`^\\[[^\\]]*Thread[^\\]]*\\]\\(https?://[^)]+/post/${postId}[^)]*\\)`, 'im')
 
     // Si el postId es el principal (header), tomamos desde el inicio (para posts sueltos).
     const urlSourceMatch = /^URL Source:\s*(.+)$/im.exec(jinaMarkdown)
     const urlSource = urlSourceMatch?.[1]?.trim() ?? ''
     const isMainPost = urlSource.includes(`/post/${postId}`) || urlSource.includes(`/t/${postId}`)
 
     const postMatch = new RegExp(`/post/${postId}\\b`, 'i').exec(searchArea)
-    const firstPostIdMatch = /\/post\/([A-Za-z0-9_-]+)\b/i.exec(searchArea)
+    const allPostIdMatches = Array.from(searchArea.matchAll(/\/post\/([A-Za-z0-9_-]+)\b/gi))
+    const firstPostIdMatch = allPostIdMatches[0]
     const firstIdInContent = firstPostIdMatch?.[1]
 
     // PBL: Solo empezamos desde el inicio si somos el post principal Y además
-    // no hay otro ID de post visible en los primeros 600 chars que no sea un breadcrumb.
+    // no hay otro ID de post visible en los primeros 1000 chars que no sea un breadcrumb.
     const isTargetAtTop = !firstIdInContent || firstIdInContent.toLowerCase() === postId.toLowerCase()
 
-    // Comprobar si el primer match es un link de servicio (breadcrumb)
-    const firstMatchIsService = serviceLinkRe.test(searchArea.slice(0, 1000))
-    const shouldStartFromTop = isMainPost && isTargetAtTop && !firstMatchIsService
+    // Comprobar si el primer match de ID pertenece a un link de servicio (breadcrumb)
+    let firstIdIsActuallyService = false
+    if (firstPostIdMatch) {
+      const lineStart = searchArea.lastIndexOf('\n', firstPostIdMatch.index)
+      const lineEnd = searchArea.indexOf('\n', firstPostIdMatch.index!)
+      const line = searchArea.slice(lineStart >= 0 ? lineStart : 0, lineEnd >= 0 ? lineEnd : searchArea.length)
+      firstIdIsActuallyService = serviceLinkRe.test(line)
+    }
+
+    const shouldStartFromTop = isMainPost && isTargetAtTop && !firstIdIsActuallyService
 
     if (shouldStartFromTop || !postMatch) {
       const postSection = searchArea.slice(0, 4000)
@@ -179,14 +187,14 @@ export function extractPostSectionMedia(jinaMarkdown: string, postId: string | n
       ]
     }
 
-    if (postMatch) {
+    if (firstPostIdMatch) {
       // Si el match es un link de servicio, buscamos el SIGUIENTE match
-      let actualIndex = postMatch.index
-      if (firstMatchIsService) {
-        const remaining = searchArea.slice(postMatch.index + postMatch[0].length)
+      let actualIndex = firstPostIdMatch.index!
+      if (firstIdIsActuallyService) {
+        const remaining = searchArea.slice(firstPostIdMatch.index! + firstPostIdMatch[0].length)
         const nextMatch = new RegExp(`/post/${postId}\\b`, 'i').exec(remaining)
         if (nextMatch) {
-          actualIndex = postMatch.index + postMatch[0].length + nextMatch.index
+          actualIndex = firstPostIdMatch.index! + firstPostIdMatch[0].length + nextMatch.index
         }
       }
 
@@ -239,8 +247,19 @@ export function extractPostSectionFromJina(jinaMarkdown: string, postId: string)
     : jinaMarkdown
 
   // 2. Encontrar la línea ancla del sub-post en el área de contenido
-  const serviceLinkRe = new RegExp(`^\\[[^\\]]+\\]\\(https?://[^)]+/post/${postId}[^)]*\\)`, 'im')
-  const firstMatchIsService = serviceLinkRe.test(contentArea.slice(0, 1000))
+  const serviceLinkRe = new RegExp(`^\\[[^\\]]*Thread[^\\]]*\\]\\(https?://[^)]+/post/${postId}[^)]*\\)`, 'im')
+
+  const allPostIdMatches = Array.from(contentArea.matchAll(/\/post\/([A-Za-z0-9_-]+)\b/gi))
+  const firstPostIdMatch = allPostIdMatches[0]
+  const firstIdInContent = firstPostIdMatch?.[1]
+
+  let firstIdIsActuallyService = false
+  if (firstPostIdMatch) {
+    const lineStart = contentArea.lastIndexOf('\n', firstPostIdMatch.index)
+    const lineEnd = contentArea.indexOf('\n', firstPostIdMatch.index!)
+    const line = contentArea.slice(lineStart >= 0 ? lineStart : 0, lineEnd >= 0 ? lineEnd : contentArea.length)
+    firstIdIsActuallyService = serviceLinkRe.test(line)
+  }
 
   const anchorRe = new RegExp(`/post/${postId}\\b`, 'i')
   const anchorMatch = anchorRe.exec(contentArea)
@@ -253,28 +272,46 @@ export function extractPostSectionFromJina(jinaMarkdown: string, postId: string)
   const isMainPost = urlSource.includes(`/post/${postId}`) || urlSource.includes(`/t/${postId}`)
 
   let bodyFull: string
-  const firstPostIdMatch = /\/post\/([A-Za-z0-9_-]+)\b/i.exec(contentArea)
-  const firstIdInContent = firstPostIdMatch?.[1]
   const isTargetAtTop = !firstIdInContent || firstIdInContent.toLowerCase() === postId.toLowerCase()
 
-  if (isMainPost && isTargetAtTop && !firstMatchIsService) {
+  // 3. Decidir el punto de inicio (bodyFull)
+  if (isMainPost && isTargetAtTop && !firstIdIsActuallyService) {
+    // Caso A: Somos el post principal y estamos al inicio (sin breadcrumbs que nos precedan)
     bodyFull = contentArea
-  } else if (anchorMatch) {
-    let actualIndex = anchorMatch.index
-    // Si el primer match de ID es un enlace de servicio (breadcrumb), buscamos el siguiente
-    if (firstMatchIsService) {
-      const remaining = contentArea.slice(anchorMatch.index + anchorMatch[0].length)
-      const nextMatch = anchorRe.exec(remaining)
-      if (nextMatch) {
-        actualIndex = anchorMatch.index + anchorMatch[0].length + nextMatch.index
+  } else {
+    // Buscamos si existe un ancla REAL (una mención al post que no sea el breadcrumb térmico de Jina)
+    let realAnchorIndex = -1
+    if (anchorMatch) {
+      if (!firstIdIsActuallyService || anchorMatch.index !== firstPostIdMatch?.index) {
+        // El match encontrado no es el service link inicial, es un ancla real
+        realAnchorIndex = anchorMatch.index
+      } else {
+        // El primer match era un breadcrumb, buscamos el siguiente (el real en el cuerpo)
+        const remaining = contentArea.slice(anchorMatch.index + anchorMatch[0].length)
+        const nextMatch = anchorRe.exec(remaining)
+        if (nextMatch) {
+          realAnchorIndex = anchorMatch.index + anchorMatch[0].length + nextMatch.index
+        }
       }
     }
-    const anchorLineEnd = contentArea.indexOf('\n', actualIndex)
-    const bodyStart = anchorLineEnd >= 0 ? anchorLineEnd + 1 : actualIndex + anchorMatch[0].length
-    bodyFull = contentArea.slice(bodyStart)
-  } else {
-    // Si no hay anchor y no es el principal, bajamos al modo resiliente (toda el área)
-    bodyFull = contentArea
+
+    if (realAnchorIndex >= 0) {
+      // Caso B: Hemos encontrado el ancla real del sub-post
+      const anchorLineEnd = contentArea.indexOf('\n', realAnchorIndex)
+      const bodyStart = anchorLineEnd >= 0 ? anchorLineEnd + 1 : realAnchorIndex + anchorMatch![0].length
+      bodyFull = contentArea.slice(bodyStart)
+    } else if (isMainPost && allPostIdMatches.length > 0) {
+      // Caso C: No hay ancla real pero hay hermanos. Somos el post "activo" (el que se está visitando).
+      // Su contenido suele ir después del último hermano enlazado.
+      const lastMatch = allPostIdMatches[allPostIdMatches.length - 1]
+      const lastIndex = lastMatch.index!
+      const lineEnd = contentArea.indexOf('\n', lastIndex)
+      const bodyStart = lineEnd >= 0 ? lineEnd + 1 : lastIndex + lastMatch[0].length
+      bodyFull = contentArea.slice(bodyStart)
+    } else {
+      // Caso D: Resiliencia (tomamos todo el área)
+      bodyFull = contentArea
+    }
   }
 
   // 4. Acotar el cuerpo hasta el SIGUIENTE /post/ID o sección "Related threads"
@@ -283,54 +320,63 @@ export function extractPostSectionFromJina(jinaMarkdown: string, postId: string)
   // si aparece más adelante (ej. en enlaces de imágenes).
   const nextPostRe = new RegExp(`\\/post\\/(?!${postId}\\b)[A-Za-z0-9_-]+\\b`, 'i')
   const sectionEndRe = new RegExp(
-    `${nextPostRe.source}|\\nRelated threads\\b|\\nRelated posts\\b|\\nLog in or sign up\\b|\\nContinue with Instagram\\b`,
+    `${nextPostRe.source}|\\n(?:###\\s+)?(?:Destacadas|Ver actividad|Replies|More replies|Activity|Respuesta de|Replies from|Replying to)\\b|\\nRelated threads\\b|\\nRelated posts\\b|\\nLog in or sign up\\b|\\nContinue with Instagram\\b|\\nLog in to see more replies\\b`,
     'i'
   )
   const sectionEndMatch = sectionEndRe.exec(bodyFull)
   const body = sectionEndMatch ? bodyFull.slice(0, sectionEndMatch.index) : bodyFull.slice(0, 2500)
 
-  // 5. Extraer texto: primera línea útil del cuerpo acotado
-  const lines = body.replace(/\r/g, '').split('\n').map((l) => l.trim())
-  let text: string | undefined
+  // 5. Extraer texto acumulando todas las líneas útiles del bloque acotado
+  const lines = body.replace(/\r/g, '').split('\n').map((lineText: string) => lineText.trim())
+  const validLines: string[] = []
+
   for (const line of lines) {
     if (!line) continue
     const candidate = normalizeCandidateLine(line)
-    if (candidate.length < 6) continue
+
+    if (candidate.length < 3) continue // Bajamos el umbral para capturar texto corto/handles
     if (/^!\[/.test(candidate)) continue
     if (isImageAltNoise(candidate)) continue
     if (isGenericThreadsText(candidate)) continue
     if (isThreadPositionNoise(candidate)) continue
     if (/^\d+$/.test(candidate)) continue
     if (/^https?:\/\//i.test(candidate)) continue
     if (/^title:|^url source:|^markdown content:/i.test(candidate)) continue
+
     if (/^\[/.test(candidate)) {
       const fullMatch = /^\[([^\]]+)\]\((https?:\/\/[^)]+)\)/.exec(candidate)
       if (!fullMatch) continue
       const [, displayText, rawUrl] = fullMatch
       if (displayText?.startsWith('!')) continue
+
       const resolved = resolveThreadsTrackingUrl(rawUrl)
       const urlText = resolved ?? rawUrl.replace(/^https?:\/\//i, '')
-      if (urlText.length >= 6 && !/^@/.test(urlText) && !/threads\.(?:net|com)/i.test(urlText)) {
-        text = urlText
-        break
+
+      // Permitimos handles (@) si están en el cuerpo del mensaje
+      if (urlText.length >= 2 && !/threads\.(?:net|com)/i.test(urlText)) {
+        validLines.push(urlText)
+        continue
       }
-      if (displayText && displayText.length >= 6 && !/^@/.test(displayText) && !/^\d+[kKmMbB]?$/.test(displayText)) {
+      if (displayText && displayText.length >= 2 && !/^\d+[kKmMbB]?$/.test(displayText)) {
         if (!isThreadPositionNoise(displayText)) {
-          text = displayText
-          break
+          validLines.push(displayText)
+          continue
         }
       }
       continue
     }
-    text = candidate
-    break
+
+    validLines.push(candidate)
   }
 
-  // 6. Extraer media del cuerpo acotado (sin solaparse con otros posts)
-  const mediaUrls = [
+  const text = validLines.join(' ').trim() || undefined
+
+  // 6. Extraer media del cuerpo acotado deduplicando URLs
+  const rawMediaUrls = [
     ...extractMediaFromText(body),
     ...extractEscapedMediaFromText(body),
   ]
+  const mediaUrls = Array.from(new Set(rawMediaUrls.filter((u) => u && u.startsWith('http'))))
 
   return { text, mediaUrls }
 }
@@ -465,10 +511,10 @@ function isLikelyThreadsPostUrl(url?: string): boolean {
 function isGenericThreadsText(value?: string): boolean {
   if (!value) return false
   const v = value.trim()
-  // Etiquetas de metadatos de perfil que Jina renderiza como bullets antes del texto real
-  if (/^(?:author|follow|followers?|following|published|likes?|reposts?|replies|related\s+threads|related\s+posts)$/i.test(v)) return true
-  // Strings de login/auth page — red de seguridad por si isJinaLoginPage no la detectó
-  return /join threads to share ideas|threads\s*•\s*log in|log in or sign up|log in with (?:your\s+)?(?:instagram|facebook|username)|log in to (?:threads|instagram)|sign in (?:with|to) (?:instagram|facebook|threads)|sign up for threads|continue with (?:instagram|facebook)|create (?:a )?new account|forgot (?:your )?password|don'?t have an account|join the conversation|see what people are talking about|what'?s on your mind/i.test(v)
+  // Etiquetas de metadatos de perfil/post que Jina renderiza
+  if (/^(?:author|follow|followers?|following|published|likes?|reposts?|replies|related\s+threads|related\s+posts|destacadas|ver\s+actividad)$/i.test(v)) return true
+  // Strings de login/auth o navegación de hilos
+  return /log in to see more replies|sign in (?:with|to) (?:instagram|facebook|threads)|join threads to share ideas|threads\s*•\s*log in|log in or sign up|log in with (?:your\s+)?(?:instagram|facebook|username)|sign up for threads|continue with (?:instagram|facebook)|create (?:a )?new account|forgot (?:your )?password|don'?t have an account|join the conversation|see what people are talking about|what'?s on your mind/i.test(v)
 }
 
 /*
@@ -531,6 +577,7 @@ function normalizeCandidateLine(line: string): string {
 function isThreadPositionNoise(line: string): boolean {
   return /^post\s+\d+\s+de\s+\d+$/i.test(line)
     || /^\d+\s+de\s+\d+$/i.test(line)
+    || /^\d+\/\d+$/i.test(line)
     || /^Thread\s*[-—]+\s*[\d.kmb\s]+\s*views$/i.test(line)
 }