Skip to content

Commit 8c94568

Browse files
committed
fix(extractor): blindaje contra mezcla de comentarios y deduplicación de media
1 parent ab50cfa commit 8c94568

1 file changed

Lines changed: 96 additions & 49 deletions

File tree

src/lib/utils/post-extractor.ts

Lines changed: 96 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -152,24 +152,32 @@ export function extractPostSectionMedia(jinaMarkdown: string, postId: string | n
152152
// [Thread ------ 1.1K views](https://www.threads.com/@user/post/postId)
153153
// Ese enlace falso ancla la extracción al inicio del documento (solapando con el padre).
154154
// Omitimos esos enlaces de metadatos al buscar el ancla real.
155-
const serviceLinkRe = new RegExp(`^\\[[^\\]]+\\]\\(https?://[^)]+/post/${postId}[^)]*\\)`, 'im')
155+
const serviceLinkRe = new RegExp(`^\\[[^\\]]*Thread[^\\]]*\\]\\(https?://[^)]+/post/${postId}[^)]*\\)`, 'im')
156156

157157
// Si el postId es el principal (header), tomamos desde el inicio (para posts sueltos).
158158
const urlSourceMatch = /^URL Source:\s*(.+)$/im.exec(jinaMarkdown)
159159
const urlSource = urlSourceMatch?.[1]?.trim() ?? ''
160160
const isMainPost = urlSource.includes(`/post/${postId}`) || urlSource.includes(`/t/${postId}`)
161161

162162
const postMatch = new RegExp(`/post/${postId}\\b`, 'i').exec(searchArea)
163-
const firstPostIdMatch = /\/post\/([A-Za-z0-9_-]+)\b/i.exec(searchArea)
163+
const allPostIdMatches = Array.from(searchArea.matchAll(/\/post\/([A-Za-z0-9_-]+)\b/gi))
164+
const firstPostIdMatch = allPostIdMatches[0]
164165
const firstIdInContent = firstPostIdMatch?.[1]
165166

166167
// PBL: Solo empezamos desde el inicio si somos el post principal Y además
167-
// no hay otro ID de post visible en los primeros 600 chars que no sea un breadcrumb.
168+
// el primer ID de post visible (si lo hay) es el nuestro y no pertenece a un breadcrumb.
168169
const isTargetAtTop = !firstIdInContent || firstIdInContent.toLowerCase() === postId.toLowerCase()
169170

170-
// Comprobar si el primer match es un link de servicio (breadcrumb)
171-
const firstMatchIsService = serviceLinkRe.test(searchArea.slice(0, 1000))
172-
const shouldStartFromTop = isMainPost && isTargetAtTop && !firstMatchIsService
171+
// Comprobar si el primer match de ID pertenece a un link de servicio (breadcrumb)
172+
let firstIdIsActuallyService = false
173+
if (firstPostIdMatch) {
174+
const lineStart = searchArea.lastIndexOf('\n', firstPostIdMatch.index)
175+
const lineEnd = searchArea.indexOf('\n', firstPostIdMatch.index!)
176+
const line = searchArea.slice(lineStart >= 0 ? lineStart : 0, lineEnd >= 0 ? lineEnd : searchArea.length)
177+
firstIdIsActuallyService = serviceLinkRe.test(line)
178+
}
179+
180+
const shouldStartFromTop = isMainPost && isTargetAtTop && !firstIdIsActuallyService
173181

174182
if (shouldStartFromTop || !postMatch) {
175183
const postSection = searchArea.slice(0, 4000)
@@ -179,14 +187,14 @@ export function extractPostSectionMedia(jinaMarkdown: string, postId: string | n
179187
]
180188
}
181189

182-
if (postMatch) {
190+
if (firstPostIdMatch) {
183191
// Si el match es un link de servicio, buscamos el SIGUIENTE match
184-
let actualIndex = postMatch.index
185-
if (firstMatchIsService) {
186-
const remaining = searchArea.slice(postMatch.index + postMatch[0].length)
192+
let actualIndex = firstPostIdMatch.index!
193+
if (firstIdIsActuallyService) {
194+
const remaining = searchArea.slice(firstPostIdMatch.index! + firstPostIdMatch[0].length)
187195
const nextMatch = new RegExp(`/post/${postId}\\b`, 'i').exec(remaining)
188196
if (nextMatch) {
189-
actualIndex = postMatch.index + postMatch[0].length + nextMatch.index
197+
actualIndex = firstPostIdMatch.index! + firstPostIdMatch[0].length + nextMatch.index
190198
}
191199
}
192200

@@ -239,8 +247,19 @@ export function extractPostSectionFromJina(jinaMarkdown: string, postId: string)
239247
: jinaMarkdown
240248

241249
// 2. Encontrar la línea ancla del sub-post en el área de contenido
242-
const serviceLinkRe = new RegExp(`^\\[[^\\]]+\\]\\(https?://[^)]+/post/${postId}[^)]*\\)`, 'im')
243-
const firstMatchIsService = serviceLinkRe.test(contentArea.slice(0, 1000))
250+
const serviceLinkRe = new RegExp(`^\\[[^\\]]*Thread[^\\]]*\\]\\(https?://[^)]+/post/${postId}[^)]*\\)`, 'im')
251+
252+
const allPostIdMatches = Array.from(contentArea.matchAll(/\/post\/([A-Za-z0-9_-]+)\b/gi))
253+
const firstPostIdMatch = allPostIdMatches[0]
254+
const firstIdInContent = firstPostIdMatch?.[1]
255+
256+
let firstIdIsActuallyService = false
257+
if (firstPostIdMatch) {
258+
const lineStart = contentArea.lastIndexOf('\n', firstPostIdMatch.index)
259+
const lineEnd = contentArea.indexOf('\n', firstPostIdMatch.index!)
260+
const line = contentArea.slice(lineStart >= 0 ? lineStart : 0, lineEnd >= 0 ? lineEnd : contentArea.length)
261+
firstIdIsActuallyService = serviceLinkRe.test(line)
262+
}
244263

245264
const anchorRe = new RegExp(`/post/${postId}\\b`, 'i')
246265
const anchorMatch = anchorRe.exec(contentArea)
@@ -253,28 +272,46 @@ export function extractPostSectionFromJina(jinaMarkdown: string, postId: string)
253272
const isMainPost = urlSource.includes(`/post/${postId}`) || urlSource.includes(`/t/${postId}`)
254273

255274
let bodyFull: string
256-
const firstPostIdMatch = /\/post\/([A-Za-z0-9_-]+)\b/i.exec(contentArea)
257-
const firstIdInContent = firstPostIdMatch?.[1]
258275
const isTargetAtTop = !firstIdInContent || firstIdInContent.toLowerCase() === postId.toLowerCase()
259276

260-
if (isMainPost && isTargetAtTop && !firstMatchIsService) {
277+
// 3. Decidir el punto de inicio (bodyFull)
278+
if (isMainPost && isTargetAtTop && !firstIdIsActuallyService) {
279+
// Caso A: Somos el post principal y estamos al inicio (sin breadcrumbs que nos precedan)
261280
bodyFull = contentArea
262-
} else if (anchorMatch) {
263-
let actualIndex = anchorMatch.index
264-
// Si el primer match de ID es un enlace de servicio (breadcrumb), buscamos el siguiente
265-
if (firstMatchIsService) {
266-
const remaining = contentArea.slice(anchorMatch.index + anchorMatch[0].length)
267-
const nextMatch = anchorRe.exec(remaining)
268-
if (nextMatch) {
269-
actualIndex = anchorMatch.index + anchorMatch[0].length + nextMatch.index
281+
} else {
282+
// Buscamos si existe un ancla REAL (una mención al post que no sea el breadcrumb de servicio de Jina)
283+
let realAnchorIndex = -1
284+
if (anchorMatch) {
285+
if (!firstIdIsActuallyService || anchorMatch.index !== firstPostIdMatch?.index) {
286+
// El match encontrado no es el service link inicial, es un ancla real
287+
realAnchorIndex = anchorMatch.index
288+
} else {
289+
// El primer match era un breadcrumb, buscamos el siguiente (el real en el cuerpo)
290+
const remaining = contentArea.slice(anchorMatch.index + anchorMatch[0].length)
291+
const nextMatch = anchorRe.exec(remaining)
292+
if (nextMatch) {
293+
realAnchorIndex = anchorMatch.index + anchorMatch[0].length + nextMatch.index
294+
}
270295
}
271296
}
272-
const anchorLineEnd = contentArea.indexOf('\n', actualIndex)
273-
const bodyStart = anchorLineEnd >= 0 ? anchorLineEnd + 1 : actualIndex + anchorMatch[0].length
274-
bodyFull = contentArea.slice(bodyStart)
275-
} else {
276-
// Si no hay anchor y no es el principal, bajamos al modo resiliente (toda el área)
277-
bodyFull = contentArea
297+
298+
if (realAnchorIndex >= 0) {
299+
// Caso B: Hemos encontrado el ancla real del sub-post
300+
const anchorLineEnd = contentArea.indexOf('\n', realAnchorIndex)
301+
const bodyStart = anchorLineEnd >= 0 ? anchorLineEnd + 1 : realAnchorIndex + anchorMatch![0].length
302+
bodyFull = contentArea.slice(bodyStart)
303+
} else if (isMainPost && allPostIdMatches.length > 0) {
304+
// Caso C: No hay ancla real pero hay hermanos. Somos el post "activo" (el que se está visitando).
305+
// Su contenido suele ir después del último hermano enlazado.
306+
const lastMatch = allPostIdMatches[allPostIdMatches.length - 1]
307+
const lastIndex = lastMatch.index!
308+
const lineEnd = contentArea.indexOf('\n', lastIndex)
309+
const bodyStart = lineEnd >= 0 ? lineEnd + 1 : lastIndex + lastMatch[0].length
310+
bodyFull = contentArea.slice(bodyStart)
311+
} else {
312+
// Caso D: Resiliencia (tomamos todo el área)
313+
bodyFull = contentArea
314+
}
278315
}
279316

280317
// 4. Acotar el cuerpo hasta el SIGUIENTE /post/ID o sección "Related threads"
@@ -283,54 +320,63 @@ export function extractPostSectionFromJina(jinaMarkdown: string, postId: string)
283320
// si aparece más adelante (ej. en enlaces de imágenes).
284321
const nextPostRe = new RegExp(`\\/post\\/(?!${postId}\\b)[A-Za-z0-9_-]+\\b`, 'i')
285322
const sectionEndRe = new RegExp(
286-
`${nextPostRe.source}|\\nRelated threads\\b|\\nRelated posts\\b|\\nLog in or sign up\\b|\\nContinue with Instagram\\b`,
323+
`${nextPostRe.source}|\\n(?:###\\s+)?(?:Destacadas|Ver actividad|Replies|More replies|Activity|Respuesta de|Replies from|Replying to)\\b|\\nRelated threads\\b|\\nRelated posts\\b|\\nLog in or sign up\\b|\\nContinue with Instagram\\b|\\nLog in to see more replies\\b`,
287324
'i'
288325
)
289326
const sectionEndMatch = sectionEndRe.exec(bodyFull)
290327
const body = sectionEndMatch ? bodyFull.slice(0, sectionEndMatch.index) : bodyFull.slice(0, 2500)
291328

292-
// 5. Extraer texto: primera línea útil del cuerpo acotado
293-
const lines = body.replace(/\r/g, '').split('\n').map((l) => l.trim())
294-
let text: string | undefined
329+
// 5. Extraer texto acumulando todas las líneas útiles del bloque acotado
330+
const lines = body.replace(/\r/g, '').split('\n').map((lineText: string) => lineText.trim())
331+
const validLines: string[] = []
332+
295333
for (const line of lines) {
296334
if (!line) continue
297335
const candidate = normalizeCandidateLine(line)
298-
if (candidate.length < 6) continue
336+
337+
if (candidate.length < 3) continue // Bajamos el umbral para capturar texto corto/handles
299338
if (/^!\[/.test(candidate)) continue
300339
if (isImageAltNoise(candidate)) continue
301340
if (isGenericThreadsText(candidate)) continue
302341
if (isThreadPositionNoise(candidate)) continue
303342
if (/^\d+$/.test(candidate)) continue
304343
if (/^https?:\/\//i.test(candidate)) continue
305344
if (/^title:|^url source:|^markdown content:/i.test(candidate)) continue
345+
306346
if (/^\[/.test(candidate)) {
307347
const fullMatch = /^\[([^\]]+)\]\((https?:\/\/[^)]+)\)/.exec(candidate)
308348
if (!fullMatch) continue
309349
const [, displayText, rawUrl] = fullMatch
310350
if (displayText?.startsWith('!')) continue
351+
311352
const resolved = resolveThreadsTrackingUrl(rawUrl)
312353
const urlText = resolved ?? rawUrl.replace(/^https?:\/\//i, '')
313-
if (urlText.length >= 6 && !/^@/.test(urlText) && !/threads\.(?:net|com)/i.test(urlText)) {
314-
text = urlText
315-
break
354+
355+
// Permitimos handles (@) si están en el cuerpo del mensaje
356+
if (urlText.length >= 2 && !/threads\.(?:net|com)/i.test(urlText)) {
357+
validLines.push(urlText)
358+
continue
316359
}
317-
if (displayText && displayText.length >= 6 && !/^@/.test(displayText) && !/^\d+[kKmMbB]?$/.test(displayText)) {
360+
if (displayText && displayText.length >= 2 && !/^\d+[kKmMbB]?$/.test(displayText)) {
318361
if (!isThreadPositionNoise(displayText)) {
319-
text = displayText
320-
break
362+
validLines.push(displayText)
363+
continue
321364
}
322365
}
323366
continue
324367
}
325-
text = candidate
326-
break
368+
369+
validLines.push(candidate)
327370
}
328371

329-
// 6. Extraer media del cuerpo acotado (sin solaparse con otros posts)
330-
const mediaUrls = [
372+
const text = validLines.join(' ').trim() || undefined
373+
374+
// 6. Extraer media del cuerpo acotado deduplicando URLs
375+
const rawMediaUrls = [
331376
...extractMediaFromText(body),
332377
...extractEscapedMediaFromText(body),
333378
]
379+
const mediaUrls = Array.from(new Set(rawMediaUrls.filter((u) => u && u.startsWith('http'))))
334380

335381
return { text, mediaUrls }
336382
}
@@ -465,10 +511,10 @@ function isLikelyThreadsPostUrl(url?: string): boolean {
465511
function isGenericThreadsText(value?: string): boolean {
466512
if (!value) return false
467513
const v = value.trim()
468-
// Etiquetas de metadatos de perfil que Jina renderiza como bullets antes del texto real
469-
if (/^(?:author|follow|followers?|following|published|likes?|reposts?|replies|related\s+threads|related\s+posts)$/i.test(v)) return true
470-
// Strings de login/auth page — red de seguridad por si isJinaLoginPage no la detectó
471-
return /join threads to share ideas|threads\s*\s*log in|log in or sign up|log in with (?:your\s+)?(?:instagram|facebook|username)|log in to (?:threads|instagram)|sign in (?:with|to) (?:instagram|facebook|threads)|sign up for threads|continue with (?:instagram|facebook)|create (?:a )?new account|forgot (?:your )?password|don'?t have an account|join the conversation|see what people are talking about|what'?s on your mind/i.test(v)
514+
// Etiquetas de metadatos de perfil/post que Jina renderiza
515+
if (/^(?:author|follow|followers?|following|published|likes?|reposts?|replies|related\s+threads|related\s+posts|destacadas|ver\s+actividad)$/i.test(v)) return true
516+
// Strings de login/auth o navegación de hilos
517+
return /log in to see more replies|sign in (?:with|to) (?:instagram|facebook|threads)|join threads to share ideas|threads\s*\s*log in|log in or sign up|log in with (?:your\s+)?(?:instagram|facebook|username)|sign up for threads|continue with (?:instagram|facebook)|create (?:a )?new account|forgot (?:your )?password|don'?t have an account|join the conversation|see what people are talking about|what'?s on your mind/i.test(v)
472518
}
473519

474520
/*
@@ -531,6 +577,7 @@ function normalizeCandidateLine(line: string): string {
531577
function isThreadPositionNoise(line: string): boolean {
532578
return /^post\s+\d+\s+de\s+\d+$/i.test(line)
533579
|| /^\d+\s+de\s+\d+$/i.test(line)
580+
|| /^\d+\/\d+$/i.test(line)
534581
|| /^Thread\s*[-]+\s*[\d.kmb\s]+\s*views$/i.test(line)
535582
}
536583

0 commit comments

Comments
 (0)