@@ -152,24 +152,32 @@ export function extractPostSectionMedia(jinaMarkdown: string, postId: string | n
152152 // [Thread ------ 1.1K views](https://www.threads.com/@user/post/postId)
153153 // Ese enlace falso ancla la extracción al inicio del documento (solapando con el padre).
154154 // Omitimos esos enlaces de metadatos al buscar el ancla real.
155- const serviceLinkRe = new RegExp ( `^\\[[^\\]]+ \\]\\(https?://[^)]+/post/${ postId } [^)]*\\)` , 'im' )
155+ const serviceLinkRe = new RegExp ( `^\\[[^\\]]*Thread[^\\]]* \\]\\(https?://[^)]+/post/${ postId } [^)]*\\)` , 'im' )
156156
157157 // Si el postId es el principal (header), tomamos desde el inicio (para posts sueltos).
158158 const urlSourceMatch = / ^ U R L S o u r c e : \s * ( .+ ) $ / im. exec ( jinaMarkdown )
159159 const urlSource = urlSourceMatch ?. [ 1 ] ?. trim ( ) ?? ''
160160 const isMainPost = urlSource . includes ( `/post/${ postId } ` ) || urlSource . includes ( `/t/${ postId } ` )
161161
162162 const postMatch = new RegExp ( `/post/${ postId } \\b` , 'i' ) . exec ( searchArea )
163- const firstPostIdMatch = / \/ p o s t \/ ( [ A - Z a - z 0 - 9 _ - ] + ) \b / i. exec ( searchArea )
163+ const allPostIdMatches = Array . from ( searchArea . matchAll ( / \/ p o s t \/ ( [ A - Z a - z 0 - 9 _ - ] + ) \b / gi) )
164+ const firstPostIdMatch = allPostIdMatches [ 0 ]
164165 const firstIdInContent = firstPostIdMatch ?. [ 1 ]
165166
166167 // PBL: Solo empezamos desde el inicio si somos el post principal Y además
167- // no hay otro ID de post visible en los primeros 600 chars que no sea un breadcrumb.
168+ // no hay otro ID de post visible en los primeros 1000 chars que no sea un breadcrumb.
168169 const isTargetAtTop = ! firstIdInContent || firstIdInContent . toLowerCase ( ) === postId . toLowerCase ( )
169170
170- // Comprobar si el primer match es un link de servicio (breadcrumb)
171- const firstMatchIsService = serviceLinkRe . test ( searchArea . slice ( 0 , 1000 ) )
172- const shouldStartFromTop = isMainPost && isTargetAtTop && ! firstMatchIsService
171+ // Comprobar si el primer match de ID pertenece a un link de servicio (breadcrumb)
172+ let firstIdIsActuallyService = false
173+ if ( firstPostIdMatch ) {
174+ const lineStart = searchArea . lastIndexOf ( '\n' , firstPostIdMatch . index )
175+ const lineEnd = searchArea . indexOf ( '\n' , firstPostIdMatch . index ! )
176+ const line = searchArea . slice ( lineStart >= 0 ? lineStart : 0 , lineEnd >= 0 ? lineEnd : searchArea . length )
177+ firstIdIsActuallyService = serviceLinkRe . test ( line )
178+ }
179+
180+ const shouldStartFromTop = isMainPost && isTargetAtTop && ! firstIdIsActuallyService
173181
174182 if ( shouldStartFromTop || ! postMatch ) {
175183 const postSection = searchArea . slice ( 0 , 4000 )
@@ -179,14 +187,14 @@ export function extractPostSectionMedia(jinaMarkdown: string, postId: string | n
179187 ]
180188 }
181189
182- if ( postMatch ) {
190+ if ( firstPostIdMatch ) {
183191 // Si el match es un link de servicio, buscamos el SIGUIENTE match
184- let actualIndex = postMatch . index
185- if ( firstMatchIsService ) {
186- const remaining = searchArea . slice ( postMatch . index + postMatch [ 0 ] . length )
192+ let actualIndex = firstPostIdMatch . index !
193+ if ( firstIdIsActuallyService ) {
194+ const remaining = searchArea . slice ( firstPostIdMatch . index ! + firstPostIdMatch [ 0 ] . length )
187195 const nextMatch = new RegExp ( `/post/${ postId } \\b` , 'i' ) . exec ( remaining )
188196 if ( nextMatch ) {
189- actualIndex = postMatch . index + postMatch [ 0 ] . length + nextMatch . index
197+ actualIndex = firstPostIdMatch . index ! + firstPostIdMatch [ 0 ] . length + nextMatch . index
190198 }
191199 }
192200
@@ -239,8 +247,19 @@ export function extractPostSectionFromJina(jinaMarkdown: string, postId: string)
239247 : jinaMarkdown
240248
241249 // 2. Encontrar la línea ancla del sub-post en el área de contenido
242- const serviceLinkRe = new RegExp ( `^\\[[^\\]]+\\]\\(https?://[^)]+/post/${ postId } [^)]*\\)` , 'im' )
243- const firstMatchIsService = serviceLinkRe . test ( contentArea . slice ( 0 , 1000 ) )
250+ const serviceLinkRe = new RegExp ( `^\\[[^\\]]*Thread[^\\]]*\\]\\(https?://[^)]+/post/${ postId } [^)]*\\)` , 'im' )
251+
252+ const allPostIdMatches = Array . from ( contentArea . matchAll ( / \/ p o s t \/ ( [ A - Z a - z 0 - 9 _ - ] + ) \b / gi) )
253+ const firstPostIdMatch = allPostIdMatches [ 0 ]
254+ const firstIdInContent = firstPostIdMatch ?. [ 1 ]
255+
256+ let firstIdIsActuallyService = false
257+ if ( firstPostIdMatch ) {
258+ const lineStart = contentArea . lastIndexOf ( '\n' , firstPostIdMatch . index )
259+ const lineEnd = contentArea . indexOf ( '\n' , firstPostIdMatch . index ! )
260+ const line = contentArea . slice ( lineStart >= 0 ? lineStart : 0 , lineEnd >= 0 ? lineEnd : contentArea . length )
261+ firstIdIsActuallyService = serviceLinkRe . test ( line )
262+ }
244263
245264 const anchorRe = new RegExp ( `/post/${ postId } \\b` , 'i' )
246265 const anchorMatch = anchorRe . exec ( contentArea )
@@ -253,28 +272,46 @@ export function extractPostSectionFromJina(jinaMarkdown: string, postId: string)
253272 const isMainPost = urlSource . includes ( `/post/${ postId } ` ) || urlSource . includes ( `/t/${ postId } ` )
254273
255274 let bodyFull : string
256- const firstPostIdMatch = / \/ p o s t \/ ( [ A - Z a - z 0 - 9 _ - ] + ) \b / i. exec ( contentArea )
257- const firstIdInContent = firstPostIdMatch ?. [ 1 ]
258275 const isTargetAtTop = ! firstIdInContent || firstIdInContent . toLowerCase ( ) === postId . toLowerCase ( )
259276
260- if ( isMainPost && isTargetAtTop && ! firstMatchIsService ) {
277+ // 3. Decidir el punto de inicio (bodyFull)
278+ if ( isMainPost && isTargetAtTop && ! firstIdIsActuallyService ) {
279+ // Caso A: Somos el post principal y estamos al inicio (sin breadcrumbs que nos precedan)
261280 bodyFull = contentArea
262- } else if ( anchorMatch ) {
263- let actualIndex = anchorMatch . index
264- // Si el primer match de ID es un enlace de servicio (breadcrumb), buscamos el siguiente
265- if ( firstMatchIsService ) {
266- const remaining = contentArea . slice ( anchorMatch . index + anchorMatch [ 0 ] . length )
267- const nextMatch = anchorRe . exec ( remaining )
268- if ( nextMatch ) {
269- actualIndex = anchorMatch . index + anchorMatch [ 0 ] . length + nextMatch . index
281+ } else {
282+ // Buscamos si existe un ancla REAL (una mención al post que no sea el breadcrumb térmico de Jina)
283+ let realAnchorIndex = - 1
284+ if ( anchorMatch ) {
285+ if ( ! firstIdIsActuallyService || anchorMatch . index !== firstPostIdMatch ?. index ) {
286+ // El match encontrado no es el service link inicial, es un ancla real
287+ realAnchorIndex = anchorMatch . index
288+ } else {
289+ // El primer match era un breadcrumb, buscamos el siguiente (el real en el cuerpo)
290+ const remaining = contentArea . slice ( anchorMatch . index + anchorMatch [ 0 ] . length )
291+ const nextMatch = anchorRe . exec ( remaining )
292+ if ( nextMatch ) {
293+ realAnchorIndex = anchorMatch . index + anchorMatch [ 0 ] . length + nextMatch . index
294+ }
270295 }
271296 }
272- const anchorLineEnd = contentArea . indexOf ( '\n' , actualIndex )
273- const bodyStart = anchorLineEnd >= 0 ? anchorLineEnd + 1 : actualIndex + anchorMatch [ 0 ] . length
274- bodyFull = contentArea . slice ( bodyStart )
275- } else {
276- // Si no hay anchor y no es el principal, bajamos al modo resiliente (toda el área)
277- bodyFull = contentArea
297+
298+ if ( realAnchorIndex >= 0 ) {
299+ // Caso B: Hemos encontrado el ancla real del sub-post
300+ const anchorLineEnd = contentArea . indexOf ( '\n' , realAnchorIndex )
301+ const bodyStart = anchorLineEnd >= 0 ? anchorLineEnd + 1 : realAnchorIndex + anchorMatch ! [ 0 ] . length
302+ bodyFull = contentArea . slice ( bodyStart )
303+ } else if ( isMainPost && allPostIdMatches . length > 0 ) {
304+ // Caso C: No hay ancla real pero hay hermanos. Somos el post "activo" (el que se está visitando).
305+ // Su contenido suele ir después del último hermano enlazado.
306+ const lastMatch = allPostIdMatches [ allPostIdMatches . length - 1 ]
307+ const lastIndex = lastMatch . index !
308+ const lineEnd = contentArea . indexOf ( '\n' , lastIndex )
309+ const bodyStart = lineEnd >= 0 ? lineEnd + 1 : lastIndex + lastMatch [ 0 ] . length
310+ bodyFull = contentArea . slice ( bodyStart )
311+ } else {
312+ // Caso D: Resiliencia (tomamos todo el área)
313+ bodyFull = contentArea
314+ }
278315 }
279316
280317 // 4. Acotar el cuerpo hasta el SIGUIENTE /post/ID o sección "Related threads"
@@ -283,54 +320,63 @@ export function extractPostSectionFromJina(jinaMarkdown: string, postId: string)
283320 // si aparece más adelante (ej. en enlaces de imágenes).
284321 const nextPostRe = new RegExp ( `\\/post\\/(?!${ postId } \\b)[A-Za-z0-9_-]+\\b` , 'i' )
285322 const sectionEndRe = new RegExp (
286- `${ nextPostRe . source } |\\nRelated threads\\b|\\nRelated posts\\b|\\nLog in or sign up\\b|\\nContinue with Instagram\\b` ,
323+ `${ nextPostRe . source } |\\n(?:###\\s+)?(?:Destacadas|Ver actividad|Replies|More replies|Activity|Respuesta de|Replies from|Replying to)\\b|\\ nRelated threads\\b|\\nRelated posts\\b|\\nLog in or sign up\\b|\\nContinue with Instagram\\b|\\nLog in to see more replies \\b` ,
287324 'i'
288325 )
289326 const sectionEndMatch = sectionEndRe . exec ( bodyFull )
290327 const body = sectionEndMatch ? bodyFull . slice ( 0 , sectionEndMatch . index ) : bodyFull . slice ( 0 , 2500 )
291328
292- // 5. Extraer texto: primera línea útil del cuerpo acotado
293- const lines = body . replace ( / \r / g, '' ) . split ( '\n' ) . map ( ( l ) => l . trim ( ) )
294- let text : string | undefined
329+ // 5. Extraer texto acumulando todas las líneas útiles del bloque acotado
330+ const lines = body . replace ( / \r / g, '' ) . split ( '\n' ) . map ( ( lineText : string ) => lineText . trim ( ) )
331+ const validLines : string [ ] = [ ]
332+
295333 for ( const line of lines ) {
296334 if ( ! line ) continue
297335 const candidate = normalizeCandidateLine ( line )
298- if ( candidate . length < 6 ) continue
336+
337+ if ( candidate . length < 3 ) continue // Bajamos el umbral para capturar texto corto/handles
299338 if ( / ^ ! \[ / . test ( candidate ) ) continue
300339 if ( isImageAltNoise ( candidate ) ) continue
301340 if ( isGenericThreadsText ( candidate ) ) continue
302341 if ( isThreadPositionNoise ( candidate ) ) continue
303342 if ( / ^ \d + $ / . test ( candidate ) ) continue
304343 if ( / ^ h t t p s ? : \/ \/ / i. test ( candidate ) ) continue
305344 if ( / ^ t i t l e : | ^ u r l s o u r c e : | ^ m a r k d o w n c o n t e n t : / i. test ( candidate ) ) continue
345+
306346 if ( / ^ \[ / . test ( candidate ) ) {
307347 const fullMatch = / ^ \[ ( [ ^ \] ] + ) \] \( ( h t t p s ? : \/ \/ [ ^ ) ] + ) \) / . exec ( candidate )
308348 if ( ! fullMatch ) continue
309349 const [ , displayText , rawUrl ] = fullMatch
310350 if ( displayText ?. startsWith ( '!' ) ) continue
351+
311352 const resolved = resolveThreadsTrackingUrl ( rawUrl )
312353 const urlText = resolved ?? rawUrl . replace ( / ^ h t t p s ? : \/ \/ / i, '' )
313- if ( urlText . length >= 6 && ! / ^ @ / . test ( urlText ) && ! / t h r e a d s \. (?: n e t | c o m ) / i. test ( urlText ) ) {
314- text = urlText
315- break
354+
355+ // Permitimos handles (@) si están en el cuerpo del mensaje
356+ if ( urlText . length >= 2 && ! / t h r e a d s \. (?: n e t | c o m ) / i. test ( urlText ) ) {
357+ validLines . push ( urlText )
358+ continue
316359 }
317- if ( displayText && displayText . length >= 6 && ! / ^ @ / . test ( displayText ) && ! / ^ \d + [ k K m M b B ] ? $ / . test ( displayText ) ) {
360+ if ( displayText && displayText . length >= 2 && ! / ^ \d + [ k K m M b B ] ? $ / . test ( displayText ) ) {
318361 if ( ! isThreadPositionNoise ( displayText ) ) {
319- text = displayText
320- break
362+ validLines . push ( displayText )
363+ continue
321364 }
322365 }
323366 continue
324367 }
325- text = candidate
326- break
368+
369+ validLines . push ( candidate )
327370 }
328371
329- // 6. Extraer media del cuerpo acotado (sin solaparse con otros posts)
330- const mediaUrls = [
372+ const text = validLines . join ( ' ' ) . trim ( ) || undefined
373+
374+ // 6. Extraer media del cuerpo acotado deduplicando URLs
375+ const rawMediaUrls = [
331376 ...extractMediaFromText ( body ) ,
332377 ...extractEscapedMediaFromText ( body ) ,
333378 ]
379+ const mediaUrls = Array . from ( new Set ( rawMediaUrls . filter ( ( u ) => u && u . startsWith ( 'http' ) ) ) )
334380
335381 return { text, mediaUrls }
336382}
@@ -465,10 +511,10 @@ function isLikelyThreadsPostUrl(url?: string): boolean {
/**
 * Labels that make up a whole line on their own: profile/post metadata that
 * Jina renders around the real post text (matched case-insensitively, anchored
 * at both ends).
 */
const GENERIC_THREADS_LABEL_RE =
  /^(?:author|follow|followers?|following|published|likes?|reposts?|replies|related\s+threads|related\s+posts|destacadas|ver\s+actividad)$/i

/**
 * Login/auth and thread-navigation phrases. These are matched anywhere inside
 * the line (not anchored), case-insensitively.
 */
const GENERIC_THREADS_PHRASE_RE = new RegExp(
  [
    'log in to see more replies',
    // This alternative was part of the list before the patterns were reordered;
    // it is kept so bare "Log in to Threads/Instagram" prompts are still filtered.
    'log in to (?:threads|instagram)',
    'sign in (?:with|to) (?:instagram|facebook|threads)',
    'join threads to share ideas',
    'threads\\s*•\\s*log in',
    'log in or sign up',
    'log in with (?:your\\s+)?(?:instagram|facebook|username)',
    'sign up for threads',
    'continue with (?:instagram|facebook)',
    'create (?:a )?new account',
    'forgot (?:your )?password',
    "don'?t have an account",
    'join the conversation',
    'see what people are talking about',
    "what'?s on your mind",
  ].join('|'),
  'i',
)

/**
 * Tells whether a line is generic Threads boilerplate rather than post content.
 *
 * @param value - Candidate line; surrounding whitespace is ignored.
 * @returns `true` when the trimmed value is exactly one of the metadata labels
 *   or contains one of the login/navigation phrases; `false` otherwise,
 *   including for `undefined` or an empty string.
 */
function isGenericThreadsText(value?: string): boolean {
  if (!value) return false
  const trimmed = value.trim()
  return GENERIC_THREADS_LABEL_RE.test(trimmed) || GENERIC_THREADS_PHRASE_RE.test(trimmed)
}
473519
474520/*
@@ -531,6 +577,7 @@ function normalizeCandidateLine(line: string): string {
/**
 * Patterns for lines that only state a position inside a thread or the
 * thread's view counter. None of them uses the `g` flag, so `test` keeps no
 * state between calls.
 */
const THREAD_POSITION_NOISE_PATTERNS: readonly RegExp[] = [
  /^post\s+\d+\s+de\s+\d+$/i, // e.g. "Post 2 de 5"
  /^\d+\s+de\s+\d+$/i, // e.g. "2 de 5"
  /^\d+\/\d+$/, // e.g. "2/5"
  /^Thread\s*[-—]+\s*[\d.kmb\s]+\s*views$/i, // e.g. "Thread ------ 1.1K views"
]

/**
 * Tells whether a line is thread-position or view-counter noise.
 *
 * @param line - Candidate line. It is trimmed first because the patterns are
 *   anchored at both ends and some callers pass raw link display text.
 * @returns `true` when the whole trimmed line matches one of the noise patterns.
 */
function isThreadPositionNoise(line: string): boolean {
  const trimmed = line.trim()
  return THREAD_POSITION_NOISE_PATTERNS.some((pattern) => pattern.test(trimmed))
}
536583
0 commit comments