@@ -86,7 +86,7 @@ class MarkdownParser(BaseParser):
8686 """
8787
8888 # Configuration constants
89- DEFAULT_MAX_SECTION_SIZE = 1024 # Maximum tokens per section
89+ DEFAULT_MAX_SECTION_SIZE = 2048 # Maximum tokens per section
9090 DEFAULT_MIN_SECTION_TOKENS = 512 # Minimum tokens to create a separate section
9191 MAX_MERGED_FILENAME_LENGTH = 32 # Maximum length for merged section filenames
9292
@@ -372,10 +372,16 @@ def _smart_split_content(self, content: str, max_size: int) -> List[str]:
372372 para_tokens = self ._estimate_token_count (para )
373373 para_len = len (para )
374374
375- # Single paragraph too long (by tokens or chars): force split by characters
375+ # Single paragraph too long (by tokens or chars): force split by characters.
376+ # If the already accumulated prefix is very short, merge it into this
377+ # oversized paragraph first so we do not create a low-value tiny chunk
378+ # like `section_1.md` that only contains the heading/introduction.
376379 if para_tokens > max_size or para_len > max_chars :
377380 if current :
378- parts .append (current .strip ())
381+ if current_tokens < self .DEFAULT_MIN_SECTION_TOKENS :
382+ para = current + "\n \n " + para
383+ else :
384+ parts .append (current .strip ())
379385 current = ""
380386 current_tokens = 0
381387 for i in range (0 , len (para ), max_chars ):
@@ -391,7 +397,13 @@ def _smart_split_content(self, content: str, max_size: int) -> List[str]:
391397 current_tokens += para_tokens
392398
393399 if current .strip ():
394- parts .append (current .strip ())
400+ # Avoid emitting a tiny trailing chunk when all earlier content has
401+ # already been split out (for example, a huge paragraph followed by
402+ # a short "no data" tail). Fold the tail back into the previous part.
403+ if parts and current_tokens < self .DEFAULT_MIN_SECTION_TOKENS :
404+ parts [- 1 ] = f"{ parts [- 1 ]} \n \n { current .strip ()} " .strip ()
405+ else :
406+ parts .append (current .strip ())
395407
396408 return parts if parts else [content ]
397409
@@ -525,35 +537,117 @@ async def _process_sections_with_merge(
525537 ]
526538
527539 pending = []
540+ buffered_section = None
541+
542+ async def flush_buffered () -> None :
543+ nonlocal buffered_section
544+ if buffered_section is not None :
545+ await self ._save_section (
546+ content ,
547+ headings ,
548+ parent_dir ,
549+ buffered_section ,
550+ max_size ,
551+ min_size ,
552+ )
553+ buffered_section = None
554+
528555 for sec in expanded :
529556 name , tokens , content_text = sec ["name" ], sec ["tokens" ], sec ["content" ]
530557 has_children = sec ["has_children" ]
531558
532559 # Handle small sections
533560 if tokens < min_size :
534- pending = await self ._try_add_to_pending (
535- viking_fs , parent_dir , pending , (name , content_text , tokens ), max_size
536- )
561+ if pending and sum (t for _ , _ , t in pending ) + tokens > max_size :
562+ await flush_buffered ()
563+ await self ._save_merged (viking_fs , parent_dir , pending )
564+ pending = []
565+ pending .append ((name , content_text , tokens ))
537566 continue
538567
539- # Try merge with pending
540- if pending and self ._can_merge (pending , tokens , max_size , has_children ):
541- pending .append ((name , content_text , tokens ))
568+ if pending :
569+ await flush_buffered ()
570+
571+ # Try merge with pending
572+ if self ._can_merge (pending , tokens , max_size , has_children ):
573+ pending .append ((name , content_text , tokens ))
574+ await self ._save_merged (viking_fs , parent_dir , pending )
575+ pending = []
576+ continue
577+
578+ # Avoid flushing a single tiny section as a standalone low-value file.
579+ if self ._should_merge_pending_into_next (pending ):
580+ sec = self ._merge_pending_into_next_section (pending , sec )
581+ pending = []
582+ else :
583+ await self ._save_merged (viking_fs , parent_dir , pending )
584+ pending = []
585+ else :
586+ await flush_buffered ()
587+
588+ buffered_section = sec
589+
590+ if pending :
591+ # No next section exists. Fold a single tiny pending section back into
592+ # the previous saved candidate instead of emitting a standalone file.
593+ if buffered_section is not None and self ._should_merge_pending_into_next (pending ):
594+ buffered_section = self ._merge_pending_into_previous_section (
595+ buffered_section , pending
596+ )
597+ pending = []
598+ else :
599+ await flush_buffered ()
542600 await self ._save_merged (viking_fs , parent_dir , pending )
543601 pending = []
544- continue
545-
546- # Save pending and process current section
547- pending = await self ._flush_pending (viking_fs , parent_dir , pending )
548- await self ._save_section (content , headings , parent_dir , sec , max_size , min_size )
549602
550- # Save remaining pending
551- await self ._flush_pending (viking_fs , parent_dir , pending )
603+ await flush_buffered ()
552604
553605 def _can_merge (self , pending : List , tokens : int , max_size : int , has_children : bool ) -> bool :
554606 """Check if section can merge with pending."""
555607 return sum (t for _ , _ , t in pending ) + tokens <= max_size and not has_children
556608
609+ def _should_merge_pending_into_next (self , pending : List [Tuple [str , str , int ]]) -> bool :
610+ """Prefer folding a single tiny pending section into the next section."""
611+ return len (pending ) == 1 and pending [0 ][2 ] <= self .DEFAULT_MIN_SECTION_TOKENS
612+
613+ def _merge_pending_into_next_section (
614+ self , pending : List [Tuple [str , str , int ]], section : Dict [str , Any ]
615+ ) -> Dict [str , Any ]:
616+ """Attach a tiny pending section to the following section."""
617+ _ , pending_content , _ = pending [0 ]
618+ merged = dict (section )
619+ merged ["content" ] = f"{ pending_content } \n \n { section ['content' ]} " .strip ()
620+ merged ["tokens" ] = self ._estimate_token_count (merged ["content" ])
621+
622+ if merged .get ("has_children" ):
623+ direct_content = section .get ("direct_content" , "" ).strip ()
624+ merged ["direct_content" ] = (
625+ f"{ pending_content } \n \n { direct_content } " .strip ()
626+ if direct_content
627+ else pending_content
628+ )
629+
630+ return merged
631+
632+ def _merge_pending_into_previous_section (
633+ self , section : Dict [str , Any ], pending : List [Tuple [str , str , int ]]
634+ ) -> Dict [str , Any ]:
635+ """Attach a tiny trailing pending section back into the previous section."""
636+ _ , pending_content , _ = pending [0 ]
637+ merged = dict (section )
638+ merged ["content" ] = f"{ section ['content' ]} \n \n { pending_content } " .strip ()
639+ merged ["tokens" ] = self ._estimate_token_count (merged ["content" ])
640+
641+ if merged .get ("has_children" ):
642+ direct_content = section .get ("direct_content" , "" ).strip ()
643+ merged ["direct_content" ] = (
644+ f"{ direct_content } \n \n { pending_content } " .strip ()
645+ if direct_content
646+ else pending_content
647+ )
648+
649+ return merged
650+
557651 async def _try_add_to_pending (
558652 self , viking_fs , parent_dir : str , pending : List , item : Tuple , max_size : int
559653 ) -> List :
0 commit comments