Skip to content

Commit ed298cd

Browse files
MaojiaShengopenviking
andauthored
fix: optimize markdown slicing, and fix a rust CLI bug for compability (#1919)
Co-authored-by: openviking <openviking@example.com>
1 parent 7d5fa62 commit ed298cd

7 files changed

Lines changed: 142 additions & 861 deletions

File tree

crates/ov_cli/src/client.rs

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -410,14 +410,27 @@ impl HttpClient {
410410
) -> Result<serde_json::Value> {
411411
let path_obj = Path::new(path);
412412

413-
// Determine effective parent and create_parent flag
413+
// Determine effective parent and create_parent flag.
414+
// Only send create_parent when the user explicitly selected
415+
// --parent-auto-create, so older servers that do not support the
416+
// field still accept the request.
414417
let (effective_parent, create_parent) = match (parent, parent_auto_create) {
415418
(Some(p), None) => (Some(p), false),
416419
(None, Some(p)) => (Some(p), true),
417420
(None, None) => (None, false),
418421
(Some(_), Some(_)) => unreachable!("handled in cli"),
419422
};
420423

424+
let build_body = |base: serde_json::Value| {
425+
let mut body = base;
426+
if create_parent {
427+
body.as_object_mut()
428+
.expect("add_resource request body must be an object")
429+
.insert("create_parent".to_string(), serde_json::Value::Bool(true));
430+
}
431+
body
432+
};
433+
421434
if path_obj.exists() {
422435
if path_obj.is_dir() {
423436
let source_name = path_obj
@@ -435,12 +448,11 @@ impl HttpClient {
435448
self.upload_temp_file(zip_file.path()).await?
436449
};
437450

438-
let body = serde_json::json!({
451+
let body = build_body(serde_json::json!({
439452
"temp_file_id": temp_file_id,
440453
"source_name": source_name,
441454
"to": to,
442455
"parent": effective_parent,
443-
"create_parent": create_parent,
444456
"reason": reason,
445457
"instruction": instruction,
446458
"wait": wait,
@@ -451,7 +463,7 @@ impl HttpClient {
451463
"exclude": exclude,
452464
"directly_upload_media": directly_upload_media,
453465
"watch_interval": watch_interval,
454-
});
466+
}));
455467

456468
let dynamic_timeout = TimeoutConfig::for_resource_processing().calculate(zip_file.path())?;
457469
self.base.post_with_timeout("/api/v1/resources", &body, dynamic_timeout).await
@@ -466,12 +478,11 @@ impl HttpClient {
466478
self.upload_temp_file(path_obj).await?
467479
};
468480

469-
let body = serde_json::json!({
481+
let body = build_body(serde_json::json!({
470482
"temp_file_id": temp_file_id,
471483
"source_name": source_name,
472484
"to": to,
473485
"parent": effective_parent,
474-
"create_parent": create_parent,
475486
"reason": reason,
476487
"instruction": instruction,
477488
"wait": wait,
@@ -482,16 +493,15 @@ impl HttpClient {
482493
"exclude": exclude,
483494
"directly_upload_media": directly_upload_media,
484495
"watch_interval": watch_interval,
485-
});
496+
}));
486497

487498
let dynamic_timeout = TimeoutConfig::for_resource_processing().calculate(path_obj)?;
488499
self.base.post_with_timeout("/api/v1/resources", &body, dynamic_timeout).await
489500
} else {
490-
let body = serde_json::json!({
501+
let body = build_body(serde_json::json!({
491502
"path": path,
492503
"to": to,
493504
"parent": effective_parent,
494-
"create_parent": create_parent,
495505
"reason": reason,
496506
"instruction": instruction,
497507
"wait": wait,
@@ -502,16 +512,15 @@ impl HttpClient {
502512
"exclude": exclude,
503513
"directly_upload_media": directly_upload_media,
504514
"watch_interval": watch_interval,
505-
});
515+
}));
506516

507517
self.post("/api/v1/resources", &body).await
508518
}
509519
} else {
510-
let body = serde_json::json!({
520+
let body = build_body(serde_json::json!({
511521
"path": path,
512522
"to": to,
513523
"parent": effective_parent,
514-
"create_parent": create_parent,
515524
"reason": reason,
516525
"instruction": instruction,
517526
"wait": wait,
@@ -522,7 +531,7 @@ impl HttpClient {
522531
"exclude": exclude,
523532
"directly_upload_media": directly_upload_media,
524533
"watch_interval": watch_interval,
525-
});
534+
}));
526535

527536
self.post("/api/v1/resources", &body).await
528537
}

openviking/parse/parsers/markdown.py

Lines changed: 111 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ class MarkdownParser(BaseParser):
8686
"""
8787

8888
# Configuration constants
89-
DEFAULT_MAX_SECTION_SIZE = 1024 # Maximum tokens per section
89+
DEFAULT_MAX_SECTION_SIZE = 2048 # Maximum tokens per section
9090
DEFAULT_MIN_SECTION_TOKENS = 512 # Minimum tokens to create a separate section
9191
MAX_MERGED_FILENAME_LENGTH = 32 # Maximum length for merged section filenames
9292

@@ -372,10 +372,16 @@ def _smart_split_content(self, content: str, max_size: int) -> List[str]:
372372
para_tokens = self._estimate_token_count(para)
373373
para_len = len(para)
374374

375-
# Single paragraph too long (by tokens or chars): force split by characters
375+
# Single paragraph too long (by tokens or chars): force split by characters.
376+
# If the already accumulated prefix is very short, merge it into this
377+
# oversized paragraph first so we do not create a low-value tiny chunk
378+
# like `section_1.md` that only contains the heading/introduction.
376379
if para_tokens > max_size or para_len > max_chars:
377380
if current:
378-
parts.append(current.strip())
381+
if current_tokens < self.DEFAULT_MIN_SECTION_TOKENS:
382+
para = current + "\n\n" + para
383+
else:
384+
parts.append(current.strip())
379385
current = ""
380386
current_tokens = 0
381387
for i in range(0, len(para), max_chars):
@@ -391,7 +397,13 @@ def _smart_split_content(self, content: str, max_size: int) -> List[str]:
391397
current_tokens += para_tokens
392398

393399
if current.strip():
394-
parts.append(current.strip())
400+
# Avoid emitting a tiny trailing chunk when all earlier content has
401+
# already been split out (for example, a huge paragraph followed by
402+
# a short "no data" tail). Fold the tail back into the previous part.
403+
if parts and current_tokens < self.DEFAULT_MIN_SECTION_TOKENS:
404+
parts[-1] = f"{parts[-1]}\n\n{current.strip()}".strip()
405+
else:
406+
parts.append(current.strip())
395407

396408
return parts if parts else [content]
397409

@@ -525,35 +537,117 @@ async def _process_sections_with_merge(
525537
]
526538

527539
pending = []
540+
buffered_section = None
541+
542+
async def flush_buffered() -> None:
543+
nonlocal buffered_section
544+
if buffered_section is not None:
545+
await self._save_section(
546+
content,
547+
headings,
548+
parent_dir,
549+
buffered_section,
550+
max_size,
551+
min_size,
552+
)
553+
buffered_section = None
554+
528555
for sec in expanded:
529556
name, tokens, content_text = sec["name"], sec["tokens"], sec["content"]
530557
has_children = sec["has_children"]
531558

532559
# Handle small sections
533560
if tokens < min_size:
534-
pending = await self._try_add_to_pending(
535-
viking_fs, parent_dir, pending, (name, content_text, tokens), max_size
536-
)
561+
if pending and sum(t for _, _, t in pending) + tokens > max_size:
562+
await flush_buffered()
563+
await self._save_merged(viking_fs, parent_dir, pending)
564+
pending = []
565+
pending.append((name, content_text, tokens))
537566
continue
538567

539-
# Try merge with pending
540-
if pending and self._can_merge(pending, tokens, max_size, has_children):
541-
pending.append((name, content_text, tokens))
568+
if pending:
569+
await flush_buffered()
570+
571+
# Try merge with pending
572+
if self._can_merge(pending, tokens, max_size, has_children):
573+
pending.append((name, content_text, tokens))
574+
await self._save_merged(viking_fs, parent_dir, pending)
575+
pending = []
576+
continue
577+
578+
# Avoid flushing a single tiny section as a standalone low-value file.
579+
if self._should_merge_pending_into_next(pending):
580+
sec = self._merge_pending_into_next_section(pending, sec)
581+
pending = []
582+
else:
583+
await self._save_merged(viking_fs, parent_dir, pending)
584+
pending = []
585+
else:
586+
await flush_buffered()
587+
588+
buffered_section = sec
589+
590+
if pending:
591+
# No next section exists. Fold a single tiny pending section back into
592+
# the previous saved candidate instead of emitting a standalone file.
593+
if buffered_section is not None and self._should_merge_pending_into_next(pending):
594+
buffered_section = self._merge_pending_into_previous_section(
595+
buffered_section, pending
596+
)
597+
pending = []
598+
else:
599+
await flush_buffered()
542600
await self._save_merged(viking_fs, parent_dir, pending)
543601
pending = []
544-
continue
545-
546-
# Save pending and process current section
547-
pending = await self._flush_pending(viking_fs, parent_dir, pending)
548-
await self._save_section(content, headings, parent_dir, sec, max_size, min_size)
549602

550-
# Save remaining pending
551-
await self._flush_pending(viking_fs, parent_dir, pending)
603+
await flush_buffered()
552604

553605
def _can_merge(self, pending: List, tokens: int, max_size: int, has_children: bool) -> bool:
554606
"""Check if section can merge with pending."""
555607
return sum(t for _, _, t in pending) + tokens <= max_size and not has_children
556608

609+
def _should_merge_pending_into_next(self, pending: List[Tuple[str, str, int]]) -> bool:
610+
"""Prefer folding a single tiny pending section into the next section."""
611+
return len(pending) == 1 and pending[0][2] <= self.DEFAULT_MIN_SECTION_TOKENS
612+
613+
def _merge_pending_into_next_section(
614+
self, pending: List[Tuple[str, str, int]], section: Dict[str, Any]
615+
) -> Dict[str, Any]:
616+
"""Attach a tiny pending section to the following section."""
617+
_, pending_content, _ = pending[0]
618+
merged = dict(section)
619+
merged["content"] = f"{pending_content}\n\n{section['content']}".strip()
620+
merged["tokens"] = self._estimate_token_count(merged["content"])
621+
622+
if merged.get("has_children"):
623+
direct_content = section.get("direct_content", "").strip()
624+
merged["direct_content"] = (
625+
f"{pending_content}\n\n{direct_content}".strip()
626+
if direct_content
627+
else pending_content
628+
)
629+
630+
return merged
631+
632+
def _merge_pending_into_previous_section(
633+
self, section: Dict[str, Any], pending: List[Tuple[str, str, int]]
634+
) -> Dict[str, Any]:
635+
"""Attach a tiny trailing pending section back into the previous section."""
636+
_, pending_content, _ = pending[0]
637+
merged = dict(section)
638+
merged["content"] = f"{section['content']}\n\n{pending_content}".strip()
639+
merged["tokens"] = self._estimate_token_count(merged["content"])
640+
641+
if merged.get("has_children"):
642+
direct_content = section.get("direct_content", "").strip()
643+
merged["direct_content"] = (
644+
f"{direct_content}\n\n{pending_content}".strip()
645+
if direct_content
646+
else pending_content
647+
)
648+
649+
return merged
650+
557651
async def _try_add_to_pending(
558652
self, viking_fs, parent_dir: str, pending: List, item: Tuple, max_size: int
559653
) -> List:

openviking/storage/viking_fs.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1123,16 +1123,17 @@ async def _walk(current_path: str, current_rel: str, current_depth: int):
11231123
if name in [".", ".."]:
11241124
continue
11251125
rel_path = f"{current_rel}/{name}" if current_rel else name
1126+
is_dir = entry.get("isDir", False)
11261127
new_entry = {
11271128
"uri": self._path_to_uri(f"{current_path}/{name}", ctx=ctx),
1128-
"size": entry.get("size", 0),
1129-
"isDir": entry.get("isDir", False),
1129+
"size": 0 if is_dir else entry.get("size", 0),
1130+
"isDir": is_dir,
11301131
"modTime": format_simplified(parse_iso_datetime(entry.get("modTime", "")), now),
11311132
}
11321133
new_entry["rel_path"] = rel_path
11331134
if not self._is_accessible(new_entry["uri"], real_ctx):
11341135
continue
1135-
if entry.get("isDir"):
1136+
if is_dir:
11361137
all_entries.append(new_entry)
11371138
await _walk(f"{current_path}/{name}", rel_path, current_depth + 1)
11381139
elif not name.startswith("."):
@@ -2212,15 +2213,16 @@ async def _ls_agent(
22122213
parsed_time = parse_iso_datetime(raw_time)
22132214
elif isinstance(entry.get("mtime"), (int, float)):
22142215
parsed_time = datetime.fromtimestamp(entry["mtime"], tz=timezone.utc)
2216+
is_dir = entry.get("isDir", False)
22152217
new_entry = {
22162218
"uri": self._path_to_uri(f"{path}/{name}", ctx=ctx),
2217-
"size": entry.get("size", 0),
2218-
"isDir": entry.get("isDir", False),
2219+
"size": 0 if is_dir else entry.get("size", 0),
2220+
"isDir": is_dir,
22192221
"modTime": format_simplified(parsed_time, now),
22202222
}
22212223
if not self._is_accessible(new_entry["uri"], real_ctx):
22222224
continue
2223-
if entry.get("isDir"):
2225+
if is_dir:
22242226
all_entries.append(new_entry)
22252227
elif not name.startswith("."):
22262228
all_entries.append(new_entry)

openviking_cli/utils/config/parser_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class ParserConfig:
3737
encoding: str = "utf-8"
3838

3939
# Smart splitting configuration
40-
max_section_size: int = 1000 # Maximum tokens per section before splitting
40+
max_section_size: int = 2048 # Maximum tokens per section before splitting
4141
section_size_flexibility: float = 0.3 # Allow 30% overflow to maintain coherence
4242
max_section_chars: int = (
4343
6000 # Hard character limit per section (guards against token estimation errors)

0 commit comments

Comments
 (0)