Skip to content

Commit 54e70a3

Browse files
gandhipratik203Suresh Kumar Moharajan
authored andcommitted
perf(plugin): use zero-copy PyString::to_str() in process_container
Replace extract::<String>() (full copy) with PyString::cast() + to_str() (zero-copy borrow) for the string leaf path. Also skip is_numeric_string() for strings > 50 bytes, and extend the O(1) pre-check to both truncate and block modes. Results vs previous commit: - Deep nested dict: 5.4x → 7.1x faster - Wide nested dict: 6.2x → 8.4x faster - List passthrough: 9.7x → 13.6x faster - Block mode 10KB: 4.5x → 5.0x faster - All 331 Python tests + 47 Rust tests pass Signed-off-by: Pratik Gandhi <gandhipratik203@gmail.com>
1 parent e31e795 commit 54e70a3

1 file changed

Lines changed: 40 additions & 53 deletions

File tree

  • plugins_rust/output_length_guard/src

plugins_rust/output_length_guard/src/lib.rs

Lines changed: 40 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -498,13 +498,14 @@ fn process_container<'py>(
498498

499499
// ── String leaf ──
500500
//
501-
// Fast path: check Python string length via O(1) PyAny::len() BEFORE
502-
// extracting to a Rust String (which copies all bytes). If the string
503-
// is under both the char and token limits, and there's no min constraint,
504-
// we can skip the extraction entirely.
505-
if container.is_instance_of::<PyString>() {
506-
// Python str.__len__() returns the number of code points — equivalent
507-
// to Rust chars().count() for BMP, and close enough for our limits.
501+
// Optimization layers (cheapest first):
502+
// 1. O(1) Python len() pre-check — skip entirely for under-limit strings
503+
// 2. PyString::to_str() zero-copy borrow — no String allocation
504+
// 3. count_chars_capped() — O(limit) char counting
505+
// 4. Skip is_numeric_string for strings > 50 bytes
506+
// 5. byte_offset_of_char() for direct &str slicing
507+
if let Ok(py_str) = container.cast::<PyString>() {
508+
// ── Layer 1: O(1) pre-check via Python str.__len__() ──
508509
let py_len = container.len().unwrap_or(0);
509510

510511
let needs_processing = match cfg.limit_mode {
@@ -521,22 +522,20 @@ fn process_container<'py>(
521522
}
522523
};
523524

524-
// If the string doesn't need processing AND strategy is truncate
525-
// (so no violation to report), skip the expensive extraction.
526-
if !needs_processing && cfg.strategy == Strategy::Truncate {
525+
// Under-limit strings need no further work regardless of strategy
526+
if !needs_processing {
527527
return Ok(ProcessResult::Unchanged);
528528
}
529-
}
530529

531-
if let Ok(text) = container.extract::<String>() {
532-
// Skip numeric strings
533-
if is_numeric_string(&text) {
530+
// ── Layer 2: zero-copy borrow ──
531+
let text: &str = py_str.to_str()?;
532+
533+
// Skip numeric check for long strings (no number is > 50 chars)
534+
if text.len() <= 50 && is_numeric_string(text) {
534535
return Ok(ProcessResult::Unchanged);
535536
}
536537

537-
// Use capped char counting — O(limit) instead of O(n).
538-
// For token mode, count (max_tokens+1)*cpt chars so the over-limit
539-
// check `token_count > max_tokens` works correctly.
538+
// ── Layer 3: capped char counting ──
540539
let limit_for_counting = match cfg.limit_mode {
541540
LimitMode::Character => cfg.max_chars.unwrap_or(usize::MAX),
542541
LimitMode::Token => cfg
@@ -545,10 +544,9 @@ fn process_container<'py>(
545544
.saturating_add(1)
546545
.saturating_mul(cfg.chars_per_token),
547546
};
548-
let (char_count, _) = count_chars_capped(&text, limit_for_counting);
547+
let (char_count, _) = count_chars_capped(text, limit_for_counting);
549548
let token_count = estimate_tokens(char_count, cfg.chars_per_token);
550549

551-
// Determine if out of bounds based on limit_mode
552550
let (below_min, above_max) = match cfg.limit_mode {
553551
LimitMode::Character => {
554552
let below = cfg.min_chars > 0 && char_count < cfg.min_chars;
@@ -566,19 +564,30 @@ fn process_container<'py>(
566564
return Ok(ProcessResult::Unchanged);
567565
}
568566

569-
// Block strategy
567+
// ── Block strategy ──
570568
if cfg.strategy == Strategy::Block {
571569
let location = if path.is_empty() {
572570
String::new()
573571
} else {
574572
format!(" at {path}")
575573
};
576574

577-
let preview = if text.chars().count() > 50 {
578-
let p: String = text.chars().take(50).collect();
579-
p + "..."
575+
let preview = if text.len() > 53 {
576+
let end = text
577+
.char_indices()
578+
.nth(50)
579+
.map_or(text.len(), |(off, _)| off);
580+
format!("{}...", &text[..end])
580581
} else {
581-
text.clone()
582+
text.to_string()
583+
};
584+
585+
let loc_str = || {
586+
if path.is_empty() {
587+
"root".to_string()
588+
} else {
589+
path.to_string()
590+
}
582591
};
583592

584593
let (reason, description, code, details) = if cfg.limit_mode == LimitMode::Token
@@ -599,14 +608,7 @@ fn process_container<'py>(
599608
"strategy".to_string(),
600609
ViolationValue::Str("block".to_string()),
601610
);
602-
d.insert(
603-
"location".to_string(),
604-
ViolationValue::Str(if path.is_empty() {
605-
"root".to_string()
606-
} else {
607-
path.to_string()
608-
}),
609-
);
611+
d.insert("location".to_string(), ViolationValue::Str(loc_str()));
610612
d.insert("value_preview".to_string(), ViolationValue::Str(preview));
611613
(
612614
format!("Output length out of bounds{location}"),
@@ -623,14 +625,7 @@ fn process_container<'py>(
623625
"strategy".to_string(),
624626
ViolationValue::Str("block".to_string()),
625627
);
626-
d.insert(
627-
"location".to_string(),
628-
ViolationValue::Str(if path.is_empty() {
629-
"root".to_string()
630-
} else {
631-
path.to_string()
632-
}),
633-
);
628+
d.insert("location".to_string(), ViolationValue::Str(loc_str()));
634629
d.insert("value_preview".to_string(), ViolationValue::Str(preview));
635630
(
636631
format!("Output length out of bounds{location}"),
@@ -639,7 +634,6 @@ fn process_container<'py>(
639634
d,
640635
)
641636
} else {
642-
// Min violation
643637
let mut d = HashMap::new();
644638
d.insert("length".to_string(), ViolationValue::Int(char_count as i64));
645639
d.insert(
@@ -654,14 +648,7 @@ fn process_container<'py>(
654648
"min_tokens".to_string(),
655649
ViolationValue::Int(cfg.min_tokens as i64),
656650
);
657-
d.insert(
658-
"location".to_string(),
659-
ViolationValue::Str(if path.is_empty() {
660-
"root".to_string()
661-
} else {
662-
path.to_string()
663-
}),
664-
);
651+
d.insert("location".to_string(), ViolationValue::Str(loc_str()));
665652
(
666653
format!("Output length out of bounds{location}"),
667654
format!(
@@ -682,12 +669,12 @@ fn process_container<'py>(
682669
}));
683670
}
684671

685-
// Truncate strategy — only truncate if above max
672+
// ── Truncate: only if above max ──
686673
if above_max {
687-
let truncated = truncate(&text, cfg);
674+
let truncated = truncate(text, cfg);
688675
if truncated != text {
689-
let py_str = PyString::new(py, &truncated);
690-
return Ok(ProcessResult::Modified(py_str.into_any()));
676+
let new_py_str = PyString::new(py, &truncated);
677+
return Ok(ProcessResult::Modified(new_py_str.into_any()));
691678
}
692679
}
693680

0 commit comments

Comments
 (0)