
Commit 87964fc ("Existing methods")
1 parent c23c096

3 files changed: 55 additions & 27 deletions


README.md

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,8 @@
 - Fine-tuned Qwen 3.5 2B, 0.79 F1, ~91% compression
 - CLI pipe, Python library, or vLLM server
 
+Existing context pruning tools ([SWE-Pruner](https://github.com/Ayanami1314/swe-pruner), [Zilliz Semantic Highlight](https://huggingface.co/zilliz/semantic-highlight-bilingual-v1), [Provence](https://arxiv.org/abs/2501.16214)) are built for source code or document paragraphs. They don't handle the mixed, unstructured format of tool output (stack traces interleaved with passing tests, grep matches with context lines, build logs with timestamps). Squeez is trained specifically on 14 types of tool output from real SWE-bench workflows.
+
 ```bash
 pip install squeez
 python -m pytest tests/ -v 2>&1 | squeez "find the test failure related to authentication"
@@ -128,6 +130,7 @@ Evaluated on 617 held-out test samples from SWE-bench, across 14 tool types:
 |-------|-----------|--------|------|-------------|
 | **Squeez-2B** | **0.8043** | **0.8624** | **0.7895** | 0.9150 |
 | Qwen 3.5 35B A3B (zero-shot) | 0.7402 | 0.7498 | 0.7000 | 0.9177 |
+| Kimi K2 (zero-shot) | 0.6128 | 0.5286 | 0.5344 | 0.9425 |
 | Qwen 3.5 2B (untrained) | 0.4154 | 0.5299 | 0.4075 | 0.8197 |
 | BM25 (10%) | 0.1277 | 0.2172 | 0.1314 | 0.9036 |
 | Random (10%) | 0.0738 | 0.1009 | 0.0697 | 0.9067 |
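
The table's header row is above this hunk, so the column labels are not visible here. For orientation only, here is a minimal sketch of how line-level precision/recall/F1 and a compression ratio are commonly computed for this kind of pruning evaluation; the exact metric definitions Squeez uses are not shown in this diff.

```python
# Illustrative sketch, not the project's actual metric code: line-level
# precision/recall/F1 between predicted and gold kept lines, plus a simple
# compression ratio over the original tool output.
def line_metrics(kept_lines: list[str], gold_lines: list[str], original: str) -> dict:
    pred, gold = set(kept_lines), set(gold_lines)
    tp = len(pred & gold)
    precision = tp / len(pred) if pred else 0.0
    recall = tp / len(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    # Fraction of the original output that was pruned away.
    compression = 1 - len("\n".join(kept_lines)) / max(len(original), 1)
    return {"precision": precision, "recall": recall, "f1": f1, "compression": compression}
```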

scripts/evaluate_baselines.py

Lines changed: 44 additions & 22 deletions
@@ -232,24 +232,19 @@ def baseline_swe_pruner(model, task: str, tool_output: str, threshold: float = 0
 
 def _load_zilliz():
     """Load Zilliz semantic-highlight (needs: pip install transformers torch)."""
-    import torch
     from transformers import AutoModel
 
     model_name = "zilliz/semantic-highlight-bilingual-v1"
-    model = AutoModel.from_pretrained(model_name, trust_remote_code=True, dtype=torch.float16)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = model.to(device)
+    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
     model.eval()
     return model
 
 
 def baseline_zilliz(model, task: str, tool_output: str, threshold: float = 0.5) -> list[str]:
     """Run Zilliz semantic-highlight via get_raw_predictions().
 
-    Uses the low-level API to avoid the broken process() path in
-    transformers 5.2 (build_inputs_with_special_tokens removed).
-    Each line is passed as a separate context, and per-token pruning
-    probabilities are averaged per line.
+    Uses per-line contexts since process() does nltk sentence splitting
+    which doesn't handle tool output lines well.
     """
     import torch
 
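
The body of baseline_zilliz continues outside this hunk. Going only by the docstrings above (per-line contexts, a get_raw_predictions() entry point, and the removed note about averaging per-token probabilities per line), a rough sketch of that scoring loop might look like the following; the get_raw_predictions() call signature and return shape are assumptions, not taken from this diff.

```python
# Hypothetical sketch of the per-line scoring described in the docstrings.
# The get_raw_predictions() signature below is assumed, not shown in the commit.
def baseline_zilliz_sketch(model, task: str, tool_output: str, threshold: float = 0.5) -> list[str]:
    kept = []
    for line in tool_output.split("\n"):
        if not line.strip():
            continue
        # Assumed API: per-token keep probabilities for a (question, context) pair.
        token_probs = model.get_raw_predictions(question=task, contexts=[line])[0]
        line_score = sum(token_probs) / max(len(token_probs), 1)  # average per line
        if line_score >= threshold:
            kept.append(line)
    return kept
```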
@@ -286,24 +281,48 @@ def _load_gliner2():
 
 
 def baseline_gliner2(model, task: str, tool_output: str) -> list[str]:
-    """Run GLiNER2 span extraction with 'relevant' as the entity label.
+    """Run GLiNER2 span extraction — keep any line containing an extracted entity.
 
-    Uses the task description as the label description to guide extraction.
-    Extracted spans are mapped back to line numbers.
+    Uses a fixed set of NER-style labels to guide entity extraction.
+    Any line that overlaps with an extracted span is kept.
     """
     lines = tool_output.split("\n")
     if not lines:
         return []
 
-    # Use the task as the entity description for guided extraction
-    result = model.extract_entities(
-        tool_output,
-        {"relevant": f"Text relevant to: {task}"},
-        include_spans=True,
-    )
+    # GLiNER2 has a max input length; truncate if needed
+    max_chars = 10000
+    text = tool_output[:max_chars] if len(tool_output) > max_chars else tool_output
+
+    # GLiNER2 works best with NER-style labels, not query descriptions.
+    # Use a fixed set of labels covering common relevant patterns in tool output.
+    labels = [
+        "error message",
+        "failed test",
+        "stack trace",
+        "warning",
+        "relevant code",
+        "file path",
+        "configuration",
+    ]
+
+    try:
+        result = model.extract_entities(
+            text,
+            labels,
+            include_spans=True,
+        )
+    except Exception as e:
+        logger.debug(f"GLiNER2 extract_entities failed: {e}")
+        return []
 
-    entities = result.get("entities", {}).get("relevant", [])
-    if not entities:
+    # result = {'entities': {'label': [{'text': ..., 'start': N, 'end': N}, ...]}}
+    all_entities = []
+    for label_entities in result.get("entities", {}).values():
+        if isinstance(label_entities, list):
+            all_entities.extend(label_entities)
+
+    if not all_entities:
         return []
 
     # Build line offset map
@@ -315,9 +334,12 @@ def baseline_gliner2(model, task: str, tool_output: str) -> list[str]:
 
     # Map character spans to line indices
     kept_indices = set()
-    for entity in entities:
-        span_start = entity.get("start", 0)
-        span_end = entity.get("end", 0)
+    for entity in all_entities:
+        if isinstance(entity, dict):
+            span_start = entity.get("start", 0)
+            span_end = entity.get("end", 0)
+        else:
+            continue
         for i, (lo, hi) in enumerate(line_offsets):
             if span_start < hi and span_end > lo:
                 kept_indices.add(i)
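
The "# Build line offset map" step itself sits outside these hunks. For reference, here is a small self-contained sketch of an offset map compatible with the overlap test visible above (the `(lo, hi)` tuples and the `span_start < hi and span_end > lo` check); the repository's actual helper may differ.

```python
# Sketch of a character-offset map per line, matching the overlap test above.
# Line i covers [lo, hi) in the original string; hi excludes the trailing "\n".
def build_line_offsets(text: str) -> list[tuple[int, int]]:
    offsets = []
    pos = 0
    for line in text.split("\n"):
        offsets.append((pos, pos + len(line)))
        pos += len(line) + 1  # skip past the newline that split() removed
    return offsets

# A span [start, end) overlaps line i exactly when start < hi and end > lo.
line_offsets = build_line_offsets("a\nbb\nccc")
assert line_offsets == [(0, 1), (2, 4), (5, 8)]
```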

squeez/training/evaluate.py

Lines changed: 8 additions & 5 deletions
@@ -490,11 +490,14 @@ def record_result(result: dict) -> None:
         record_result(result)
 
         if (i + 1) % 10 == 0:
-            logger.info(
-                f" [{i + 1}/{len(samples)}] "
-                f"F1={result['span']['f1']:.3f} EM={result['span']['exact_match']:.0f} "
-                f"ROUGE-L={result['rouge']:.3f}"
-            )
+            if "error" not in result:
+                logger.info(
+                    f" [{i + 1}/{len(samples)}] "
+                    f"F1={result['span']['f1']:.3f} EM={result['span']['exact_match']:.0f} "
+                    f"ROUGE-L={result['rouge']:.3f}"
+                )
+            else:
+                logger.info(f" [{i + 1}/{len(samples)}] (last sample errored)")
 
     # Aggregate
     results = {}
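
The logging guard implies the per-sample result shape: successful samples carry `span` (with `f1`, `exact_match`) and `rouge` keys, while failed samples carry an `error` key. The aggregation itself falls outside this hunk; a hedged sketch of what it might compute over such dicts follows (field names come from the f-strings above, everything else is an assumption).

```python
# Illustrative aggregation over the result dicts implied by the logging above.
# Only the key names are taken from the diff; the structure here is assumed.
def aggregate(all_results: list[dict]) -> dict:
    ok = [r for r in all_results if "error" not in r]
    if not ok:
        return {"num_samples": 0, "num_errors": len(all_results)}
    return {
        "num_samples": len(ok),
        "num_errors": len(all_results) - len(ok),
        "span_f1": sum(r["span"]["f1"] for r in ok) / len(ok),
        "exact_match": sum(r["span"]["exact_match"] for r in ok) / len(ok),
        "rouge_l": sum(r["rouge"] for r in ok) / len(ok),
    }
```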
