urchade
diff --git a/‎benchmarks/eval_compressed_biomed.py‎
Lines changed: 209 additions & 0 deletions b/‎benchmarks/eval_compressed_biomed.py‎
Lines changed: 209 additions & 0 deletions
diff --git a/‎docs/usage.md‎
Lines changed: 106 additions & 0 deletions b/‎docs/usage.md‎
Lines changed: 106 additions & 0 deletions
diff --git a/‎gliner/config.py‎
Lines changed: 9 additions & 0 deletions b/‎gliner/config.py‎
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,209 @@
+"""Evaluate raw vs compressed-prompt GLiNER on knowledgator/biomed_NER."""
+
+import argparse
+import random
+import time
+
+import torch
+from datasets import load_dataset
+
+from gliner import GLiNER
+
+
+def predictions_to_ner(text, preds):
+    """Turn character-offset predictions into a word-level NER sample.
+
+    Every prediction's ``start``/``end``/``label`` fields are repackaged as an
+    entity dict with ``start``/``end``/``class`` keys, and the resulting list
+    is passed with ``text`` to ``char_to_word_sample``, whose result is returned.
+    """
+    entities = []
+    for pred in preds:
+        entities.append({"start": pred["start"], "end": pred["end"], "class": pred["label"]})
+    return char_to_word_sample(text, entities)
+
+
+def distill_finetune(model, distill_data, *, epochs, lr, batch_size, output_dir):
+    """Fine-tune ``model`` in place on the pseudo-labeled ``distill_data``.
+
+    Training is delegated to ``model.train_model`` without an eval dataset;
+    once it returns, ``model.eval()`` is called.
+
+    Args:
+        model: Object exposing ``train_model(**kwargs)`` and ``eval()``.
+        distill_data: Training samples, forwarded as ``train_dataset``.
+        epochs: Forwarded as ``num_train_epochs``.
+        lr: Forwarded as ``learning_rate``.
+        batch_size: Forwarded as ``per_device_train_batch_size``.
+        output_dir: Forwarded as ``output_dir``.
+    """
+    training_options = {
+        "train_dataset": distill_data,
+        "eval_dataset": None,
+        "output_dir": output_dir,
+        "num_train_epochs": epochs,
+        # override create_training_args' default (10000) so num_train_epochs wins
+        "max_steps": -1,
+        "per_device_train_batch_size": batch_size,
+        "learning_rate": lr,
+        "save_strategy": "no",
+        "report_to": "none",
+        "logging_steps": 10,
+        "remove_unused_columns": False,
+    }
+    model.train_model(**training_options)
+    model.eval()
+
+
+def timed_evaluate(model, eval_data, *, warmup, repeats, device, **eval_kwargs):
+    """Run ``model.evaluate`` once for metrics, then time repeated runs.
+
+    The sequence is: one evaluation whose result is returned, ``warmup``
+    untimed evaluations, then ``repeats`` timed evaluations.
+
+    Args:
+        model: Object exposing ``evaluate(eval_data, **eval_kwargs)`` that
+            returns an ``(output, f1)`` pair.
+        eval_data: Evaluation samples passed straight to ``model.evaluate``.
+        warmup: Number of untimed evaluations run before timing starts.
+        repeats: Number of timed evaluations; must be at least 1.
+        device: Device name (e.g. ``"cpu"``, ``"cuda:0"``); CUDA devices are
+            synchronized around the timed region.
+        **eval_kwargs: Extra keyword arguments forwarded to ``model.evaluate``.
+
+    Returns:
+        Tuple ``(output, f1, mean_seconds, best_seconds)`` where the timing
+        values are the mean and minimum wall-clock time over the timed runs.
+
+    Raises:
+        ValueError: If ``repeats`` is smaller than 1, since no timing can be
+            computed. Raised before any evaluation work is done.
+    """
+    if repeats < 1:
+        raise ValueError(f"repeats must be >= 1 to compute timings, got {repeats}")
+
+    on_cuda = str(device).startswith("cuda")
+
+    def _sync():
+        # CUDA work is queued asynchronously, so wait for the device the model
+        # actually runs on (not just the current default device) before
+        # reading the clock.
+        if on_cuda:
+            torch.cuda.synchronize(device)
+
+    _sync()
+    out, f1 = model.evaluate(eval_data, **eval_kwargs)
+
+    for _ in range(warmup):
+        model.evaluate(eval_data, **eval_kwargs)
+
+    _sync()
+    times = []
+    for _ in range(repeats):
+        t0 = time.perf_counter()
+        model.evaluate(eval_data, **eval_kwargs)
+        _sync()
+        times.append(time.perf_counter() - t0)
+
+    mean = sum(times) / len(times)
+    return out, f1, mean, min(times)
+
+
+def char_to_word_sample(text, entities):
+    """Convert char-offset entities into a whitespace-tokenized NER sample.
+
+    The text is tokenized with ``str.split()`` and each entity's character
+    offsets are mapped to inclusive word indices.
+
+    Args:
+        text: Raw input string.
+        entities: Iterable of dicts with ``start`` and ``end`` character
+            offsets and a ``class`` label.
+
+    Returns:
+        Dict with ``tokenized_text`` (the list of words) and ``ner`` (a list
+        of ``(first_word_idx, last_word_idx, lowercased_class)`` tuples).
+        Entities that do not align to word boundaries after trimming
+        surrounding whitespace, or that cover no non-whitespace text, are
+        dropped.
+    """
+    words = text.split()
+
+    # Record where each word starts and ends in the original text. Searching
+    # from the end of the previous word handles any amount of whitespace
+    # between words and avoids re-slicing the text for every word.
+    start_to_widx, end_to_widx = {}, {}
+    cursor = 0
+    for widx, word in enumerate(words):
+        begin = text.find(word, cursor)
+        cursor = begin + len(word)
+        start_to_widx[begin] = widx
+        end_to_widx[cursor] = widx
+
+    ner = []
+    for ent in entities:
+        s, e, cls = ent["start"], ent["end"], ent["class"].lower()
+        # Tolerate leading/trailing whitespace inside the span.
+        span_text = text[s:e]
+        ls = len(span_text) - len(span_text.lstrip())
+        le = len(span_text) - len(span_text.rstrip())
+        s2, e2 = s + ls, e - le
+        if s2 >= e2:
+            # Empty, whitespace-only, or inverted span: trimming can make the
+            # bounds cross and still hit word boundaries, which would produce
+            # a span whose start word lies after its end word.
+            continue
+        if s2 in start_to_widx and e2 in end_to_widx:
+            ner.append((start_to_widx[s2], end_to_widx[e2], cls))
+    return {"tokenized_text": words, "ner": ner}
+
+
+def main():
+    """Benchmark raw vs compressed-prompt GLiNER on a char-offset NER dataset.
+
+    Steps: parse CLI arguments, load and word-align the dataset, split it into
+    eval / calibration / distillation slices, evaluate the raw model,
+    optionally pseudo-label the distillation slice with the raw model,
+    compress the prompt embeddings, optionally fine-tune on the pseudo-labels,
+    evaluate again, and print a summary of F1 and timing for both runs.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="gliner-community/gliner_small-v2.5")
+    parser.add_argument("--dataset", default="knowledgator/biomed_NER")
+    parser.add_argument("--split", default="train")
+    parser.add_argument("--eval_size", type=int, default=3000)
+    parser.add_argument("--compress_size", type=int, default=1000)
+    parser.add_argument("--batch_size", type=int, default=4)
+    parser.add_argument("--threshold", type=float, default=0.5)
+    parser.add_argument("--device", default="cuda:0" if torch.cuda.is_available() else "cpu")
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--bench_warmup", type=int, default=1)
+    parser.add_argument("--bench_repeats", type=int, default=1)
+    parser.add_argument("--distill", action="store_true",
+                        help="Fine-tune the compressed model on raw-model pseudo-labels.")
+    parser.add_argument("--distill_size", type=int, default=1000,
+                        help="Number of texts to use for distillation (drawn after compress slice).")
+    parser.add_argument("--distill_epochs", type=int, default=3)
+    parser.add_argument("--distill_lr", type=float, default=1e-5)
+    parser.add_argument("--distill_threshold", type=float, default=0.3)
+    parser.add_argument("--distill_output_dir", type=str, default="./distill_ckpt")
+    args = parser.parse_args()
+
+    random.seed(args.seed)
+
+    print(f"Loading dataset {args.dataset} [{args.split}]...")
+    ds = load_dataset(args.dataset, split=args.split)
+
+    processed = [char_to_word_sample(r["text"], r["entities"]) for r in ds]
+    processed = [p for p in processed if p["ner"]]  # drop empties
+
+    labels = sorted({t for p in processed for _, _, t in p["ner"]})
+    print(f"{len(processed)} samples, {len(labels)} labels: {labels}")
+
+    random.shuffle(processed)
+    # Pin the full label set on every sample so raw and compressed evaluations
+    # share an identical label space. Without this, raw eval would derive
+    # labels per-sample (only the positives present) and be unfairly easier
+    # than the compressed path, which always classifies over all labels.
+    for p in processed:
+        p["ner_labels"] = labels
+    eval_data = processed[: args.eval_size]
+    compress_slice = processed[args.eval_size : args.eval_size + args.compress_size]
+    if not compress_slice:
+        # Nothing is left after the eval split, so calibration falls back to
+        # the head of the data, which overlaps the evaluation samples. Say so
+        # loudly instead of silently reporting possibly optimistic numbers.
+        print("WARNING: no samples left after the eval slice; calibration texts "
+              "overlap the evaluation data, so compressed metrics may be optimistic.")
+        compress_slice = processed[: args.compress_size]
+    compress_texts = [" ".join(p["tokenized_text"]) for p in compress_slice]
+
+    distill_start = args.eval_size + args.compress_size
+    distill_slice = processed[distill_start : distill_start + args.distill_size] if args.distill else []
+    if args.distill and not distill_slice:
+        # Without this notice the run would look like a distilled benchmark
+        # even though no fine-tuning happens.
+        print("WARNING: --distill was requested but no samples remain after the "
+              "eval and compress slices; skipping distillation.")
+
+    print(f"Loading model {args.model}...")
+    model = GLiNER.from_pretrained(args.model).to(args.device)
+
+    eval_kwargs = dict(flat_ner=True, threshold=args.threshold, batch_size=args.batch_size)
+    n = len(eval_data)
+
+    print("=== Raw GLiNER evaluation ===")
+    raw_out, raw_f1, raw_mean, raw_best = timed_evaluate(
+        model, eval_data, warmup=args.bench_warmup, repeats=args.bench_repeats,
+        device=args.device, **eval_kwargs,
+    )
+    print(raw_out)
+    print(f"Raw F1: {raw_f1:.4f}")
+    print(f"Raw timing (n={n}, bs={args.batch_size}, repeats={args.bench_repeats}): "
+          f"mean {raw_mean:.3f}s | best {raw_best:.3f}s | "
+          f"{n / raw_mean:.1f} samples/s")
+
+    # Pseudo-labels must come from the raw model, so they are generated
+    # before the prompt embeddings are compressed.
+    distill_data = None
+    if args.distill and distill_slice:
+        print(f"Generating pseudo-labels from raw model on {len(distill_slice)} distillation texts...")
+        distill_texts = [" ".join(p["tokenized_text"]) for p in distill_slice]
+        preds = model.inference(
+            distill_texts, labels, flat_ner=True,
+            threshold=args.distill_threshold, batch_size=args.batch_size,
+        )
+        distill_data = [predictions_to_ner(t, p) for t, p in zip(distill_texts, preds)]
+        kept = sum(1 for d in distill_data if d["ner"])
+        print(f"  {kept}/{len(distill_data)} samples carry at least one pseudo-label")
+
+    print(f"Compressing prompt embeddings over {len(compress_texts)} texts...")
+    model.compress_prompt_embeddings(
+        texts=compress_texts, labels=labels, batch_size=args.batch_size
+    )
+    # Make sure precomputed-prompt mode is switched on for everything below.
+    model.config.precomputed_prompts_mode = True
+
+    if distill_data:
+        print(f"Fine-tuning compressed model on pseudo-labels "
+              f"(epochs={args.distill_epochs}, lr={args.distill_lr})...")
+        distill_finetune(
+            model, distill_data,
+            epochs=args.distill_epochs, lr=args.distill_lr,
+            batch_size=args.batch_size, output_dir=args.distill_output_dir,
+        )
+
+    print("=== Compressed GLiNER evaluation ===")
+    comp_out, comp_f1, comp_mean, comp_best = timed_evaluate(
+        model, eval_data, warmup=args.bench_warmup, repeats=args.bench_repeats,
+        device=args.device, **eval_kwargs,
+    )
+    print(comp_out)
+    print(f"Compressed F1: {comp_f1:.4f}")
+    print(f"Compressed timing (n={n}, bs={args.batch_size}, repeats={args.bench_repeats}): "
+          f"mean {comp_mean:.3f}s | best {comp_best:.3f}s | "
+          f"{n / comp_mean:.1f} samples/s")
+
+    print("\n=== Summary ===")
+    print(f"Raw        F1: {raw_f1:.4f}  | mean {raw_mean:.3f}s | {n / raw_mean:.1f} samples/s")
+    print(f"Compressed F1: {comp_f1:.4f}  | mean {comp_mean:.3f}s | {n / comp_mean:.1f} samples/s")
+    print(f"Delta F1     : {comp_f1 - raw_f1:+.4f}")
+    print(f"Speedup      : {raw_mean / comp_mean:.2f}x")
+
+
+# Run the benchmark only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
@@ -990,6 +990,112 @@ print(f"- Products: {[e['text'] for e in entities if e['label'] == 'product']}")
 print(f"- Timeline: {[e['text'] for e in entities if e['label'] == 'date']}")
 ```
 
+## ⚡ Prompt Compression (Precomputed Prompt Embeddings)
+
+For uni-encoder models (span, token, and relation-extraction variants), you can
+precompute the prompt embeddings for a **fixed** label set and reuse them at
+inference time. In precomputed mode the encoder receives only the text
+(no `<<ENT>>label1<<ENT>>...<<SEP>>` prefix), which shortens the input sequence,
+reduces attention cost, and can noticeably speed up inference — at a small
+accuracy trade-off versus re-encoding the prompts on every call.
+
+### How it works
+
+`BaseGLiNER.compress_prompt_embeddings(texts, labels, rel_labels=None, batch_size=8, distill=False, distill_threshold=0.3, distill_epochs=3, distill_lr=1e-5, distill_batch_size=None, distill_output_dir="./distill_ckpt", distill_train_kwargs=None)`:
+
+1. Runs the normal forward pass over `(texts, labels)` pairs.
+2. Extracts the per-label prompt embedding (the `<<ENT>>` token representation,
+   pre-projection) from each example.
+3. Averages across all examples to produce an `(L, D)` matrix stored as a
+   non-trainable parameter on the underlying model (`model.precomputed_prompts`).
+4. Sets `config.precomputed_prompts_mode = True` and writes
+   `config.id_to_classes`, so subsequent `predict_entities` / `forward` calls
+   skip prompt-prepending and look up the stored embeddings instead.
+
+The stored embeddings travel with `state_dict`, so `save_pretrained` /
+`from_pretrained` round-trip them automatically. Training can continue after
+compression — the stored matrix is frozen but everything else keeps training.
+
+### Basic usage (entity extraction)
+
+```python
+from gliner import GLiNER
+
+model = GLiNER.from_pretrained("urchade/gliner_small-v2.1")
+
+# Representative texts from your target domain. They do not need labels;
+# they are only used as contexts while averaging the prompt representations.
+calibration_texts = [
+    "Barack Obama was born in Honolulu, Hawaii.",
+    "Apple announced a new iPhone at their Cupertino headquarters.",
+    # ... ideally 100–1000 diverse sentences from your domain
+]
+
+labels = ["person", "organization", "location", "date"]
+
+# One-time compression step
+model.compress_prompt_embeddings(calibration_texts, labels, batch_size=16)
+
+# Inference now uses the precomputed prompts; pass the same label set that was compressed
+entities = model.predict_entities(
+    "Tim Cook visited Berlin last Tuesday.",
+    labels,               # must match (order-insensitive) the compressed set
+    threshold=0.5,
+)
+
+# Persist the compressed model
+model.save_pretrained("./gliner-compressed")
+```
+
+### Relation extraction
+
+For relex models (`UniEncoderSpanRelexModel` / `UniEncoderTokenRelexModel`),
+pass `rel_labels` so the `<<REL>>` prompt embeddings are compressed as well:
+
+```python
+model.compress_prompt_embeddings(
+    texts=calibration_texts,
+    labels=["person", "organization", "location"],
+    rel_labels=["works_for", "located_in", "founder_of"],
+    batch_size=8,
+)
+```
+
+### End-to-end distillation
+
+Compression alone can reduce quality because averaged prompt embeddings drop
+context-specific signal. Pass `distill=True` to recover it in a single call:
+the raw (pre-compression) model first generates pseudo-labels over `texts`,
+prompts are then compressed, and the compressed model is fine-tuned on those
+pseudo-labels — no separate script required.
+
+```python
+model.compress_prompt_embeddings(
+    texts=calibration_texts,     # also used as the distillation corpus
+    labels=labels,
+    batch_size=16,
+    distill=True,
+    distill_threshold=0.3,       # pseudo-label confidence cutoff
+    distill_epochs=3,
+    distill_lr=1e-5,
+    distill_output_dir="./distill_ckpt",
+)
+```
+
+Relevant knobs:
+
+- `distill_threshold`: confidence cutoff used when the raw model produces
+  pseudo-labels. Lower values widen the training signal but add noise.
+- `distill_epochs`, `distill_lr`: fine-tuning schedule.
+- `distill_batch_size`: defaults to `batch_size` if omitted.
+- `distill_output_dir`: forwarded to `train_model`.
+- `distill_train_kwargs`: dict of extra kwargs merged into the underlying
+  `train_model` call (e.g. to override `save_strategy`, `logging_steps`, etc.).
+
+Pseudo-labels are generated from the same `texts` used for compression, so one
+diverse in-domain corpus serves both roles.
+
+
 ## Tips and Best Practices
 
 1. **Choose the right model architecture**:
 
@@ -39,6 +39,8 @@ def __init__(
         span_loss_coef: float = 1.0,
         represent_spans: bool = False,
         neg_spans_ratio: float = 1.0,
+        precomputed_prompts_mode: Optional[bool] = None,
+        id_to_classes: Optional[dict] = None,
         **kwargs,
     ):
         """Initialize BaseGLiNERConfig.
@@ -72,6 +74,8 @@ def __init__(
             span_loss_coef (float, optional): Span loss coefficient. Defaults to 1.0.
             represent_spans (bool, optional): Whether to represent spans. Defaults to False.
             neg_spans_ratio (float, optional): Ratio of negative spans. Defaults to 1.0.
+            precomputed_prompts_mode (Optional[bool]): Whether to use precomputed prompts. Defaults to None.
+            id_to_classes (Optional[dict]): Mapping from class IDs to class names. Defaults to None.
             **kwargs: Additional keyword arguments passed to parent class.
         """
         super().__init__(**kwargs)
@@ -108,6 +112,8 @@ def __init__(
         self.span_loss_coef = span_loss_coef
         self.represent_spans = represent_spans
         self.neg_spans_ratio = neg_spans_ratio
+        self.precomputed_prompts_mode = precomputed_prompts_mode
+        self.id_to_classes = id_to_classes
 
 
 class UniEncoderConfig(BaseGLiNERConfig):
@@ -201,6 +207,7 @@ def __init__(
         augment_ent_drop_prob=(0.0, 1.0),
         augment_rel_drop_prob=(0.0, 0.3),
         augment_add_other_prob=0.5,
+        rel_id_to_classes: Optional[dict] = None,
         **kwargs,
     ):
         """Initialize UniEncoderRelexConfig.
@@ -223,6 +230,7 @@ def __init__(
                 the per-type entity drop probability. Defaults to (0.0, 0.4).
             augment_rel_drop_prob (tuple, optional): Range (min, max) from which to sample
                 the per-type relation drop probability. Defaults to (0.0, 0.4).
+            rel_id_to_classes (Optional[dict]): Mapping from relation class IDs to class names. Defaults to None.
             **kwargs: Additional keyword arguments passed to UniEncoderConfig.
 
         Raises:
@@ -241,6 +249,7 @@ def __init__(
         self.augment_ent_drop_prob = tuple(augment_ent_drop_prob)
         self.augment_rel_drop_prob = tuple(augment_rel_drop_prob)
         self.augment_add_other_prob = augment_add_other_prob
+        self.rel_id_to_classes = rel_id_to_classes
 
 
 class UniEncoderSpanRelexConfig(UniEncoderRelexConfig):