EmbBERT/embbert_semantic_search_test.py at main · Raze-Systems/EmbBERT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
"""Run semantic-search experiments with EmbBERT checkpoint ``616000``.

This script loads the bundled EmbBERT pretraining checkpoint
``checkpoints/pretraining/checkpoint-616000`` and turns its token-level
hidden states into text embeddings with simple pooling.

It supports two modes:

1. Ad hoc search over either a built-in demo corpus or a plain-text corpus file.
2. Retrieval benchmarking against a small labeled JSON dataset with relevance
   judgments and aggregate metrics.

Examples:
    ./.venv/bin/python embbert_semantic_search_test.py

    ./.venv/bin/python embbert_semantic_search_test.py \\
        --query "pet animals that enjoy sitting on the couch" \\
        --top-k 5

    ./.venv/bin/python embbert_semantic_search_test.py \\
        --corpus-file datasets/my_corpus.txt \\
        --query "ways to optimize Python code"

    ./.venv/bin/python embbert_semantic_search_test.py \\
        --run-benchmark
"""

from __future__ import annotations

import argparse
import json
from dataclasses import dataclass
from pathlib import Path

import torch
import torch.nn.functional as F

try:
    from EmbBERT.loaders import load_pretraining_checkpoint
except ModuleNotFoundError:
    from loaders import load_pretraining_checkpoint

PROJECT_ROOT = Path(__file__).resolve().parent
DEFAULT_CHECKPOINT = "checkpoint-616000"
DEFAULT_BENCHMARK_FILE = PROJECT_ROOT / "datasets" / "embbert_semantic_search_benchmark.json"
DEFAULT_QUERIES = [
    "pet animals that like to curl up indoors",
    "improving code performance in a Python service",
    "space missions exploring planets and stars",
]
DEFAULT_CORPUS = [
    "Cats are quiet pets that often nap on sofas and sunny windowsills.",
    "Dogs are loyal companions that enjoy walks, play, and human attention.",
    "A kitten resting on the couch can spend hours sleeping near the window.",
    "Python applications can run faster after profiling hot paths and removing unnecessary work.",
    "Developers often optimize backend services by batching queries and reducing memory allocations.",
    "Refactoring a slow script can improve throughput and lower response times.",
    "Astronomers study planets, stars, and distant galaxies with large telescopes.",
    "Space agencies launch probes to explore the moon, Mars, and the outer solar system.",
    "Rocket missions collect data about planetary atmospheres and orbital mechanics.",
    "Fresh bread, olive oil, and tomatoes are common ingredients in Mediterranean cooking.",
    "Hiking trails in the mountains offer long walks, cold air, and scenic overlooks.",
    "Trains connect cities through stations, schedules, and regional transit networks.",
]


@dataclass(slots=True)
class SearchDocument:
    """Represent one searchable document.

    Attributes:
        document_id: Stable identifier used for retrieval evaluation.
        text: Document text embedded and searched by the model.
    """

    document_id: str
    text: str


@dataclass(slots=True)
class SearchQuery:
    """Represent one benchmark query with relevance judgments.

    Attributes:
        query_id: Stable benchmark query identifier.
        text: User-facing query text.
        relevant_ids: Document identifiers considered relevant for the query.
    """

    query_id: str
    text: str
    relevant_ids: set[str]


@dataclass(slots=True)
class SearchHit:
    """Represent one semantic-search match.

    Attributes:
        rank: One-based rank in the result list.
        score: Cosine-similarity score between query and document embedding.
        document_id: Stable identifier of the matched document.
        text: Matched document text.
    """

    rank: int
    score: float
    document_id: str
    text: str


def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for the semantic-search probe.

    Returns:
        Parsed script arguments controlling search mode and benchmark mode.
    """

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--checkpoint",
        default=DEFAULT_CHECKPOINT,
        help="EmbBERT bundle checkpoint name to load.",
    )
    parser.add_argument(
        "--query",
        action="append",
        default=[],
        help="Query text to search for. Repeat to provide multiple queries.",
    )
    parser.add_argument(
        "--corpus-file",
        type=Path,
        help="Optional plain-text file with one search document per line.",
    )
    parser.add_argument(
        "--pooling",
        choices=["mean", "cls"],
        default="mean",
        help="Pooling strategy used to convert token states into text embeddings.",
    )
    parser.add_argument(
        "--top-k",
        type=int,
        default=3,
        help="Number of nearest neighbors to show per query.",
    )
    parser.add_argument(
        "--max-length",
        type=int,
        default=256,
        help="Maximum tokenizer sequence length.",
    )
    parser.add_argument(
        "--run-benchmark",
        action="store_true",
        help="Evaluate retrieval metrics on the labeled benchmark dataset.",
    )
    parser.add_argument(
        "--benchmark-file",
        type=Path,
        default=DEFAULT_BENCHMARK_FILE,
        help="Path to a labeled benchmark JSON file.",
    )
    return parser.parse_args()


def load_corpus(corpus_file: Path | None) -> list[SearchDocument]:
    """Load the search corpus from disk or fall back to the built-in demo set.

    Args:
        corpus_file: Optional text file with one document per line.

    Returns:
        Search documents with stable identifiers.

    Raises:
        FileNotFoundError: If ``corpus_file`` was provided and does not exist.
        ValueError: If the resolved corpus has no usable documents.
    """

    if corpus_file is None:
        corpus = [
            SearchDocument(document_id=f"demo_{index:02d}", text=text)
            for index, text in enumerate(DEFAULT_CORPUS, start=1)
        ]
    else:
        if not corpus_file.exists():
            raise FileNotFoundError(f"Corpus file not found: {corpus_file}")
        corpus = [
            SearchDocument(document_id=f"file_{index:02d}", text=line.strip())
            for index, line in enumerate(corpus_file.read_text().splitlines(), start=1)
            if line.strip()
        ]

    if not corpus:
        raise ValueError("The search corpus is empty.")
    return corpus


def load_benchmark(benchmark_file: Path) -> tuple[list[SearchDocument], list[SearchQuery]]:
    """Load a labeled semantic-search benchmark from JSON.

    Args:
        benchmark_file: JSON file containing ``documents`` and ``queries``.

    Returns:
        A tuple of benchmark documents and benchmark queries.

    Raises:
        FileNotFoundError: If the benchmark file does not exist.
        ValueError: If the JSON payload is missing required fields.
    """

    if not benchmark_file.exists():
        raise FileNotFoundError(f"Benchmark file not found: {benchmark_file}")

    payload = json.loads(benchmark_file.read_text())
    raw_documents = payload.get("documents", [])
    raw_queries = payload.get("queries", [])
    if not raw_documents or not raw_queries:
        raise ValueError("Benchmark JSON must contain non-empty 'documents' and 'queries' arrays.")

    documents = [
        SearchDocument(document_id=document["id"], text=document["text"])
        for document in raw_documents
    ]
    queries = [
        SearchQuery(
            query_id=query["id"],
            text=query["text"],
            relevant_ids=set(query["relevant_ids"]),
        )
        for query in raw_queries
    ]
    return documents, queries


def resolve_queries(queries: list[str]) -> list[str]:
    """Return user-supplied queries or the built-in demo queries.

    Args:
        queries: Queries supplied on the command line.

    Returns:
        User queries when present, otherwise the default demo queries.
    """

    return queries or DEFAULT_QUERIES


def pool_hidden_states(hidden_states: torch.Tensor, attention_mask: torch.Tensor, pooling: str) -> torch.Tensor:
    """Pool token-level hidden states into one vector per input text.

    Args:
        hidden_states: Tensor of shape ``(batch, seq_len, hidden_size)``.
        attention_mask: Tensor of shape ``(batch, seq_len)`` marking real tokens.
        pooling: Pooling strategy, either ``"mean"`` or ``"cls"``.

    Returns:
        Tensor of shape ``(batch, hidden_size)``.

    Raises:
        ValueError: If an unsupported pooling mode is requested.
    """

    match pooling:
        case "cls":
            return hidden_states[:, 0, :]
        case "mean":
            mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            masked_hidden = hidden_states * mask
            token_counts = mask.sum(dim=1).clamp_min(1.0)
            return masked_hidden.sum(dim=1) / token_counts
        case _:
            raise ValueError(f"Unsupported pooling mode: {pooling}")


def encode_texts(
    model,
    tokenizer,
    texts: list[str],
    *,
    pooling: str,
    max_length: int,
) -> torch.Tensor:
    """Encode a batch of texts into normalized embeddings.

    Args:
        model: Loaded EmbBERT pretraining wrapper.
        tokenizer: Tokenizer paired with the checkpoint.
        texts: Input texts to embed.
        pooling: Pooling strategy used to collapse token representations.
        max_length: Maximum sequence length used during tokenization.

    Returns:
        L2-normalized embedding tensor with shape ``(len(texts), hidden_size)``.
    """

    encoded = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )

    with torch.no_grad():
        hidden_states = model.model(encoded["input_ids"], encoded["attention_mask"])
        pooled = pool_hidden_states(hidden_states, encoded["attention_mask"], pooling)
        return F.normalize(pooled, p=2, dim=1)


def search_corpus(
    query_embeddings: torch.Tensor,
    corpus_embeddings: torch.Tensor,
    corpus: list[SearchDocument],
    *,
    top_k: int,
) -> list[list[SearchHit]]:
    """Return the top semantic-search hits for each query embedding.

    Args:
        query_embeddings: Normalized query embeddings.
        corpus_embeddings: Normalized document embeddings.
        corpus: Search documents aligned with ``corpus_embeddings``.
        top_k: Maximum number of hits per query.

    Returns:
        Ranked search hits for each query in order.
    """

    similarity = query_embeddings @ corpus_embeddings.T
    k = min(top_k, len(corpus))
    scores, indices = torch.topk(similarity, k=k, dim=1)

    return [
        [
            SearchHit(
                rank=rank + 1,
                score=float(score),
                document_id=corpus[index].document_id,
                text=corpus[index].text,
            )
            for rank, (score, index) in enumerate(zip(row_scores.tolist(), row_indices.tolist(), strict=True))
        ]
        for row_scores, row_indices in zip(scores, indices, strict=True)
    ]


def reciprocal_rank(hits: list[SearchHit], relevant_ids: set[str]) -> float:
    """Compute reciprocal rank for one ranked result list.

    Args:
        hits: Ranked retrieval results for a single query.
        relevant_ids: Relevant document identifiers for that query.

    Returns:
        Reciprocal rank, or ``0.0`` when no relevant hit is retrieved.
    """

    for hit in hits:
        if hit.document_id in relevant_ids:
            return 1.0 / hit.rank
    return 0.0


def recall_at_k(hits: list[SearchHit], relevant_ids: set[str], k: int) -> float:
    """Compute recall at ``k`` for one query.

    Args:
        hits: Ranked retrieval results for a single query.
        relevant_ids: Relevant document identifiers for that query.
        k: Retrieval cutoff.

    Returns:
        Fraction of relevant documents retrieved within the top ``k``.
    """

    if not relevant_ids:
        return 0.0
    retrieved = {hit.document_id for hit in hits[:k] if hit.document_id in relevant_ids}
    return len(retrieved) / len(relevant_ids)


def hit_rate_at_k(hits: list[SearchHit], relevant_ids: set[str], k: int) -> float:
    """Compute hit rate at ``k`` for one query.

    Args:
        hits: Ranked retrieval results for a single query.
        relevant_ids: Relevant document identifiers for that query.
        k: Retrieval cutoff.

    Returns:
        ``1.0`` if any relevant document appears in the top ``k``, else ``0.0``.
    """

    return 1.0 if any(hit.document_id in relevant_ids for hit in hits[:k]) else 0.0


def evaluate_benchmark(
    queries: list[SearchQuery],
    results: list[list[SearchHit]],
    *,
    ks: list[int],
) -> dict[str, float]:
    """Aggregate retrieval metrics across the benchmark queries.

    Args:
        queries: Benchmark queries with relevance judgments.
        results: Ranked retrieval hits aligned with ``queries``.
        ks: Retrieval cutoffs used for recall and hit-rate reporting.

    Returns:
        Dictionary of aggregate retrieval metrics.
    """

    metrics: dict[str, float] = {
        "query_count": float(len(queries)),
        "mrr": sum(reciprocal_rank(hits, query.relevant_ids) for query, hits in zip(queries, results, strict=True))
        / len(queries),
    }

    for k in ks:
        metrics[f"recall@{k}"] = sum(
            recall_at_k(hits, query.relevant_ids, k) for query, hits in zip(queries, results, strict=True)
        ) / len(queries)
        metrics[f"hit_rate@{k}"] = sum(
            hit_rate_at_k(hits, query.relevant_ids, k) for query, hits in zip(queries, results, strict=True)
        ) / len(queries)

    return metrics


def print_results(queries: list[str], results: list[list[SearchHit]]) -> None:
    """Print semantic-search results in a compact human-readable form.

    Args:
        queries: Queries that produced the result sets.
        results: Ranked hits for each query.
    """

    for query, hits in zip(queries, results, strict=True):
        print(f"\nQuery: {query}")
        for hit in hits:
            print(f"  {hit.rank}. score={hit.score:.4f}  {hit.document_id}  {hit.text}")


def print_benchmark_results(queries: list[SearchQuery], results: list[list[SearchHit]], metrics: dict[str, float]) -> None:
    """Print benchmark metrics and per-query result summaries.

    Args:
        queries: Benchmark queries with relevance judgments.
        results: Ranked retrieval hits aligned with ``queries``.
        metrics: Aggregate retrieval metrics.
    """

    print("\nBenchmark summary:")
    print(f"  queries: {int(metrics['query_count'])}")
    for key in sorted(metric for metric in metrics if metric != "query_count"):
        print(f"  {key}: {metrics[key]:.4f}")

    print("\nPer-query top hits:")
    for query, hits in zip(queries, results, strict=True):
        print(f"\nQuery [{query.query_id}]: {query.text}")
        print(f"  relevant: {', '.join(sorted(query.relevant_ids))}")
        for hit in hits:
            is_relevant = "relevant" if hit.document_id in query.relevant_ids else "non-relevant"
            print(f"  {hit.rank}. score={hit.score:.4f}  {hit.document_id}  {is_relevant}")


def run_interactive_search(args: argparse.Namespace) -> None:
    """Run ad hoc semantic search over the demo corpus or a user corpus.

    Args:
        args: Parsed command-line arguments.
    """

    queries = resolve_queries(args.query)
    corpus = load_corpus(args.corpus_file)

    model, tokenizer = load_pretraining_checkpoint(args.checkpoint)
    model.eval()

    corpus_embeddings = encode_texts(
        model,
        tokenizer,
        [document.text for document in corpus],
        pooling=args.pooling,
        max_length=args.max_length,
    )
    query_embeddings = encode_texts(
        model,
        tokenizer,
        queries,
        pooling=args.pooling,
        max_length=args.max_length,
    )

    print(f"Checkpoint: {args.checkpoint}")
    print(f"Pooling: {args.pooling}")
    print(f"Corpus size: {len(corpus)}")
    print(f"Embedding dimension: {corpus_embeddings.shape[1]}")

    results = search_corpus(
        query_embeddings,
        corpus_embeddings,
        corpus,
        top_k=args.top_k,
    )
    print_results(queries, results)


def run_benchmark(args: argparse.Namespace) -> None:
    """Run retrieval evaluation on the labeled benchmark dataset.

    Args:
        args: Parsed command-line arguments.
    """

    corpus, queries = load_benchmark(args.benchmark_file)
    model, tokenizer = load_pretraining_checkpoint(args.checkpoint)
    model.eval()

    corpus_embeddings = encode_texts(
        model,
        tokenizer,
        [document.text for document in corpus],
        pooling=args.pooling,
        max_length=args.max_length,
    )
    query_embeddings = encode_texts(
        model,
        tokenizer,
        [query.text for query in queries],
        pooling=args.pooling,
        max_length=args.max_length,
    )

    ks = sorted({1, min(3, len(corpus)), min(5, len(corpus))})
    results = search_corpus(
        query_embeddings,
        corpus_embeddings,
        corpus,
        top_k=max(ks),
    )
    metrics = evaluate_benchmark(queries, results, ks=ks)

    print(f"Checkpoint: {args.checkpoint}")
    print(f"Pooling: {args.pooling}")
    print(f"Benchmark file: {args.benchmark_file}")
    print(f"Corpus size: {len(corpus)}")
    print(f"Query count: {len(queries)}")
    print(f"Embedding dimension: {corpus_embeddings.shape[1]}")
    print_benchmark_results(queries, results, metrics)


def main() -> None:
    """Dispatch to either interactive search or benchmark evaluation."""

    args = parse_args()
    if args.run_benchmark:
        run_benchmark(args)
    else:
        run_interactive_search(args)


if __name__ == "__main__":
    main()