Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 42 additions & 10 deletions python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import warnings

from .utils import (
add_jitter,
is_l2_normalized,
groundtruth_neighbors_filename,
memmap_bin_file,
offset_neighbor_indices,
Expand Down Expand Up @@ -111,6 +113,25 @@ def choose_random_queries(dataset, n_queries):
return dataset[query_idx, :]


def choose_random_queries_with_jitter(dataset, n_queries, seed=12345):
    """Build a query set from jittered copies of randomly chosen dataset rows.

    ``n_queries`` distinct row indices are drawn with a generator seeded by
    ``seed``; the selected rows are copied to float32 and handed to
    ``add_jitter``. Re-normalization is requested exactly when
    ``is_l2_normalized`` reports that the selected rows are unit-norm (the
    check runs on the selected rows, not on the full dataset).

    Parameters
    ----------
    dataset : array-like, indexable as ``dataset[idx, :]`` and exposing
        ``shape``
    n_queries : int
        Number of rows to draw (without replacement).
    seed : int
        Seed for the random generator used for both row selection and noise.

    Returns
    -------
    The array returned by ``add_jitter`` for the selected rows.
    """
    import numpy as _np

    print("Choosing random vectors from dataset and jittering with noise")
    generator = _np.random.default_rng(seed)

    row_ids = generator.choice(
        dataset.shape[0], size=n_queries, replace=False
    )
    # Ascending indices make the memmap read sequential instead of
    # random-access.
    row_ids.sort()

    picked = dataset[row_ids, :].astype(_np.float32, copy=True)
    return add_jitter(picked, generator, is_l2_normalized(picked))

Comment on lines +116 to +133

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

HIGH: random-jitter writes float32 query data to a file whose suffix is derived from the dataset dtype.

Line 128/Line 132 produce float32 jittered queries, but Line 389 still picks suffix from dataset.dtype. For non-float datasets, this can write float32 payloads into .u8bin/.i8bin filenames, which later decode with the wrong dtype.

Proposed fix
-        queries_filename = os.path.join(
-            args.output, "queries" + suffix_from_dtype(dtype)
-        )
+        query_out_dtype = (
+            queries.dtype if args.queries == "random-jitter" else dtype
+        )
+        queries_filename = os.path.join(
+            args.output, "queries" + suffix_from_dtype(query_out_dtype)
+        )

As per coding guidelines, “Prevent silent data corruption from type coercion and validate that array type coercions are handled safely.”

Also applies to: 388-390

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py` around lines
116 - 133, The function `choose_random_queries_with_jitter` always returns
float32 jittered query data (via the return statement calling `add_jitter` on
the float32-casted `sampled` array), but the code that uses this function's
output (around line 389) determines the filename suffix using the original
`dataset.dtype` instead of the actual output dtype. This causes float32 data to
be written to filenames with the wrong type suffix (e.g., `.u8bin` for uint8
inputs), leading to incorrect decoding later. Update the code that generates the
output filename suffix to use float32 as the dtype instead of `dataset.dtype`,
since the jittered queries are always float32 regardless of the input dataset
type.

Source: Coding guidelines


def cpu_search(dataset, queries, k, metric="squeclidean"):
"""
Find the k nearest neighbors for each query point in the dataset using the
Expand Down Expand Up @@ -235,18 +256,22 @@ def main():
"The input and output files are in big-ann-benchmark's binary format.",
epilog="""Example usage
# With existing query file
python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--output=groundtruth_dir --queries=/dataset/query.public.10K.fbin

# With randomly generated queries
python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
fbin --output=groundtruth_dir --queries=random --n_queries=10000
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--output=groundtruth_dir --queries=random --n_queries=10000

# Using only a subset of the dataset. Define queries by randomly
# selecting vectors from the (subset of the) dataset.
python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
fbin --nrows=2000000 --cols=128 --output=groundtruth_dir \
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--rows=2000000 --cols=128 --output=groundtruth_dir \
--queries=random-choice --n_queries=10000

# Jittered queries (following the logic of cuvs_bench.synthesize_dataset)
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--output=groundtruth_dir --queries=random-jitter --n_queries=10000
""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
Expand All @@ -256,9 +281,12 @@ def main():
"--queries",
type=str,
default="random",
help="Queries file name, or one of 'random-choice' or 'random' "
"(default). 'random-choice': select n_queries vectors from the input "
"dataset. 'random': generate n_queries as uniform random numbers.",
help="Queries file name, or one of 'random-choice', 'random-jitter', "
"or 'random' (default). 'random-choice': select n_queries vectors "
"from the input dataset. 'random-jitter': same as 'random-choice', "
"but add std-relative Gaussian noise to each query and re-normalize "
"if the dataset rows are unit-norm. 'random': generate n_queries "
"as uniform random numbers.",
)
parser.add_argument(
"--output",
Expand Down Expand Up @@ -341,7 +369,7 @@ def main():
if len(args.output) > 0:
os.makedirs(args.output, exist_ok=True)

if args.queries == "random" or args.queries == "random-choice":
if args.queries in {"random", "random-choice", "random-jitter"}:
if args.n_queries is None:
raise RuntimeError(
"n_queries must be given to generate random queries"
Expand All @@ -352,6 +380,10 @@ def main():
)
elif args.queries == "random-choice":
queries = choose_random_queries(dataset, args.n_queries)
elif args.queries == "random-jitter":
queries = choose_random_queries_with_jitter(
dataset, args.n_queries
)

queries_filename = os.path.join(
args.output, "queries" + suffix_from_dtype(dtype)
Expand Down
37 changes: 37 additions & 0 deletions python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,43 @@
from cuvs_bench._bin_format import read_bin_header, write_bin_header


def is_l2_normalized(
    data,
    sample_size: int = 10_000,
    tol: float = 1e-2,
    seed: int = 0,
) -> bool:
    """Report whether a random sample of rows of ``data`` has unit L2 norm.

    At most ``sample_size`` distinct rows are drawn with a generator seeded
    by ``seed``. Each drawn row is converted to float32 and its norm compared
    against 1.

    Parameters
    ----------
    data : array-like supporting ``len`` and integer-array row indexing
    sample_size : int
        Upper bound on the number of rows inspected.
    tol : float
        A row counts as unit-norm when ``abs(norm - 1) < tol``.
    seed : int
        Seed for the row-sampling generator.

    Returns
    -------
    bool
        ``True`` only if every inspected row is within ``tol`` of unit norm;
        ``False`` for empty input.
    """
    total_rows = len(data)
    if total_rows == 0:
        return False

    picker = np.random.default_rng(seed)
    chosen = picker.choice(
        total_rows, size=min(sample_size, total_rows), replace=False
    )

    rows = data[chosen].astype(np.float32)
    deviation = np.abs(np.linalg.norm(rows, axis=1) - 1.0)
    return bool((deviation < tol).all())


def add_jitter(
    queries: np.ndarray,
    rng: np.random.Generator,
    normalize: bool,
) -> np.ndarray:
    """Perturb ``queries`` with zero-mean Gaussian noise.

    The noise standard deviation is one tenth of the standard deviation
    taken over all elements of ``queries``.

    Parameters
    ----------
    queries : np.ndarray
        Query vectors, one per row.
    rng : np.random.Generator
        Source of the Gaussian noise.
    normalize : bool
        When true, each perturbed row is divided by its L2 norm (floored at
        ``1e-8`` to avoid division by zero).

    Returns
    -------
    np.ndarray
        The perturbed (and optionally re-normalized) queries as float32.
    """
    sigma = 0.1 * float(np.std(queries))
    noise = rng.normal(0, sigma, queries.shape).astype(np.float32)
    jittered = queries + noise

    if not normalize:
        return jittered.astype(np.float32)

    row_norms = np.linalg.norm(jittered, axis=1, keepdims=True)
    return (jittered / np.maximum(row_norms, 1e-8)).astype(np.float32)


def dtype_from_filename(filename):
ext = os.path.splitext(filename)[1]
if ext == ".fbin":
Expand Down
Loading
Loading