Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 38 additions & 11 deletions python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import warnings

from .utils import (
add_jitter,
groundtruth_neighbors_filename,
memmap_bin_file,
offset_neighbor_indices,
Expand Down Expand Up @@ -111,6 +112,22 @@ def choose_random_queries(dataset, n_queries):
return dataset[query_idx, :]


def choose_random_queries_with_jitter(dataset, n_queries, seed=12345):
    """Sample distinct dataset rows and perturb them with Gaussian noise.

    Parameters
    ----------
    dataset
        Two-dimensional array-like exposing ``.shape`` and supporting
        row selection with an integer index array.
    n_queries
        Number of rows to draw; rows are drawn without replacement.
    seed
        Seed of the NumPy generator that drives both the row selection
        and the noise passed on to ``add_jitter``.

    Returns
    -------
    The result of ``add_jitter`` applied (with ``normalize=False``) to a
    float32 copy of the selected rows.
    """
    import numpy as _np

    print("Choosing random vectors from dataset and jittering with noise")
    generator = _np.random.default_rng(seed)
    total_rows = dataset.shape[0]
    picked_rows = generator.choice(total_rows, size=n_queries, replace=False)
    # Visit rows in ascending order so a memmap-backed dataset is read
    # sequentially instead of with random access.
    picked_rows = _np.sort(picked_rows)
    selected = dataset[picked_rows, :]
    selected_f32 = selected.astype(_np.float32, copy=True)

    return add_jitter(selected_f32, generator, normalize=False)

Comment thread
jinsolp marked this conversation as resolved.

def cpu_search(dataset, queries, k, metric="squeclidean"):
"""
Find the k nearest neighbors for each query point in the dataset using the
Expand Down Expand Up @@ -235,18 +252,22 @@ def main():
"The input and output files are in big-ann-benchmark's binary format.",
epilog="""Example usage
# With existing query file
python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--output=groundtruth_dir --queries=/dataset/query.public.10K.fbin

# With randomly generated queries
python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
fbin --output=groundtruth_dir --queries=random --n_queries=10000
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--output=groundtruth_dir --queries=random --n_queries=10000

# Using only a subset of the dataset. Define queries by randomly
# selecting vectors from the (subset of the) dataset.
python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\
fbin --nrows=2000000 --cols=128 --output=groundtruth_dir \
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--rows=2000000 --cols=128 --output=groundtruth_dir \
--queries=random-choice --n_queries=10000

# Jittered queries (following the logic of cuvs_bench.synthesize_dataset)
python -m cuvs_bench.generate_groundtruth /dataset/base.fbin \
--output=groundtruth_dir --queries=random-jitter --n_queries=10000
""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
Expand All @@ -256,9 +277,11 @@ def main():
"--queries",
type=str,
default="random",
help="Queries file name, or one of 'random-choice' or 'random' "
"(default). 'random-choice': select n_queries vectors from the input "
"dataset. 'random': generate n_queries as uniform random numbers.",
help="Queries file name, or one of 'random-choice', 'random-jitter', "
"or 'random' (default). 'random-choice': select n_queries vectors "
"from the input dataset. 'random-jitter': same as 'random-choice', "
"but add std-relative Gaussian noise to each query. 'random': generate "
"n_queries as uniform random numbers.",
)
parser.add_argument(
"--output",
Expand Down Expand Up @@ -341,7 +364,7 @@ def main():
if len(args.output) > 0:
os.makedirs(args.output, exist_ok=True)

if args.queries == "random" or args.queries == "random-choice":
if args.queries in {"random", "random-choice", "random-jitter"}:
if args.n_queries is None:
raise RuntimeError(
"n_queries must be given to generate random queries"
Expand All @@ -352,9 +375,13 @@ def main():
)
elif args.queries == "random-choice":
queries = choose_random_queries(dataset, args.n_queries)
elif args.queries == "random-jitter":
queries = choose_random_queries_with_jitter(
dataset, args.n_queries
)

queries_filename = os.path.join(
args.output, "queries" + suffix_from_dtype(dtype)
args.output, "queries" + suffix_from_dtype(queries.dtype)
)
print("Writing queries file", queries_filename)
write_bin(queries_filename, queries)
Expand Down
16 changes: 16 additions & 0 deletions python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,22 @@
from cuvs_bench._bin_format import read_bin_header, write_bin_header


def add_jitter(
    queries: np.ndarray,
    rng: np.random.Generator,
    normalize: bool,
) -> np.ndarray:
    """Return a float32 array of ``queries`` perturbed by Gaussian noise.

    The noise is zero-mean with a standard deviation equal to 10% of the
    standard deviation computed over all elements of ``queries``. The
    input array is not modified.

    Parameters
    ----------
    queries
        Array of query vectors; when ``normalize`` is true it must be
        two-dimensional because norms are taken along axis 1.
    rng
        NumPy random generator used to draw the noise.
    normalize
        If true, divide each jittered row by its L2 norm.

    Returns
    -------
    The jittered (and optionally row-normalized) vectors as float32.
    """
    sigma = 0.1 * float(np.std(queries))
    noise = rng.normal(0, sigma, queries.shape).astype(np.float32)
    jittered = queries + noise

    if not normalize:
        return jittered.astype(np.float32)

    row_norms = np.linalg.norm(jittered, axis=1, keepdims=True)
    # Clamp the divisor so an all-zero row does not divide by zero.
    safe_norms = np.maximum(row_norms, 1e-8)
    return (jittered / safe_norms).astype(np.float32)


def dtype_from_filename(filename):
ext = os.path.splitext(filename)[1]
if ext == ".fbin":
Expand Down
Loading
Loading