EleutherAI
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
Lines changed: 6 additions & 6 deletions
diff --git a/README.md b/README.md
Lines changed: 12 additions & 6 deletions
diff --git a/bergson/__init__.py b/bergson/__init__.py
Lines changed: 16 additions & 5 deletions
diff --git a/bergson/__main__.py b/bergson/__main__.py
Lines changed: 59 additions & 26 deletions
diff --git a/bergson/build.py b/bergson/build.py
Lines changed: 32 additions & 8 deletions
@@ -14,9 +14,9 @@ build:
 sphinx:
    configuration: docs/conf.py
 
-# Optionally, but recommended,
-# declare the Python requirements required to build your documentation
-# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
-# python:
-#    install:
-#    - requirements: docs/requirements.txt
+python:
+  install:
+    - method: pip
+      path: .
+      extra_requirements:
+        - dev
@@ -6,7 +6,7 @@ We view attribution as a counterfactual question: **_If we "unlearned" this trai
 ## Core features
 
 - Gradient store for serial queries. We provide collection-time gradient compression for efficient storage, and integrate with FAISS for fast KNN search over large stores.
-- On-the-fly queries. Query uncompressed gradients without disk I/O overhead via a single pass over a dataset with a set of precomputed query gradients.
+- On-the-fly queries. Query gradients without compression or disk I/O overhead via a single pass over a dataset with a set of precomputed query gradients.
   - Experiment with multiple query strategies based on [LESS](https://arxiv.org/pdf/2402.04333).
 - Train‑time gradient collection. Capture gradients produced during training with a ~17% performance overhead.
 - Scalable. We use [FSDP2](https://docs.pytorch.org/tutorials/intermediate/FSDP_tutorial.html), BitsAndBytes, and other performance optimizations to support large models, datasets, and clusters.
@@ -39,15 +39,15 @@ pip install bergson
 # Quickstart
 
 ```
-python -m bergson build runs/quickstart --model EleutherAI/pythia-14m --dataset NeelNanda/pile-10k --truncation
+bergson build runs/quickstart --model EleutherAI/pythia-14m --dataset NeelNanda/pile-10k --truncation
 ```
 
 # Usage
 
 You can build an index of gradients for each training sample from the command line, using `bergson` as a CLI tool:
 
 ```bash
-python -m bergson build <output_path> --model <model_name> --dataset <dataset_name>
+bergson build <output_path> --model <model_name> --dataset <dataset_name>
 ```
 
 This will create a directory at `<output_path>` containing the gradients for each training sample in the specified dataset. The `--model` and `--dataset` arguments should be compatible with the Hugging Face `transformers` library. By default it assumes that the dataset has a `text` column, but you can specify other columns using `--prompt_column` and optionally `--completion_column`. The `--help` flag will show you all available options.
@@ -61,10 +61,16 @@ At the lowest level of abstraction, the `GradientCollector` context manager allo
 
 ## On-the-fly Query
 
-You can query a large dataset without first building an index, by specifying a previously built index to query against:
+You can score a large dataset against a previously built query index without saving its gradients to disk:
 
 ```bash
-python -m bergson query <output_path> --model <model_name> --dataset <dataset_name> --query_path <existing_index_path> --scores_path <output_path> --score mean --save_index False
+bergson score <output_path> --model <model_name> --dataset <dataset_name> --query_path <existing_index_path> --score mean --projection_dim 0
+```
+
+We provide a utility to reduce a dataset into its mean or sum query gradient, for use as a query index:
+
+```bash
+bergson reduce <output_path> --model <model_name> --dataset <dataset_name> --method mean --unit_normalize --projection_dim 0
 ```
 
 ## Index Query
@@ -144,7 +150,7 @@ collect_gradients(
 Where a reward signal is available we compute gradients using a weighted advantage estimate based on Dr. GRPO:
 
 ```bash
-python -m bergson build <output_path> --model <model_name> --dataset <dataset_name> --reward_column <reward_column_name>
+bergson build <output_path> --model <model_name> --dataset <dataset_name> --reward_column <reward_column_name>
 ```
 
 # Development
 
@@ -1,12 +1,20 @@
 __version__ = "0.2.0"
 
-from .attributor import Attributor
 from .collection import collect_gradients
-from .data import AttentionConfig, DataConfig, IndexConfig, load_gradients
-from .faiss_index import FaissConfig
+from .config import (
+    AttentionConfig,
+    DataConfig,
+    IndexConfig,
+    QueryConfig,
+    ReduceConfig,
+    ScoreConfig,
+)
+from .data import load_gradients
 from .gradcheck import FiniteDiff
 from .gradients import GradientCollector, GradientProcessor
-from .score_writer import MemmapScoreWriter
+from .query.attributor import Attributor
+from .query.faiss_index import FaissConfig
+from .score.scorer import Scorer
 
 __all__ = [
     "collect_gradients",
@@ -19,5 +27,8 @@
     "IndexConfig",
     "DataConfig",
     "AttentionConfig",
-    "MemmapScoreWriter",
+    "Scorer",
+    "ScoreConfig",
+    "ReduceConfig",
+    "QueryConfig",
 ]
@@ -1,68 +1,101 @@
-import os
 from dataclasses import dataclass
 from typing import Optional, Union
 
 from simple_parsing import ArgumentParser, ConflictResolution
 
 from .build import build
-from .data import IndexConfig, QueryConfig
-from .query import query
+from .config import IndexConfig, QueryConfig, ReduceConfig, ScoreConfig
+from .query.query_index import query
+from .reduce import reduce
+from .score.score import score_dataset
 
 
 @dataclass
 class Build:
-    """Build the gradient dataset."""
+    """Build a gradient index."""
 
-    cfg: IndexConfig
+    index_cfg: IndexConfig
 
     def execute(self):
-        """Build the gradient dataset."""
-        if not self.cfg.save_index and self.cfg.skip_preconditioners:
-            raise ValueError(
-                "Either save_index must be True or skip_preconditioners must be False"
+        """Build the gradient index."""
+        if self.index_cfg.skip_index and self.index_cfg.skip_preconditioners:
+            raise ValueError("Either skip_index or skip_preconditioners must be False")
+
+        build(self.index_cfg)
+
+
+@dataclass
+class Reduce:
+    """Reduce a gradient index."""
+
+    index_cfg: IndexConfig
+
+    reduce_cfg: ReduceConfig
+
+    def execute(self):
+        """Reduce a gradient index."""
+        if self.index_cfg.projection_dim != 0:
+            print(
+                "Warning: projection_dim is not 0. "
+                "Compressed gradients will be reduced."
             )
 
-        build(self.cfg)
+        reduce(self.index_cfg, self.reduce_cfg)
 
 
 @dataclass
-class Query:
-    """Query the gradient dataset."""
+class Score:
+    """Score a dataset against an existing gradient index."""
 
-    query_cfg: QueryConfig
+    score_cfg: ScoreConfig
 
     index_cfg: IndexConfig
 
     def execute(self):
-        """Query the gradient dataset."""
-        assert self.query_cfg.scores_path
-        assert self.query_cfg.query_path
-
-        if os.path.exists(self.index_cfg.run_path) and self.index_cfg.save_index:
-            raise ValueError(
-                "Index path already exists and save_index is True - "
-                "running this query will overwrite the existing gradients. "
-                "If you meant to query the existing gradients use "
-                "Attributor instead."
+        """Score a dataset against an existing gradient index."""
+        assert self.score_cfg.query_path
+
+        if self.index_cfg.projection_dim != 0:
+            print(
+                "Warning: projection_dim is not 0. "
+                "Compressed gradients will be scored."
             )
 
-        query(self.index_cfg, self.query_cfg)
+        score_dataset(self.index_cfg, self.score_cfg)
+
+
+@dataclass
+class Query:
+    """Query an existing gradient index."""
+
+    query_cfg: QueryConfig
+
+    def execute(self):
+        """Query an existing gradient index."""
+        query(self.query_cfg)
 
 
 @dataclass
 class Main:
     """Routes to the subcommands."""
 
-    command: Union[Build, Query]
+    command: Union[Build, Query, Reduce, Score]
 
     def execute(self):
         """Run the script."""
         self.command.execute()
 
 
-def main(args: Optional[list[str]] = None):
+def get_parser():
+    """Get the argument parser. Used for documentation generation."""
     parser = ArgumentParser(conflict_resolution=ConflictResolution.EXPLICIT)
     parser.add_arguments(Main, dest="prog")
+    return parser
+
+
+def main(args: Optional[list[str]] = None):
+    """Parse CLI arguments and dispatch to the selected subcommand."""
+    parser = get_parser()
     prog: Main = parser.parse_args(args=args).prog
     prog.execute()
 
 
@@ -10,7 +10,8 @@
 from tqdm.auto import tqdm
 
 from bergson.collection import collect_gradients
-from bergson.data import IndexConfig, allocate_batches
+from bergson.config import IndexConfig
+from bergson.data import allocate_batches
 from bergson.utils import assert_type
 from bergson.worker_utils import setup_model_and_peft
 
@@ -24,6 +25,20 @@ def build_worker(
     cfg: IndexConfig,
     ds: Dataset | IterableDataset,
 ):
+    """
+    Build worker executed per rank to collect gradients to populate the index.
+
+    Parameters
+    ----------
+    rank : int
+        Distributed rank / GPU ID for this worker.
+    world_size : int
+        Total number of workers participating in the run.
+    cfg : IndexConfig
+        Specifies the model, tokenizer, PEFT adapters, and other settings.
+    ds : Dataset | IterableDataset
+        The entire dataset to be indexed. A subset is assigned to each worker.
+    """
     torch.cuda.set_device(rank)
 
     # These should be set by the main process
@@ -85,13 +100,22 @@ def flush(kwargs):
             processor.save(cfg.partial_run_path)
 
 
-def build(cfg: IndexConfig):
-    cfg.partial_run_path.mkdir(parents=True, exist_ok=True)
-    with (cfg.partial_run_path / "index_config.json").open("w") as f:
-        json.dump(asdict(cfg), f, indent=2)
+def build(index_cfg: IndexConfig):
+    """
+    Build a gradient index by distributing work across all available GPUs.
 
-    ds = setup_data_pipeline(cfg)
+    Parameters
+    ----------
+    index_cfg : IndexConfig
+        Specifies the run path, dataset, model, tokenizer, PEFT adapters,
+        and many other gradient collection settings.
+    """
+    index_cfg.partial_run_path.mkdir(parents=True, exist_ok=True)
+    with (index_cfg.partial_run_path / "index_config.json").open("w") as f:
+        json.dump(asdict(index_cfg), f, indent=2)
 
-    launch_distributed_run("build", build_worker, [cfg, ds])
+    ds = setup_data_pipeline(index_cfg)
 
-    shutil.move(cfg.partial_run_path, cfg.run_path)
+    launch_distributed_run("build", build_worker, [index_cfg, ds])
+
+    shutil.move(index_cfg.partial_run_path, index_cfg.run_path)