BenjaminIsaac0111
diff --git a/‎.gitattributes‎
Lines changed: 20 additions & 0 deletions b/‎.gitattributes‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 67 additions & 2 deletions b/‎.gitignore‎
Lines changed: 67 additions & 2 deletions
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 25 additions & 0 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 42 additions & 26 deletions b/‎README.md‎
Lines changed: 42 additions & 26 deletions
diff --git a/‎config.yaml‎
Lines changed: 9 additions & 2 deletions b/‎config.yaml‎
Lines changed: 9 additions & 2 deletions
@@ -1,2 +1,22 @@
+# Force LF line endings for all text files
 * text=auto eol=lf
 
+# Explicitly handle Python files
+*.py text eol=lf
+
+# Handle configuration and documentation files
+*.yaml text eol=lf
+*.json text eol=lf
+*.toml text eol=lf
+*.md text eol=lf
+
+# Mark data artifacts as binary to prevent corruption
+*.csv binary
+*.sqlite binary
+*.h5 binary
+*.pth binary
+*.pt binary
+*.pkl binary
+
+# Log files are plain text; normalise their line endings to LF
+*.log text eol=lf
@@ -182,9 +182,9 @@ cython_debug/
 .abstra/
 
 # Visual Studio Code
-#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
 #  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
-#  and can be added to the global gitignore or merged into this file. However, if you prefer, 
+#  and can be added to the global gitignore or merged into this file. However, if you prefer,
 #  you could uncomment the following to ignore the entire vscode folder
 # .vscode/
 
@@ -213,9 +213,74 @@ checkpoints/
 checkpoints_full/
 results/
 results_long_run/
+runs/
+vscode/
+.agent/
 
 # Images
 *.png
 *.jpg
 *.jpeg
 *.svg
+docs/research_proposal.tex
+.vscode/settings.json
+hest_data/.gitattributes
+hest_data/HEST_v1_3_0.csv
+hest_data/README.md
+hest_data/cellvit_seg/INT1_cellvit_seg.geojson.zip
+hest_data/cellvit_seg/INT1_cellvit_seg.parquet
+hest_data/cellvit_seg/INT10_cellvit_seg.geojson.zip
+hest_data/cellvit_seg/INT10_cellvit_seg.parquet
+hest_data/cellvit_seg/INT11_cellvit_seg.geojson.zip
+hest_data/cellvit_seg/INT11_cellvit_seg.parquet
+hest_data/cellvit_seg/INT12_cellvit_seg.geojson.zip
+hest_data/cellvit_seg/INT12_cellvit_seg.parquet
+hest_data/cellvit_seg/INT13_cellvit_seg.geojson.zip
+hest_data/cellvit_seg/INT13_cellvit_seg.parquet
+hest_data/cellvit_seg/INT16_cellvit_seg.geojson.zip
+hest_data/cellvit_seg/INT16_cellvit_seg.parquet
+hest_data/cellvit_seg/INT19_cellvit_seg.geojson.zip
+hest_data/cellvit_seg/INT19_cellvit_seg.parquet
+hest_data/cellvit_seg/INT20_cellvit_seg.geojson.zip
+hest_data/cellvit_seg/INT20_cellvit_seg.parquet
+hest_data/cellvit_seg/INT21_cellvit_seg.geojson.zip
+hest_data/cellvit_seg/INT21_cellvit_seg.parquet
+hest_data/cellvit_seg/INT1_cellvit_seg.geojson
+hest_data/cellvit_seg/INT10_cellvit_seg.geojson
+hest_data/cellvit_seg/INT11_cellvit_seg.geojson
+hest_data/cellvit_seg/INT12_cellvit_seg.geojson
+hest_data/cellvit_seg/INT13_cellvit_seg.geojson
+hest_data/cellvit_seg/INT16_cellvit_seg.geojson
+hest_data/cellvit_seg/INT19_cellvit_seg.geojson
+hest_data/cellvit_seg/INT20_cellvit_seg.geojson
+hest_data/cellvit_seg/INT21_cellvit_seg.geojson
+hest_data/cellvit_seg/TENX175_cellvit_seg.geojson
+hest_data/cellvit_seg/TENX175_cellvit_seg.geojson.zip
+hest_data/cellvit_seg/TENX175_cellvit_seg.parquet
+hest_data/metadata/TENX175.json
+hest_data/patches/TENX175.h5
+hest_data/st/TENX175.h5ad
+hest_data/tissue_seg/TENX175_contours.geojson
+hest_data/wsis/TENX175.tif
+.idea/.gitignore
+.idea/csv-editor.xml
+.idea/deployment.xml
+.idea/jupyter-settings.xml
+.idea/misc.xml
+.idea/modules.xml
+.idea/SpatialTranscriptFormer.iml
+.idea/vcs.xml
+.idea/inspectionProfiles/profiles_settings.xml
+.idea/inspectionProfiles/Project_Default.xml
+.idea/runConfigurations/STF_Compute_Pathways.xml
+.idea/runConfigurations/STF_Train_PrimaryPathway.xml
+.gemini/settings.json
+.gemini/agents/literature-search.md
+.gemini/agents/test-triage.md
+
+# Large data artifacts and scratch outputs
+global_genes_stats.csv
+*.sqlite
+HEST_v1_3_0.csv
+global_genes.json
+test_out.txt
@@ -0,0 +1,25 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: check-yaml
+    -   id: check-added-large-files
+
+-   repo: https://github.com/psf/black
+    rev: 24.2.0
+    hooks:
+    -   id: black
+        language_version: python3
+
+-   repo: local
+    hooks:
+    -   id: pytest
+        name: pytest
+        entry: conda run -n SpatialTranscriptFormer --no-capture-output python -m pytest
+        language: system
+        pass_filenames: false
+        always_run: true
@@ -8,7 +8,7 @@
 > [!TIP]
 > **Framework Release**: SpatialTranscriptFormer has been restructured from a research codebase into a robust framework. You can now use the Python API to train on your own spatial transcriptomics data with custom backbones and architectures.
 
-**SpatialTranscriptFormer** is a modular deep learning framework designed to bridge histology and biological pathways. It leverages transformer architectures to model the interplay between morphological features and gene expression signatures, providing interpretable mapping of the tissue microenvironment.
+**SpatialTranscriptFormer** is a modular deep learning framework designed to bridge histology and biological pathways. It leverages transformer architectures to directly predict spatially-resolved **biological pathway activity scores** from H&E image patches, providing interpretable maps of the tissue microenvironment.
 
 ## Python API: Quick Start
 
@@ -28,22 +28,23 @@ predictor = Predictor(model, device="cuda")
 #    coords:        (N, 2) tensor of spatial coordinates (from your WSI tiling)
 features = extractor.extract_batch(image_patches, batch_size=64)  # → (N, 768)
 
-# 3. Predict gene expression from extracted features
-predictions = predictor.predict_wsi(features, coords)  # → (1, G)
+# 3. Predict pathway activity scores from extracted features
+predictions = predictor.predict_wsi(features, coords)  # → (1, P)
 
 # 4. Integrate with Scanpy
-inject_predictions(adata, coords, predictions[0], gene_names=model.gene_names)
+inject_predictions(adata, coords, predictions[0], pathway_names=model.pathway_names)
 ```
 
 For more details, see the **[Python API Reference](docs/API.md)**.
 
 ## Key Technical Pillars
 
-- **Modular Architecture**: Decoupled backbones, interaction modules, and output heads.
+- **Modular Architecture**: Decoupled backbones, interaction modules, and pathway output heads.
 - **Quad-Flow Interaction**: Configurable attention between Pathways and Histology patches (`p2p`, `p2h`, `h2p`, `h2h`).
-- **Pathway Bottleneck**: Interpretable gene expression prediction via 50 MSigDB Hallmark tokens.
-- **Spatial Pattern Coherence**: Optimized using a composite **MSE + PCC (Pearson Correlation) loss**.
-- **Foundation Model Ready**: Native support for **CTransPath**, **Phikon**, **Hibou**, and **GigaPath**.
+- **Pathway-Exclusive Prediction**: Directly predicts biological pathway activity scores (e.g., 50 MSigDB Hallmark pathways) — no intermediate gene reconstruction step.
+- **Offline Pathway Targets**: Ground-truth pathway activities are pre-computed offline (`stf-compute-pathways`) from raw gene expression using QC → CP10k normalisation → z-score → mean pathway aggregation. This eliminates the circular auxiliary loss used in previous versions.
+- **Spatial Pattern Coherence**: Optimised using a composite **MSE + PCC (Pearson Correlation) loss**.
+- **Foundation Model Ready**: Native support for **CTransPath**, **Phikon**, **Hibou**, **PLIP**, and **GigaPath**.
 
 ---
 
@@ -61,6 +62,7 @@ This project is protected by a **Proprietary Source Code License**. See the [LIC
 ## Intellectual Property
 
 The core architectural innovations, including the **SpatialTranscriptFormer** interaction logic and spatial masking strategies, are the unique Intellectual Property of the author. For a detailed breakdown, see the [IP Statement](docs/IP_STATEMENT.md).
+
 ---
 
 ## Installation
@@ -83,20 +85,30 @@ The `SpatialTranscriptFormer` repository includes a complete, out-of-the-box CLI
 stf-download --organ Breast --disease Cancer --tech Visium --local_dir hest_data
 ```
 
-### 2. Training with Presets
+### 2. Pre-Compute Pathway Activity Targets
+
+Before training, compute the offline pathway activity matrix for each sample. This step applies per-spot QC, CP10k normalisation, and z-scoring before aggregating gene expression into MSigDB Hallmark pathway scores.
+
+```bash
+stf-compute-pathways --data-dir hest_data
+```
+
+See the **[Pathway Mapping docs](docs/PATHWAY_MAPPING.md)** for a full description of the scoring methodology and available CLI options.
+
+### 3. Training with Presets
 
 ```bash
 # Recommended: Run the Interaction model (Small)
 python scripts/run_preset.py --preset stf_small
 ```
 
-### 3. Inference & Visualization
+### 4. Inference & Visualization
 
 ```bash
 stf-predict --data-dir A:\hest_data --sample-id MEND29 --model-path checkpoints/best_model.pth --model-type interaction
 ```
 
-Visualization plots and spatial expression maps will be saved to the `./results` directory. For the full guide, see the **[HEST Recipe Docs](src/spatial_transcript_former/recipes/hest/README.md)**.
+Visualization plots and spatial pathway activation maps will be saved to the `./results` directory. For the full guide, see the **[HEST Recipe Docs](src/spatial_transcript_former/recipes/hest/README.md)**.
 
 ## Documentation
 
@@ -109,10 +121,9 @@ Visualization plots and spatial expression maps will be saved to the `./results`
 
 ### Theory & Interpretability
 
-- **[Models & Architecture](docs/MODELS.md)**: Deep dive into the quad-flow interaction logic and network scaling.
-- **[Pathway Mapping](docs/PATHWAY_MAPPING.md)**: Clinical interpretability, pathway bottleneck design, and MSigDB integration.
-- **[Gene Analysis](docs/GENE_ANALYSIS.md)**: Modeling strategies for mapping morphology to high-dimensional gene spaces.
-- **[Data Structure](docs/DATA_STRUCTURE.md)**: Detailed breakdown of the HEST data structure on disk, metadata conventions, and preprocessing invariants.
+- **[Models & Architecture](docs/MODELS.md)**: Deep dive into the pathway-exclusive prediction architecture, quad-flow interaction logic, and network scaling.
+- **[Pathway Mapping](docs/PATHWAY_MAPPING.md)**: Offline pathway scoring methodology, QC pipeline, and MSigDB integration.
+- **[Data Structure](docs/DATA_FORMAT.md)**: Detailed breakdown of the HEST data structure on disk, metadata conventions, and preprocessing invariants.
 
 ## Development
 
@@ -123,28 +134,33 @@ Visualization plots and spatial expression maps will be saved to the `./results`
 .\test.ps1
 ```
 
+The test suite is organised into a hierarchical directory structure under `tests/`:
+
+| Directory | Coverage Area |
+| :--- | :--- |
+| `tests/data/` | Data integrity, pathway scoring, augmentation |
+| `tests/models/` | Backbone loading, interaction logic, model compilation |
+| `tests/training/` | Loss functions, trainer loop, checkpoints, config |
+| `tests/recipes/hest/` | HEST dataset loading and splitting |
+| `tests/test_api.py` | End-to-end Python API integration |
+
 ## Development Roadmap
 
 Active research and development is tracked in the **[Research & Improvement Roadmap](docs/SC_BEST_PRACTICES.md)**. Key directions are summarised below.
 
 ### Near-term
 
-- **Vocabulary quality** — mitochondrial gene filtering (`MT-*` exclusion) and a rebuild of the gene vocabulary using SVG-weighted ranking (Moran's I), ensuring training targets are spatially informative rather than dominated by housekeeping genes.
-- **Moran's I weighted loss** — weight each gene's contribution to the training loss by its spatial variability score, so that the gradient is driven by spatially coherent genes rather than high-expression noise.
-
-### Medium-term: Architectural Reframing
+- **Extended knowledge base integration** — The offline pathway scoring step currently supports MSigDB Hallmarks via GMT files. The architecture is designed to be database-agnostic; future work will add first-class support for [decoupleR](https://decoupler-py.readthedocs.io) + [PROGENy](https://saezlab.github.io/progeny/) (Saez lab) and [LIANA+](https://liana-py.readthedocs.io) ligand-receptor databases as alternative scoring backends.
+- **Visium HD & Xenium support** — Architecturally trivial; blocked only by data availability.
 
-The current model predicts ~1000 individual gene expression values as its primary task, with pathway activity as a secondary interpretability output. Based on a review of the ST literature and the [Saezlab ecosystem](https://saezlab.org) (PROGENy, decoupleR, LIANA+), we are shifting toward:
+### Medium-term
 
-- **Pathway activity as the primary prediction target.** Spatial pathway activity maps pre-computed offline via [decoupleR](https://decoupler-py.readthedocs.io) + [PROGENy](https://saezlab.github.io/progeny/) are spatially cleaner, clinically interpretable, and directly supervised — avoiding the circular regularisation issue of the current `AuxiliaryPathwayLoss`.
-- **Gene expression as a secondary imputation head**, weighted by Moran's I.
-- **Pluggable prior knowledge.** The offline preprocessing step accepts any biological network (PROGENy signalling pathways, MSigDB Hallmarks, LIANA+ ligand-receptor pairs, CollecTRI TF regulons) without changing the model architecture.
+- **Evaluation on the 2025 Nat. Comms. benchmark suite** (11 methods, 28 metrics, 5 datasets).
+- **Pluggable scoring backends** — Allow `stf-compute-pathways` to accept any biological network (CollecTRI TF regulons, custom GMT files) without changing the model architecture.
 
 ### Longer-term
 
-- Evaluation on the 2025 Nat. Comms. benchmark suite (11 methods, 28 metrics, 5 datasets).
-- Support for higher-resolution platforms (Visium HD, Xenium) — architecturally trivial, blocked only by data availability.
-- **Clinical integration** — using predicted spatial pathway activation maps as features for patient risk assessment and prognosis tracking in an end-to-end pipeline.
+- **Clinical integration** — Using predicted spatial pathway activation maps as features for patient risk assessment and prognosis tracking in an end-to-end pipeline.
 
 > [!NOTE]
 > **Call for Collaborators:** Rigorous risk assessment models require large clinical cohorts with spatial transcriptomics and survival outcomes, which we currently lack access to. We are open to investigating *any* disease of interest. If you have access to such cohorts and are interested in exploring how spatially-resolved pathway activation correlates with patient prognosis, we would love to partner with you.
 
@@ -11,9 +11,16 @@ training:
   num_genes: 1000
   batch_size: 8
   learning_rate: 0.0001
-  output_dir: "./checkpoints"
-  
+  output_dir: "./runs"
+
 # MSigDB Pathway Settings
 pathways:
   default_collection: "hallmarks"
   cache_dir: ".cache"
+
+# Quality Control Defaults
+qc:
+  min_umis: 500
+  min_genes: 200
+  max_mt: 0.15
+  min_pathways: 25