diff --git a/CHANGELOG.md b/CHANGELOG.md index c515c51308..f9b3d045ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ to [Semantic Versioning]. The full commit history is available in the [commit lo #### Added +- Add {class}`scvi.external.SENADVAE` for causal perturbation modeling in single-cell genomics {pr}`3571`. + #### Fixed - Fix checkpointing for {class}`scvi.model.TOTALVI`, {pr}`3651`. diff --git a/src/scvi/external/SENADVAE/__init__.py b/src/scvi/external/SENADVAE/__init__.py new file mode 100644 index 0000000000..774529e0ae --- /dev/null +++ b/src/scvi/external/SENADVAE/__init__.py @@ -0,0 +1,5 @@ +from ._dataloader import ControlReshuffleCallback, SENADataLoader +from ._model import SENADVAE +from ._module import SENAModule + +__all__ = ["SENADVAE", "SENAModule", "SENADataLoader", "ControlReshuffleCallback"] diff --git a/src/scvi/external/SENADVAE/_dataloader.py b/src/scvi/external/SENADVAE/_dataloader.py new file mode 100644 index 0000000000..6db6d7fe62 --- /dev/null +++ b/src/scvi/external/SENADVAE/_dataloader.py @@ -0,0 +1,1011 @@ +""" +Specialized data loading and processing components for the SENADVAE + +Implementing dynamic control-perturbation matching and efficient batch processing for +single-cell perturbation analysis. +""" + +import logging +from collections.abc import Sequence + +import numpy as np +import pandas as pd +import torch +from lightning.pytorch.callbacks import Callback + +from scvi import REGISTRY_KEYS +from scvi.data import AnnDataManager +from scvi.dataloaders import AnnDataLoader, DataSplitter + +logger = logging.getLogger(__name__) + + +class SENADataLoader(AnnDataLoader): + """ + Custom dataloader for SENADVAE with dynamic preprocessing during training. + + This dataloader processes single-cell gene expression data with perturbations, handling + the complex task of matching control cells to perturbed cells and converting perturbation + annotations into a format suitable for neural network training. It inherits from AnnDataLoader + to maintain compatibility with the scvi-tools framework. + + Parameters + ---------- + adata_manager : AnnDataManager + AnnDataManager instance containing the original adata. + perturbation_key : str + Key in adata.obs containing perturbation information. + shuffle : bool, default True + Whether to shuffle the data. + batch_size : int, default 128 + Batch size for training. + mode : str, default "training" + Mode for dataloader: "training" or "prediction". + indices : Sequence[int], optional + Specific indices for prediction mode. + prediction_indices : Sequence[int], optional + Alternative parameter name for indices (used by SENADataSplitter). + **kwargs + Additional arguments passed to DataLoader. + + Attributes + ---------- + adata_manager : AnnDataManager + AnnDataManager instance containing the original adata. + adata : AnnData + The AnnData object from adata_manager. + perturbation_key : str + Field name where the perturbation gene names are stored. + mode : str + Mode for dataloader: "training" or "prediction". + prediction_indices : Sequence[int] or None + Specific indices for prediction mode. + matched_ctrl_X : array_like + Gene expression data from matched control cells. + perturbed_X : array_like + Gene expression data from perturbed cells. + perturbation_strings : array_like + Perturbation annotation strings. + intervention_matrix : np.ndarray + Numerical intervention matrix encoding perturbation states. + intervention_genes : list + List of all intervention genes. 
+ gene_to_intervention_idx : dict + Mapping from gene names to indices. + n_intervention_genes : int + Number of unique intervention genes. + + Notes + ----- + Key Features: + 1. Control-Perturbation Pair Management: + - Identifies and separates control cells from perturbed cells + - Creates matched pairs by randomly sampling controls for each perturbation + - Implements dynamic control reassignment between epochs to improve generalization + + 2. Perturbation Processing: + - Converts gene perturbation annotations into numerical matrices + - Supports both single-gene and multi-gene perturbations + - Creates efficient sparse representation of intervention states + + 3. Data Format: + Processes and returns data in a standardized format for training: + - X: Gene expression data from matched control cells + - labels: Gene expression data from perturbed cells + - cat_covs: Intervention matrix encoding perturbation states: + * -1: Control state + * 0: Non-perturbed gene + * 1: Perturbed gene + + The dataloader implements efficient batch processing and ensures consistent + data preprocessing across training, validation, and inference phases. + """ + + def __init__( + self, + adata_manager: AnnDataManager, + perturbation_key: str, + shuffle: bool = True, + batch_size: int = 128, + mode: str = "training", # "training" or "prediction" + indices: Sequence[int] | None = None, # For prediction mode + prediction_indices: Sequence[int] | None = None, # Alternative name for indices + **kwargs, + ): + """ + Initialize SENA dataloader with dynamic preprocessing capabilities. + + Sets up the complete data processing pipeline for single-cell perturbation analysis, + including control-perturbation matching, intervention matrix creation, and batch + processing compatible with scvi-tools framework. + + Parameters + ---------- + adata_manager : AnnDataManager + AnnDataManager instance containing the original adata. + perturbation_key : str + Key in adata.obs containing perturbation information. + shuffle : bool, default True + Whether to shuffle the data. + batch_size : int, default 128 + Batch size for training. + mode : str, default "training" + Mode for dataloader: "training" or "prediction". + indices : Sequence[int], optional + Specific indices for prediction mode. + prediction_indices : Sequence[int], optional + Alternative parameter name for indices (used by SENADataSplitter). + **kwargs + Additional arguments passed to DataLoader. 
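+
+        Examples
+        --------
+        A minimal usage sketch (the perturbation key and batch size are illustrative):
+
+        >>> dl = SENADataLoader(adata_manager, perturbation_key="perturbation", batch_size=64)
+        >>> batch = next(iter(dl))
+        >>> # batch maps REGISTRY_KEYS.X_KEY to matched control expression,
+        >>> # LABELS_KEY to perturbed expression, and CAT_COVS_KEY to the
+        >>> # intervention matrix described above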
+ """ + self.adata_manager = adata_manager + self.adata = adata_manager.adata + self.perturbation_key = ( + perturbation_key # field name where the perturbation gene names are stored + ) + self.mode = mode # "training" or "prediction" + + # Handle both parameter names for indices + self.prediction_indices = prediction_indices if prediction_indices is not None else indices + + # Store these parameters explicitly since we need them later + self._shuffle = shuffle + self._batch_size = batch_size + + # Preprocess the data to create the matched pairs (same logic as setup_anndata) + self._create_matched_dataset() + + # Create indices for the processed dataset + self.dataset_indices = np.arange(self.matched_ctrl_X.shape[0]) + + # Filter out custom parameters before passing to parent + parent_kwargs = { + k: v for k, v in kwargs.items() if k not in ["prediction_indices", "mode"] + } + + # Initialize parent with our custom dataset + super().__init__( + adata_manager=adata_manager, + indices=self.dataset_indices, + batch_size=batch_size, + shuffle=shuffle, + **parent_kwargs, + ) + + # Store these parameters for our custom iterator (don't override PyTorch attributes) + self._custom_shuffle = shuffle + self._custom_batch_size = batch_size + + def _create_matched_dataset(self): + """ + Create matched control-perturbation pairs for training or prediction. + + This method processes the AnnData object to: + 1. Identify control and perturbed cells + 2. Filter based on mode: + - Training mode: Only single perturbations + - Prediction mode: Use specified indices (can include double perturbations) + 3. Create matched pairs by random sampling from control cells + 4. Prepare the data structure for batch processing + + Raises + ------ + ValueError + If no perturbed cells or control cells are found in AnnData object. + ValueError + If no single perturbation cells found in training mode. + ValueError + If unknown mode is specified. + + Notes + ----- + The method implements different filtering strategies based on mode: + - Training mode: Restricts to single gene perturbations only for stable learning + - Prediction mode: Allows any perturbation complexity including combinations + """ + logger.info( + f"Creating matched control-perturbation dataset in dataloader (mode: {self.mode})." + ) + + # Separate control and perturbed cells + is_control = self.adata.obs[self.perturbation_key] == "" + ctrl_indices = np.where(is_control)[0] + ptb_indices = np.where(~is_control)[0] + + if len(ptb_indices) == 0: + raise ValueError("No perturbed cells found in AnnData object.") + if len(ctrl_indices) == 0: + raise ValueError("No control cells found in AnnData object.") + + logger.info( + f"Found {len(ctrl_indices)} control cells and {len(ptb_indices)} perturbed cells." + ) + + # Filter based on mode + if self.mode == "training": + # Training mode: Filter to only include single perturbations + single_perturbation_mask = [] + for idx in ptb_indices: + pert_str = self.adata.obs[self.perturbation_key].iloc[idx] + # Single perturbation if it doesn't contain a comma + is_single = pert_str and pert_str != "" and "," not in pert_str + single_perturbation_mask.append(is_single) + + single_perturbation_mask = np.array(single_perturbation_mask) + final_ptb_indices = ptb_indices[single_perturbation_mask] + + if len(final_ptb_indices) == 0: + raise ValueError("No single perturbation cells found in AnnData object.") + + logger.info( + f"Training mode: Filtered to {len(final_ptb_indices)} single perturbation cells." 
+ ) + + elif self.mode == "prediction": + # Prediction mode: Use specified indices (can include double perturbations) + if self.prediction_indices is not None: + # If indices are provided, they should already be filtered appropriately + # by the caller (e.g., SENADataSplitter), so trust them + final_ptb_indices = np.array(self.prediction_indices) + + # Verify these are actually perturbation indices + valid_ptb_indices = np.intersect1d(final_ptb_indices, ptb_indices) + if len(valid_ptb_indices) != len(final_ptb_indices): + if valid_ptb_indices.size > 0: + logger.warning( + f"Some provided indices are not perturbation cells. " + f"Using {len(valid_ptb_indices)}/{len(final_ptb_indices)}" + f"valid indices." + ) + final_ptb_indices = valid_ptb_indices + + if len(final_ptb_indices) == 0: + logger.warning("No pertubred cells in indices") + + # Count single vs double perturbations for logging + single_count = 0 + double_count = 0 + for idx in final_ptb_indices: + pert_str = self.adata.obs[self.perturbation_key].iloc[idx] + if pert_str and pert_str != "": + if "," in pert_str: + double_count += 1 + else: + single_count += 1 + + logger.info( + f"Prediction mode: Using {len(final_ptb_indices)} pre-filtered cells" + f"({single_count} single, {double_count} double perturbations)." + ) + else: + # No specific indices provided, use all perturbed cells + final_ptb_indices = ptb_indices + logger.info( + f"Prediction mode: Using all {len(final_ptb_indices)} perturbed cells." + ) + else: + raise ValueError(f"Unknown mode: {self.mode}. Must be 'training' or 'prediction'.") + + self.actual_training_indices = final_ptb_indices + + # Create matched pairs by sampling controls for the selected perturbations + n_ptb = len(final_ptb_indices) + random_ctrl_indices = np.random.choice(ctrl_indices, n_ptb, replace=True) + + # Store the matched data gene expression and the gene names of the perturbations + self.matched_ctrl_X = self.adata.X[random_ctrl_indices] # Control expression + self.perturbed_X = self.adata.X[final_ptb_indices] # Perturbed expression + self.perturbation_strings = ( + self.adata.obs[self.perturbation_key].iloc[final_ptb_indices].values + ) + + # Process perturbation strings to numerical indices, considering all + # genes (single + double) + self._process_perturbation_strings() + + def _process_perturbation_strings(self): + """ + Convert perturbation annotations into numerical intervention matrices. + + This method: + 1. Extracts unique perturbed genes from ALL perturbed annotation strings + (single and double) for vocabulary + 2. Creates a mapping between genes and numerical indices + 3. Constructs intervention matrices representing perturbation states for training data + (single only) + 4. Handles single gene perturbations only in the final matrix + + Notes + ----- + The intervention matrix format: + - Shape: (n_single_perturbations, n_intervention_genes) + - Values: -1 for controls, 0 for non-perturbed genes, 1 for perturbed genes + + Vocabulary Construction: + The method builds a complete vocabulary from ALL perturbations (single and double) + to ensure compatibility during inference, even though training uses only single + perturbations. 
+ """ + logger.info("Processing perturbation strings to numerical format.") + + # Extract all unique genes from ALL perturbation strings (including double perturbations) + # This ensures the vocabulary includes genes from double perturbations even though + # we only train on single perturbations + all_perturbed_genes = set() + + # Get all perturbation strings from the entire dataset (not just training subset) + all_perturbation_strings = self.adata.obs[self.perturbation_key].values + + for pert_str in all_perturbation_strings: + if pert_str and pert_str != "": + genes = [g.strip() for g in pert_str.split(",")] + all_perturbed_genes.update(genes) + + # Create gene-to-index mapping using the complete vocabulary(Ex: {ATF4:[15]}) + sorted_genes = sorted(all_perturbed_genes) + gene_to_idx = {gene: idx for idx, gene in enumerate(sorted_genes)} + n_intervention_genes = len(sorted_genes) + + # this is improtant as it shapes the size of one of the encoder´s first layer. + logger.info( + f"Found {n_intervention_genes} unique intervention genes from all perturbations:" + f"{sorted_genes[:10]}{'...' if n_intervention_genes > 10 else ''}" + ) + + # Count single vs double perturbations for logging + single_count = sum(1 for s in all_perturbation_strings if s and s != "" and "," not in s) + double_count = sum(1 for s in all_perturbation_strings if s and s != "" and "," in s) + logger.info( + f"Total perturbations in dataset: {single_count} single, {double_count} double" + ) + if self.mode == "training": + logger.info( + f"Training will use only {len(self.perturbation_strings)} single perturbations" + ) + + # Create numerical intervention matrix for training data (single perturbations only) + # Shape: (n_single_perturbations, n_intervention_genes) + # Values: -1 for controls, 0 for non-perturbed genes, 1 for perturbed genes + intervention_matrix = np.full( + (len(self.perturbation_strings), n_intervention_genes), -1, dtype=np.int32 + ) + + # Process only the single perturbations that will be used for training + for cell_idx, pert_str in enumerate(self.perturbation_strings): + if pert_str and pert_str != "": # Should all be single perturbations + # Verify it's actually a single perturbation + if "," in pert_str and self.mode == "training": + logger.warning( + f"Double perturbation '{pert_str}' found" + f"in training data - this should not happen" + ) + # continue; + + # Set all genes to 0 first (non-perturbed) + intervention_matrix[cell_idx, :] = 0 + + # Set perturbed gene to 1 + gene = pert_str.strip() + if "," in gene: + gene = gene.split(",") + for g in gene: + if g in gene_to_idx: + intervention_matrix[cell_idx, gene_to_idx[g]] = 1 + else: + logger.warning(f"Gene '{g}' in perturbation not found in gene mapping") + else: + if gene in gene_to_idx: + intervention_matrix[cell_idx, gene_to_idx[gene]] = 1 + else: + logger.warning(f"Gene '{gene}' in perturbation not found in gene mapping") + + # Store the processed data + self.intervention_matrix = intervention_matrix + self.intervention_genes = sorted_genes + self.gene_to_intervention_idx = gene_to_idx + self.n_intervention_genes = n_intervention_genes + + def reshuffle_controls(self): + """ + Reshuffle control assignments for the next epoch, respecting current mode. + + Re-creates the matched pairs with new random sampling to enhance model + generalization by preventing memorization of specific control-perturbation + relationships. 
+ + Notes + ----- + This method applies the same filtering logic as in `_create_matched_dataset` + to maintain consistency between initial setup and reshuffling. The method + prevents overfitting by ensuring different control-perturbation pairings + across epochs while maintaining the same perturbation distribution. + """ + # Re-create the matched pairs with new random sampling + is_control = self.adata.obs[self.perturbation_key] == "" + ctrl_indices = np.where(is_control)[0] + ptb_indices = np.where(~is_control)[0] + + # Apply same filtering logic as in _create_matched_dataset. Both methods correspond + # to the same class so the self atributes are the same.We could use the control indices + # created during the _create_matched_dataset, without needed to reshufle. + if self.mode == "training": + # Filter to only include single perturbations + single_perturbation_mask = [] + for idx in ptb_indices: + pert_str = self.adata.obs[self.perturbation_key].iloc[idx] + is_single = pert_str and pert_str != "" and "," not in pert_str + single_perturbation_mask.append(is_single) + + single_perturbation_mask = np.array(single_perturbation_mask) + final_ptb_indices = ptb_indices[single_perturbation_mask] + + elif self.mode == "prediction": + if self.prediction_indices is not None: + final_ptb_indices = np.intersect1d(ptb_indices, self.prediction_indices) + else: + final_ptb_indices = ptb_indices + + n_ptb = len(final_ptb_indices) + random_ctrl_indices = np.random.choice(ctrl_indices, n_ptb, replace=True) + self.matched_ctrl_X = self.adata.X[random_ctrl_indices] + + def __iter__(self): + """ + Make the dataloader iterable for batch processing. + + When you write 'for batch in dataloader:' in your training code, + Python calls this __iter__ method to provide batches of processed data. + + Yields + ------ + dict of str to torch.Tensor + Dictionary containing: + - X_KEY : torch.Tensor + Control expression data, shape (batch_size, n_genes). + - LABELS_KEY : torch.Tensor + Perturbed expression data, shape (batch_size, n_genes). + - CAT_COVS_KEY : torch.Tensor + Intervention matrix, shape (batch_size, n_intervention_genes). 
+ + Notes + ----- + This method has two phases: + + PHASE 1 - SETUP (runs once per epoch): + - Gets the total number of samples (e.g., 52,047 training pairs) + - Creates a list of indices [0, 1, 2, ..., 52046] + - Shuffles these indices randomly for this epoch + - Sets the batch size (e.g., 32 samples per batch) + + PHASE 2 - BATCH CREATION (runs multiple times per epoch): + - The 'for' loop below divides data into batches of 32 samples each + - Each time your training code asks for the next batch, this loop runs ONE iteration + - The 'yield' statement pauses here and returns the current batch + - When the next batch is requested, the loop resumes from where it left off + - This continues until all ~1627 batches (52047÷32) are processed + """ + # Use shape[0] for sparse matrix compatibility + n_samples = self.matched_ctrl_X.shape[0] + indices = np.arange(n_samples) + + # Use our stored shuffle parameter + if getattr(self, "_custom_shuffle", self._shuffle): + np.random.shuffle(indices) + + # Use our stored batch_size + # Get the batch size + batch_size = getattr(self, "_custom_batch_size", self._batch_size) + if batch_size is None: + batch_size = self._batch_size + + batch_count = 0 + + # Once we have the matched control and perturbations we need to divide them in batches + for i in range(0, n_samples, batch_size): + batch_indices = indices[i : i + batch_size] + + # Get batch data and obtain the control and pertubed expresion and + # the intervention matrix + batch_ctrl_X = self.matched_ctrl_X[batch_indices] + batch_perturbed_X = self.perturbed_X[batch_indices] + batch_intervention_matrix = self.intervention_matrix[batch_indices] + + # We create a distioanry of three tensors + tensors = { + REGISTRY_KEYS.X_KEY: torch.from_numpy( + batch_ctrl_X.toarray() if hasattr(batch_ctrl_X, "toarray") else batch_ctrl_X + ).float(), + REGISTRY_KEYS.LABELS_KEY: torch.from_numpy( + batch_perturbed_X.toarray() + if hasattr(batch_perturbed_X, "toarray") + else batch_perturbed_X + ).float(), + REGISTRY_KEYS.CAT_COVS_KEY: torch.from_numpy(batch_intervention_matrix).long(), + } + + # Debug output for first batch + if batch_count == 0: + batch_count += 1 + # This freeses the for loop an returns the current batch until the generator + # funciton its called again. + yield tensors + + def __len__(self): + """ + Return number of batches in the dataloader. + + This is a special method that Python calls when we do len(dataloader). + It calculates the total number of batches based on the number of matched + control-perturbation pairs and the batch size. + + Returns + ------- + int + Number of batches in the dataloader. + + Notes + ----- + The calculation uses ceiling division to ensure all samples are included: + (n_samples + batch_size - 1) // batch_size + This handles cases where n_samples is not evenly divisible by batch_size. + """ + n_samples = self.matched_ctrl_X.shape[0] + batch_size = getattr(self, "_custom_batch_size", self._batch_size) + if batch_size is None: + batch_size = self._batch_size + return (n_samples + batch_size - 1) // batch_size + + def get_intervention_vocabulary(self): + """ + Return the complete intervention vocabulary including genes from double perturbations. + + Provides access to the complete gene vocabulary used for intervention encoding, + including genes that appear only in combination perturbations. This is essential + for inference on novel perturbation combinations. + + Returns + ------- + dict + Dictionary containing: + - 'genes' : list + List of all intervention genes. 
+ - 'gene_to_idx' : dict + Mapping from gene names to indices. + - 'n_genes' : int + Number of unique intervention genes. + + Notes + ----- + The vocabulary includes all genes from both single and double perturbations + to ensure compatibility during inference, even though training may use only + single perturbations. + """ + return { + "genes": self.intervention_genes, + "gene_to_idx": self.gene_to_intervention_idx, + "n_genes": self.n_intervention_genes, + } + + def create_intervention_vector(self, perturbation_string): + """ + Create an intervention vector for any perturbation string (single or double). + + This method allows creating intervention vectors for double perturbations + at inference time, even though the model was trained only on single perturbations. + It enables prediction of combination effects by encoding multiple simultaneous + gene perturbations. + + Parameters + ---------- + perturbation_string : str + Perturbation string (e.g., "GENE1" or "GENE1,GENE2"). + + Returns + ------- + np.ndarray + Intervention vector of shape (n_intervention_genes,). + Values: 0 for non-perturbed genes, 1 for perturbed genes. + + Notes + ----- + The method handles various perturbation formats: + - Empty string or "": Returns zero vector (control condition) + - Single gene: "GENE1" -> one-hot vector with 1 at GENE1 position + - Multiple genes: "GENE1,GENE2" -> multi-hot vector with 1s at both positions + """ + if not perturbation_string or perturbation_string == "": + # Control case - return all zeros + return np.zeros(self.n_intervention_genes, dtype=np.int32) + + # Initialize intervention vector + intervention_vector = np.zeros(self.n_intervention_genes, dtype=np.int32) + + # Parse perturbation string + genes = [g.strip() for g in perturbation_string.split(",")] + + # Set perturbed genes to 1 + for gene in genes: + if gene in self.gene_to_intervention_idx: + intervention_vector[self.gene_to_intervention_idx[gene]] = 1 + else: + logger.warning(f"Gene '{gene}' not found in intervention vocabulary") + + return intervention_vector + + +class ControlReshuffleCallback(Callback): + """ + A PyTorch Lightning callback that performs dynamic cell reassignment during training. + + This callback enhances model generalization by randomizing the control-perturbation + pairs at the start of each training epoch. It prevents the model from memorizing + specific control-perturbation relationships and ensures learning of general + perturbation effects. + + Notes + ----- + Key Functions: + - Prevents overfitting by randomizing control cell assignments + - Enhances learning of generalizable perturbation effects + - Reduces bias from specific control-perturbation pairings + - Seamlessly integrates with PyTorch Lightning's training loop + + Implementation: + The callback operates by calling the reshuffle_controls() method of the + dataloader at the start of each epoch, ensuring random reassignment of + control cells while maintaining the overall data structure. + + Scientific Rationale: + In single-cell perturbation experiments, the choice of control cells paired + with each perturbed cell can introduce systematic biases. By reshuffling + these pairings each epoch, the model learns perturbation effects that + generalize across different cellular contexts. + """ + + def __init__(self): + """ + Initialize control reshuffle callback for SENA training workflows. 
+
+        Sets up the callback infrastructure for dynamic cell reassignment during
+        training epochs, enabling enhanced generalization through randomized
+        control-perturbation pairings.
+        """
+        super().__init__()
+
+    def on_train_epoch_start(self, trainer, pl_module):
+        """
+        Reshuffle control assignments in the SENA dataloaders.
+
+        Called at the start of each epoch to perform dynamic control cell
+        reassignment for enhanced generalization. Accesses the training
+        dataloader and calls its reshuffle_controls method if available.
+
+        Parameters
+        ----------
+        trainer : pytorch_lightning.Trainer
+            PyTorch Lightning trainer managing the training process.
+        pl_module : pytorch_lightning.LightningModule
+            SENA model being trained.
+
+        Notes
+        -----
+        This method attempts to access the train_dataloader through the trainer
+        and calls its reshuffle_controls method if available. It also logs the
+        current epoch for monitoring purposes. The reshuffling ensures that
+        control-perturbation pairings vary across epochs to prevent overfitting.
+        """
+        # reshuffle_controls is a custom method on SENADataLoader, so duck-type for it
+        # rather than assuming the type of the trainer's dataloader.
+        if hasattr(trainer, "train_dataloader") and trainer.train_dataloader is not None:
+            train_dl = trainer.train_dataloader
+            if hasattr(train_dl, "reshuffle_controls"):
+                train_dl.reshuffle_controls()
+                if hasattr(pl_module, "log"):
+                    pl_module.log(
+                        "control_reshuffle_epoch", float(trainer.current_epoch), on_epoch=True
+                    )
+
+
+def create_sena_dataloader(
+    adata_manager: AnnDataManager,
+    perturbation_key: str,
+    indices: Sequence[int] | None = None,
+    batch_size: int = 128,
+    shuffle: bool = True,
+    mode: str = "training",  # "training" or "prediction"
+    **kwargs,
+) -> SENADataLoader:
+    """
+    Factory function to create a SENA dataloader with specified configuration.
+
+    Provides a convenient interface for creating SENADataLoader instances with
+    standardized parameters and proper error handling. This factory function
+    ensures consistent dataloader configuration across different parts of the
+    SENA framework.
+
+    Parameters
+    ----------
+    adata_manager : AnnDataManager
+        AnnDataManager instance containing processed single-cell data.
+    perturbation_key : str
+        Key in adata.obs containing perturbation information.
+    indices : Sequence[int], optional
+        Indices to use (for prediction mode).
+    batch_size : int, default 128
+        Batch size for training.
+    shuffle : bool, default True
+        Whether to shuffle the data.
+    mode : str, default "training"
+        Mode for dataloader: "training" (single perturbations only) or
+        "prediction" (all specified indices).
+    **kwargs
+        Additional arguments passed to SENADataLoader.
+
+    Returns
+    -------
+    SENADataLoader
+        Configured SENADataLoader instance ready for training or inference.
+
+    Notes
+    -----
+    This factory function standardizes the creation of SENA dataloaders and
+    provides a single point for configuration validation and error handling.
+    It ensures consistent behavior across different usage contexts.
+    """
+    return SENADataLoader(
+        adata_manager=adata_manager,
+        perturbation_key=perturbation_key,
+        batch_size=batch_size,
+        shuffle=shuffle,
+        mode=mode,
+        indices=indices,
+        **kwargs,
+    )
+
+
+class SENADataSplitter(DataSplitter):
+    """
+    Custom data splitter for SENA perturbation analysis with stratified splitting.
+ + This splitter ensures proper train/validation separation for perturbation experiments + while maintaining scientific validity and preventing data leakage. It implements + stratified sampling to preserve perturbation type distributions across splits + and handles the unique requirements of control-perturbation experimental designs. + + Parameters + ---------- + adata_manager : AnnDataManager + AnnDataManager instance containing single-cell perturbation data. + perturbation_key : str + Key in adata.obs containing perturbation information. + train_size : float, default 0.9 + Size of training set (default 0.9 = 90%). + validation_size : float, optional + Size of validation set (auto-calculated if None). + shuffle_set_split : bool, default True + Whether to shuffle when splitting. + batch_size : int, default 128 + Batch size for dataloaders. + **kwargs + Additional arguments passed to DataSplitter. + + Attributes + ---------- + perturbation_key : str + Key in adata.obs containing perturbation information. + train_ptb_idx : np.ndarray + Indices of training perturbation cells. + val_ptb_idx : np.ndarray + Indices of validation perturbation cells. + + Notes + ----- + Key Features: + 1. Perturbation-Only Splitting: + - Only single perturbation cells are split between train/validation + - Uses stratified sampling to maintain perturbation type distribution + - Controls are available to both train and validation dataloaders + + 2. scvi-tools Integration: + - Seamlessly integrates with UnsupervisedTrainingMixin + - Follows standard scvi-tools validation patterns + - No separate validation function needed - automatic during training + + 3. Scientific Validity: + - Prevents overfitting by using separate perturbation cells for validation + - Maintains proper evaluation methodology + - Control sharing is acceptable since controls represent baseline state + + Experimental Design Rationale: + In perturbation experiments, the key is to evaluate the model's ability to + predict unseen perturbation effects. Therefore, validation must use different + perturbed cells than training, while control cells can be shared since they + represent the common baseline state. + """ + + def __init__( + self, + adata_manager: AnnDataManager, + perturbation_key: str, + train_size: float = 0.9, + validation_size: float | None = None, + shuffle_set_split: bool = True, + batch_size: int = 128, + **kwargs, + ): + """ + Initialize SENA data splitter with stratified perturbation splitting. + + Sets up the data splitting infrastructure for SENA training, ensuring + proper separation of perturbation cells while allowing control cell + sharing between training and validation sets. + + Parameters + ---------- + adata_manager : AnnDataManager + AnnDataManager instance containing single-cell perturbation data. + perturbation_key : str + Key in adata.obs containing perturbation information. + train_size : float, default 0.9 + Size of training set (default 0.9 = 90%). + validation_size : float, optional + Size of validation set (auto-calculated if None). + shuffle_set_split : bool, default True + Whether to shuffle when splitting. + batch_size : int, default 128 + Batch size for dataloaders. + **kwargs + Additional arguments passed to DataSplitter. 
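+
+        Examples
+        --------
+        A hypothetical standalone use; during `train()` the splitter is created for you:
+
+        >>> splitter = SENADataSplitter(adata_manager, perturbation_key="perturbation")
+        >>> splitter.setup()
+        >>> train_dl = splitter.train_dataloader()
+        >>> val_dl = splitter.val_dataloader()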
+ """ + self.perturbation_key = perturbation_key + self._batch_size = batch_size + + # Initialize parent DataSplitter + super().__init__( + adata_manager=adata_manager, + train_size=train_size, + validation_size=validation_size, + shuffle_set_split=shuffle_set_split, + batch_size=batch_size, + **kwargs, + ) + + def setup(self, stage: str = None): + """ + Setup data splits for SENA with proper perturbation-only splitting. + + Performs stratified splitting of single perturbation cells between training + and validation sets while ensuring control cells remain available to both. + This approach maintains scientific validity by testing on unseen perturbation + effects while allowing baseline comparison. + + Parameters + ---------- + stage : str, optional + Training stage (unused but required for compatibility). + + Raises + ------ + ValueError + If no single perturbation cells found for training. + + Notes + ----- + Splitting Strategy: + 1. Identifies single perturbation cells (excludes controls and double perturbations) + 2. Performs stratified split to maintain perturbation type proportions + 3. Assigns training and validation indices for perturbation cells only + 4. Control cells remain accessible to both training and validation dataloaders + + The stratification ensures that rare perturbation types are represented + in both training and validation sets proportionally. + """ + adata = self.adata_manager.adata + + # Get perturbation information + perturbations = adata.obs[self.perturbation_key].values + + # Identify single perturbation cells (training data) + is_control = perturbations == "" + is_single_perturbation = (~is_control) & (~pd.Series(perturbations).str.contains(",")) + + # Get indices for single perturbation cells only + single_perturbation_indices = np.where(is_single_perturbation)[0] + + if len(single_perturbation_indices) == 0: + raise ValueError("No single perturbation cells found for training") + + # Perform stratified (same proportion of each perturbation on train and validation) + # split on single perturbation cells only + single_perturbations = perturbations[single_perturbation_indices] + + from sklearn.model_selection import train_test_split + + train_ptb_idx, val_ptb_idx = train_test_split( + single_perturbation_indices, + test_size=1 - self.train_size, + stratify=single_perturbations, + random_state=42, + shuffle=self.shuffle_set_split, + ) + + # Store the perturbation indices for train/validation + self.train_ptb_idx = train_ptb_idx + self.val_ptb_idx = val_ptb_idx + + # For compatibility with parent class, set train_idx and val_idx + self.train_idx = train_ptb_idx + self.val_idx = val_ptb_idx + self.test_idx = np.array([]) # No test set for SENA + + logger.info( + f"SENA data split: {len(train_ptb_idx)} train perturbations, {len(val_ptb_idx)}" + f"validation perturbations" + ) + logger.info("Control cells will be shared between train/validation sets") + + # Log perturbation distribution + train_ptb_types = np.unique(perturbations[train_ptb_idx], return_counts=True) + val_ptb_types = np.unique(perturbations[val_ptb_idx], return_counts=True) + logger.info(f"Train perturbation types: {len(train_ptb_types[0])}") + logger.info(f"Val perturbation types: {len(val_ptb_types[0])}") + + def train_dataloader(self): + """ + Create training dataloader with only training perturbation indices. + + Generates a SENADataLoader configured for training mode with the subset + of perturbation cells assigned to the training split. 
Control cells are + accessible through the dataloader's dynamic matching mechanism. + + Returns + ------- + SENADataLoader + Training dataloader configured for single perturbations only. + + Notes + ----- + The dataloader uses training mode which restricts processing to single + perturbations and enables shuffling for proper stochastic gradient descent. + Control cells are dynamically matched to perturbation cells during iteration. + """ + return SENADataLoader( + adata_manager=self.adata_manager, + perturbation_key=self.perturbation_key, + batch_size=self._batch_size, + shuffle=True, # Always shuffle for training + mode="training", # Training mode - only single perturbations + prediction_indices=self.train_ptb_idx, # Use only training perturbation indices + ) + + def val_dataloader(self): + """ + Create validation dataloader with only validation perturbation indices. + + Generates a SENADataLoader configured for validation with the subset + of perturbation cells assigned to the validation split. Uses the same + control cells as training but with different perturbation targets. + + Returns + ------- + SENADataLoader + Validation dataloader configured for single perturbations only. + + Notes + ----- + The validation dataloader uses training mode (single perturbations only) + but with shuffle=False for consistent validation metrics. Control cells + are shared with training, which is scientifically appropriate since + controls represent the baseline cellular state. + """ + # Check val_ptb_idx instead of val_idx since that's what we actually use for validation + return SENADataLoader( + adata_manager=self.adata_manager, + perturbation_key=self.perturbation_key, + batch_size=self._batch_size, + shuffle=False, # No shuffling for validation + mode="training", # Training mode - only single perturbations + prediction_indices=self.val_ptb_idx, # Use only validation perturbation indices + ) diff --git a/src/scvi/external/SENADVAE/_model.py b/src/scvi/external/SENADVAE/_model.py new file mode 100644 index 0000000000..3bb6c6d9c2 --- /dev/null +++ b/src/scvi/external/SENADVAE/_model.py @@ -0,0 +1,1525 @@ +import logging +import os +import random +from collections import defaultdict +from collections.abc import Sequence + +import numpy as np +import pandas as pd +import torch +import torch.nn.functional as F +from anndata import AnnData +from scipy import stats +from statsmodels.stats.multitest import multipletests + +from scvi import REGISTRY_KEYS +from scvi.data import AnnDataManager +from scvi.data.fields import LayerField +from scvi.model.base import BaseModelClass, UnsupervisedTrainingMixin +from scvi.train import TrainingPlan, TrainRunner +from scvi.utils import setup_anndata_dsp + +from ._dataloader import ControlReshuffleCallback, SENADataLoader, SENADataSplitter +from ._module import SENAModule +from ._training_plan import ( + LossWeightScheduler, + SENABatchProgressBar, + SENATrainingPlan, + TemperatureScheduler, +) + +logger = logging.getLogger(__name__) + + +class SENADVAE(UnsupervisedTrainingMixin, BaseModelClass): + """ + Structural Equation Network Analysis Variational AutoEncoder for causal perturbation modeling. + + SENADVAE implements a novel approach to understanding causal relationships in single-cell + perturbation experiments by combining Variational AutoEncoders with biological pathway + constraints and causal graph learning. This model is specifically designed for analyzing + CRISPR screens, drug perturbations, and other intervention studies in single-cell genomics. 
+
+    Parameters
+    ----------
+    adata : AnnData
+        Annotated data object containing single-cell expression data with perturbation
+        annotations. Must be pre-registered via setup_anndata with control/perturbation labels.
+    go_file_path : str
+        Path to Gene Ontology annotation file containing pathway definitions for biological
+        constraints.
+        Typically tab-separated: pathway_id<TAB>gene_name for GO term memberships.
+    go_gene_map_path : str
+        Path to gene-to-GO mapping file enabling pathway constraint enforcement
+        in NetworkActivity layers.
+        Format: gene_name<TAB>go_term_id for comprehensive gene-pathway associations.
+    gene_symb_ensemble_path : str
+        Path to a tab-separated gene symbol to Ensembl ID mapping file.
+    n_latent : int, default 105
+        Dimensionality of VAE latent space representing pathway activity levels.
+        Should balance expressiveness with interpretability. Note that initialization
+        currently overrides this value with the number of intervention genes extracted
+        during setup_anndata.
+    sena_lambda : float, default 0.0
+        L1 regularization strength for causal graph sparsity, promoting interpretable
+        pathway interactions.
+        Higher values encourage sparser causal structures (0.01-1.0 typical range).
+    n_hidden_encoder : int, default 128
+        Hidden layer width in pathway-constrained encoder networks.
+        Should accommodate pathway complexity while maintaining computational efficiency.
+    n_hidden_decoder : int, default 128
+        Hidden layer width in intervention-aware decoder networks.
+        Larger values may improve reconstruction quality for complex perturbation effects.
+    n_hidden_interv : int, default 128
+        Hidden layer width in intervention network responsible for modeling perturbation-specific
+        effects.
+        Controls capacity for learning complex intervention-response relationships.
+    mode : str, default "sena"
+        Training mode selection: "sena" enables pathway constraints and causal learning,
+        "normal" uses standard VAE training without biological constraints.
+
+    Attributes
+    ----------
+    perturbation_key : str
+        Key in adata.obs containing perturbation information.
+    rel : dict
+        Gene-to-pathway relationship mapping dictionary.
+    go_map : pd.DataFrame
+        Gene-to-GO mapping dataframe.
+    go_dict : dict
+        GO term to index mapping.
+    gen_dict : dict
+        Gene to index mapping.
+    mapping_dict : dict
+        Gene symbol to Ensembl ID mapping.
+    module : SENAModule
+        The underlying neural network module.
+
+    Notes
+    -----
+    Scientific Innovation:
+    - **Biological Pathway Integration**: Enforces Gene Ontology constraints via
+      NetworkActivity layers
+    - **Causal Graph Learning**: Discovers intervention-specific causal relationships between
+      pathways
+    - **Intervention Modeling**: Explicitly models control vs. perturbation differences
+    - **Distribution Matching**: Uses MMD or MSE to ensure realistic perturbation predictions
+
+    Model Architecture:
+    1. **Pathway-Constrained Encoder**: Maps gene expression to biologically meaningful
+       latent space
+    2. **Causal Graph Network**: Learns intervention-specific pathway interactions
+    3. **Intervention-Aware Decoder**: Reconstructs expression under control and
+       perturbation conditions
+    4. **Multi-Component Loss**: Balances reconstruction, prediction, and biological constraints
+
+    Key Applications:
+    - CRISPR knockout/knockdown effect prediction
+    - Drug mechanism of action discovery
+    - Pathway interaction mapping under perturbations
+    - Single-cell intervention response modeling
+    - Causal biomarker identification
+
+    Examples
+    --------
+    >>> import scanpy as sc
+    >>> from scvi.external import SENADVAE
+    >>> # Load perturbation screen data
+    >>> adata = sc.read_h5ad("crispr_screen.h5ad")
+    >>> # adata.obs["perturbation"] contains: "", "KRAS", "TP53", "KRAS,TP53", etc.
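+    >>> # Optional sanity check of the perturbation structure (illustrative)
+    >>> stats = SENADVAE.analyze_perturbations(adata, "perturbation")
+    >>> print(stats["n_controls"], stats["n_unique_genes"])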
+ >>> # Register data with SENA-specific preprocessing + >>> SENADVAE.setup_anndata(adata, perturbation_key="perturbation") + >>> # Initialize model with pathway constraints + >>> model = SENADVAE( + ... adata, + ... go_file_path="GO_pathways.tsv", + ... go_gene_map_path="gene_GO_mapping.tsv", + ... n_latent=64, + ... sena_lambda=0.1, + ... ) + >>> # Train with curriculum learning and progress monitoring + >>> model.train(max_epochs=100, batch_size=256) + >>> # Extract pathway activity representations + >>> latent_pathways = model.get_latent_representation() + >>> # Predict perturbation effects + >>> perturb_pred = model.predict_perturbation(["KRAS", "TP53"]) + """ + + # Core scvi-tools integration components for SENA training infrastructure + _module_cls = SENAModule # Neural architecture with pathway constraints + _training_plan_cls = SENATrainingPlan # Multi-component loss and curriculum learning + _train_runner_cls = TrainRunner # Standard scvi trainer managing training loops + _data_splitter_cls = SENADataSplitter # Control-perturbation aware data splitting + + @staticmethod + def set_seeds(seed: int) -> None: + """ + Configure deterministic random number generation for reproducible SENA training. + + Ensures reproducible results across different runs of perturbation modeling experiments, + critical for validating biological discoveries and comparing model configurations. + Sets seeds for all relevant random number generators used in training pipeline. + + Parameters + ---------- + seed : int + Random seed for deterministic model initialization and training. + """ + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + + # Configure GPU determinism if available + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + # Set PyTorch Lightning seeds for comprehensive determinism + try: + import lightning.pytorch as pl + + pl.seed_everything(seed, workers=True) + except ImportError: + pass + + def __init__( + self, + adata: AnnData, + go_file_path: str, + go_gene_map_path: str, + gene_symb_ensemble_path: str, + n_latent: int = 105, + n_go_thresh: int = 5, + sena_lambda: float = 0.0, + n_hidden_encoder: int = 128, + n_hidden_decoder: int = 128, + n_hidden_interv: int = 128, + seed: int | None = None, + mode: str = "sena", + **model_kwargs, + ): + """ + Initialize SENA model for causal perturbation analysis with biological constraints. + + Sets up the complete SENA architecture including pathway-constrained encoders, + causal graph learning networks, and intervention-aware decoders. Configures + biological constraints from Gene Ontology annotations and prepares model + for training on single-cell perturbation data. + + Parameters + ---------- + adata : AnnData + Pre-registered single-cell perturbation dataset with control/treatment annotations. + Must contain perturbation metadata in .obs and normalized expression in .X. + go_file_path : str + Path to Gene Ontology pathway definition file for biological constraint enforcement. + Expected format: pathway_idgene_symbol for comprehensive pathway coverage. + go_gene_map_path : str + Path to gene-to-GO term mapping file enabling pathway constraint application. + Expected format: gene_symbolgo_term_id for complete gene annotation. + gene_symb_ensemble_path : str + Path to gene symbol to ensemble ID mapping file. + n_latent : int, default 105 + Latent space dimensionality representing pathway activity levels. + Should approximate number of relevant biological pathways (50-200 typical). 
+ n_go_thresh : int, default 5 + Minimum gene count threshold for including GO pathways in constraints. + Filters out very small pathways that may introduce noise (3-10 typical range). + sena_lambda : float, default 0.0 + L1 regularization strength for causal graph sparsity in pathway interactions. + Higher values promote sparser, more interpretable causal structures. + n_hidden_encoder : int, default 512 + Hidden layer width in pathway-constrained encoder networks. + Larger values increase model capacity for complex expression patterns. + n_hidden_decoder : int, default 128 + Hidden layer width in intervention-aware decoder networks. + Should balance reconstruction quality with computational efficiency. + n_hidden_interv : int, default 256 + Hidden layer width in intervention prediction networks. + Controls capacity for modeling perturbation-specific effects. + seed : int, optional + Random seed for reproducible model initialization and training. + Essential for validating biological discoveries across experiments. + mode : str, default "sena" + Training mode: "sena" enables pathway constraints and causal learning, + "normal" uses standard VAE without biological constraints. + **model_kwargs + Additional arguments passed to parent BaseModelClass initialization. + + Notes + ----- + Architecture Initialization: + - Loads Gene Ontology pathway constraints for NetworkActivity layers + - Configures VAE latent dimensionality for pathway representation + - Sets up multi-component loss function with biological regularization + - Initializes causal graph learning with sparsity constraints + """ + # Set deterministic training if seed provided + if seed is not None: + self.set_seeds(seed) + + # Initialize scvi-tools base model infrastructure + super().__init__(adata) + + logger.info("Initializing SENA model with Gene Ontology pathway constraints.") + + # Load and process Gene Ontology pathway annotations for biological constraints + + # Load gene-to-GO mapping file containing pathway memberships + go_map = pd.read_csv(go_gene_map_path, sep="\t") + go_map.columns = ["GO_id", "ensembl_id"] + + # Filter to only genes present in the expression dataset + go_map = go_map[go_map["ensembl_id"].isin(adata.var_names)] + + # Load selected GO pathways for biological constraint enforcement + selected_gos = pd.read_csv(go_file_path, sep="\t")["PathwayID"].values.tolist() + + # Restrict to pathways from the curated pathway selection + go_map = go_map[go_map["GO_id"].isin(selected_gos)] + + # Apply pathway size filtering to ensure robust constraint learning + go_counts = go_map["GO_id"].value_counts() + genesets_in = go_counts[go_counts >= n_go_thresh].index + logger.info( + f"Applying pathway size filter: {len(genesets_in)} pathways with ≥{n_go_thresh} genes." 
+ ) + + go_map = go_map[go_map["GO_id"].isin(genesets_in)] + # We obtain a sorted list of the GOs that pases the filtering + gos = sorted(go_map["GO_id"].unique()) + + # Create gene-pathway relationship dictionary + genes = adata.var.index.values + # dictinaries with gos and genes as their keys, and indices as their valies + go_dict = dict(zip(gos, range(len(gos)), strict=False)) + gen_dict = dict(zip(genes, range(len(genes)), strict=False)) + # We create a dictionary that relates the gos and ensemble ids + # Create filtered pathway-gene relationship mapping for NetworkActivity constraints + rel_dict = defaultdict(list) + gene_set, go_set = set(genes), set(gos) + self.go_dict = go_dict + self.gen_dict = gen_dict + + mapping = pd.read_csv(gene_symb_ensemble_path, sep="\t") + self.mapping_dict = dict( + zip(mapping["external_gene_name"], mapping["ensembl_gene_id"], strict=False) + ) + + # Build gene-to-pathway mapping dictionary for biological constraint enforcement + for go, gen in zip(go_map["GO_id"], go_map["ensembl_id"], strict=False): + if (gen in gene_set) and (go in go_set): + # Map each gene index to its associated pathway indices for NetworkActivity layers + rel_dict[gen_dict[gen]].append(go_dict[go]) + + # Extract intervention metadata from dataset preprocessing + n_intervention_genes = adata.uns.get("n_intervention_genes", 1) + + # Retrieve perturbation annotation key from setup_anndata configuration + self.perturbation_key = adata.uns.get("_sena_perturbation_key") + if self.perturbation_key is None: + raise ValueError( + "Perturbation key not found in AnnData.uns. " + "Ensure setup_anndata() was called with perturbation_key parameter." + ) + + # Set latent dimensionality to match number of biological pathways + n_latent = n_intervention_genes + logger.info(f"Configuring SENA with {n_latent} pathway-constrained latent dimensions.") + + self.rel = rel_dict + + self.go_map = go_map + self.gos = gos + + # Initialize SENA neural architecture with biological constraints + self.module = self._module_cls( + n_input=adata.n_vars, # Number of genes in expression matrix + n_latent=n_latent, # Pathway activity latent dimensions + n_cat_covs=n_intervention_genes, # Number of intervention categories + n_categories_interv=n_intervention_genes, # Intervention encoding dimensionality + gos=gos, # Pathway identifiers for constraints + rel_dict=rel_dict, # Gene-pathway relationship mapping + n_hidden_encoder=n_hidden_encoder, # Encoder network capacity + n_hidden_decoder=n_hidden_decoder, # Decoder network capacity + n_hidden_interv=n_hidden_interv, # Intervention network capacity + mode=mode, # Training mode (sena vs normal) + sena_lambda=sena_lambda, # L1 sparsity regularization strength + **model_kwargs, + ) + + # Configure model summary for inspection and debugging + self._model_summary_string = ( + f"SENA Model Summary:\n" + f"├── Latent Pathways: {n_latent}\n" + f"├── Encoder Hidden: {n_hidden_encoder}\n" + f"├── Decoder Hidden: {n_hidden_decoder}\n" + f"├── Intervention Hidden: {n_hidden_interv}\n" + f"├── Training Mode: {mode}\n" + f"├── L1 Regularization: {sena_lambda}\n" + f"├── Input Genes: {adata.n_vars}\n" + f"└── Intervention Categories: {n_intervention_genes}" + ) + + # Store initialization parameters for model persistence and reproducibility + self.init_params_ = self._get_init_params(locals()) + + @classmethod + @setup_anndata_dsp.dedent + def setup_anndata( + cls, + adata: AnnData, + perturbation_key: str, + layer: str | None = None, + **kwargs, + ) -> AnnData | None: + """ + 
Configure AnnData object for SENA perturbation modeling with automatic preprocessing. + + This class method prepares single-cell perturbation data for SENA training by: + 1. Parsing perturbation annotations into numerical intervention matrices + 2. Validating data quality and perturbation coverage + 3. Extracting intervention gene metadata for model configuration + 4. Registering data fields with scvi-tools infrastructure + + The actual control-perturbation matching and batch preprocessing is handled by + the custom SENADataLoader during training, preserving the original AnnData structure + while enabling sophisticated intervention modeling workflows. + + Parameters + ---------- + %(param_adata)s + perturbation_key : str + Column name in adata.obs containing perturbation identifiers. + Must contain control ("") and perturbation gene name entries. + Example: "perturbation", "treatment", "intervention" + %(param_layer)s + **kwargs + Additional arguments passed to parent setup_anndata method. + + Returns + ------- + AnnData or None + Modified AnnData object with registered fields, or None if in-place modification. + + Notes + ----- + Data Requirements: + - Expression data in adata.X (normalized log counts recommended) + - Perturbation annotations in adata.obs[perturbation_key] + - Both control ("") and perturbed cells must be present + - Gene names in adata.var_names matching GO annotation files + + Perturbation Annotation Format: + - Control cells: empty string ("") + - Single perturbations: gene name ("KRAS") + - Multiple perturbations: comma-separated ("KRAS,TP53") + - Supports any combination of single/multiple perturbations + + Examples + -------- + >>> # Setup CRISPR screen data + >>> adata.obs["perturbation"] = ["", "KRAS", "TP53", "KRAS,TP53", ...] + >>> SENADVAE.setup_anndata(adata, perturbation_key="perturbation") + >>> # Use specific expression layer + >>> SENADVAE.setup_anndata(adata, perturbation_key="treatment", layer="log1p") + """ + logger.info("Configuring AnnData for SENA perturbation modeling.") + + # Validate perturbation annotation presence + if perturbation_key not in adata.obs.columns: + raise ValueError( + f"Perturbation key '{perturbation_key}' not found in adata.obs columns." + ) + + # Validate control and perturbation cell presence + is_control = adata.obs[perturbation_key] == "" + n_controls = is_control.sum() + n_perturbed = (~is_control).sum() + + if n_perturbed == 0: + raise ValueError( + "No perturbed cells detected. Ensure perturbation annotations " + "contain non-empty strings for perturbed cells." + ) + if n_controls == 0: + raise ValueError( + "No control cells detected. Ensure control cells are annotated " + "with empty strings ('') in perturbation column." + ) + + logger.info( + f"Data validation complete: {n_controls} controls, {n_perturbed} perturbed cells." 
+ ) + + # Store perturbation key for dataloader access + adata.uns["_sena_perturbation_key"] = perturbation_key + + # Extract and process intervention gene metadata + all_perturbed_genes = set() + perturbation_strings = adata.obs[perturbation_key].values + + # Parse perturbation strings to extract unique intervention genes + for pert_str in perturbation_strings: + if pert_str and pert_str != "": + # Split comma-separated gene names and clean whitespace + genes = [g.strip() for g in pert_str.split(",")] + all_perturbed_genes.update(genes) + + # Create sorted intervention gene list for consistent model configuration + sorted_genes = sorted(all_perturbed_genes) + n_intervention_genes = len(sorted_genes) + + logger.info( + f"Extracted {n_intervention_genes} unique intervention targets: {sorted_genes[:5]}..." + ) + + # Store intervention metadata for model initialization + adata.uns["intervention_genes"] = sorted_genes + adata.uns["n_intervention_genes"] = n_intervention_genes + + # Configure scvi-tools data registration with SENA-specific fields + setup_method_args = cls._get_setup_method_args(**locals()) + + # Register expression data with scvi-tools infrastructure + # Note: Perturbation annotations are handled by custom dataloader + anndata_fields = [ + LayerField(REGISTRY_KEYS.X_KEY, layer, is_count_data=True), + ] + + # Complete data registration with AnnDataManager + adata_manager = AnnDataManager(fields=anndata_fields, setup_method_args=setup_method_args) + adata_manager.register_fields(adata, **kwargs) + cls.register_manager(adata_manager) + + return adata + + @staticmethod + def analyze_perturbations(adata: AnnData, perturbation_key: str) -> dict: + """ + Analyze perturbation structure and provide comprehensive intervention statistics. + + This utility function examines the distribution and complexity of perturbations + in single-cell intervention datasets, providing essential quality control metrics + for SENA model configuration and experimental design validation. + + Parameters + ---------- + adata : AnnData + Single-cell perturbation dataset with intervention annotations. + Must contain perturbation metadata in .obs[perturbation_key]. + perturbation_key : str + Column name in adata.obs containing perturbation identifiers. + Expected format: "", "gene1", "gene1,gene2" for controls and perturbations. + + Returns + ------- + dict + Comprehensive perturbation statistics including: + - Cell counts by perturbation type (control, single, multiple) + - Most frequent intervention targets + - Unique gene counts and intervention complexity metrics + - Data quality recommendations for SENA training + + Notes + ----- + Statistical Analysis: + - Control vs. perturbed cell counts and ratios + - Single vs. 
multiple perturbation distribution
+        - Most frequent intervention targets and combinations
+        - Unique gene coverage and intervention complexity
+
+        Quality Control Insights:
+        - Sufficient control cell representation for robust modeling
+        - Perturbation target diversity for comprehensive causal learning
+        - Multi-gene intervention complexity for pathway interaction discovery
+        - Data balance assessment for training stability
+
+        Examples
+        --------
+        >>> # Analyze CRISPR screen structure
+        >>> stats = SENADVAE.analyze_perturbations(adata, "perturbation")
+        >>> print(f"Controls: {stats['n_controls']}, Perturbed: {stats['n_perturbed']}")
+        >>> print(f"Unique targets: {stats['n_unique_genes']}")
+        """
+        if perturbation_key not in adata.obs.columns:
+            raise ValueError(
+                f"Perturbation key '{perturbation_key}' not found in adata.obs columns."
+            )
+
+        # Process perturbation annotations
+        perturbations = adata.obs[perturbation_key].fillna("")
+
+        # Categorize cells by perturbation type
+        controls = perturbations == ""
+        single_perturb = (perturbations != "") & (~perturbations.str.contains(","))
+        multi_perturb = perturbations.str.contains(",")
+
+        # Extract all unique intervention genes
+        all_genes = set()
+        for pert_str in perturbations[perturbations != ""]:
+            all_genes.update([g.strip() for g in pert_str.split(",")])
+
+        # Compile comprehensive perturbation statistics
+        summary = {
+            "n_total_cells": len(adata),
+            "n_controls": controls.sum(),
+            "n_single_perturbations": single_perturb.sum(),
+            "n_multi_perturbations": multi_perturb.sum(),
+            "n_unique_combinations": len(perturbations[perturbations != ""].unique()),
+            "n_unique_genes": len(all_genes),
+            "most_common_perturbations": perturbations.value_counts().head(10).to_dict(),
+            "intervention_genes": sorted(all_genes),
+            "control_ratio": controls.sum() / len(adata),
+            "perturbation_complexity": multi_perturb.sum()
+            / (single_perturb.sum() + multi_perturb.sum())
+            if (single_perturb.sum() + multi_perturb.sum()) > 0
+            else 0,
+        }
+
+        return summary
+
+    def train(
+        self,
+        max_epochs: int = 100,
+        alpha_max: float = 1.0,
+        beta_max: float = 1.0,
+        temp_max: float = 100.0,
+        alpha_start_epoch: int = 5,
+        beta_start_epoch: int = 10,
+        temp_start_epoch: int = 5,
+        accelerator: str = "auto",
+        devices: int | str = "auto",
+        train_size: float = 0.9,
+        validation_size: float | None = None,
+        shuffle_set_split: bool = True,
+        batch_size: int = 32,
+        early_stopping: bool = False,
+        check_val_every_n_epoch: int = 1,
+        training_plan: TrainingPlan | None = None,  # Unused; SENA builds its own training plan
+        plan_kwargs: dict | None = None,
+        **trainer_kwargs,
+    ):
+        """
+        Train the SENA model with scheduled loss weights and temperature annealing.
+
+        Parameters
+        ----------
+        max_epochs : int, default 100
+            Maximum number of training epochs.
+        alpha_max : float, default 1.0
+            Maximum weight for intervention loss.
+        beta_max : float, default 1.0
+            Maximum weight for KL divergence loss.
+        temp_max : float, default 100.0
+            Maximum temperature for intervention softmax.
+        alpha_start_epoch : int, default 5
+            Epoch to start ramping up alpha.
+        beta_start_epoch : int, default 10
+            Epoch to start ramping up beta.
+        temp_start_epoch : int, default 5
+            Epoch to start ramping up temperature.
+        accelerator : str, default "auto"
+            Accelerator type for training.
+        devices : int or str, default "auto"
+            Number of devices to use.
+        train_size : float, default 0.9
+            Proportion of data to use for training.
+        validation_size : float, optional
+            Proportion of data to use for validation.
+        shuffle_set_split : bool, default True
+            Whether to shuffle data when splitting.
+        batch_size : int, default 32
+            Batch size for training.
+        early_stopping : bool, default False
+            Whether to use early stopping.
+        check_val_every_n_epoch : int, default 1
+            How often to run validation. Set to 1 to validate every epoch.
+        training_plan : TrainingPlan, optional
+            Ignored; SENA always uses its own custom training plan.
+        plan_kwargs : dict, optional
+            Additional keyword arguments for the training plan.
+        **trainer_kwargs
+            Additional keyword arguments for the trainer.
+        """
+        # Create custom SENA dataloader for reference (callbacks may need it)
+        train_dataloader = SENADataLoader(
+            adata_manager=self.adata_manager,
+            perturbation_key=self.perturbation_key,
+            batch_size=batch_size,
+            shuffle=True,  # Always shuffle during training
+            mode="training",  # Training mode - only single perturbations
+        )
+
+        # Store reference for callbacks
+        self._current_dataloader = train_dataloader
+
+        # Create schedulers: custom callbacks that inherit from Lightning's Callback class
+
+        # Schedules the MSE/MMD and KL divergence regularization weights (alpha and beta)
+        loss_scheduler = LossWeightScheduler(
+            alpha_max=alpha_max,
+            beta_max=beta_max,
+            alpha_start_epoch=alpha_start_epoch,
+            beta_start_epoch=beta_start_epoch,
+        )
+
+        # Schedules the temp parameter that determines the sharpness of
+        # the intervention softmax
+        temp_scheduler = TemperatureScheduler(temp_max=temp_max, temp_start_epoch=temp_start_epoch)
+
+        # Create control reshuffle callback for per-epoch control resampling
+        control_reshuffle_callback = ControlReshuffleCallback()
+
+        # Create custom batch progress bar for batch-level progress tracking
+        batch_progress_bar = SENABatchProgressBar()
+
+        # Import ModelCheckpoint for best model saving
+        from lightning.pytorch.callbacks import ModelCheckpoint
+
+        # Create ModelCheckpoint callback to save best model based on validation loss
+        best_model_checkpoint = ModelCheckpoint(
+            monitor="validation_loss",  # Monitor total validation loss (all components combined)
+            mode="min",  # Save model with minimum validation loss
+            save_top_k=1,  # Keep only the best model
+            save_last=False,  # Don't save the last model automatically
+            verbose=False,  # Reduce checkpoint messages
+            filename="best_model",  # Name for the best model checkpoint
+        )
+
+        # Add schedulers, control reshuffle callback, custom progress bar and best model checkpoint
+        callbacks = [
+            loss_scheduler,
+            temp_scheduler,
+            control_reshuffle_callback,
+            batch_progress_bar,
+            best_model_checkpoint,  # Add best model checkpoint callback
+        ]
+        if "callbacks" in trainer_kwargs:
+            trainer_kwargs["callbacks"].extend(callbacks)
+        else:
+            trainer_kwargs["callbacks"] = callbacks
+
+        # Setup training plan - don't create it ourselves, let the mixin handle it
+        # but pass our custom plan_kwargs if needed
+        if plan_kwargs is None:
+            plan_kwargs = {}
+
+        # Remove training_plan from trainer_kwargs if it exists to avoid duplicate argument
+        if "training_plan" in trainer_kwargs:
+            trainer_kwargs.pop("training_plan")
+
+        # Ensure perturbation_key is passed to the data splitter
+        datasplitter_kwargs = trainer_kwargs.get("datasplitter_kwargs", {})
+        datasplitter_kwargs["perturbation_key"] = self.perturbation_key
+        trainer_kwargs["datasplitter_kwargs"] = datasplitter_kwargs
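+
+        # Scheduling sketch (the exact ramp shapes are defined by the
+        # LossWeightScheduler and TemperatureScheduler callbacks above, not
+        # restated here): the intervention-loss weight is held low until
+        # alpha_start_epoch and then ramps toward alpha_max; beta and temp
+        # behave analogously from their respective start epochs.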
+        # Configure trainer for batch-level progress display
+        trainer_kwargs.update(
+            {
+                "log_every_n_steps": 1,  # Log metrics every batch for real-time updates
+                "enable_progress_bar": False,  # Disable default progress bars (custom one)
+                "simple_progress_bar": False,  # Disable scvi-tools progress bar (custom one)
+                "enable_model_summary": False,  # Reduce startup messages
+                "enable_checkpointing": True,  # Enable checkpointing for best model saving
+            }
+        )
+
+        # Reduce logging verbosity for cleaner output
+        import logging
+
+        logging.getLogger("scvi").setLevel(logging.WARNING)
+        logging.getLogger("lightning").setLevel(logging.WARNING)
+        logging.getLogger("pytorch_lightning").setLevel(logging.WARNING)
+
+        # Pass the arguments to the parent train method
+        # The UnsupervisedTrainingMixin will use our SENADataSplitter automatically
+        super().train(
+            max_epochs=max_epochs,
+            accelerator=accelerator,
+            devices=devices,
+            train_size=train_size,
+            validation_size=validation_size,
+            shuffle_set_split=shuffle_set_split,
+            batch_size=batch_size,
+            early_stopping=early_stopping,
+            plan_kwargs=plan_kwargs,  # Let mixin create training_plan with our kwargs
+            check_val_every_n_epoch=check_val_every_n_epoch,  # Enable validation
+            **trainer_kwargs,  # Includes datasplitter_kwargs with perturbation_key
+        )
+
+        # Load the best model checkpoint after training completes
+        self._load_best_checkpoint()
+
+    def _load_best_checkpoint(self):
+        """Load the best model checkpoint after training completes."""
+        try:
+            # Find the best model checkpoint in the trainer's checkpoint directory
+            if (
+                hasattr(self.trainer, "checkpoint_callback")
+                and self.trainer.checkpoint_callback is not None
+            ):
+                best_model_path = self.trainer.checkpoint_callback.best_model_path
+                if best_model_path and os.path.exists(best_model_path):
+                    # Load the best model state
+                    checkpoint = torch.load(best_model_path, map_location=self.device)
+                    self.module.load_state_dict(checkpoint["state_dict"])
+                    logger.info(f"Loaded best model from checkpoint: {best_model_path}")
+                else:
+                    logger.warning("Best model checkpoint not found, using final epoch model")
+            else:
+                logger.warning("No checkpoint callback found, using final epoch model")
+        except (ValueError, RuntimeError, KeyError, TypeError) as e:
+            logger.warning(f"Failed to load best checkpoint: {e}, using final epoch model")
+
+    def get_latent_representation(
+        self,
+        adata: AnnData | None = None,
+        indices: Sequence[int] | None = None,
+        give_mean: bool = True,
+        batch_size: int | None = None,
+    ) -> np.ndarray:
+        """
+        Return the latent representation for each cell.
+
+        Parameters
+        ----------
+        adata : AnnData, optional
+            AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
+            AnnData object used to initialize the model.
+        indices : Sequence[int], optional
+            Indices of cells in adata to use. If `None`, all cells are used.
+        give_mean : bool, default True
+            Give mean of distribution or sample from it.
+        batch_size : int, optional
+            Batch size for data loading. If `None`, full data is used.
+
+        Returns
+        -------
+        np.ndarray
+            Latent representation of cells.
+        """
+        self._check_if_trained(warn=False)
+        # Verify that the AnnData has been registered with the latest AnnDataManager.
+        # This is a BaseModelClass method that in turn delegates to AnnDataManager.
+        adata = self._validate_anndata(adata)
+        # Extract the latent representation for the requested cell indices.
+        # Processing is batched through the custom dataloader to bound memory use.
+        scdl = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size)
+        latent = []
+
+        # Iterate over the batches, run the encoder on the gene expression input,
+        # and compute mu, variance, and the sampled z.
+        for tensors in scdl:
+            inference_inputs = self.module._get_inference_input(tensors)
+            outputs = self.module.inference(**inference_inputs)
+            if give_mean:
+                latent_sample = outputs["z_mu"]
+            else:
+                latent_sample = outputs["z"]
+            # detach() removes the tensor from the autograd graph and
+            # .cpu() moves it from GPU to host memory
+            latent += [latent_sample.detach().cpu()]
+        # Return the concatenated tensor (mu or z) as a NumPy array
+        return torch.cat(latent).numpy()
+
+    def get_metric(self, adata, batch_size, N=100, temp_values=None) -> dict:
+        """
+        Calculate comprehensive pathway activity analysis and perturbation metrics.
+
+        This method provides a complete analysis similar to the generating_data function,
+        extracting all neural network components, pathway activities, intervention effects,
+        and traditional metrics for comprehensive perturbation analysis.
+
+        Parameters
+        ----------
+        adata : AnnData
+            AnnData object containing perturbation data.
+        batch_size : int
+            Batch size for processing.
+        N : int, default 100
+            Top N pathways to consider for hit ratio calculation.
+        temp_values : list, default [1, 100, 1000]
+            Temperature values for intervention effect analysis.
+
+        Returns
+        -------
+        dict
+            Comprehensive results dictionary containing:
+            - Traditional metrics: da_p, dar_p, hitn
+            - Neural network components: fc1, fc_mean, fc_var, z
+            - Intervention effects: bc_temp1, bc_temp100, bc_temp1000
+            - Causal analysis: u, causal_graph
+            - Network weights: mean_delta_matrix, std_delta_matrix
+            - Mappings: pert_map
+            - GO pathways: gos (list of pathway identifiers)
+
+        Notes
+        -----
+        This method provides complete access to SENA's internal representations
+        and learned parameters, enabling detailed analysis of pathway activities,
+        causal relationships, and intervention mechanisms.
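+
+        Examples
+        --------
+        Illustrative usage (variable names are hypothetical):
+
+        >>> results = model.get_metric(adata, batch_size=128, N=100)
+        >>> results["da_p"].head()  # differential pathway activity per perturbation
+        >>> results["causal_graph"].shape  # (n_pathways, n_pathways)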
+        """
+        logger.info("Starting comprehensive pathway activity analysis.")
+
+        # Initialize default temperature values if not provided
+        if temp_values is None:
+            temp_values = [1, 100, 1000]
+
+        # Initialize results dictionary with all components
+        results_dict = {
+            # Traditional perturbation metrics
+            "da_p": {},
+            "dar_p": {},
+            "hitn": {},
+            # Neural network pathway activities
+            "fc1": {},  # Raw pathway scores from encoder
+            "fc_mean": {},  # VAE mean parameters
+            "fc_var": {},  # VAE variance parameters
+            "z": {},  # Sampled latents (reparameterized)
+            "z_interv": {},
+            "z_interv_temp1": {},
+            "z_interv_temp100": {},
+            "z_interv_temp1000": {},
+            "activity_score": {},  # Pathway activity scores for traditional metrics
+            # Intervention effects at different temperatures
+            "bc_temp1": {},  # Temperature=1 intervention gates
+            "bc_temp100": {},  # Temperature=100 intervention gates
+            "bc_temp1000": {},  # Temperature=1000 intervention gates
+            # Final pathway activities after causal propagation
+            "u": {},
+            # Causal graph and network weights
+            "causal_graph": None,
+            "mean_delta_matrix": None,
+            "std_delta_matrix": None,
+            "pert_map": {},
+            # GO pathway information
+            "gos": self.gos,  # List of GO pathway identifiers
+        }
+
+        # Get indices for control and perturbed cells
+        indices = np.where(adata.obs[self.perturbation_key] != "")[0]
+        indices_control = np.where(adata.obs[self.perturbation_key] == "")[0]
+
+        # Create dataloaders for perturbed and control cells
+        perturbed_dataloader = self._make_data_loader(
+            adata=adata, indices=indices, batch_size=batch_size
+        )
+        controls_dataloader = self._make_data_loader(
+            adata=adata, indices=indices_control, batch_size=batch_size
+        )
+
+        # Extract neural network weights
+        self._extract_network_weights(results_dict)
+
+        # Process perturbed cells
+        self._process_perturbed_cells(perturbed_dataloader, results_dict, temp_values)
+
+        # Process control cells
+        self._process_control_cells(controls_dataloader, results_dict)
+
+        # Convert network weights to DataFrames with GO pathway indices
+        if "mean_delta_matrix" in results_dict and results_dict["mean_delta_matrix"] is not None:
+            results_dict["mean_delta_matrix"] = pd.DataFrame(
+                results_dict["mean_delta_matrix"].T, index=self.gos
+            )
+
+        if "std_delta_matrix" in results_dict and results_dict["std_delta_matrix"] is not None:
+            results_dict["std_delta_matrix"] = pd.DataFrame(
+                results_dict["std_delta_matrix"].T, index=self.gos
+            )
+
+        # Convert stored activation lists to pandas DataFrames
+        logger.info("Converting activation data to pandas DataFrames.")
+
+        # Process each stored layer's per-perturbation lists into DataFrames
+        layers_to_process = [
+            "fc1",
+            "fc_mean",
+            "fc_var",
+            "z",
+            "z_interv",
+            "u",
+            "bc_temp1",
+            "bc_temp100",
+            "bc_temp1000",
+            "activity_score",
+            "z_interv_temp1",
+            "z_interv_temp100",
+            "z_interv_temp1000",
+        ]
+        for layer in layers_to_process:
+            if layer in results_dict and results_dict[layer] is not None:
+                # Skip anything already converted to a DataFrame
+                if isinstance(results_dict[layer], pd.DataFrame):
+                    continue
+
+                temp_df = []
+                for gene in results_dict[layer]:
+                    gene_data_list = results_dict[layer][gene]
+                    if len(gene_data_list) > 0:  # Check if list is not empty
+                        # Convert list of arrays to DataFrame
+                        gene_data = pd.DataFrame(np.vstack(gene_data_list))
+                        gene_data.index = [gene] * gene_data.shape[0]
+                        temp_df.append(gene_data)
+
+                if temp_df:  # Only concatenate if we have data
+                    # Substitute the dictionary with the concatenated DataFrame
+                    results_dict[layer] = pd.concat(temp_df)
+
+                    # Add GO pathway column names for layers with pathway dimensions
+                    if layer in ["fc1", "activity_score", "activity_score_fdr"]:
+                        results_dict[layer].columns = self.gos
+        self._calculate_traditional_metrics(results_dict, N)
+
+        # Perform statistical testing on activity scores after DataFrame conversion
+        if "activity_score" in results_dict and isinstance(
+            results_dict["activity_score"], pd.DataFrame
+        ):
+            try:
+                logger.info("Performing statistical testing on pathway activity scores...")
+                fdr_results = self._test_activity_score_significance(
+                    results_dict["activity_score"]
+                )
+                results_dict["activity_score_fdr"] = fdr_results
+                logger.info("Statistical testing completed successfully.")
+            except (ValueError, RuntimeError, KeyError, TypeError) as e:
+                logger.warning(f"Statistical testing failed: {e}")
+                results_dict["activity_score_fdr"] = None
+
+        logger.info("Comprehensive analysis completed successfully.")
+        return results_dict
+
+    def _extract_network_weights(self, results_dict):
+        """Extract and store neural network weights."""
+        try:
+            # Extract encoder weights
+            if hasattr(self.module.fc_mean, "weight"):
+                weight_tensor = self.module.fc_mean.weight.detach().cpu().numpy()
+                results_dict["mean_delta_matrix"] = weight_tensor
+            if hasattr(self.module.fc_var, "weight"):
+                weight_tensor = self.module.fc_var.weight.detach().cpu().numpy()
+                results_dict["std_delta_matrix"] = weight_tensor
+
+            # Extract causal graph
+            if hasattr(self.module, "G"):
+                results_dict["causal_graph"] = self.module.G.detach().cpu().numpy()
+
+        except (AttributeError, RuntimeError) as e:
+            logger.warning(f"Could not extract network weights: {e}")
+
+    def _process_perturbed_cells(self, dataloader, results_dict, temp_values):
+        """Process perturbed cells to extract all neural network activations."""
+        logger.info("Processing perturbed cells for comprehensive analysis.")
+
+        with torch.no_grad():
+            for tensors in dataloader:
+                # Get intervention information (one-hot encoded)
+                perturbations = tensors["extra_categorical_covs"]
+
+                # Standard inference pass
+                inference_inputs = self.module._get_inference_input(tensors, metrics=True)
+                # inference_outputs has: "z": z, "z_mu": z_mu, "z_var": z_var, "activity_score": h
+                inference_outputs = self.module.inference(**inference_inputs)
+
+                # Extract pathway activities and latent representations
+                # gene_expression
+                x = inference_inputs["x"]
+
+                # Get fc1 outputs (raw pathway activities)
+                fc1_output = self.module.fc1(x)
+
+                # Get VAE parameters
+                fc_mean_output = self.module.fc_mean(fc1_output)
+                fc_var_output = F.softplus(self.module.fc_var(fc1_output))
+
+                # Get sampled latents
+                z = inference_outputs["z"]
+
+                # Process interventions at different temperatures
+                generative_inputs = self.module._get_generative_input(tensors, inference_outputs)
+
+                # Get internal variables at different temperatures including bc values
+                z_interv_by_temp = {}
+                bc1_by_temp = {}
+                bc2_by_temp = {}
+
+                for t in temp_values:
+                    generative_outputs = self.module.generative(**generative_inputs, temp=t)
+                    z_interv_by_temp[f"z_interv_temp{t}"] = generative_outputs["z_interv"]
+                    bc1_by_temp[f"bc_temp{t}"] = generative_outputs["bc1"]
+                    bc2_by_temp[f"bc_temp{t}"] = generative_outputs["bc2"]
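+
+                # Illustrative effect of temp (assuming the temperature-scaled
+                # softmax in the module's _c_encode): with logits [1.0, 2.0],
+                # temp=1 gives a soft gate of roughly [0.27, 0.73], while
+                # temp=100 is near one-hot [0, 1], concentrating the
+                # intervention on a single pathway.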
+                # Use default temp for main variables (maintaining backward compatibility)
+                generative_outputs = self.module.generative(**generative_inputs, temp=1.0)
+
+                # Extract all internal variables from generative outputs
+                u = generative_outputs["u"]
+                z_interv = generative_outputs["z_interv"]
+
+                # Store all activations grouped by perturbation, including bc values
+                activations_dict = {
+                    "fc1": fc1_output,
+                    "fc_mean": fc_mean_output,
+                    "fc_var": fc_var_output,
+                    "z": z,
+                    "z_interv": z_interv,
+                    "u": u,
+                    "activity_score": inference_outputs["activity_score"],
+                }
+
+                # Add temperature-specific z_interv
+                for temp_key, z_interv_temp in z_interv_by_temp.items():
+                    activations_dict[temp_key] = z_interv_temp
+
+                # Add temperature-specific bc values
+                for bc_key, bc_temp in bc1_by_temp.items():
+                    if bc_temp is not None:
+                        activations_dict[bc_key] = bc_temp
+
+                # Store all activations grouped by perturbation
+                self._store_activations_by_perturbation(
+                    perturbations,
+                    dataloader,
+                    activations_dict,
+                    results_dict,
+                )
+
+    def _store_activations_by_perturbation(
+        self, perturbations, dataloader, activations, results_dict
+    ):
+        """Store neural network activations grouped by perturbation type."""
+        # Get perturbation strings from dataloader
+        onehot_gene_dict = dataloader.gene_to_intervention_idx
+
+        # Convert one-hot vectors back to perturbation strings
+        perturbation_strings = []
+        for pert_vector in perturbations:
+            genes = []
+            for j, val in enumerate(pert_vector):
+                if val == 1:
+                    # Find the gene name for this index
+                    for gene_name, gene_idx in onehot_gene_dict.items():
+                        if gene_idx == j:
+                            genes.append(gene_name)
+                            break
+
+            if genes:
+                pert_string = ",".join(genes)
+            else:
+                pert_string = "ctrl"
+            perturbation_strings.append(pert_string)
+
+        # Group activations by perturbation
+        for i, pert_string in enumerate(perturbation_strings):
+            for key, tensor in activations.items():
+                if key not in results_dict:
+                    continue
+
+                if pert_string not in results_dict[key]:
+                    results_dict[key][pert_string] = []
+
+                # Store single cell activation
+                cell_activation = tensor[i : i + 1].detach().cpu().numpy()
+                results_dict[key][pert_string].append(cell_activation)
+
+    def _process_control_cells(self, dataloader, results_dict):
+        """Process control cells for baseline measurements."""
+        logger.info("Processing control cells for baseline analysis.")
+
+        with torch.no_grad():
+            for tensors in dataloader:
+                inference_inputs = self.module._get_inference_input(tensors, metrics=True)
+                inference_outputs = self.module.inference(**inference_inputs)
+
+                # Extract control cell activations
+                x = inference_inputs["x"]
+                fc1_output = self.module.fc1(x)
+                fc_mean_output = self.module.fc_mean(fc1_output)
+                fc_var_output = F.softplus(self.module.fc_var(fc1_output))
+                z = inference_outputs["z"]
+
+                # Calculate u for controls (no intervention)
+                I = torch.eye(self.module.n_latent, device=z.device)
+                dag_matrix = torch.inverse(I - torch.triu(self.module.G, diagonal=1))
+                u = z @ dag_matrix
+
+                # Store control activations
+                control_activations = {
+                    "fc1": fc1_output,
+                    "fc_mean": fc_mean_output,
+                    "fc_var": fc_var_output,
+                    "z": z,
+                    "z_interv": z,  # For controls, z_interv = z (no intervention)
+                    "u": u,
+                    "activity_score": inference_outputs["activity_score"],
+                }
+
+                for key, tensor in control_activations.items():
+                    if key not in results_dict:
+                        continue
+
+                    if "ctrl" not in results_dict[key]:
+                        results_dict[key]["ctrl"] = []
+
+                    results_dict[key]["ctrl"].append(tensor.detach().cpu().numpy())
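+
+    # Worked sketch of the metrics computed below (illustrative numbers): if a
+    # pathway's mean activity is 0.8 under a KRAS perturbation and 0.5 in
+    # controls, its DA is |0.8 - 0.5| = 0.3. DAR compares mean DA outside vs.
+    # inside the perturbed gene's annotated pathways, and HitN is the fraction
+    # of the top-N DA-ranked pathways annotated to the perturbed gene(s).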
+    def _calculate_traditional_metrics(self, results_dict, N):
+        """Calculate traditional perturbation metrics (da_p, dar_p, hitn)."""
+        logger.info("Calculating traditional perturbation metrics.")
+
+        # Use activity_score for traditional metrics calculation
+        if "activity_score" not in results_dict:
+            logger.warning("No activity scores found for traditional metrics calculation.")
+            return
+
+        # Get control baseline
+        if "ctrl" not in results_dict["activity_score"].index:
+            logger.warning("No control cells found for baseline calculation.")
+            return
+
+        ctrl_activities = results_dict["activity_score"].loc["ctrl"].values
+        ctrl_mean = np.mean(ctrl_activities, axis=0)
+
+        # Initialize empty DataFrame for DA values with NaN values
+        df_da = pd.DataFrame(
+            data=np.nan,
+            index=list(np.unique(results_dict["activity_score"].index)),
+            columns=self.gos,
+        )
+
+        # Calculate metrics for each perturbation
+        for pert_strings in np.unique(results_dict["activity_score"].index):
+            if pert_strings == "ctrl":
+                continue
+            activities = results_dict["activity_score"].loc[pert_strings].values
+            pert_mean = np.mean(activities, axis=0)
+
+            # Calculate DA (Differential Activity) as absolute difference per paper definition
+            # DA^p_k = |α̅^p_k - α̅^c_k| where α̅^p_k and α̅^c_k are mean activities
+            raw_diff = pert_mean - ctrl_mean
+            da_values = np.abs(raw_diff)
+            df_da.loc[pert_strings] = da_values
+
+        # Calculate DAR and HitN using gene-pathway mappings
+        self._calculate_dar_hitn(df_da, results_dict, N)
+        results_dict["da_p"] = df_da
+
+    def _calculate_dar_hitn(self, df_da, results_dict, N):
+        """Calculate DAR and HitN metrics for each perturbation."""
+        df_dar = pd.DataFrame(
+            data=np.nan, index=list(results_dict["activity_score"].index), columns=["dar"]
+        )
+        df_hitn = pd.DataFrame(
+            data=np.nan, index=list(results_dict["activity_score"].index), columns=["hitn"]
+        )
+
+        for pert_string in df_da.index:
+            if pert_string == "ctrl":
+                continue
+
+            pathway_indices = []
+            # Collect the pathways annotated to this perturbation's target genes
+            gene_symbols = pert_string.split(",")
+            for gene_symbol in gene_symbols:
+                try:
+                    # Convert gene symbol to Ensembl ID
+                    if hasattr(self, "mapping_dict") and gene_symbol in self.mapping_dict:
+                        ensembl_id = self.mapping_dict[gene_symbol]
+
+                        # Get gene index
+                        if hasattr(self, "gen_dict") and ensembl_id in self.gen_dict:
+                            gene_idx = self.gen_dict[ensembl_id]
+
+                            # Get pathway indices for this gene
+                            if hasattr(self, "rel") and gene_idx in self.rel:
+                                pathway_indices.extend(self.rel[gene_idx])
+
+                except KeyError:
+                    logger.warning(f"Could not map gene {gene_symbol} to pathways")
+                    continue
+
+            if len(pathway_indices) == 0:
+                logger.warning(f"No annotated pathways found for {pert_string}; skipping")
+                continue
+
+            not_pathway_indices = [
+                i for i in range(len(df_da.columns)) if i not in pathway_indices
+            ]
+
+            # DAR: mean DA outside the annotated pathways relative to inside them
+            wp = df_da.loc[pert_string].values[pathway_indices]
+            wp_n = df_da.loc[pert_string].values[not_pathway_indices]
+            dar = np.mean(wp_n) / np.mean(wp)
+
+            # HitN: rank all pathways by DA and count how many of the annotated
+            # pathways fall within the top N
+            da = df_da.loc[pert_string].values
+            inde = np.arange(0, len(df_da.columns))
+            df = pd.DataFrame({"da": da, "index": inde})
+            # Sort pathways by decreasing DA
+            df = df.sort_values(by="da", ascending=False)
+            # Flag rows whose pathway index is annotated to the perturbed gene(s)
+            hits = df.loc[:, "index"].isin(pathway_indices)
+            # Count how many annotated pathways fall within the top N
+            hitN = hits.iloc[:N].sum() / N
+
+            df_dar.loc[pert_string, "dar"] = dar
+            df_hitn.loc[pert_string, "hitn"] = hitN
+        results_dict["dar_p"] = df_dar
+        results_dict["hitn"] = df_hitn
+
+    def predict_perturbation_response(
+        self,
+        adata: AnnData | None = None,
+        indices: Sequence[int] | None = None,
+        batch_size: int | None = None,
+        temp: float = 1.0,
+    ) -> np.ndarray:
+        """
+        Predict gene expression response to perturbations.
+
+        Note: num_interv is auto-detected from the intervention matrix.
+
+        Parameters
+        ----------
+        adata : AnnData, optional
+            AnnData object with equivalent structure to initial AnnData.
+        indices : Sequence[int], optional
+            Indices of cells in adata to use.
+        batch_size : int, optional
+            Batch size for data loading.
+        temp : float, default 1.0
+            Temperature for intervention softmax.
+
+        Returns
+        -------
+        np.ndarray
+            Predicted perturbed gene expression.
+        """
+        # Once the model has been trained on single perturbations, predict
+        # responses to unseen perturbations.
+        self._check_if_trained(warn=False)
+        adata = self._validate_anndata(adata)
+        # Build a prediction-mode dataloader over the requested cells
+        scdl = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size)
+        predictions = []
+
+        # Run the inference and generative passes to obtain all elements of the
+        # neural network, then keep the predicted perturbed expression.
+        for tensors in scdl:
+            inference_inputs = self.module._get_inference_input(tensors)
+            inference_outputs = self.module.inference(**inference_inputs)
+
+            # Returns the one-hot vectors c1 and c2 along with the latent z and
+            # the auto-detected num_interv
+            generative_inputs = self.module._get_generative_input(tensors, inference_outputs)
+
+            # Computes the intervention shift, propagates activities through the
+            # acyclic causal graph, and reconstructs both the control and the
+            # predicted perturbed gene expression
+            generative_outputs = self.module.generative(
+                z=generative_inputs["z"],
+                c1=generative_inputs["c1"],
+                c2=generative_inputs["c2"],
+                num_interv=generative_inputs["num_interv"],  # Auto-detected
+                temp=temp,
+            )
+            # Collect the predicted perturbed gene expression
+            predictions += [generative_outputs["y_hat"].detach().cpu()]
+        return torch.cat(predictions).numpy()
+
+    def get_causal_graph(self) -> np.ndarray:
+        """
+        Get the learned causal graph matrix.
+
+        Returns
+        -------
+        np.ndarray
+            Causal graph adjacency matrix.
+        """
+        self._check_if_trained(warn=False)
+        return torch.triu(self.module.G, diagonal=1).detach().cpu().numpy()
+
+    def _test_activity_score_significance(
+        self, activity_score_df: pd.DataFrame, alpha: float = 0.05, method: str = "fdr_bh"
+    ) -> pd.DataFrame:
+        """
+        Perform two-tailed t-tests comparing pathway activity scores between perturbations & ctrls.
+
+        For each GO term, compares the activity scores of each perturbation condition against
+        control cells, then applies FDR correction across all tests.
+
+        Parameters
+        ----------
+        activity_score_df : pd.DataFrame
+            DataFrame with perturbation conditions as rows and GO terms as columns.
+            Must include 'ctrl' rows for control comparisons.
+        alpha : float, default 0.05
+            Significance level for FDR correction.
+        method : str, default 'fdr_bh'
+            Method for multiple testing correction.
+
+        Returns
+        -------
+        pd.DataFrame
+            DataFrame with perturbations as rows and GO terms as columns containing FDR values.
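+
+        Notes
+        -----
+        A minimal worked example of the default Benjamini-Hochberg correction
+        (illustrative values only):
+
+        >>> from statsmodels.stats.multitest import multipletests
+        >>> multipletests([0.01, 0.02, 0.03], method="fdr_bh")[1]
+        array([0.03, 0.03, 0.03])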
+ """ + # Validate input + if not isinstance(activity_score_df, pd.DataFrame): + raise ValueError("activity_score_df must be a pandas DataFrame") + + # Extract control data + ctrl_mask = activity_score_df.index == "ctrl" + if not ctrl_mask.any(): + raise ValueError("No control cells found. Expected 'ctrl' in DataFrame index.") + + ctrl_data = activity_score_df[ctrl_mask] + + # Get unique perturbation conditions (excluding control) + perturbation_conditions = activity_score_df.index[~ctrl_mask].unique() + + # Initialize results storage + fdr_results = {} + + for condition in perturbation_conditions: + # Get perturbation data for this condition + pert_mask = activity_score_df.index == condition + pert_data = activity_score_df[pert_mask] + + # Skip if insufficient data + if len(pert_data) < 2: + logger.warning(f"Skipping {condition}: insufficient cells (n={len(pert_data)})") + continue + + # Perform t-tests for each GO term + p_values = [] + tested_gos = [] + + for go_term in activity_score_df.columns: + try: + # Get values for this GO term + ctrl_values = ctrl_data[go_term].values + pert_values = pert_data[go_term].values + + # Remove any NaN values + ctrl_values = ctrl_values[~np.isnan(ctrl_values)] + pert_values = pert_values[~np.isnan(pert_values)] + + # Check if we have sufficient data + if len(ctrl_values) < 2 or len(pert_values) < 2: + continue + + # Perform two-tailed t-test + t_stat, p_val = stats.ttest_ind( + pert_values, + ctrl_values, + equal_var=False, # Welch's t-test (unequal variances) + ) + + p_values.append(p_val) + tested_gos.append(go_term) + + except (ValueError, RuntimeError, KeyError, TypeError) as e: + logger.warning(f"Error testing {condition} vs ctrl for {go_term}: {e}") + continue + + # Apply FDR correction if we have tests + if len(p_values) > 0: + # Perform multiple testing correction + rejected, p_adjusted, alpha_sidak, alpha_bonf = multipletests( + p_values, alpha=alpha, method=method + ) + + # Store results for this condition + condition_results = {} + for go_term, fdr_val in zip(tested_gos, p_adjusted, strict=True): + condition_results[go_term] = fdr_val + + # Fill in NaN for GO terms that weren't tested + for go_term in activity_score_df.columns: + if go_term not in condition_results: + condition_results[go_term] = np.nan + + fdr_results[condition] = condition_results + else: + logger.warning(f"No valid tests for condition {condition}") + # Fill with NaN if no tests were possible + fdr_results[condition] = dict.fromkeys(activity_score_df.columns, np.nan) + + # Convert to DataFrame + fdr_df = pd.DataFrame(fdr_results).T + + # Ensure column order matches input + fdr_df = fdr_df.reindex(columns=activity_score_df.columns) + + return fdr_df + + def _make_data_loader( + self, + adata: AnnData | None = None, + indices: Sequence[int] | None = None, + batch_size: int | None = None, + shuffle: bool = False, + data_loader_class=None, + **data_loader_kwargs, + ): + """ + Create a custom SENA data loader that handles control-perturbation matching. + + This method creates a SENADataLoader that performs the exact same preprocessing + as the original setup_anndata method, but applies it on-the-fly during training. + + Parameters + ---------- + adata : AnnData, optional + AnnData object to create dataloader for. + indices : Sequence[int], optional + Indices of cells to include. + batch_size : int, optional + Batch size for data loading. + shuffle : bool, default False + Whether to shuffle the data. 
+        data_loader_class : optional
+            Data loader class to use (ignored, uses SENADataLoader).
+        **data_loader_kwargs
+            Additional arguments for data loader.
+
+        Returns
+        -------
+        SENADataLoader
+            Custom SENA data loader instance.
+        """
+        if batch_size is None:
+            batch_size = 128
+
+        if adata is None:
+            adata_manager = self.adata_manager
+        else:
+            # Validate that the new adata has the same structure, then reuse the
+            # registered manager (validation returns the AnnData, not a manager)
+            self._validate_anndata(adata)
+            adata_manager = self.adata_manager
+
+        # Create SENA-specific dataloader that replicates the original preprocessing
+        # Use prediction mode if indices are specified, training mode otherwise
+        mode = "prediction" if indices is not None else "training"
+
+        sena_dataloader = SENADataLoader(
+            adata_manager=adata_manager,
+            perturbation_key=self.perturbation_key,
+            shuffle=shuffle,
+            batch_size=batch_size,
+            mode=mode,
+            indices=indices,  # Pass indices for prediction mode
+            **data_loader_kwargs,
+        )
+
+        # Store reference to current dataloader for control reshuffling
+        self._current_dataloader = sena_dataloader
+
+        return sena_dataloader
diff --git a/src/scvi/external/SENADVAE/_module.py b/src/scvi/external/SENADVAE/_module.py
new file mode 100644
index 0000000000..c051d3e57f
--- /dev/null
+++ b/src/scvi/external/SENADVAE/_module.py
@@ -0,0 +1,811 @@
+import logging
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from scvi import REGISTRY_KEYS
+from scvi.module.base import BaseModuleClass, auto_move_data
+
+logger = logging.getLogger(__name__)
+
+
+class NetworkActivity_layer(torch.nn.Module):
+    """
+    Biologically-constrained neural network layer implementing Gene Ontology pathway constraints.
+
+    This layer represents the core biological innovation of SENA, transforming raw gene expression
+    data into biologically meaningful pathway activity scores. Unlike standard neural network
+    layers that learn arbitrary gene-gene relationships, this layer enforces
+    biological priors by only allowing connections between genes and their experimentally-validated
+    Gene Ontology annotations.
+
+    Parameters
+    ----------
+    input_genes : int
+        Number of input genes in expression profiles (typically 5,000-20,000).
+    output_gs : int
+        Number of output Gene Ontology pathways (typically 100-500 after filtering).
+    relation_dict : dict
+        Gene-to-pathway mapping from GO annotations.
+        Format: {gene_index: [pathway_index1, pathway_index2, ...]}.
+    bias : bool, default True
+        Whether to include learnable bias terms for each pathway.
+    lambda_parameter : float, default 0
+        Regularization allowing minimal contribution from non-annotated gene-pathway pairs.
+        Prevents complete information loss (typically 0.01-0.1).
+
+    Attributes
+    ----------
+    n_input_genes : int
+        Number of genes in expression profile.
+    n_output_gs : int
+        Number of GO pathways after filtering.
+    relation_dict : dict
+        Gene index to list of pathway indices mapping.
+    mask : torch.Tensor
+        Biological constraint mask enforcing GO annotations.
+    weight : nn.Parameter
+        Learnable weights for gene contributions to pathway activities.
+    bias : nn.Parameter or None
+        Learnable bias term for each pathway baseline activity.
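+
+    Examples
+    --------
+    A minimal sketch of the constraint mask (hypothetical dimensions):
+
+    >>> rel = {0: [0], 1: [0, 1]}  # gene 0 -> pathway 0; gene 1 -> pathways 0 and 1
+    >>> layer = NetworkActivity_layer(input_genes=2, output_gs=2, relation_dict=rel)
+    >>> layer.mask
+    tensor([[1., 0.],
+            [1., 1.]])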
+ + Notes + ----- + Biological Rationale: + - Genes function within coordinated biological pathways/processes + - Gene Ontology provides curated, experimental evidence-based gene-pathway annotations + - Pathway-level analysis reduces noise and improves interpretability + - Enforces sparsity consistent with biological network structure + + Mathematical Implementation: + Creates a masked weight matrix where W[pathway_i, gene_j] = 0 unless gene_j is annotated + to pathway_i in Gene Ontology. The lambda_parameter allows minimal "leakage" to prevent + complete information loss for genes with sparse annotations. + """ + + def __init__(self, input_genes, output_gs, relation_dict, bias=True, lambda_parameter=0): + super().__init__() + self.n_input_genes = input_genes # Number of genes in expression profile + self.n_output_gs = output_gs # Number of GO pathways after filtering + self.relation_dict = relation_dict # Gene index -> list of pathway indices mapping + + # Create biological constraint mask enforcing GO annotations + # Only genes annotated to a pathway can contribute to its activity + mask = torch.zeros((self.n_input_genes, self.n_output_gs)) + for gene_idx in range(self.n_input_genes): + if gene_idx in self.relation_dict: + for pathway_idx in self.relation_dict[gene_idx]: + mask[gene_idx, pathway_idx] = 1.0 + + self.mask = mask + # Apply lambda regularization to prevent complete information loss + # Allows minimal contribution from non-annotated gene-pathway pairs + self.mask[self.mask == 0] = lambda_parameter + + # Learnable weights for gene contributions to pathway activities + # Shape: (n_pathways, n_genes) + self.weight = nn.Parameter(torch.empty((self.n_output_gs, self.n_input_genes))) + + # Learnable bias term for each pathway baseline activity + if bias: + self.bias = nn.Parameter(torch.empty(self.n_output_gs)) + else: + self.register_parameter("bias", None) + self.reset_parameters() + + def forward(self, x): + """ + Transform gene expression to pathway activities using biological constraints. + + Computes pathway activities as a biologically-constrained linear transformation: + pathway_activity = (gene_expression @ masked_weights) + bias + + The masked weights ensure only biologically relevant gene-pathway connections + contribute to the pathway activity computation. + + Parameters + ---------- + x : torch.Tensor + Gene expression matrix, shape (batch_size, n_genes). + Values typically log-normalized single-cell counts. + + Returns + ------- + torch.Tensor + Pathway activity matrix, shape (batch_size, n_pathways). + Each element represents inferred activity level of a biological pathway. + """ + device = self.weight.device + # Apply biological masking: only annotated gene-pathway pairs contribute + # Transpose mask to match matrix multiplication requirements + masked_weights = self.weight * self.mask.T.to(device) + + # Compute pathway activities as weighted sum of annotated genes + output = x @ masked_weights.T + + if self.bias is not None: + return output + self.bias + return output + + def reset_parameters(self) -> None: + """ + Initialize network parameters using standard PyTorch initialization schemes. + + Uses Kaiming uniform initialization for weights and uniform initialization + for bias terms based on fan-in calculations. 
+        """
+        nn.init.kaiming_uniform_(self.weight, a=np.sqrt(5))
+        if self.bias is not None:
+            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / np.sqrt(fan_in) if fan_in > 0 else 0
+            nn.init.uniform_(self.bias, -bound, bound)
+
+
+class SENAModule(BaseModuleClass):
+    """
+    SENA neural architecture for causal perturbation modeling.
+
+    This module implements a specialized Variational Autoencoder designed for single-cell
+    perturbation experiments. It combines biological pathway constraints with causal graph
+    learning to model how genetic perturbations propagate through cellular networks to
+    affect gene expression.
+
+    Parameters
+    ----------
+    n_input : int
+        Number of genes in expression profiles (typically 5,000-20,000).
+    n_latent : int
+        Latent space dimensionality (equals number of GO pathways).
+    n_cat_covs : int
+        Number of intervention categories (same as n_categories_interv).
+    n_categories_interv : int
+        Total number of unique genes that can be perturbed.
+    gos : list
+        Gene Ontology pathway identifiers (e.g., ['GO:0006915', 'GO:0008219']).
+    rel_dict : dict
+        Gene-to-pathway mapping from GO annotations {gene_idx: [pathway_idx1, pathway_idx2]}.
+    n_hidden_encoder : int, default 128
+        Hidden layer size for standard encoder (used when mode="normal").
+    n_hidden_decoder : int, default 128
+        Hidden layer size for gene expression reconstruction decoder.
+    n_hidden_interv : int, default 128
+        Hidden layer size for perturbation effect encoding network.
+    mode : str, default "sena"
+        Architecture mode: "sena" (pathway-constrained) or "normal" (standard VAE).
+    sena_lambda : float, default 0
+        Regularization strength for pathway sparsity constraints.
+
+    Attributes
+    ----------
+    n_input : int
+        Number of input genes in expression profile.
+    n_latent : int
+        Latent space dimensionality (equals number of pathways).
+    n_cat_covs : int
+        Number of intervention categories.
+    n_categories_interv : int
+        Number of unique perturbable genes.
+    mode : str
+        Architecture mode ("sena" or "normal").
+    sena_lambda : float
+        Regularization strength for pathway sparsity constraints.
+    relations : dict
+        Gene-to-pathway mapping dictionary.
+    fc1 : NetworkActivity_layer or nn.Linear
+        First layer of encoder (pathway-constrained or standard).
+    fc_mean : nn.Linear
+        Linear layer for latent mean parameters.
+    fc_var : nn.Linear
+        Linear layer for latent variance parameters.
+    d1 : nn.Linear
+        First decoder layer.
+    d2 : nn.Linear
+        Second decoder layer.
+    G : nn.Parameter
+        Causal graph matrix modeling pathway-pathway interactions.
+    c1 : nn.Linear
+        First intervention encoding layer.
+    c2 : nn.Linear
+        Second intervention encoding layer.
+    c_shift : nn.Parameter
+        Learnable intervention strength parameters.
+    activity_score : torch.Tensor
+        Pathway activity scores from the encoder.
+
+    Notes
+    -----
+    Scientific Framework:
+    1. Gene Expression → Pathway Activities (biologically constrained via GO annotations)
+    2. Pathway Activities → Latent Space (VAE probabilistic encoding with μ, σ)
+    3. Perturbation Effects → Pathway Shifts (learnable intervention-specific effects)
+    4. Causal Propagation → Modified Activities (DAG matrix models pathway interactions)
+    5. 
Modified Activities → Gene Expression (decoder reconstruction/prediction) + + Key Biological Insights: + - Perturbations act primarily at pathway level, not individual genes + - Pathways interact causally (upstream pathways affect downstream ones) + - Intervention effects can be decomposed into pathway-specific shifts + - Causal structure is learnable from observational + intervention data + + Applications: + - Predict single/double gene knockout effects + - Identify causal pathway relationships + - Design optimal perturbation experiments + - Understand drug/therapeutic mechanisms of action + """ + + def __init__( + self, + n_input: int, # Number of input genes in expression profile + n_latent: int, # Latent space dimensionality (equals number of pathways) + n_cat_covs: int, # Number of intervention categories + n_categories_interv: int, # Number of unique perturbable genes + gos: list, # List of GO pathway IDs that passed filtering + rel_dict: dict, # Gene index -> pathway indices mapping + n_hidden_encoder: int = 128, # Standard encoder hidden layer size (mode="normal") + n_hidden_decoder: int = 128, # Decoder hidden layer size + n_hidden_interv: int = 128, # Intervention encoder hidden layer size + mode: str = "sena", + sena_lambda: float = 0, + ): + super().__init__() # Initialize parent BaseModuleClass + + # Store architecture parameters + self.n_input = n_input + self.n_latent = n_latent + self.n_cat_covs = n_cat_covs + self.n_categories_interv = n_categories_interv + self.mode = mode + self.sena_lambda = sena_lambda + self.relations = rel_dict + + # Encoder pathway: Gene Expression → Pathway Activities → Latent Space + if mode == "sena": + # Biologically-constrained encoder using GO pathway annotations + self.fc1 = NetworkActivity_layer( + self.n_input, len(gos), rel_dict, lambda_parameter=sena_lambda + ) + latent_input_size = len(gos) # Pathway activities feed into latent encoding + else: + # Standard linear encoder (no biological constraints) + self.fc1 = nn.Linear(self.n_input, n_hidden_encoder) + latent_input_size = n_hidden_encoder + + # VAE latent space encoding: Pathway Activities → μ, σ parameters + # Separate networks learn mean and variance of latent pathway activity distributions + self.fc_mean = nn.Linear(latent_input_size, self.n_latent) + self.fc_var = nn.Linear(latent_input_size, self.n_latent) + + # Decoder pathway: Latent Activities → Gene Expression + # Standard feedforward decoder reconstructs gene expression from pathway activities + self.d1 = nn.Linear(self.n_latent, n_hidden_decoder) + self.d2 = nn.Linear(n_hidden_decoder, self.n_input) + + # Causal graph matrix: models pathway-pathway interactions + # Upper triangular ensures acyclicity (prevents causal loops) + # G[i,j] represents effect of pathway j on pathway i + self.G = nn.Parameter(torch.zeros(self.n_latent, self.n_latent)) + + # Intervention networks: Perturbation → Pathway Effects + # Transforms one-hot perturbation vectors into pathway-specific shifts + self.c1 = nn.Linear(self.n_categories_interv, n_hidden_interv) + self.c2 = nn.Linear(n_hidden_interv, self.n_latent) + + # Learnable intervention strength parameters + # c_shift[i] represents baseline perturbation strength for gene i + self.c_shift = nn.Parameter(torch.ones(self.n_categories_interv)) + + @auto_move_data + def _get_inference_input( + self, tensors: dict[str, torch.Tensor], metrics: bool = False + ) -> dict[str, torch.Tensor]: + """ + Extract control cell expression for pathway activity inference. 
+ + In SENA's causal framework, we encode control (unperturbed) cells to learn + baseline pathway activities, then apply interventions to predict perturbed states. + This follows the structural equation modeling approach where we model + Y_perturbed = f(Y_control, intervention). + + Parameters + ---------- + tensors : dict of str to torch.Tensor + Input data dictionary containing paired control-perturbation samples. + metrics : bool, default False + If True, use labels key instead of X key for metrics computation. + + Returns + ------- + dict of str to torch.Tensor + Dictionary with control expression for encoder input. + """ + if not metrics: + return {"x": tensors[REGISTRY_KEYS.X_KEY]} + else: + return {"x": tensors[REGISTRY_KEYS.LABELS_KEY]} + + @auto_move_data + def _get_generative_input( + self, tensors: dict[str, torch.Tensor], inference_output: dict[str, torch.Tensor], **kwargs + ) -> dict[str, torch.Tensor]: + """ + Process intervention matrix and prepare inputs for perturbation effect modeling. + + This method converts complex multi-one-hot intervention matrices into the format + required by SENA's intervention networks. It handles single and double perturbations + by decomposing them into two separate one-hot vectors (c1, c2) that can be processed + by the intervention encoding networks. + + Parameters + ---------- + tensors : dict of str to torch.Tensor + Input data containing intervention matrix. + inference_output : dict of str to torch.Tensor + Latent variables from encoder (z, z_mu, z_var). + **kwargs + Additional parameters (num_interv can be manually specified). + + Returns + ------- + dict of str to torch.Tensor + Processed intervention data ready for generative modeling. + + Notes + ----- + Intervention Matrix Format: + - Shape: (batch_size, n_intervention_genes) + - Values: -1 (control), 0 (non-perturbed gene), 1 (perturbed gene) + - Multiple 1s indicate combinatorial perturbations + + Output Format: + - c1: First perturbation vector (single perturbations or first gene in doubles) + - c2: Second perturbation vector (zeros for singles, second gene for doubles) + - num_interv: Auto-detected number of interventions per cell + """ + z = inference_output["z"] + + # Extract intervention matrix encoding perturbation states + # Shape: (batch_size, n_intervention_genes) + # Values: -1 for controls, 0 for non-perturbed genes, 1 for perturbed genes + intervention_matrix = tensors.get(REGISTRY_KEYS.CAT_COVS_KEY) + + # Initialize intervention vectors for dual-perturbation architecture + c1, c2 = None, None + if intervention_matrix is not None: + batch_size = intervention_matrix.shape[0] + n_per = intervention_matrix.shape[1] # Number of unique perturbable genes + + # Initialize c1 and c2 as zero vectors for all cells + c1 = torch.zeros( + batch_size, n_per, device=intervention_matrix.device, dtype=torch.float32 + ) + c2 = torch.zeros( + batch_size, n_per, device=intervention_matrix.device, dtype=torch.float32 + ) + + # Auto-detect maximum number of simultaneous interventions + first_cell_interventions = intervention_matrix[0, :] + first_cell_perturbed = torch.where(first_cell_interventions == 1)[0] + max_interventions = len(first_cell_perturbed) + + # Process each cell's intervention pattern + for cell_idx in range(batch_size): + cell_interventions = intervention_matrix[cell_idx, :] + + # Identify perturbed genes (value = 1 in intervention matrix) + perturbed_gene_indices = torch.where(cell_interventions == 1)[0] + n_perturbed = len(perturbed_gene_indices) + + # Distribute 
perturbations across c1 and c2 networks + if n_perturbed == 1: + # Single perturbation: assign to c1, c2 remains zero + gene_idx = perturbed_gene_indices[0] + c1[cell_idx, gene_idx] = 1.0 + + elif n_perturbed == 2: + # Double perturbation: split between c1 and c2 + gene_idx_1 = perturbed_gene_indices[0] + gene_idx_2 = perturbed_gene_indices[1] + c1[cell_idx, gene_idx_1] = 1.0 + c2[cell_idx, gene_idx_2] = 1.0 + + elif n_perturbed > 2: + # Higher-order perturbations: use first two genes only + # Note: SENA architecture currently supports max 2 simultaneous perturbations + gene_idx_1 = perturbed_gene_indices[0] + gene_idx_2 = perturbed_gene_indices[1] + c1[cell_idx, gene_idx_1] = 1.0 + c2[cell_idx, gene_idx_2] = 1.0 + logger.warning( + f"Cell {cell_idx} has {n_perturbed} perturbations, using only first 2" + ) + + # n_perturbed == 0 (control cells): both c1 and c2 remain zeros + + # Auto-detect intervention count with manual override capability + auto_num_interv = min(max_interventions, 2) # Cap at 2 (current architecture limit) + num_interv = kwargs.get("num_interv", auto_num_interv) + + else: + # No intervention matrix provided - default to single intervention + num_interv = kwargs.get("num_interv", 1) + + return {"z": z, "c1": c1, "c2": c2, "num_interv": num_interv} + + @auto_move_data + def inference(self, x: torch.Tensor, **kwargs): + """ + Encode gene expression to pathway-constrained latent representations. + + This method implements the encoder pathway of SENA, transforming raw gene expression + into biologically meaningful latent variables representing pathway activities. + The encoding process enforces biological constraints through Gene Ontology annotations, + ensuring the latent space captures interpretable biological processes. + + Parameters + ---------- + x : torch.Tensor + Gene expression matrix, shape (batch_size, n_genes). + Typically log-normalized single-cell RNA-seq counts from control cells. + **kwargs + Additional keyword arguments (unused). + + Returns + ------- + dict + Dictionary containing: + - z : torch.Tensor + Sampled latent pathway activities, shape (batch_size, n_pathways). + - z_mu : torch.Tensor + Mean pathway activities, shape (batch_size, n_pathways). + - z_var : torch.Tensor + Variance of pathway activities, shape (batch_size, n_pathways). + - activity_score : torch.Tensor + Pathway activity scores from encoder, shape (batch_size, n_pathways). + + Notes + ----- + Encoding Pipeline: + 1. Gene Expression → Pathway Activities (via NetworkActivity_layer) + 2. Pathway Activities → Latent Parameters (μ, σ² for VAE) + 3. 
Latent Sampling → z ~ N(μ, σ²) (reparameterization trick) + + Biological Interpretation: + - Input: Raw gene expression from control cells + - Latent z: Inferred activities of biological pathways + - μ, σ²: Uncertainty estimates for pathway activity inference + """ + # Transform gene expression to pathway activities using biological constraints + # LeakyReLU activation allows for subtle negative pathway activities + h = F.leaky_relu(self.fc1(x), 0.2) + self.activity_score = h + + # Encode pathway activities to latent distribution parameters + z_mu = self.fc_mean(h) # Mean pathway activity levels + z_var = F.softplus(self.fc_var(h)) # Positive variance estimates + + # Sample latent pathway activities using reparameterization trick + # Enables gradient flow through stochastic sampling + z = self.reparameterize(z_mu, z_var) + + return {"z": z, "z_mu": z_mu, "z_var": z_var, "activity_score": h} + + @auto_move_data + def generative( + self, + z: torch.Tensor, + c1: torch.Tensor, + c2: torch.Tensor, + num_interv: int = 1, + temp: float = 1.0, + **kwargs, + ): + """ + Generate perturbed gene expression through causal pathway modeling. + + This method implements the core causal mechanism of SENA, modeling how genetic + perturbations propagate through biological pathway networks to affect gene expression. + It combines intervention effects with learned causal relationships between pathways. + + Parameters + ---------- + z : torch.Tensor + Baseline pathway activities from encoder, shape (batch_size, n_pathways). + c1 : torch.Tensor + First intervention one-hot vector, shape (batch_size, n_intervention_genes). + c2 : torch.Tensor + Second intervention one-hot vector, shape (batch_size, n_intervention_genes). + num_interv : int, default 1 + Number of simultaneous interventions (0=control, 1=single, 2=double). + Auto-detected from intervention matrix processing. + temp : float, default 1.0 + Temperature parameter for intervention effect sharpness (softmax temperature). + Higher values → more focused pathway effects. + **kwargs + Additional keyword arguments (unused). + + Returns + ------- + dict + Dictionary containing: + - y_hat : torch.Tensor + Predicted perturbed gene expression, shape (batch_size, n_genes). + - x_recon : torch.Tensor + Reconstructed control expression, shape (batch_size, n_genes). + - G : torch.Tensor + Learned causal DAG matrix, shape (n_pathways, n_pathways). + + Notes + ----- + Causal Modeling Pipeline: + 1. Intervention Encoding: One-hot perturbation → pathway-specific effects + 2. Pathway Shifting: Apply intervention effects to baseline pathway activities + 3. Causal Propagation: Propagate effects through learned pathway DAG + 4. Expression Decoding: Transform final pathway activities → gene expression + + Mathematical Framework: + - z_interv = z + Σ(bc_i * csz_i) for i interventions + - u = z_interv @ (I - G_upper)^(-1) [causal propagation] + - y_hat = Decoder(u) [perturbed expression prediction] + + Biological Interpretation: + - bc: Which pathways are affected by each intervention (gating) + - csz: Magnitude of intervention effect (strength) + - G: Learned causal relationships between pathways (DAG structure) + - u: Final pathway activities after causal propagation + """ + + def _c_encode(c_one_hot, temp): + """ + Encode single intervention to pathway-level effects. + + Transforms one-hot perturbation vector into two components: + 1. Gating (bc): Which pathways are affected by this intervention + 2. 
Strength (s): Overall magnitude of intervention effect + + Returns + ------- + tuple + (bc, s) where bc is pathway gating vector and s is strength scalar + """ + if c_one_hot is None: + return torch.zeros_like(z), torch.zeros(z.shape[0], device=z.device) + + # Encode intervention to pathway-level effects + # Two-layer network learns intervention → pathway mapping + h = F.leaky_relu(self.c1(c_one_hot.float()), 0.2) + + # Apply temperature-controlled softmax for focused pathway targeting + # Higher temperature → more selective pathway effects + h = F.softmax(self.c2(h) * temp, dim=-1) + + # Compute intervention strength from learnable gene-specific parameters + # c_shift contains learned baseline perturbation strengths per gene + s = c_one_hot.float() @ self.c_shift + + return h, s # (gating_vector, strength_scalar) + + # Initialize intervention effect components + bc1, csz1 = torch.zeros_like(z), torch.zeros(z.shape[0], device=z.device) + bc2, csz2 = torch.zeros_like(z), torch.zeros(z.shape[0], device=z.device) + + # Apply intervention-specific pathway shifts + # Each intervention contributes additively to pathway activity changes + if num_interv == 1: + # Single perturbation: apply only first intervention + bc1, csz1 = _c_encode(c1, temp) + z_interv = z + bc1 * csz1.reshape(-1, 1) # Broadcast strength across pathways + elif num_interv == 2: + # Double perturbation: combine effects of both interventions + bc1, csz1 = _c_encode(c1, temp) + bc2, csz2 = _c_encode(c2, temp) + z_interv = z + bc1 * csz1.reshape(-1, 1) + bc2 * csz2.reshape(-1, 1) + else: + # No intervention (control) or unsupported intervention count + z_interv = z + + # Apply causal propagation through learned pathway DAG + # Models how pathway perturbations propagate through biological networks + I = torch.eye(self.n_latent, device=z.device) + + # Ensure acyclicity by using upper triangular G matrix + # G[i,j] represents causal effect of pathway j on pathway i + dag_matrix = torch.inverse(I - torch.triu(self.G, diagonal=1)) + + # Propagate intervention effects through causal pathway network + # u represents final pathway activities after causal propagation + u = z_interv @ dag_matrix + + # Decode final pathway activities to predicted perturbed gene expression + y_hat = self.decode(u) + + # Also reconstruct control expression for regularization + # Uses baseline pathway activities without intervention effects + u_recon = z @ dag_matrix + x_recon = self.decode(u_recon) + + return { + "y_hat": y_hat, + "x_recon": x_recon, + "G": self.G, + "z_interv": z_interv, + "u": u, + "bc1": bc1 if num_interv >= 1 else None, + "bc2": bc2 if num_interv >= 2 else None, + "dag_matrix": dag_matrix, + } + + def decode(self, u: torch.Tensor) -> torch.Tensor: + """ + Decode pathway activities to gene expression predictions. + + This method transforms latent pathway activities back to gene expression space + using a standard feedforward decoder. Unlike the encoder which enforces biological + constraints, the decoder learns flexible mappings from pathways to genes. + + Parameters + ---------- + u : torch.Tensor + Pathway activities after causal propagation, shape (batch_size, n_pathways). + + Returns + ------- + torch.Tensor + Predicted gene expression, shape (batch_size, n_genes). + """ + h = F.leaky_relu(self.d1(u)) + return F.leaky_relu(self.d2(h)) + + def reparameterize(self, mu: torch.Tensor, var: torch.Tensor) -> torch.Tensor: + """ + VAE reparameterization trick for differentiable stochastic sampling. 
+ + Enables gradient flow through stochastic pathway activity sampling by + expressing random sampling as a deterministic function of learnable parameters + plus independent noise: z = μ + σ * ε, where ε ~ N(0,1). + + Parameters + ---------- + mu : torch.Tensor + Mean pathway activities, shape (batch_size, n_pathways). + var : torch.Tensor + Variance of pathway activities, shape (batch_size, n_pathways). + + Returns + ------- + torch.Tensor + Sampled pathway activities, shape (batch_size, n_pathways). + """ + std = torch.sqrt(var) + eps = torch.randn_like(std) + return eps * std + mu + + @auto_move_data + def loss( + self, + tensors: dict[str, torch.Tensor], + inference_output: dict[str, torch.Tensor], + generative_output: dict[str, torch.Tensor], + kl_weight: float = 1.0, + **kwargs, + ) -> dict[str, torch.Tensor]: + """ + Fallback loss computation for SENA module (not used with custom training plan). + + This method provides a basic loss implementation for compatibility but should + not be called during normal training since SENA uses SENATrainingPlan with + specialized multi-component loss functions. The custom training plan implements + more sophisticated loss components including MMD, scheduled weights, and + biological regularization. + + Parameters + ---------- + tensors : dict of str to torch.Tensor + Input data containing control (X) and perturbed (labels) expression. + inference_output : dict of str to torch.Tensor + Encoder outputs (z, z_mu, z_var). + generative_output : dict of str to torch.Tensor + Decoder outputs (y_hat, x_recon, G). + kl_weight : float, default 1.0 + Weight for KL divergence term (typically scheduled during training). + **kwargs + Additional keyword arguments (unused). + + Returns + ------- + dict of str to torch.Tensor + Dictionary of loss components for compatibility. + + Notes + ----- + Loss Components: + 1. Reconstruction Loss: MSE between predicted and actual control expression + 2. Intervention Loss: MSE between predicted and actual perturbed expression + 3. KL Divergence: VAE regularization encouraging N(0,1) latent distribution + 4. 
L1 Regularization: Sparsity penalty on causal DAG matrix + """ + logger.debug("SENAModule.loss() called - should use SENATrainingPlan instead!") + + # Defensive programming: handle missing keys gracefully + if REGISTRY_KEYS.LABELS_KEY not in tensors: + device = tensors[REGISTRY_KEYS.X_KEY].device + return self._return_dummy_loss(device) + + # Extract data tensors + x = tensors[REGISTRY_KEYS.X_KEY] # Control expression (encoder input) + y = tensors[REGISTRY_KEYS.LABELS_KEY] # Perturbed expression (target) + + # Extract model outputs with safety checks + x_recon = generative_output.get("x_recon") + y_hat = generative_output.get("y_hat") + G = generative_output.get("G") + mu = inference_output.get("z_mu") + var = inference_output.get("z_var") + + # Return dummy loss if essential outputs missing + if x_recon is None or mu is None or var is None: + return self._return_dummy_loss(x.device) + + # Compute loss components using simplified methods + # (SENATrainingPlan implements more sophisticated versions) + + # Intervention loss: simple MSE (SENATrainingPlan uses MMD) + if y_hat is None: + loss_interv = torch.tensor(0.0, device=x.device) + else: + loss_interv = torch.nn.functional.mse_loss(y_hat, y) + + # Reconstruction loss: MSE for control expression + loss_recon = torch.nn.functional.mse_loss(x_recon, x) + + # KL divergence: VAE regularization term + logvar = torch.log(var) + KLD_element = mu.pow(2).add_(logvar.exp()).mul_(-1).add_(1).add_(logvar) + loss_kld = torch.mean(KLD_element).mul_(-0.5) / x.shape[0] + + # L1 regularization: sparsity penalty on causal DAG + if G is not None: + l1_numerator = torch.norm(torch.triu(G, diagonal=1), p=1) + l1_denominator = torch.sum(torch.triu(torch.ones_like(G), diagonal=1)) + loss_l1 = ( + l1_numerator / l1_denominator + if l1_denominator > 0 + else torch.tensor(0.0, device=G.device) + ) + else: + loss_l1 = torch.tensor(0.0, device=x.device) + + # Total loss (simplified weighting) + total_loss = loss_recon + loss_interv + kl_weight * loss_kld + 0.1 * loss_l1 + + return { + "loss": total_loss, + "reconstruction_loss": loss_recon, + "kl_local": loss_kld, + "intervention_loss": loss_interv, + "l1_reg_loss": loss_l1, + } + + def _return_dummy_loss(self, device): + """ + Return dummy loss components for error handling. + + Parameters + ---------- + device : torch.device + Device on which to create the dummy tensors. + + Returns + ------- + dict of str to torch.Tensor + Dictionary with zero-valued loss components. + """ + return { + "loss": torch.tensor(0.0, device=device), + "reconstruction_loss": torch.tensor(0.0, device=device), + "kl_local": torch.tensor(0.0, device=device), + "intervention_loss": torch.tensor(0.0, device=device), + "l1_reg_loss": torch.tensor(0.0, device=device), + } diff --git a/src/scvi/external/SENADVAE/_training_plan.py b/src/scvi/external/SENADVAE/_training_plan.py new file mode 100644 index 0000000000..ebd7b3e349 --- /dev/null +++ b/src/scvi/external/SENADVAE/_training_plan.py @@ -0,0 +1,931 @@ +""" +SENA Training Plan Implementation for Causal Perturbation Analysis + +This module implements specialized training procedures for the SENA (Structural Equation +Network Analysis) model, providing sophisticated loss computation, parameter scheduling, +and progress tracking tailored for single-cell perturbation experiments. 
+ +Key Components: +- SENATrainingPlan: Custom training logic with multi-component loss functions +- MMD_loss: Maximum Mean Discrepancy for distribution matching in perturbation analysis +- LossWeightScheduler: Dynamic scheduling of α (intervention) and β (KL) loss weights +- TemperatureScheduler: Annealing of intervention effect sharpness over training +- SENABatchProgressBar: Real-time batch-level progress tracking for long training runs + +The training plan implements the complete SENA loss function combining: +1. Reconstruction loss (MSE) for control expression accuracy +2. Intervention loss (MMD/MSE) for perturbation prediction accuracy +3. KL divergence for VAE regularization +4. L1 regularization for sparse causal graph learning +""" + +import logging + +import torch +import torch.nn as nn +from lightning.pytorch.callbacks import Callback + +from scvi import REGISTRY_KEYS +from scvi.train import Trainer, TrainingPlan + +try: + from tqdm import tqdm +except ImportError: + tqdm = None + +logger = logging.getLogger(__name__) + + +class MMD_loss(nn.Module): + """ + Maximum Mean Discrepancy loss for comparing gene expression distributions. + + MMD provides a powerful non-parametric method for comparing distributions that is + particularly well-suited for single-cell perturbation analysis. Unlike MSE which + requires paired samples, MMD can compare entire distributions of perturbed vs + control cells, making it ideal for scenarios where exact cell-to-cell matching + is impossible or undesirable. + + Parameters + ---------- + kernel_mul : float, default=2.0 + Multiplicative factor for generating multiple kernel bandwidths. + kernel_num : int, default=5 + Number of different kernel scales to use. + fix_sigma : float, optional + Fixed bandwidth parameter; if None, bandwidth is data-adaptive. + + Attributes + ---------- + kernel_num : int + Number of kernel scales used in computation. + kernel_mul : float + Bandwidth multiplication factor. + fix_sigma : float or None + Fixed bandwidth value. + + Notes + ----- + Biological rationale: + - Perturbations affect population-level expression distributions. + - Individual cells show heterogeneous responses to the same perturbation. + - MMD captures distributional shifts that MSE might miss. + - Robust to outliers and batch effects common in single-cell data. + + Mathematical foundation: + MMD uses kernel methods to embed distributions in a reproducing kernel Hilbert space + where mean embeddings can be compared via L2 distance. The Gaussian kernel with + multiple bandwidths provides rich comparison capabilities. + """ + + def __init__(self, kernel_mul=2.0, kernel_num=5, fix_sigma=None): + super().__init__() + self.kernel_num = kernel_num + self.kernel_mul = kernel_mul + self.fix_sigma = fix_sigma + + def gaussian_kernel(self, source, target, kernel_mul=2.0, kernel_num=5, fix_sigma=None): + """ + Compute multi-scale Gaussian kernel matrix for MMD calculation. + + This method implements the core kernel computation that enables MMD to compare + distributions across multiple scales simultaneously. The multi-scale approach + is crucial for capturing both local and global differences between expression + distributions. + + Parameters + ---------- + source : torch.Tensor + Source distribution samples (e.g., predicted perturbation effects). + target : torch.Tensor + Target distribution samples (e.g., observed perturbation effects). + kernel_mul : float, default=2.0 + Bandwidth multiplication factor for creating kernel scales. 
+        kernel_num : int, default=5
+            Number of kernel scales to compute.
+        fix_sigma : float, optional
+            Fixed bandwidth; if None, computed adaptively from data.
+
+        Returns
+        -------
+        torch.Tensor
+            Combined kernel matrix incorporating all scales.
+
+        Notes
+        -----
+        Kernel design:
+        - Multiple bandwidths capture different scales of distribution differences.
+        - Data-adaptive bandwidth selection ensures robustness across datasets.
+        - Gaussian kernels provide smooth, differentiable similarity measures.
+        """
+        n_samples = int(source.size()[0]) + int(target.size()[0])
+        total = torch.cat([source, target], dim=0)
+
+        # Create all pairwise sample combinations for kernel evaluation
+        total0 = total.unsqueeze(0).expand(
+            int(total.size(0)), int(total.size(0)), int(total.size(1))
+        )
+        total1 = total.unsqueeze(1).expand(
+            int(total.size(0)), int(total.size(0)), int(total.size(1))
+        )
+
+        # Compute squared L2 distances between all sample pairs
+        L2_distance = ((total0 - total1) ** 2).sum(2)
+
+        # Adaptive bandwidth selection based on data characteristics
+        if fix_sigma:
+            bandwidth = fix_sigma
+        else:
+            # Mean heuristic: use the mean pairwise squared distance as base bandwidth
+            bandwidth = torch.sum(L2_distance.data) / (n_samples**2 - n_samples)
+
+        # Generate multiple kernel scales for robust comparison
+        bandwidth /= kernel_mul ** (kernel_num // 2)
+        bandwidth_list = [bandwidth * (kernel_mul**i) for i in range(kernel_num)]
+
+        # Compute Gaussian kernels at all scales
+        kernel_val = [
+            torch.exp(-L2_distance / bandwidth_temp) for bandwidth_temp in bandwidth_list
+        ]
+
+        # Combine kernels across scales (sum provides richer representation)
+        return sum(kernel_val)
+
+    def forward(self, source, target):
+        """
+        Compute Maximum Mean Discrepancy between source and target distributions.
+
+        This method implements the MMD estimator using the kernel trick, providing
+        a kernel-based estimate of the distance between distributions in the reproducing
+        kernel Hilbert space (RKHS). Averaging over the full kernel matrices, diagonals
+        included, yields the biased V-statistic form of the estimator, which remains
+        effective for high-dimensional biological data where traditional parametric
+        approaches fail.
+
+        MMD Formula:
+            MMD²(P,Q) = E[k(X,X')] + E[k(Y,Y')] - 2E[k(X,Y)]
+            where X~P (source), Y~Q (target), k is the kernel function
+
+        Biological Interpretation:
+        - High MMD: Predicted and observed perturbation effects are very different
+        - Low MMD: Model successfully captures perturbation-induced changes
+        - Zero MMD: Perfect distributional match (theoretical optimum)
+
+        Parameters
+        ----------
+        source : torch.Tensor
+            Source distribution samples, shape (n_samples, n_features)
+            Typically predicted perturbed gene expression
+        target : torch.Tensor
+            Target distribution samples, shape (n_samples, n_features)
+            Typically observed perturbed gene expression; must contain the same
+            number of samples as ``source``
+
+        Returns
+        -------
+        torch.Tensor
+            MMD loss value (scalar); higher values indicate greater distributional mismatch
+        """
+        batch_size = int(source.size()[0])
+
+        # Compute kernel matrix for all pairwise combinations
+        kernels = self.gaussian_kernel(
+            source,
+            target,
+            kernel_mul=self.kernel_mul,
+            kernel_num=self.kernel_num,
+            fix_sigma=self.fix_sigma,
+        )
+
+        # Extract kernel submatrices for MMD computation
+        XX = kernels[:batch_size, :batch_size]  # Source-source similarities
+        YY = kernels[batch_size:, batch_size:]  # Target-target similarities
+        XY = kernels[:batch_size, batch_size:]  # Source-target similarities
+        YX = kernels[batch_size:, :batch_size]  # Target-source similarities
+
+        # Compute the biased (V-statistic) MMD estimate; the elementwise combination
+        # below requires equal source/target batch sizes
+        # Higher values indicate greater distributional difference
+        loss = torch.mean(XX + YY - XY - YX)
+        return loss
+
+
+class LossWeightScheduler(Callback):
+    """
+    Dynamic scheduler for SENA loss component weights during training.
+
+    This scheduler implements a sophisticated annealing strategy for the α (intervention)
+    and β (KL divergence) loss weights, following principles from curriculum learning and
+    disentangled representation learning. The scheduling prevents early training instability
+    while ensuring proper convergence to the full SENA objective.
+
+    Training Phases:
+    1. Initialization Phase: Focus on basic reconstruction (low α, β)
+    2. Ramping Phase: Gradually introduce intervention and KL penalties
+    3.
Stable Phase: Use full loss weights for final convergence + + Biological Motivation: + - Early training: Learn basic gene expression patterns + - Mid training: Introduce perturbation modeling gradually + - Late training: Enforce full causal structure and regularization + + Mathematical Schedule: + - Alpha (intervention weight): Linear ramp from 0 to α_max over specified epochs + - Beta (KL weight): Linear ramp from 0 to β_max over specified epochs + - Independent scheduling allows fine-tuning of each component + + Parameters + ---------- + alpha_max : float + Maximum weight for intervention loss (typically 0.5-2.0) + Higher values emphasize perturbation prediction accuracy + beta_max : float + Maximum weight for KL divergence loss (typically 0.1-1.0) + Higher values enforce stronger VAE regularization + alpha_start_epoch : int, default=5 + Epoch to begin ramping up intervention loss weight + Early epochs focus on basic reconstruction + beta_start_epoch : int, default=10 + Epoch to begin ramping up KL divergence weight + Delayed to allow pathway representation learning + """ + + def __init__( + self, + alpha_max: float, + beta_max: float, + alpha_start_epoch: int = 5, + beta_start_epoch: int = 10, + ): + super().__init__() + self.alpha_max = alpha_max + self.beta_max = beta_max + self.alpha_start_epoch = alpha_start_epoch + self.beta_start_epoch = beta_start_epoch + + def on_train_epoch_start(self, trainer: Trainer, pl_module: TrainingPlan): + """ + Update loss weights at the beginning of each training epoch. + + This method implements the curriculum learning schedule, gradually increasing + the importance of intervention and KL divergence losses as training progresses. + The scheduling ensures stable training dynamics while maintaining the full + expressiveness of the SENA objective. 
+ + Scheduling Logic: + - Phase 1 (α): Ramp from 0 to α_max over first half of training + - Phase 2 (α): Maintain α_max for second half of training + - Phase 1 (β): Keep at 0 until β_start_epoch + - Phase 2 (β): Linear ramp from 0 to β_max until end of training + + Parameters + ---------- + trainer : Trainer + PyTorch Lightning trainer containing epoch information + pl_module : TrainingPlan + SENA training plan module to update with new weights + """ + current_epoch = trainer.current_epoch + max_epochs = trainer.max_epochs + + # Alpha (intervention loss) scheduling + # Ramp up over first half of training, then maintain maximum + half_epochs = max_epochs // 2 + if current_epoch < self.alpha_start_epoch: + alpha_val = 0.0 # Early training: focus on reconstruction + elif self.alpha_start_epoch <= current_epoch < half_epochs: + # Linear ramp-up phase for intervention loss + total_ramp_epochs = half_epochs - self.alpha_start_epoch + current_ramp_epoch = current_epoch - self.alpha_start_epoch + alpha_val = (self.alpha_max / total_ramp_epochs) * current_ramp_epoch + else: + alpha_val = self.alpha_max # Stable phase: full intervention weight + + # Update training plan with new alpha value + pl_module.alpha = alpha_val + + # Beta (KL divergence) scheduling + # Delayed start to allow pathway representation learning + if current_epoch < self.beta_start_epoch: + beta_val = 0.0 # No KL penalty during pathway learning + else: + # Linear ramp-up from beta start to end of training + total_ramp_epochs = max_epochs - self.beta_start_epoch + current_ramp_epoch = current_epoch - self.beta_start_epoch + beta_val = (self.beta_max / total_ramp_epochs) * current_ramp_epoch + + # Update training plan with new beta value + pl_module.beta = beta_val + + # Log scheduled values for monitoring + pl_module.log_dict( + {"alpha_scheduled": alpha_val, "beta_scheduled": beta_val}, on_epoch=True + ) + + +class TemperatureScheduler(Callback): + """ + Temperature annealing scheduler for intervention effect sharpness. + + This scheduler implements temperature annealing for the softmax layers in SENA's + intervention networks, controlling the sharpness of pathway targeting over training. + Temperature scheduling is crucial for learning focused, interpretable intervention + effects while maintaining training stability. 
+
+    Biological Motivation:
+    - Early training: Broad pathway targeting (low temperature) for exploration
+    - Late training: Sharp pathway targeting (high temperature) for precision
+    - Gradual transition prevents training instability and local minima
+    - Final high temperature enforces biological specificity
+
+    Temperature Effects:
+    - Low temp (≈1): Soft, distributed pathway targeting
+    - High temp (≫1): Sharp, focused pathway targeting
+    - temp→∞: One-hot pathway selection (maximum specificity)
+
+    Mathematical Impact:
+    softmax(x * T) where T is temperature (the intervention encoder multiplies logits by temp)
+    - T=1: Standard softmax
+    - T→0: Uniform distribution (maximum entropy)
+    - T→∞: One-hot distribution (minimum entropy)
+
+    Parameters
+    ----------
+    temp_max : float
+        Maximum temperature value (typically 10-100)
+        Higher values create sharper pathway targeting
+    temp_start_epoch : int, default=5
+        Epoch to begin temperature annealing
+        Early epochs use temp=1 for stable exploration
+    """
+
+    def __init__(self, temp_max: float, temp_start_epoch: int = 5):
+        super().__init__()
+        self.temp_max = temp_max
+        self.temp_start_epoch = temp_start_epoch
+
+    def on_train_start(self, trainer: Trainer, pl_module: TrainingPlan):
+        """
+        Initialize temperature parameter at the start of training.
+
+        Sets up the generative_kwargs dictionary in the training plan to store
+        temperature values that will be passed to the generative method during
+        forward passes.
+        """
+        # Initialize temperature storage in training plan
+        if not hasattr(pl_module, "generative_kwargs"):
+            pl_module.generative_kwargs = {}
+        pl_module.generative_kwargs["temp"] = 1.0  # Start with standard softmax
+
+    def on_train_epoch_start(self, trainer: Trainer, pl_module: TrainingPlan):
+        """
+        Update temperature at the beginning of each training epoch.
+
+        Implements linear temperature annealing from 1.0 to temp_max, allowing
+        the model to gradually transition from exploratory (broad pathway targeting)
+        to precise (focused pathway targeting) intervention effects.
+
+        Parameters
+        ----------
+        trainer : Trainer
+            PyTorch Lightning trainer containing epoch information
+        pl_module : TrainingPlan
+            SENA training plan to update with new temperature
+        """
+        current_epoch = trainer.current_epoch
+        max_epochs = trainer.max_epochs
+
+        if current_epoch < self.temp_start_epoch:
+            temp_val = 1.0  # Standard softmax during early training
+        else:
+            # Linear annealing from 1.0 to temp_max
+            total_ramp_epochs = max_epochs - self.temp_start_epoch
+            current_ramp_epoch = current_epoch - self.temp_start_epoch
+            temp_range = self.temp_max - 1.0
+            temp_val = 1.0 + (temp_range / total_ramp_epochs) * current_ramp_epoch
+
+        # Update temperature in generative method kwargs
+        if not hasattr(pl_module, "generative_kwargs"):
+            pl_module.generative_kwargs = {}
+        pl_module.generative_kwargs["temp"] = temp_val
+        pl_module.log("temperature_scheduled", temp_val, on_epoch=True)
+
+
+class SENATrainingPlan(TrainingPlan):
+    """
+    SENA training plan for causal perturbation analysis.
+
+    Orchestrates inference/generative passes and a multi-component loss with
+    scheduling and progress tracking for single-cell perturbation experiments.
+
+    Parameters
+    ----------
+    module : SENAModule
+        SENA neural architecture instance.
+    lr : float, default=1e-3
+        Learning rate for the optimizer (typically 1e-4 to 1e-2).
+    alpha : float, default=1.0
+        Initial weight for the intervention loss; updated by LossWeightScheduler.
+    beta : float, default=1.0
+        Initial weight for the KL divergence; updated by LossWeightScheduler.
+    lmbda : float, default=0.1
+        Weight for the L1 regularization on the causal graph matrix.
+    mmd_sigma : float, default=200.0
+        Bandwidth parameter for Gaussian kernels used by MMD.
+    kernel_num : int, default=10
+        Number of kernel scales for MMD computation.
+    matched_io : bool, default=False
+        Intervention loss selection: False → MMD (distributional); True → MSE (paired).
+    **kwargs
+        Additional arguments forwarded to the parent TrainingPlan.
+
+    Attributes
+    ----------
+    alpha : float
+        Current intervention loss weight (scheduled).
+    beta : float
+        Current KL divergence weight (scheduled).
+    lmbda : float
+        L1 regularization weight on the causal graph.
+    matched_io : bool
+        Whether intervention loss uses MSE instead of MMD.
+    mse_loss : nn.MSELoss
+        Reconstruction (and optionally intervention) loss function.
+    mmd_loss : MMD_loss or None
+        Distributional matching loss used when ``matched_io`` is False; None otherwise.
+    loss_kwargs : dict
+        Optional keyword arguments forwarded to :meth:`loss` (currently empty/unused).
+
+    """
+
+    def __init__(
+        self,
+        module,
+        *,
+        lr: float = 1e-3,
+        alpha: float = 1.0,
+        beta: float = 1.0,
+        lmbda: float = 0.1,
+        mmd_sigma: float = 200.0,
+        kernel_num: int = 10,
+        matched_io: bool = False,
+        **kwargs,
+    ):
+        super().__init__(module, lr=lr, **kwargs)
+
+        logger.info(
+            "Initializing SENA Training Plan with custom loss and optimizer configuration."
+        )
+
+        # Loss component weights (dynamically updated by scheduler callbacks)
+        self.alpha = alpha  # Intervention loss weight
+        self.beta = beta  # KL divergence weight
+        self.lmbda = lmbda  # L1 regularization weight
+        self.matched_io = matched_io  # Loss function selection flag
+
+        # Loss function implementations
+        self.mse_loss = nn.MSELoss()  # For reconstruction and matched intervention prediction
+
+        # MMD loss for distributional intervention matching (when matched_io=False)
+        if not self.matched_io:
+            self.mmd_loss = MMD_loss(fix_sigma=mmd_sigma, kernel_num=kernel_num)
+        else:
+            self.mmd_loss = None  # Paired (matched_io) training uses MSE instead
+
+        # Storage for additional loss computation parameters
+        self.loss_kwargs = {}
+
+    def loss(
+        self,
+        tensors: dict[str, torch.Tensor],
+        inference_output: dict[str, torch.Tensor],
+        generative_output: dict[str, torch.Tensor],
+        kl_weight: float = 1.0,
+        **kwargs,
+    ) -> dict[str, torch.Tensor]:
+        """
+        Compute SENA's multi-component loss function.
+
+        Computes a weighted sum of intervention, reconstruction, KL, and L1 terms to
+        train the model for perturbation prediction and latent regularization.
+
+        Parameters
+        ----------
+        tensors : dict[str, torch.Tensor]
+            Input data containing control and perturbed expressions.
+        inference_output : dict[str, torch.Tensor]
+            Encoder outputs.
+        generative_output : dict[str, torch.Tensor]
+            Decoder outputs, including predictions and causal matrix.
+        kl_weight : float, default=1.0
+            Warmup weight for the KL divergence.
+        **kwargs
+            Additional options forwarded from ``loss_kwargs`` (currently unused).
+
+        Returns
+        -------
+        dict[str, torch.Tensor]
+            Dictionary with total loss and individual components.
+
+        Notes
+        -----
+        Formula (plain text):
+            L_total = alpha * L_intervention + L_reconstruction
+                      + beta * L_KL + lambda * L_L1
+
+        Components:
+        - L_intervention: MMD between predicted and observed perturbation effects
+          (or MSE if matched_io=True).
+        - L_reconstruction: MSE on control expression reconstruction.
+        - L_KL: KL divergence of the latent distribution.
+        - L_L1: L1 penalty on the causal DAG for sparsity.
+
+        Interpretation:
+        - High reconstruction loss: poor baseline/pathway inference.
+        - High intervention loss: inaccurate perturbation effect modeling.
+        - High KL loss: poorly regularized latent space.
+        - High L1 loss: overly complex pathway interactions.
+        """
+        # Extract input data tensors
+        x = tensors[REGISTRY_KEYS.X_KEY]  # Control expression (baseline)
+        y = tensors[REGISTRY_KEYS.LABELS_KEY]  # Perturbed expression (target)
+
+        # Extract model predictions and intermediate representations
+        x_recon = generative_output["x_recon"]  # Reconstructed control expression
+        y_hat = generative_output["y_hat"]  # Predicted perturbed expression
+        G = generative_output["G"]  # Learned causal DAG matrix
+        mu = inference_output["z_mu"]  # Latent pathway activity means
+        var = inference_output["z_var"]  # Latent pathway activity variances
+
+        # Intervention Loss: Measure perturbation prediction accuracy
+        # Choice between MMD (distributional) and MSE (pointwise) matching
+        if y_hat is None:
+            loss_interv = torch.tensor(0.0, device=x.device)
+        elif self.matched_io:
+            # MSE: Requires exact cell-to-cell matching (paired experimental design)
+            loss_interv = self.mse_loss(y_hat, y)
+        else:
+            # MMD: Robust distributional matching (heterogeneous cellular responses)
+            loss_interv = self.mmd_loss(y_hat, y)
+
+        # Reconstruction Loss: Ensure accurate control cell modeling
+        # Critical for learning baseline pathway activities
+        loss_recon = self.mse_loss(x_recon, x)
+
+        # KL Divergence: VAE regularization term
+        # Encourages latent pathway activities to follow standard normal distribution
+        logvar = torch.log(var)
+        KLD_element = mu.pow(2).add_(logvar.exp()).mul_(-1).add_(1).add_(logvar)
+        loss_kld = torch.mean(KLD_element).mul_(-0.5) / x.shape[0]
+
+        # L1 Regularization: Promote sparse causal pathway interactions
+        # Biological motivation: Most pathway pairs should not interact directly
+        if G is not None:
+            # Apply L1 penalty only to upper triangular part (causal directions)
+            l1_numerator = torch.norm(torch.triu(G, diagonal=1), p=1)
+            l1_denominator = torch.sum(torch.triu(torch.ones_like(G), diagonal=1))
+            loss_l1 = (
+                l1_numerator / l1_denominator
+                if l1_denominator > 0
+                else torch.tensor(0.0, device=G.device)
+            )
+        else:
+            loss_l1 = torch.tensor(0.0, device=x.device)
+
+        # Compute weighted total loss with dynamic scheduling
+        # α, β weights are updated by scheduler callbacks during training
+        total_loss = (
+            self.alpha * loss_interv  # Weighted intervention prediction accuracy
+            + loss_recon  # Unweighted reconstruction (always important)
+            + self.beta * kl_weight * loss_kld  # Scheduled KL regularization
+            + self.lmbda * loss_l1  # L1 sparsity on causal structure
+        )
+
+        return {
+            "loss": total_loss,
+            "reconstruction_loss": loss_recon,
+            "kl_local": loss_kld,
+            "intervention_loss": loss_interv,
+            "l1_reg_loss": loss_l1,
+        }
+
+    def training_step(self, batch, batch_idx):
+        """
+        Execute a single training step.
+
+        Runs the inference and generative passes, computes the multi-component SENA loss,
+        and logs training metrics for monitoring progress.
+
+        Parameters
+        ----------
+        batch : dict[str, torch.Tensor]
+            Training batch containing registry keys:
+            - REGISTRY_KEYS.X_KEY: Control cell expression.
+            - REGISTRY_KEYS.LABELS_KEY: Perturbed cell expression.
+            - REGISTRY_KEYS.CAT_COVS_KEY: Intervention matrix.
+        batch_idx : int
+            Index of the current batch within the epoch.
+ + Returns + ------- + torch.Tensor + Total loss for backpropagation and optimization. + + Notes + ----- + Pipeline (plain text): + 1. Inference pass: encode control expression to pathway activities (z, mu, var). + 2. Generative input: process intervention matrix (c1, c2, num_interv). + 3. Generative pass: apply interventions and causal propagation to get predictions. + 4. Loss computation: multi-component SENA loss with current alpha, beta weights. + 5. Metric logging: batch- and epoch-level tracking. + """ + # Forward pass: encode control cells to pathway activities + inference_outputs = self.module.inference(batch[REGISTRY_KEYS.X_KEY]) + + # Process intervention matrix and prepare generative inputs + # Auto-detects single vs double perturbations from intervention matrix + generative_inputs = self.module._get_generative_input( + batch, inference_outputs, **getattr(self, "generative_kwargs", {}) + ) + + # Generative pass: apply perturbations and causal propagation + # Produces predicted perturbed expression and control reconstruction + generative_outputs = self.module.generative( + z=generative_inputs["z"], + c1=generative_inputs["c1"], + c2=generative_inputs["c2"], + num_interv=generative_inputs["num_interv"], # Auto-detected + **getattr(self, "generative_kwargs", {}), + ) + + # Compute comprehensive SENA loss with current weight schedule + loss_output = self.loss( + batch, inference_outputs, generative_outputs, **getattr(self, "loss_kwargs", {}) + ) + + # Log primary loss for real-time progress monitoring (batch-level) + self.log( + "loss_train", + loss_output["loss"], + on_step=True, + on_epoch=False, + prog_bar=True, + logger=True, + ) + + # Log epoch-level summary without cluttering progress display + self.log( + "loss_train_epoch", + loss_output["loss"], + on_step=False, + on_epoch=True, + prog_bar=False, + logger=True, + ) + + # Store batch-level metrics for progress bar (not saved to history) + if not hasattr(self, "_current_batch_metrics"): + self._current_batch_metrics = {} + + self._current_batch_metrics.update( + { + "reconstruction_loss": loss_output["reconstruction_loss"].item(), + "intervention_loss": loss_output["intervention_loss"].item(), + "kl_local": loss_output["kl_local"].item(), + "l1_reg_loss": loss_output["l1_reg_loss"].item(), + } + ) + + # Log detailed loss components for history (epoch-level only) + component_metrics = { + f"{key}_train": val for key, val in loss_output.items() if key != "loss" + } + self.log_dict(component_metrics, on_step=False, on_epoch=True, prog_bar=False) + + # Also log epoch-level versions for easy access in history + component_metrics_epoch = { + f"{key}_train_epoch": val for key, val in loss_output.items() if key != "loss" + } + self.log_dict(component_metrics_epoch, on_step=False, on_epoch=True, prog_bar=False) + + return loss_output["loss"] + + def validation_step(self, batch, batch_idx): + """ + Execute a validation step. + + Performs the same forward passes as training (without gradients) to compute + the multi-component SENA loss on held-out perturbation data. + + Parameters + ---------- + batch : dict[str, torch.Tensor] + Validation batch with the same structure as the training batch. + batch_idx : int + Index of the current validation batch. + + Returns + ------- + torch.Tensor + Validation loss for monitoring generalization. + + Notes + ----- + Methodology (plain text): + - Forward pass mirrors training; gradients are disabled by the framework. + - Uses the same loss components for direct comparison with training. 
+ - Metrics are logged for monitoring overfitting and stability. + """ + # Forward pass identical to training (but in eval mode automatically) + inference_outputs = self.module.inference(batch[REGISTRY_KEYS.X_KEY]) + + # Process validation intervention matrix + generative_inputs = self.module._get_generative_input( + batch, inference_outputs, **getattr(self, "generative_kwargs", {}) + ) + + # Generative pass with current model state + generative_outputs = self.module.generative( + z=generative_inputs["z"], + c1=generative_inputs["c1"], + c2=generative_inputs["c2"], + num_interv=generative_inputs["num_interv"], + **getattr(self, "generative_kwargs", {}), + ) + + # Compute validation loss using identical loss function as training + loss_output = self.loss( + batch, inference_outputs, generative_outputs, **getattr(self, "loss_kwargs", {}) + ) + + # Log total validation loss for ModelCheckpoint monitoring + self.log( + "validation_loss", + loss_output["loss"], + on_step=False, + on_epoch=True, + prog_bar=False, + logger=True, + ) + + # Log detailed validation loss components for diagnostic analysis + validation_metrics = { + f"{key}_val": val for key, val in loss_output.items() if key != "loss" + } + self.log_dict(validation_metrics, on_step=False, on_epoch=True, prog_bar=False) + + # Also log epoch-level versions for easy access in history + validation_metrics_epoch = { + f"{key}_val_epoch": val for key, val in loss_output.items() if key != "loss" + } + self.log_dict(validation_metrics_epoch, on_step=False, on_epoch=True, prog_bar=False) + + return loss_output["loss"] + + +class SENABatchProgressBar(Callback): + """ + Specialized progress tracking callback for real-time monitoring of SENA training dynamics. + + Standard epoch-level progress reporting is insufficient for complex single-cell perturbation + modeling workflows where researchers need immediate feedback on multi-component loss + convergence. This callback provides batch-level progress tracking essential for monitoring + causal graph learning, VAE stability, and biological constraint satisfaction during training. + + Scientific Motivation: + - Large-scale perturbation screens require fine-grained convergence monitoring + - Multi-component loss functions need real-time component-wise tracking + - Causal graph learning benefits from immediate feedback on structure discovery + - Early detection of training instabilities in VAE latent space + - Interactive training sessions for hyperparameter optimization + + Progress Metrics Displayed: + - Batch completion within current epoch + - Real-time primary loss value (reconstruction + intervention + regularization) + - Training throughput (batches per second) + - Current epoch context within total training schedule + + Technical Implementation: + - Uses tqdm for high-performance progress display + - Integrates with PyTorch Lightning logging system + - Minimal computational overhead (<0.1% training time) + - Compatible with distributed and multi-GPU training setups + + Usage Context: + Essential for interactive perturbation modeling where bioinformaticians need immediate + feedback on model convergence, particularly when training on large-scale CRISPR screens + or optimizing complex biological pathway constraint parameters. + """ + + def __init__(self): + """ + Initialize batch-level progress tracking for SENA training workflows. 
+
+        Sets up progress bar infrastructure for real-time monitoring of single-cell
+        perturbation modeling training, optimized for bioinformatics research workflows
+        requiring immediate feedback on complex multi-component loss convergence.
+        """
+        super().__init__()
+        self.batch_progress_bar = None
+
+    def on_train_epoch_start(self, trainer, pl_module):
+        """
+        Initialize epoch-specific batch progress tracking.
+
+        Creates a new progress bar for each training epoch, providing context
+        about current position in overall training schedule and preparing
+        real-time batch-level monitoring of loss convergence.
+
+        Parameters
+        ----------
+        trainer : pytorch_lightning.Trainer
+            PyTorch Lightning trainer managing the training process
+        pl_module : SENATrainingPlan
+            SENA training plan being trained
+        """
+        if tqdm is not None:
+            epoch = trainer.current_epoch + 1
+            total_batches = trainer.num_training_batches
+
+            # Create batch-level progress bar with epoch context
+            self.batch_progress_bar = tqdm(
+                total=total_batches,
+                desc=f"Epoch {epoch}/{trainer.max_epochs}",
+                position=1,
+                leave=False,
+            )
+
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+        """
+        Update progress bar with current batch training metrics.
+
+        Captures real-time loss values from SENA's multi-component loss function
+        and updates progress display, providing immediate feedback on reconstruction
+        quality, perturbation prediction accuracy, and biological constraint satisfaction.
+
+        Parameters
+        ----------
+        trainer : pytorch_lightning.Trainer
+            Current trainer instance with logged metrics
+        pl_module : SENATrainingPlan
+            SENA training plan with current training state
+        outputs : torch.Tensor
+            Training step output (the loss returned by ``training_step``)
+        batch : dict
+            Current training batch data
+        batch_idx : int
+            Index of current batch within epoch
+        """
+        if self.batch_progress_bar is not None:
+            # Extract current loss from trainer's logged metrics and batch metrics
+            postfix = {}
+
+            # Get primary loss from logged metrics (this will still be logged)
+            if hasattr(pl_module, "trainer") and hasattr(pl_module.trainer, "logged_metrics"):
+                logged_metrics = pl_module.trainer.logged_metrics
+                if "loss_train" in logged_metrics:
+                    postfix["total_loss"] = f"{logged_metrics['loss_train']:.4f}"
+
+            # Get detailed metrics from batch storage (not saved to history)
+            if hasattr(pl_module, "_current_batch_metrics"):
+                metrics = pl_module._current_batch_metrics
+                postfix["recon"] = f"{metrics.get('reconstruction_loss', 0):.4f}"
+                postfix["interv"] = f"{metrics.get('intervention_loss', 0):.4f}"
+                postfix["kl"] = f"{metrics.get('kl_local', 0):.4f}"
+                postfix["l1"] = f"{metrics.get('l1_reg_loss', 0):.4f}"
+
+            # Update progress display with current metrics
+            self.batch_progress_bar.set_postfix(postfix)
+            self.batch_progress_bar.update(1)
+
+    def on_train_epoch_end(self, trainer, pl_module):
+        """
+        Clean up batch progress bar at epoch completion.
+
+        Closes current epoch's batch progress bar and prepares for next epoch,
+        maintaining clean progress display throughout multi-epoch training sessions.
+
+        Parameters
+        ----------
+        trainer : pytorch_lightning.Trainer
+            Trainer completing current epoch
+        pl_module : SENATrainingPlan
+            Training plan completing current epoch
+        """
+        if self.batch_progress_bar is not None:
+            self.batch_progress_bar.close()
+            self.batch_progress_bar = None
+
+    def on_train_end(self, trainer, pl_module):
+        """
+        Clean up all progress tracking at training completion.
+
+        Ensures proper cleanup of progress bar resources when training completes,
+        either by reaching maximum epochs or early stopping criteria.
+
+        Parameters
+        ----------
+        trainer : pytorch_lightning.Trainer
+            Trainer completing training
+        pl_module : SENATrainingPlan
+            Training plan completing training
+        """
+        if self.batch_progress_bar is not None:
+            self.batch_progress_bar.close()
diff --git a/src/scvi/external/__init__.py b/src/scvi/external/__init__.py
index 201f7a5ae6..4f76b2e089 100644
--- a/src/scvi/external/__init__.py
+++ b/src/scvi/external/__init__.py
@@ -16,6 +16,7 @@
 from .scar import SCAR
 from .scbasset import SCBASSET
 from .scviva import SCVIVA
+from .SENADVAE import SENADVAE
 from .solo import SOLO
 from .stereoscope import RNAStereoscope, SpatialStereoscope
 from .sysvi import SysVI
@@ -43,6 +44,7 @@
     "RESOLVI",
     "SCVIVA",
     "CYTOVI",
+    "SENADVAE",
 ]
diff --git a/tests/external/sena/test_sena.py b/tests/external/sena/test_sena.py
new file mode 100644
index 0000000000..c34d7a737f
--- /dev/null
+++ b/tests/external/sena/test_sena.py
@@ -0,0 +1,174 @@
+import numpy as np
+import pytest
+
+from scvi.data import synthetic_iid
+from scvi.external import SENADVAE
+
+
+@pytest.fixture
+def mock_sena_adata():
+    """Create mock AnnData with perturbation annotations for SENADVAE testing."""
+    adata = synthetic_iid(
+        n_genes=100,
+    )
+
+    # Add perturbation annotations: a mix of control and perturbed cells
+    n_cells = adata.n_obs
+    perturbations = [""] * (n_cells // 2)  # Half control cells
+
+    # Add some single perturbations
+    gene_names = ["GENE1", "GENE2", "GENE3"]
+    for i in range(n_cells // 2, n_cells):
+        perturbations.append(gene_names[i % len(gene_names)])
+
+    adata.obs["perturbation"] = perturbations
+
+    return adata
+
+
+@pytest.fixture
+def mock_go_files(tmp_path):
+    """Create mock GO annotation files for SENADVAE testing."""
+    # Create mock GO pathway file
+    go_file_path = tmp_path / "go_pathways.tsv"
+    go_pathways = ["GO:0001", "GO:0002", "GO:0003"]
+    with open(go_file_path, "w") as f:
+        f.write("PathwayID\n")
+        for go in go_pathways:
+            f.write(f"{go}\n")
+
+    # Create mock gene-to-GO mapping file
+    go_gene_map_path = tmp_path / "gene_go_map.tsv"
+    with open(go_gene_map_path, "w") as f:
+        f.write("GO_id\tensembl_id\n")
+        # Map some genes to GO terms (using var_names from synthetic data)
+        for i, go in enumerate(go_pathways):
+            for j in range(10):
+                gene_idx = (i * 10 + j) % 100
+                f.write(f"{go}\tgene_{gene_idx}\n")
+
+    # Create mock gene symbol to Ensembl ID mapping file
+    gene_symb_ensemble_path = tmp_path / "gene_symb_ensemble.tsv"
+    with open(gene_symb_ensemble_path, "w") as f:
+        f.write("external_gene_name\tensembl_gene_id\n")
+        f.write("GENE1\tgene_0\n")
+        f.write("GENE2\tgene_10\n")
+        f.write("GENE3\tgene_20\n")
+
+    return {
+        "go_file_path": str(go_file_path),
+        "go_gene_map_path": str(go_gene_map_path),
+        "gene_symb_ensemble_path": str(gene_symb_ensemble_path),
+    }
+
+
+def test_senadvae_setup_anndata(mock_sena_adata):
+    """Test SENADVAE.setup_anndata registers data correctly."""
+    SENADVAE.setup_anndata(mock_sena_adata, perturbation_key="perturbation")
+
+    assert "_sena_perturbation_key" in mock_sena_adata.uns
+    assert mock_sena_adata.uns["_sena_perturbation_key"] == "perturbation"
+    assert "intervention_genes" in mock_sena_adata.uns
+    assert "n_intervention_genes" in mock_sena_adata.uns
+
+
+def test_senadvae_analyze_perturbations(mock_sena_adata):
+    """Test SENADVAE.analyze_perturbations provides correct statistics."""
+    stats = SENADVAE.analyze_perturbations(mock_sena_adata, 
"perturbation") + + assert "n_total_cells" in stats + assert "n_controls" in stats + assert "n_unique_genes" in stats + assert stats["n_total_cells"] == mock_sena_adata.n_obs + assert stats["n_controls"] > 0 + + +def test_senadvae_init(mock_sena_adata, mock_go_files): + """Test SENADVAE model initialization.""" + SENADVAE.setup_anndata(mock_sena_adata, perturbation_key="perturbation") + + model = SENADVAE( + mock_sena_adata, + go_file_path=mock_go_files["go_file_path"], + go_gene_map_path=mock_go_files["go_gene_map_path"], + gene_symb_ensemble_path=mock_go_files["gene_symb_ensemble_path"], + n_hidden_encoder=32, + n_hidden_decoder=32, + n_hidden_interv=32, + n_go_thresh=2, + seed=42, + ) + + assert model is not None + assert model.module is not None + + +def test_senadvae_train(mock_sena_adata, mock_go_files): + """Test SENADVAE model training.""" + SENADVAE.setup_anndata(mock_sena_adata, perturbation_key="perturbation") + + model = SENADVAE( + mock_sena_adata, + go_file_path=mock_go_files["go_file_path"], + go_gene_map_path=mock_go_files["go_gene_map_path"], + gene_symb_ensemble_path=mock_go_files["gene_symb_ensemble_path"], + n_hidden_encoder=32, + n_hidden_decoder=32, + n_hidden_interv=32, + n_go_thresh=2, + seed=42, + ) + + model.train(max_epochs=2, batch_size=32, check_val_every_n_epoch=1) + + assert model.is_trained_ + + +def test_senadvae_get_latent_representation(mock_sena_adata, mock_go_files): + """Test SENADVAE latent representation extraction.""" + SENADVAE.setup_anndata(mock_sena_adata, perturbation_key="perturbation") + + model = SENADVAE( + mock_sena_adata, + go_file_path=mock_go_files["go_file_path"], + go_gene_map_path=mock_go_files["go_gene_map_path"], + gene_symb_ensemble_path=mock_go_files["gene_symb_ensemble_path"], + n_hidden_encoder=32, + n_hidden_decoder=32, + n_hidden_interv=32, + n_go_thresh=2, + seed=42, + ) + + model.train(max_epochs=1, batch_size=32) + + latent = model.get_latent_representation() + + assert latent is not None + assert isinstance(latent, np.ndarray) + assert latent.shape[0] == mock_sena_adata.n_obs // 2 + + +def test_senadvae_get_causal_graph(mock_sena_adata, mock_go_files): + """Test SENADVAE causal graph extraction.""" + SENADVAE.setup_anndata(mock_sena_adata, perturbation_key="perturbation") + + model = SENADVAE( + mock_sena_adata, + go_file_path=mock_go_files["go_file_path"], + go_gene_map_path=mock_go_files["go_gene_map_path"], + gene_symb_ensemble_path=mock_go_files["gene_symb_ensemble_path"], + n_hidden_encoder=32, + n_hidden_decoder=32, + n_hidden_interv=32, + n_go_thresh=2, + seed=42, + ) + + model.train(max_epochs=1, batch_size=32) + + causal_graph = model.get_causal_graph() + + assert causal_graph is not None + assert isinstance(causal_graph, np.ndarray)