aqlaboratory · ljarosch · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026
diff --git a/docs/source/configuration_reference.md b/docs/source/configuration_reference.md
@@ -133,16 +133,36 @@ data_module_args:
 
 ### 3.6. Dataset Config Kwargs (`dataset_config_kwargs`)
 
-Configures MSA and template feature generation.
+Configures MSA/template feature generation and optional custom CCD input for
+inference.
 
 **Pydantic Model**: [`InferenceDatasetConfigKwargs`](https://github.com/aqlaboratory/openfold-3/blob/main/openfold3/projects/of3_all_atom/config/dataset_configs.py#L270)
 
 **All Options**:
-- `ccd_file_path` *(FilePath | None)*: Path to Chemical Component Dictionary file, uses CCD from Biotite if null (default: `null`)
+- `ccd_file_path` *(FilePath | None)*: Path to custom Chemical Component
+  Dictionary file for inference (`.cif` or `.bcif`). If `null`, uses Biotite's
+  bundled CCD (default: `null`)
 - `msa` *(MSASettings)*: MSA processing settings (see below)
 - `template` *(TemplateSettings)*: Template processing settings (see below)
 
-#### 3.6.1. MSA Settings (`msa`)
+#### 3.6.1. User-provided Chemical Component Dictionary
+
+For inference, set `dataset_config_kwargs.ccd_file_path` to provide a custom CCD file.
+This is useful for finer control over the atom names of custom ligands, and it
+allows more readable query JSONs that use user-defined ligand keys.
+
+- Supported formats: `.bcif` and `.cif`.
+- `.cif` input is converted to a temporary `BinaryCIF` file before being passed to
+  Biotite. Note that this on-the-fly conversion may add over a minute of startup time.
+- A `.bcif` file can also be generated from a `.cif` file beforehand using [preprocess_ccd_biotite.py](https://github.com/aqlaboratory/openfold-3/blob/main/scripts/data_preprocessing/preprocess_ccd_biotite.py).
+
+**Example**:
+```yaml
+dataset_config_kwargs:
+  ccd_file_path: /path/to/custom/components.cif
+```
+
+#### 3.6.2. MSA Settings (`msa`)
 
 Controls how MSAs are parsed and processed into features.
 
@@ -169,7 +189,7 @@ dataset_config_kwargs:
     moltypes: [0, 1]  # protein and RNA
 ```
 
-#### 3.6.2. Template Settings (`template`)
+#### 3.6.3. Template Settings (`template`)
 
 Controls template structure processing.
 
@@ -267,7 +287,9 @@ Configures template structure preprocessing and filtering.
 - `structure_array_directory` *(Path | None)*: Directory for preparsed structures (default: `null`)
 - `cache_directory` *(Path | None)*: Directory for template cache (default: `null`)
 - `log_directory` *(Path | None)*: Directory for logs (default: `null`)
-- `ccd_file_path` *(Path | None)*: Path to Chemical Component Dictionary file (default: `null`)
+- `ccd_file_path` *(Path | None)*: Path to Chemical Component Dictionary file.
+  Primarily useful for standalone template preprocessing workflows; for
+  inference, prefer `dataset_config_kwargs.ccd_file_path` (default: `null`)
 
 **Example**:
 ```yaml
@@ -286,4 +308,3 @@ For the complete list of default values, see the Pydantic model classes in:
 - [`openfold3/projects/of3_all_atom/config/dataset_config_components.py`](https://github.com/aqlaboratory/openfold-3/blob/main/openfold3/projects/of3_all_atom/config/dataset_config_components.py) - MSA and template settings
 - [`openfold3/core/data/tools/colabfold_msa_server.py`](https://github.com/aqlaboratory/openfold-3/blob/main/openfold3/core/data/tools/colabfold_msa_server.py) - MSA server settings
 - [`openfold3/core/data/pipelines/preprocessing/template.py`](http://github.com/aqlaboratory/openfold-3/blob/main/openfold3/core/data/pipelines/preprocessing/template.py) - Template preprocessing settings
-
diff --git a/docs/source/input_format.md b/docs/source/input_format.md
@@ -266,8 +266,7 @@ Below is a complete example of an input JSON file specifying a single bioassembl
                 }
             ],
         }
-    },
-    "ccd_file_path": "/path/to/CCD/file.cif"
+    }
 }
 ```
 
@@ -277,4 +276,4 @@ Additional example input JSON files can be found here:
 - [Multi-chain protein with different chains (multimer)](../../examples/example_inference_inputs/query_multimer.json): Deoxy human hemoglobin (PDB: 1A3N)
 - [Protein-ligand complex](../../examples/example_inference_inputs/query_protein_ligand.json): Mcl-1 with small molecule inhibitor (PDB: 5FDR)
 - [Sigle protein-single ligand complex](../../examples/example_inference_inputs/query_single_protein_single_ligand.json): T4 Lysozyme (L99A mutant) with toluene (PDB: 7L39)
-- [Multiple Protein-ligand complexes](../../examples/example_inference_inputs/query_protein_ligand_multiple.json): Two queries with Mcl-1 and different small molecule inhibitors (PDB: 5FDR)
+- [Multiple Protein-ligand complexes](../../examples/example_inference_inputs/query_protein_ligand_multiple.json): Two queries with Mcl-1 and different small molecule inhibitors (PDB: 5FDR)
diff --git a/docs/source/template_how_to.md b/docs/source/template_how_to.md
@@ -280,4 +280,9 @@ template_preprocessor_settings:
   ccd_file_path: <optional/path/to/ccd/file>
 ```
 
-where a CCD file can be optionally provided if the template structures contain custom ligands or other chemical components.
+where a CCD file can be optionally provided if the template structures contain
+custom ligands or other chemical components.
+
+For regular inference runs, prefer setting a custom CCD via
+`dataset_config_kwargs.ccd_file_path` in `runner.yml`; this value is copied to
+`template_preprocessor_settings.ccd_file_path` automatically.
diff --git a/examples/reference_full_config/full_config.yml b/examples/reference_full_config/full_config.yml
@@ -63,7 +63,7 @@ data_module_args:
 # DatasetConfigKwargs: https://github.com/aqlaboratory/openfold-3/blob/main/openfold3/projects/of3_all_atom/config/dataset_configs.py#L270
 # Arguments for creating template and MSA features
 dataset_config_kwargs: 
-  ccd_file_path: null  # if null, uses CCD from Biotite
+  ccd_file_path: null  # optional custom CCD (.cif or .bcif); if null, uses Biotite CCD
   # MSA Settings: https://github.com/aqlaboratory/openfold-3/blob/main/openfold3/projects/of3_all_atom/config/dataset_config_components.py#L32
   # Use this section to customize parsing of MSAs into features, more information in docs/source/precomputed_msa_how_to.md
   msa: 
@@ -158,4 +158,4 @@ template_preprocessor_settings:
   structure_array_directory: null
   cache_directory: <tmp-dir>/of3_template_data/template_cache
   log_directory: null
-  ccd_file_path: null
+  ccd_file_path: null
diff --git a/openfold3/core/data/framework/data_module.py b/openfold3/core/data/framework/data_module.py
@@ -66,6 +66,9 @@
     SamplerDataset,
 )
 from openfold3.core.data.pipelines.preprocessing.template import TemplatePreprocessor
+from openfold3.core.data.primitives.structure.biotite_ccd import (
+    update_biotite_ccd,
+)
 from openfold3.core.data.tools.colabfold_msa_server import (
     MsaComputationSettings,
     augment_main_msa_with_query_sequence,
@@ -545,6 +548,20 @@ def setup(self, stage=None):
             dist.broadcast_object_list(placeholder, src=0)
             self.inference_config.query_set = placeholder[0]
         super().setup()
+        self._base_worker_init = self.worker_init_function_with_data_seed
+        self.worker_init_function_with_data_seed = self._worker_init_with_ccd
+
+    def _worker_init_with_ccd(self, worker_id, rank=None):
+        """Wrap the base worker init to re-apply the custom Biotite CCD path.
+
+        The custom CCD setting in Biotite is process-local global state, so we reapply
+        it in each worker for future-proofing with spawn/forkserver.
+        """
+        self._base_worker_init(worker_id, rank)  # wrapped init runs first
+        dataset = torch.utils.data.get_worker_info().dataset  # this worker's dataset
+        ccd_path = getattr(dataset, "_biotite_ccd_path", None)  # None: no custom CCD
+        if ccd_path is not None:
+            update_biotite_ccd(ccd_path)
 
 
 # TODO: Remove debug logic and improve handlingi of training only features

diff --git a/openfold3/core/data/framework/single_datasets/inference.py b/openfold3/core/data/framework/single_datasets/inference.py
@@ -23,7 +23,6 @@
 import pandas as pd
 import torch
 from biotite.structure import AtomArray
-from biotite.structure.io import pdbx
 from torch.utils.data import Dataset
 
 from openfold3.core.config.msa_pipeline_configs import MsaSampleProcessorInputInference
@@ -56,6 +55,9 @@
 from openfold3.core.data.pipelines.sample_processing.template import (
     process_template_structures_of3,
 )
+from openfold3.core.data.primitives.structure.biotite_ccd import (
+    update_biotite_ccd_from_file,
+)
 from openfold3.core.data.primitives.structure.component import BiotiteCCDWrapper
 from openfold3.core.data.primitives.structure.query import (
     StructureWithReferenceMolecules,
@@ -112,12 +114,15 @@ def __init__(
         if self.template_preprocessor_settings.preparse_structures:
             self.template_preprocessor_settings.structure_file_format = "npz"
 
-        # Parse CCD
-        if dataset_config.ccd_file_path is not None:
-            logger.debug("Parsing CCD file.")
-            self.ccd = pdbx.CIFFile.read(dataset_config.ccd_file_path)
-        else:
-            self.ccd = BiotiteCCDWrapper()
+        # If a custom CCD file is provided, overwrite Biotite's global CCD.
+        # The resolved path is stored for re-applying in DataLoader workers
+        # started with spawn/forkserver.
+        self._biotite_ccd_path = update_biotite_ccd_from_file(
+            dataset_config.ccd_file_path
+        )
+
+        # Template code requires "conventional" CIF format
+        self._ccd = BiotiteCCDWrapper()
 
         # Create individual datapoint cache (allows rerunning the same query with
         # different seeds)
@@ -264,7 +269,7 @@ def create_template_features(
             template_structures_directory=self.template_preprocessor_settings.structure_directory,
             template_structure_array_directory=self.template_preprocessor_settings.structure_array_directory,
             template_file_format=self.template_preprocessor_settings.structure_file_format,
-            ccd=self.ccd,
+            ccd=self._ccd,
         )
 
         # Featurization

diff --git a/openfold3/core/data/io/structure/cif.py b/openfold3/core/data/io/structure/cif.py
@@ -268,22 +268,18 @@ def _create_cif_file(
     else:
         raise ValueError("Suffix must be either .cif or .bcif")
 
-    try:
-        # copy entity_id to label_entity_id so biotite uses it for the atom_site table
-        atom_array.set_annotation("label_entity_id", atom_array.entity_id)
-        pdbx.set_structure(
-            cif_file, atom_array, data_block=data_block, include_bonds=include_bonds
-        )
-    # This error sometimes happens in the PDB preprocessing
-    except KeyError:
-        logger.warning(
-            "KeyError while writing structure to CIF file. Retrying with "
-            "intra-residue COORDINATION bonds set to SINGLE."
-        )
+    # copy entity_id to label_entity_id so biotite uses it for the atom_site table
+    atom_array.set_annotation("label_entity_id", atom_array.entity_id)
+
+    # Biotite cannot serialize the COORDINATION bonds introduced by pdbeccdutils
+    # into the _chem_comp_bond table, so convert them to SINGLE proactively to
+    # avoid a KeyError in set_structure.
+    if include_bonds:
         atom_array = convert_intra_residue_dative_to_single(atom_array)
-        pdbx.set_structure(
-            cif_file, atom_array, data_block=data_block, include_bonds=include_bonds
-        )
+
+    pdbx.set_structure(
+        cif_file, atom_array, data_block=data_block, include_bonds=include_bonds
+    )
 
     # Update and add additional metadata tables
     if make_ost_compatible: