♻️ simplify Infos class (#444)

casenave · github-code-quality[bot] · web-flow · commit 8a254db7280d · 2026-06-06T17:05:13.000+02:00
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
diff --git a/docs/source/concepts/infos.md b/docs/source/concepts/infos.md
@@ -21,29 +21,24 @@ In the current API, infos stores:
 ## Basic usage
 
 ```python
-from plaid.infos import DataProduction, Infos
+from plaid import Infos
 
 infos = Infos(
     owner="Safran",
     license="proprietary",
-    data_production=DataProduction(
-        type="simulation",
-        physics="fluid dynamics",
-        simulator="ExampleSolver",
-    ),
+    data_production={
+        "type": "simulation",
+        "physics": "fluid dynamics",
+        "simulator": "ExampleSolver",
+    },
     data_description="ExampleDescription",
 )
 ```
 
-Infos can also be built from a plain mapping, for instance after reading YAML:
+To inspect the public constructor fields accepted by `Infos`, use:
 
 ```python
-infos = Infos.model_validate(
-    {
-        "owner": "Safran",
-        "license": "proprietary",
-    }
-)
+Infos.print_available_fields()
 ```
 
 `num_samples` and `storage_backend` are derived from the chosen storage backend
@@ -60,13 +55,7 @@ infos = Infos.from_path("/path/to/plaid_dataset")
 ```
 
 When a directory is provided, `Infos.from_path(...)` looks for `infos.yaml`
-inside that directory. By default, loading from disk requires the persisted
-storage metadata (`num_samples` and `storage_backend`) to be present. To load a
-draft infos file that has not been produced by `save_to_disk(...)`, use:
-
-```python
-infos = Infos.from_path("/path/to/draft/infos.yaml", require_persisted=False)
-```
+inside that directory.
 
 ## Saving
 
@@ -77,7 +66,10 @@ infos.save_to_file("/path/to/plaid_dataset/infos.yaml")
 ```
 
 If a directory path is provided, the file is saved as `infos.yaml` inside that
-directory.
+directory. Direct YAML writing requires complete persisted metadata: `owner`,
+`license`, `num_samples`, and `storage_backend`. When using
+`save_to_disk(..., infos=...)`, PLAID fills `num_samples` and `storage_backend`
+automatically before writing `infos.yaml`.
 
 ## Typed access and serialization
 
diff --git a/docs/source/tutorials/storage.md b/docs/source/tutorials/storage.md
@@ -96,6 +96,9 @@ infos = Infos(
     owner="NeuralOperator (https://zenodo.org/records/13993629)",
     license="cc-by-4.0",
     data_description="No changes to data content from original dataset",
+    type="simulation",
+    physics="phase-field fracture models for brittle fracture",
+    script="Subset 'res-SENS' of the initial dataset, 1/5th time steps, converted to PLAID format for standardized access; no changes to data content."
 )
 
 
diff --git a/examples/infos_example.py b/examples/infos_example.py
@@ -36,7 +36,7 @@
 
 # %%
 # Import necessary libraries and classes
-from plaid.infos import DataProduction, Infos
+from plaid.infos import Infos
 
 # %% [markdown]
 # ## Section 1: Initializing Infos
@@ -51,51 +51,25 @@
 infos = Infos(
     owner="PLAID",
     license="MIT",
+    data_production={
+        "type": "simulation",
+        "physics": "fluid dynamics",
+        "simulator": "ExampleSolver",
+    },
+    data_description="ExampleDescription",
 )
 print(f"{infos = }")
 
 # %% [markdown]
-# ### Initialize Infos from a plain mapping
+# ### Print available Infos fields
 
 # %%
-infos_from_mapping = Infos.model_validate(
-    {
-        "owner": "PLAID",
-        "license": "MIT",
-        "data_description": "Example metadata for a PLAID dataset.",
-    }
-)
-print(f"{infos_from_mapping = }")
+Infos.print_available_fields()
 
 # %% [markdown]
-# ## Section 2: Configuring Infos and retrieve data
+# ## Section 2: Modifying Infos and retrieving data
 #
-# This section demonstrates how to handle and configure Infos objects and access
-# metadata.
-
-# %% [markdown]
-# ### Set owner and license metadata
-
-# %%
-infos.owner = "Safran"
-infos.license = "proprietary"
-print(f"{infos.owner = }")
-print(f"{infos.license = }")
-
-# %% [markdown]
-# ### Set data production metadata
-
-# %%
-infos.data_production = DataProduction(
-    type="simulation",
-    physics="fluid dynamics",
-    simulator="ExampleSolver",
-    hardware="ExampleCluster",
-    computation_duration="1 hour",
-    script="run_simulation.py",
-    contact="contact@example.com",
-)
-print(f"{infos.data_production = }")
+# This section demonstrates how to handle Infos objects and access metadata.
 
 # %% [markdown]
 # ### Set data description
@@ -119,8 +93,7 @@
 # %% [markdown]
 # ## Section 3: Saving and Loading Infos
 #
-# This section demonstrates how to save and load Infos from a directory or YAML
-# file.
+# This section demonstrates how to save and load Infos from a YAML file.
 
 # %% [markdown]
 # ### Save Infos to a YAML file
@@ -131,20 +104,22 @@
 )
 infos_save_fname = test_pth / "infos.yaml"
 test_pth.mkdir(parents=True, exist_ok=True)
-print(f"saving path: {infos_save_fname}")
 
+print(f"saving path: {infos_save_fname}")
+infos.num_samples = {"train": 0}
+infos.storage_backend = "zarr"
 infos.save_to_file(infos_save_fname)
 
 # %% [markdown]
 # ### Load Infos from a YAML file
 
 # %%
-loaded_infos = Infos.from_path(infos_save_fname, require_persisted=False)
+loaded_infos = Infos.from_path(infos_save_fname)
 print(loaded_infos)
 
 # %% [markdown]
-# ### Load Infos from a directory containing infos.yaml
+# ### Load Infos from an explicit infos.yaml path
 
 # %%
-loaded_infos_from_dir = Infos.from_path(test_pth, require_persisted=False)
-print(loaded_infos_from_dir)
+loaded_infos_from_explicit_path = Infos.from_path(test_pth / "infos.yaml")
+print(loaded_infos_from_explicit_path)
diff --git a/src/plaid/cli/plaidcheck.py b/src/plaid/cli/plaidcheck.py
@@ -435,7 +435,7 @@ def check_dataset(
         )
         return report
     try:
-        infos = load_infos_from_disk(path)
+        infos = load_infos_from_disk(path / "infos.yaml")
     except Exception as exc:
         report.add("error", "INFOS_READ_ERROR", "infos.yaml", str(exc))
         return report
diff --git a/src/plaid/infos.py b/src/plaid/infos.py
@@ -21,8 +21,6 @@ class DataProduction(
 ):
     """Dataset production context metadata."""
 
-    owner: str | None = None
-    license: str | None = None
     type: str | None = None
     physics: str | None = None
     simulator: str | None = None
@@ -52,15 +50,26 @@ class Infos(
 ):
     """Structured representation of a PLAID dataset ``infos`` payload."""
 
-    # model_config = _PD_CONFIG
-
     owner: str
     license: str
     data_production: DataProduction | None = None
     data_description: str | None = None
     num_samples: dict[str, int] = Field(default_factory=dict)
     storage_backend: str | None = None
 
+    @classmethod
+    def print_available_fields(cls) -> None:
+        """Print the public constructor fields accepted by :class:`Infos`."""
+        print("Infos fields:")
+        for field_name in cls.model_fields:
+            print(f"  - {field_name}")
+            if field_name in {"num_samples", "storage_backend"}:
+                print("    note: automatically filled when calling save_to_disk")
+            if field_name == "data_production":
+                print("    subfields:")
+                for subfield_name in DataProduction.model_fields:
+                    print(f"      - {subfield_name}")
+
     def require_persisted(self) -> "Infos":
         """Validate fields that must exist in persisted dataset infos.
 
@@ -142,8 +151,8 @@ def from_path(
         """Load and validate an :class:`Infos` from a YAML file.
 
         Args:
-            path: Path to the YAML file (typically ``infos.yaml``) or to a
-                directory containing it.
+            path: Path to the YAML file (typically ``infos.yaml``). If no
+                suffix is provided, ``.yaml`` is appended.
             require_persisted: When True, require storage-derived metadata
                 fields expected in a complete on-disk dataset.
 
@@ -152,10 +161,15 @@ def from_path(
 
         Raises:
             FileNotFoundError: If the resolved YAML file does not exist.
+            IsADirectoryError: If ``path`` points to a directory.
         """
         path = Path(path)
         if path.is_dir():
-            path = path / "infos.yaml"
+            raise IsADirectoryError(
+                f'Expected a YAML file path, got directory "{path}"'
+            )
+        if path.suffix != ".yaml":
+            path = path.with_suffix(".yaml")
         if not path.exists():
             raise FileNotFoundError(f'File "{path}" does not exist. Abort')
 
@@ -171,22 +185,28 @@ def save_to_file(self, path: Union[str, Path]) -> None:
         """Save infos to ``path`` as a YAML file.
 
         Args:
-            path: File path (or directory) where the YAML will be written. If
-                ``path`` is a directory it will be extended with ``infos.yaml``.
+            path: File path where the YAML will be written. If no suffix is
+                provided, ``.yaml`` is appended.
+
+        Raises:
+            IsADirectoryError: If ``path`` points to a directory.
         """
+        self.require_persisted()
+
         path = Path(path)
-        if path.suffix == "" and not path.exists():
-            # Treat suffix-less paths as directories.
-            path = path / "infos.yaml"
-        elif path.is_dir():
-            path = path / "infos.yaml"
+        if path.is_dir():
+            raise IsADirectoryError(
+                f'Expected a YAML file path, got directory "{path}"'
+            )
+
         if path.suffix != ".yaml":
             path = path.with_suffix(".yaml")
 
         path.parent.mkdir(parents=True, exist_ok=True)
 
         data = self.model_dump(exclude_none=True, exclude_unset=True)
         ordered_data = {key: data[key] for key in _KEY_ORDER if key in data}
+
         # Preserve any future fields.
         for key, value in data.items():
             if key not in ordered_data:
diff --git a/src/plaid/problem_definition.py b/src/plaid/problem_definition.py
@@ -21,6 +21,13 @@
 
 logger = logging.getLogger(__name__)
 
+_KEY_ORDER = [
+    "input_features",
+    "output_features",
+    "train_split",
+    "test_split",
+]
+
 
 def _normalize_list(v):
     return sorted(map(str, v))
@@ -35,10 +42,6 @@ class ProblemDefinition(
 ):
     """Defines the input and output features for a machine learning problem."""
 
-    # model_config = ConfigDict(
-    #     revalidate_instances="always", validate_assignment=True, extra="forbid"
-    # )
-
     input_features: list[str]
     output_features: list[str]
     train_split: dict[str, Sequence[int] | Literal["all"]]
@@ -57,8 +60,13 @@ def from_path(cls, path: str | Path) -> "ProblemDefinition":
 
         Raises:
             FileNotFoundError: If the resolved YAML file does not exist.
+            IsADirectoryError: If ``path`` points to a directory.
         """
         path = Path(path)
+        if path.is_dir():
+            raise IsADirectoryError(
+                f'Expected a YAML file path, got directory "{path}"'
+            )
         if path.suffix != ".yaml":
             path = path.with_suffix(".yaml")
         if not path.exists():
@@ -203,21 +211,18 @@ def save_to_file(self, path: Union[str, Path]) -> None:
                 problem.save_to_file("/path/to/save_file")
         """
         path = Path(path)
-        path.parent.mkdir(parents=True, exist_ok=True)
+        if path.is_dir():
+            raise IsADirectoryError(
+                f'Expected a YAML file path, got directory "{path}"'
+            )
 
         if path.suffix != ".yaml":
             path = path.with_suffix(".yaml")
 
-        data = self.model_dump()
-
-        key_order = [
-            "input_features",
-            "output_features",
-            "train_split",
-            "test_split",
-        ]
+        path.parent.mkdir(parents=True, exist_ok=True)
 
-        ordered_data = {key: data[key] for key in key_order if key in data}
+        data = self.model_dump()
+        ordered_data = {key: data[key] for key in _KEY_ORDER if key in data}
 
         # Save infos
         with path.open("w") as file:
diff --git a/tests/test_info.py b/tests/test_info.py
diff --git a/tests/test_problem_definition.py b/tests/test_problem_definition.py

Original file line number	Diff line number	Diff line change
`@@ -96,6 +96,9 @@ infos = Infos(`
`96`	`96`	`owner="NeuralOperator (https://zenodo.org/records/13993629)",`
`97`	`97`	`license="cc-by-4.0",`
`98`	`98`	`data_description="No changes to data content from original dataset",`
	`99`	`+ type="simulation",`
	`100`	`+ physics="phase-field fracture models for brittle fracture",`
	`101`	`+ script="Subset 'res-SENS' of the initial dataset, 1/5th time steps, converted to PLAID format for standardized access; no changes to data content."`
`99`	`102`	`)`
`100`	`103`
`101`	`104`
Original file line number	Diff line number	Diff line change
`@@ -435,7 +435,7 @@ def check_dataset(`
`435`	`435`	`)`
`436`	`436`	`return report`
`437`	`437`	`try:`
`438`		`- infos = load_infos_from_disk(path)`
	`438`	`+ infos = load_infos_from_disk(path / "infos.yaml")`
`439`	`439`	`except Exception as exc:`
`440`	`440`	`report.add("error", "INFOS_READ_ERROR", "infos.yaml", str(exc))`
`441`	`441`	`return report`