Machine-Learning-for-Medical-Language
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 0 additions & 8 deletions
diff --git a/src/cnlpt/_cli/train.py b/src/cnlpt/_cli/train.py
Lines changed: 18 additions & 22 deletions
diff --git a/src/cnlpt/data/cnlp_dataset.py b/src/cnlpt/data/cnlp_dataset.py
Lines changed: 10 additions & 10 deletions
diff --git a/src/cnlpt/data/data_reader.py b/src/cnlpt/data/data_reader.py
Lines changed: 8 additions & 8 deletions
diff --git a/src/cnlpt/data/predictions.py b/src/cnlpt/data/predictions.py
Lines changed: 6 additions & 6 deletions
diff --git a/src/cnlpt/data/preprocess.py b/src/cnlpt/data/preprocess.py
Lines changed: 13 additions & 15 deletions
@@ -91,7 +91,6 @@ select = [
     "I",   # isort
     "UP",  # pyupgrade
     "G",   # logging
-    "FA",  # future annotations
     "PIE", # misc
     "RUF", # misc
 ]
@@ -101,12 +100,5 @@ ignore = [
     "G004", # f-strings in logging statements
 ]
 
-[tool.ruff.lint.pyupgrade]
-# Preserve Union types, despite alternate 'X | Y' syntax being available via __future__ annotations module.
-# This is necessary because fastAPI and pydantic parse type annotations at runtime, and since the new syntax is
-# a python 3.10 feature they don't expect it in python 3.9.
-# This can be removed if/when we stop supporting python 3.9.
-keep-runtime-typing = true
-
 [tool.uv]
 cache-keys = [{ git = { commit = true, tags = true } }]
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Annotated, Any, Final, Union
+from typing import Annotated, Any, Final
 
 import typer
 from click.core import ParameterSource
@@ -40,7 +40,7 @@ def callback(ctx: typer.Context, param: typer.CallbackParam, value: Any):
 def training_arg_option(
     field_name: str,
     *aliases,
-    compatibility: Union[list[ModelType], None] = None,
+    compatibility: list[ModelType] | None = None,
     **kwargs,
 ):
     field = CnlpTrainingArguments.__dataclass_fields__[field_name]
@@ -59,7 +59,7 @@ def training_arg_option(
 
 def model_arg_option(
     *args,
-    compatibility: Union[list[ModelType], None] = None,
+    compatibility: list[ModelType] | None = None,
     **kwargs,
 ):
     if compatibility is not None:
@@ -69,7 +69,7 @@ def model_arg_option(
 
 def data_arg_option(
     *args,
-    compatibility: Union[list[ModelType], None] = None,
+    compatibility: list[ModelType] | None = None,
     **kwargs,
 ):
     if compatibility is not None:
@@ -251,15 +251,15 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
     ),
 ]
 TaskNamesArg = Annotated[
-    Union[list[str], None],
+    list[str] | None,
     data_arg_option(
         "--task",
         "-t",
         help="The name of a task in the dataset to train on. Can be specified multiple times to target more than one task. Defaults to all tasks.",
     ),
 ]
 TokenizerArg = Annotated[
-    Union[str, None],
+    str | None,
     data_arg_option(
         "--tokenizer",
         help=f'Name or path to a model to use for tokenization. For projection and hierarchical models, this will default to the --encoder if left unspecified; otherwise defaults to "{DEFAULT_ENCODER}".',
@@ -288,15 +288,15 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
     ),
 ]
 MaxTrainArg = Annotated[
-    Union[int, None],
+    int | None,
     data_arg_option("--max_train", help="Limit the number of training samples to use."),
 ]
 MaxEvalArg = Annotated[
-    Union[int, None],
+    int | None,
     data_arg_option("--max_eval", help="Limit the number of eval samples to use."),
 ]
 MaxTestArg = Annotated[
-    Union[int, None],
+    int | None,
     data_arg_option("--max_test", help="Limit the number of test samples to use."),
 ]
 AllowDisjointLabelsArg = Annotated[
@@ -314,17 +314,17 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
     ),
 ]
 HierChunkLenArg = Annotated[
-    Union[int, None],
+    int | None,
     data_arg_option("--hier_chunk_len", help="Chunk length for hierarchical models."),
 ]
 HierNumChunksArg = Annotated[
-    Union[int, None],
+    int | None,
     data_arg_option(
         "--hier_num_chunks", help="Number of chunks for hierarchical models."
     ),
 ]
 HierPrependEmptyChunkArg = Annotated[
-    Union[int, None],
+    int | None,
     data_arg_option(
         "--hier_prepend_empty_chunk",
         help="Whether to prepend an empty chunk for hierarchical models.",
@@ -349,23 +349,19 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
         "logging_first_step", "--logging_first_step/--no_logging_first_step"
     ),
 ]
-CacheDirArg = Annotated[Union[str, None], training_arg_option("cache_dir")]
+CacheDirArg = Annotated[str | None, training_arg_option("cache_dir")]
 MetricForBestModelArg = Annotated[str, training_arg_option("metric_for_best_model")]
 
 
 ##### COMMON HF TRANSFORMERS ARGS #####
-NumTrainEpochsArg = Annotated[
-    Union[float, None], transformers_arg_option("num_train_epochs")
-]
+NumTrainEpochsArg = Annotated[float | None, transformers_arg_option("num_train_epochs")]
 PerDeviceTrainBatchSizeArg = Annotated[
-    Union[int, None], transformers_arg_option("per_device_train_batch_size")
+    int | None, transformers_arg_option("per_device_train_batch_size")
 ]
 GradientAccumulationStepsArg = Annotated[
-    Union[int, None], transformers_arg_option("gradient_accumulation_steps")
-]
-LearningRateArg = Annotated[
-    Union[float, None], transformers_arg_option("learning_rate")
+    int | None, transformers_arg_option("gradient_accumulation_steps")
 ]
+LearningRateArg = Annotated[float | None, transformers_arg_option("learning_rate")]
 DoTrainArg = Annotated[bool, transformers_arg_option("do_train", "--do_train")]
 DoEvalArg = Annotated[bool, transformers_arg_option("do_eval", "--do_eval")]
 DoPredictArg = Annotated[bool, transformers_arg_option("do_predict", "--do_predict")]
@@ -613,7 +609,7 @@ def train(
     if bias_fit:
         model_init_kwargs["bias_fit"] = True
 
-    model: Union[CnnModel, LstmModel, HierarchicalModel, ProjectionModel] = (
+    model: CnnModel | LstmModel | HierarchicalModel | ProjectionModel = (
         AutoModel.from_config(config, **model_init_kwargs)
     )
     train_system = CnlpTrainSystem(model, dataset, training_args)
 
@@ -2,7 +2,7 @@
 from collections import Counter
 from dataclasses import dataclass
 from enum import Enum
-from typing import Literal, Union
+from typing import Literal
 
 import torch
 from datasets import Dataset
@@ -22,7 +22,7 @@ class HierarchicalDataConfig:
 
 def load_tokenizer(
     model_name_or_path: str,
-    hf_cache_dir: Union[str, None] = None,
+    hf_cache_dir: str | None = None,
     truncation_side: Literal["left", "right"] = "right",
     character_level: bool = False,
 ) -> PreTrainedTokenizer:
@@ -50,19 +50,19 @@ class CnlpDataset:
 
     def __init__(
         self,
-        data_dir: Union[str, os.PathLike],
-        tokenizer: Union[str, PreTrainedTokenizer] = "roberta-base",
-        task_names: Union[list[str], None] = None,
-        hier_config: Union[HierarchicalDataConfig, None] = None,
+        data_dir: str | os.PathLike,
+        tokenizer: str | PreTrainedTokenizer = "roberta-base",
+        task_names: list[str] | None = None,
+        hier_config: HierarchicalDataConfig | None = None,
         truncation_side: TruncationSide = TruncationSide.RIGHT,
         max_seq_length: int = 128,
         use_data_cache: bool = True,
-        max_train: Union[int, None] = None,
-        max_eval: Union[int, None] = None,
-        max_test: Union[int, None] = None,
+        max_train: int | None = None,
+        max_eval: int | None = None,
+        max_test: int | None = None,
         allow_disjoint_labels: bool = False,
         character_level: bool = False,
-        hf_cache_dir: Union[str, None] = None,
+        hf_cache_dir: str | None = None,
     ):
         """Create a new `CnlpDataset`.
 
 
@@ -1,7 +1,7 @@
 import json
 import os
 from collections.abc import Iterable
-from typing import Any, Final, Literal, Union, cast
+from typing import Any, Final, Literal, cast
 
 from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
 
@@ -21,7 +21,7 @@
 NONE_VALUE: Final = "__None__"
 
 
-def _infer_split(filepath: Union[str, os.PathLike]) -> DatasetSplit:
+def _infer_split(filepath: str | os.PathLike) -> DatasetSplit:
     _dir, filename = os.path.split(filepath)
     root, _ext = os.path.splitext(filename)
 
@@ -106,7 +106,7 @@ def _get_task_by_name(self, task_name: str):
                 return task
         raise ValueError(f'task with name "{task_name}" not found')
 
-    def get_tasks(self, task_names: Union[Iterable[str], None] = None):
+    def get_tasks(self, task_names: Iterable[str] | None = None):
         """Get all or some subset of the tasks in the data.
 
         The `TaskInfo` objects returned by this method will have their `index` property
@@ -198,8 +198,8 @@ def _extend(self, new_dataset: DatasetDict, tasks: list[TaskInfo]):
 
     def load_json(
         self,
-        json_filepath: Union[str, os.PathLike],
-        split: Union[DatasetSplit, None] = None,
+        json_filepath: str | os.PathLike,
+        split: DatasetSplit | None = None,
     ):
         """Update this reader with new data from a CNLP-formatted json file.
 
@@ -274,8 +274,8 @@ def load_json(
 
     def load_csv(
         self,
-        csv_filepath: Union[str, os.PathLike],
-        split: Union[DatasetSplit, None] = None,
+        csv_filepath: str | os.PathLike,
+        split: DatasetSplit | None = None,
         sep: str = ",",
     ):
         """Update this reader with new data from a CNLP-formatted csv (or tsv) file.
@@ -299,7 +299,7 @@ def load_csv(
         tasks = _infer_tasks(dataset[split])
         self._extend(dataset, tasks)
 
-    def load_dir(self, data_dir: Union[str, os.PathLike]):
+    def load_dir(self, data_dir: str | os.PathLike):
         """Update this reader with new data from a directory containing CNLP-formatted data.
 
         This will search (non-recursively) for files named "train", "test", "validation", "valid", or "dev",
 
@@ -2,7 +2,7 @@
 import os
 from collections.abc import Iterable
 from dataclasses import asdict, dataclass
-from typing import Any, Union
+from typing import Any
 
 import numpy as np
 import numpy.typing as npt
@@ -19,7 +19,7 @@
 class TaskPredictions:
     task: TaskInfo
     logits: npt.NDArray
-    labels: Union[npt.NDArray, None]
+    labels: npt.NDArray | None
 
     @property
     def probs(self) -> npt.NDArray:
@@ -34,7 +34,7 @@ def predicted_str_labels(self) -> npt.NDArray:
         return np.array(self.task.labels)[self.predicted_int_labels]
 
     @property
-    def target_str_labels(self) -> Union[npt.NDArray, None]:
+    def target_str_labels(self) -> npt.NDArray | None:
         if self.labels is None:
             return None
         masked = self.labels.copy()
@@ -68,7 +68,7 @@ def __init__(
 
         self.task_predictions: dict[str, TaskPredictions] = {}
 
-        task_labels: dict[str, Union[npt.NDArray, None]]
+        task_labels: dict[str, npt.NDArray | None]
 
         if self.raw.label_ids is None:
             task_labels = {t.name: None for t in tasks}
@@ -137,7 +137,7 @@ def arr_to_list(obj):
 
     def save_json(
         self,
-        json_filepath: Union[str, os.PathLike],
+        json_filepath: str | os.PathLike,
         allow_overwrite: bool = False,
     ):
         write_mode = "w" if allow_overwrite else "x"
@@ -169,7 +169,7 @@ def list_to_arr(obj, dtype):
         )
 
     @classmethod
-    def load_json(cls, filepath: Union[str, os.PathLike]):
+    def load_json(cls, filepath: str | os.PathLike):
         with open(filepath) as f:
             return cls.from_dict(json.load(f))
 
 
@@ -1,6 +1,6 @@
 import logging
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, Final, Union
+from typing import TYPE_CHECKING, Any, Final
 
 import numpy as np
 from transformers.tokenization_utils import PreTrainedTokenizer
@@ -17,13 +17,13 @@
 
 
 def preprocess_raw_data(
-    batch: dict[str, Union[list[str], list[int], list[float]]],
+    batch: dict[str, list[str] | list[int] | list[float]],
     tokenizer: PreTrainedTokenizer,
-    tasks: Union[Iterable[TaskInfo], None],
-    max_length: Union[int, None] = None,
+    tasks: Iterable[TaskInfo] | None,
+    max_length: int | None = None,
     inference_only: bool = False,
     character_level: bool = False,
-    hier_config: Union["HierarchicalDataConfig", None] = None,
+    hier_config: "HierarchicalDataConfig | None" = None,
 ) -> BatchEncoding:
     """Preprocess raw CNLP data for training/evaluation.
 
@@ -248,7 +248,7 @@ def _get_word_ids(
     tokenizer: PreTrainedTokenizer,
     tokenized_input: BatchEncoding,
     character_level: bool,
-) -> list[list[Union[int, None]]]:
+) -> list[list[int | None]]:
     if tokenizer.is_fast:
         return [
             tokenized_input.word_ids(i) for i in range(len(tokenized_input.input_ids))
@@ -267,9 +267,9 @@ def _get_word_ids(
             ]
         )
 
-        def get_word_ids(indices: Iterable[int]) -> list[Union[int, None]]:
+        def get_word_ids(indices: Iterable[int]) -> list[int | None]:
             current = 0
-            raw: list[Union[int, None]] = []
+            raw: list[int | None] = []
             for index in indices:
                 if index in special_token_ids:
                     raw.append(None)
@@ -290,9 +290,9 @@ def get_word_ids(indices: Iterable[int]) -> list[Union[int, None]]:
 
 
 def _tokenize_batch(
-    batch: dict[str, Union[list[str], list[int], list[float]]],
+    batch: dict[str, list[str] | list[int] | list[float]],
     tokenizer: PreTrainedTokenizer,
-    max_length: Union[int, None],
+    max_length: int | None,
     hierarchical: bool,
     character_level: bool,
 ) -> BatchEncoding:
@@ -339,9 +339,7 @@ def _tokenize_batch(
     return tokenized_batch
 
 
-def _preprocess_raw_labels(
-    raw: Union[list[str], list[int], list[float]], task: TaskInfo
-):
+def _preprocess_raw_labels(raw: list[str] | list[int] | list[float], task: TaskInfo):
     mask_missing: Final = {MISSING_DATA_STR: MASK_VALUE}
     if task.type == CLASSIFICATION:
         # labels is just a list of one label for each instance
@@ -356,7 +354,7 @@ def _preprocess_raw_labels(
             for tags in raw
         ]
     elif task.type == RELATIONS:
-        preprocessed: list[Union[list[str], list[tuple[int, int, int]]]] = []
+        preprocessed: list[list[str] | list[tuple[int, int, int]]] = []
         for relations in raw:
             if relations in (None, "None"):
                 preprocessed.append(["None"])
@@ -483,7 +481,7 @@ def _build_labels_for_task(
     labels: list[tuple[Any, ...]],
     max_length: int,
     pad_classification: bool,
-) -> Union[np.ndarray, list[np.ndarray]]:
+) -> np.ndarray | list[np.ndarray]:
     if task.type == TAGGING:
         return _get_tagging_labels(task, tokenized_input, labels)
     elif task.type == RELATIONS: