OpenPipe
diff --git a/src/art/_backend_training.py b/src/art/_backend_training.py
Lines changed: 109 additions & 0 deletions
diff --git a/src/art/local/backend.py b/src/art/local/backend.py
old mode 100644
new mode 100755
Lines changed: 60 additions & 121 deletions
@@ -0,0 +1,109 @@
+from collections.abc import Iterable
+import time
+from typing import Literal
+
+from . import dev
+from .metrics_taxonomy import (
+    average_metric_samples,
+    build_training_summary_metrics,
+    summarize_trajectory_groups,
+)
+from .trajectories import TrajectoryGroup
+from .types import TrainConfig
+
+
+def build_rl_train_configs(
+    *,
+    learning_rate: float,
+    advantage_balance: float = 0.0,
+    scale_rewards: bool = True,
+    importance_sampling_level: Literal[
+        "token", "sequence", "average", "geometric_average"
+    ] = "token",
+    mask_prob_ratio: bool = False,
+    ppo: bool = False,
+    precalculate_logprobs: bool = False,
+    epsilon: float | None = None,
+    epsilon_high: float | None = None,
+    max_negative_advantage_importance_sampling_weight: float | None = None,
+    kimi_k2_tau: float | None = None,
+    kl_penalty_coef: float = 0.0,
+    allow_training_without_logprobs: bool | None = None,
+    plot_tensors: bool | None = None,
+    truncated_importance_sampling: float | None = None,
+    scale_learning_rate_by_reward_std_dev: bool | None = None,
+    logprob_calculation_chunk_size: int | None = None,
+    num_trajectories_learning_rate_multiplier_power: float | None = None,
+    kl_ref_adapter_path: str | None = None,
+) -> tuple[TrainConfig, dev.TrainConfig]:
+    """Build the public and dev training configs for an RL train step.
+
+    The seven non-optional dev settings are always written to the dev
+    config. Each setting whose default is ``None`` is written only when a
+    value was supplied, so an omitted setting leaves its key absent.
+
+    Args:
+        learning_rate: Passed to ``TrainConfig``.
+        kl_penalty_coef: Passed to ``TrainConfig`` and also stored in the
+            dev config.
+        (all remaining keyword arguments): Stored in the dev config under a
+            key equal to the argument name.
+
+    Returns:
+        A ``(TrainConfig, dev.TrainConfig)`` pair; ``kl_penalty_coef`` is
+        stored in both.
+    """
+    config = TrainConfig(learning_rate=learning_rate, kl_penalty_coef=kl_penalty_coef)
+    dev_config: dev.TrainConfig = {
+        "advantage_balance": advantage_balance,
+        "importance_sampling_level": importance_sampling_level,
+        "kl_penalty_coef": kl_penalty_coef,
+        "mask_prob_ratio": mask_prob_ratio,
+        "ppo": ppo,
+        "precalculate_logprobs": precalculate_logprobs,
+        "scale_rewards": scale_rewards,
+    }
+
+    # Insertion order below is the order the keys land in the dev config.
+    optional_settings = {
+        "allow_training_without_logprobs": allow_training_without_logprobs,
+        "plot_tensors": plot_tensors,
+        "truncated_importance_sampling": truncated_importance_sampling,
+        "scale_learning_rate_by_reward_std_dev": scale_learning_rate_by_reward_std_dev,
+        "logprob_calculation_chunk_size": logprob_calculation_chunk_size,
+        "num_trajectories_learning_rate_multiplier_power": num_trajectories_learning_rate_multiplier_power,
+        "epsilon": epsilon,
+        "epsilon_high": epsilon_high,
+        "max_negative_advantage_importance_sampling_weight": max_negative_advantage_importance_sampling_weight,
+        "kimi_k2_tau": kimi_k2_tau,
+        "kl_ref_adapter_path": kl_ref_adapter_path,
+    }
+    for key, value in optional_settings.items():
+        if value is not None:
+            # Dynamic key, so the static dict-key check is silenced.
+            dev_config[key] = value  # type: ignore[literal-required]
+
+    return config, dev_config
+
+
+def aggregate_rl_training_metrics(
+    *,
+    training_metrics: list[dict[str, float]],
+    trajectory_groups: Iterable[TrajectoryGroup],
+    trainer_started: float,
+) -> dict[str, float]:
+    """Average the collected training metrics and add trajectory summary metrics.
+
+    Values already in the averaged metrics take precedence: the elapsed-time
+    entry and the summary entries are added only under keys not yet present.
+    """
+    groups = list(trajectory_groups)
+    merged = average_metric_samples(training_metrics)
+    summary = summarize_trajectory_groups(groups)
+    merged.setdefault("time/step_trainer_s", time.monotonic() - trainer_started)
+    summary_metrics = build_training_summary_metrics(
+        summary, include_trainable_groups=True
+    )
+    for key, value in summary_metrics.items():
+        merged.setdefault(key, value)
+    return merged
@@ -43,11 +43,13 @@
 from mp_actors import close_proxy, move_to_child_process
 
 from .. import dev
+from .._backend_training import (
+    aggregate_rl_training_metrics,
+    build_rl_train_configs,
+)
 from ..backend import AnyTrainableModel, Backend
-from ..costs import build_cost_calculator, get_model_pricing
 from ..metrics_taxonomy import (
     TRAIN_GRADIENT_STEPS_KEY,
-    average_metric_samples,
     build_training_summary_metrics,
     summarize_trajectory_groups,
 )
@@ -160,9 +162,6 @@ def _allocated_gpu_count(self, model: Model) -> int:
     def __enter__(self) -> Self:
         return self
 
-    async def __aenter__(self) -> Self:
-        return self
-
     def __exit__(
         self,
         exc_type: type[BaseException] | None,
@@ -171,30 +170,14 @@ def __exit__(
     ) -> None:
         self._close()
 
-    async def __aexit__(
-        self,
-        exc_type: type[BaseException] | None,
-        exc: BaseException | None,
-        tb: TracebackType | None,
-    ) -> None:
-        await self.close()
-
     async def close(self) -> None:
         """
         If running vLLM in a separate process, this will kill that process and close the communication threads.
         """
-        for service in self._services.values():
-            aclose = getattr(service, "aclose", None)
-            if aclose is None:
-                close = getattr(service, "close", None)
-                if close is not None:
-                    close()
-            else:
-                await aclose()
-            close_proxy(service)
+        self._close()
 
     def _close(self) -> None:
-        for service in self._services.values():
+        for _, service in self._services.items():
             close = getattr(service, "close", None)
             if close is not None:
                 close()
@@ -226,11 +209,6 @@ async def register(
         # (wandb initialization is now handled by the model's _get_wandb_run method)
         if model.trainable and "WANDB_API_KEY" in os.environ:
             _ = model._get_wandb_run()
-        if model.trainable:
-            trainable_model = cast(TrainableModel, model)
-            pricing = get_model_pricing(trainable_model.base_model)
-            if pricing is not None:
-                trainable_model.set_cost_calculator(build_cost_calculator(pricing))
 
     def _model_inference_name(self, model: Model, step: int | None = None) -> str:
         """Return the inference name for a model checkpoint.
@@ -244,27 +222,25 @@ def _model_inference_name(self, model: Model, step: int | None = None) -> str:
                   If None, returns name for latest checkpoint (step 0 initially).
         """
 
-        requested_step = step
-
-        if step is None and isinstance(model, TrainableModel):
-            from ..dev.validate import is_dedicated_mode
-
-            service = self._services.get(model.name)
-            if service is not None and is_dedicated_mode(
-                model._internal_config or dev.InternalModelConfig()
-            ):
-                loaded_step = getattr(service, "_latest_step", None)
-                if isinstance(loaded_step, int):
-                    step = loaded_step
-
-        if step is None:
-            # The checkpoint directory is written before dedicated-mode
-            # vLLM finishes reloading the new adapter.
-            step = self.__get_step(model)
-        name = f"{model.name}@{step}"
+        # For LocalBackend, vLLM always serves LoRA adapters with @step suffix
+        # Default to step 0 when not specified (the initial checkpoint created at registration)
+        if step is not None:
+            actual_step = step
+        elif model.name in self._services and self._in_process:
+            # In dedicated mode the service tracks which adapter vLLM has
+            # actually loaded.  Reading the filesystem would race: the
+            # checkpoint directory appears before the HTTP reload completes.
+            svc = self._services[model.name]
+            loaded_step = getattr(svc, "_latest_step", None)
+            actual_step = (
+                loaded_step if loaded_step is not None else self.__get_step(model)
+            )
+        else:
+            actual_step = self.__get_step(model)
+        name = f"{model.name}@{actual_step}"
         logger.debug(
-            f"[BACKEND] _model_inference_name: step_arg={requested_step} "
-            f"actual_step={step} -> {name}"
+            f"[BACKEND] _model_inference_name: step_arg={step} "
+            f"actual_step={actual_step} -> {name}"
         )
         return name
 
@@ -529,14 +505,12 @@ async def train(  # type: ignore[override]
         *,
         # Core training parameters
         learning_rate: float = 5e-6,
-        loss_fn: Literal["cispo", "ppo"] = "cispo",
-        loss_fn_config: dict | None = None,
-        normalize_advantages: bool = True,
-        adam_params: object | None = None,
         # KL-penalized advantage adjustment
         kl_penalty_coef: float = 0.0,
         kl_penalty_reference_step: int | None = None,
         kl_ref_adapter_path: str | None = None,
+        # RL algorithm settings
+        ppo: bool = False,
         epsilon: float | None = None,
         epsilon_high: float | None = None,
         # Advantage computation
@@ -573,14 +547,6 @@ async def train(  # type: ignore[override]
             model: The trainable model to train.
             trajectory_groups: Batches of trajectories to train on.
             learning_rate: Learning rate for training. Defaults to 5e-6.
-            loss_fn: RL loss function. LocalBackend currently supports
-                "cispo" and "ppo".
-            loss_fn_config: Additional loss-function config. Not supported by
-                LocalBackend.
-            normalize_advantages: Whether to normalize advantages. LocalBackend
-                currently requires True.
-            adam_params: Custom optimizer params. Not supported by
-                LocalBackend.
             kl_penalty_coef: Coefficient for KL-penalized advantage adjustment.
                 Tokens diverging more from the reference get reduced advantages.
                 Defaults to 0.0 (disabled).
@@ -590,7 +556,8 @@ async def train(  # type: ignore[override]
             kl_ref_adapter_path: Direct filesystem path to a LoRA adapter
                 checkpoint to use as the KL reference. Alternative to
                 kl_penalty_reference_step.
-            epsilon: Clip epsilon for importance sampling. Defaults based on loss_fn.
+            ppo: Whether to use PPO clipping. Defaults to False.
+            epsilon: Clip epsilon for importance sampling. Defaults based on ppo.
             epsilon_high: Asymmetric upper clip bound. Defaults to epsilon.
             advantage_balance: Balance between negative and positive advantages
                 in range [-1.0, 1.0]. Defaults to 0.0 (balanced).
@@ -633,54 +600,37 @@ async def train(  # type: ignore[override]
             # await model.log(metrics=result.metrics, step=result.step)
         """
         groups_list = list(trajectory_groups)
-        if loss_fn not in {"cispo", "ppo"}:
-            raise ValueError("LocalBackend only supports loss_fn='cispo' or 'ppo'.")
-        if loss_fn_config is not None:
-            raise ValueError("LocalBackend requires loss_fn_config=None.")
-        if not normalize_advantages:
-            raise ValueError("LocalBackend requires normalize_advantages=True.")
-        if adam_params is not None:
-            raise ValueError("LocalBackend requires adam_params=None.")
-
-        # Build config objects from explicit kwargs
-        config = TrainConfig(
-            learning_rate=learning_rate, kl_penalty_coef=kl_penalty_coef
-        )
-        dev_config: dev.TrainConfig = {
-            "advantage_balance": advantage_balance,
-            "allow_training_without_logprobs": allow_training_without_logprobs,
-            "importance_sampling_level": importance_sampling_level,
-            "kl_penalty_coef": kl_penalty_coef,
-            "mask_prob_ratio": mask_prob_ratio,
-            "plot_tensors": plot_tensors,
-            "ppo": loss_fn == "ppo",
-            "precalculate_logprobs": precalculate_logprobs,
-            "scale_learning_rate_by_reward_std_dev": scale_learning_rate_by_reward_std_dev,
-            "scale_rewards": scale_rewards,
-            "logprob_calculation_chunk_size": logprob_calculation_chunk_size,
-            "num_trajectories_learning_rate_multiplier_power": num_trajectories_learning_rate_multiplier_power,
-        }
-        # Only include optional fields if they're set
-        if epsilon is not None:
-            dev_config["epsilon"] = epsilon
-        if epsilon_high is not None:
-            dev_config["epsilon_high"] = epsilon_high
-        if max_negative_advantage_importance_sampling_weight is not None:
-            dev_config["max_negative_advantage_importance_sampling_weight"] = (
-                max_negative_advantage_importance_sampling_weight
-            )
-        if kimi_k2_tau is not None:
-            dev_config["kimi_k2_tau"] = kimi_k2_tau
-        if truncated_importance_sampling is not None:
-            dev_config["truncated_importance_sampling"] = truncated_importance_sampling
-        if kl_ref_adapter_path is not None:
-            dev_config["kl_ref_adapter_path"] = kl_ref_adapter_path
-        elif kl_penalty_reference_step is not None:
-            ref_checkpoint_dir = get_step_checkpoint_dir(
+
+        resolved_kl_ref_adapter_path = kl_ref_adapter_path
+        if (
+            resolved_kl_ref_adapter_path is None
+            and kl_penalty_reference_step is not None
+        ):
+            resolved_kl_ref_adapter_path = get_step_checkpoint_dir(
                 get_model_dir(model=model, art_path=self._path),
                 kl_penalty_reference_step,
             )
-            dev_config["kl_ref_adapter_path"] = ref_checkpoint_dir
+        config, dev_config = build_rl_train_configs(
+            learning_rate=learning_rate,
+            advantage_balance=advantage_balance,
+            scale_rewards=scale_rewards,
+            importance_sampling_level=importance_sampling_level,
+            mask_prob_ratio=mask_prob_ratio,
+            ppo=ppo,
+            precalculate_logprobs=precalculate_logprobs,
+            epsilon=epsilon,
+            epsilon_high=epsilon_high,
+            max_negative_advantage_importance_sampling_weight=max_negative_advantage_importance_sampling_weight,
+            kimi_k2_tau=kimi_k2_tau,
+            kl_penalty_coef=kl_penalty_coef,
+            allow_training_without_logprobs=allow_training_without_logprobs,
+            plot_tensors=plot_tensors,
+            truncated_importance_sampling=truncated_importance_sampling,
+            scale_learning_rate_by_reward_std_dev=scale_learning_rate_by_reward_std_dev,
+            logprob_calculation_chunk_size=logprob_calculation_chunk_size,
+            num_trajectories_learning_rate_multiplier_power=num_trajectories_learning_rate_multiplier_power,
+            kl_ref_adapter_path=resolved_kl_ref_adapter_path,
+        )
 
         # Collect metrics from training
         training_metrics: list[dict[str, float]] = []
@@ -690,21 +640,10 @@ async def train(  # type: ignore[override]
         ):
             training_metrics.append(metrics)
 
-        # Aggregate metrics
-        avg_metrics = average_metric_samples(training_metrics)
-        summary = summarize_trajectory_groups(groups_list)
-        avg_metrics.setdefault(
-            "time/step_trainer_s", time.monotonic() - trainer_started
-        )
-        avg_metrics.update(
-            {
-                key: value
-                for key, value in build_training_summary_metrics(
-                    summary,
-                    include_trainable_groups=True,
-                ).items()
-                if key not in avg_metrics
-            }
+        avg_metrics = aggregate_rl_training_metrics(
+            training_metrics=training_metrics,
+            trajectory_groups=groups_list,
+            trainer_started=trainer_started,
         )
 
         # Get step and checkpoint path