ModelTC
diff --git a/genrl/advantages.py b/genrl/advantages.py
Lines changed: 10 additions & 28 deletions
diff --git a/genrl/config.py b/genrl/config.py
Lines changed: 6 additions & 23 deletions
diff --git a/genrl/data.py b/genrl/data.py
Lines changed: 12 additions & 32 deletions
@@ -94,9 +94,7 @@ def compute_advantages(
                 )
                 raise ConfigurationError(msg)
             prompt_ids = accelerator.gather(samples["prompt_ids"]).cpu().numpy()
-            prompts = pipeline.tokenizer.batch_decode(
-                prompt_ids, skip_special_tokens=True
-            )
+            prompts = pipeline.tokenizer.batch_decode(prompt_ids, skip_special_tokens=True)
 
             # Compute advantages for each raw reward separately
             weighted_advantages_list = []
@@ -119,9 +117,7 @@ def compute_advantages(
                         "per_prompt_stat_tracking=True, and kl_reward > 0"
                     )
                     raise ConfigurationError(msg)
-                kl_advantages = _compute_kl_advantages(
-                    gathered_kl, kl_stat_tracker, prompts, use_per_prompt=True
-                )
+                kl_advantages = _compute_kl_advantages(gathered_kl, kl_stat_tracker, prompts, use_per_prompt=True)
                 # Subtract KL advantages with kl_reward as weight
                 # kl_advantages is already negative (because KL is a penalty),
                 # so we directly multiply by kl_reward to get a negative contribution
@@ -131,22 +127,16 @@ def compute_advantages(
             advantages = sum(weighted_advantages_list)
 
             if accelerator.is_local_main_process:
-                logger.info(
-                    f"len(prompts) {len(prompts)} | len unique {len(set(prompts))}"
-                )
+                logger.info(f"len(prompts) {len(prompts)} | len unique {len(set(prompts))}")
             # Use the first stat_tracker for logging
             first_reward_name = next(iter(cfg.reward_fn))
-            group_size, trained_prompt_num = reward_stat_trackers[
-                first_reward_name
-            ].get_stats()
+            group_size, trained_prompt_num = reward_stat_trackers[first_reward_name].get_stats()
             # Calculate zero_std_ratio for each raw reward
             zero_std_ratios = {}
             for reward_name in cfg.reward_fn:
                 raw_reward_key = f"{reward_name}_raw"
-                zero_std_ratios[f"zero_std_ratio_{reward_name}"] = (
-                    calculate_zero_std_ratio(
-                        prompts, gathered_rewards, reward_key=f"ori_{raw_reward_key}"
-                    )
+                zero_std_ratios[f"zero_std_ratio_{reward_name}"] = calculate_zero_std_ratio(
+                    prompts, gathered_rewards, reward_key=f"ori_{raw_reward_key}"
                 )
             log_dict = {
                 "group_size": group_size,
@@ -164,9 +154,7 @@ def compute_advantages(
             weighted_advantages_list = []
             for reward_name in cfg.reward_fn:
                 raw_reward_key = f"{reward_name}_raw"
-                raw_rewards = gathered_rewards[
-                    raw_reward_key
-                ]  # Shape: (total_batch_size, num_timesteps)
+                raw_rewards = gathered_rewards[raw_reward_key]  # Shape: (total_batch_size, num_timesteps)
                 # Direct normalization on full shape
                 reward_advantages = _normalize_rewards(raw_rewards)
                 # Weight the advantages
@@ -175,9 +163,7 @@ def compute_advantages(
 
             # Handle KL as a reward: compute advantage for KL, then subtract with kl_reward weight
             if cfg.sample.kl_reward > 0:
-                kl_advantages = _compute_kl_advantages(
-                    gathered_kl, None, None, use_per_prompt=False
-                )
+                kl_advantages = _compute_kl_advantages(gathered_kl, None, None, use_per_prompt=False)
                 # Subtract KL advantages with kl_reward as weight
                 # kl_advantages is already negative (because KL is a penalty),
                 # so we directly multiply by kl_reward to get a negative contribution
@@ -191,14 +177,10 @@ def compute_advantages(
             msg = "stat_tracker must be provided when per_prompt_stat_tracking=True"
             raise ConfigurationError(msg)
         prompt_ids = accelerator.gather(samples["prompt_ids"]).cpu().numpy()
-        prompts = pipeline.tokenizer.batch_decode(
-            prompt_ids, skip_special_tokens=True
-        )
+        prompts = pipeline.tokenizer.batch_decode(prompt_ids, skip_special_tokens=True)
         advantages = stat_tracker.update(prompts, gathered_rewards["avg"])
         if accelerator.is_local_main_process:
-            logger.info(
-                f"len(prompts) {len(prompts)} | len unique {len(set(prompts))}"
-            )
+            logger.info(f"len(prompts) {len(prompts)} | len unique {len(set(prompts))}")
         group_size, trained_prompt_num = stat_tracker.get_stats()
         zero_std_ratio = calculate_zero_std_ratio(prompts, gathered_rewards)
         log_dict = {
 
@@ -33,9 +33,7 @@ class AccelerateConfig:
 @dataclass
 class TrainConfig:
     batch_size: int = 8
-    gradient_accumulation_steps: int | None = (
-        None  # if None, derive from sample settings
-    )
+    gradient_accumulation_steps: int | None = None  # if None, derive from sample settings
     num_inner_epochs: int = 1
     timestep_fraction: float = 0.99
     beta: float = 0.0
@@ -156,30 +154,15 @@ def build_dataclass(cls, src: dict[str, Any]):
             if field_name in src:
                 val = src[field_name]
                 # dispatch based on nested dataclass types
-                if (
-                    isinstance(field_def.default, FSDPConfig)
-                    or field_def.type == FSDPConfig
-                ):
+                if isinstance(field_def.default, FSDPConfig) or field_def.type == FSDPConfig:
                     kwargs[field_name] = build_dataclass(FSDPConfig, val)
-                elif (
-                    isinstance(field_def.default, AccelerateConfig)
-                    or field_def.type == AccelerateConfig
-                ):
+                elif isinstance(field_def.default, AccelerateConfig) or field_def.type == AccelerateConfig:
                     kwargs[field_name] = build_dataclass(AccelerateConfig, val)
-                elif (
-                    isinstance(field_def.default, TrainConfig)
-                    or field_def.type == TrainConfig
-                ):
+                elif isinstance(field_def.default, TrainConfig) or field_def.type == TrainConfig:
                     kwargs[field_name] = build_dataclass(TrainConfig, val)
-                elif (
-                    isinstance(field_def.default, SampleConfig)
-                    or field_def.type == SampleConfig
-                ):
+                elif isinstance(field_def.default, SampleConfig) or field_def.type == SampleConfig:
                     kwargs[field_name] = build_dataclass(SampleConfig, val)
-                elif (
-                    isinstance(field_def.default, ProjectPaths)
-                    or field_def.type == ProjectPaths
-                ):
+                elif isinstance(field_def.default, ProjectPaths) or field_def.type == ProjectPaths:
                     kwargs[field_name] = build_dataclass(ProjectPaths, val)
                 else:
                     kwargs[field_name] = val
 
@@ -32,9 +32,7 @@ def __getitem__(self, idx: int | tuple[int, int]) -> dict:
     def collate_fn(examples: list[dict]) -> tuple[int | None, list[str], list[dict]]:
         """Batch prompts while preserving a consistent epoch tag."""
         epoch_tags = [example.get("epoch") for example in examples]
-        epoch_tag = (
-            epoch_tags[0] if all(tag == epoch_tags[0] for tag in epoch_tags) else None
-        )
+        epoch_tag = epoch_tags[0] if all(tag == epoch_tags[0] for tag in epoch_tags) else None
         prompts = [example["prompt"] for example in examples]
         metadatas = [example["metadata"] for example in examples]
         return epoch_tag, prompts, metadatas
@@ -66,9 +64,7 @@ def __getitem__(self, idx: int | tuple[int, int]) -> dict:
     def collate_fn(examples: list[dict]) -> tuple[int | None, list[str], list[dict]]:
         """Batch Geneval items while preserving epoch tags."""
         epoch_tags = [example.get("epoch") for example in examples]
-        epoch_tag = (
-            epoch_tags[0] if all(tag == epoch_tags[0] for tag in epoch_tags) else None
-        )
+        epoch_tag = epoch_tags[0] if all(tag == epoch_tags[0] for tag in epoch_tags) else None
         prompts = [example["prompt"] for example in examples]
         metadatas = [example["metadata"] for example in examples]
         return epoch_tag, prompts, metadatas
@@ -93,9 +89,7 @@ def __init__(self, dataset: str, split: str = "train"):
         self.file_path = os.path.join(dataset, f"{split}.json")
         self._prompts = None
         self._metadatas = None
-        self._file_size = (
-            os.path.getsize(self.file_path) if os.path.exists(self.file_path) else 0
-        )
+        self._file_size = os.path.getsize(self.file_path) if os.path.exists(self.file_path) else 0
 
         # Optimization strategy:
         # - For training data, load directly to memory even if large (frequent random access needed)
@@ -185,11 +179,7 @@ def _get_item_lazy(self, idx: int) -> dict:
 
         start_offset = self._line_offsets[idx]
         # Calculate end offset (start of next line or end of file)
-        end_offset = (
-            self._line_offsets[idx + 1]
-            if idx + 1 < len(self._line_offsets)
-            else self._file_size
-        )
+        end_offset = self._line_offsets[idx + 1] if idx + 1 < len(self._line_offsets) else self._file_size
 
         with open(self.file_path, encoding="utf-8") as f:
             f.seek(start_offset)
@@ -257,9 +247,7 @@ def collate_fn(examples: list[dict]) -> tuple[int | None, list[str], list[dict]]
             Tuple of (epoch_tag, prompts, metadatas) where epoch_tag is None if inconsistent.
         """
         epoch_tags = [example.get("epoch") for example in examples]
-        epoch_tag = (
-            epoch_tags[0] if all(tag == epoch_tags[0] for tag in epoch_tags) else None
-        )
+        epoch_tag = epoch_tags[0] if all(tag == epoch_tags[0] for tag in epoch_tags) else None
         prompts = [example["prompt"] for example in examples]
         metadatas = [example["metadata"] for example in examples]
         return epoch_tag, prompts, metadatas
@@ -292,9 +280,9 @@ def __init__(
         self.rank = rank
         self.seed = seed
         self.total_samples = self.num_replicas * self.batch_size
-        assert (
-            self.total_samples % self.k == 0
-        ), f"k can not div n*b, k{k}-num_replicas{num_replicas}-batch_size{batch_size}"
+        assert self.total_samples % self.k == 0, (
+            f"k can not div n*b, k{k}-num_replicas{num_replicas}-batch_size{batch_size}"
+        )
         self.m = self.total_samples // self.k
         self.epoch = 0
 
@@ -305,27 +293,21 @@ def __iter__(self):
             g.manual_seed(self.seed + self.epoch)
             indices = torch.randperm(len(self.dataset), generator=g)[: self.m].tolist()
             repeated_indices = [idx for idx in indices for _ in range(self.k)]
-            shuffled_indices = torch.randperm(
-                len(repeated_indices), generator=g
-            ).tolist()
+            shuffled_indices = torch.randperm(len(repeated_indices), generator=g).tolist()
             shuffled_samples = [repeated_indices[i] for i in shuffled_indices]
             per_card_samples = []
             for i in range(self.num_replicas):
                 start = i * self.batch_size
                 end = start + self.batch_size
-                per_card_samples.append(
-                    [(self.epoch, idx) for idx in shuffled_samples[start:end]]
-                )
+                per_card_samples.append([(self.epoch, idx) for idx in shuffled_samples[start:end]])
             yield per_card_samples[self.rank]
 
     def set_epoch(self, epoch: int):
         """Set epoch tag to keep RNG in sync across workers."""
         self.epoch = epoch
 
 
-def build_dataloaders(
-    cfg, accelerator
-) -> tuple[DataLoader, DataLoader, DistributedKRepeatSampler]:
+def build_dataloaders(cfg, accelerator) -> tuple[DataLoader, DataLoader, DistributedKRepeatSampler]:
     """Construct train/eval dataloaders and sampler with epoch tags.
 
     Args:
@@ -350,9 +332,7 @@ def build_dataloaders(
         collate_fn = JsonPromptDataset.collate_fn
     else:
         msg = "Only general_ocr, geneval, or filtered_prompts prompt_fn supported"
-        raise NotImplementedError(
-            msg
-        )
+        raise NotImplementedError(msg)
 
     train_sampler = DistributedKRepeatSampler(
         dataset=train_dataset,