Skip to content

Commit e1a0c4b

Browse files
maxschridde1494claudePawelPeczek-Roboflow
authored
feat(batch-processing-cli): add --max-image-failure-rate flag (#2360)
* feat(batch-processing-cli): add --max-image-failure-rate flag Exposes the per-shard image-failure tolerance knob through the CLI. Adds the flag to `process-images-with-workflow` and `restart-job`, threads it through `WorkflowProcessingJobV1` to the wire as `maxImageFailureRate`, and appends it to the restart override payload parallel to `maxParallelTasks`. Pure pass-through plumbing; the threshold is enforced inside the workflows-data-processor worker (shipped separately in the batch-processing-services repo). The server is the source of truth for `[0.0, 1.0]` validation — no client-side range check is added, matching the existing behavior of other numeric override flags in this command. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * add openspec to gitignore --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Paweł Pęczek <146137186+PawelPeczek-Roboflow@users.noreply.github.com>
1 parent f1ed39c commit e1a0c4b

4 files changed

Lines changed: 42 additions & 1 deletion

File tree

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,4 +214,7 @@ inference_models/tests/e2e_platform_tests/assets/
214214
inference_testing
215215

216216
# rerun.io recordings
217-
*.rrd
217+
*.rrd
218+
219+
openspec*/
220+
opsx/

inference_cli/lib/roboflow_cloud/batch_processing/api_operations.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,7 @@ def trigger_job_with_workflows_images_processing(
401401
api_key: str,
402402
inference_backend: Optional[InferenceBackend] = None,
403403
job_name: Optional[str] = None,
404+
max_image_failure_rate: Optional[float] = None,
404405
) -> str:
405406
workspace = get_workspace(api_key=api_key)
406407
compute_configuration = ComputeConfigurationV2(
@@ -431,6 +432,7 @@ def trigger_job_with_workflows_images_processing(
431432
compute_configuration=compute_configuration,
432433
processing_timeout_seconds=max_runtime_seconds,
433434
max_parallel_tasks=max_parallel_tasks,
435+
max_image_failure_rate=max_image_failure_rate,
434436
processing_specification=processing_specification,
435437
notifications_url=notifications_url,
436438
inference_backend=inference_backend,
@@ -755,6 +757,7 @@ def restart_batch_job(
755757
workers_per_machine: Optional[int] = None,
756758
max_runtime_seconds: Optional[float] = None,
757759
max_parallel_tasks: Optional[int] = None,
760+
max_image_failure_rate: Optional[float] = None,
758761
) -> None:
759762
workspace = get_workspace(api_key=api_key)
760763
response = send_restart_job_request(
@@ -765,6 +768,7 @@ def restart_batch_job(
765768
workers_per_machine=workers_per_machine,
766769
max_runtime_seconds=max_runtime_seconds,
767770
max_parallel_tasks=max_parallel_tasks,
771+
max_image_failure_rate=max_image_failure_rate,
768772
)
769773
console = Console()
770774
console.print(JSON.from_data(response, indent=2))
@@ -784,6 +788,7 @@ def send_restart_job_request(
784788
workers_per_machine: Optional[int] = None,
785789
max_runtime_seconds: Optional[float] = None,
786790
max_parallel_tasks: Optional[int] = None,
791+
max_image_failure_rate: Optional[float] = None,
787792
) -> dict:
788793
payload = {
789794
"type": "parameters-override-v1",
@@ -799,6 +804,8 @@ def send_restart_job_request(
799804
payload["maxParallelTasks"] = max_parallel_tasks
800805
if max_runtime_seconds is not None:
801806
payload["processingTimeoutSeconds"] = max_runtime_seconds
807+
if max_image_failure_rate is not None:
808+
payload["maxImageFailureRate"] = max_image_failure_rate
802809
params = {"api_key": api_key}
803810
try:
804811
response = requests.post(

inference_cli/lib/roboflow_cloud/batch_processing/core.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,19 @@ def process_images_with_workflow(
189189
"--max-parallel-tasks", help="Max number of concurrent processing tasks"
190190
),
191191
] = None,
192+
max_image_failure_rate: Annotated[
193+
Optional[float],
194+
typer.Option(
195+
"--max-image-failure-rate",
196+
help=(
197+
"Maximum fraction of images per shard that may fail before the shard "
198+
"is aborted (0.0-1.0). Default: 10 absolute failures per shard "
199+
"(~3.9% on a full 256-image shard, higher effective tolerance on "
200+
"partial last shards). No client-side validation is performed; "
201+
"out-of-range values are rejected by the server with HTTP 400."
202+
),
203+
),
204+
] = None,
192205
aggregation_format: Annotated[
193206
Optional[AggregationFormat],
194207
typer.Option("--aggregation-format", help="Format of results aggregation"),
@@ -261,6 +274,7 @@ def process_images_with_workflow(
261274
api_key=api_key,
262275
inference_backend=inference_backend,
263276
job_name=job_name,
277+
max_image_failure_rate=max_image_failure_rate,
264278
)
265279
print(f"Triggered job with ID: {job_id}")
266280
except KeyboardInterrupt:
@@ -591,6 +605,19 @@ def restart_job(
591605
"--max-parallel-tasks", help="Max number of concurrent processing tasks"
592606
),
593607
] = None,
608+
max_image_failure_rate: Annotated[
609+
Optional[float],
610+
typer.Option(
611+
"--max-image-failure-rate",
612+
help=(
613+
"Maximum fraction of images per shard that may fail before the shard "
614+
"is aborted (0.0-1.0). Default: 10 absolute failures per shard "
615+
"(~3.9% on a full 256-image shard, higher effective tolerance on "
616+
"partial last shards). No client-side validation is performed; "
617+
"out-of-range values are rejected by the server with HTTP 400."
618+
),
619+
),
620+
] = None,
594621
debug_mode: Annotated[
595622
bool,
596623
typer.Option(
@@ -610,6 +637,7 @@ def restart_job(
610637
workers_per_machine=workers_per_machine,
611638
max_runtime_seconds=max_runtime_seconds,
612639
max_parallel_tasks=max_parallel_tasks,
640+
max_image_failure_rate=max_image_failure_rate,
613641
)
614642
except KeyboardInterrupt:
615643
print("Command interrupted.")

inference_cli/lib/roboflow_cloud/batch_processing/entities.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,9 @@ class WorkflowProcessingJobV1(BaseModel):
144144
max_parallel_tasks: Optional[int] = Field(
145145
serialization_alias="maxParallelTasks", default=None
146146
)
147+
max_image_failure_rate: Optional[float] = Field(
148+
serialization_alias="maxImageFailureRate", default=None
149+
)
147150
processing_specification: WorkflowsProcessingSpecificationV1 = Field(
148151
serialization_alias="processingSpecification"
149152
)

0 commit comments

Comments
 (0)