fix: BROS-1087: Evaluation results retained between different subsets

nick-skriabin · robot-ci-heartex · commit 29ae87f5e78f · 2026-05-08T12:25:30.000Z
GitOrigin-RevId: fd6dfe65c361124a55350ff66f48e18f1a93e827
diff --git a/poetry.lock b/poetry.lock
diff --git a/reference.md b/reference.md
@@ -6240,6 +6240,14 @@ client.prompts.subset_tasks(
 <dl>
 <dd>
 
+**model_version:** `typing.Optional[int]` — Restrict prefetched predictions to this specific prompt version. Used together with `parent_model` when no `model_run` is selected, so that a newly created version does not inherit predictions from prior versions.
+    
+</dd>
+</dl>
+
+<dl>
+<dd>
+
 **ordering:** `typing.Optional[str]` — Which field to use when ordering the results.
     
 </dd>
@@ -32487,7 +32495,7 @@ client.projects.roles.get(
             This endpoint is not available in Label Studio Community Edition. [Learn more about Label Studio Enterprise](https://humansignal.com/goenterprise)
         </p>
     </Card>
-Returns label confusion matrix with precision, recall, and top confusion pairs.
+Returns label confusion matrix with precision, recall, and top confusion pairs. In `ground_truth` mode the matrix is directional: rows are ground-truth labels (actual), columns are annotator labels (predicted). In `all` and `accepted` modes — where no canonical "actual vs predicted" axis exists — the matrix is symmetric. When a task has multiple ground-truth annotations, the most recently updated one is used. `top_confusion_pairs.rate` is the share of off-diagonal mass.
 </dd>
 </dl>
 </dd>
@@ -32661,7 +32669,7 @@ client.projects.stats.data_quality_agreement_dimensions(
             This endpoint is not available in Label Studio Community Edition. [Learn more about Label Studio Enterprise](https://humansignal.com/goenterprise)
         </p>
     </Card>
-Returns average agreement, histogram buckets, low-agreement count, and total tasks.
+Returns average agreement, a 10-bucket histogram of `Task.precomputed_agreement` (filled on-the-fly from V2 dimension matrices when null), `low_agreement_count`, and `total_tasks`. The low-agreement threshold is `LseProject.agreement_threshold` (the same project setting Data Manager filters and review-routing rules consume); changing that setting moves the count for this endpoint as well.
 </dd>
 </dl>
 </dd>
diff --git a/src/label_studio_sdk/projects/stats/client.py b/src/label_studio_sdk/projects/stats/client.py
@@ -65,7 +65,7 @@ def data_quality_agreement_confusion_matrix(
                     This endpoint is not available in Label Studio Community Edition. [Learn more about Label Studio Enterprise](https://humansignal.com/goenterprise)
                 </p>
             </Card>
-        Returns label confusion matrix with precision, recall, and top confusion pairs.
+        Returns label confusion matrix with precision, recall, and top confusion pairs. In `ground_truth` mode the matrix is directional: rows are ground-truth labels (actual), columns are annotator labels (predicted). In `all` and `accepted` modes — where no canonical "actual vs predicted" axis exists — the matrix is symmetric. When a task has multiple ground-truth annotations, the most recently updated one is used. `top_confusion_pairs.rate` is the share of off-diagonal mass.
 
         Parameters
         ----------
@@ -149,7 +149,7 @@ def data_quality_agreement_distribution(
                     This endpoint is not available in Label Studio Community Edition. [Learn more about Label Studio Enterprise](https://humansignal.com/goenterprise)
                 </p>
             </Card>
-        Returns average agreement, histogram buckets, low-agreement count, and total tasks.
+        Returns average agreement, a 10-bucket histogram of `Task.precomputed_agreement` (filled on-the-fly from V2 dimension matrices when null), `low_agreement_count`, and `total_tasks`. The low-agreement threshold is `LseProject.agreement_threshold` (the same project setting Data Manager filters and review-routing rules consume); changing that setting moves the count for this endpoint as well.
 
         Parameters
         ----------
@@ -1218,7 +1218,7 @@ async def data_quality_agreement_confusion_matrix(
                     This endpoint is not available in Label Studio Community Edition. [Learn more about Label Studio Enterprise](https://humansignal.com/goenterprise)
                 </p>
             </Card>
-        Returns label confusion matrix with precision, recall, and top confusion pairs.
+        Returns label confusion matrix with precision, recall, and top confusion pairs. In `ground_truth` mode the matrix is directional: rows are ground-truth labels (actual), columns are annotator labels (predicted). In `all` and `accepted` modes — where no canonical "actual vs predicted" axis exists — the matrix is symmetric. When a task has multiple ground-truth annotations, the most recently updated one is used. `top_confusion_pairs.rate` is the share of off-diagonal mass.
 
         Parameters
         ----------
@@ -1318,7 +1318,7 @@ async def data_quality_agreement_distribution(
                     This endpoint is not available in Label Studio Community Edition. [Learn more about Label Studio Enterprise](https://humansignal.com/goenterprise)
                 </p>
             </Card>
-        Returns average agreement, histogram buckets, low-agreement count, and total tasks.
+        Returns average agreement, a 10-bucket histogram of `Task.precomputed_agreement` (filled on-the-fly from V2 dimension matrices when null), `low_agreement_count`, and `total_tasks`. The low-agreement threshold is `LseProject.agreement_threshold` (the same project setting Data Manager filters and review-routing rules consume); changing that setting moves the count for this endpoint as well.
 
         Parameters
         ----------
diff --git a/src/label_studio_sdk/projects/stats/raw_client.py b/src/label_studio_sdk/projects/stats/raw_client.py
@@ -61,7 +61,7 @@ def data_quality_agreement_confusion_matrix(
                     This endpoint is not available in Label Studio Community Edition. [Learn more about Label Studio Enterprise](https://humansignal.com/goenterprise)
                 </p>
             </Card>
-        Returns label confusion matrix with precision, recall, and top confusion pairs.
+        Returns label confusion matrix with precision, recall, and top confusion pairs. In `ground_truth` mode the matrix is directional: rows are ground-truth labels (actual), columns are annotator labels (predicted). In `all` and `accepted` modes — where no canonical "actual vs predicted" axis exists — the matrix is symmetric. When a task has multiple ground-truth annotations, the most recently updated one is used. `top_confusion_pairs.rate` is the share of off-diagonal mass.
 
         Parameters
         ----------
@@ -167,7 +167,7 @@ def data_quality_agreement_distribution(
                     This endpoint is not available in Label Studio Community Edition. [Learn more about Label Studio Enterprise](https://humansignal.com/goenterprise)
                 </p>
             </Card>
-        Returns average agreement, histogram buckets, low-agreement count, and total tasks.
+        Returns average agreement, a 10-bucket histogram of `Task.precomputed_agreement` (filled on-the-fly from V2 dimension matrices when null), `low_agreement_count`, and `total_tasks`. The low-agreement threshold is `LseProject.agreement_threshold` (the same project setting Data Manager filters and review-routing rules consume); changing that setting moves the count for this endpoint as well.
 
         Parameters
         ----------
@@ -1495,7 +1495,7 @@ async def data_quality_agreement_confusion_matrix(
                     This endpoint is not available in Label Studio Community Edition. [Learn more about Label Studio Enterprise](https://humansignal.com/goenterprise)
                 </p>
             </Card>
-        Returns label confusion matrix with precision, recall, and top confusion pairs.
+        Returns label confusion matrix with precision, recall, and top confusion pairs. In `ground_truth` mode the matrix is directional: rows are ground-truth labels (actual), columns are annotator labels (predicted). In `all` and `accepted` modes — where no canonical "actual vs predicted" axis exists — the matrix is symmetric. When a task has multiple ground-truth annotations, the most recently updated one is used. `top_confusion_pairs.rate` is the share of off-diagonal mass.
 
         Parameters
         ----------
@@ -1601,7 +1601,7 @@ async def data_quality_agreement_distribution(
                     This endpoint is not available in Label Studio Community Edition. [Learn more about Label Studio Enterprise](https://humansignal.com/goenterprise)
                 </p>
             </Card>
-        Returns average agreement, histogram buckets, low-agreement count, and total tasks.
+        Returns average agreement, a 10-bucket histogram of `Task.precomputed_agreement` (filled on-the-fly from V2 dimension matrices when null), `low_agreement_count`, and `total_tasks`. The low-agreement threshold is `LseProject.agreement_threshold` (the same project setting Data Manager filters and review-routing rules consume); changing that setting moves the count for this endpoint as well.
 
         Parameters
         ----------
diff --git a/src/label_studio_sdk/prompts/client.py b/src/label_studio_sdk/prompts/client.py
@@ -172,6 +172,7 @@ def subset_tasks(
         alignment_outcome: typing.Optional[SubsetTasksPromptsRequestAlignmentOutcome] = None,
         include_total: typing.Optional[bool] = None,
         model_run: typing.Optional[int] = None,
+        model_version: typing.Optional[int] = None,
         ordering: typing.Optional[str] = None,
         output_class: typing.Optional[str] = None,
         output_from_name: typing.Optional[str] = None,
@@ -212,6 +213,9 @@ def subset_tasks(
         model_run : typing.Optional[int]
             A unique ID of a ModelRun
 
+        model_version : typing.Optional[int]
+            Restrict prefetched predictions to this specific prompt version. Used together with `parent_model` when no `model_run` is selected, so that a newly created version does not inherit predictions from prior versions.
+
         ordering : typing.Optional[str]
             Which field to use when ordering the results.
 
@@ -262,6 +266,7 @@ def subset_tasks(
             alignment_outcome=alignment_outcome,
             include_total=include_total,
             model_run=model_run,
+            model_version=model_version,
             ordering=ordering,
             output_class=output_class,
             output_from_name=output_from_name,
@@ -831,6 +836,7 @@ async def subset_tasks(
         alignment_outcome: typing.Optional[SubsetTasksPromptsRequestAlignmentOutcome] = None,
         include_total: typing.Optional[bool] = None,
         model_run: typing.Optional[int] = None,
+        model_version: typing.Optional[int] = None,
         ordering: typing.Optional[str] = None,
         output_class: typing.Optional[str] = None,
         output_from_name: typing.Optional[str] = None,
@@ -871,6 +877,9 @@ async def subset_tasks(
         model_run : typing.Optional[int]
             A unique ID of a ModelRun
 
+        model_version : typing.Optional[int]
+            Restrict prefetched predictions to this specific prompt version. Used together with `parent_model` when no `model_run` is selected, so that a newly created version does not inherit predictions from prior versions.
+
         ordering : typing.Optional[str]
             Which field to use when ordering the results.
 
@@ -929,6 +938,7 @@ async def main() -> None:
             alignment_outcome=alignment_outcome,
             include_total=include_total,
             model_run=model_run,
+            model_version=model_version,
             ordering=ordering,
             output_class=output_class,
             output_from_name=output_from_name,
diff --git a/src/label_studio_sdk/prompts/raw_client.py b/src/label_studio_sdk/prompts/raw_client.py
@@ -189,6 +189,7 @@ def subset_tasks(
         alignment_outcome: typing.Optional[SubsetTasksPromptsRequestAlignmentOutcome] = None,
         include_total: typing.Optional[bool] = None,
         model_run: typing.Optional[int] = None,
+        model_version: typing.Optional[int] = None,
         ordering: typing.Optional[str] = None,
         output_class: typing.Optional[str] = None,
         output_from_name: typing.Optional[str] = None,
@@ -229,6 +230,9 @@ def subset_tasks(
         model_run : typing.Optional[int]
             A unique ID of a ModelRun
 
+        model_version : typing.Optional[int]
+            Restrict prefetched predictions to this specific prompt version. Used together with `parent_model` when no `model_run` is selected, so that a newly created version does not inherit predictions from prior versions.
+
         ordering : typing.Optional[str]
             Which field to use when ordering the results.
 
@@ -270,6 +274,7 @@ def subset_tasks(
                 "alignment_outcome": alignment_outcome,
                 "include_total": include_total,
                 "model_run": model_run,
+                "model_version": model_version,
                 "ordering": ordering,
                 "output_class": output_class,
                 "output_from_name": output_from_name,
@@ -930,6 +935,7 @@ async def subset_tasks(
         alignment_outcome: typing.Optional[SubsetTasksPromptsRequestAlignmentOutcome] = None,
         include_total: typing.Optional[bool] = None,
         model_run: typing.Optional[int] = None,
+        model_version: typing.Optional[int] = None,
         ordering: typing.Optional[str] = None,
         output_class: typing.Optional[str] = None,
         output_from_name: typing.Optional[str] = None,
@@ -970,6 +976,9 @@ async def subset_tasks(
         model_run : typing.Optional[int]
             A unique ID of a ModelRun
 
+        model_version : typing.Optional[int]
+            Restrict prefetched predictions to this specific prompt version. Used together with `parent_model` when no `model_run` is selected, so that a newly created version does not inherit predictions from prior versions.
+
         ordering : typing.Optional[str]
             Which field to use when ordering the results.
 
@@ -1011,6 +1020,7 @@ async def subset_tasks(
                 "alignment_outcome": alignment_outcome,
                 "include_total": include_total,
                 "model_run": model_run,
+                "model_version": model_version,
                 "ordering": ordering,
                 "output_class": output_class,
                 "output_from_name": output_from_name,