Skip to content

Commit 78004d6

Browse files
committed
fix(LAB-4269): isolated LLM annotations specific code
1 parent ef614db commit 78004d6

4 files changed

Lines changed: 244 additions & 80 deletions

File tree

src/kili/adapters/kili_api_gateway/asset/operations.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ def get_assets_query(fragment: str) -> str:
3131
}
3232
"""
3333

34+
# TODO(LAB-4269): TEMPORARY WORKAROUND - Remove when backend handles jsonResponseUrl for LLM
35+
# This GraphQL query is ONLY used for LLM projects to count annotations for batch size optimization
3436
GQL_COUNT_ASSET_ANNOTATIONS = """
3537
query countAssetAnnotations($where: AssetWhere!) {
3638
data: countAssetAnnotations(where: $where)

src/kili/adapters/kili_api_gateway/asset/operations_mixin.py

Lines changed: 137 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,127 @@
2929
from kili.domain.asset import AssetFilters
3030
from kili.domain.types import ListOrTuple
3131

32-
# Threshold for batching based on number of annotations
33-
# This is used to determine whether to use a single batch or multiple batches
34-
# when fetching assets. If the number of annotations counted exceeds this threshold,
35-
# the asset fetch will be done in multiple smaller batches to avoid performance issues.
32+
# TODO(LAB-4269): TEMPORARY WORKAROUND - Remove when backend handles jsonResponseUrl for LLM
33+
# Threshold for batching based on number of annotations.
34+
# This is used ONLY for LLM projects that need to fetch annotations instead of jsonResponseUrl.
35+
# If the number of annotations counted exceeds this threshold, the asset fetch will be done
36+
# in multiple smaller batches to avoid performance issues.
3637
THRESHOLD_FOR_BATCHING = 200
3738

3839

3940
class AssetOperationMixin(BaseOperationMixin):
4041
"""Mixin extending Kili API Gateway class with Assets related operations."""
4142

43+
# ============================================================================
44+
# TODO(LAB-4269): TEMPORARY WORKAROUND - Remove when backend handles jsonResponseUrl for LLM
45+
# The following helper methods handle the case where LLM projects don't have jsonResponseUrl.
46+
# Once the backend properly computes jsonResponseUrl for LLM projects, all these methods
47+
# can be deleted as a unit.
48+
# ============================================================================
49+
50+
@staticmethod
51+
def _is_llm_project_without_json_response_url(project_input_type: str) -> bool:
52+
"""Check if project is LLM type that doesn't have jsonResponseUrl.
53+
54+
TODO(LAB-4269): Remove this method when backend handles jsonResponseUrl for LLM projects.
55+
56+
Args:
57+
project_input_type: The input type of the project
58+
59+
Returns:
60+
True if project is LLM type without jsonResponseUrl support
61+
"""
62+
return project_input_type in {
63+
"LLM_RLHF",
64+
"LLM_INSTR_FOLLOWING",
65+
"LLM_STATIC",
66+
}
67+
68+
def _calculate_batch_size_for_llm_annotations(
69+
self,
70+
filters,
71+
batch_size_to_use: int,
72+
) -> int:
73+
"""Calculate optimal batch size for LLM projects based on annotation count.
74+
75+
TODO(LAB-4269): Remove this method when backend handles jsonResponseUrl for LLM projects.
76+
77+
For LLM projects, we fetch annotations instead of jsonResponseUrl. If there are too many
78+
annotations per asset, we reduce the batch size to avoid performance issues.
79+
80+
Args:
81+
filters: Asset filters to count annotations for
82+
batch_size_to_use: The default batch size to use
83+
84+
Returns:
85+
Adjusted batch size (1 if annotations/batch > threshold, otherwise batch_size_to_use)
86+
"""
87+
nb_annotations = self.count_assets_annotations(filters)
88+
return (
89+
1 if nb_annotations / batch_size_to_use > THRESHOLD_FOR_BATCHING else batch_size_to_use
90+
)
91+
92+
@staticmethod
def _build_llm_annotation_fragment() -> dict[str, str]:
    """Produce the static GraphQL fragments that request annotations for LLM projects.

    TODO(LAB-4269): Remove this method when backend handles jsonResponseUrl for LLM projects.

    Returns:
        Dictionary mapping both "labels" and "latestLabel" to the same `annotations { ... }`
        fragment, whose inner selection comes from get_annotation_fragment().
    """
    annotation_selection = get_annotation_fragment()
    wrapped_fragment = f"""
        annotations {{
            {annotation_selection}
        }}
    """
    # The same fragment text is used for every label container.
    return dict.fromkeys(("labels", "latestLabel"), wrapped_fragment)
108+
109+
def _convert_llm_annotations_to_json_response(
    self,
    assets_gen: Generator[dict, None, None],
    project_info: dict,
    requested_labels_json_response: bool,
    requested_latest_label_json_response: bool,
) -> Generator[dict, None, None]:
    """Rebuild jsonResponse from annotations on each asset of an LLM project.

    TODO(LAB-4269): Remove this method when backend handles jsonResponseUrl for LLM projects.

    The backend doesn't compute jsonResponseUrl for LLM project types, so the jsonResponse
    is patched onto each label client-side from its fetched annotations. The "annotations"
    key is removed from every label that gets patched.

    Args:
        assets_gen: Generator of assets with annotations
        project_info: Project information including jsonInterface and inputType
        requested_labels_json_response: Whether labels.jsonResponse was requested
        requested_latest_label_json_response: Whether latestLabel.jsonResponse was requested

    Yields:
        The same asset dicts, modified in place, with jsonResponse rebuilt from annotations
    """
    json_response_builder = AnnotationsToJsonResponseConverter(
        json_interface=project_info["jsonInterface"],
        project_input_type=project_info["inputType"],
    )

    def _patch_and_strip(owner_asset: dict, target_label: dict) -> None:
        # Patch first, then drop the raw annotations so they do not leak to the caller.
        json_response_builder.patch_label_json_response(
            owner_asset, target_label, target_label["annotations"]
        )
        target_label.pop("annotations", None)

    for current_asset in assets_gen:
        if requested_latest_label_json_response:
            newest_label = current_asset.get("latestLabel")
            if newest_label:
                _patch_and_strip(current_asset, newest_label)

        if requested_labels_json_response:
            for each_label in current_asset.get("labels", []):
                _patch_and_strip(current_asset, each_label)

        yield current_asset
148+
149+
# ============================================================================
150+
# END TODO(LAB-4269) - LLM workaround methods
151+
# ============================================================================
152+
42153
def list_assets(
43154
self,
44155
filters: AssetFilters,
@@ -86,54 +197,31 @@ def list_assets_split( # pylint: disable=too-many-branches
86197
project_info,
87198
) -> Generator[dict, None, None]:
88199
"""List assets with given options."""
89-
# For LLM projects, we need to fetch annotations and rebuild jsonResponse
90-
# because LLM projects don't have jsonResponseUrl
91-
is_llm_project = project_info["inputType"] in {
92-
"LLM_RLHF",
93-
"LLM_INSTR_FOLLOWING",
94-
"LLM_STATIC",
95-
}
200+
# TODO(LAB-4269): LLM workaround - detect if we need special handling
201+
is_llm_project = self._is_llm_project_without_json_response_url(project_info["inputType"])
96202

97203
assets_batch_max_amount = 10 if project_info["inputType"] == "VIDEO" else 50
98204
batch_size_to_use = min(options.batch_size, assets_batch_max_amount)
99205

100-
# For LLM projects fetching annotations, adjust batch size based on annotation count
101-
if is_llm_project and (
102-
"labels.jsonResponse" in fields or "latestLabel.jsonResponse" in fields
103-
):
104-
nb_annotations = self.count_assets_annotations(filters)
105-
batch_size = (
106-
1
107-
if nb_annotations / batch_size_to_use > THRESHOLD_FOR_BATCHING
108-
else batch_size_to_use
109-
)
206+
# TODO(LAB-4269): LLM workaround - adjust batch size for annotation fetching
207+
requested_labels_json_response = "labels.jsonResponse" in fields
208+
requested_latest_label_json_response = "latestLabel.jsonResponse" in fields
209+
needs_json_response = requested_labels_json_response or requested_latest_label_json_response
210+
211+
if is_llm_project and needs_json_response:
212+
batch_size = self._calculate_batch_size_for_llm_annotations(filters, batch_size_to_use)
110213
else:
111214
batch_size = batch_size_to_use
112215

113216
options = QueryOptions(options.disable_tqdm, options.first, options.skip, batch_size)
114217

115-
requested_labels_json_response = "labels.jsonResponse" in fields
116-
requested_latest_label_json_response = "latestLabel.jsonResponse" in fields
117-
needs_json_response = requested_labels_json_response or requested_latest_label_json_response
118-
119218
required_fields = {"content", "jsonContent", "resolution.width", "resolution.height"}
120219
fields = list(fields)
121220

221+
# TODO(LAB-4269): LLM workaround - build annotation fragment instead of using jsonResponseUrl
122222
static_fragments = {}
123223
if is_llm_project and needs_json_response:
124-
# For LLM projects: fetch annotations and rebuild jsonResponse client-side
125-
inner_annotation_fragment = get_annotation_fragment()
126-
annotation_fragment = f"""
127-
annotations {{
128-
{inner_annotation_fragment}
129-
}}
130-
"""
131-
static_fragments = {"labels": annotation_fragment, "latestLabel": annotation_fragment}
132-
133-
fields = list(fields)
134-
for field in required_fields:
135-
if field not in fields:
136-
fields.append(field)
224+
static_fragments = self._build_llm_annotation_fragment()
137225
else:
138226
if requested_labels_json_response:
139227
required_fields.add("labels.jsonResponseUrl")
@@ -154,24 +242,14 @@ def list_assets_split( # pylint: disable=too-many-branches
154242
load_asset_json_fields(asset, fields, self.http_client) for asset in assets_gen
155243
)
156244

245+
# TODO(LAB-4269): LLM workaround - convert annotations to jsonResponse client-side
157246
if is_llm_project and needs_json_response:
158-
# Rebuild jsonResponse from annotations for LLM projects
159-
converter = AnnotationsToJsonResponseConverter(
160-
json_interface=project_info["jsonInterface"],
161-
project_input_type=project_info["inputType"],
247+
yield from self._convert_llm_annotations_to_json_response(
248+
assets_gen,
249+
project_info,
250+
requested_labels_json_response,
251+
requested_latest_label_json_response,
162252
)
163-
for asset in assets_gen:
164-
if requested_latest_label_json_response and asset.get("latestLabel"):
165-
converter.patch_label_json_response(
166-
asset, asset["latestLabel"], asset["latestLabel"]["annotations"]
167-
)
168-
asset["latestLabel"].pop("annotations", None)
169-
170-
if requested_labels_json_response:
171-
for label in asset.get("labels", []):
172-
converter.patch_label_json_response(asset, label, label["annotations"])
173-
label.pop("annotations", None)
174-
yield asset
175253
else:
176254
yield from assets_gen
177255

@@ -184,7 +262,11 @@ def count_assets(self, filters: AssetFilters) -> int:
184262
return count
185263

186264
def count_assets_annotations(self, filters: AssetFilters) -> int:
187-
"""Count the number of annotations for assets matching the filters."""
265+
"""Count the number of annotations for assets matching the filters.
266+
267+
TODO(LAB-4269): Remove this method when backend handles jsonResponseUrl for LLM projects.
268+
This method is ONLY used to calculate optimal batch sizes for LLM projects.
269+
"""
188270
where = asset_where_mapper(filters)
189271
payload = {"where": where}
190272
count_result = self.graphql_client.execute(GQL_COUNT_ASSET_ANNOTATIONS, payload)

src/kili/adapters/kili_api_gateway/label/common.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,24 @@
99
from kili.domain.types import ListOrTuple
1010
from kili.exceptions import NotFound
1111

12+
# ============================================================================
13+
# TODO(LAB-4269): TEMPORARY WORKAROUND - Remove when backend handles jsonResponseUrl for LLM
14+
# This entire function is only used for LLM projects that don't have jsonResponseUrl.
15+
# Once the backend properly computes jsonResponseUrl for LLM projects, this function
16+
# can be deleted entirely.
17+
# ============================================================================
18+
1219

1320
def get_annotation_fragment() -> str:
1421
"""Generate a basic annotation fragment for querying annotations.
1522
16-
This is used for LLM projects.
23+
TODO(LAB-4269): Remove this function when backend handles jsonResponseUrl for LLM projects.
24+
25+
This is used ONLY for LLM projects that don't have jsonResponseUrl computed by the backend.
26+
The fragment is used to fetch annotations so we can rebuild jsonResponse client-side.
27+
28+
Returns:
29+
GraphQL fragment for querying LLM annotations (classification, comparison, transcription)
1730
"""
1831
return """
1932
__typename
@@ -46,6 +59,11 @@ def get_annotation_fragment() -> str:
4659
"""
4760

4861

62+
# ============================================================================
63+
# END TODO(LAB-4269) - LLM workaround function
64+
# ============================================================================
65+
66+
4967
def get_asset(
5068
graphql_client: GraphQLClient,
5169
http_client: HttpClient,

0 commit comments

Comments
 (0)