2929from kili .domain .asset import AssetFilters
3030from kili .domain .types import ListOrTuple
3131
32- # Threshold for batching based on number of annotations
33- # This is used to determine whether to use a single batch or multiple batches
34- # when fetching assets. If the number of annotations counted exceeds this threshold,
35- # the asset fetch will be done in multiple smaller batches to avoid performance issues.
32+ # TODO(LAB-4269): TEMPORARY WORKAROUND - Remove when backend handles jsonResponseUrl for LLM
33+ # Threshold for batching based on number of annotations.
34+ # This is used ONLY for LLM projects that need to fetch annotations instead of jsonResponseUrl.
35+ # If the number of annotations counted exceeds this threshold, the asset fetch will be done
36+ # in multiple smaller batches to avoid performance issues.
3637THRESHOLD_FOR_BATCHING = 200
3738
3839
3940class AssetOperationMixin (BaseOperationMixin ):
4041 """Mixin extending Kili API Gateway class with Assets related operations."""
4142
43+ # ============================================================================
44+ # TODO(LAB-4269): TEMPORARY WORKAROUND - Remove when backend handles jsonResponseUrl for LLM
45+ # The following helper methods handle the case where LLM projects don't have jsonResponseUrl.
46+ # Once the backend properly computes jsonResponseUrl for LLM projects, all these methods
47+ # can be deleted as a unit.
48+ # ============================================================================
49+
50+ @staticmethod
51+ def _is_llm_project_without_json_response_url (project_input_type : str ) -> bool :
52+ """Check if project is LLM type that doesn't have jsonResponseUrl.
53+
54+ TODO(LAB-4269): Remove this method when backend handles jsonResponseUrl for LLM projects.
55+
56+ Args:
57+ project_input_type: The input type of the project
58+
59+ Returns:
60+ True if project is LLM type without jsonResponseUrl support
61+ """
62+ return project_input_type in {
63+ "LLM_RLHF" ,
64+ "LLM_INSTR_FOLLOWING" ,
65+ "LLM_STATIC" ,
66+ }
67+
def _calculate_batch_size_for_llm_annotations(
    self,
    filters,
    batch_size_to_use: int,
) -> int:
    """Pick the asset batch size for an LLM project from its annotation count.

    TODO(LAB-4269): Delete this helper once the backend handles jsonResponseUrl for
    LLM projects.

    LLM projects fetch annotations rather than jsonResponseUrl, so a large number of
    annotations makes each batch heavier. When the ratio computed below exceeds
    THRESHOLD_FOR_BATCHING, assets are fetched one at a time.

    Args:
        filters: Asset filters passed to ``count_assets_annotations``.
        batch_size_to_use: Batch size to keep when the annotation load is acceptable.

    Returns:
        1 when (annotation count / batch_size_to_use) is strictly greater than
        THRESHOLD_FOR_BATCHING, otherwise ``batch_size_to_use`` unchanged.
    """
    annotation_count = self.count_assets_annotations(filters)
    annotations_per_batch_unit = annotation_count / batch_size_to_use
    if annotations_per_batch_unit > THRESHOLD_FOR_BATCHING:
        return 1
    return batch_size_to_use
91+
@staticmethod
def _build_llm_annotation_fragment() -> dict[str, str]:
    """Return the static GraphQL fragments used to fetch annotations for LLM projects.

    TODO(LAB-4269): Delete this helper once the backend handles jsonResponseUrl for
    LLM projects.

    Returns:
        A dictionary with the keys "labels" and "latestLabel", both mapped to the
        same ``annotations { ... }`` fragment wrapping ``get_annotation_fragment()``.
    """
    inner_annotation_fragment = get_annotation_fragment()
    wrapped_fragment = f"""
        annotations {{
            {inner_annotation_fragment}
        }}
    """
    # Both label scopes request exactly the same annotation selection.
    return dict.fromkeys(("labels", "latestLabel"), wrapped_fragment)
108+
def _convert_llm_annotations_to_json_response(
    self,
    assets_gen: Generator[dict, None, None],
    project_info: dict,
    requested_labels_json_response: bool,
    requested_latest_label_json_response: bool,
) -> Generator[dict, None, None]:
    """Rebuild jsonResponse from annotations for each asset of an LLM project.

    TODO(LAB-4269): Delete this helper once the backend handles jsonResponseUrl for
    LLM projects.

    The backend does not compute jsonResponseUrl for these project types, so the
    jsonResponse is patched onto each label client-side and the raw "annotations"
    entry is then removed from that label.

    Args:
        assets_gen: Generator of assets whose labels carry an "annotations" entry.
        project_info: Project information; "jsonInterface" and "inputType" are read.
        requested_labels_json_response: Whether labels.jsonResponse was requested.
        requested_latest_label_json_response: Whether latestLabel.jsonResponse was
            requested.

    Yields:
        Each incoming asset, after its requested labels have been patched.
    """
    json_response_builder = AnnotationsToJsonResponseConverter(
        json_interface=project_info["jsonInterface"],
        project_input_type=project_info["inputType"],
    )

    for asset in assets_gen:
        # Only look at latestLabel when its jsonResponse was asked for.
        latest_label = (
            asset.get("latestLabel") if requested_latest_label_json_response else None
        )
        if latest_label:
            json_response_builder.patch_label_json_response(
                asset, latest_label, latest_label["annotations"]
            )
            latest_label.pop("annotations", None)

        # An empty tuple skips the loop when labels.jsonResponse was not requested.
        labels_to_patch = asset.get("labels", []) if requested_labels_json_response else ()
        for label in labels_to_patch:
            json_response_builder.patch_label_json_response(
                asset, label, label["annotations"]
            )
            label.pop("annotations", None)

        yield asset
148+
149+ # ============================================================================
150+ # END TODO(LAB-4269) - LLM workaround methods
151+ # ============================================================================
152+
42153 def list_assets (
43154 self ,
44155 filters : AssetFilters ,
@@ -86,54 +197,31 @@ def list_assets_split( # pylint: disable=too-many-branches
86197 project_info ,
87198 ) -> Generator [dict , None , None ]:
88199 """List assets with given options."""
89- # For LLM projects, we need to fetch annotations and rebuild jsonResponse
90- # because LLM projects don't have jsonResponseUrl
91- is_llm_project = project_info ["inputType" ] in {
92- "LLM_RLHF" ,
93- "LLM_INSTR_FOLLOWING" ,
94- "LLM_STATIC" ,
95- }
200+ # TODO(LAB-4269): LLM workaround - detect if we need special handling
201+ is_llm_project = self ._is_llm_project_without_json_response_url (project_info ["inputType" ])
96202
97203 assets_batch_max_amount = 10 if project_info ["inputType" ] == "VIDEO" else 50
98204 batch_size_to_use = min (options .batch_size , assets_batch_max_amount )
99205
100- # For LLM projects fetching annotations, adjust batch size based on annotation count
101- if is_llm_project and (
102- "labels.jsonResponse" in fields or "latestLabel.jsonResponse" in fields
103- ):
104- nb_annotations = self .count_assets_annotations (filters )
105- batch_size = (
106- 1
107- if nb_annotations / batch_size_to_use > THRESHOLD_FOR_BATCHING
108- else batch_size_to_use
109- )
206+ # TODO(LAB-4269): LLM workaround - adjust batch size for annotation fetching
207+ requested_labels_json_response = "labels.jsonResponse" in fields
208+ requested_latest_label_json_response = "latestLabel.jsonResponse" in fields
209+ needs_json_response = requested_labels_json_response or requested_latest_label_json_response
210+
211+ if is_llm_project and needs_json_response :
212+ batch_size = self ._calculate_batch_size_for_llm_annotations (filters , batch_size_to_use )
110213 else :
111214 batch_size = batch_size_to_use
112215
113216 options = QueryOptions (options .disable_tqdm , options .first , options .skip , batch_size )
114217
115- requested_labels_json_response = "labels.jsonResponse" in fields
116- requested_latest_label_json_response = "latestLabel.jsonResponse" in fields
117- needs_json_response = requested_labels_json_response or requested_latest_label_json_response
118-
119218 required_fields = {"content" , "jsonContent" , "resolution.width" , "resolution.height" }
120219 fields = list (fields )
121220
221+ # TODO(LAB-4269): LLM workaround - build annotation fragment instead of using jsonResponseUrl
122222 static_fragments = {}
123223 if is_llm_project and needs_json_response :
124- # For LLM projects: fetch annotations and rebuild jsonResponse client-side
125- inner_annotation_fragment = get_annotation_fragment ()
126- annotation_fragment = f"""
127- annotations {{
128- { inner_annotation_fragment }
129- }}
130- """
131- static_fragments = {"labels" : annotation_fragment , "latestLabel" : annotation_fragment }
132-
133- fields = list (fields )
134- for field in required_fields :
135- if field not in fields :
136- fields .append (field )
224+ static_fragments = self ._build_llm_annotation_fragment ()
137225 else :
138226 if requested_labels_json_response :
139227 required_fields .add ("labels.jsonResponseUrl" )
@@ -154,24 +242,14 @@ def list_assets_split( # pylint: disable=too-many-branches
154242 load_asset_json_fields (asset , fields , self .http_client ) for asset in assets_gen
155243 )
156244
245+ # TODO(LAB-4269): LLM workaround - convert annotations to jsonResponse client-side
157246 if is_llm_project and needs_json_response :
158- # Rebuild jsonResponse from annotations for LLM projects
159- converter = AnnotationsToJsonResponseConverter (
160- json_interface = project_info ["jsonInterface" ],
161- project_input_type = project_info ["inputType" ],
247+ yield from self ._convert_llm_annotations_to_json_response (
248+ assets_gen ,
249+ project_info ,
250+ requested_labels_json_response ,
251+ requested_latest_label_json_response ,
162252 )
163- for asset in assets_gen :
164- if requested_latest_label_json_response and asset .get ("latestLabel" ):
165- converter .patch_label_json_response (
166- asset , asset ["latestLabel" ], asset ["latestLabel" ]["annotations" ]
167- )
168- asset ["latestLabel" ].pop ("annotations" , None )
169-
170- if requested_labels_json_response :
171- for label in asset .get ("labels" , []):
172- converter .patch_label_json_response (asset , label , label ["annotations" ])
173- label .pop ("annotations" , None )
174- yield asset
175253 else :
176254 yield from assets_gen
177255
@@ -184,7 +262,11 @@ def count_assets(self, filters: AssetFilters) -> int:
184262 return count
185263
186264 def count_assets_annotations (self , filters : AssetFilters ) -> int :
187- """Count the number of annotations for assets matching the filters."""
265+ """Count the number of annotations for assets matching the filters.
266+
267+ TODO(LAB-4269): Remove this method when backend handles jsonResponseUrl for LLM projects.
268+ This method is ONLY used to calculate optimal batch sizes for LLM projects.
269+ """
188270 where = asset_where_mapper (filters )
189271 payload = {"where" : where }
190272 count_result = self .graphql_client .execute (GQL_COUNT_ASSET_ANNOTATIONS , payload )
0 commit comments