Skip to content

Commit d5573e2

Browse files
FannyGaudin authored and baptiste-olivier committed
feat(LAB-3105): add param for new llm.export path
1 parent 9ffa6a0 commit d5573e2

8 files changed

Lines changed: 183 additions & 104 deletions

File tree

src/kili/llm/presentation/client/llm.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@ def export(
4444
disable_tqdm: Optional[bool] = False,
4545
asset_ids: Optional[List[str]] = None,
4646
external_ids: Optional[List[str]] = None,
47+
include_sent_back_labels: Optional[bool] = False,
4748
) -> Optional[List[Dict[str, Union[List[str], str]]]]:
4849
# pylint: disable=line-too-long
4950
"""Returns an export of llm assets with valid labels.
@@ -53,6 +54,7 @@ def export(
5354
asset_ids: Optional list of the assets internal IDs from which to export the labels.
5455
disable_tqdm: Disable the progress bar if True.
5556
external_ids: Optional list of the assets external IDs from which to export the labels.
57+
include_sent_back_labels: Include sent back labels if True.
5658
5759
!!! Example
5860
```python
@@ -83,6 +85,7 @@ def export(
8385
project_id=ProjectId(project_id),
8486
asset_filter=asset_filter,
8587
disable_tqdm=disable_tqdm,
88+
include_sent_back_labels=include_sent_back_labels,
8689
)
8790
except NoCompatibleJobError as excp:
8891
warnings.warn(str(excp), stacklevel=2)

src/kili/llm/services/export/__init__.py

Lines changed: 95 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,120 @@
22

33
from typing import Dict, List, Optional, Union
44

5+
from kili.adapters.kili_api_gateway.helpers.queries import QueryOptions
56
from kili.adapters.kili_api_gateway.kili_api_gateway import KiliAPIGateway
67
from kili.domain.asset.asset import AssetFilters
78
from kili.domain.project import ProjectId
89

910
from .dynamic import LLMDynamicExporter
1011
from .static import LLMStaticExporter
1112

13+
CHAT_ITEMS_NEEDED_FIELDS = [
14+
"id",
15+
"content",
16+
"createdAt",
17+
"modelId",
18+
"parentId",
19+
"role",
20+
]
21+
22+
LABELS_NEEDED_FIELDS = [
23+
"annotations.id",
24+
"author.id",
25+
"author.email",
26+
"author.firstname",
27+
"author.lastname",
28+
*(f"chatItems.{field}" for field in CHAT_ITEMS_NEEDED_FIELDS),
29+
"createdAt",
30+
"id",
31+
"isLatestLabelForUser",
32+
"isSentBackToQueue",
33+
"jsonResponse", # This is needed to keep annotations
34+
"labelType",
35+
"modelName",
36+
]
37+
38+
ASSET_DYNAMIC_NEEDED_FIELDS = [
39+
"assetProjectModels.id",
40+
"assetProjectModels.configuration",
41+
"assetProjectModels.name",
42+
"content",
43+
"externalId",
44+
"jsonMetadata",
45+
*(f"labels.{field}" for field in LABELS_NEEDED_FIELDS),
46+
"status",
47+
]
48+
49+
ASSET_STATIC_NEEDED_FIELDS = [
50+
"content",
51+
"externalId",
52+
"jsonMetadata",
53+
"labels.jsonResponse",
54+
"labels.author.id",
55+
"labels.author.email",
56+
"labels.author.firstname",
57+
"labels.author.lastname",
58+
"labels.createdAt",
59+
"labels.isLatestLabelForUser",
60+
"labels.isSentBackToQueue",
61+
"labels.labelType",
62+
"labels.modelName",
63+
"status",
64+
]
65+
1266

1367
def export( # pylint: disable=too-many-arguments, too-many-locals
1468
kili_api_gateway: KiliAPIGateway,
1569
project_id: ProjectId,
1670
asset_filter: AssetFilters,
1771
disable_tqdm: Optional[bool],
72+
include_sent_back_labels: Optional[bool],
1873
) -> Optional[List[Dict[str, Union[List[str], str]]]]:
1974
"""Export the selected assets with their labels into the required format, and save it into a file archive."""
2075
project = kili_api_gateway.get_project(project_id, ["id", "inputType", "jsonInterface"])
2176
input_type = project["inputType"]
2277

78+
fields = get_fields_to_fetch(input_type)
79+
asset_filter.status_in = ["LABELED", "REVIEWED", "TO_REVIEW"]
80+
assets = list(
81+
kili_api_gateway.list_assets(asset_filter, fields, QueryOptions(disable_tqdm=disable_tqdm))
82+
)
83+
cleaned_assets = preprocess_assets(assets, include_sent_back_labels or False)
2384
if input_type == "LLM_RLHF":
24-
return LLMStaticExporter(kili_api_gateway, disable_tqdm).export(
25-
project_id, asset_filter, project["jsonInterface"]
85+
return LLMStaticExporter(kili_api_gateway).export(
86+
cleaned_assets, project_id, project["jsonInterface"]
2687
)
2788
if input_type == "LLM_INSTR_FOLLOWING":
28-
asset_filter.status_in = ["LABELED", "REVIEWED", "TO_REVIEW"]
29-
return LLMDynamicExporter(kili_api_gateway, disable_tqdm).export(
30-
asset_filter, project["jsonInterface"]
31-
)
89+
return LLMDynamicExporter(kili_api_gateway).export(cleaned_assets, project["jsonInterface"])
3290
raise ValueError(f'Project Input type "{input_type}" cannot be used for llm exports.')
91+
92+
93+
def get_fields_to_fetch(input_type: str) -> List[str]:
94+
"""Return the fields to fetch depending on the export type."""
95+
if input_type == "LLM_RLHF":
96+
return ASSET_STATIC_NEEDED_FIELDS
97+
return ASSET_DYNAMIC_NEEDED_FIELDS
98+
99+
100+
def preprocess_assets(assets: List[Dict], include_sent_back_labels: bool) -> List[Dict]:
101+
"""Format labels in the requested format, and filter out autosave labels."""
102+
assets_in_format = []
103+
for asset in assets:
104+
if "labels" in asset:
105+
labels_of_asset = []
106+
for label in asset["labels"]:
107+
labels_of_asset.append(label)
108+
if not include_sent_back_labels:
109+
labels_of_asset = list(
110+
filter(lambda label: label["isSentBackToQueue"] is False, labels_of_asset)
111+
)
112+
if len(labels_of_asset) > 0:
113+
asset["labels"] = labels_of_asset
114+
assets_in_format.append(asset)
115+
if "latestLabel" in asset:
116+
label = asset["latestLabel"]
117+
if label is not None:
118+
asset["latestLabel"] = label
119+
if include_sent_back_labels or asset["latestLabel"]["isSentBackToQueue"] is False:
120+
assets_in_format.append(asset)
121+
return assets_in_format

src/kili/llm/services/export/dynamic.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
"""Handle LLM_INSTR_FOLLOWING project exports."""
22

33
import logging
4-
from typing import Dict, List, Optional, Union
4+
from typing import Dict, List, Union
55

6-
from kili.adapters.kili_api_gateway.helpers.queries import QueryOptions
76
from kili.adapters.kili_api_gateway.kili_api_gateway import KiliAPIGateway
87
from kili.domain.asset.asset import AssetFilters
98

@@ -46,16 +45,13 @@
4645
class LLMDynamicExporter:
4746
"""Handle exports of LLM_RLHF projects."""
4847

49-
def __init__(self, kili_api_gateway: KiliAPIGateway, disable_tqdm: Optional[bool]):
48+
def __init__(self, kili_api_gateway: KiliAPIGateway):
5049
self.kili_api_gateway = kili_api_gateway
51-
self.disable_tqdm = disable_tqdm
5250

5351
def export(
54-
self, asset_filter: AssetFilters, json_interface: Dict
52+
self, assets: List[Dict], json_interface: Dict
5553
) -> List[Dict[str, Union[List[str], str]]]:
5654
"""Asset content depends of each label."""
57-
options = QueryOptions(disable_tqdm=self.disable_tqdm)
58-
assets = self.kili_api_gateway.list_assets(asset_filter, ASSET_NEEDED_FIELDS, options)
5955
export_res = []
6056
for asset in assets:
6157
# obfuscate models here

src/kili/llm/services/export/static.py

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,48 +5,24 @@
55
from pathlib import Path
66
from typing import Dict, List, Optional, Union
77

8-
from kili.adapters.kili_api_gateway.helpers.queries import QueryOptions
98
from kili.adapters.kili_api_gateway.kili_api_gateway import KiliAPIGateway
10-
from kili.domain.asset.asset import AssetFilters
119
from kili.domain.project import ProjectId
1210
from kili.services.asset_import.helpers import SEPARATOR
1311
from kili.services.export.format.llm.types import ExportLLMItem
1412
from kili.use_cases.asset.media_downloader import MediaDownloader
1513
from kili.utils.tempfile import TemporaryDirectory
1614

17-
ASSET_NEEDED_FIELDS = [
18-
"content",
19-
"externalId",
20-
"jsonMetadata",
21-
"labels.jsonResponse",
22-
"labels.author.id",
23-
"labels.author.email",
24-
"labels.author.firstname",
25-
"labels.author.lastname",
26-
"labels.createdAt",
27-
"labels.isLatestLabelForUser",
28-
"labels.labelType",
29-
"labels.modelName",
30-
"status",
31-
]
32-
3315

3416
class LLMStaticExporter:
3517
"""Handle exports of LLM_RLHF projects."""
3618

37-
def __init__(self, kili_api_gateway: KiliAPIGateway, disable_tqdm: Optional[bool]):
19+
def __init__(self, kili_api_gateway: KiliAPIGateway):
3820
self.kili_api_gateway = kili_api_gateway
39-
self.disable_tqdm = disable_tqdm
4021

4122
def export(
42-
self, project_id: ProjectId, asset_filter: AssetFilters, json_interface: Dict
23+
self, assets: List[Dict], project_id: ProjectId, json_interface: Dict
4324
) -> List[Dict[str, Union[List[str], str]]]:
4425
"""Assets are static, with n labels."""
45-
assets = list(
46-
self.kili_api_gateway.list_assets(
47-
asset_filter, ASSET_NEEDED_FIELDS, QueryOptions(disable_tqdm=self.disable_tqdm)
48-
)
49-
)
5026
with TemporaryDirectory() as tmpdirname:
5127
assets = MediaDownloader(
5228
tmpdirname,

src/kili/services/export/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def export_labels( # pylint: disable=too-many-arguments, too-many-locals
5757
asset_filter_kwargs=asset_filter_kwargs,
5858
normalized_coordinates=normalized_coordinates,
5959
label_type_in=label_type_in,
60-
include_sent_back_labels=include_sent_back_labels,
60+
include_sent_back_labels=include_sent_back_labels if label_format != "llm_v1" else False,
6161
)
6262

6363
logger = get_logger(log_level)

tests/unit/llm/services/export/test_dynamic.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -185,9 +185,10 @@
185185
],
186186
"createdAt": "2024-08-06T12:30:42.122Z",
187187
"isLatestLabelForUser": True,
188+
"isSentBackToQueue": False,
188189
"id": "clzief6q2003e7tc91jm46uii",
189190
"jsonResponse": {},
190-
"labelType": "AUTOSAVE",
191+
"labelType": "DEFAULT",
191192
"modelName": None,
192193
}
193194
],
@@ -261,7 +262,7 @@
261262
{
262263
"author": "test+admin@kili-technology.com",
263264
"created_at": "2024-08-06T12:30:42.122Z",
264-
"label_type": "AUTOSAVE",
265+
"label_type": "DEFAULT",
265266
"label": {"COMPARISON_JOB": "A_3", "CLASSIFICATION_JOB": ["BOTH_ARE_GOOD"]},
266267
}
267268
],
@@ -329,7 +330,7 @@
329330
{
330331
"author": "test+admin@kili-technology.com",
331332
"created_at": "2024-08-06T12:30:42.122Z",
332-
"label_type": "AUTOSAVE",
333+
"label_type": "DEFAULT",
333334
"label": {"COMPARISON_JOB": "B_1"},
334335
}
335336
],
@@ -411,7 +412,7 @@
411412
{
412413
"author": "test+admin@kili-technology.com",
413414
"created_at": "2024-08-06T12:30:42.122Z",
414-
"label_type": "AUTOSAVE",
415+
"label_type": "DEFAULT",
415416
"label": {"COMPARISON_JOB": "A_2"},
416417
}
417418
],

tests/unit/llm/services/export/test_static.py

Lines changed: 3 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
},
5555
"createdAt": "2024-08-05T13:03:00.051Z",
5656
"isLatestLabelForUser": True,
57+
"isSentBackToQueue": False,
5758
"labelType": "DEFAULT",
5859
"modelName": None,
5960
}
@@ -77,6 +78,7 @@
7778
},
7879
"createdAt": "2024-08-05T13:03:03.061Z",
7980
"isLatestLabelForUser": True,
81+
"isSentBackToQueue": False,
8082
"labelType": "DEFAULT",
8183
"modelName": None,
8284
}
@@ -101,6 +103,7 @@
101103
},
102104
"createdAt": "2024-08-05T13:03:16.028Z",
103105
"isLatestLabelForUser": True,
106+
"isSentBackToQueue": True,
104107
"labelType": "DEFAULT",
105108
"modelName": None,
106109
}
@@ -257,66 +260,6 @@
257260
}
258261
],
259262
},
260-
{
261-
"raw_data": [
262-
{
263-
"role": "user",
264-
"content": "BLABLABLA",
265-
"id": None,
266-
"chat_id": None,
267-
"model": None,
268-
},
269-
{
270-
"role": "assistant",
271-
"content": "response A1",
272-
"id": None,
273-
"chat_id": None,
274-
"model": None,
275-
},
276-
{
277-
"role": "assistant",
278-
"content": "response B1",
279-
"id": None,
280-
"chat_id": None,
281-
"model": None,
282-
},
283-
{
284-
"role": "user",
285-
"content": "BLIBLIBLI",
286-
"id": None,
287-
"chat_id": None,
288-
"model": None,
289-
},
290-
{
291-
"role": "assistant",
292-
"content": "response A2",
293-
"id": None,
294-
"chat_id": None,
295-
"model": None,
296-
},
297-
{
298-
"role": "assistant",
299-
"content": "response B2",
300-
"id": None,
301-
"chat_id": None,
302-
"model": None,
303-
},
304-
],
305-
"status": "LABELED",
306-
"external_id": "asset#2",
307-
"metadata": {},
308-
"labels": [
309-
{
310-
"author": "test+admin@kili-technology.com",
311-
"created_at": "2024-08-05T13:03:16.028Z",
312-
"label_type": "DEFAULT",
313-
"label": {
314-
"CLASSIFICATION_JOB": ["TIE"],
315-
"TRANSCRIPTION_JOB": "There is only some formatting changes\n",
316-
},
317-
}
318-
],
319-
},
320263
]
321264

322265

0 commit comments

Comments
 (0)