Skip to content

Commit 4cce161

Browse files
authored
Merge pull request #2045 from kili-technology/feature/lab-4341-aakd-i-need-to-validate-file-extensions-in-import-service
feat(lab-4341): check file type before importing
2 parents d0ee3d6 + 8b4a731 commit 4cce161

4 files changed

Lines changed: 253 additions & 4 deletions

File tree

src/kili/services/asset_import/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,5 @@ def import_assets( # pylint: disable=too-many-arguments
6363
asset_importer = importer_by_type[input_type](*importer_params)
6464
casted_assets = cast(list[AssetLike], assets)
6565
asset_importer.check_asset_contents(casted_assets)
66+
asset_importer.check_file_extensions(casted_assets, input_type)
6667
return asset_importer.import_assets(assets=casted_assets, input_type=input_type)

src/kili/services/asset_import/base.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44
import logging
55
import mimetypes
66
import os
7+
import urllib.parse
78
import warnings
89
from collections.abc import Callable
910
from concurrent.futures import ThreadPoolExecutor
1011
from itertools import repeat
1112
from json import dumps, loads
12-
from pathlib import Path
13+
from pathlib import Path, PurePosixPath
1314
from typing import (
1415
TYPE_CHECKING,
1516
Any,
@@ -34,6 +35,7 @@
3435
from kili.domain.project import InputType, ProjectId
3536
from kili.domain.types import ListOrTuple
3637
from kili.services.asset_import.constants import (
38+
ALLOWED_EXTENSIONS_BY_INPUT_TYPE,
3739
IMPORT_BATCH_SIZE,
3840
project_compatible_mimetypes,
3941
)
@@ -524,6 +526,45 @@ def check_asset_contents(assets: list[AssetLike]) -> None:
524526
" multi_layer_content and empty json_content"
525527
)
526528

529+
@staticmethod
def check_file_extensions(assets: list[AssetLike], input_type: str) -> None:
    """Validate asset file extensions against those allowed for ``input_type``.

    For every asset, the ``content`` value and each ``multi_layer_content``
    entry (its ``path``, ``url`` or ``content`` value, first non-empty one)
    are inspected. Input types that have no entry in
    ``ALLOWED_EXTENSIONS_BY_INPUT_TYPE`` are not validated.

    A value is skipped when it is not a string, when it has no extension
    (e.g. extensionless URLs), or when the text following its last dot is
    not purely alphanumeric. The last rule keeps raw text such as
    ``"Hello. This is text"`` from being mistaken for a file name.

    Args:
        assets: Assets about to be imported.
        input_type: Input type of the target project (e.g. ``"IMAGE"``).

    Raises:
        ImportValidationError: On the first asset with a disallowed extension.
    """
    allowed = ALLOWED_EXTENSIONS_BY_INPUT_TYPE.get(input_type)
    if allowed is None:
        return

    def get_ext(path_or_url: str) -> str:
        """Return the lower-cased extension of a path or URL, or "" if none."""
        path = urllib.parse.urlparse(path_or_url).path if is_url(path_or_url) else path_or_url
        suffix = PurePosixPath(path).suffix.lower()
        # PurePosixPath treats everything after the last dot as the suffix, so
        # free text ("Hello. World") or a Windows path with a dotted directory
        # ("dir.v1\\file") produces a bogus suffix with spaces or separators.
        # Only a dot followed by alphanumerics is considered a real extension.
        return suffix if suffix[1:].isalnum() else ""

    def check(value: object, asset: AssetLike) -> None:
        """Raise if ``value`` is a string carrying a disallowed extension."""
        if not isinstance(value, str) or not value:
            return
        ext = get_ext(value)
        if ext and ext not in allowed:
            raise ImportValidationError(
                f"File extension '{ext}' is not allowed for {input_type} projects"
                f" (asset external_id='{asset.get('external_id', 'unknown')}')."
                f" Allowed extensions: {', '.join(sorted(allowed))}"
            )

    for asset in assets:
        check(asset.get("content"), asset)
        for layer in asset.get("multi_layer_content") or ():
            check(layer.get("path") or layer.get("url") or layer.get("content", ""), asset)
567+
527568
def _can_upload_from_local_data(self) -> bool:
528569
user_me = self.kili.kili_api_gateway.get_current_user(fields=("email",))
529570
options = QueryOptions(first=1, disable_tqdm=True)

src/kili/services/asset_import/constants.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,28 @@
1010
MB_SIZE = 1024**2
1111
LARGE_IMAGE_THRESHOLD_SIZE = 30 * MB_SIZE
1212
MAX_WIDTH_OR_HEIGHT_NON_TILED = 10000
13+
14+
# Lower-case, dot-prefixed file extensions accepted for each project input type.
# Built from a (input type, extensions) table so that each row reads as one line.
ALLOWED_EXTENSIONS_BY_INPUT_TYPE: dict[str, frozenset[str]] = {
    input_type: frozenset(extensions)
    for input_type, extensions in (
        ("AUDIO", (".flac", ".mp3", ".mp4", ".wav")),
        ("GEOSPATIAL", (".tif", ".tiff", ".jp2", ".ntf", ".nitf")),
        (
            "IMAGE",
            (
                ".jpeg",
                ".jpg",
                ".png",
                ".bmp",
                ".gif",
                ".webp",
                ".ico",
                ".tif",
                ".tiff",
                ".jp2",
                ".ntf",
                ".nitf",
            ),
        ),
        ("LLM_RLHF", (".json",)),
        ("PDF", (".pdf",)),
        ("TEXT", (".txt", ".csv")),
        ("VIDEO", (".mp4", ".mkv", ".3gp", ".avi", ".m4v", ".mov", ".webm")),
    )
}

tests/unit/services/asset_import/test_import_common.py

Lines changed: 185 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from kili.core.graphql.operations.asset.mutations import GQL_APPEND_MANY_ASSETS
77
from kili.domain.project import ProjectId
88
from kili.services.asset_import import import_assets
9-
from kili.services.asset_import.exceptions import MimeTypeError
9+
from kili.services.asset_import.exceptions import ImportValidationError
1010
from tests.unit.services.asset_import.base import ImportTestCase
1111
from tests.unit.services.asset_import.mocks import (
1212
mocked_request_signed_urls,
@@ -20,11 +20,12 @@
2020
@patch("kili.utils.bucket.upload_data_via_rest", mocked_upload_data_via_rest)
2121
class TestContentType(ImportTestCase):
2222
def test_cannot_upload_an_image_to_video_project(self, *_):
23-
self.kili.kili_api_gateway.get_project.return_value = {"inputType": "VIDEO_LEGACY"}
23+
self.kili.kili_api_gateway.get_project.return_value = {"inputType": "VIDEO"}
2424
url = "https://storage.googleapis.com/label-public-staging/car/car_1.jpg"
2525
path_image = self.downloader(url)
2626
assets = [{"content": path_image, "external_id": "image"}]
27-
with pytest.raises(MimeTypeError):
27+
# Extension check runs before MIME type check, so ImportValidationError is raised first
28+
with pytest.raises(ImportValidationError):
2829
import_assets(self.kili, ProjectId(self.project_id), assets, disable_tqdm=True)
2930

3031
def test_cannot_import_files_not_found_to_an_image_project(self, *_):
@@ -84,3 +85,184 @@ def test_import_assets_verify(self, mocked_verify_batch_imported, *_):
8485
mocked_verify_batch_imported.assert_not_called()
8586
import_assets(self.kili, ProjectId("project_id"), assets, verify=True)
8687
mocked_verify_batch_imported.assert_called_once()
88+
89+
90+
@patch("kili.utils.bucket.generate_unique_id", mocked_unique_id)
@patch("kili.utils.bucket.request_signed_urls", mocked_request_signed_urls)
@patch("kili.utils.bucket.upload_data_via_rest", mocked_upload_data_via_rest)
class TestFileExtensionValidation(ImportTestCase):
    """Tests that the import service validates file extensions before uploading."""

    # --- helpers (not collected as tests; they run inside the patched test methods) ---
    def _import(self, input_type, assets):
        """Configure the mocked project's input type and import ``assets``."""
        self.kili.kili_api_gateway.get_project.return_value = {"inputType": input_type}
        return import_assets(self.kili, ProjectId(self.project_id), assets)

    def _assert_rejected(self, input_type, assets, ext_pattern):
        """Expect an ImportValidationError whose message matches ``ext_pattern``."""
        with pytest.raises(ImportValidationError, match=ext_pattern):
            self._import(input_type, assets)

    # --- IMAGE ---
    def test_image_project_rejects_video_extension(self, *_):
        assets = [{"content": "https://example.com/clip.mp4", "external_id": "wrong"}]
        self._assert_rejected("IMAGE", assets, r"\.mp4")

    def test_image_project_rejects_audio_extension(self, *_):
        assets = [{"content": "https://example.com/sound.mp3", "external_id": "wrong"}]
        self._assert_rejected("IMAGE", assets, r"\.mp3")

    def test_image_project_accepts_jpg_extension(self, *_):
        assets = [{"content": "https://example.com/image.jpg", "external_id": "ok", "id": "uid"}]
        # Should not raise ImportValidationError
        self._import("IMAGE", assets)

    def test_image_project_accepts_tif_extension(self, *_):
        assets = [{"content": "https://example.com/geo.tif", "external_id": "ok", "id": "uid"}]
        self._import("IMAGE", assets)

    # --- VIDEO ---
    def test_video_project_rejects_image_extension(self, *_):
        assets = [{"content": "https://example.com/photo.jpg", "external_id": "wrong"}]
        self._assert_rejected("VIDEO", assets, r"\.jpg")

    def test_video_project_rejects_pdf_extension(self, *_):
        assets = [{"content": "https://example.com/doc.pdf", "external_id": "wrong"}]
        self._assert_rejected("VIDEO", assets, r"\.pdf")

    def test_video_project_accepts_mp4_extension(self, *_):
        assets = [{"content": "https://example.com/vid.mp4", "external_id": "ok"}]
        self._import("VIDEO", assets)

    def test_video_project_accepts_mkv_extension(self, *_):
        assets = [{"content": "https://example.com/vid.mkv", "external_id": "ok"}]
        self._import("VIDEO", assets)

    # --- AUDIO ---
    def test_audio_project_rejects_image_extension(self, *_):
        assets = [{"content": "https://example.com/photo.jpg", "external_id": "wrong"}]
        self._assert_rejected("AUDIO", assets, r"\.jpg")

    def test_audio_project_rejects_pdf_extension(self, *_):
        assets = [{"content": "https://example.com/doc.pdf", "external_id": "wrong"}]
        self._assert_rejected("AUDIO", assets, r"\.pdf")

    def test_audio_project_accepts_mp3_extension(self, *_):
        assets = [{"content": "https://example.com/audio.mp3", "external_id": "ok"}]
        self._import("AUDIO", assets)

    def test_audio_project_accepts_wav_extension(self, *_):
        assets = [{"content": "https://example.com/audio.wav", "external_id": "ok"}]
        self._import("AUDIO", assets)

    # --- PDF ---
    def test_pdf_project_rejects_image_extension(self, *_):
        assets = [{"content": "https://example.com/photo.jpg", "external_id": "wrong"}]
        self._assert_rejected("PDF", assets, r"\.jpg")

    def test_pdf_project_rejects_video_extension(self, *_):
        assets = [{"content": "https://example.com/clip.mp4", "external_id": "wrong"}]
        self._assert_rejected("PDF", assets, r"\.mp4")

    def test_pdf_project_accepts_pdf_extension(self, *_):
        assets = [{"content": "https://example.com/doc.pdf", "external_id": "ok"}]
        self._import("PDF", assets)

    # --- GEOSPATIAL ---
    def test_geospatial_project_rejects_image_extension(self, *_):
        assets = [{"content": "https://example.com/photo.jpg", "external_id": "wrong"}]
        self._assert_rejected("GEOSPATIAL", assets, r"\.jpg")

    def test_geospatial_project_accepts_tif_extension(self, *_):
        assets = [{"content": "https://example.com/geo.tif", "external_id": "ok"}]
        self._import("GEOSPATIAL", assets)

    def test_geospatial_project_accepts_jp2_extension(self, *_):
        assets = [{"content": "https://example.com/geo.jp2", "external_id": "ok"}]
        self._import("GEOSPATIAL", assets)

    def test_geospatial_project_rejects_wrong_extension_in_multi_layer(self, *_):
        assets = [
            {
                "multi_layer_content": [
                    {"path": "/local/layer.jpg", "name": "layer1"},
                ],
                "external_id": "wrong_multi_layer",
            }
        ]
        self._assert_rejected("GEOSPATIAL", assets, r"\.jpg")

    def test_geospatial_project_accepts_correct_extension_in_multi_layer(self, *_):
        assets = [
            {
                "multi_layer_content": [
                    {"path": "/local/layer.tif", "name": "layer1"},
                    {"path": "/local/layer2.tiff", "name": "layer2"},
                ],
                "external_id": "ok_multi_layer",
            }
        ]
        # Only extension validation is under test. The local files do not exist,
        # so a later stage may raise; that must neither be required (as a bare
        # ``pytest.raises(Exception)`` would demand) nor fail this test.
        try:
            self._import("GEOSPATIAL", assets)
        except ImportValidationError as err:
            pytest.fail(f"Extension validation unexpectedly rejected the asset: {err}")
        except Exception:  # pylint: disable=broad-except
            pass

    # --- TEXT ---
    def test_text_project_rejects_video_extension(self, *_):
        assets = [{"content": "https://example.com/clip.mp4", "external_id": "wrong"}]
        self._assert_rejected("TEXT", assets, r"\.mp4")

    def test_text_project_accepts_txt_extension(self, *_):
        assets = [{"content": "https://example.com/file.txt", "external_id": "ok"}]
        self._import("TEXT", assets)

    def test_text_project_skips_validation_for_raw_text(self, *_):
        assets = [{"content": "this is raw text with no extension", "external_id": "ok"}]
        self._import("TEXT", assets)

    # --- LLM_RLHF ---
    def test_llm_project_rejects_non_json_extension(self, *_):
        assets = [{"content": "https://example.com/data.txt", "external_id": "wrong"}]
        self._assert_rejected("LLM_RLHF", assets, r"\.txt")

    def test_llm_project_accepts_json_extension(self, *_):
        assets = [{"content": "https://example.com/data.json", "external_id": "ok"}]
        self._import("LLM_RLHF", assets)

    # --- no extension in URL skips validation ---
    def test_extensionless_url_skips_validation_for_image_project(self, *_):
        assets = [{"content": "https://example.com/no-ext", "external_id": "ok", "id": "uid"}]
        self._import("IMAGE", assets)

    def test_extensionless_url_skips_validation_for_video_project(self, *_):
        assets = [{"content": "https://hosted-data", "external_id": "ok"}]
        self._import("VIDEO", assets)

0 commit comments

Comments
 (0)