Skip to content

Commit 23e86a8

Browse files
fix: Windows path backslash issue in COCO to LabelStudio path conversion
GitOrigin-RevId: d94a90f9ac137e147b28ee56198e69c4627b5099
1 parent 4313f88 commit 23e86a8

5 files changed

Lines changed: 101 additions & 16 deletions

File tree

src/label_studio_sdk/converter/imports/coco.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
1-
import os
21
import json # better to use "import ujson as json" for the best performance
32
import uuid
43
import logging
54
from PIL import Image
65

7-
from label_studio_sdk.converter.utils import ExpandFullPath
6+
from label_studio_sdk.converter.utils import ExpandFullPath, join_input_url
87
from label_studio_sdk.converter.imports.label_config import generate_label_config
98

109
logger = logging.getLogger("root")
@@ -22,7 +21,7 @@ def new_task(out_type, root_url, file_name):
2221
dict: Label Studio task structure with image data and empty result array
2322
"""
2423
return {
25-
"data": {"image": os.path.join(root_url, file_name)},
24+
"data": {"image": join_input_url(root_url, file_name)},
2625
# 'annotations' or 'predictions'
2726
out_type: [
2827
{

src/label_studio_sdk/converter/imports/pathtrack.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
from types import SimpleNamespace
1212

13+
from label_studio_sdk.converter.utils import join_input_url
14+
1315
logger = logging.getLogger()
1416
logger.setLevel(logging.DEBUG)
1517

@@ -214,15 +216,12 @@ def convert_dataset(
214216
logger.info("Convert dataset start: %s", root_dir)
215217
tasks = []
216218

217-
if not root_url.endswith("/"):
218-
root_url += "/"
219-
220219
for d in os.listdir(root_dir):
221220
shot_dir = os.path.join(root_dir, d)
222221
if not os.path.isdir(shot_dir):
223222
continue
224223

225-
input_url = root_url + d + "/video.mp4"
224+
input_url = join_input_url(root_url, d, "video.mp4")
226225
label_file = os.path.join(shot_dir, "gt/gt.txt")
227226
info_file = os.path.join(shot_dir, "info.xml")
228227

src/label_studio_sdk/converter/imports/yolo.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,8 @@
66

77
from PIL import Image
88
from typing import Optional, Tuple
9-
from urllib.request import (
10-
pathname2url,
11-
) # for converting "+","*", etc. in file paths to appropriate urls
129

13-
from label_studio_sdk.converter.utils import ExpandFullPath
10+
from label_studio_sdk.converter.utils import ExpandFullPath, join_input_url
1411
from label_studio_sdk.converter.imports.label_config import generate_label_config
1512

1613
logger = logging.getLogger("root")
@@ -79,12 +76,9 @@ def convert_yolo_to_ls(
7976
if not image_file_found_flag:
8077
continue
8178

82-
image_root_url += "" if image_root_url.endswith("/") else "/"
8379
task = {
8480
"data": {
85-
# eg. '../../foo+you.py' -> '../../foo%2Byou.py'
86-
"image": image_root_url
87-
+ str(pathname2url(image_file))
81+
"image": join_input_url(image_root_url, image_file)
8882
}
8983
}
9084

src/label_studio_sdk/converter/utils.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
import re
99
import shutil
1010
import urllib
11+
import posixpath
1112
import wave
1213
from collections import defaultdict
1314
from copy import deepcopy
1415
from operator import itemgetter
15-
from urllib.parse import urlparse
16+
from urllib.parse import parse_qsl, quote, urlencode, urlparse, urlsplit, urlunsplit
1617

1718
import numpy as np
1819
import requests
@@ -134,6 +135,55 @@ def _get_upload_dir(project_dir=None, upload_dir=None):
134135
return upload_dir
135136

136137

138+
def join_input_url(root_url: str, *file_parts: str) -> str:
    """Join a root URL/prefix and file path parts in a platform-independent way.

    The converter supports both regular URL prefixes (http/s3/relative paths)
    and Label Studio local-files URLs with the ``?d=`` query parameter.

    Args:
        root_url: URL or path prefix the file path is appended to.
        *file_parts: Path fragments to append. Backslashes are converted to
            forward slashes, surrounding slashes are stripped, and ``None``
            or empty fragments are ignored.

    Returns:
        The joined URL. For local-files roots the path is appended to the
        value of every ``d`` query parameter; otherwise it is percent-encoded
        and appended to the path component, keeping any query string and
        fragment of ``root_url`` after the path.

    Raises:
        ValueError: If ``root_url`` is not a non-blank string, or if no
            non-empty path fragment remains after normalization.
        TypeError: If a fragment is neither ``None`` nor a string.
    """
    if not isinstance(root_url, str) or not root_url.strip():
        raise ValueError("root_url must be a non-empty string")

    normalized_parts = []
    for part in file_parts:
        if part is None:
            continue
        if not isinstance(part, str):
            raise TypeError("All file path parts must be strings")

        # Windows separators become URL separators; edge slashes are dropped
        # so the fragments can be joined with exactly one "/" between them.
        normalized_part = part.replace("\\", "/").strip("/")
        if normalized_part:
            normalized_parts.append(normalized_part)

    if not normalized_parts:
        raise ValueError("At least one file path part must be provided")

    # Every fragment is non-empty and has no leading/trailing slash, so a
    # plain join is enough to build the relative path.
    relative_path = "/".join(normalized_parts)
    parsed = urlsplit(root_url)
    query_items = parse_qsl(parsed.query, keep_blank_values=True)
    path_prefix = parsed.path.rstrip("/")

    # Recognize the local-files endpoint with or without a leading slash
    # ("/data/local-files/?d=" and "data/local-files/?d=").
    is_local_files_url = f"/{path_prefix}".endswith("/data/local-files") and any(
        key == "d" for key, _ in query_items
    )
    if is_local_files_url:
        updated_query_items = []
        for key, value in query_items:
            if key == "d":
                value = f"{value.rstrip('/')}/{relative_path}" if value else relative_path
            updated_query_items.append((key, value))
        # quote (not quote_plus) so spaces become %20; "/" stays readable.
        encoded_query = urlencode(updated_query_items, doseq=True, quote_via=quote, safe="/")
        return urlunsplit((parsed.scheme, parsed.netloc, parsed.path, encoded_query, parsed.fragment))

    encoded_path = quote(relative_path, safe="/")
    if parsed.scheme or parsed.netloc or parsed.query or parsed.fragment:
        # Insert the file path into the path component so that an existing
        # query string or fragment stays after it instead of swallowing it.
        joined_path = f"{path_prefix}/{encoded_path}"
        return urlunsplit((parsed.scheme, parsed.netloc, joined_path, parsed.query, parsed.fragment))

    # Plain prefix (e.g. "../../images"): keep it verbatim and append.
    return f"{root_url.rstrip('/')}/{encoded_path}"
185+
186+
137187
def download(
138188
url,
139189
output_dir,
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import pytest
2+
3+
from label_studio_sdk.converter.utils import join_input_url
4+
5+
6+
def test_join_input_url_local_files_root_with_windows_style_path_from_issue():
    """A backslash-separated file name is appended to an empty ``?d=`` value using forward slashes."""
    expected = "/data/local-files/?d=images/test.jpg"
    assert join_input_url("/data/local-files/?d=", r"images\test.jpg") == expected
10+
11+
12+
def test_join_input_url_local_files_root_with_existing_prefix_and_spaces():
    """An existing ``?d=`` prefix is kept and spaces in the file path are encoded as ``%20``."""
    expected = "/data/local-files/?d=val2017/sub%20dir/image%201.jpg"
    assert join_input_url("/data/local-files/?d=val2017", "sub dir/image 1.jpg") == expected
16+
17+
18+
def test_join_input_url_http_root_encodes_url_path():
    """For an https root the file path is percent-encoded and appended to the URL path."""
    expected = "https://example.com/images/sub%20dir/image%201.jpg"
    assert join_input_url("https://example.com/images", "sub dir/image 1.jpg") == expected
22+
23+
24+
def test_join_input_url_s3_root_keeps_prefix_and_slashes():
    """An s3 root with a trailing slash keeps its key prefix and gains no duplicate slash."""
    expected = "s3://bucket/coco/val2017/nested/image.jpg"
    assert join_input_url("s3://bucket/coco/val2017/", "nested/image.jpg") == expected
28+
29+
30+
def test_join_input_url_local_files_root_with_multiple_parts():
    """Several path fragments, one containing a backslash, are joined with forward slashes."""
    expected = "/data/local-files/?d=images/nested/test.jpg"
    assert join_input_url("/data/local-files/?d=", "images\\nested", "test.jpg") == expected
34+
35+
36+
def test_join_input_url_rejects_empty_root_url():
    """An empty root URL is rejected with a ``ValueError`` carrying the expected message."""
    expected_message = "root_url must be a non-empty string"
    with pytest.raises(ValueError, match=expected_message):
        join_input_url("", "image.jpg")
39+
40+
41+
def test_join_input_url_rejects_empty_path_parts_after_normalization():
    """Fragments that are empty, only a slash, or ``None`` leave nothing to join and raise ``ValueError``."""
    expected_message = "At least one file path part must be provided"
    with pytest.raises(ValueError, match=expected_message):
        join_input_url("/data/local-files/?d=", "", "/", None)

0 commit comments

Comments
 (0)