|
8 | 8 | import re |
9 | 9 | import shutil |
10 | 10 | import urllib |
| 11 | +import posixpath |
11 | 12 | import wave |
12 | 13 | from collections import defaultdict |
13 | 14 | from copy import deepcopy |
14 | 15 | from operator import itemgetter |
15 | | -from urllib.parse import urlparse |
| 16 | +from urllib.parse import parse_qsl, quote, urlencode, urlparse, urlsplit, urlunsplit |
16 | 17 |
|
17 | 18 | import numpy as np |
18 | 19 | import requests |
@@ -134,6 +135,55 @@ def _get_upload_dir(project_dir=None, upload_dir=None): |
134 | 135 | return upload_dir |
135 | 136 |
|
136 | 137 |
|
def join_input_url(root_url: str, *file_parts: str) -> str:
    """Append one or more file path parts to a root URL or path prefix.

    Each part has backslashes converted to forward slashes and its
    leading/trailing slashes removed; ``None`` parts and parts that end up
    empty are skipped. The surviving parts are joined with ``/``.

    Three kinds of root are handled:

    * Roots whose path ends in ``/data/local-files`` and whose query has a
      ``d`` parameter: the joined path is appended to every ``d`` value and
      the whole query is re-encoded, leaving ``/`` unescaped.
    * Other roots with a scheme or network location: the percent-encoded
      joined path is appended to the root's path; the query and fragment
      are carried over unchanged.
    * Anything else: the percent-encoded joined path is appended to the
      root string after trimming its trailing slashes.

    Args:
        root_url: Non-blank root URL or path prefix.
        *file_parts: Path fragments to append, in order.

    Returns:
        The combined URL string.

    Raises:
        ValueError: If ``root_url`` is not a non-blank string, or no usable
            file part remains after normalization.
        TypeError: If a file part is neither ``None`` nor a string.
    """
    if not (isinstance(root_url, str) and root_url.strip()):
        raise ValueError("root_url must be a non-empty string")

    segments = []
    for piece in file_parts:
        if piece is None:
            continue
        if not isinstance(piece, str):
            raise TypeError("All file path parts must be strings")
        cleaned = piece.replace("\\", "/").strip("/")
        if cleaned:
            segments.append(cleaned)

    if not segments:
        raise ValueError("At least one file path part must be provided")

    # Segments are non-empty and slash-trimmed, so this never yields a
    # leading slash; the lstrip is kept as a safeguard.
    tail = posixpath.join(*segments).lstrip("/")

    pieces = urlsplit(root_url)
    pairs = parse_qsl(pieces.query, keep_blank_values=True)

    path_is_local_files = pieces.path.rstrip("/").endswith("/data/local-files")
    if path_is_local_files and "d" in {name for name, _ in pairs}:
        rewritten = []
        for name, current in pairs:
            if name != "d":
                rewritten.append((name, current))
            elif current:
                rewritten.append((name, f"{current.rstrip('/')}/{tail}"))
            else:
                # Blank ``d=`` value: the joined path becomes the whole value.
                rewritten.append((name, tail))
        new_query = urlencode(rewritten, doseq=True, quote_via=quote, safe="/")
        return urlunsplit(pieces._replace(query=new_query))

    quoted_tail = quote(tail, safe="/")
    if not (pieces.scheme or pieces.netloc):
        # Plain prefix without scheme/host: concatenate onto the raw string.
        return f"{root_url.rstrip('/')}/{quoted_tail}"

    base_path = pieces.path.rstrip("/")
    if base_path:
        full_path = posixpath.join(base_path, quoted_tail)
    else:
        full_path = f"/{quoted_tail}"
    return urlunsplit(pieces._replace(path=full_path))
| 185 | + |
| 186 | + |
137 | 187 | def download( |
138 | 188 | url, |
139 | 189 | output_dir, |
|
0 commit comments