Implement energon prepare with a remote dataset (using local temp)

voegtlel · voegtlel · commit 8c093f4e30ba · 2026-04-08T17:57:27.000+02:00
diff --git a/src/megatron/energon/epathlib/epath.py b/src/megatron/energon/epathlib/epath.py
@@ -272,12 +272,27 @@ def mkdir(self, exist_ok: bool = True, parents: bool = False):
 
     def glob(self, pattern) -> Generator["EPath", None, None]:
         search_path_pattern = (self / pattern)._internal_str_path
+        # Remote (MSC S3) globbing matches keys of the form ``bucket/key``, so a leading ``/``
+        # breaks wcmatch (pattern ``/b/**`` never matches ``b/parts/x``). Returned keys may repeat
+        # this path's own prefix; it is stripped in the loop to avoid ``/b/b/parts/...``.
+        if not self.is_local() and search_path_pattern.startswith("/"):
+            search_path_pattern = search_path_pattern.lstrip("/")
+
+        root_prefix = str(self.internal_path).lstrip("/")  # same slash-less form as the pattern
 
         for path in self.fs.glob(search_path_pattern):
             assert isinstance(path, str)
 
+            rel = path
+            if not self.is_local() and root_prefix:
+                pfx = root_prefix + "/"
+                if rel.startswith(pfx):
+                    rel = rel[len(pfx) :]
+                elif rel == root_prefix:
+                    rel = "."  # the match is this path itself
+
             new_path = EPath(self)
-            new_path.internal_path = self._resolve(self.internal_path / PurePosixPath(path))
+            new_path.internal_path = self._resolve(self.internal_path / PurePosixPath(rel))
 
             yield new_path
 
diff --git a/src/megatron/energon/flavors/webdataset/indexing.py b/src/megatron/energon/flavors/webdataset/indexing.py
@@ -64,7 +64,6 @@ def __init__(
 
         # Initialize SQLite connection
         # Only supporting local file system, because sqlite does not support remote file systems.
-        # TODO: Implement remote file systems. Maybe create locally in tmp then upload?
         path = self.sqlite_path.local_path()
         path.parent.mkdir(parents=True, exist_ok=True)
         self.db = sqlite3.connect(path)
diff --git a/src/megatron/energon/flavors/webdataset/prepare.py b/src/megatron/energon/flavors/webdataset/prepare.py
diff --git a/src/megatron/energon/tools/prepare.py b/src/megatron/energon/tools/prepare.py
@@ -6,6 +6,7 @@
 import json
 import re
 import typing
+from pathlib import Path
 from types import FunctionType
 from typing import Any, List, Optional, Tuple, Type
 
@@ -184,6 +185,15 @@ def printify_json(data: Any) -> Any:
     help='Field mapping in JSON format (e.g., \'{"image": "jpg", "caption": "txt"}\'). If not set in non-interactive mode, a sample loader from template will be created. Use with --sample-type. Only applies if sample_type is not set to CrudeWebdataset.',
     default=None,
 )
+@click.option(
+    "--tmp-path",
+    type=click.Path(path_type=Path),
+    default=None,
+    help=(
+        "When PATH is remote (e.g. S3), directory where index.sqlite is built locally before upload. "
+        "Default: a new directory under /tmp (removed after success)."
+    ),
+)
 def command(
     path: EPath,
     progress: bool,
@@ -203,6 +213,7 @@ def command(
     field_map: Optional[str],
     skip_dataset_yaml: bool,
     dataset_yaml_name: str,
+    tmp_path: Optional[Path],
 ):
     """Prepare WebDataset for use with energon.
 
@@ -348,6 +359,7 @@ def progress_fn(els, length=None):
         workers=num_workers,
         media_filter=media_filter_config,
         fix_duplicates=fix_duplicates,
+        index_sqlite_tmp_path=tmp_path,
     )
 
     found_types = list(found_types)
diff --git a/src/megatron/energon/tools/prepare_media.py b/src/megatron/energon/tools/prepare_media.py
@@ -2,6 +2,9 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 
+from pathlib import Path
+from typing import Optional
+
 import click
 
 from megatron.energon.epathlib import EPath
@@ -44,13 +47,23 @@
     is_flag=True,
     help="Media detection by standard file extensions.",
 )
+@click.option(
+    "--tmp-path",
+    type=click.Path(path_type=Path),
+    default=None,
+    help=(
+        "When PATH is remote, directory where index.sqlite is built locally before upload. "
+        "Default: a new directory under /tmp (removed after success)."
+    ),
+)
 def command(
     path: EPath,
     progress: bool,
     num_workers: int,
     media_metadata_by_glob: str | None,
     media_metadata_by_header: bool,
     media_metadata_by_extension: bool,
+    tmp_path: Optional[Path],
 ):
     """Prepare a filesystem dataset by collecting media metadata."""
 
@@ -83,6 +96,7 @@ def progress_fn(els, length=None):
             media_filter=media_filter_config,
             workers=num_workers,
             progress_fn=progress_fn,
+            index_sqlite_tmp_path=tmp_path,
         )
 
         click.echo(f"Done. Stored metadata for {count} files.")
diff --git a/tests/s3_emulator/handler.py b/tests/s3_emulator/handler.py
@@ -6,7 +6,7 @@
 from hashlib import md5
 from http import HTTPStatus
 from http.server import BaseHTTPRequestHandler
-from typing import Protocol
+from typing import Literal, Protocol
 
 from .auth import InvalidSignature, S3Auth
 from .state import S3State
@@ -112,6 +112,37 @@ def _handle_write(self):
 
         qs = _up.parse_qs(parsed.query, keep_blank_values=True)
 
+        # S3 CopyObject: PUT to destination key with x-amz-copy-source and empty body.
+        copy_src = (
+            self.headers.get("x-amz-copy-source") or self.headers.get("X-Amz-Copy-Source") or ""
+        ).strip()
+        if copy_src:
+            if not bucket:
+                self._send_error(HTTPStatus.BAD_REQUEST, "Bucket must be specified")
+                return
+            if key == "":
+                self._send_error(HTTPStatus.BAD_REQUEST, "CopyObject requires an object key")
+                return
+            try:
+                src_bucket, src_key = _parse_copy_source(copy_src)
+            except ValueError as err:
+                self._send_error(HTTPStatus.BAD_REQUEST, str(err))
+                return
+            try:
+                data = self.server.state.copy_object(bucket, key, src_bucket, src_key)
+            except FileNotFoundError:
+                self._send_error(HTTPStatus.NOT_FOUND, "NoSuchKey")
+                return
+            xml = (
+                '<?xml version="1.0" encoding="UTF-8"?>'
+                "<CopyObjectResult>"
+                f"<LastModified>{_escape_xml(formatdate(usegmt=True))}</LastModified>"
+                f"<ETag>&quot;{_escape_xml(_etag(data))}&quot;</ETag>"
+                "</CopyObjectResult>"
+            ).encode()
+            self._send_bytes(xml, status=HTTPStatus.OK, content_type="application/xml")
+            return
+
         # Multipart: upload part
         if "uploadId" in qs and "partNumber" in qs:
             upload_id = qs["uploadId"][0]
@@ -160,15 +191,15 @@ def _handle_read(self, listing: bool, only_headers: bool = False):
             self._send_error(HTTPStatus.BAD_REQUEST, "Bucket must be specified")
             return
 
-        if key == "":  # List bucket contents
+        if key == "":  # List bucket contents (ListObjects / ListObjectsV2)
             if not listing:
-                # We treat listing with GET only
                 try:
                     objects = self.server.state.list_objects(bucket)
                 except KeyError:
                     self._send_error(HTTPStatus.NOT_FOUND, "Bucket not found")
                     return
-                xml_body = self._render_bucket_list(bucket, objects)
+                qs = _up.parse_qs(parsed.query, keep_blank_values=True)
+                xml_body = self._render_list_bucket_result(bucket, objects, qs)
                 self._send_bytes(xml_body, content_type="application/xml")
             else:
                 self._send_error(HTTPStatus.NOT_IMPLEMENTED, "Listing not implemented")
@@ -359,51 +390,135 @@ def _send_bytes(
         if self.command != "HEAD":
             self.wfile.write(data)
 
-    @staticmethod
-    def _render_bucket_list(bucket: str, objects: list[str]) -> bytes:
-        """Generate an XML listing of objects in a bucket.
+    def _render_list_bucket_result(
+        self,
+        bucket: str,
+        all_keys: list[str],
+        qs: dict[str, list[str]],
+    ) -> bytes:
+        """Render a ListObjectsV2-style ``ListBucketResult`` XML page for ``bucket``.
+
+        ``qs`` supplies ``prefix``, ``delimiter``, ``max-keys``, ``continuation-token`` and
+        ``start-after``. With a delimiter, nested keys such as ``parts/data-0.tar`` collapse
+        into ``CommonPrefixes`` (clients, e.g. MSC, expect that) instead of flat ``Contents``.
+        """
+        prefix = (qs.get("prefix") or [""])[0]
+        delimiter = (qs.get("delimiter") or [None])[0]
+        max_keys_s = (qs.get("max-keys") or qs.get("maxkeys") or ["1000"])[0]
+        try:
+            max_keys = max(1, min(int(max_keys_s), 1000))  # clamp to 1..1000
+        except ValueError:
+            max_keys = 1000  # non-numeric value: use the default page size
 
-        Args:
-            bucket: The bucket name.
-            objects: List of object keys in the bucket.
+        continuation = (qs.get("continuation-token") or [""])[0]
+        start_after = (qs.get("start-after") or [""])[0]
+        exclusive_after = continuation or start_after  # continuation token takes precedence
 
-        Returns:
-            The XML document as bytes.
-        """
-        entries = []
         now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")
-        for key in objects:
-            try:
-                data = S3RequestHandler.server.state.get_object(bucket, key)  # type: ignore[attr-defined]
-                size = len(data)
-                etag = _etag(data)
-            except Exception:  # noqa: BLE001
-                size = 0
-                etag = '""'
-            entries.append(
-                "<Contents>"
-                f"<Key>{_escape_xml(key)}</Key>"
-                f"<LastModified>{now}</LastModified>"
-                f"<ETag>{etag}</ETag>"
-                f"<Size>{size}</Size>"
-                "</Contents>"
-            )
-        obj_elems = "".join(entries)
-        xml = (
-            '<?xml version="1.0" encoding="UTF-8"?>'
-            "<ListBucketResult>"
-            f"<Name>{_escape_xml(bucket)}</Name>"
-            f"{obj_elems}"
-            "</ListBucketResult>"
-        )
-        return xml.encode()
+        state = self.server.state
+
+        items: list[tuple[Literal["cp", "key"], str]] = []  # ("cp", prefix) or ("key", object key)
+        if not delimiter:  # flat listing: every key under the prefix
+            for k in sorted(all_keys):
+                if k.startswith(prefix):
+                    items.append(("key", k))
+        else:
+            common: set[str] = set()
+            contents: list[str] = []
+            for k in sorted(all_keys):
+                if not k.startswith(prefix):
+                    continue
+                relative = k[len(prefix) :]
+                if delimiter in relative:
+                    idx = relative.index(delimiter)
+                    common.add(prefix + relative[: idx + len(delimiter)])  # roll up to first delimiter
+                else:
+                    contents.append(k)
+            for cp in sorted(common):
+                items.append(("cp", cp))
+            for ck in sorted(contents):
+                items.append(("key", ck))
+            items.sort(key=lambda x: x[1])  # interleave prefixes and keys by name
+
+        if exclusive_after:
+            items = [it for it in items if it[1] > exclusive_after]  # names strictly after the marker
+
+        page = items[:max_keys]
+        truncated = len(items) > max_keys
+        next_token = page[-1][1] if truncated and page else ""  # last name on this page
+
+        fragments: list[str] = [
+            '<?xml version="1.0" encoding="UTF-8"?>',
+            "<ListBucketResult>",
+            f"<Name>{_escape_xml(bucket)}</Name>",
+            f"<Prefix>{_escape_xml(prefix)}</Prefix>",
+            f"<KeyCount>{len(page)}</KeyCount>",
+            f"<MaxKeys>{max_keys}</MaxKeys>",
+            f"<IsTruncated>{str(truncated).lower()}</IsTruncated>",
+        ]
+        if delimiter:
+            fragments.append(f"<Delimiter>{_escape_xml(delimiter)}</Delimiter>")
+        if truncated and next_token:
+            fragments.append(f"<NextContinuationToken>{_escape_xml(next_token)}</NextContinuationToken>")
+
+        for kind, path in page:
+            if kind == "cp":
+                fragments.append(f"<CommonPrefixes><Prefix>{_escape_xml(path)}</Prefix></CommonPrefixes>")
+            else:
+                try:
+                    data = state.get_object(bucket, path)
+                    size = len(data)
+                    etag = _etag(data)
+                except Exception:  # noqa: BLE001
+                    size = 0  # lookup failed: list the key with placeholder metadata
+                    etag = '""'
+                fragments.append(
+                    "<Contents>"
+                    f"<Key>{_escape_xml(path)}</Key>"
+                    f"<LastModified>{now}</LastModified>"
+                    f"<ETag>{etag}</ETag>"
+                    f"<Size>{size}</Size>"
+                    "</Contents>"
+                )
+
+        fragments.append("</ListBucketResult>")
+        return "".join(fragments).encode()
 
 
 class S3ServerProtocol(Protocol):  # noqa: D101
     state: S3State
     auth: S3Auth
 
 
+def _parse_copy_source(raw: str) -> tuple[str, str]:
+    """Split an ``x-amz-copy-source`` header value into ``(bucket, key)``.
+
+    Handles ``/bucket/key`` or ``bucket/key``, percent-encoded keys, and drops ``?versionId=``.
+
+    Args:
+        raw: Header value as received.
+
+    Returns:
+        Tuple of source bucket name and source object key.
+
+    Raises:
+        ValueError: If the value is empty or is not of the form ``bucket/key``.
+    """
+    value = raw.strip()
+    if not value:
+        raise ValueError("Empty x-amz-copy-source")
+    # Cut the query string before decoding so an encoded ``%3F`` stays part of the key.
+    decoded = _up.unquote(value.partition("?")[0])
+    # Exactly one leading slash is optional; nothing else is trimmed.
+    location = decoded[1:] if decoded.startswith("/") else decoded
+    src_bucket, slash, src_key = location.partition("/")
+    if not slash:
+        raise ValueError("x-amz-copy-source must be /bucket/key")
+    if not src_bucket or not src_key:
+        raise ValueError("Invalid x-amz-copy-source")
+    return src_bucket, src_key
+
+
 def _escape_xml(text: str) -> str:  # noqa: D401
     """Escape special characters for XML.
 
diff --git a/tests/s3_emulator/state.py b/tests/s3_emulator/state.py
@@ -111,6 +111,35 @@ def get_object(self, bucket: str, key: str) -> bytes:
             except KeyError as exc:
                 raise FileNotFoundError(f"{bucket}/{key}") from exc
 
+    def copy_object(self, dest_bucket: str, dest_key: str, src_bucket: str, src_key: str) -> bytes:
+        """Duplicate a stored object under a new bucket/key (S3 CopyObject).
+
+        Args:
+            dest_bucket: Bucket receiving the copy; created in memory if missing.
+            dest_key: Key the copy is stored under.
+            src_bucket: Bucket holding the object to copy.
+            src_key: Key of the object to copy.
+
+        Returns:
+            The copied bytes (used for the ETag in the CopyObject XML response).
+
+        Raises:
+            FileNotFoundError: If the source bucket or key is not present.
+        """
+        with self._lock:
+            try:
+                source = self._fs[src_bucket][src_key]
+            except KeyError as exc:
+                raise FileNotFoundError(f"{src_bucket}/{src_key}") from exc
+            payload = bytes(source)
+            self._fs.setdefault(dest_bucket, {})[dest_key] = payload
+        if self._root_dir is None:
+            return payload
+        obj_path = (self._root_dir / dest_bucket / dest_key).resolve()
+        obj_path.parent.mkdir(parents=True, exist_ok=True)
+        obj_path.write_bytes(payload)
+        return payload
+
     def delete_object(self, bucket: str, key: str) -> None:
         """Delete an object from a bucket.
 
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
diff --git a/tests/test_epathlib.py b/tests/test_epathlib.py