Skip to content

Commit 16b25ec

Browse files
yarikopticclaude
andcommitted
Address code review findings for partial zarr support
- Pre-compile regex patterns at ZarrFilter construction time, catching invalid patterns early instead of on every matches() call (B1) - Remove redundant get_asset_download_path override in AssetZarrEntryURL that was identical to the inherited SingleAssetURL method (H1) - Use Literal["full", "patch"] for zarr_mode parameter instead of bare str to prevent silent misbehavior on invalid values (H2) - Collapse consecutive ** glob segments to avoid exponential backtracking in _glob_match_parts (H3) - Simplify split_zarr_location to use str.split instead of PurePosixPath (M1) - Add explanatory comment for type: ignore in parse_zarr_filter (M2) - Yield {"status": "done"} when zarr filter matches zero entries (M4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c168643 commit 16b25ec

6 files changed

Lines changed: 43 additions & 20 deletions

File tree

dandi/dandiarchive.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -471,9 +471,7 @@ def split_zarr_location(location: str) -> tuple[str, str] | None:
471471
>>> split_zarr_location("sub-1/file.ome.zarr") # no subpath
472472
>>> split_zarr_location("sub-1/file.nwb") # no zarr extension
473473
"""
474-
from pathlib import PurePosixPath
475-
476-
parts = PurePosixPath(location).parts
474+
parts = [p for p in location.split("/") if p]
477475
for i, part in enumerate(parts):
478476
if any(part.endswith(ext) for ext in ZARR_EXTENSIONS):
479477
asset_path = "/".join(parts[: i + 1])
@@ -513,15 +511,6 @@ def get_assets(
513511
with _maybe_strict(strict):
514512
yield dandiset.get_asset_by_path(self.asset_path)
515513

516-
def get_asset_download_path(
517-
self, asset: BaseRemoteAsset, preserve_tree: bool
518-
) -> str:
519-
path = asset.path.lstrip("/")
520-
if preserve_tree:
521-
return path
522-
else:
523-
return posixpath.basename(path)
524-
525514

526515
@dataclass
527516
class AssetFolderURL(MultiAssetURL):

dandi/download.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1069,6 +1069,9 @@ def downloads_gen():
10691069
if final_out is not None:
10701070
break
10711071
else:
1072+
if zarr_entry_filter is not None:
1073+
# Filter matched no entries; still report completion
1074+
yield {"status": "done"}
10721075
return
10731076

10741077
if zarr_entry_filter is not None:

dandi/files/zarr.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import os.path
1313
from pathlib import Path
1414
from time import sleep
15-
from typing import Any, Optional
15+
from typing import Any, Literal, Optional
1616
import urllib.parse
1717

1818
from dandischema.models import BareAsset, DigestType
@@ -557,7 +557,7 @@ def iter_upload(
557557
metadata: dict[str, Any],
558558
jobs: int | None = None,
559559
replacing: RemoteAsset | None = None,
560-
zarr_mode: str = "full",
560+
zarr_mode: Literal["full", "patch"] = "full",
561561
) -> Iterator[dict]:
562562
"""
563563
Upload the Zarr directory as an asset with the given metadata to the

dandi/tests/test_zarr_filter.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,18 @@ def test_parse_pattern_with_colon(self) -> None:
165165
filters = parse_zarr_filter("regex:a:b")
166166
assert filters[0].pattern == "a:b"
167167

168+
@pytest.mark.ai_generated
169+
def test_parse_invalid_regex(self) -> None:
170+
"""Invalid regex patterns should raise ValueError at construction time."""
171+
with pytest.raises(ValueError, match="Invalid regex pattern"):
172+
parse_zarr_filter("regex:[invalid")
173+
174+
@pytest.mark.ai_generated
175+
def test_invalid_regex_direct(self) -> None:
176+
"""Direct ZarrFilter construction with bad regex should also fail."""
177+
with pytest.raises(ValueError, match="Invalid regex pattern"):
178+
ZarrFilter("regex", "(unclosed")
179+
168180

169181
class TestMakeZarrEntryFilter:
170182
@pytest.mark.ai_generated

dandi/upload.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
import re
2222
import time
2323
from time import sleep
24-
from typing import Any, TypedDict, cast
24+
from typing import Any, Literal, TypedDict, cast
2525
from unittest.mock import patch
2626

2727
import click
@@ -112,7 +112,7 @@ def upload(
112112
jobs: int | None = None,
113113
jobs_per_file: int | None = None,
114114
sync: bool = False,
115-
zarr_mode: str = "full",
115+
zarr_mode: Literal["full", "patch"] = "full",
116116
) -> None:
117117
if paths:
118118
paths = [Path(p).absolute() for p in paths]

dandi/zarr_filter.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from __future__ import annotations
99

10-
from dataclasses import dataclass
10+
from dataclasses import dataclass, field
1111
from fnmatch import fnmatchcase
1212
import re
1313
from typing import Callable, Literal
@@ -19,6 +19,16 @@ class ZarrFilter:
1919

2020
filter_type: Literal["glob", "path", "regex"]
2121
pattern: str
22+
_compiled_regex: re.Pattern[str] | None = field(
23+
init=False, default=None, repr=False, compare=False
24+
)
25+
26+
def __post_init__(self) -> None:
27+
if self.filter_type == "regex":
28+
try:
29+
self._compiled_regex = re.compile(self.pattern)
30+
except re.error as e:
31+
raise ValueError(f"Invalid regex pattern {self.pattern!r}: {e}") from e
2232

2333
def matches(self, entry_path: str) -> bool:
2434
"""Test if a zarr-internal path matches this filter.
@@ -41,7 +51,8 @@ def matches(self, entry_path: str) -> bool:
4151
self.pattern.rstrip("/") + "/"
4252
)
4353
elif self.filter_type == "regex":
44-
return re.search(self.pattern, entry_path) is not None
54+
assert self._compiled_regex is not None
55+
return self._compiled_regex.search(entry_path) is not None
4556
else:
4657
raise ValueError(f"Unknown filter type: {self.filter_type!r}")
4758

@@ -55,7 +66,13 @@ def _glob_match(pattern: str, path: str) -> bool:
5566
"""
5667
patparts = [p for p in pattern.split("/") if p]
5768
pathparts = [p for p in path.split("/") if p]
58-
return _glob_match_parts(patparts, pathparts)
69+
# Collapse consecutive ** into a single ** to avoid exponential backtracking
70+
collapsed: list[str] = []
71+
for p in patparts:
72+
if p == "**" and collapsed and collapsed[-1] == "**":
73+
continue
74+
collapsed.append(p)
75+
return _glob_match_parts(collapsed, pathparts)
5976

6077

6178
def _glob_match_parts(patparts: list[str], pathparts: list[str]) -> bool:
@@ -107,7 +124,8 @@ def parse_zarr_filter(spec: str) -> list[ZarrFilter]:
107124
Raises
108125
------
109126
ValueError
110-
If the spec is not a valid alias or ``TYPE:PATTERN`` string.
127+
If the spec is not a valid alias or ``TYPE:PATTERN`` string,
128+
or if a regex pattern is invalid.
111129
"""
112130
if spec in ZARR_FILTER_ALIASES:
113131
return list(ZARR_FILTER_ALIASES[spec])
@@ -121,6 +139,7 @@ def parse_zarr_filter(spec: str) -> list[ZarrFilter]:
121139
raise ValueError(
122140
f"Unknown zarr filter type: {type_!r}. Expected 'glob', 'path', or 'regex'"
123141
)
142+
# type_ is validated above; narrow str -> Literal for the dataclass
124143
return [ZarrFilter(type_, pattern)] # type: ignore[arg-type]
125144

126145

0 commit comments

Comments
 (0)