Skip to content

Commit 0ca8ca0

Browse files
committed
add name field to DataAsset (#1787)
infers name from url if in s3://aind-open-data/
1 parent 2d1edf8 commit 0ca8ca0

2 files changed

Lines changed: 49 additions & 2 deletions

File tree

src/aind_data_schema/components/identifiers.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Schema for identifiers"""
22

3+
import re
34
from enum import Enum
45
from pathlib import Path
56
from typing import Dict, List, Optional
@@ -26,7 +27,21 @@ class Database(str, Enum):
2627
class DataAsset(DataModel):
    """Description of a single data asset"""

    # Both fields are optional individually; validate_name enforces that at
    # least one of them is supplied.
    name: Optional[str] = Field(
        default=None,
        title="Asset name",
        description="Name of the data asset",
    )
    url: Optional[str] = Field(
        default=None,
        title="Asset location",
        description="URL pointing to the data asset",
    )

    @model_validator(mode="after")
    def validate_name(self):
        """Require a name or a url, inferring the name from the url when possible.

        A truthy ``name`` is always kept as given. Otherwise a ``url`` is
        required; when it points inside ``s3://aind-open-data/`` the first path
        component after the bucket becomes the name. A url that does not match
        that pattern leaves ``name`` unchanged.

        Raises
        ------
        ValueError
            If neither ``name`` nor ``url`` is provided.
        """
        # An explicit name wins; nothing to infer.
        if self.name:
            return self

        if not self.url:
            raise ValueError("Either 'name' or 'url' must be provided for a DataAsset.")

        # Group 1 is the top-level prefix directly under the bucket; any
        # nested path after it is ignored.
        open_data_match = re.match("^s3://aind-open-data/([^/]+)(/.*)?$", self.url)
        if open_data_match is not None:
            self.name = open_data_match.group(1)
        return self
3045

3146

3247
class CombinedData(DataModel):

tests/test_identifiers.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from pydantic import ValidationError
66

7-
from aind_data_schema.components.identifiers import Code, Person
7+
from aind_data_schema.components.identifiers import Code, DataAsset, Person
88

99

1010
class Testexperimenter(unittest.TestCase):
@@ -52,6 +52,38 @@ def test_git_hash_invalid(self):
5252
with self.subTest(git_hash=git_hash):
5353
with self.assertRaises(ValidationError):
5454
Code(url="https://github.com/org/repo", commit_hash=git_hash)
55+
class TestDataAsset(unittest.TestCase):
    """Tests for the DataAsset name/url validator"""

    def test_name_provided_directly(self):
        """An explicitly supplied name is stored unchanged"""
        given_name = "my-dataset"
        data_asset = DataAsset(name=given_name)
        self.assertEqual(data_asset.name, given_name)

    def test_name_parsed_from_url_no_subpath(self):
        """A url that stops at the top-level prefix yields that prefix as name"""
        location = "s3://aind-open-data/my-dataset"
        data_asset = DataAsset(url=location)
        self.assertEqual(data_asset.name, "my-dataset")

    def test_name_parsed_from_url_with_subpath(self):
        """Nested path components after the top-level prefix are ignored"""
        location = "s3://aind-open-data/my-dataset/sub/path/file.txt"
        data_asset = DataAsset(url=location)
        self.assertEqual(data_asset.name, "my-dataset")

    def test_name_not_overridden_when_provided_with_url(self):
        """A supplied name is not replaced by the one the url would imply"""
        data_asset = DataAsset(
            name="explicit-name",
            url="s3://aind-open-data/other-dataset/sub",
        )
        self.assertEqual(data_asset.name, "explicit-name")

    def test_neither_name_nor_url_raises(self):
        """Constructing with no name and no url fails validation"""
        self.assertRaises(ValidationError, DataAsset)

    def test_url_wrong_bucket_leaves_name_none(self):
        """A url outside the aind-open-data bucket leaves name as None"""
        location = "s3://other-bucket/my-dataset"
        data_asset = DataAsset(url=location)
        self.assertIsNone(data_asset.name)
5587

5688

5789
if __name__ == "__main__":

0 commit comments

Comments
 (0)