Skip to content

Commit b2397e4

Browse files
feat: add name field to DataAsset (#1787) (#1836)
* add name field to DataAsset (#1787); infers name from url if it is in s3://aind-open-data/ * update docs * chore: lint --------- Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
1 parent a5ad22d commit b2397e4

3 files changed

Lines changed: 53 additions & 3 deletions

File tree

docs/source/components/identifiers.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ Description of a single data asset
5050

5151
| Field | Type | Title (Description) |
5252
|-------|------|-------------|
53-
| `url` | `str` | Asset location (URL pointing to the data asset) |
53+
| `name` | `Optional[str]` | Asset name (Name of the data asset) |
54+
| `url` | `Optional[str]` | Asset location (URL pointing to the data asset) |
5455

5556

5657
### Database

src/aind_data_schema/components/identifiers.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Schema for identifiers"""
22

3+
import re
34
from enum import Enum
45
from pathlib import Path
56
from typing import Dict, List, Optional
@@ -26,7 +27,21 @@ class Database(str, Enum):
2627
class DataAsset(DataModel):
    """Description of a single data asset"""

    name: Optional[str] = Field(default=None, title="Asset name", description="Name of the data asset")
    url: Optional[str] = Field(default=None, title="Asset location", description="URL pointing to the data asset")

    @model_validator(mode="after")
    def validate_name(self):
        """Ensure the asset is identified by a name or a url.

        A non-empty name is left untouched. Otherwise a url is required; when that url points into
        s3://aind-open-data/, the first path segment after the bucket is stored as the name. A url that
        does not match that pattern leaves the name unchanged.

        Raises a ValueError when both name and url are missing or empty.
        """
        # An explicit name always wins; nothing to infer.
        if self.name:
            return self

        if not self.url:
            raise ValueError("Either 'name' or 'url' must be provided for a DataAsset.")

        # Only urls inside the aind-open-data bucket carry an inferable name.
        open_data_match = re.match("^s3://aind-open-data/([^/]+)(/.*)?$", self.url)
        if open_data_match is None:
            return self

        self.name = open_data_match.group(1)
        return self
3045

3146

3247
class CombinedData(DataModel):

tests/test_identifiers.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from pydantic import ValidationError
66

7-
from aind_data_schema.components.identifiers import Code, Person
7+
from aind_data_schema.components.identifiers import Code, DataAsset, Person
88

99

1010
class Testexperimenter(unittest.TestCase):
@@ -54,5 +54,39 @@ def test_git_hash_invalid(self):
5454
Code(url="https://github.com/org/repo", commit_hash=git_hash)
5555

5656

57+
class TestDataAsset(unittest.TestCase):
    """Tests for the DataAsset name/url validator"""

    def test_name_provided_directly(self):
        """An explicitly supplied name is stored unchanged"""
        self.assertEqual(DataAsset(name="my-dataset").name, "my-dataset")

    def test_name_parsed_from_url_no_subpath(self):
        """The top-level prefix becomes the name when the url has no nested path"""
        bare_url = "s3://aind-open-data/my-dataset"
        self.assertEqual(DataAsset(url=bare_url).name, "my-dataset")

    def test_name_parsed_from_url_with_subpath(self):
        """Only the top-level prefix is used as the name; the nested path is ignored"""
        nested_url = "s3://aind-open-data/my-dataset/sub/path/file.txt"
        self.assertEqual(DataAsset(url=nested_url).name, "my-dataset")

    def test_name_not_overridden_when_provided_with_url(self):
        """A supplied name is kept even when the url would yield a different one"""
        fields = {"name": "explicit-name", "url": "s3://aind-open-data/other-dataset/sub"}
        self.assertEqual(DataAsset(**fields).name, "explicit-name")

    def test_neither_name_nor_url_raises(self):
        """Constructing without name and url raises ValidationError"""
        self.assertRaises(ValidationError, DataAsset)

    def test_url_wrong_bucket_leaves_name_none(self):
        """A url in another bucket does not populate name (it stays None)"""
        foreign_asset = DataAsset(url="s3://other-bucket/my-dataset")
        self.assertIsNone(foreign_asset.name)
89+
90+
5791
# Run the unittest test runner when this file is executed directly as a script.
if __name__ == "__main__":
5892
unittest.main()

0 commit comments

Comments
 (0)