Skip to content

Commit 7a08061

Browse files
committed
chore: distinguish between default catalog.db and specific dataset catalog in the ORM
1 parent e1abecb commit 7a08061

8 files changed

Lines changed: 135 additions & 405 deletions

File tree

Lines changed: 0 additions & 326 deletions
Original file line numberDiff line numberDiff line change
@@ -1,326 +0,0 @@
1-
"""SQLAlchemy ORM models for the DuckLake catalog schema.
2-
3-
Defines tables for datasets, groups, files, and columns stored
4-
in the pysus schema of the local DuckDB catalog.
5-
"""
6-
7-
import enum
8-
from datetime import datetime
9-
from typing import Optional
10-
11-
from sqlalchemy import (
12-
BigInteger,
13-
Boolean,
14-
Column,
15-
DateTime,
16-
Enum,
17-
ForeignKey,
18-
Index,
19-
Integer,
20-
Sequence,
21-
String,
22-
Table,
23-
)
24-
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
25-
26-
27-
class Base(DeclarativeBase):
    """Declarative root from which the DuckLake catalog ORM models derive.

    Tables and mapped classes built on this class share ``Base.metadata``.
    """
31-
32-
33-
# Association table in the ``pysus`` schema linking files to column
# definitions. The (file_id, column_id) pair is the composite primary key,
# and each half is a foreign key to its parent table.
file_columns = Table(
    "file_columns",
    Base.metadata,
    Column("file_id", Integer, ForeignKey("pysus.files.id"), primary_key=True),
    Column(
        "column_id",
        Integer,
        ForeignKey("pysus.dataset_columns.id"),
        primary_key=True,
    ),
    schema="pysus",
)
50-
51-
52-
class CatalogTable(Base):
    """Abstract parent placing catalog models in the ``pysus`` schema.

    ``__abstract__`` keeps this class itself from being mapped to a table.
    Subclasses that declare their own ``__table_args__`` must repeat the
    schema entry, since their value replaces the one defined here.
    """

    __table_args__: tuple = (dict(schema="pysus"),)
    __abstract__ = True
57-
58-
59-
class Origin(enum.Enum):
    """Enumerate the supported origins of a dataset.

    Attributes
    ----------
    FTP : str
        The dataset is sourced from the FTP server (value ``"ftp"``).
    API : str
        The dataset is sourced from an API (value ``"api"``).
    """

    FTP = "ftp"
    API = "api"
72-
73-
74-
class CatalogDataset(CatalogTable):
    """Map one row of the ``datasets`` table: a dataset collection.

    Groups, files, and column definitions are attached with
    ``all, delete-orphan`` cascades, so they follow the dataset's lifecycle
    when it is manipulated through an ORM session.

    Parameters
    ----------
    id : int, optional
        Primary key, generated by the ``datasets_id_seq`` sequence.
    name : str
        Unique, indexed short name of the dataset.
    long_name : str
        Human-readable full name.
    description : str, optional
        Description of the dataset contents.
    origin : Origin
        Whether the dataset originates from FTP or an API.
    """

    __tablename__ = "datasets"

    id: Mapped[int] = mapped_column(
        Integer,
        Sequence("datasets_id_seq", schema="pysus"),
        primary_key=True,
    )
    name: Mapped[str] = mapped_column(
        String,
        nullable=False,
        unique=True,
        index=True,
    )
    long_name: Mapped[str] = mapped_column(String, nullable=False)
    description: Mapped[str | None] = mapped_column(String, nullable=True)
    origin: Mapped[Origin] = mapped_column(Enum(Origin), nullable=False)

    # Child collections owned by this dataset.
    groups: Mapped[list["DatasetGroup"]] = relationship(
        "DatasetGroup",
        back_populates="dataset",
        cascade="all, delete-orphan",
    )
    files: Mapped[list["CatalogFile"]] = relationship(
        "CatalogFile",
        back_populates="dataset",
        cascade="all, delete-orphan",
    )
    columns: Mapped[list["ColumnDefinition"]] = relationship(
        "ColumnDefinition",
        back_populates="dataset",
        cascade="all, delete-orphan",
    )
118-
119-
120-
class ColumnDefinition(CatalogTable):
    """Map one row of ``dataset_columns``: metadata for a dataset column.

    A definition belongs to exactly one dataset and is linked to any number
    of files through the ``file_columns`` association table.

    Parameters
    ----------
    id : int, optional
        Primary key, generated by the ``columns_id_seq`` sequence.
    dataset_id : int
        Foreign key to the owning dataset (``pysus.datasets.id``).
    name : str
        Column name.
    type : str
        Column data type, stored as a string.
    description : str, optional
        Description of the column.
    nullable : bool, optional
        Whether the column allows null values; defaults to ``True``.
    """

    __tablename__ = "dataset_columns"
    # Overrides the inherited table args, so the schema is restated here
    # alongside the composite (dataset_id, name) index.
    __table_args__ = (
        Index("ix_columns_dataset_name", "dataset_id", "name"),
        {"schema": "pysus"},
    )

    id: Mapped[int] = mapped_column(
        Integer,
        Sequence("columns_id_seq", schema="pysus"),
        primary_key=True,
    )
    dataset_id: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("pysus.datasets.id"),
        nullable=False,
        index=True,
    )
    name: Mapped[str] = mapped_column(String, nullable=False)
    type: Mapped[str] = mapped_column(String, nullable=False)
    description: Mapped[str | None] = mapped_column(String, nullable=True)
    nullable: Mapped[bool] = mapped_column(
        Boolean,
        nullable=False,
        default=True,
    )

    dataset: Mapped["CatalogDataset"] = relationship(
        "CatalogDataset",
        back_populates="columns",
    )
    files: Mapped[list["CatalogFile"]] = relationship(
        "CatalogFile",
        secondary=file_columns,
        back_populates="columns",
    )
168-
169-
170-
class DatasetGroup(CatalogTable):
    """Map one row of ``dataset_groups``: related files within a dataset.

    Files attached to a group use an ``all, delete-orphan`` cascade, so they
    follow the group's lifecycle when handled through an ORM session.

    Parameters
    ----------
    id : int, optional
        Primary key, generated by the ``groups_id_seq`` sequence.
    name : str
        Short name of the group.
    dataset_id : int
        Foreign key to the owning dataset (``pysus.datasets.id``).
    long_name : str
        Human-readable full name.
    description : str, optional
        Description of the group contents.
    """

    __tablename__ = "dataset_groups"
    # Overrides the inherited table args, so the schema is restated here
    # alongside the composite (dataset_id, name) index.
    __table_args__ = (
        Index("ix_groups_dataset_name", "dataset_id", "name"),
        {"schema": "pysus"},
    )

    id: Mapped[int] = mapped_column(
        Integer,
        Sequence("groups_id_seq", schema="pysus"),
        primary_key=True,
    )
    name: Mapped[str] = mapped_column(String, nullable=False)
    dataset_id: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("pysus.datasets.id"),
        nullable=False,
        index=True,
    )
    long_name: Mapped[str] = mapped_column(String, nullable=False)
    description: Mapped[str | None] = mapped_column(String, nullable=True)

    dataset: Mapped["CatalogDataset"] = relationship(
        "CatalogDataset",
        back_populates="groups",
    )
    files: Mapped[list["CatalogFile"]] = relationship(
        "CatalogFile",
        back_populates="group",
        cascade="all, delete-orphan",
    )
215-
216-
217-
class CatalogFile(CatalogTable):
    """ORM model for the files table, representing individual data files.

    Each file belongs to one dataset, optionally to one group, and is linked
    to the dataset's column definitions through the ``file_columns``
    association table.

    Parameters
    ----------
    id : int, optional
        Primary key (auto-generated by sequence).
    dataset_id : int
        Foreign key referencing the parent dataset.
    group_id : int, optional
        Foreign key referencing the parent group.
    path : str
        Object storage path to the file (unique).
    size : int
        File size in bytes.
    rows : int
        Number of rows in the file.
    type : str, optional
        Type string recorded for the file; the column is nullable.
    modified : datetime
        Timestamp of the last known modification.
    origin_modified : datetime, optional
        Original modification timestamp from the source.
    origin_path : str
        Original source path of the file.
    sha256 : str, optional
        SHA-256 hex digest for integrity verification.
    year : int, optional
        Data year associated with the file.
    month : int, optional
        Data month associated with the file.
    state : str, optional
        Two-letter state code associated with the file.
    """

    __tablename__ = "files"

    id: Mapped[int] = mapped_column(
        Integer,
        Sequence("files_id_seq", schema="pysus"),
        primary_key=True,
    )
    dataset_id: Mapped[int] = mapped_column(
        Integer, ForeignKey("pysus.datasets.id"), nullable=False, index=True
    )
    group_id: Mapped[int | None] = mapped_column(
        Integer,
        ForeignKey("pysus.dataset_groups.id"),
        nullable=True,
        index=True,
    )

    path: Mapped[str] = mapped_column(String, nullable=False, unique=True)
    size: Mapped[int] = mapped_column(BigInteger, nullable=False)
    rows: Mapped[int] = mapped_column(Integer, nullable=False)
    # The column is nullable, so the annotation must admit ``None`` too.
    type: Mapped[str | None] = mapped_column(String, nullable=True)
    modified: Mapped[datetime] = mapped_column(DateTime, nullable=False)
    origin_modified: Mapped[datetime | None] = mapped_column(
        DateTime,
        nullable=True,
    )
    origin_path: Mapped[str] = mapped_column(String, nullable=False)
    sha256: Mapped[str | None] = mapped_column(
        String(64),
        nullable=True,
        index=True,
    )

    year: Mapped[int | None] = mapped_column(
        Integer,
        nullable=True,
        index=True,
    )
    month: Mapped[int | None] = mapped_column(
        Integer,
        nullable=True,
        index=True,
    )
    state: Mapped[str | None] = mapped_column(
        String(2),
        nullable=True,
        index=True,
    )

    dataset: Mapped["CatalogDataset"] = relationship(
        "CatalogDataset",
        back_populates="files",
    )
    group: Mapped[Optional["DatasetGroup"]] = relationship(
        "DatasetGroup",
        back_populates="files",
    )
    # Column definitions are owned by the dataset and shared between files,
    # so the "delete" cascade is deliberately left out: with it, deleting a
    # single file would also delete the ColumnDefinition rows that other
    # files still reference. The association rows in ``file_columns`` are
    # still removed by the unit of work when a file is deleted. The
    # remaining cascades are exactly those that "all" implies.
    columns: Mapped[list["ColumnDefinition"]] = relationship(
        "ColumnDefinition",
        secondary=file_columns,
        back_populates="files",
        cascade="save-update, merge, refresh-expire, expunge",
    )

    # Overrides the inherited table args, so the schema is restated here
    # together with the composite lookup indexes.
    __table_args__ = (
        Index("ix_files_dataset_group", "dataset_id", "group_id"),
        Index("ix_files_temporal", "year", "month"),
        Index(
            "ix_files_lookup",
            "dataset_id",
            "group_id",
            "year",
            "month",
            "state",
        ),
        {"schema": "pysus"},
    )

pysus/api/ducklake/catalog/orm/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)