|
1 | | -"""SQLAlchemy ORM models for the DuckLake catalog schema. |
2 | | -
|
3 | | -Defines tables for datasets, groups, files, and columns stored |
4 | | -in the pysus schema of the local DuckDB catalog. |
5 | | -""" |
6 | | - |
7 | | -import enum |
8 | | -from datetime import datetime |
9 | | -from typing import Optional |
10 | | - |
11 | | -from sqlalchemy import ( |
12 | | - BigInteger, |
13 | | - Boolean, |
14 | | - Column, |
15 | | - DateTime, |
16 | | - Enum, |
17 | | - ForeignKey, |
18 | | - Index, |
19 | | - Integer, |
20 | | - Sequence, |
21 | | - String, |
22 | | - Table, |
23 | | -) |
24 | | -from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship |
25 | | - |
26 | | - |
class Base(DeclarativeBase):
    """Declarative root shared by every DuckLake catalog ORM model.

    Tables and mapped classes defined in this module register themselves
    on ``Base.metadata``.
    """
31 | | - |
32 | | - |
# Association table for the many-to-many link between ``pysus.files`` and
# ``pysus.dataset_columns``. The two foreign keys together form the
# composite primary key, so a given (file, column) pair is stored once.
file_columns = Table(
    "file_columns",
    Base.metadata,
    Column("file_id", Integer, ForeignKey("pysus.files.id"), primary_key=True),
    Column(
        "column_id",
        Integer,
        ForeignKey("pysus.dataset_columns.id"),
        primary_key=True,
    ),
    schema="pysus",
)
50 | | - |
51 | | - |
class CatalogTable(Base):
    """Abstract parent that places catalog tables in the ``pysus`` schema.

    The class is not mapped to a table itself (``__abstract__``). A
    subclass that defines its own ``__table_args__`` replaces this default
    and therefore has to repeat the schema entry.
    """

    __abstract__ = True

    # Single-element tuple: only the trailing keyword-options dict.
    __table_args__: tuple = (
        {"schema": "pysus"},
    )
57 | | - |
58 | | - |
class Origin(enum.Enum):
    """Where a dataset comes from.

    Attributes
    ----------
    FTP : Origin
        Dataset sourced from the FTP server (value ``"ftp"``).
    API : Origin
        Dataset sourced from an API (value ``"api"``).
    """

    # Member order and values are part of the contract: iteration yields
    # FTP first, and lookups by value use the lowercase strings below.
    FTP = "ftp"
    API = "api"
72 | | - |
73 | | - |
class CatalogDataset(CatalogTable):
    """ORM model for the datasets table, representing a dataset collection.

    Parameters
    ----------
    id : int, optional
        Primary key, generated by the ``pysus.datasets_id_seq`` sequence.
    name : str
        Unique short name for the dataset.
    long_name : str
        Human-readable full name.
    description : str, optional
        Optional description of the dataset contents.
    origin : Origin
        Whether the dataset originates from FTP or an API.

    Notes
    -----
    The ``groups``, ``files`` and ``columns`` collections all use the
    ``all, delete-orphan`` cascade: deleting a dataset, or removing a
    child from one of these collections, deletes the child as well.

    The schema comes from the ``__table_args__`` inherited from
    ``CatalogTable``.
    """

    __tablename__ = "datasets"

    # Typed ``mapped_column`` declarations, matching ``CatalogFile``; the
    # column types, constraints and indexes are the same as before.
    id: Mapped[int] = mapped_column(
        Integer,
        Sequence("datasets_id_seq", schema="pysus"),
        primary_key=True,
    )
    name: Mapped[str] = mapped_column(
        String,
        nullable=False,
        unique=True,
        index=True,
    )
    long_name: Mapped[str] = mapped_column(String, nullable=False)
    description: Mapped[str | None] = mapped_column(String, nullable=True)
    origin: Mapped[Origin] = mapped_column(Enum(Origin), nullable=False)

    groups: Mapped[list["DatasetGroup"]] = relationship(
        "DatasetGroup",
        back_populates="dataset",
        cascade="all, delete-orphan",
    )
    files: Mapped[list["CatalogFile"]] = relationship(
        "CatalogFile",
        back_populates="dataset",
        cascade="all, delete-orphan",
    )
    columns: Mapped[list["ColumnDefinition"]] = relationship(
        "ColumnDefinition",
        back_populates="dataset",
        cascade="all, delete-orphan",
    )
118 | | - |
119 | | - |
class ColumnDefinition(CatalogTable):
    """ORM model for dataset column metadata.

    Parameters
    ----------
    id : int, optional
        Primary key, generated by the ``pysus.columns_id_seq`` sequence.
    dataset_id : int
        Foreign key referencing the parent dataset.
    name : str
        Column name.
    type : str
        Column data type string.
    description : str, optional
        Optional description of the column.
    nullable : bool, optional
        Whether the column allows null values. Defaults to ``True``.

    Notes
    -----
    A column definition belongs to one dataset and can be attached to any
    number of files through the ``file_columns`` association table.
    """

    __tablename__ = "dataset_columns"

    # Typed ``mapped_column`` declarations, matching ``CatalogFile``; the
    # column types, constraints and indexes are the same as before.
    id: Mapped[int] = mapped_column(
        Integer,
        Sequence("columns_id_seq", schema="pysus"),
        primary_key=True,
    )
    dataset_id: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("pysus.datasets.id"),
        nullable=False,
        index=True,
    )
    name: Mapped[str] = mapped_column(String, nullable=False)
    type: Mapped[str] = mapped_column(String, nullable=False)
    description: Mapped[str | None] = mapped_column(String, nullable=True)
    nullable: Mapped[bool] = mapped_column(
        Boolean,
        nullable=False,
        default=True,
    )

    dataset: Mapped["CatalogDataset"] = relationship(
        "CatalogDataset",
        back_populates="columns",
    )
    files: Mapped[list["CatalogFile"]] = relationship(
        "CatalogFile",
        secondary=file_columns,
        back_populates="columns",
    )

    # Overrides the parent's ``__table_args__``, so the schema entry has to
    # be repeated after the composite (non-unique) lookup index.
    __table_args__ = (
        Index("ix_columns_dataset_name", "dataset_id", "name"),
        {"schema": "pysus"},
    )
168 | | - |
169 | | - |
class DatasetGroup(CatalogTable):
    """ORM model for dataset groups, grouping related files within a dataset.

    Parameters
    ----------
    id : int, optional
        Primary key, generated by the ``pysus.groups_id_seq`` sequence.
    name : str
        Short name for the group.
    dataset_id : int
        Foreign key referencing the parent dataset.
    long_name : str
        Human-readable full name.
    description : str, optional
        Optional description of the group contents.

    Notes
    -----
    ``files`` uses the ``all, delete-orphan`` cascade: deleting a group,
    or removing a file from the collection, deletes that file.
    """

    __tablename__ = "dataset_groups"

    # Typed ``mapped_column`` declarations, matching ``CatalogFile``; the
    # column types, constraints and indexes are the same as before.
    id: Mapped[int] = mapped_column(
        Integer,
        Sequence("groups_id_seq", schema="pysus"),
        primary_key=True,
    )
    name: Mapped[str] = mapped_column(String, nullable=False)
    dataset_id: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("pysus.datasets.id"),
        nullable=False,
        index=True,
    )
    long_name: Mapped[str] = mapped_column(String, nullable=False)
    description: Mapped[str | None] = mapped_column(String, nullable=True)

    dataset: Mapped["CatalogDataset"] = relationship(
        "CatalogDataset",
        back_populates="groups",
    )
    files: Mapped[list["CatalogFile"]] = relationship(
        "CatalogFile",
        back_populates="group",
        cascade="all, delete-orphan",
    )

    # Overrides the parent's ``__table_args__``, so the schema entry has to
    # be repeated after the composite (non-unique) lookup index.
    __table_args__ = (
        Index("ix_groups_dataset_name", "dataset_id", "name"),
        {"schema": "pysus"},
    )
215 | | - |
216 | | - |
class CatalogFile(CatalogTable):
    """ORM model for the files table, representing individual data files.

    Parameters
    ----------
    id : int, optional
        Primary key, generated by the ``pysus.files_id_seq`` sequence.
    dataset_id : int
        Foreign key referencing the parent dataset.
    group_id : int, optional
        Foreign key referencing the parent group.
    path : str
        Object storage path to the file (unique).
    size : int
        File size in bytes.
    rows : int
        Number of rows in the file.
    type : str, optional
        Type tag for the file. The column is nullable; the meaning of the
        value is not defined in this module.
    modified : datetime
        Timestamp of the last known modification.
    origin_modified : datetime, optional
        Original modification timestamp from the source.
    origin_path : str
        Original source path of the file.
    sha256 : str, optional
        SHA-256 hex digest for integrity verification.
    year : int, optional
        Data year associated with the file.
    month : int, optional
        Data month associated with the file.
    state : str, optional
        Two-letter state code associated with the file.

    Notes
    -----
    ``columns`` is a many-to-many link through ``file_columns``. Deleting
    a file removes its association rows but leaves the referenced
    ``ColumnDefinition`` rows in place, since other files may share them.
    """

    __tablename__ = "files"

    id: Mapped[int] = mapped_column(
        Integer,
        Sequence("files_id_seq", schema="pysus"),
        primary_key=True,
    )
    dataset_id: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("pysus.datasets.id"),
        nullable=False,
        index=True,
    )
    group_id: Mapped[int | None] = mapped_column(
        Integer,
        ForeignKey("pysus.dataset_groups.id"),
        nullable=True,
        index=True,
    )

    path: Mapped[str] = mapped_column(String, nullable=False, unique=True)
    size: Mapped[int] = mapped_column(BigInteger, nullable=False)
    rows: Mapped[int] = mapped_column(Integer, nullable=False)
    # The column is nullable, so the annotation must admit ``None``.
    type: Mapped[str | None] = mapped_column(String, nullable=True)
    modified: Mapped[datetime] = mapped_column(DateTime, nullable=False)
    origin_modified: Mapped[datetime | None] = mapped_column(
        DateTime,
        nullable=True,
    )
    origin_path: Mapped[str] = mapped_column(String, nullable=False)
    sha256: Mapped[str | None] = mapped_column(
        String(64),
        nullable=True,
        index=True,
    )

    year: Mapped[int | None] = mapped_column(
        Integer,
        nullable=True,
        index=True,
    )
    month: Mapped[int | None] = mapped_column(
        Integer,
        nullable=True,
        index=True,
    )
    state: Mapped[str | None] = mapped_column(
        String(2),
        nullable=True,
        index=True,
    )

    dataset: Mapped["CatalogDataset"] = relationship(
        "CatalogDataset",
        back_populates="files",
    )
    group: Mapped[Optional["DatasetGroup"]] = relationship(
        "DatasetGroup",
        back_populates="files",
    )
    # No "delete" cascade here: on a many-to-many relationship it would
    # delete the ColumnDefinition rows themselves when a file is deleted,
    # even though other files may still reference them. The rows in
    # ``file_columns`` are removed automatically because of ``secondary``.
    columns: Mapped[list["ColumnDefinition"]] = relationship(
        "ColumnDefinition",
        secondary=file_columns,
        back_populates="files",
        cascade="save-update, merge, refresh-expire, expunge",
    )

    # Overrides the parent's ``__table_args__``, so the schema entry has to
    # be repeated after the composite lookup indexes.
    __table_args__ = (
        Index("ix_files_dataset_group", "dataset_id", "group_id"),
        Index("ix_files_temporal", "year", "month"),
        Index(
            "ix_files_lookup",
            "dataset_id",
            "group_id",
            "year",
            "month",
            "state",
        ),
        {"schema": "pysus"},
    )
0 commit comments