|
1 | | -# Copyright (c) QuantCo 2024-2024 |
| 1 | +# Copyright (c) QuantCo 2024-2025 |
2 | 2 | # SPDX-License-Identifier: BSD-3-Clause |
3 | 3 |
|
4 | 4 | import functools |
5 | 5 | import logging |
6 | 6 | import sys |
7 | | -import tempfile |
8 | | -import warnings |
9 | 7 | from dataclasses import dataclass |
10 | 8 | from pathlib import Path |
11 | 9 | from typing import Literal |
@@ -105,84 +103,6 @@ def main(ctx: click.Context, format: str | None, writer: str): |
105 | 103 | ctx.obj = CliConfig(writer_cls) |
106 | 104 |
|
107 | 105 |
|
108 | | -@main.command() |
109 | | -@click.argument( |
110 | | - "left_parquet", |
111 | | - type=click.Path(exists=True, dir_okay=True, path_type=Path), |
112 | | -) |
113 | | -@click.argument( |
114 | | - "right_parquet", |
115 | | - type=click.Path(exists=True, dir_okay=True, path_type=Path), |
116 | | -) |
117 | | -@table_comparison_options |
118 | | -@click.pass_obj |
119 | | -def parquet( |
120 | | - obj: CliConfig, |
121 | | - left_parquet: Path, |
122 | | - right_parquet: Path, |
123 | | - join_columns: str | None, |
124 | | - hide_matching_columns: bool, |
125 | | - float_precision: float, |
126 | | - collation: str | None, |
127 | | - ignore_casing: bool, |
128 | | - infer_primary_keys: bool, |
129 | | -): |
130 | | - """Compare two (sets of) parquet files using DuckDB.""" |
131 | | - # Disable some warnings emitted by duckdb-engine |
132 | | - warnings.filterwarnings( |
133 | | - "ignore", message="duckdb-engine doesn't yet support reflection on indices" |
134 | | - ) |
135 | | - warnings.filterwarnings("ignore", message="Did not recognize type 'list' of column") |
136 | | - |
137 | | - # Create temporary database for subsequent operations |
138 | | - with tempfile.TemporaryDirectory() as tmpdir: |
139 | | - db_path = Path(tmpdir) / "duck.db" |
140 | | - connection_string = f"duckdb:///{db_path}" |
141 | | - engine = sa.create_engine(connection_string) |
142 | | - |
143 | | - # Create referenceable views on top of the parquet files |
144 | | - left_source_path = ( |
145 | | - str(left_parquet) if left_parquet.is_file() else f"{left_parquet}/*.parquet" |
146 | | - ) |
147 | | - left_query = ( |
148 | | - "CREATE VIEW left_parquet AS " |
149 | | - f"(SELECT * FROM read_parquet('{left_source_path}'))" |
150 | | - ) |
151 | | - |
152 | | - right_source_path = ( |
153 | | - str(right_parquet) |
154 | | - if right_parquet.is_file() |
155 | | - else f"{right_parquet}/*.parquet" |
156 | | - ) |
157 | | - right_query = ( |
158 | | - "CREATE VIEW right_parquet AS " |
159 | | - f"(SELECT * FROM read_parquet('{right_source_path}'))" |
160 | | - ) |
161 | | - |
162 | | - with engine.begin() as tx: |
163 | | - tx.execute(sa.text(left_query)) |
164 | | - tx.execute(sa.text(right_query)) |
165 | | - |
166 | | - # Once these views are created, we can actually run the comparison |
167 | | - # Run the comparison and get the report |
168 | | - comparison = sc.compare_tables( |
169 | | - engine, |
170 | | - "left_parquet", |
171 | | - "right_parquet", |
172 | | - join_columns=join_columns.split(",") if join_columns is not None else None, |
173 | | - float_precision=float_precision, |
174 | | - collation=collation, |
175 | | - ignore_casing=ignore_casing, |
176 | | - infer_primary_keys=infer_primary_keys, |
177 | | - ) |
178 | | - report = comparison.summary_report() |
179 | | - |
180 | | - # Write the report |
181 | | - obj.writer.write( |
182 | | - {"comparison": report}, hide_matching_columns=hide_matching_columns |
183 | | - ) |
184 | | - |
185 | | - |
186 | 106 | @main.command() |
187 | 107 | @click.argument("left_table") |
188 | 108 | @click.argument("right_table") |
|
0 commit comments