|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Compare two exports of backwards-compatibility-data. |
| 4 | +
|
| 5 | +This script compares NumPy exports created by export_backwards_compat_data.py |
| 6 | +to verify that different versions of tiledb-py read the same data identically. |
| 7 | +
|
| 8 | +Usage: |
| 9 | + python compare_backwards_compat_data.py output_folder_v1 output_folder_v2 |
| 10 | +""" |
| 11 | + |
| 12 | +import argparse |
| 13 | +import json |
| 14 | +import sys |
| 15 | +from pathlib import Path |
| 16 | +from typing import Dict, List, Tuple, Any |
| 17 | + |
| 18 | +import numpy as np |
def _debug_read_array(uri):
    """Open the TileDB array at *uri* and print a quick read-back report.

    This logic used to sit at module level, where it referenced ``tiledb``
    and ``uri`` although neither is imported or defined in this file.  The
    resulting NameError was swallowed by the ``except`` below, so every run
    of the script printed an ``ERROR: ...`` line plus a traceback before the
    comparison started.  It is kept as an uncalled helper for manual
    debugging.

    Args:
        uri: Location of the TileDB array to open for reading.

    Any failure (including ``tiledb`` not being installed) is reported on
    stdout with a traceback rather than raised, as in the original snippet.
    """
    try:
        # Imported lazily: the comparison itself only needs NumPy exports.
        import tiledb

        with tiledb.open(uri, 'r') as A:
            print(f"Schema: {A.schema}")
            ned = A.nonempty_domain()
            print(f"Nonempty domain: {ned}")

            # A falsy domain is skipped: there is nothing to slice.
            if ned:
                slices = [slice(*x) for x in ned]
                data = A[tuple(slices)]
                print(f"Success! Data keys: {list(data.keys())}")
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
| 33 | + |
| 34 | + |
| 35 | + |
| 36 | + |
class ComparisonResult:
    """Accumulates the outcome of comparing two export trees."""

    def __init__(self):
        # Running tallies of compared items.
        self.identical_count = 0
        self.different_count = 0
        # Relative paths that exist on one side only.
        self.missing_in_dir1 = []
        self.missing_in_dir2 = []
        # One dict per mismatch, with "path", "reason" and "details" keys.
        self.differences = []

    def add_difference(self, path: str, reason: str, details: Any = None):
        """Record one mismatch and bump the difference counter."""
        entry = {"path": path, "reason": reason, "details": details}
        self.differences.append(entry)
        self.different_count += 1

    def add_identical(self):
        """Count one more item that matched."""
        self.identical_count += 1

    def print_summary(self):
        """Print the full report and return True only for a clean comparison.

        A comparison is clean when no difference was recorded and nothing is
        missing on either side.
        """
        rule = "=" * 80

        print("\n" + rule)
        print("COMPARISON SUMMARY")
        print(rule)
        print(f"Identical items: {self.identical_count}")
        print(f"Different items: {self.different_count}")
        print(f"Missing in dir1: {len(self.missing_in_dir1)}")
        print(f"Missing in dir2: {len(self.missing_in_dir2)}")
        print(rule)

        missing_sections = (
            ("\nMissing in first directory:", self.missing_in_dir1),
            ("\nMissing in second directory:", self.missing_in_dir2),
        )
        for heading, entries in missing_sections:
            if not entries:
                continue
            print(heading)
            for entry in entries:
                print(f" - {entry}")

        if self.differences:
            print("\nDifferences found:")
            for record in self.differences:
                print(f"\n Path: {record['path']}")
                print(f" Reason: {record['reason']}")
                details = record['details']
                # Falsy details (None, empty string) are omitted.
                if details:
                    print(f" Details: {details}")

        all_clear = not (
            self.different_count or self.missing_in_dir1 or self.missing_in_dir2
        )

        print("\n" + rule)
        if all_clear:
            print("SUCCESS: All data is identical!")
        else:
            print("FAILURE: Differences detected!")
        print(rule + "\n")

        return all_clear
| 97 | + |
| 98 | + |
def compare_json_files(file1: Path, file2: Path, result: "ComparisonResult", rel_path: str) -> bool:
    """Compare the parsed contents of two JSON files.

    Args:
        file1: JSON file from the first export.
        file2: JSON file from the second export.
        result: Collector that receives ``add_identical()`` or
            ``add_difference(...)``.
        rel_path: Path used to identify the file in the report.

    Returns:
        True if both documents parse to equal values, False otherwise
        (including when either file cannot be read or parsed).
    """
    try:
        # Binary mode lets the json module detect the encoding itself
        # instead of relying on the platform's locale encoding.
        with open(file1, "rb") as f:
            data1 = json.load(f)
        with open(file2, "rb") as f:
            data2 = json.load(f)
    except Exception as e:
        result.add_difference(rel_path, f"Error comparing JSON: {e}")
        return False

    if data1 == data2:
        result.add_identical()
        return True

    if isinstance(data1, dict) and isinstance(data2, dict):
        # Sorted so the report is the same from run to run.
        diff_keys = []
        for key in sorted(set(data1) | set(data2)):
            if key not in data1:
                diff_keys.append(f"'{key}' missing in dir1")
            elif key not in data2:
                diff_keys.append(f"'{key}' missing in dir2")
            elif data1[key] != data2[key]:
                diff_keys.append(f"'{key}': {data1[key]} != {data2[key]}")
        details = "; ".join(diff_keys)
    else:
        # Top-level lists/scalars have no keys to break the diff down by.
        details = f"{data1} != {data2}"

    result.add_difference(rel_path, "JSON content differs", details)
    return False
| 136 | + |
| 137 | + |
def _object_items_match(x, y) -> bool:
    """Compare two elements taken from object-dtype arrays."""
    if isinstance(x, np.ndarray) or isinstance(y, np.ndarray):
        x, y = np.asarray(x), np.asarray(y)
        return x.shape == y.shape and _arrays_match(x, y)
    # Two float NaNs count as equal, in line with equal_nan=True elsewhere.
    if isinstance(x, float) and isinstance(y, float) and x != x and y != y:
        return True
    return bool(x == y)


def _arrays_match(a, b) -> bool:
    """Return True when *a* and *b* hold the same values (NaN equals NaN)."""
    if a.dtype == object or b.dtype == object:
        return a.shape == b.shape and all(
            _object_items_match(x, y) for x, y in zip(a.flat, b.flat)
        )
    try:
        return bool(np.array_equal(a, b, equal_nan=True))
    except TypeError:
        # str/bytes/structured dtypes do not support isnan(); they cannot
        # hold NaN either, so a plain comparison is exact.
        return bool(np.array_equal(a, b))


def _describe_value_difference(a, b) -> str:
    """Summarise how two same-shape, same-dtype, non-object arrays differ."""
    size = a.size
    # Promote 0-d arrays so boolean masking below works uniformly.
    a, b = np.atleast_1d(a), np.atleast_1d(b)

    mismatch = a != b
    if a.dtype.kind in "fcmM":
        # NaN/NaT on both sides is not a difference.
        mismatch = mismatch & ~(np.isnan(a) & np.isnan(b))
    summary = f"num_different={int(np.count_nonzero(mismatch))}/{size}"

    if not np.issubdtype(a.dtype, np.number):
        return summary

    if a.dtype.kind in "iu":
        # max - min never goes negative, so unsigned inputs cannot wrap;
        # the unsigned cast keeps extreme signed differences exact too.
        diff = (np.maximum(a, b) - np.minimum(a, b)).astype(f"u{a.dtype.itemsize}")
    else:
        with np.errstate(invalid="ignore"):
            diff = np.abs(a - b)
        # Drop undefined differences so one NaN does not poison the stats.
        diff = diff[~np.isnan(diff)]

    max_diff = diff.max() if diff.size else float("nan")
    mean_diff = diff.mean() if diff.size else float("nan")
    return f"max_diff={max_diff}, mean_diff={mean_diff}, {summary}"


def compare_numpy_files(file1: Path, file2: Path, result: "ComparisonResult", rel_path: str) -> bool:
    """Compare two NumPy files by shape, dtype and values.

    Args:
        file1: Array file from the first export.
        file2: Array file from the second export.
        result: Collector that receives ``add_identical()`` or
            ``add_difference(...)``.
        rel_path: Path used to identify the file in the report.

    Returns:
        True if the arrays are equal (NaN matching NaN), False otherwise,
        including when loading or comparing raises.
    """
    try:
        # NOTE(security): allow_pickle=True can execute arbitrary code while
        # loading; only run this on exports you produced yourself.
        arr1 = np.load(file1, allow_pickle=True)
        arr2 = np.load(file2, allow_pickle=True)

        if arr1.shape != arr2.shape:
            result.add_difference(
                rel_path,
                "Array shape mismatch",
                f"{arr1.shape} != {arr2.shape}"
            )
            return False

        if arr1.dtype != arr2.dtype:
            result.add_difference(
                rel_path,
                "Array dtype mismatch",
                f"{arr1.dtype} != {arr2.dtype}"
            )
            return False

        if _arrays_match(arr1, arr2):
            result.add_identical()
            return True

        if arr1.dtype == object:
            result.add_difference(rel_path, "Array object values differ")
        else:
            result.add_difference(
                rel_path,
                "Array values differ",
                _describe_value_difference(arr1, arr2)
            )
        return False

    except Exception as e:
        result.add_difference(rel_path, f"Error comparing NumPy arrays: {e}")
        return False
| 210 | + |
| 211 | + |
def _path_kind(path: Path) -> str:
    """Classify *path* as 'dir', 'file' or 'other' (e.g. a broken symlink)."""
    if path.is_dir():
        return "dir"
    if path.is_file():
        return "file"
    return "other"


def _compare_file_contents(path1: Path, path2: Path, result, rel_path: str, *, mode: str, label: str) -> None:
    """Read both files with *mode* and record whether their contents match."""
    try:
        with open(path1, mode) as f:
            content1 = f.read()
        with open(path2, mode) as f:
            content2 = f.read()
    except Exception as e:
        result.add_difference(rel_path, f"Error comparing {label.lower()}: {e}")
        return

    if content1 == content2:
        result.add_identical()
    else:
        result.add_difference(rel_path, f"{label} content differs")


def compare_directories(dir1: Path, dir2: Path, result: "ComparisonResult", rel_path: str = "") -> None:
    """Recursively compare two directories.

    Entries present on one side only are appended to
    ``result.missing_in_dir1`` / ``result.missing_in_dir2``.  Entries present
    on both sides are compared by type: directories recurse, ``.json`` files
    go to ``compare_json_files``, ``.npy``/``.pkl`` files go to
    ``compare_numpy_files``, ``.txt`` files are compared as text and
    everything else byte for byte.

    Args:
        dir1: First directory to compare
        dir2: Second directory to compare
        result: ComparisonResult object to store results
        rel_path: Relative path for reporting
    """
    names1 = {p.name for p in dir1.iterdir()}
    names2 = {p.name for p in dir2.iterdir()}

    def report_path(name: str) -> str:
        return f"{rel_path}/{name}" if rel_path else name

    # Sorted so the report does not depend on set iteration order.
    result.missing_in_dir2.extend(report_path(n) for n in sorted(names1 - names2))
    result.missing_in_dir1.extend(report_path(n) for n in sorted(names2 - names1))

    for item in sorted(names1 & names2):
        path1 = dir1 / item
        path2 = dir2 / item
        item_rel_path = report_path(item)

        if path1.is_dir() and path2.is_dir():
            compare_directories(path1, path2, result, item_rel_path)

        elif path1.is_file() and path2.is_file():
            # Skip export_info.json - versions and timestamps are expected to differ
            if item == "export_info.json":
                continue

            if item.endswith(".json"):
                compare_json_files(path1, path2, result, item_rel_path)
            elif item.endswith((".npy", ".pkl")):
                compare_numpy_files(path1, path2, result, item_rel_path)
            elif item.endswith(".txt"):
                _compare_file_contents(
                    path1, path2, result, item_rel_path, mode="r", label="Text"
                )
            else:
                _compare_file_contents(
                    path1, path2, result, item_rel_path, mode="rb", label="Binary"
                )

        else:
            # The two sides are not both directories or both regular files.
            result.add_difference(
                item_rel_path,
                "Type mismatch",
                f"dir1={_path_kind(path1)}, dir2={_path_kind(path2)}"
            )
| 295 | + |
| 296 | + |
def compare_exports(dir1: str, dir2: str) -> bool:
    """Compare two export directories and print a report.

    Args:
        dir1: First export directory
        dir2: Second export directory

    Returns:
        True if identical, False if differences found

    Exits the process with status 1 (via ``sys.exit``) when either argument
    is not an existing directory.
    """
    dir1_path = Path(dir1)
    dir2_path = Path(dir2)

    # is_dir() rather than exists(): a regular file would pass exists() and
    # then blow up inside the directory walk.
    for candidate in (dir1_path, dir2_path):
        if not candidate.is_dir():
            print(f"Error: Directory not found: {candidate}")
            sys.exit(1)

    print("Comparing backwards-compatibility-data exports")
    print(f" Directory 1: {dir1_path}")
    print(f" Directory 2: {dir2_path}")
    print()

    # Load and display export info (for informational purposes only)
    print("Export metadata (not compared):")
    for i, d in enumerate((dir1_path, dir2_path), 1):
        info_file = d / "export_info.json"
        if not info_file.is_file():
            continue
        try:
            with open(info_file, "rb") as f:
                info = json.load(f)
        except (OSError, ValueError) as e:
            # Metadata is display-only; a bad file must not abort the run.
            print(f" Directory {i}: could not read export_info.json ({e})")
            continue
        print(f" Directory {i}:")
        if isinstance(info, dict):
            for key, value in info.items():
                print(f" {key}: {value}")
        else:
            print(f" {info}")
    print()

    # Perform comparison
    result = ComparisonResult()
    compare_directories(dir1_path, dir2_path, result)

    # print_summary() prints the report and returns the overall verdict.
    return result.print_summary()
| 342 | + |
| 343 | + |
def main():
    """Command-line entry point.

    Parses the two export directories from the command line, runs the
    comparison and exits with status 0 when the exports match, 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Compare two exports of backwards-compatibility-data"
    )
    positional_args = (
        ("dir1", "First export directory"),
        ("dir2", "Second export directory"),
    )
    for arg_name, arg_help in positional_args:
        parser.add_argument(arg_name, help=arg_help)
    # The flag is accepted, but its parsed value is not used below.
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")

    cli = parser.parse_args()

    exports_match = compare_exports(cli.dir1, cli.dir2)
    exit_code = 0 if exports_match else 1
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
0 commit comments