|
| 1 | +# Development |
| 2 | + |
| 3 | +## Setup |
| 4 | + |
| 5 | +```bash |
| 6 | +# Clone repository |
| 7 | +git clone <repository-url> |
| 8 | +cd tablespec |
| 9 | + |
| 10 | +# Install with all optional extras (including development dependencies) |
| 11 | +uv sync --all-extras |
| 12 | + |
| 13 | +# Alternatively, install with only the Spark extra |
| 14 | +uv sync --extra spark |
| 15 | +``` |
| 16 | + |
| 17 | +## Running Tests |
| 18 | + |
| 19 | +```bash |
| 20 | +# Run all tests |
| 21 | +uv run pytest |
| 22 | + |
| 23 | +# Run with coverage |
| 24 | +uv run pytest --cov=src/tablespec --cov-report=html |
| 25 | + |
| 26 | +# Run specific test file |
| 27 | +uv run pytest tests/unit/test_gx_baseline.py |
| 28 | +``` |
| 29 | + |
| 30 | +## Project Structure |
| 31 | + |
| 32 | +```text |
| 33 | +src/tablespec/ |
| 34 | +├── __init__.py # Public API exports |
| 35 | +├── cli.py # Typer CLI (validate, info, convert, excel, domains) |
| 36 | +├── models/ |
| 37 | +│ ├── umf.py # Pydantic UMF models |
| 38 | +│ ├── changelog.py # Changelog entry models |
| 39 | +│ └── pipeline.py # Pipeline configuration models |
| 40 | +├── schemas/ |
| 41 | +│ ├── generators.py # Schema generation (SQL, PySpark, JSON) |
| 42 | +│ ├── umf.schema.json # JSON Schema for UMF validation |
| 43 | +│ ├── gx_expectation_suite.schema.json |
| 44 | +│ ├── expectation_categories.json |
| 45 | +│ └── expectation_parameters.json |
| 46 | +├── type_mappings.py # Type system conversions |
| 47 | +├── date_formats.py # Date/datetime format definitions |
| 48 | +├── naming.py # Naming utilities (to_spark_identifier, position_sort_key) |
| 49 | +├── naming_validator.py # Column naming convention validation |
| 50 | +├── gx_baseline.py # Great Expectations (GX) baseline expectation generation |
| 51 | +├── gx_constraint_extractor.py # Extract constraints from GX suites |
| 52 | +├── gx_schema_validator.py # Schema validation with GX |
| 53 | +├── gx_wrapper.py # GX utility wrapper |
| 54 | +├── excel_converter.py # Bidirectional Excel <-> UMF conversion |
| 55 | +├── excel_import_git.py # Git-integrated Excel import with atomic commits |
| 56 | +├── umf_loader.py # Split/JSON format loader with auto-detection |
| 57 | +├── umf_diff.py # UMF version diffing |
| 58 | +├── umf_change_applier.py # Atomic change application for per-change commits |
| 59 | +├── umf_validator.py # UMF structural validation |
| 60 | +├── changelog_generator.py # Git-based changelog generation |
| 61 | +├── changelog_diff_parser.py # YAML diff parsing for change detection |
| 62 | +├── changelog_formatter.py # Changelog output formatting |
| 63 | +├── inference/ |
| 64 | +│ └── domain_types.py # Domain type registry and inference engine |
| 65 | +├── sample_data/ |
| 66 | +│ ├── engine.py # Main sample data generation engine |
| 67 | +│ ├── config.py # Generation configuration |
| 68 | +│ ├── generators.py # Healthcare-specific data generators |
| 69 | +│ ├── column_value_generator.py # Per-column value generation |
| 70 | +│ ├── constraint_handlers.py # Validation constraint handling |
| 71 | +│ ├── foreign_keys.py # FK relationship-aware generation |
| 72 | +│ ├── graph.py # Dependency graph for generation order |
| 73 | +│ ├── filename_generator.py # Filename pattern generation |
| 74 | +│ ├── date_processing.py # Date format handling |
| 75 | +│ ├── registry.py # Key registry for uniqueness |
| 76 | +│ └── validation.py # Validation rule processing |
| 77 | +├── quality/ |
| 78 | +│ ├── baseline_service.py # Baseline capture and comparison |
| 79 | +│ ├── baseline_storage.py # Baseline persistence |
| 80 | +│ ├── executor.py # Quality check execution |
| 81 | +│ └── storage.py # Quality result storage |
| 82 | +├── profiling/ |
| 83 | +│ ├── types.py # Profiling result types |
| 84 | +│ ├── spark_mapper.py # Spark DataFrame -> UMF (requires PySpark) |
| 85 | +│ └── deequ_mapper.py # Deequ profile -> UMF |
| 86 | +├── prompts/ |
| 87 | +│ ├── documentation.py # Documentation enrichment prompts |
| 88 | +│ ├── validation.py # Table-level validation rule prompts |
| 89 | +│ ├── validation_per_column.py # Per-column validation prompts |
| 90 | +│ ├── column_validation.py # Column-specific validation prompts |
| 91 | +│ ├── relationship.py # Relationship detection prompts |
| 92 | +│ ├── survivorship.py # Survivorship logic prompts |
| 93 | +│ ├── filename_pattern.py # Filename pattern prompts |
| 94 | +│ ├── expectation_guide.py # GX expectation reference |
| 95 | +│ └── utils.py # Prompt utilities |
| 96 | +├── formatting/ |
| 97 | +│ ├── constants.py # Formatting constants |
| 98 | +│ └── yaml_formatter.py # YAML output formatting |
| 99 | +├── validation/ |
| 100 | +│ ├── gx_processor.py # GX expectation processing |
| 101 | +│ ├── table_validator.py # Table validation engine (requires PySpark) |
| 102 | +│ └── custom_gx_expectations.py # Custom GX expectation types |
| 103 | +├── casting_utils.py # Type casting utilities |
| 104 | +├── completeness_validator.py # Data completeness validation |
| 105 | +├── dependency_resolver.py # Module dependency resolution |
| 106 | +├── format_utils.py # Format conversion utilities |
| 107 | +├── merge.py # Table merge with survivorship (requires PySpark) |
| 108 | +├── relationship_validator.py # FK relationship validation |
| 109 | +├── spark_factory.py # SparkSession factory (requires PySpark) |
| 110 | +├── survivorship_display.py # Survivorship rule display |
| 111 | +├── sync_baseline.py # Baseline synchronization |
| 112 | +├── output_formatting.py # Output display formatting |
| 113 | +├── validator.py # Pipeline-level validation orchestration |
| 114 | +└── domain_types.yaml # Domain type registry definitions |
| 115 | +``` |
0 commit comments