diff --git a/docs/docs.json b/docs/docs.json index d7cfe45d7..33e50697d 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -326,18 +326,7 @@ "pages": [ "python-sdk/api-reference/overview", "python-sdk/api-reference/test-decorators", - "python-sdk/api-reference/table-assets", - "python-sdk/api-reference/tests", - "python-sdk/api-reference/test-executions" - ] - }, - { - "group": "Guides", - "pages": [ - "python-sdk/guides/authentication", - "python-sdk/guides/test-decorators", - "python-sdk/guides/sending-data", - "python-sdk/guides/best-practices" + "python-sdk/api-reference/table-assets" ] } ] diff --git a/docs/python-sdk/api-reference/overview.mdx b/docs/python-sdk/api-reference/overview.mdx index 739fe33f4..80fd07f23 100644 --- a/docs/python-sdk/api-reference/overview.mdx +++ b/docs/python-sdk/api-reference/overview.mdx @@ -39,7 +39,7 @@ client = ElementaryCloudClient(project_id, api_key, url) Where: - `project_id` is your Python project identifier (chosen by you, used to deduplicate and identify reported assets) - `api_key` is your API token (generated from the steps above) -- `url` is the full SDK ingest endpoint URL: `{base_url}/sdk-ingest/{env_id}/batch` +- `url` is the full SDK ingest endpoint URL (the Elementary team will provide you with this URL): `{base_url}/sdk-ingest/{env_id}/batch` - Example: `https://app.elementary-data.com/sdk-ingest/a6b2425d-36e2-4e13-8458-9825688ca1f2/batch` ## Test Context @@ -58,6 +58,17 @@ with elementary_test_context(asset=asset) as ctx: client.send_to_cloud(ctx) ``` +### `raise_on_error` + +By default, `elementary_test_context` uses `raise_on_error=False`. This means that if a decorated test (or something inside the context) raises an exception, the SDK **captures it and records an `ERROR` execution** so you can still send results to Elementary Cloud without crashing your pipeline. + +If you prefer **fail-fast** behavior (for example in CI), pass `raise_on_error=True` to re-raise exceptions after they are recorded: + +```python +with elementary_test_context(asset=asset, raise_on_error=True) as ctx: + run_my_tests(df) +``` + ## Test Decorators The SDK provides decorators to define tests: @@ -75,7 +86,7 @@ You can also use context managers for inline tests: with elementary_test_context(asset=asset) as ctx: # Using context managers with ctx.boolean_test(name="my_test", description="Inline test") as my_bool_test: - my_bool_test.assert_value(False) + my_bool_test.assert_value(my_test_function()) with ctx.expected_values_test( name="country_count", @@ -83,7 +94,9 @@ with elementary_test_context(asset=asset) as ctx: allow_none=True, metadata={"my_metadata_field": "my_metadata_value"}, ) as my_expected_values_test: + # This will fail my_expected_values_test.assert_value(5) + # This will pass my_expected_values_test.assert_value(3) with ctx.expected_range_test( @@ -103,17 +116,14 @@ with elementary_test_context(asset=asset) as ctx: ## Supported Objects -The SDK supports three types of objects: +The SDK supports reporting table assets and test results. - + Register tables and views in your data warehouse - - Define data quality tests - - - Report test execution results + + Define data quality tests using decorators @@ -143,7 +153,6 @@ except Exception as e: - **Run multiple tests in one context** - All tests in a single `elementary_test_context` are automatically batched - **Use descriptive test names** - Clear names help identify tests in the Elementary UI - **Include asset metadata** - Add descriptions, owners, tags, and dependencies to assets -- **Handle errors gracefully** - Wrap `send_to_cloud` calls in try-except blocks All tests run within a single `elementary_test_context` are automatically batched and sent together. @@ -153,6 +162,5 @@ All tests run within a single `elementary_test_context` are automatically batche - [Test Decorators](/python-sdk/api-reference/test-decorators) - Complete reference for all test decorators - [Table Assets](/python-sdk/api-reference/table-assets) - Learn about table asset structure -- [Tests](/python-sdk/api-reference/tests) - Understand test definitions -- [Test Executions](/python-sdk/api-reference/test-executions) - See how to report test results +- [Quickstart](/python-sdk/quickstart) - Send your first test results to Elementary Cloud diff --git a/docs/python-sdk/api-reference/table-assets.mdx b/docs/python-sdk/api-reference/table-assets.mdx index 36cba445a..e96af1114 100644 --- a/docs/python-sdk/api-reference/table-assets.mdx +++ b/docs/python-sdk/api-reference/table-assets.mdx @@ -17,7 +17,7 @@ asset = TableAsset( description="string", # Optional: Table description owners=["string"], # Optional: List of owners (emails or usernames) tags=["string"], # Optional: List of tags - depends_on=["string"] # Optional: List of upstream asset IDs + depends_on=["string"] # Optional: List of upstream fully qualified table names ) ``` @@ -37,7 +37,7 @@ asset = TableAsset( | `description` | string | Human-readable description of the table | | `owners` | list[string] | List of owners (email addresses or usernames) | | `tags` | list[string] | List of tags for categorization | -| `depends_on` | list[string] | List of upstream asset IDs (e.g., `["prod.public.customers", "prod.public.orders"]`) for lineage tracking | +| `depends_on` | list[string] | List of upstream fully qualified table names (e.g., `["prod.public.customers", "prod.public.orders"]`) for lineage tracking | ## Example @@ -75,7 +75,6 @@ Table assets are updated on each ingest, so include all current metadata in ever ## Related Documentation -- [Tests](/python-sdk/api-reference/tests) - Define tests for your table assets -- [Test Executions](/python-sdk/api-reference/test-executions) - Report test results -- [Sending Data Guide](/python-sdk/guides/sending-data) - Learn how to send table assets +- [Test Decorators](/python-sdk/api-reference/test-decorators) - Define tests for your table assets +- [API Reference](/python-sdk/api-reference/overview) - Overview of the SDK API diff --git a/docs/python-sdk/api-reference/test-decorators.mdx b/docs/python-sdk/api-reference/test-decorators.mdx index 4491f1da2..22c4606d1 100644 --- a/docs/python-sdk/api-reference/test-decorators.mdx +++ b/docs/python-sdk/api-reference/test-decorators.mdx @@ -49,8 +49,8 @@ def test_function(df: pd.DataFrame) -> bool: | `tags` | list[str] | No | `None` | List of tags | | `owners` | list[str] | No | `None` | List of owners | | `metadata` | dict | No | `None` | Additional metadata | -| `quality_dimension` | QualityDimension | No | `None` | Quality dimension (defaults to VALIDITY if column_name is set) | -| `skip` | bool | No | `False` | Whether to skip this test | +| `quality_dimension` | QualityDimension | No | `None` | Quality dimension (defaults to VALIDITY) | +| `skip` | bool | No | `False` | Whether to skip this test. Useful if you want the test to appear in Elementary Cloud, but you don't want to execute it in this run. | ### Example @@ -68,7 +68,7 @@ def test_unique_ids(df: pd.DataFrame) -> bool: ## @expected_range -Tests that return a numeric value that should fall within a range. +Tests that return a numeric value that should fall within a range. They can also return a list of numeric values or a pandas Series. ### Signature @@ -86,9 +86,11 @@ Tests that return a numeric value that should fall within a range. quality_dimension: QualityDimension | None = None, skip: bool = False, ) -def test_function(df: pd.DataFrame) -> float: +def test_function(df: pd.DataFrame) -> float | list[float] | pd.Series: # Your test logic - return 25.5 # Numeric value + return df["age"].mean() # Numeric value + # return [1, 2, 3] # Numeric values + # return df["age"] # pandas Series ``` ### Parameters @@ -98,10 +100,7 @@ def test_function(df: pd.DataFrame) -> float: | `name` | str | Yes | - | Test name | | `min` | float | No | `None` | Minimum expected value (inclusive) | | `max` | float | No | `None` | Maximum expected value (inclusive) | -| `severity` | str | No | `"ERROR"` | Test severity | -| `description` | str | No | `None` | Test description | -| `column_name` | str | No | `None` | Column being tested | -| `tags`, `owners`, `metadata`, `quality_dimension`, `skip` | - | No | - | Same as `@boolean_test` | +| `severity`, `description`, `column_name`, `tags`, `owners`, `metadata`, `quality_dimension`, `skip` | - | No | - | Same as `@boolean_test` | ### Example @@ -120,7 +119,7 @@ def test_average_age(df: pd.DataFrame) -> float: ## @expected_values -Tests that return a value that should match one of a list of expected values. +Tests that return a value (or values) that should match one of a list of expected values. ### Signature @@ -150,10 +149,7 @@ def test_function(df: pd.DataFrame) -> Any: | `name` | str | Yes | - | Test name | | `expected` | Any \| list[Any] | Yes | - | Expected value(s) - can be single value or list | | `allow_none` | bool | No | `False` | Whether to allow None values | -| `severity` | str | No | `"ERROR"` | Test severity | -| `description` | str | No | `None` | Test description | -| `column_name` | str | No | `None` | Column being tested | -| `tags`, `owners`, `metadata`, `quality_dimension`, `skip` | - | No | - | Same as `@boolean_test` | +| `severity`, `description`, `column_name`, `tags`, `owners`, `metadata`, `quality_dimension`, `skip` | - | No | - | Same as `@boolean_test` | ### Example @@ -199,9 +195,7 @@ def test_function(df: pd.DataFrame) -> Sized: | `name` | str | Yes | - | Test name | | `min` | int | No | `None` | Minimum expected row count (inclusive) | | `max` | int | No | `None` | Maximum expected row count (inclusive) | -| `severity` | str | No | `"ERROR"` | Test severity | -| `description` | str | No | `None` | Test description | -| `tags`, `owners`, `metadata`, `skip` | - | No | - | Same as `@boolean_test` | +| `severity`, `description`, `tags`, `owners`, `metadata`, `skip` | - | No | - | Same as `@boolean_test` | ### Example @@ -214,7 +208,7 @@ def test_function(df: pd.DataFrame) -> Sized: description="Validate user count is within expected range", ) def get_users_df(df: pd.DataFrame) -> pd.DataFrame: - """Return the dataframe - decorator calls len() on it.""" + """Return the DataFrame; the decorator calls len() on it.""" return df ``` @@ -240,6 +234,6 @@ All decorators support these common parameters: ## Related Documentation - [Quickstart](/python-sdk/quickstart) - Get started with test decorators -- [Sending Data](/python-sdk/guides/sending-data) - Learn how to send test results -- [Best Practices](/python-sdk/guides/best-practices) - Best practices for using the SDK +- [API Reference](/python-sdk/api-reference/overview) - Overview of the SDK API +- [Table Assets](/python-sdk/api-reference/table-assets) - Register tables and views in your data warehouse diff --git a/docs/python-sdk/api-reference/test-executions.mdx b/docs/python-sdk/api-reference/test-executions.mdx deleted file mode 100644 index 62e28a916..000000000 --- a/docs/python-sdk/api-reference/test-executions.mdx +++ /dev/null @@ -1,156 +0,0 @@ ---- -title: "Test Executions" ---- - -Test executions represent the results of running a test. Send test execution results to track test runs, failures, and performance metrics. - -## TestExecution Object - -```python -from elementary_python_sdk.core.types.test import ( - TestExecution, - TestExecutionStatus, - QualityDimension -) - -test_execution = TestExecution( - id="string", # Required: Unique identifier - test_id="string", # Required: ID of the test - test_sub_unique_id="string", # Required: Sub-test identifier - sub_type="string", # Required: Sub-type of the test - status=TestExecutionStatus.PASS, # Required: Execution status - start_time=datetime, # Required: Test execution start time - quality_dimension=QualityDimension.COMPLETENESS, # Optional: Quality dimension - failure_count=0, # Optional: Number of failures - description="string", # Optional: Execution description - code="string", # Optional: Test code/query - duration_seconds=0.0, # Optional: Execution duration - exception="string", # Optional: Exception message - traceback="string" # Optional: Exception traceback -) -``` - -## Required Fields - -| Field | Type | Description | -|-------|------|-------------| -| `id` | string | Unique identifier for this test execution | -| `test_id` | string | ID of the test that was executed (must match a Test `id`) | -| `test_sub_unique_id` | string | Sub-test identifier (typically same as `test_id` for simple tests) | -| `sub_type` | string | Sub-type of the test (e.g., `"row_count"`, `"null_rate"`, `"uniqueness"`) | -| `status` | TestExecutionStatus | Execution status (see below) | -| `start_time` | datetime | When the test execution started (UTC timezone) | - -## Optional Fields - -| Field | Type | Description | -|-------|------|-------------| -| `quality_dimension` | QualityDimension | Quality dimension being tested (see below) | -| `failure_count` | int | Number of rows or records that failed the test | -| `description` | string | Human-readable description of the execution | -| `code` | string | Test code or SQL query that was executed | -| `duration_seconds` | float | How long the test took to execute (in seconds) | -| `exception` | string | Exception message if the test failed with an error | -| `traceback` | string | Full exception traceback if the test failed | -| `column_name` | string | Column name for column-level test executions | - -## Test Execution Status - -Test executions can have the following statuses: - -- `TestExecutionStatus.PASS` - Test passed successfully -- `TestExecutionStatus.WARN` - Test passed with warnings -- `TestExecutionStatus.FAIL` - Test failed -- `TestExecutionStatus.ERROR` - Test encountered an error -- `TestExecutionStatus.SKIPPED` - Test was skipped -- `TestExecutionStatus.NO_DATA` - Test had no data to check - -## Quality Dimensions - -Quality dimensions categorize the type of data quality being tested: - -- `QualityDimension.COMPLETENESS` - Data completeness (nulls, missing values) -- `QualityDimension.UNIQUENESS` - Data uniqueness (duplicates) -- `QualityDimension.FRESHNESS` - Data freshness (timeliness) -- `QualityDimension.VALIDITY` - Data validity (format, constraints) -- `QualityDimension.ACCURACY` - Data accuracy (correctness) -- `QualityDimension.CONSISTENCY` - Data consistency (across sources) - -## Example - -```python -from elementary_python_sdk.core.types.test import ( - TestExecution, - TestExecutionStatus, - QualityDimension -) -from datetime import datetime, timezone - -# Successful test execution -success_execution = TestExecution( - id="users_row_count_test_exec_20240101_001", - test_id="users_row_count_test", - test_sub_unique_id="users_row_count_test", - sub_type="row_count", - status=TestExecutionStatus.PASS, - start_time=datetime.now(timezone.utc), - quality_dimension=QualityDimension.COMPLETENESS, - failure_count=0, - duration_seconds=1.5, - description="Row count check passed: 10,000 rows found" -) - -# Failed test execution -failed_execution = TestExecution( - id="users_email_uniqueness_test_exec_20240101_001", - test_id="users_email_uniqueness_test", - test_sub_unique_id="users_email_uniqueness_test", - sub_type="uniqueness", - status=TestExecutionStatus.FAIL, - start_time=datetime.now(timezone.utc), - quality_dimension=QualityDimension.UNIQUENESS, - failure_count=5, - duration_seconds=2.3, - description="Found 5 duplicate email addresses", - code="SELECT email, COUNT(*) FROM users GROUP BY email HAVING COUNT(*) > 1" -) - -# Error test execution -error_execution = TestExecution( - id="users_freshness_test_exec_20240101_001", - test_id="users_freshness_test", - test_sub_unique_id="users_freshness_test", - sub_type="freshness", - status=TestExecutionStatus.ERROR, - start_time=datetime.now(timezone.utc), - quality_dimension=QualityDimension.FRESHNESS, - duration_seconds=0.1, - exception="Connection timeout", - traceback="Traceback (most recent call last):\n ..." -) -``` - -## Best Practices - -1. **Use unique execution IDs** - Generate unique IDs for each test execution (e.g., include timestamp) - -2. **Link to tests** - Always ensure `test_id` matches an existing Test `id` - -3. **Include timing information** - Set `start_time` and `duration_seconds` for performance monitoring - -4. **Report failures accurately** - Set `failure_count` to the actual number of failed rows/records - -5. **Include error details** - For failed tests, include `exception` and `traceback` for debugging - -6. **Set quality dimensions** - Assign appropriate quality dimensions to enable filtering and reporting - - -Test executions are upserted, so you can send the same execution multiple times and it will be updated. - - -## Related Documentation - -- [Tests](/python-sdk/api-reference/tests) - Define tests to execute -- [Table Assets](/python-sdk/api-reference/table-assets) - Create table assets -- [Sending Data Guide](/python-sdk/guides/sending-data) - Learn how to send test executions - diff --git a/docs/python-sdk/api-reference/tests.mdx b/docs/python-sdk/api-reference/tests.mdx deleted file mode 100644 index fd5c870bf..000000000 --- a/docs/python-sdk/api-reference/tests.mdx +++ /dev/null @@ -1,113 +0,0 @@ ---- -title: "Tests" ---- - -Tests define data quality checks for your table assets. Create tests to monitor data quality metrics and detect issues. - -## Test Object - -```python -from elementary_python_sdk.core.types.test import Test, TestSeverity - -test = Test( - id="string", # Required: Unique identifier - name="string", # Required: Test name - test_type="string", # Required: Type of test - asset_id="string", # Required: ID of the table asset - severity=TestSeverity.ERROR, # Required: Test severity - description="string", # Optional: Test description - column_name="string", # Optional: Column name (for column-level tests) - config={}, # Optional: Test configuration - meta={} # Optional: Additional metadata -) -``` - -## Required Fields - -| Field | Type | Description | -|-------|------|-------------| -| `id` | string | Unique identifier for the test | -| `name` | string | Display name for the test | -| `test_type` | string | Type of test (e.g., `"custom"`, `"row_count"`, `"null_rate"`, `"uniqueness"`) | -| `asset_id` | string | ID of the table asset this test is associated with (must match a TableAsset `id`) | -| `severity` | TestSeverity | Severity level: `TestSeverity.ERROR` or `TestSeverity.WARNING` | - -## Optional Fields - -| Field | Type | Description | -|-------|------|-------------| -| `description` | string | Human-readable description of what the test checks | -| `column_name` | string | Column name for column-level tests (leave `None` for table-level tests) | -| `config` | dict | Test-specific configuration parameters | -| `meta` | dict | Additional metadata for the test | - -## Test Severity - -Tests can have two severity levels: - -- `TestSeverity.ERROR` - Critical issues that should be addressed immediately -- `TestSeverity.WARNING` - Non-critical issues that should be monitored - -## Test Types - -Common test types include: - -- `"custom"` - Custom test logic -- `"row_count"` - Check row count -- `"null_rate"` - Check null percentage -- `"uniqueness"` - Check for duplicate values -- `"freshness"` - Check data freshness -- `"volume_anomaly"` - Detect volume anomalies -- `"freshness_anomaly"` - Detect freshness anomalies - -## Example - -```python -from elementary_python_sdk.core.types.test import Test, TestSeverity - -# Table-level test -table_test = Test( - id="users_row_count_test", - name="Users table row count check", - test_type="row_count", - asset_id="analytics.public.users", - severity=TestSeverity.ERROR, - description="Ensures the users table has at least 1000 rows", - config={"min_rows": 1000} -) - -# Column-level test -column_test = Test( - id="users_email_uniqueness_test", - name="Email uniqueness check", - test_type="uniqueness", - asset_id="analytics.public.users", - severity=TestSeverity.ERROR, - description="Ensures email addresses are unique", - column_name="email", - config={} -) -``` - -## Best Practices - -1. **Use descriptive names** - Choose clear, descriptive test names that explain what the test checks - -2. **Link to assets** - Always ensure the `asset_id` matches an existing table asset ID - -3. **Set appropriate severity** - Use `ERROR` for critical checks and `WARNING` for monitoring - -4. **Include descriptions** - Add descriptions to help your team understand test purpose - -5. **Use column_name for column tests** - Set `column_name` for column-level tests, leave `None` for table-level tests - - -Tests are replaced on each ingest, so include all current test definitions in every request. - - -## Related Documentation - -- [Table Assets](/python-sdk/api-reference/table-assets) - Create table assets to test -- [Test Executions](/python-sdk/api-reference/test-executions) - Report test execution results -- [Sending Data Guide](/python-sdk/guides/sending-data) - Learn how to send tests - diff --git a/docs/python-sdk/guides/authentication.mdx b/docs/python-sdk/guides/authentication.mdx deleted file mode 100644 index dbe6d7f40..000000000 --- a/docs/python-sdk/guides/authentication.mdx +++ /dev/null @@ -1,125 +0,0 @@ ---- -title: "Authentication" ---- - -The Elementary Python SDK uses API key authentication to securely send data to Elementary Cloud. - -## Getting Your Credentials - -### Generate an Access Token - -You can generate tokens directly from the Elementary UI: - -1. Go to [User → Personal Tokens](https://app.elementary-data.com/settings/user-tokens) or [Account → Account Tokens](https://app.elementary-data.com/settings/account-tokens) -2. Click **Generate token** -3. (Optional) Add a name/description for the token -4. Copy the token and store it securely — **it is shown only once** - -### Required Credentials - -You'll need the following: -- **Project ID** - Your Python project identifier (this will appear in the metadata of the assets you report) -- **API Key** - Your API token (generated from the steps above) -- **URL** - The full SDK ingest endpoint URL: `{base_url}/sdk-ingest/{env_id}/batch` - - -The same token generation process is used for other Elementary integrations. See the [MCP Setup Guide](/cloud/mcp/setup-guide#1--generate-an-access-token) for more details. - - -## Configuring Authentication - -### Using the Client - -When initializing the `ElementaryCloudClient`, provide your credentials: - -```python -from elementary_python_sdk.core.cloud.cloud_client import ElementaryCloudClient - -PROJECT_ID = "my-python-project" # Your Python project identifier (used to deduplicate and identify assets) -API_KEY = "your-api-token" -URL = "https://app.elementary-data.com/sdk-ingest/{env_id}/batch" - -client = ElementaryCloudClient(PROJECT_ID, API_KEY, URL) -``` - -**Note:** -- The `URL` parameter should be the full endpoint URL including the environment ID. Replace `{env_id}` with your actual environment ID. -- `PROJECT_ID` is your Python project identifier (chosen by you, used to deduplicate and identify reported assets in Elementary Cloud). - -### Using Environment Variables - -For better security, use environment variables: - -```python -import os -from elementary_python_sdk.core.cloud.cloud_client import ElementaryCloudClient - -PROJECT_ID = os.getenv("ELEMENTARY_PROJECT_ID") -API_KEY = os.getenv("ELEMENTARY_API_KEY") -URL = os.getenv("ELEMENTARY_URL") # Full endpoint URL: https://app.elementary-data.com/sdk-ingest/{env_id}/batch - -client = ElementaryCloudClient(PROJECT_ID, API_KEY, URL) -``` - -Set the environment variables: - -```bash -export ELEMENTARY_PROJECT_ID="your-project-id" -export ELEMENTARY_API_KEY="your-api-token" -export ELEMENTARY_URL="https://app.elementary-data.com/sdk-ingest/{env_id}/batch" -``` - -### Using a Configuration File - -You can also store credentials in a configuration file (not recommended for production): - -```python -import json -from elementary_python_sdk.core.cloud.cloud_client import ElementaryCloudClient - -with open("elementary_config.json") as f: - config = json.load(f) - -client = ElementaryCloudClient( - config["project_id"], - config["api_key"], - config["url"] # Full endpoint URL -) -``` - -## Security Best Practices - - -Never commit API tokens to version control. Always use environment variables or secrets management systems. - - -1. **Use environment variables** - Store tokens in environment variables, not in code -2. **Use secrets management** - For production, use services like AWS Secrets Manager, HashCorp Vault, or similar -3. **Rotate tokens regularly** - Periodically rotate your API tokens for better security -4. **Limit token scope** - Create API tokens with minimal required permissions -5. **Monitor usage** - Regularly check API token usage in your Elementary Cloud account - -## Troubleshooting - -### 401 Unauthorized - -If you receive a 401 error: - -- Verify your API token is correct -- Check that the token hasn't been revoked -- Ensure you're using the correct URL - -### 404 Not Found - -If you receive a 404 error: - -- Verify your URL is correct and includes the correct environment ID -- Check that the environment exists in your Elementary Cloud account -- Ensure you have access to the environment - -## Related Documentation - -- [Quickstart](/python-sdk/quickstart) - Get started with the SDK -- [Sending Data](/python-sdk/guides/sending-data) - Learn how to send data -- [Best Practices](/python-sdk/guides/best-practices) - Best practices for using the SDK - diff --git a/docs/python-sdk/guides/best-practices.mdx b/docs/python-sdk/guides/best-practices.mdx deleted file mode 100644 index 8f0199a1b..000000000 --- a/docs/python-sdk/guides/best-practices.mdx +++ /dev/null @@ -1,296 +0,0 @@ ---- -title: "Best Practices" ---- - -Follow these best practices to get the most out of the Elementary Python SDK and ensure reliable data quality monitoring. - -## Object ID Management - -### Use Consistent ID Formats - -Use a consistent format for all object IDs across your application: - -```python -# Good: Consistent format -table_id = f"{database}.{schema}.{table}" -test_id = f"{table_id}_{test_name}" -execution_id = f"{test_id}_exec_{timestamp}" - -# Bad: Inconsistent formats -table_id = f"{table}" # Missing database and schema -test_id = f"test_{random_id}" # Random IDs are hard to track -``` - -### Make IDs Deterministic - -Generate IDs deterministically so the same object always has the same ID: - -```python -# Good: Deterministic ID -table_id = f"{database}.{schema}.{table}" - -# Bad: Non-deterministic ID -table_id = f"table_{uuid.uuid4()}" # Changes every time -``` - -## Data Freshness - -### Send Timestamps Accurately - -Always use UTC timestamps and include timezone information: - -```python -from datetime import datetime, timezone - -# Good: UTC timezone -timestamp = datetime.now(timezone.utc) - -# Bad: Local timezone without conversion -timestamp = datetime.now() # May not be UTC -``` - -### Update Objects Regularly - -Send updated objects when metadata changes: - -```python -# When table description changes, send updated table asset -updated_table_asset = TableAsset( - id=table_id, - name=table_name, - # ... other fields ... - description="Updated description", # New description - # ... other fields ... -) -``` - -## Test Execution Reporting - -### Report All Test Runs - -Send test execution results for both passing and failing tests: - -```python -# Good: Report all results -if test_passed: - status = TestExecutionStatus.PASS -else: - status = TestExecutionStatus.FAIL - -test_execution = TestExecution( - # ... fields ... - status=status, - failure_count=failure_count -) -``` - -### Include Detailed Failure Information - -For failed tests, include as much detail as possible: - -```python -test_execution = TestExecution( - # ... required fields ... - status=TestExecutionStatus.FAIL, - failure_count=failed_row_count, - description=f"Found {failed_row_count} rows that failed validation", - code=test_query, # Include the query that was run - exception=error_message if error_occurred else None, - traceback=traceback_string if error_occurred else None -) -``` - -## Error Handling - -### Implement Robust Error Handling - -Always wrap SDK calls in try-except blocks: - -```python -try: - client.ingest(request) -except ElementaryAPIError as e: - logger.error(f"API error: {e.status_code} - {e.message}") - # Handle API errors appropriately -except Exception as e: - logger.error(f"Unexpected error: {e}") - # Handle unexpected errors -``` - -### Use Retry Logic - -Implement retry logic for transient failures: - -```python -from tenacity import retry, stop_after_attempt, wait_exponential - -@retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=2, max=10) -) -def send_with_retry(client, request): - client.ingest(request) -``` - -## Performance Optimization - -### Batch Objects Efficiently - -Collect objects and send them in batches: - -```python -# Good: Batch multiple objects -objects = [] -for table in tables: - objects.append(create_table_asset(table)) - objects.extend(create_tests_for_table(table)) - objects.extend(get_recent_executions(table)) - -request = ElementaryCloudIngestRequest( - project="my-project", - timestamp=datetime.now(timezone.utc), - objects=objects -) -client.ingest(request) -``` - -### Avoid Redundant Sends - -Don't send the same data multiple times unnecessarily: - -```python -# Good: Track what's been sent -sent_executions = set() - -for execution in new_executions: - if execution.id not in sent_executions: - objects.append(execution) - sent_executions.add(execution.id) -``` - -## Security - -### Protect API Keys - -Never hardcode API keys or commit them to version control: - -```python -# Good: Use environment variables -import os -api_key = os.getenv("ELEMENTARY_API_KEY") - -# Bad: Hardcoded key -api_key = "sk-1234567890abcdef" # Never do this! -``` - -### Use Secrets Management - -For production, use proper secrets management: - -```python -# Example with AWS Secrets Manager -import boto3 - -def get_api_key(): - client = boto3.client('secretsmanager') - response = client.get_secret_value(SecretId='elementary/api-key') - return response['SecretString'] -``` - -## Monitoring and Observability - -### Log SDK Operations - -Log important SDK operations for debugging: - -```python -import logging - -logger = logging.getLogger(__name__) - -try: - client.ingest(request) - logger.info(f"Successfully sent {len(request.objects)} objects") -except Exception as e: - logger.error(f"Failed to send objects: {e}", exc_info=True) -``` - -### Track Metrics - -Monitor SDK usage and performance: - -```python -import time -from datetime import datetime, timezone - -start_time = time.time() -try: - client.ingest(request) - duration = time.time() - start_time - logger.info(f"Ingest completed in {duration:.2f}s") -except Exception as e: - duration = time.time() - start_time - logger.error(f"Ingest failed after {duration:.2f}s: {e}") -``` - -## Code Organization - -### Create Helper Functions - -Organize SDK usage into reusable functions: - -```python -class ElementaryReporter: - def __init__(self, api_key, env_id, base_url): - self.client = ElementaryClient( - api_key=api_key, - base_url=base_url, - env_id=env_id - ) - self.project = "my-project" - - def report_table(self, table_info): - """Report a table asset""" - table_asset = TableAsset( - id=f"{table_info.database}.{table_info.schema}.{table_info.table}", - name=table_info.table, - database_name=table_info.database, - schema_name=table_info.schema, - table_name=table_info.table, - db_type=table_info.db_type, - description=table_info.description, - owners=table_info.owners, - tags=table_info.tags - ) - self._send([table_asset]) - - def report_test_result(self, test_id, result): - """Report a test execution result""" - execution = TestExecution( - id=f"{test_id}_exec_{int(result.timestamp.timestamp())}", - test_id=test_id, - test_sub_unique_id=test_id, - sub_type=result.test_type, - status=result.status, - start_time=result.timestamp, - failure_count=result.failure_count, - duration_seconds=result.duration - ) - self._send([execution]) - - def _send(self, objects): - """Internal method to send objects""" - request = ElementaryCloudIngestRequest( - project=self.project, - timestamp=datetime.now(timezone.utc), - objects=objects - ) - self.client.ingest(request) -``` - -## Related Documentation - -- [Quickstart](/python-sdk/quickstart) - Get started with the SDK -- [Sending Data](/python-sdk/guides/sending-data) - Learn how to send data -- [API Reference](/python-sdk/api-reference/overview) - Full API documentation - diff --git a/docs/python-sdk/guides/sending-data.mdx b/docs/python-sdk/guides/sending-data.mdx deleted file mode 100644 index be965c08f..000000000 --- a/docs/python-sdk/guides/sending-data.mdx +++ /dev/null @@ -1,201 +0,0 @@ ---- -title: "Sending Data" ---- - -Learn how to efficiently send data to Elementary Cloud using the Python SDK, including batching, error handling, and performance optimization. - -## Basic Usage - -The simplest way to send data is using the test context and client: - -```python -from elementary_python_sdk.core.cloud.cloud_client import ElementaryCloudClient -from elementary_python_sdk.core.tests import ( - boolean_test, - elementary_test_context, -) -from elementary_python_sdk.core.types.asset import TableAsset -import pandas as pd - -@boolean_test( - name="unique_ids", - description="All user IDs must be unique", - severity="ERROR", -) -def test_unique_ids(df: pd.DataFrame) -> bool: - return len(df["id"]) == len(df["id"].unique()) - -# Define your asset -asset = TableAsset( - name="users", - database_name="prod", - schema_name="public", - table_name="users" -) - -# Initialize client -PROJECT_ID = "my-python-project" # Your Python project identifier (used to deduplicate and identify assets) -API_KEY = "your-api-key" -URL = "https://app.elementary-data.com/sdk-ingest/{env_id}/batch" - -client = ElementaryCloudClient(PROJECT_ID, API_KEY, URL) - -# Run tests and send results -with elementary_test_context(asset=asset) as ctx: - users_df = pd.DataFrame({"id": [1, 2, 3]}) - test_unique_ids(users_df) - client.send_to_cloud(ctx) -``` - -## Running Multiple Tests - -Run multiple tests in a single context and send them together: - -```python -from elementary_python_sdk.core.cloud.cloud_client import ElementaryCloudClient -from elementary_python_sdk.core.tests import ( - boolean_test, - elementary_test_context, - expected_range, -) -from elementary_python_sdk.core.types.asset import TableAsset -import pandas as pd - -@boolean_test(name="unique_ids", severity="ERROR") -def test_unique_ids(df: pd.DataFrame) -> bool: - return len(df["id"]) == len(df["id"].unique()) - -@expected_range(name="average_age", min=18, max=50, severity="ERROR") -def test_average_age(df: pd.DataFrame) -> float: - return df["age"].mean() - -@boolean_test(name="no_nulls", severity="ERROR") -def test_no_nulls(df: pd.DataFrame) -> bool: - return df["email"].notna().all() - -# Define your asset -asset = TableAsset( - name="users", - database_name="prod", - schema_name="public", - table_name="users" -) - -# Initialize client -PROJECT_ID = "my-python-project" # Your Python project identifier (used to deduplicate and identify assets) -API_KEY = "your-api-key" -URL = "https://app.elementary-data.com/sdk-ingest/{env_id}/batch" - -client = ElementaryCloudClient(PROJECT_ID, API_KEY, URL) - -# Run all tests in a single context -with elementary_test_context(asset=asset) as ctx: - users_df = pd.DataFrame({ - "id": [1, 2, 3], - "age": [25, 30, 35], - "email": ["a@example.com", "b@example.com", "c@example.com"] - }) - - # Run all tests - all results captured in context - test_unique_ids(users_df) - test_average_age(users_df) - test_no_nulls(users_df) - - # Send all results in one request - client.send_to_cloud(ctx) -``` - - -All tests run within a single `elementary_test_context` are automatically batched and sent together. - - -## Error Handling - -Always include error handling when sending data: - -```python -try: - client.send_to_cloud(ctx) - print("Data sent successfully") -except Exception as e: - print(f"Error sending data: {e}") - # The SDK will log detailed error information -``` - -## Retry Logic - -Implement retry logic for transient failures: - -```python -import time - -def send_with_retry(client, ctx, max_retries=3): - for attempt in range(max_retries): - try: - client.send_to_cloud(ctx) - return - except Exception as e: - if attempt < max_retries - 1: - # Retry on errors with exponential backoff - wait_time = 2 ** attempt - time.sleep(wait_time) - continue - else: - raise - -send_with_retry(client, ctx) -``` - -## Using Context Managers for Inline Tests - -You can create tests inline using context managers without decorators: - -```python -with elementary_test_context(asset=asset) as ctx: - # Inline boolean test - with ctx.boolean_test(name="my_test", description="Inline test") as my_bool_test: - my_bool_test.assert_value(False) - - # Inline expected values test - with ctx.expected_values_test( - name="country_count", - expected=[2, 3], - allow_none=True, - metadata={"my_metadata_field": "my_metadata_value"}, - ) as my_expected_values_test: - my_expected_values_test.assert_value(5) # Will fail - my_expected_values_test.assert_value(3) # Will pass - - # Inline expected range test - with ctx.expected_range_test( - name="age_range", - min=18, - max=50, - ) as my_range_test: - my_range_test.assert_value(25.5) # Will pass - - # Inline row count test - with ctx.row_count_test( - name="row_count", - min=1, - max=1000, - ) as my_row_count_test: - my_row_count_test.assert_value(users_df) # Passes DataFrame, list, etc. - - client.send_to_cloud(ctx) -``` - -## Performance Tips - -1. **Batch requests** - Send multiple objects in a single request -2. **Use async when possible** - For high-volume scenarios, consider async operations -3. **Don't send duplicates** - Avoid sending the same data multiple times -4. **Update incrementally** - Only send changed data, not everything every time -5. **Monitor API usage** - Track your API usage to avoid rate limits - -## Related Documentation - -- [API Reference](/python-sdk/api-reference/overview) - Full API documentation -- [Best Practices](/python-sdk/guides/best-practices) - Best practices for using the SDK -- [Authentication](/python-sdk/guides/authentication) - Authentication setup - diff --git a/docs/python-sdk/guides/test-decorators.mdx b/docs/python-sdk/guides/test-decorators.mdx deleted file mode 100644 index fa142e7a2..000000000 --- a/docs/python-sdk/guides/test-decorators.mdx +++ /dev/null @@ -1,268 +0,0 @@ ---- -title: "Test Decorators" ---- - -The Elementary Python SDK provides decorators to define data quality tests. These decorators automatically capture test results and metadata when tests are run within an `elementary_test_context`. - -## Available Decorators - -### @boolean_test - -Use `@boolean_test` for tests that return a boolean (pass/fail) result. - -```python -from elementary_python_sdk.core.tests import boolean_test -import pandas as pd - -@boolean_test( - name="unique_ids", - severity="ERROR", - description="All user IDs must be unique", - column_name="id", -) -def test_unique_ids(df: pd.DataFrame) -> bool: - ids = df["id"].dropna().tolist() - return len(ids) == len(set(ids)) -``` - -**Parameters:** -- `name` (required): Test name -- `severity` (optional): Test severity - `"ERROR"` or `"WARNING"` (default: `"ERROR"`) -- `description` (optional): Test description -- `column_name` (optional): Column being tested - -### @expected_range - -Use `@expected_range` for tests that return a numeric value that should fall within a range. - -```python -from elementary_python_sdk.core.tests import expected_range -import pandas as pd - -@expected_range( - name="average_age", - min=18, - max=50, - severity="ERROR", - description="Average age should be between 18 and 50", - column_name="age", -) -def test_average_age(df: pd.DataFrame) -> float: - return df["age"].mean() -``` - -**Parameters:** -- `name` (required): Test name -- `min` (required): Minimum expected value -- `max` (required): Maximum expected value -- `severity` (optional): Test severity - `"ERROR"` or `"WARNING"` (default: `"ERROR"`) -- `description` (optional): Test description -- `column_name` (optional): Column being tested - -### @row_count - -Use `@row_count` to validate the number of rows in a DataFrame. - -```python -from elementary_python_sdk.core.tests import row_count -import pandas as pd - -@row_count( - name="user_count_range", - min=1, - max=1000000, - severity="WARNING", - description="Validate user count is within expected range", -) -def get_users_df(df: pd.DataFrame) -> pd.DataFrame: - """Return the dataframe - decorator calls len() on it.""" - return df -``` - -**Parameters:** -- `name` (required): Test name -- `min` (required): Minimum expected row count -- `max` (required): Maximum expected row count -- `severity` (optional): Test severity - `"ERROR"` or `"WARNING"` (default: `"ERROR"`) -- `description` (optional): Test description - -### @expected_values - -Use `@expected_values` to validate that a value matches one of the expected values. - -```python -from elementary_python_sdk.core.tests import expected_values -import pandas as pd - -@expected_values( - name="country_count", - expected=2, - severity="ERROR", - description="Should have exactly 2 countries", - column_name="country", -) -def count_unique_countries(df: pd.DataFrame) -> int: - return df["country"].nunique() -``` - -**Parameters:** -- `name` (required): Test name -- `expected` (required): Expected value or list of expected values -- `severity` (optional): Test severity - `"ERROR"` or `"WARNING"` (default: `"ERROR"`) -- `description` (optional): Test description -- `column_name` (optional): Column being tested - -## Using Decorators with Test Context - -Tests decorated with SDK decorators must be run within an `elementary_test_context`: - -```python -from elementary_python_sdk.core.cloud.cloud_client import ElementaryCloudClient -from elementary_python_sdk.core.tests import ( - boolean_test, - elementary_test_context, - expected_range, -) -from elementary_python_sdk.core.types.asset import TableAsset -import pandas as pd - -@boolean_test( - name="unique_ids", - description="All user IDs must be unique", - severity="ERROR", -) -def test_unique_ids(df: pd.DataFrame) -> bool: - return len(df["id"]) == len(df["id"].unique()) - -@expected_range( - name="average_age", - min=18, - max=50, - severity="ERROR", -) -def test_average_age(df: pd.DataFrame) -> float: - return df["age"].mean() - -# Define your asset -asset = TableAsset( - name="users", - database_name="prod", - schema_name="public", - table_name="users" -) - -# Run tests within context -with elementary_test_context(asset=asset) as ctx: - users_df = pd.DataFrame({"id": [1, 2, 3], "age": [25, 30, 35]}) - - # Run tests - results are automatically captured - test_unique_ids(users_df) - test_average_age(users_df) - - # Send results to Elementary Cloud - PROJECT_ID = "my-python-project" # Your Python project identifier (used to deduplicate and identify assets) - API_KEY = "your-api-key" - URL = "https://app.elementary-data.com/sdk-ingest/{env_id}/batch" - - client = ElementaryCloudClient(PROJECT_ID, API_KEY, URL) - client.send_to_cloud(ctx) -``` - -## Using Context Manager Tests - -You can also create tests directly within the context using context managers: - -```python -with elementary_test_context(asset=asset) as ctx: - # Using boolean_test context manager - with ctx.boolean_test(name="my_test", description="Test description") as my_bool_test: - my_bool_test.assert_value(False) # Assert a boolean value - - # Using expected_values_test context manager - with ctx.expected_values_test( - name="country_count", - expected=[2, 3], - allow_none=True, - metadata={"my_metadata_field": "my_metadata_value"}, - ) as my_expected_values_test: - my_expected_values_test.assert_value(5) # This will fail (not in expected list) - my_expected_values_test.assert_value(3) # This will pass - - # Using expected_range_test context manager - with ctx.expected_range_test( - name="age_range", - min=18, - max=50, - ) as my_range_test: - my_range_test.assert_value(25.5) # Assert a numeric value - - # Using row_count_test context manager - with ctx.row_count_test( - name="row_count", - min=1, - max=1000, - ) as my_row_count_test: - my_row_count_test.assert_value(users_df) # Assert a Sized object - - PROJECT_ID = "your-project-id" - API_KEY = "your-api-key" - URL = "https://app.elementary-data.com/sdk-ingest/{env_id}/batch" - - client = ElementaryCloudClient(PROJECT_ID, API_KEY, URL) - client.send_to_cloud(ctx) -``` - -## Framework Integration - -The SDK works with any Python testing framework. You can wrap existing test functions: - -```python -from elementary_python_sdk.core.tests import boolean_test - -# Existing Great Expectations test -def ge_test(df): - # Your Great Expectations code here - return result - -# Wrap it with Elementary decorator -@boolean_test( - name="ge_wrapped_test", - description="Great Expectations test", - severity="ERROR", -) -def elementary_wrapped_test(df): - return ge_test(df) -``` - -## Error Handling in Tests - -Tests can raise exceptions, which will be automatically captured and reported: - -```python -@expected_range( - name="average_age", - min=18, - max=50, - description="Average age should be between 18 and 50", -) -def test_average_age(df: pd.DataFrame) -> float: - # If an exception is raised, it will be captured and reported as a test error - if df.empty: - raise ValueError("DataFrame is empty") - return df["age"].mean() -``` - -## Best Practices - -1. **Use descriptive names** - Choose clear test names that explain what's being validated -2. **Add descriptions** - Include descriptions to help your team understand test purpose -3. **Specify column names** - Set `column_name` for column-level tests -4. **Run in context** - Always run decorated tests within `elementary_test_context` -5. **Send results** - Call `client.send_to_cloud(ctx)` to report results to Elementary - -## Related Documentation - -- [Quickstart](/python-sdk/quickstart) - Get started with test decorators -- [Sending Data](/python-sdk/guides/sending-data) - Learn how to send test results -- [Best Practices](/python-sdk/guides/best-practices) - Best practices for using the SDK - diff --git a/docs/python-sdk/installation.mdx b/docs/python-sdk/installation.mdx index 8a83b8681..697d55ddd 100644 --- a/docs/python-sdk/installation.mdx +++ b/docs/python-sdk/installation.mdx @@ -58,6 +58,5 @@ Keep your API key secure. Never commit it to version control. Use environment va ## Next Steps - [Quickstart Guide](/python-sdk/quickstart) - Send your first data to Elementary Cloud -- [Authentication Guide](/python-sdk/guides/authentication) - Learn about authentication options - [API Reference](/python-sdk/api-reference/overview) - Explore the full API documentation diff --git a/docs/python-sdk/introduction.mdx b/docs/python-sdk/introduction.mdx index 16bb9f178..a76d5476b 100644 --- a/docs/python-sdk/introduction.mdx +++ b/docs/python-sdk/introduction.mdx @@ -32,7 +32,7 @@ The SDK allows you to: - **Framework-agnostic** - Works with any Python testing framework (Great Expectations, DQX, custom code) - **Decorator-based API** - Simple decorators to define tests (`@boolean_test`, `@expected_range`, etc.) - **Context management** - Use `elementary_test_context` to automatically capture test results -- **Unified observability** - Python tests appear alongside dbt tests and cloud tests in Elementary +- **Unified observability** - Python tests appear alongside dbt tests and cloud tests in Elementary Cloud - **Full lineage** - Connect Python assets to dbt models, warehouse tables, and ML outputs ## Use Cases diff --git a/docs/python-sdk/quickstart.mdx b/docs/python-sdk/quickstart.mdx index 968815d7b..ed6def244 100644 --- a/docs/python-sdk/quickstart.mdx +++ b/docs/python-sdk/quickstart.mdx @@ -19,6 +19,8 @@ from elementary_python_sdk.core.tests import ( boolean_test, elementary_test_context, expected_range, + expected_values, + row_count, ) from elementary_python_sdk.core.types.asset import TableAsset ``` @@ -59,20 +61,20 @@ def test_average_age(df: pd.DataFrame) -> float: severity="WARNING", description="Validate user count is within expected range", ) -def get_users_df(df: pd.DataFrame) -> pd.DataFrame: - """Return the dataframe - decorator calls len() on it.""" +def test_users_row_count(df: pd.DataFrame) -> pd.DataFrame: + """Return the DataFrame; the decorator calls len() on it.""" return df # Define an expected values test @expected_values( - name="country_count", - expected=2, + name="only_valid_countries", + expected=["Germany", "France", "Italy"], severity="ERROR", - description="Should have exactly 2 countries", + description="Should contain only valid countries", column_name="country", ) -def count_unique_countries(df: pd.DataFrame) -> int: - return df["country"].nunique() +def test_only_valid_countries(df: pd.DataFrame) -> pd.Series: + return df["country"] ``` ## Step 4: Create Your Data Asset @@ -109,6 +111,8 @@ def main(): # Run tests - results are automatically captured test_average_age(users_df) test_unique_ids(users_df) + test_users_row_count(users_df) + test_only_valid_countries(users_df) # Send results to Elementary Cloud PROJECT_ID = "my-python-project" # Your Python project identifier (used to deduplicate and identify assets) @@ -191,15 +195,15 @@ def main(): client = ElementaryCloudClient(PROJECT_ID, API_KEY, URL) client.send_to_cloud(ctx) + +if __name__ == "__main__": + main() ``` **Note:** - Replace `API_KEY` and `URL` with your actual credentials. The `URL` should be the full SDK ingest endpoint including your environment ID. - `PROJECT_ID` is your Python project identifier - choose any string to identify your code project. This will appear in the metadata of assets you report and is used for deduplication. -if __name__ == "__main__": - main() -``` ## What Happens Next? @@ -214,6 +218,6 @@ Once you send test results to Elementary Cloud: ## What's Next? - [API Reference](/python-sdk/api-reference/overview) - Learn about all available objects and methods -- [Sending Data Guide](/python-sdk/guides/sending-data) - Best practices for sending data -- [Best Practices](/python-sdk/guides/best-practices) - Tips for using the SDK effectively +- [Test Decorators](/python-sdk/api-reference/test-decorators) - Complete reference for all test decorators +- [Table Assets](/python-sdk/api-reference/table-assets) - Learn about table asset structure