Add ClickHouse data catalog #11858
Conversation
|
All contributors have signed the CLA ✍️ ✅ |
|
🔒 Entelligence AI Vulnerability Scanner ✅ No security vulnerabilities found! Your code passed our comprehensive security analysis. 📊 Files Analyzed: 1 files |
| type as data_type, | ||
| comment as column_description, | ||
| default_expression as column_default, | ||
| CASE WHEN is_in_primary_key = 1 THEN 0 ELSE 1 END as is_nullable |
There was a problem hiding this comment.
correctness: is_nullable in meta_get_columns is set based on is_in_primary_key, which is incorrect and will mislabel non-primary key columns as nullable even if they are NOT NULL.
🤖 AI Agent Prompt for Cursor/Windsurf
📋 Copy this prompt to your AI coding assistant (Cursor, Windsurf, etc.) to get help fixing this issue
In mindsdb/integrations/handlers/clickhouse_handler/clickhouse_handler.py, lines 222-225, the `meta_get_columns` method incorrectly sets `is_nullable` using `CASE WHEN is_in_primary_key = 1 THEN 0 ELSE 1 END`, which does not accurately reflect column nullability. Replace this with `is_nullable as is_nullable` to correctly report the nullability status from ClickHouse system columns.
📝 Committable Code Suggestion
‼️ Ensure you review the code suggestion before committing it to the branch. Make sure it replaces the highlighted code, contains no missing lines, and has no issues with indentation.
| type as data_type, | |
| comment as column_description, | |
| default_expression as column_default, | |
| CASE WHEN is_in_primary_key = 1 THEN 0 ELSE 1 END as is_nullable | |
| comment as column_description, | |
| default_expression as column_default, | |
| is_nullable as is_nullable |
| def meta_get_column_statistics_for_table( | ||
| self, table_name: str, column_names: Optional[List[str]] = None | ||
| ) -> Response: | ||
| """ | ||
| Retrieves column statistics for a specific table. | ||
|
|
||
| Args: | ||
| table_name (str): The name of the table. | ||
| column_names (Optional[List[str]]): List of column names to retrieve statistics for. | ||
| If None, statistics for all columns will be returned. | ||
|
|
||
| Returns: | ||
| Response: A response object containing the column statistics for the table. | ||
| """ | ||
| database = self.connection_data['database'] | ||
|
|
||
| # Get the list of columns for this table | ||
| columns_query = f""" | ||
| SELECT name, type | ||
| FROM system.columns | ||
| WHERE database = '{database}' AND table = '{table_name}' | ||
| """ | ||
|
|
||
| if column_names: | ||
| quoted_names = [f"'{c}'" for c in column_names] | ||
| columns_query += f" AND name IN ({','.join(quoted_names)})" | ||
|
|
||
| try: | ||
| columns_result = self.native_query(columns_query) | ||
|
|
||
| if columns_result.resp_type == RESPONSE_TYPE.ERROR or columns_result.data_frame.empty: | ||
| logger.warning(f"No columns found for table {table_name}") | ||
| return Response(RESPONSE_TYPE.TABLE, pd.DataFrame()) | ||
|
|
||
| # Build statistics query - collect all stats in one query | ||
| select_parts = [] | ||
| for _, row in columns_result.data_frame.iterrows(): | ||
| col = row['name'] | ||
| # Use backticks to handle special characters in column names | ||
| select_parts.extend([ | ||
| f"countIf(`{col}` IS NULL) AS nulls_{col}", | ||
| f"uniq(`{col}`) AS distincts_{col}", | ||
| f"toString(min(`{col}`)) AS min_{col}", | ||
| f"toString(max(`{col}`)) AS max_{col}", | ||
| ]) | ||
|
|
||
| if not select_parts: | ||
| return Response(RESPONSE_TYPE.TABLE, pd.DataFrame()) | ||
|
|
||
| # Build the query to get stats for all columns at once | ||
| stats_query = f""" | ||
| SELECT | ||
| count(*) AS total_rows, | ||
| {', '.join(select_parts)} | ||
| FROM `{database}`.`{table_name}` | ||
| """ | ||
|
|
||
| stats_result = self.native_query(stats_query) | ||
|
|
||
| if stats_result.resp_type != RESPONSE_TYPE.TABLE or stats_result.data_frame.empty: | ||
| logger.warning(f"Could not retrieve stats for table {table_name}") | ||
| # Return placeholder stats | ||
| placeholder_data = [] | ||
| for _, row in columns_result.data_frame.iterrows(): | ||
| placeholder_data.append({ | ||
| 'table_name': table_name, | ||
| 'column_name': row['name'], | ||
| 'null_percentage': None, | ||
| 'distinct_values_count': None, | ||
| 'most_common_values': None, | ||
| 'most_common_frequencies': None, | ||
| 'minimum_value': None, | ||
| 'maximum_value': None, | ||
| }) | ||
| return Response(RESPONSE_TYPE.TABLE, pd.DataFrame(placeholder_data)) | ||
|
|
||
| # Parse the stats result | ||
| stats_data = stats_result.data_frame.iloc[0] | ||
| total_rows = stats_data.get('total_rows', 0) | ||
|
|
||
| # Build the final statistics DataFrame | ||
| all_stats = [] | ||
| for _, row in columns_result.data_frame.iterrows(): | ||
| col = row['name'] | ||
| nulls = stats_data.get(f'nulls_{col}', 0) | ||
| distincts = stats_data.get(f'distincts_{col}', None) | ||
| min_val = stats_data.get(f'min_{col}', None) | ||
| max_val = stats_data.get(f'max_{col}', None) | ||
|
|
||
| # Calculate null percentage | ||
| null_pct = None | ||
| if total_rows is not None and total_rows > 0: | ||
| null_pct = round((nulls / total_rows) * 100, 2) | ||
|
|
||
| all_stats.append({ | ||
| 'table_name': table_name, | ||
| 'column_name': col, | ||
| 'null_percentage': null_pct, | ||
| 'distinct_values_count': distincts, | ||
| 'most_common_values': None, | ||
| 'most_common_frequencies': None, | ||
| 'minimum_value': min_val, | ||
| 'maximum_value': max_val, | ||
| }) | ||
|
|
||
| return Response(RESPONSE_TYPE.TABLE, pd.DataFrame(all_stats)) | ||
|
|
||
| except Exception as e: | ||
| logger.error(f"Exception while fetching statistics for table {table_name}: {e}") | ||
| # Return empty stats on error | ||
| return Response( | ||
| RESPONSE_TYPE.ERROR, | ||
| error_message=f"Could not retrieve statistics for table {table_name}: {str(e)}" | ||
| ) |
There was a problem hiding this comment.
performance: The meta_get_column_statistics_for_table method (lines 253-366) issues a full table scan for every requested table, which can cause severe performance degradation on large ClickHouse tables due to lack of sampling or row limits.
🤖 AI Agent Prompt for Cursor/Windsurf
📋 Copy this prompt to your AI coding assistant (Cursor, Windsurf, etc.) to get help fixing this issue
Optimize the `meta_get_column_statistics_for_table` method in `mindsdb/integrations/handlers/clickhouse_handler/clickhouse_handler.py` (lines 253-366). The current implementation performs a full table scan for statistics, which is extremely slow on large ClickHouse tables. Refactor the code to use ClickHouse's `SAMPLE` clause (e.g., `SAMPLE 0.1`) to compute statistics on a sample of the data, significantly reducing query time and resource usage for large tables. Ensure the method still returns the same structure and handles errors gracefully.
📝 Committable Code Suggestion
‼️ Ensure you review the code suggestion before committing it to the branch. Make sure it replaces the highlighted code, contains no missing lines, and has no issues with indentation.
| def meta_get_column_statistics_for_table( | |
| self, table_name: str, column_names: Optional[List[str]] = None | |
| ) -> Response: | |
| """ | |
| Retrieves column statistics for a specific table. | |
| Args: | |
| table_name (str): The name of the table. | |
| column_names (Optional[List[str]]): List of column names to retrieve statistics for. | |
| If None, statistics for all columns will be returned. | |
| Returns: | |
| Response: A response object containing the column statistics for the table. | |
| """ | |
| database = self.connection_data['database'] | |
| # Get the list of columns for this table | |
| columns_query = f""" | |
| SELECT name, type | |
| FROM system.columns | |
| WHERE database = '{database}' AND table = '{table_name}' | |
| """ | |
| if column_names: | |
| quoted_names = [f"'{c}'" for c in column_names] | |
| columns_query += f" AND name IN ({','.join(quoted_names)})" | |
| try: | |
| columns_result = self.native_query(columns_query) | |
| if columns_result.resp_type == RESPONSE_TYPE.ERROR or columns_result.data_frame.empty: | |
| logger.warning(f"No columns found for table {table_name}") | |
| return Response(RESPONSE_TYPE.TABLE, pd.DataFrame()) | |
| # Build statistics query - collect all stats in one query | |
| select_parts = [] | |
| for _, row in columns_result.data_frame.iterrows(): | |
| col = row['name'] | |
| # Use backticks to handle special characters in column names | |
| select_parts.extend([ | |
| f"countIf(`{col}` IS NULL) AS nulls_{col}", | |
| f"uniq(`{col}`) AS distincts_{col}", | |
| f"toString(min(`{col}`)) AS min_{col}", | |
| f"toString(max(`{col}`)) AS max_{col}", | |
| ]) | |
| if not select_parts: | |
| return Response(RESPONSE_TYPE.TABLE, pd.DataFrame()) | |
| # Build the query to get stats for all columns at once | |
| stats_query = f""" | |
| SELECT | |
| count(*) AS total_rows, | |
| {', '.join(select_parts)} | |
| FROM `{database}`.`{table_name}` | |
| """ | |
| stats_result = self.native_query(stats_query) | |
| if stats_result.resp_type != RESPONSE_TYPE.TABLE or stats_result.data_frame.empty: | |
| logger.warning(f"Could not retrieve stats for table {table_name}") | |
| # Return placeholder stats | |
| placeholder_data = [] | |
| for _, row in columns_result.data_frame.iterrows(): | |
| placeholder_data.append({ | |
| 'table_name': table_name, | |
| 'column_name': row['name'], | |
| 'null_percentage': None, | |
| 'distinct_values_count': None, | |
| 'most_common_values': None, | |
| 'most_common_frequencies': None, | |
| 'minimum_value': None, | |
| 'maximum_value': None, | |
| }) | |
| return Response(RESPONSE_TYPE.TABLE, pd.DataFrame(placeholder_data)) | |
| # Parse the stats result | |
| stats_data = stats_result.data_frame.iloc[0] | |
| total_rows = stats_data.get('total_rows', 0) | |
| # Build the final statistics DataFrame | |
| all_stats = [] | |
| for _, row in columns_result.data_frame.iterrows(): | |
| col = row['name'] | |
| nulls = stats_data.get(f'nulls_{col}', 0) | |
| distincts = stats_data.get(f'distincts_{col}', None) | |
| min_val = stats_data.get(f'min_{col}', None) | |
| max_val = stats_data.get(f'max_{col}', None) | |
| # Calculate null percentage | |
| null_pct = None | |
| if total_rows is not None and total_rows > 0: | |
| null_pct = round((nulls / total_rows) * 100, 2) | |
| all_stats.append({ | |
| 'table_name': table_name, | |
| 'column_name': col, | |
| 'null_percentage': null_pct, | |
| 'distinct_values_count': distincts, | |
| 'most_common_values': None, | |
| 'most_common_frequencies': None, | |
| 'minimum_value': min_val, | |
| 'maximum_value': max_val, | |
| }) | |
| return Response(RESPONSE_TYPE.TABLE, pd.DataFrame(all_stats)) | |
| except Exception as e: | |
| logger.error(f"Exception while fetching statistics for table {table_name}: {e}") | |
| # Return empty stats on error | |
| return Response( | |
| RESPONSE_TYPE.ERROR, | |
| error_message=f"Could not retrieve statistics for table {table_name}: {str(e)}" | |
| ) | |
| def meta_get_column_statistics_for_table( | |
| self, table_name: str, column_names: Optional[List[str]] = None | |
| ) -> Response: | |
| """ | |
| Retrieves column statistics for a specific table, using sampling for large tables to avoid full scans. | |
| """ | |
| database = self.connection_data['database'] | |
| columns_query = f""" | |
| SELECT name, type | |
| FROM system.columns | |
| WHERE database = '{database}' AND table = '{table_name}' | |
| """ | |
| if column_names: | |
| quoted_names = [f"'{c}'" for c in column_names] | |
| columns_query += f" AND name IN ({','.join(quoted_names)})" | |
| try: | |
| columns_result = self.native_query(columns_query) | |
| if columns_result.resp_type == RESPONSE_TYPE.ERROR or columns_result.data_frame.empty: | |
| logger.warning(f"No columns found for table {table_name}") | |
| return Response(RESPONSE_TYPE.TABLE, pd.DataFrame()) | |
| select_parts = [] | |
| for _, row in columns_result.data_frame.iterrows(): | |
| col = row['name'] | |
| select_parts.extend([ | |
| f"countIf(`{col}` IS NULL) AS nulls_{col}", | |
| f"uniq(`{col}`) AS distincts_{col}", | |
| f"toString(min(`{col}`)) AS min_{col}", | |
| f"toString(max(`{col}`)) AS max_{col}", | |
| ]) | |
| if not select_parts: | |
| return Response(RESPONSE_TYPE.TABLE, pd.DataFrame()) | |
| # Use sampling for large tables | |
| sample_clause = "SAMPLE 0.1" # 10% sample, adjust as needed | |
| stats_query = f""" | |
| SELECT | |
| count(*) AS total_rows, | |
| {', '.join(select_parts)} | |
| FROM `{database}`.`{table_name}` {sample_clause} | |
| """ | |
| stats_result = self.native_query(stats_query) | |
| if stats_result.resp_type != RESPONSE_TYPE.TABLE or stats_result.data_frame.empty: | |
| logger.warning(f"Could not retrieve stats for table {table_name}") | |
| placeholder_data = [] | |
| for _, row in columns_result.data_frame.iterrows(): | |
| placeholder_data.append({ | |
| 'table_name': table_name, | |
| 'column_name': row['name'], | |
| 'null_percentage': None, | |
| 'distinct_values_count': None, | |
| 'most_common_values': None, | |
| 'most_common_frequencies': None, | |
| 'minimum_value': None, | |
| 'maximum_value': None, | |
| }) | |
| return Response(RESPONSE_TYPE.TABLE, pd.DataFrame(placeholder_data)) | |
| stats_data = stats_result.data_frame.iloc[0] | |
| total_rows = stats_data.get('total_rows', 0) | |
| all_stats = [] | |
| for _, row in columns_result.data_frame.iterrows(): | |
| col = row['name'] | |
| nulls = stats_data.get(f'nulls_{col}', 0) | |
| distincts = stats_data.get(f'distincts_{col}', None) | |
| min_val = stats_data.get(f'min_{col}', None) | |
| max_val = stats_data.get(f'max_{col}', None) | |
| null_pct = None | |
| if total_rows is not None and total_rows > 0: | |
| null_pct = round((nulls / total_rows) * 100, 2) | |
| all_stats.append({ | |
| 'table_name': table_name, | |
| 'column_name': col, | |
| 'null_percentage': null_pct, | |
| 'distinct_values_count': distincts, | |
| 'most_common_values': None, | |
| 'most_common_frequencies': None, | |
| 'minimum_value': min_val, | |
| 'maximum_value': max_val, | |
| }) | |
| return Response(RESPONSE_TYPE.TABLE, pd.DataFrame(all_stats)) | |
| except Exception as e: | |
| logger.error(f"Exception while fetching statistics for table {table_name}: {e}") | |
| return Response( | |
| RESPONSE_TYPE.ERROR, | |
| error_message=f"Could not retrieve statistics for table {table_name}: {str(e)}" | |
| ) | |
|
I have read the CLA Document and I hereby sign the CLA |
Co-authored-by: andrew <elkin.andr@gmail.com>
Co-authored-by: martyna-mindsdb <109554435+martyna-mindsdb@users.noreply.github.com>
…val in ClickHouse handler
f6f1f27 to
d4766c0
Compare
| SELECT | ||
| name as table_name, | ||
| database as table_schema, | ||
| engine as table_type, |
There was a problem hiding this comment.
better to use 'BASE TABLE' for table_type
Description
This PR adds Data Catalog support for the ClickHouse handler, enabling AI agents to automatically read and utilize table and column metadata (including column comments) when generating SQL queries.
Key Changes:
ClickHouseHandler to inherit from MetaDatabaseHandler
ALTER TABLE ... MODIFY COLUMN ... COMMENT
Benefits:
Type of change
Verification Process
To ensure the changes are working as expected:
Prerequisites:
config.json: { "data_catalog": { "enabled": true } }
Test Location:
http://localhost:47334
Verification Steps:
CREATE SKILL clickhouse_skill USING type = 'text2sql', database = 'clickhouse_conn', tables = ['test_table'], description = 'Test table with commented columns';
CREATE AGENT test_agent USING model = 'gpt-4', skills = ['clickhouse_skill'];
Expected: The COLUMN_DESCRIPTION field should contain the comments you set in ClickHouse.
Ask the agent a natural language question about your ClickHouse data and verify it generates accurate SQL using the column descriptions.
Additional Media:
Checklist:
Implementation Details
Methods Implemented:
meta_get_tables(): Reads table metadata from system.tables
meta_get_columns(): Reads column metadata from system.columns ✨
meta_get_column_statistics(): Computes statistics by querying tables
meta_get_primary_keys(): Reads primary key info from system.columns
meta_get_foreign_keys(): Returns empty DataFrame (ClickHouse doesn't support FK constraints)
Notes:
Documentation TODO:
/docs/data_catalog/integrations/overview.mdx