Skip to content

Commit 38c708f

Browse files
authored
feat(DATAGO-131057): Add table filtering to sam-sql-database-tool (#153)
1 parent 4c851a5 commit 38c708f

6 files changed

Lines changed: 223 additions & 3 deletions

File tree

sam-sql-database-tool/README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,14 @@ tools:
6464
# schema_sample_size: 100
6565
# cache_ttl_seconds: 3600
6666

67+
# --- Table Filtering (glob patterns supported: *, ?, [seq]) ---
68+
# include_tables: # Only include matching tables in schema detection
69+
# - "customers*"
70+
# - "orders*"
71+
# exclude_tables: # Exclude matching tables from schema detection
72+
# - "bkp_*"
73+
# - "*_temp"
74+
6775
# --- Connection Pool (optional tuning) ---
6876
# pool_size: 10
6977
# max_overflow: 10
@@ -105,6 +113,18 @@ tools:
105113
- `max_enum_cardinality`: (Optional, default: `100`) Maximum number of distinct values to consider a column as an enum. Increase for columns like countries (190+), decrease for faster init times.
106114
- `schema_sample_size`: (Optional, default: `100`) Number of rows to sample per table for schema detection. Increase for better accuracy on sparse data, decrease for faster init times.
107115
- `cache_ttl_seconds`: (Optional, default: `3600`) Time-to-live for schema cache in seconds. After this duration, the schema will be re-detected on the next query. Set to `0` to disable caching.
116+
- `include_tables`: (Optional) A list of glob patterns for tables to include in schema detection. If set, only tables matching at least one pattern are included. An empty list is treated the same as omitting the option (all tables are included). Supports wildcards: `*`, `?`, `[seq]`. Example: `["tms_trx*", "tms_alert*"]`.
117+
- `exclude_tables`: (Optional) A list of glob patterns for tables to exclude from schema detection. Applied after `include_tables`. Supports the same wildcard syntax. Example: `["bkp_*", "*_temp", "*_dev"]`. The two options can be used together: `include_tables` selects the candidate tables, then `exclude_tables` removes any matches from that set. Matching is case-sensitive.
118+
119+
**Important: Table filtering is not access control.** These options only control which tables appear in the schema provided to the LLM. They do not prevent the LLM from executing queries against other tables in the database — for example, by querying database metadata or being prompted to access tables outside the filter. The underlying database connection still has full access to all tables the database user can see.
120+
121+
To reduce the likelihood of the LLM querying unfiltered tables, add an instruction to your agent such as:
122+
```
123+
Only query tables that appear in your tool's schema description.
124+
Do not query database metadata tables or any tables not listed in your schema.
125+
```
126+
127+
**For actual access control, configure the database user in the connection string with `SELECT` permissions restricted to only the allowed tables.** This is the only way to guarantee that the LLM cannot access tables outside the intended scope.
108128
109129
#### Connection Pool Settings
110130

sam-sql-database-tool/config.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,15 @@ apps:
106106
schema_sample_size: 50
107107
cache_ttl_seconds: 1800 # 30 minutes
108108

109+
# Optional table filtering (glob patterns supported: *, ?, [seq]):
110+
# include_tables: # Only include matching tables in schema detection
111+
# - "tms_trx*"
112+
# - "tms_alert*"
113+
# exclude_tables: # Exclude matching tables from schema detection
114+
# - "bkp_*"
115+
# - "*_temp"
116+
# - "*_dev"
117+
109118
# Optional connection pool tuning:
110119
# pool_size: 10 # Persistent connections kept in pool (default: 10)
111120
# max_overflow: 10 # Additional connections allowed during traffic bursts (default: 10)

sam-sql-database-tool/src/sam_sql_database_tool/services/database_service.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Service for handling SQL database operations with parallel processing."""
22

33
from contextlib import contextmanager
4+
from fnmatch import fnmatchcase
45
from typing import List, Dict, Any, Generator, Optional, Tuple
56
from concurrent.futures import ThreadPoolExecutor, as_completed
67
from datetime import datetime, timedelta, timezone
@@ -31,6 +32,8 @@ def __init__(
3132
connect_args: Optional[dict] = None,
3233
isolation_level: Optional[str] = None,
3334
echo: bool = False,
35+
include_tables: Optional[List[str]] = None,
36+
exclude_tables: Optional[List[str]] = None,
3437
):
3538
"""Initialize the database service.
3639
@@ -45,8 +48,12 @@ def __init__(
4548
connect_args: Extra kwargs passed directly to the DBAPI connect() call (default: {}).
4649
isolation_level: Transaction isolation level, e.g. 'READ_COMMITTED' (default: dialect default).
4750
echo: Log all SQL statements to the Python logger (default: False).
51+
include_tables: List of glob patterns for tables to include in schema detection (default: None = all).
52+
exclude_tables: List of glob patterns for tables to exclude from schema detection (default: None = none).
4853
"""
4954
self.connection_string = connection_string
55+
self.include_tables = include_tables
56+
self.exclude_tables = exclude_tables
5057
self.pool_size = pool_size
5158
self.max_overflow = max_overflow
5259
self.pool_timeout = pool_timeout
@@ -311,6 +318,14 @@ def _get_approximate_row_count(self, table_name: str) -> Optional[int]:
311318
log.debug("Could not get row count for %s: %s", table_name, e)
312319
return None
313320

321+
def _filter_tables(self, tables: List[str]) -> List[str]:
    """Apply the configured include/exclude glob patterns to a table list.

    Include patterns, when set and non-empty, are applied first and keep
    only names matching at least one pattern. Exclude patterns, when set
    and non-empty, then drop every remaining name matching any pattern.
    Matching is case-sensitive (``fnmatchcase``) and input order is kept.

    Args:
        tables: Table names to filter.

    Returns:
        The filtered names; the input list itself when no filter is active.
    """
    include_patterns = self.include_tables
    exclude_patterns = self.exclude_tables

    selected = tables
    if include_patterns:
        selected = [
            name
            for name in selected
            if any(fnmatchcase(name, pattern) for pattern in include_patterns)
        ]
    if exclude_patterns:
        selected = [
            name
            for name in selected
            if all(not fnmatchcase(name, pattern) for pattern in exclude_patterns)
        ]
    return selected
328+
314329
def _looks_like_enum_column(self, column_name: str) -> bool:
315330
"""Check if a column name suggests it might be an enum."""
316331
enum_patterns = [
@@ -430,7 +445,10 @@ def _process_table(
430445
def _compute_schema(self, max_enum_cardinality: int, sample_size: int) -> str:
431446
"""Compute schema without caching logic."""
432447
log.info("Starting schema detection...")
433-
tables = self.get_tables()
448+
all_tables = self.get_tables()
449+
tables = self._filter_tables(all_tables)
450+
if len(tables) != len(all_tables):
451+
log.info("Table filtering: %d of %d tables selected", len(tables), len(all_tables))
434452
log.debug("Processing %d tables in parallel with 5 workers...", len(tables))
435453
schema = {}
436454

sam-sql-database-tool/src/sam_sql_database_tool/tools.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Dict, Any, Optional
1+
from typing import Dict, Any, Optional, List
22
from pydantic import BaseModel, Field, model_validator, field_validator, SecretStr
33
from google.genai import types as adk_types
44
from solace_agent_mesh.agent.tools.dynamic_tool import DynamicTool
@@ -39,6 +39,20 @@ class DatabaseConfig(BaseModel):
3939
default=3600,
4040
description="Time-to-live for schema cache in seconds (default: 3600 = 1 hour).",
4141
)
42+
include_tables: Optional[List[str]] = Field(
43+
default=None,
44+
description=(
45+
"List of glob patterns for tables to include in schema detection. "
46+
"If set, only matching tables are included. Supports wildcards: *, ?, [seq]."
47+
),
48+
)
49+
exclude_tables: Optional[List[str]] = Field(
50+
default=None,
51+
description=(
52+
"List of glob patterns for tables to exclude from schema detection. "
53+
"Applied after include_tables. Supports wildcards: *, ?, [seq]."
54+
),
55+
)
4256

4357
# Connection pool settings
4458
pool_size: int = Field(
@@ -99,8 +113,13 @@ def check_required_fields(self) -> 'DatabaseConfig':
99113
raise ValueError(
100114
"'schema_summary_override' is required when 'auto_detect_schema' is false"
101115
)
116+
if self.include_tables or self.exclude_tables:
117+
log.warning(
118+
"Tool '%s': include_tables/exclude_tables have no effect when auto_detect_schema is false",
119+
self.tool_name,
120+
)
102121
return self
103-
122+
104123
def get(self, key: str, default: Any = None) -> Any:
105124
"""Allows dictionary-like access to the model's attributes."""
106125
return getattr(self, key, default)
@@ -170,6 +189,8 @@ async def init(self, component: SamAgentComponent, tool_config: Dict):
170189
connect_args=self.tool_config.connect_args,
171190
isolation_level=self.tool_config.isolation_level,
172191
echo=self.tool_config.echo,
192+
include_tables=self.tool_config.include_tables,
193+
exclude_tables=self.tool_config.exclude_tables,
173194
)
174195
except Exception as e:
175196
self._connection_healthy = False

sam-sql-database-tool/tests/unit/test_config.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,57 @@ def test_manual_schema_with_override(self):
4747
)
4848
assert config.auto_detect_schema is False
4949
assert config.schema_summary_override is not None
50+
51+
def test_include_tables(self):
    """DatabaseConfig keeps the include_tables patterns it was given."""
    expected_patterns = ["tms_trx*", "tms_alert*"]
    settings = {
        "tool_name": "test",
        "connection_string": "postgresql+psycopg2://user:password@localhost:5432/testdb",
        "include_tables": ["tms_trx*", "tms_alert*"],
    }

    config = DatabaseConfig(**settings)

    assert config.include_tables == expected_patterns
59+
60+
def test_exclude_tables(self):
    """DatabaseConfig keeps the exclude_tables patterns it was given."""
    expected_patterns = ["bkp_*", "*_temp"]
    settings = {
        "tool_name": "test",
        "connection_string": "postgresql+psycopg2://user:password@localhost:5432/testdb",
        "exclude_tables": ["bkp_*", "*_temp"],
    }

    config = DatabaseConfig(**settings)

    assert config.exclude_tables == expected_patterns
68+
69+
def test_include_and_exclude_tables_together(self):
    """DatabaseConfig accepts include_tables and exclude_tables in the same config."""
    settings = {
        "tool_name": "test",
        "connection_string": "postgresql+psycopg2://user:password@localhost:5432/testdb",
        "include_tables": ["tms_trx*"],
        "exclude_tables": ["*_temp"],
    }

    config = DatabaseConfig(**settings)

    # Each filter is stored independently of the other.
    assert config.include_tables == ["tms_trx*"]
    assert config.exclude_tables == ["*_temp"]
79+
80+
def test_table_filters_ignored_when_auto_detect_false(self):
    """A warning is logged when table filters are combined with auto_detect_schema=False."""
    import logging
    from unittest.mock import patch

    tools_logger = logging.getLogger("sam_sql_database_tool.tools")
    settings = {
        "tool_name": "test",
        "connection_string": "postgresql+psycopg2://user:password@localhost:5432/testdb",
        "auto_detect_schema": False,
        "schema_summary_override": "manual schema",
        "include_tables": ["tms_trx*"],
    }

    # Spy on the logger's warning method while the config is validated.
    with patch.object(tools_logger, "warning") as warning_spy:
        DatabaseConfig(**settings)
        warning_spy.assert_called_once()
        # The first positional argument of the call is the message template.
        message_template = warning_spy.call_args[0][0]
        assert "include_tables/exclude_tables have no effect" in message_template
95+
96+
def test_table_filters_default_to_none(self):
    """Both table filters are None when the config does not set them."""
    settings = {
        "tool_name": "test",
        "connection_string": "postgresql+psycopg2://user:password@localhost:5432/testdb",
    }

    config = DatabaseConfig(**settings)

    for filter_name in ("include_tables", "exclude_tables"):
        assert getattr(config, filter_name) is None

sam-sql-database-tool/tests/unit/test_database_service.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,101 @@ def test_mysql_dialect_detection(self):
124124

125125
service = DatabaseService("mysql+pymysql://test")
126126
assert service.engine.dialect.name == "mysql"
127+
128+
129+
class TestFilterTables:
    """Unit tests for the include/exclude glob filtering in DatabaseService._filter_tables."""

    @pytest.fixture
    def service_with_filters(self):
        """Provide a factory that builds a DatabaseService with the given filters.

        ``sqlalchemy.create_engine`` is patched for the duration of the
        constructor call and returns a MagicMock engine whose dialect name
        is "postgresql".
        """
        def _create(include_tables=None, exclude_tables=None):
            with patch('sqlalchemy.create_engine') as mock_create_engine:
                mock_engine = MagicMock()
                mock_engine.dialect.name = "postgresql"
                mock_create_engine.return_value = mock_engine
                return DatabaseService(
                    "postgresql://test",
                    include_tables=include_tables,
                    exclude_tables=exclude_tables,
                )
        return _create

    def test_no_filters_returns_all(self, service_with_filters):
        """No filters should return all tables unchanged."""
        service = service_with_filters()
        tables = ["users", "orders", "products"]
        assert service._filter_tables(tables) == ["users", "orders", "products"]

    def test_include_exact_match(self, service_with_filters):
        """Include with exact names should only return matching tables."""
        service = service_with_filters(include_tables=["users", "orders"])
        tables = ["users", "orders", "products", "logs"]
        assert service._filter_tables(tables) == ["users", "orders"]

    def test_include_wildcard(self, service_with_filters):
        """Include with wildcard patterns should match correctly."""
        service = service_with_filters(include_tables=["tms_trx*", "tms_alert*"])
        tables = ["tms_trx5min", "tms_trx60min", "tms_alert_anomaly", "tms_bank", "bkp_data"]
        assert service._filter_tables(tables) == ["tms_trx5min", "tms_trx60min", "tms_alert_anomaly"]

    def test_exclude_wildcard(self, service_with_filters):
        """Exclude with wildcard patterns should remove matching tables."""
        service = service_with_filters(exclude_tables=["bkp_*", "*_temp", "*_dev"])
        tables = ["tms_trx5min", "bkp_old_data", "tms_data_temp", "tms_aggr_dev", "tms_alert"]
        assert service._filter_tables(tables) == ["tms_trx5min", "tms_alert"]

    def test_include_and_exclude_together(self, service_with_filters):
        """Include and exclude together: include first, then exclude from result."""
        service = service_with_filters(
            include_tables=["tms_trx*"],
            exclude_tables=["*_temp", "*_dev"],
        )
        tables = ["tms_trx5min", "tms_trx_temp", "tms_trx_dev", "tms_trx60min", "tms_bank"]
        assert service._filter_tables(tables) == ["tms_trx5min", "tms_trx60min"]

    def test_question_mark_wildcard(self, service_with_filters):
        """The ? wildcard should match a single character."""
        service = service_with_filters(include_tables=["table_?"])
        tables = ["table_a", "table_b", "table_ab", "other"]
        assert service._filter_tables(tables) == ["table_a", "table_b"]

    def test_seq_wildcard(self, service_with_filters):
        """The [seq] wildcard should match exactly one character from the set."""
        service = service_with_filters(include_tables=["table_[0-9]"])
        # "table_10" has two trailing characters and "table_a" is outside the set.
        tables = ["table_1", "table_9", "table_a", "table_10"]
        assert service._filter_tables(tables) == ["table_1", "table_9"]

    def test_case_sensitive(self, service_with_filters):
        """Matching should be case-sensitive."""
        service = service_with_filters(include_tables=["Users"])
        tables = ["Users", "users", "USERS"]
        assert service._filter_tables(tables) == ["Users"]

    def test_empty_include_list_returns_all(self, service_with_filters):
        """An empty include list should be falsy and return all tables."""
        service = service_with_filters(include_tables=[])
        tables = ["users", "orders", "products"]
        assert service._filter_tables(tables) == ["users", "orders", "products"]

    def test_all_tables_filtered_out(self, service_with_filters):
        """When no table matches any include pattern, result should be empty."""
        service = service_with_filters(include_tables=["nonexistent_*"])
        tables = ["users", "orders", "products"]
        assert service._filter_tables(tables) == []

    def test_compute_schema_uses_filter(self, service_with_filters):
        """_compute_schema should only process tables that pass the filter."""
        service = service_with_filters(include_tables=["users"])

        # The inspector reports three tables; only "users" passes the filter.
        mock_inspector = MagicMock()
        mock_inspector.get_table_names.return_value = ["users", "orders", "products"]
        mock_inspector.get_columns.return_value = [
            {"name": "id", "type": "INTEGER", "nullable": False},
        ]
        mock_inspector.get_pk_constraint.return_value = {"constrained_columns": ["id"]}
        mock_inspector.get_foreign_keys.return_value = []
        mock_inspector.get_indexes.return_value = []

        # Stub row sampling so no query is issued against the mocked engine.
        service.get_table_sample = MagicMock(return_value=[{"id": 1}])

        with patch('sam_sql_database_tool.services.database_service.inspect', return_value=mock_inspector):
            schema = service._compute_schema(max_enum_cardinality=100, sample_size=100)

        assert "users" in schema
        assert "orders" not in schema
        assert "products" not in schema

0 commit comments

Comments
 (0)