Skip to content

Commit 47fbdca

Browse files
committed
CKAN: 'X of Y' phrasing + truncation guards on every row-returning path
User flagged: limit-as-count confusion isn't unique to query_data. Same trap exists in execute_sql ("returned 100 because the SQL said LIMIT 100") and in search_datasets ("found 20 datasets" while CKAN knows there are actually 47 matches, since package_search returns `count`). Switched to the user-suggested 'X of Y' phrasing throughout. Changes: 1. Shared _format_count_header helper renders three cases: - "100 of 531 rows returned (limit=100; raise limit or use ckan__aggregate_data for full count)." - "85 rows returned (full result, limit=100)." - "100 rows returned (limit=100, true total unknown — see TRUNCATED warning above if any)." Used in query_data, search_and_query, and search_datasets (with unit="matching dataset(s) shown"). 2. search_datasets now reads CKAN's `count` from package_search and surfaces it: "5 of 21 matching dataset(s) shown" instead of "Found 5 dataset(s)". 3. execute_sql now extracts the effective LIMIT from the validated SQL via SQLValidator.extract_top_level_limit() and emits a "MAY BE TRUNCATED" block when len(records) >= effective_limit (datastore_search_sql doesn't return a `total`, so this heuristic is the best we can do). aggregate_data inherits the same guard since it formats results via _format_sql_results. 4. Existing query_data/search_and_query header lines switched to the X-of-Y phrasing for consistency. The TRUNCATED warning text now matches the rest of the corpus. Live verification (boston prod CKAN): - search_datasets("parks", limit=5) → "5 of 21 matching dataset(s) shown (limit=5; raise limit to see more)". - execute_sql with LIMIT 100 returning 100 rows → MAY BE TRUNCATED block at top, "100 rows returned (limit=100, true total unknown...)". Tests: +8 (204 -> 212). New: search_datasets count rendering, two execute_sql truncation cases, four extract_top_level_limit unit tests (simple, semicolon, missing, subquery-ignored), enforce-then-extract round trip.
1 parent d9b6fca commit 47fbdca

4 files changed

Lines changed: 338 additions & 55 deletions

File tree

plugins/ckan/plugin.py

Lines changed: 142 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -610,12 +610,16 @@ async def execute_tool(
610610
if tool_name == "search_datasets":
611611
query = arguments.get("query", "")
612612
limit = arguments.get("limit", 20)
613-
datasets = await self.search_datasets(query, limit)
613+
datasets, total = await self._search_datasets_with_count(
614+
query, limit
615+
)
614616
return ToolResult(
615617
content=[
616618
{
617619
"type": "text",
618-
"text": self._format_search_results(datasets),
620+
"text": self._format_search_results(
621+
datasets, total=total, limit=limit
622+
),
619623
}
620624
],
621625
success=True,
@@ -712,7 +716,10 @@ async def execute_tool(
712716
# Format SQL results
713717
records = result.get("records", [])
714718
fields = result.get("fields", [])
715-
formatted_text = self._format_sql_results(records, fields)
719+
effective_limit = result.get("effective_limit")
720+
formatted_text = self._format_sql_results(
721+
records, fields, effective_limit=effective_limit
722+
)
716723
return ToolResult(
717724
content=[{"type": "text", "text": formatted_text}],
718725
success=True,
@@ -790,7 +797,9 @@ async def execute_tool(
790797
error_message=result.get("message", "Aggregation failed"),
791798
)
792799
formatted = self._format_sql_results(
793-
result.get("records", []), result.get("fields", [])
800+
result.get("records", []),
801+
result.get("fields", []),
802+
effective_limit=result.get("effective_limit"),
794803
)
795804
return ToolResult(
796805
content=[{"type": "text", "text": formatted}], success=True
@@ -821,12 +830,29 @@ async def search_datasets(
821830
limit: Maximum number of results
822831
823832
Returns:
824-
List of dataset metadata dictionaries
833+
List of dataset metadata dictionaries (count-aware variant is
834+
``_search_datasets_with_count``).
825835
"""
836+
datasets, _count = await self._search_datasets_with_count(query, limit)
837+
return datasets
838+
839+
async def _search_datasets_with_count(
840+
self, query: str, limit: int = 20
841+
) -> Tuple[List[Dict[str, Any]], Optional[int]]:
842+
"""Same as search_datasets but also returns CKAN's `count` — the
843+
true number of datasets matching the query, regardless of the row
844+
cap. Lets the formatter say "20 of 47 matching datasets returned"
845+
instead of just "Found 20"."""
826846
response = await self._call_ckan_api(
827847
"package_search", {"q": query, "rows": limit}
828848
)
829-
return response.get("result", {}).get("results", [])
849+
result = response.get("result", {})
850+
count_val = result.get("count")
851+
try:
852+
count = int(count_val) if count_val is not None else None
853+
except (TypeError, ValueError):
854+
count = None
855+
return result.get("results", []), count
830856

831857
async def get_dataset(self, dataset_id: str) -> Dict[str, Any]:
832858
"""Get detailed metadata for a specific dataset.
@@ -1045,7 +1071,8 @@ async def execute_sql(self, sql: str) -> Dict[str, Any]:
10451071
sql: PostgreSQL SELECT statement
10461072
10471073
Returns:
1048-
Dictionary with success flag, records, fields, or error message
1074+
Dictionary with success flag, records, fields, effective_limit,
1075+
or error message
10491076
"""
10501077
# Validate SQL
10511078
is_valid, error = SQLValidator.validate_query(sql)
@@ -1054,6 +1081,7 @@ async def execute_sql(self, sql: str) -> Dict[str, Any]:
10541081

10551082
# Bound upstream scan cost: append LIMIT if the caller didn't set one.
10561083
sql = SQLValidator.enforce_row_limit(sql)
1084+
effective_limit = SQLValidator.extract_top_level_limit(sql)
10571085

10581086
# Log SQL execution (truncated for security)
10591087
logger.info("Executing SQL", extra={"sql": sql[:500]})
@@ -1070,6 +1098,7 @@ async def execute_sql(self, sql: str) -> Dict[str, Any]:
10701098
"success": True,
10711099
"records": result.get("result", {}).get("records", []),
10721100
"fields": result.get("result", {}).get("fields", []),
1101+
"effective_limit": effective_limit,
10731102
}
10741103
except Exception as e:
10751104
logger.error(f"SQL execution failed: {e}", exc_info=True)
@@ -1404,7 +1433,12 @@ async def health_check(self) -> bool:
14041433
logger.error(f"Health check failed: {e}")
14051434
return False
14061435

1407-
def _format_search_results(self, datasets: List[Dict[str, Any]]) -> str:
1436+
def _format_search_results(
1437+
self,
1438+
datasets: List[Dict[str, Any]],
1439+
total: Optional[int] = None,
1440+
limit: int = 20,
1441+
) -> str:
14081442
"""Format search results for user display."""
14091443
if not datasets:
14101444
return f"No datasets found in {self.plugin_config.city_name}'s open data portal."
@@ -1448,9 +1482,25 @@ def _format_search_results(self, datasets: List[Dict[str, Any]]) -> str:
14481482
]
14491483
)
14501484

1451-
lines.append(
1452-
f"Found {len(datasets)} dataset(s) in {self.plugin_config.city_name}'s open data portal:\n"
1453-
)
1485+
# Lead with the X-of-Y framing so the model can't mistake the
1486+
# results-shown count for the count of matching datasets.
1487+
n_returned = len(datasets)
1488+
if total is not None and total > n_returned:
1489+
lines.append(
1490+
f"{n_returned} of {total} matching dataset(s) shown "
1491+
f"(limit={limit}; raise limit to see more) in "
1492+
f"{self.plugin_config.city_name}'s open data portal:\n"
1493+
)
1494+
elif total is not None:
1495+
lines.append(
1496+
f"{total} matching dataset(s) (full result, limit={limit}) "
1497+
f"in {self.plugin_config.city_name}'s open data portal:\n"
1498+
)
1499+
else:
1500+
lines.append(
1501+
f"{n_returned} dataset(s) in "
1502+
f"{self.plugin_config.city_name}'s open data portal:\n"
1503+
)
14541504

14551505
for i, dataset in enumerate(datasets, 1):
14561506
title = dataset.get("title", "Untitled")
@@ -1604,22 +1654,11 @@ def _format_query_results(
16041654
lines.append(truncated_warning)
16051655
lines.append("")
16061656

1607-
# Header line: prefer "true total" wording when known, since the
1608-
# model has consistently been mistaking len(records) for the total.
1609-
if total is not None and total != n_returned:
1610-
lines.append(
1611-
f"total_matching_rows: {total} (returned {n_returned}, "
1612-
f"limit={limit})\n"
1613-
)
1614-
elif total is not None:
1615-
lines.append(
1616-
f"total_matching_rows: {total} (limit={limit})\n"
1617-
)
1618-
else:
1619-
lines.append(
1620-
f"returned_rows: {n_returned} (limit={limit}, "
1621-
"total unknown — see warning above if any)\n"
1622-
)
1657+
# Header line: lead with the X-of-Y framing so the model can't
1658+
# mistake the rows-returned count for the answer to a counting
1659+
# question.
1660+
lines.append(self._format_count_header(n_returned, limit, total))
1661+
lines.append("")
16231662

16241663
# Show first few records as examples
16251664
for i, record in enumerate(records[:5], 1):
@@ -1639,6 +1678,31 @@ def _format_query_results(
16391678

16401679
return "\n".join(lines)
16411680

1681+
@staticmethod
1682+
def _format_count_header(
1683+
n_returned: int,
1684+
limit: int,
1685+
total: Optional[int],
1686+
unit: str = "rows",
1687+
) -> str:
1688+
"""One-line "X of Y" summary used at the top of every row-returning
1689+
response. The model has been mistaking returned-rows for true count;
1690+
this phrasing makes the partial/total distinction unambiguous."""
1691+
if total is not None and total > n_returned:
1692+
return (
1693+
f"{n_returned} of {total} {unit} returned "
1694+
f"(limit={limit}; raise limit or use ckan__aggregate_data "
1695+
"for full count)."
1696+
)
1697+
if total is not None:
1698+
# total == n_returned, all rows returned
1699+
return f"{total} {unit} returned (full result, limit={limit})."
1700+
# total unknown
1701+
return (
1702+
f"{n_returned} {unit} returned (limit={limit}, "
1703+
"true total unknown — see TRUNCATED warning above if any)."
1704+
)
1705+
16421706
def _format_truncation_block(
16431707
self,
16441708
n_returned: int,
@@ -1747,20 +1811,7 @@ def _format_search_and_query(
17471811
resource_name = resource.get("name", "Unnamed")
17481812
n_returned = len(records)
17491813

1750-
# Total line: prefer "true total" so the model can't read the
1751-
# returned-rows count as the answer to a counting question.
1752-
if total is not None and total != n_returned:
1753-
count_line = (
1754-
f"total_matching_rows: {total} "
1755-
f"(returned {n_returned}, limit={limit})"
1756-
)
1757-
elif total is not None:
1758-
count_line = f"total_matching_rows: {total} (limit={limit})"
1759-
else:
1760-
count_line = (
1761-
f"returned_rows: {n_returned} "
1762-
f"(limit={limit}, total unknown)"
1763-
)
1814+
count_line = self._format_count_header(n_returned, limit, total)
17641815

17651816
lines: List[str] = []
17661817
truncated_warning = self._format_truncation_block(
@@ -1789,11 +1840,13 @@ def _format_search_and_query(
17891840
"dataset/resource (see alternates below)."
17901841
)
17911842
else:
1792-
preview_caption = (
1793-
f"Showing first 5 of {n_returned} returned"
1794-
+ (f" (true total: {total})" if total is not None else "")
1795-
+ ":"
1796-
)
1843+
if total is not None and total > n_returned:
1844+
preview_caption = (
1845+
f"Showing first 5 of {n_returned} returned "
1846+
f"(true total: {total}):"
1847+
)
1848+
else:
1849+
preview_caption = f"Showing first 5 of {n_returned} returned:"
17971850
lines.append(preview_caption)
17981851
for i, record in enumerate(records[:5], 1):
17991852
lines.append(f"Record {i}:")
@@ -1858,21 +1911,59 @@ def _format_search_and_query(
18581911
return "\n".join(lines)
18591912

18601913
def _format_sql_results(
1861-
self, records: List[Dict[str, Any]], fields: List[Dict[str, Any]]
1914+
self,
1915+
records: List[Dict[str, Any]],
1916+
fields: List[Dict[str, Any]],
1917+
effective_limit: Optional[int] = None,
18621918
) -> str:
18631919
"""Format SQL query results for user display.
18641920
18651921
Args:
18661922
records: List of record dictionaries
18671923
fields: List of field metadata dictionaries
1924+
effective_limit: The LIMIT clause that was actually executed —
1925+
either user-supplied or the enforced default. Used to
1926+
detect truncation: if len(records) >= effective_limit, the
1927+
result was almost certainly capped.
18681928
18691929
Returns:
18701930
Formatted string representation of results
18711931
"""
1932+
n_returned = len(records)
1933+
1934+
# Heuristic truncation detection — datastore_search_sql doesn't
1935+
# return a "total"; the only signal is "did we hit our LIMIT?"
1936+
truncation_block = ""
1937+
if effective_limit is not None and n_returned >= effective_limit:
1938+
truncation_block = (
1939+
"=== MAY BE TRUNCATED ===\n"
1940+
f"This SQL returned exactly the LIMIT ({effective_limit}) "
1941+
"rows. The true total could not be determined from "
1942+
"datastore_search_sql alone. For counting questions, do "
1943+
f"NOT report {n_returned} as the answer — instead run a "
1944+
"separate SELECT COUNT(*) with the same WHERE clause, or "
1945+
"use ckan__aggregate_data with metrics="
1946+
'{"count": "count(*)"}.\n'
1947+
"========================"
1948+
)
1949+
18721950
if not records:
1873-
return "No records found matching the SQL query."
1951+
text = "No records found matching the SQL query."
1952+
return f"{truncation_block}\n\n{text}" if truncation_block else text
1953+
1954+
lines: List[str] = []
1955+
if truncation_block:
1956+
lines.append(truncation_block)
1957+
lines.append("")
18741958

1875-
lines = [f"SQL Query Results: {len(records)} record(s)\n"]
1959+
# Header — total is unknown for raw SQL, so show "X rows returned".
1960+
if effective_limit is not None:
1961+
lines.append(
1962+
f"{n_returned} rows returned (limit={effective_limit}, "
1963+
"true total unknown — see warning above if any).\n"
1964+
)
1965+
else:
1966+
lines.append(f"{n_returned} rows returned.\n")
18761967

18771968
# Show field names if available
18781969
if fields:
@@ -1887,7 +1978,7 @@ def _format_sql_results(
18871978
lines.append(f" {key}: {value}")
18881979
lines.append("")
18891980

1890-
if len(records) > 10:
1891-
lines.append(f"... and {len(records) - 10} more record(s)")
1981+
if n_returned > 10:
1982+
lines.append(f"... and {n_returned - 10} more record(s) returned")
18921983

18931984
return "\n".join(lines)

plugins/ckan/sql_validator.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,38 @@ def validate_query(sql: Any) -> Tuple[bool, Optional[str]]:
159159

160160
return True, None
161161

162+
@classmethod
163+
def extract_top_level_limit(cls, sql: str) -> Optional[int]:
164+
"""Return the integer LIMIT of the outermost statement, or None
165+
if there isn't one (or it can't be parsed). Subquery / CTE limits
166+
are ignored — same scoping rules as ``enforce_row_limit``.
167+
168+
Used by callers to detect truncation: if a SELECT returned the
169+
same number of rows as the effective top-level LIMIT, the result
170+
is almost certainly capped."""
171+
try:
172+
parsed = sqlparse.parse(sql)
173+
except Exception:
174+
return None
175+
if not parsed:
176+
return None
177+
statement = parsed[0]
178+
# Walk top-level tokens for a LIMIT keyword followed by an integer.
179+
toks = [t for t in statement.tokens if not isinstance(t, Parenthesis)]
180+
for i, tok in enumerate(toks):
181+
if tok.ttype in Keyword and tok.normalized.upper() == "LIMIT":
182+
# next non-whitespace token should be the integer literal
183+
for nxt in toks[i + 1 :]:
184+
if nxt.is_whitespace:
185+
continue
186+
text = nxt.value.strip().rstrip(";")
187+
try:
188+
return int(text)
189+
except (TypeError, ValueError):
190+
return None
191+
return None
192+
return None
193+
162194
@classmethod
163195
def enforce_row_limit(cls, sql: str) -> str:
164196
"""Append ``LIMIT`` to an already-validated query if it lacks one.

0 commit comments

Comments
 (0)