Skip to content

Commit d9b6fca

Browse files
committed
CKAN: surface true total + warn on limit-truncation
Real-world failure: bot answered "How many 311 requests closed on 4/29/2016?" with 100 — but 100 was the LIMIT; the true count is 531. The model was reading "Found 100 record(s) (showing up to 100)" as the answer to a counting question. Fixes: 1. Capture CKAN's `total` from datastore_search responses (it's already returned by CKAN; we were discarding it). Plumbed through _query_with_schema → ToolResult → formatter. 2. SQL (`where`) path: when SELECT * hits the limit exactly we don't know the true total, so issue a cheap SELECT COUNT(*) follow-up with the same WHERE. This means we always tell the model a real number when it asks a counting question via search_and_query or query_data, regardless of which path was taken. 3. Formatter rewrite: - Header line is now `total_matching_rows: N (returned K, limit=L)` when total is known and exceeds returned. Removed the misleading "Found 100 record(s)" wording that conflated rows-returned with true count. - When truncation is detected, prepend a `=== TRUNCATED ===` block stating "the answer is N, NOT K" and pointing at ckan__aggregate_data for cheap counts and ckan__execute_sql for custom LIMIT/ORDER BY. - When total can't be determined (count follow-up failed) and rows == limit, prepend `=== MAY BE TRUNCATED ===`. Live verification (boston prod CKAN): - search_and_query "311 closed on 2016-04-29" with default limit=100 → returns the 100 sample rows AND header "total_matching_rows: 531" AND the TRUNCATED warning telling the model the answer is 531. - search_and_query "311 closed on 2026-04-29" with limit=100 → returns 85 rows, no TRUNCATED warning (85 < 100), header reads "total_matching_rows: 85". Tests: +5 (41 -> 46) covering total surfaced from datastore_search, no-warning path when under limit, COUNT(*) follow-up fires only when truncated on the SQL path, and graceful fallback when count fails.
1 parent c1608e2 commit d9b6fca

2 files changed

Lines changed: 463 additions & 32 deletions

File tree

plugins/ckan/plugin.py

Lines changed: 215 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -651,7 +651,7 @@ async def execute_tool(
651651
filters = arguments.get("filters") or {}
652652
where = arguments.get("where") or None
653653
limit = arguments.get("limit", 100)
654-
records, fields, error = await self._query_with_schema(
654+
records, fields, total, error = await self._query_with_schema(
655655
resource_id=resource_id,
656656
filters=filters,
657657
limit=limit,
@@ -668,7 +668,7 @@ async def execute_tool(
668668
{
669669
"type": "text",
670670
"text": self._format_query_results(
671-
records, fields, limit
671+
records, fields, limit, total
672672
),
673673
}
674674
],
@@ -862,7 +862,7 @@ async def query_data(
862862
List of data records (the schema-aware variant is
863863
``_query_with_schema``).
864864
"""
865-
records, _fields, error = await self._query_with_schema(
865+
records, _fields, _total, error = await self._query_with_schema(
866866
resource_id=resource_id,
867867
filters=filters,
868868
limit=limit,
@@ -872,14 +872,66 @@ async def query_data(
872872
raise RuntimeError(error)
873873
return records
874874

875+
async def _count_via_sql(
876+
self,
877+
resource_id: str,
878+
where_sql: str,
879+
filters: Optional[Dict[str, Any]] = None,
880+
) -> Optional[int]:
881+
"""Run a SELECT COUNT(*) with the same filters to discover the true
882+
row total when a SELECT * hit the limit.
883+
884+
Returns ``None`` if the count call itself fails (we'd rather show
885+
a 'TRUNCATED' warning than block the data response on a failed
886+
count). Returns the integer total on success.
887+
"""
888+
try:
889+
sql_parts = [f'SELECT COUNT(*) AS n FROM "{resource_id}"']
890+
if where_sql:
891+
sql_parts.append(f" WHERE {where_sql}")
892+
if filters:
893+
eq_conds = [
894+
SafeSQLBuilder.build_filter_condition(f, v)
895+
for f, v in filters.items()
896+
]
897+
joiner = " AND " if where_sql else " WHERE "
898+
sql_parts.append(joiner + " AND ".join(eq_conds))
899+
sql = "".join(sql_parts)
900+
result = await self.execute_sql(sql)
901+
if result.get("error"):
902+
return None
903+
recs = result.get("records") or []
904+
if not recs:
905+
return None
906+
n = recs[0].get("n") or recs[0].get("count")
907+
if n is None:
908+
return None
909+
try:
910+
return int(n)
911+
except (TypeError, ValueError):
912+
return None
913+
except Exception:
914+
return None
915+
875916
async def _query_with_schema(
876917
self,
877918
resource_id: str,
878919
filters: Optional[Dict[str, Any]] = None,
879920
limit: int = 100,
880921
where: Optional[Dict[str, Any]] = None,
881-
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
882-
"""Query datastore and return (records, fields, error_message).
922+
) -> Tuple[
923+
List[Dict[str, Any]],
924+
List[Dict[str, Any]],
925+
Optional[int],
926+
Optional[str],
927+
]:
928+
"""Query datastore and return (records, fields, total, error).
929+
930+
``total`` is the true number of rows matching the filter, regardless
931+
of LIMIT. CKAN's datastore_search returns this for free; for the
932+
SQL (``where``) path we issue a follow-up COUNT(*) only when the
933+
result hit the limit (otherwise len(records) IS the total).
934+
Returns ``None`` when we couldn't determine a total.
883935
884936
Routes through datastore_search_sql when ``where`` is set so the
885937
caller can express ranges/IN/LIKE; otherwise falls back to the
@@ -891,7 +943,7 @@ async def _query_with_schema(
891943
where_sql = SafeSQLBuilder.build_where_clause(where)
892944
limit_int = SafeSQLBuilder.clamp_limit(limit)
893945
except ValueError as e:
894-
return [], [], str(e)
946+
return [], [], None, str(e)
895947

896948
sql_parts = [f'SELECT * FROM "{validated_id}"']
897949
if where_sql:
@@ -904,20 +956,32 @@ async def _query_with_schema(
904956
for f, v in filters.items()
905957
]
906958
except ValueError as e:
907-
return [], [], str(e)
959+
return [], [], None, str(e)
908960
joiner = " AND " if where_sql else " WHERE "
909961
sql_parts.append(joiner + " AND ".join(eq_conds))
910962
sql_parts.append(f" LIMIT {limit_int}")
911963
sql = "".join(sql_parts)
912964

913965
result = await self.execute_sql(sql)
914966
if result.get("error"):
915-
return [], [], result.get("message", "SQL execution failed")
916-
return (
917-
result.get("records", []),
918-
result.get("fields", []),
919-
None,
920-
)
967+
return [], [], None, result.get(
968+
"message", "SQL execution failed"
969+
)
970+
records = result.get("records", [])
971+
fields = result.get("fields", [])
972+
973+
# If we hit the LIMIT exactly, we don't actually know the total —
974+
# do a cheap COUNT(*) follow-up so the model gets a real number
975+
# instead of mistaking the limit for the count.
976+
total: Optional[int]
977+
if len(records) >= limit_int:
978+
total = await self._count_via_sql(
979+
validated_id, where_sql, filters
980+
)
981+
else:
982+
total = len(records)
983+
984+
return records, fields, total, None
921985

922986
# No `where` → cheap datastore_search path.
923987
params: Dict[str, Any] = {"resource_id": resource_id, "limit": limit}
@@ -932,6 +996,7 @@ async def _query_with_schema(
932996
return (
933997
[],
934998
[],
999+
None,
9351000
f"{msg}\n"
9361001
"Hint: this resource may exist as a file download "
9371002
"(GeoJSON/KML/SHP/PDF) but not be loaded into the "
@@ -941,12 +1006,20 @@ async def _query_with_schema(
9411006
"ckan__search_and_query, which auto-picks the "
9421007
"datastore-loaded resource.",
9431008
)
944-
return [], [], msg
1009+
return [], [], None, msg
9451010

9461011
result = response.get("result", {})
1012+
# CKAN returns `total` for free here — a true count of rows
1013+
# matching the filter, not capped by limit.
1014+
total_val = result.get("total")
1015+
try:
1016+
total = int(total_val) if total_val is not None else None
1017+
except (TypeError, ValueError):
1018+
total = None
9471019
return (
9481020
result.get("records", []),
9491021
result.get("fields", []),
1022+
total,
9501023
None,
9511024
)
9521025

@@ -1294,7 +1367,7 @@ async def search_and_query(
12941367
),
12951368
}
12961369

1297-
records, fields, error = await self._query_with_schema(
1370+
records, fields, total, error = await self._query_with_schema(
12981371
resource_id=resource_id,
12991372
filters=filters or None,
13001373
where=where,
@@ -1314,6 +1387,7 @@ async def search_and_query(
13141387
"resource": chosen_resource,
13151388
"records": records,
13161389
"fields": fields,
1390+
"total": total,
13171391
"alternate_datasets": datasets,
13181392
}
13191393

@@ -1508,14 +1582,44 @@ def _format_query_results(
15081582
records: List[Dict[str, Any]],
15091583
fields: Optional[List[Dict[str, Any]]] = None,
15101584
limit: int = 100,
1585+
total: Optional[int] = None,
15111586
) -> str:
15121587
"""Format query results for user display."""
1588+
n_returned = len(records)
1589+
truncated_warning = self._format_truncation_block(
1590+
n_returned, limit, total
1591+
)
1592+
15131593
if not records:
15141594
text = "No records found matching the query."
1595+
parts = [truncated_warning, text] if truncated_warning else [text]
15151596
schema_footer = self._format_schema_footer(fields)
1516-
return f"{text}\n\n{schema_footer}" if schema_footer else text
1597+
if schema_footer:
1598+
parts.append("")
1599+
parts.append(schema_footer)
1600+
return "\n".join(parts)
15171601

1518-
lines = [f"Found {len(records)} record(s) (showing up to {limit}):\n"]
1602+
lines: List[str] = []
1603+
if truncated_warning:
1604+
lines.append(truncated_warning)
1605+
lines.append("")
1606+
1607+
# Header line: prefer "true total" wording when known, since the
1608+
# model has consistently been mistaking len(records) for the total.
1609+
if total is not None and total != n_returned:
1610+
lines.append(
1611+
f"total_matching_rows: {total} (returned {n_returned}, "
1612+
f"limit={limit})\n"
1613+
)
1614+
elif total is not None:
1615+
lines.append(
1616+
f"total_matching_rows: {total} (limit={limit})\n"
1617+
)
1618+
else:
1619+
lines.append(
1620+
f"returned_rows: {n_returned} (limit={limit}, "
1621+
"total unknown — see warning above if any)\n"
1622+
)
15191623

15201624
# Show first few records as examples
15211625
for i, record in enumerate(records[:5], 1):
@@ -1525,8 +1629,8 @@ def _format_query_results(
15251629
lines.append(f" {key}: {value}")
15261630
lines.append("")
15271631

1528-
if len(records) > 5:
1529-
lines.append(f"... and {len(records) - 5} more record(s)")
1632+
if n_returned > 5:
1633+
lines.append(f"... and {n_returned - 5} more record(s) returned")
15301634

15311635
schema_footer = self._format_schema_footer(fields)
15321636
if schema_footer:
@@ -1535,6 +1639,53 @@ def _format_query_results(
15351639

15361640
return "\n".join(lines)
15371641

1642+
def _format_truncation_block(
1643+
self,
1644+
n_returned: int,
1645+
limit: int,
1646+
total: Optional[int],
1647+
) -> str:
1648+
"""Emit a clear warning when the result set is — or might be —
1649+
truncated by LIMIT.
1650+
1651+
Returns ``""`` when no warning is needed (result fits within limit
1652+
and total is known/equal to returned)."""
1653+
# Total known and matches returned → fits within limit, no warning.
1654+
if total is not None and total <= n_returned:
1655+
return ""
1656+
1657+
# Total known but exceeds returned → exact truncation, exact total.
1658+
if total is not None and total > n_returned:
1659+
return (
1660+
"=== TRUNCATED ===\n"
1661+
f"This query has {total} matching rows, but only "
1662+
f"{n_returned} were returned (limit={limit}). For "
1663+
"counting questions, the answer is "
1664+
f"{total}, NOT {n_returned}. To return more rows, raise "
1665+
"`limit` (max 10000) or use `ckan__execute_sql` with "
1666+
"your own LIMIT/ORDER BY. For just the count, use "
1667+
"ckan__aggregate_data with metrics="
1668+
'{"count": "count(*)"} and a matching filter — '
1669+
"it's cheaper than fetching rows.\n"
1670+
"================="
1671+
)
1672+
1673+
# Total unknown and we hit the limit exactly → likely truncated.
1674+
if total is None and n_returned >= limit:
1675+
return (
1676+
"=== MAY BE TRUNCATED ===\n"
1677+
f"Result returned exactly the requested limit "
1678+
f"({limit}) and the true total could not be determined. "
1679+
"Treat this as a possibly-incomplete sample. For "
1680+
"counting questions, do NOT report "
1681+
f"{n_returned} as the answer — use ckan__aggregate_data "
1682+
'with metrics={"count": "count(*)"} and the same '
1683+
"filter, or re-run with a higher limit.\n"
1684+
"========================"
1685+
)
1686+
1687+
return ""
1688+
15381689
def _format_schema_footer(
15391690
self, fields: Optional[List[Dict[str, Any]]]
15401691
) -> str:
@@ -1587,39 +1738,71 @@ def _format_search_and_query(
15871738
resource = composite.get("resource", {}) or {}
15881739
records = composite.get("records", []) or []
15891740
fields = composite.get("fields", []) or []
1741+
total = composite.get("total")
15901742
alternates = composite.get("alternate_datasets", []) or []
15911743

15921744
dataset_id = dataset.get("id", "unknown")
15931745
dataset_title = dataset.get("title", "Untitled")
15941746
resource_id = resource.get("id", "unknown")
15951747
resource_name = resource.get("name", "Unnamed")
1748+
n_returned = len(records)
1749+
1750+
# Total line: prefer "true total" so the model can't read the
1751+
# returned-rows count as the answer to a counting question.
1752+
if total is not None and total != n_returned:
1753+
count_line = (
1754+
f"total_matching_rows: {total} "
1755+
f"(returned {n_returned}, limit={limit})"
1756+
)
1757+
elif total is not None:
1758+
count_line = f"total_matching_rows: {total} (limit={limit})"
1759+
else:
1760+
count_line = (
1761+
f"returned_rows: {n_returned} "
1762+
f"(limit={limit}, total unknown)"
1763+
)
15961764

1597-
lines: List[str] = [
1598-
"=== search_and_query result ===",
1599-
f"matched_dataset: {dataset_title}",
1600-
f"dataset_id: {dataset_id}",
1601-
f"resource_id (use with ckan__query_data): {resource_id}",
1602-
f"resource_name: {resource_name}",
1603-
f"row_count: {len(records)} (limit={limit})",
1604-
"================================",
1605-
"",
1606-
]
1765+
lines: List[str] = []
1766+
truncated_warning = self._format_truncation_block(
1767+
n_returned, limit, total
1768+
)
1769+
if truncated_warning:
1770+
lines.append(truncated_warning)
1771+
lines.append("")
1772+
1773+
lines.extend(
1774+
[
1775+
"=== search_and_query result ===",
1776+
f"matched_dataset: {dataset_title}",
1777+
f"dataset_id: {dataset_id}",
1778+
f"resource_id (use with ckan__query_data): {resource_id}",
1779+
f"resource_name: {resource_name}",
1780+
count_line,
1781+
"================================",
1782+
"",
1783+
]
1784+
)
16071785

16081786
if not records:
16091787
lines.append(
16101788
"No rows returned. Try broadening filters or pick a different "
16111789
"dataset/resource (see alternates below)."
16121790
)
16131791
else:
1614-
lines.append(f"Showing up to 5 of {len(records)} record(s):")
1792+
preview_caption = (
1793+
f"Showing first 5 of {n_returned} returned"
1794+
+ (f" (true total: {total})" if total is not None else "")
1795+
+ ":"
1796+
)
1797+
lines.append(preview_caption)
16151798
for i, record in enumerate(records[:5], 1):
16161799
lines.append(f"Record {i}:")
16171800
for key, value in record.items():
16181801
if key != "_id":
16191802
lines.append(f" {key}: {value}")
16201803
lines.append("")
1621-
if len(records) > 5:
1622-
lines.append(f"... and {len(records) - 5} more record(s)")
1804+
if n_returned > 5:
1805+
lines.append(f"... and {n_returned - 5} more record(s) returned")
16231806

16241807
schema_footer = self._format_schema_footer(fields)
16251808
if schema_footer:

0 commit comments

Comments
 (0)