@@ -651,7 +651,7 @@ async def execute_tool(
651651 filters = arguments .get ("filters" ) or {}
652652 where = arguments .get ("where" ) or None
653653 limit = arguments .get ("limit" , 100 )
654- records , fields , error = await self ._query_with_schema (
654+ records , fields , total , error = await self ._query_with_schema (
655655 resource_id = resource_id ,
656656 filters = filters ,
657657 limit = limit ,
@@ -668,7 +668,7 @@ async def execute_tool(
668668 {
669669 "type" : "text" ,
670670 "text" : self ._format_query_results (
671- records , fields , limit
671+ records , fields , limit , total
672672 ),
673673 }
674674 ],
@@ -862,7 +862,7 @@ async def query_data(
862862 List of data records (the schema-aware variant is
863863 ``_query_with_schema``).
864864 """
865- records , _fields , error = await self ._query_with_schema (
865+ records , _fields , _total , error = await self ._query_with_schema (
866866 resource_id = resource_id ,
867867 filters = filters ,
868868 limit = limit ,
@@ -872,14 +872,66 @@ async def query_data(
872872 raise RuntimeError (error )
873873 return records
874874
875+ async def _count_via_sql (
876+ self ,
877+ resource_id : str ,
878+ where_sql : str ,
879+ filters : Optional [Dict [str , Any ]] = None ,
880+ ) -> Optional [int ]:
881+ """Run a SELECT COUNT(*) with the same filters to discover the true
882+ row total when a SELECT * hit the limit.
883+
884+ Returns ``None`` if the count call itself fails (we'd rather show
885+ a 'TRUNCATED' warning than block the data response on a failed
886+ count). Returns the integer total on success.
887+ """
888+ try :
889+ sql_parts = [f'SELECT COUNT(*) AS n FROM "{ resource_id } "' ]
890+ if where_sql :
891+ sql_parts .append (f" WHERE { where_sql } " )
892+ if filters :
893+ eq_conds = [
894+ SafeSQLBuilder .build_filter_condition (f , v )
895+ for f , v in filters .items ()
896+ ]
897+ joiner = " AND " if where_sql else " WHERE "
898+ sql_parts .append (joiner + " AND " .join (eq_conds ))
899+ sql = "" .join (sql_parts )
900+ result = await self .execute_sql (sql )
901+ if result .get ("error" ):
902+ return None
903+ recs = result .get ("records" ) or []
904+ if not recs :
905+ return None
906+ n = recs [0 ].get ("n" ) or recs [0 ].get ("count" )
907+ if n is None :
908+ return None
909+ try :
910+ return int (n )
911+ except (TypeError , ValueError ):
912+ return None
913+ except Exception :
914+ return None
915+
875916 async def _query_with_schema (
876917 self ,
877918 resource_id : str ,
878919 filters : Optional [Dict [str , Any ]] = None ,
879920 limit : int = 100 ,
880921 where : Optional [Dict [str , Any ]] = None ,
881- ) -> Tuple [List [Dict [str , Any ]], List [Dict [str , Any ]], Optional [str ]]:
882- """Query datastore and return (records, fields, error_message).
922+ ) -> Tuple [
923+ List [Dict [str , Any ]],
924+ List [Dict [str , Any ]],
925+ Optional [int ],
926+ Optional [str ],
927+ ]:
928+ """Query datastore and return (records, fields, total, error).
929+
930+ ``total`` is the true number of rows matching the filter, regardless
931+ of LIMIT. CKAN's datastore_search returns this for free; for the
932+ SQL (``where``) path we issue a follow-up COUNT(*) only when the
933+ result hit the limit (otherwise len(records) IS the total).
934+ Returns ``None`` when we couldn't determine a total.
883935
884936 Routes through datastore_search_sql when ``where`` is set so the
885937 caller can express ranges/IN/LIKE; otherwise falls back to the
@@ -891,7 +943,7 @@ async def _query_with_schema(
891943 where_sql = SafeSQLBuilder .build_where_clause (where )
892944 limit_int = SafeSQLBuilder .clamp_limit (limit )
893945 except ValueError as e :
894- return [], [], str (e )
946+ return [], [], None , str (e )
895947
896948 sql_parts = [f'SELECT * FROM "{ validated_id } "' ]
897949 if where_sql :
@@ -904,20 +956,32 @@ async def _query_with_schema(
904956 for f , v in filters .items ()
905957 ]
906958 except ValueError as e :
907- return [], [], str (e )
959+ return [], [], None , str (e )
908960 joiner = " AND " if where_sql else " WHERE "
909961 sql_parts .append (joiner + " AND " .join (eq_conds ))
910962 sql_parts .append (f" LIMIT { limit_int } " )
911963 sql = "" .join (sql_parts )
912964
913965 result = await self .execute_sql (sql )
914966 if result .get ("error" ):
915- return [], [], result .get ("message" , "SQL execution failed" )
916- return (
917- result .get ("records" , []),
918- result .get ("fields" , []),
919- None ,
920- )
967+ return [], [], None , result .get (
968+ "message" , "SQL execution failed"
969+ )
970+ records = result .get ("records" , [])
971+ fields = result .get ("fields" , [])
972+
973+ # If we hit the LIMIT exactly, we don't actually know the total —
974+ # do a cheap COUNT(*) follow-up so the model gets a real number
975+ # instead of mistaking the limit for the count.
976+ total : Optional [int ]
977+ if len (records ) >= limit_int :
978+ total = await self ._count_via_sql (
979+ validated_id , where_sql , filters
980+ )
981+ else :
982+ total = len (records )
983+
984+ return records , fields , total , None
921985
922986 # No `where` → cheap datastore_search path.
923987 params : Dict [str , Any ] = {"resource_id" : resource_id , "limit" : limit }
@@ -932,6 +996,7 @@ async def _query_with_schema(
932996 return (
933997 [],
934998 [],
999+ None ,
9351000 f"{ msg } \n "
9361001 "Hint: this resource may exist as a file download "
9371002 "(GeoJSON/KML/SHP/PDF) but not be loaded into the "
@@ -941,12 +1006,20 @@ async def _query_with_schema(
9411006 "ckan__search_and_query, which auto-picks the "
9421007 "datastore-loaded resource." ,
9431008 )
944- return [], [], msg
1009+ return [], [], None , msg
9451010
9461011 result = response .get ("result" , {})
1012+ # CKAN returns `total` for free here — a true count of rows
1013+ # matching the filter, not capped by limit.
1014+ total_val = result .get ("total" )
1015+ try :
1016+ total = int (total_val ) if total_val is not None else None
1017+ except (TypeError , ValueError ):
1018+ total = None
9471019 return (
9481020 result .get ("records" , []),
9491021 result .get ("fields" , []),
1022+ total ,
9501023 None ,
9511024 )
9521025
@@ -1294,7 +1367,7 @@ async def search_and_query(
12941367 ),
12951368 }
12961369
1297- records , fields , error = await self ._query_with_schema (
1370+ records , fields , total , error = await self ._query_with_schema (
12981371 resource_id = resource_id ,
12991372 filters = filters or None ,
13001373 where = where ,
@@ -1314,6 +1387,7 @@ async def search_and_query(
13141387 "resource" : chosen_resource ,
13151388 "records" : records ,
13161389 "fields" : fields ,
1390+ "total" : total ,
13171391 "alternate_datasets" : datasets ,
13181392 }
13191393
@@ -1508,14 +1582,44 @@ def _format_query_results(
15081582 records : List [Dict [str , Any ]],
15091583 fields : Optional [List [Dict [str , Any ]]] = None ,
15101584 limit : int = 100 ,
1585+ total : Optional [int ] = None ,
15111586 ) -> str :
15121587 """Format query results for user display."""
1588+ n_returned = len (records )
1589+ truncated_warning = self ._format_truncation_block (
1590+ n_returned , limit , total
1591+ )
1592+
15131593 if not records :
15141594 text = "No records found matching the query."
1595+ parts = [truncated_warning , text ] if truncated_warning else [text ]
15151596 schema_footer = self ._format_schema_footer (fields )
1516- return f"{ text } \n \n { schema_footer } " if schema_footer else text
1597+ if schema_footer :
1598+ parts .append ("" )
1599+ parts .append (schema_footer )
1600+ return "\n " .join (parts )
15171601
1518- lines = [f"Found { len (records )} record(s) (showing up to { limit } ):\n " ]
1602+ lines : List [str ] = []
1603+ if truncated_warning :
1604+ lines .append (truncated_warning )
1605+ lines .append ("" )
1606+
1607+ # Header line: prefer "true total" wording when known, since the
1608+ # model has consistently been mistaking len(records) for the total.
1609+ if total is not None and total != n_returned :
1610+ lines .append (
1611+ f"total_matching_rows: { total } (returned { n_returned } , "
1612+ f"limit={ limit } )\n "
1613+ )
1614+ elif total is not None :
1615+ lines .append (
1616+ f"total_matching_rows: { total } (limit={ limit } )\n "
1617+ )
1618+ else :
1619+ lines .append (
1620+ f"returned_rows: { n_returned } (limit={ limit } , "
1621+ "total unknown — see warning above if any)\n "
1622+ )
15191623
15201624 # Show first few records as examples
15211625 for i , record in enumerate (records [:5 ], 1 ):
@@ -1525,8 +1629,8 @@ def _format_query_results(
15251629 lines .append (f" { key } : { value } " )
15261630 lines .append ("" )
15271631
1528- if len ( records ) > 5 :
1529- lines .append (f"... and { len ( records ) - 5 } more record(s)" )
1632+ if n_returned > 5 :
1633+ lines .append (f"... and { n_returned - 5 } more record(s) returned " )
15301634
15311635 schema_footer = self ._format_schema_footer (fields )
15321636 if schema_footer :
@@ -1535,6 +1639,53 @@ def _format_query_results(
15351639
15361640 return "\n " .join (lines )
15371641
1642+ def _format_truncation_block (
1643+ self ,
1644+ n_returned : int ,
1645+ limit : int ,
1646+ total : Optional [int ],
1647+ ) -> str :
1648+ """Emit a clear warning when the result set is — or might be —
1649+ truncated by LIMIT.
1650+
1651+ Returns ``""`` when no warning is needed (result fits within limit
1652+ and total is known/equal to returned)."""
1653+ # Total known and matches returned → fits within limit, no warning.
1654+ if total is not None and total <= n_returned :
1655+ return ""
1656+
1657+ # Total known but exceeds returned → exact truncation, exact total.
1658+ if total is not None and total > n_returned :
1659+ return (
1660+ "=== TRUNCATED ===\n "
1661+ f"This query has { total } matching rows, but only "
1662+ f"{ n_returned } were returned (limit={ limit } ). For "
1663+ "counting questions, the answer is "
1664+ f"{ total } , NOT { n_returned } . To return more rows, raise "
1665+ "`limit` (max 10000) or use `ckan__execute_sql` with "
1666+ "your own LIMIT/ORDER BY. For just the count, use "
1667+ "ckan__aggregate_data with metrics="
1668+ '{"count": "count(*)"} and a matching filter — '
1669+ "it's cheaper than fetching rows.\n "
1670+ "================="
1671+ )
1672+
1673+ # Total unknown and we hit the limit exactly → likely truncated.
1674+ if total is None and n_returned >= limit :
1675+ return (
1676+ "=== MAY BE TRUNCATED ===\n "
1677+ f"Result returned exactly the requested limit "
1678+ f"({ limit } ) and the true total could not be determined. "
1679+ "Treat this as a possibly-incomplete sample. For "
1680+ "counting questions, do NOT report "
1681+ f"{ n_returned } as the answer — use ckan__aggregate_data "
1682+ 'with metrics={"count": "count(*)"} and the same '
1683+ "filter, or re-run with a higher limit.\n "
1684+ "========================"
1685+ )
1686+
1687+ return ""
1688+
15381689 def _format_schema_footer (
15391690 self , fields : Optional [List [Dict [str , Any ]]]
15401691 ) -> str :
@@ -1587,39 +1738,71 @@ def _format_search_and_query(
15871738 resource = composite .get ("resource" , {}) or {}
15881739 records = composite .get ("records" , []) or []
15891740 fields = composite .get ("fields" , []) or []
1741+ total = composite .get ("total" )
15901742 alternates = composite .get ("alternate_datasets" , []) or []
15911743
15921744 dataset_id = dataset .get ("id" , "unknown" )
15931745 dataset_title = dataset .get ("title" , "Untitled" )
15941746 resource_id = resource .get ("id" , "unknown" )
15951747 resource_name = resource .get ("name" , "Unnamed" )
1748+ n_returned = len (records )
1749+
1750+ # Total line: prefer "true total" so the model can't read the
1751+ # returned-rows count as the answer to a counting question.
1752+ if total is not None and total != n_returned :
1753+ count_line = (
1754+ f"total_matching_rows: { total } "
1755+ f"(returned { n_returned } , limit={ limit } )"
1756+ )
1757+ elif total is not None :
1758+ count_line = f"total_matching_rows: { total } (limit={ limit } )"
1759+ else :
1760+ count_line = (
1761+ f"returned_rows: { n_returned } "
1762+ f"(limit={ limit } , total unknown)"
1763+ )
15961764
1597- lines : List [str ] = [
1598- "=== search_and_query result ===" ,
1599- f"matched_dataset: { dataset_title } " ,
1600- f"dataset_id: { dataset_id } " ,
1601- f"resource_id (use with ckan__query_data): { resource_id } " ,
1602- f"resource_name: { resource_name } " ,
1603- f"row_count: { len (records )} (limit={ limit } )" ,
1604- "================================" ,
1605- "" ,
1606- ]
1765+ lines : List [str ] = []
1766+ truncated_warning = self ._format_truncation_block (
1767+ n_returned , limit , total
1768+ )
1769+ if truncated_warning :
1770+ lines .append (truncated_warning )
1771+ lines .append ("" )
1772+
1773+ lines .extend (
1774+ [
1775+ "=== search_and_query result ===" ,
1776+ f"matched_dataset: { dataset_title } " ,
1777+ f"dataset_id: { dataset_id } " ,
1778+ f"resource_id (use with ckan__query_data): { resource_id } " ,
1779+ f"resource_name: { resource_name } " ,
1780+ count_line ,
1781+ "================================" ,
1782+ "" ,
1783+ ]
1784+ )
16071785
16081786 if not records :
16091787 lines .append (
16101788 "No rows returned. Try broadening filters or pick a different "
16111789 "dataset/resource (see alternates below)."
16121790 )
16131791 else :
1614- lines .append (f"Showing up to 5 of { len (records )} record(s):" )
1792+ preview_caption = (
1793+ f"Showing first 5 of { n_returned } returned"
1794+ + (f" (true total: { total } )" if total is not None else "" )
1795+ + ":"
1796+ )
1797+ lines .append (preview_caption )
16151798 for i , record in enumerate (records [:5 ], 1 ):
16161799 lines .append (f"Record { i } :" )
16171800 for key , value in record .items ():
16181801 if key != "_id" :
16191802 lines .append (f" { key } : { value } " )
16201803 lines .append ("" )
1621- if len ( records ) > 5 :
1622- lines .append (f"... and { len ( records ) - 5 } more record(s)" )
1804+ if n_returned > 5 :
1805+ lines .append (f"... and { n_returned - 5 } more record(s) returned " )
16231806
16241807 schema_footer = self ._format_schema_footer (fields )
16251808 if schema_footer :
0 commit comments