From f507a299ddf33ab9c310ee9826cd5e196ad27c74 Mon Sep 17 00:00:00 2001 From: tprevot Date: Wed, 14 Jan 2026 12:02:21 +0100 Subject: [PATCH 1/5] fix(aggs): top_hits panics with unknown fields --- src/aggregation/metric/top_hits.rs | 94 ++++++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 11 deletions(-) diff --git a/src/aggregation/metric/top_hits.rs b/src/aggregation/metric/top_hits.rs index 54e5a5ced5..e80cb731c9 100644 --- a/src/aggregation/metric/top_hits.rs +++ b/src/aggregation/metric/top_hits.rs @@ -19,7 +19,7 @@ use crate::aggregation::{AggregationError, BucketId}; use crate::collector::sort_key::ReverseComparator; use crate::collector::TopNComputer; use crate::schema::OwnedValue; -use crate::{DocAddress, DocId, SegmentOrdinal}; +use crate::{DocAddress, DocId, SegmentOrdinal, TantivyError}; /// Contains all information required by the TopHitsSegmentCollector to perform the /// top_hits aggregation on a segment. @@ -149,7 +149,9 @@ impl Serialize for KeyOrder { impl<'de> Deserialize<'de> for KeyOrder { fn deserialize(deserializer: D) -> Result - where D: Deserializer<'de> { + where + D: Deserializer<'de>, + { let mut key_order = >::deserialize(deserializer)?.into_iter(); let (field, order) = key_order.next().ok_or(serde::de::Error::custom( "Expected exactly one key-value pair in sort parameter of top_hits, found none", @@ -218,12 +220,16 @@ impl TopHitsAggregationReq { .doc_value_fields .iter() .map(|field| { - if !field.contains('*') - && reader + if !field.contains('*') { + return reader .iter_columns()? - .any(|(name, _)| name.as_str() == field) - { - return Ok(vec![field.to_owned()]); + .find(|(name, _)| name == field) + .map(|_| vec![field.to_owned()]) + .ok_or_else(|| { + TantivyError::SchemaError(format!( + "Field '{field}' in docvalue_fields does not exist" + )) + }); } let pattern = globbed_string_to_regex(field)?; @@ -235,10 +241,13 @@ impl TopHitsAggregationReq { }) .filter(|name| pattern.is_match(name)) .collect::>(); - assert!( - !fields.is_empty(), - "No fields matched the glob '{field}' in docvalue_fields" - ); + + if fields.is_empty() { + return Err(TantivyError::SchemaError(format!( + "No fields matched the glob '{field}' in docvalue_fields" + ))); + } + Ok(fields) }) .collect::>>()? @@ -663,6 +672,7 @@ mod tests { use crate::collector::ComparableDoc; use crate::query::AllQuery; use crate::schema::OwnedValue; + use crate::TantivyError; fn invert_order(cmp_feature: DocValueAndOrder) -> DocValueAndOrder { let DocValueAndOrder { value, order } = cmp_feature; @@ -939,4 +949,66 @@ mod tests { fn test_aggregation_top_hits_multi_segment() -> crate::Result<()> { test_aggregation_top_hits(false) } + + #[test] + fn test_aggregation_top_hits_unknown_field() -> crate::Result<()> { + let docs = vec![ + vec![ + r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb", "text2": "bbb", "mixed": { "dyn_arr": [1, "2"] } }"#, + r#"{ "date": "2017-06-15T00:00:00Z", "text": "ccc", "text2": "ddd", "mixed": { "dyn_arr": [3, "4"] } }"#, + ], + vec![ + r#"{ "text": "aaa", "text2": "bbb", "date": "2018-01-02T00:00:00Z", "mixed": { "dyn_arr": ["9", 8] } }"#, + r#"{ "text": "aaa", "text2": "bbb", "date": "2016-01-02T00:00:00Z", "mixed": { "dyn_arr": ["7", 6] } }"#, + ], + ]; + + let index = get_test_index_from_docs(false, &docs)?; + + let d: Aggregations = serde_json::from_value(json!({ + "top_hits_req": { + "top_hits": { + "size": 2, + "sort": [ + { "date": "desc" } + ], + "from": 1, + "docvalue_fields": [ + "invalid_basic_field", + ], + } + } + }))?; + + let collector = AggregationCollector::from_aggs(d, Default::default()); + let reader = index.reader()?; + let searcher = reader.searcher(); + + let agg_res = searcher.search(&AllQuery, &collector).unwrap_err(); + assert!(matches!(agg_res, TantivyError::SchemaError(_))); + + let d: Aggregations = serde_json::from_value(json!({ + "top_hits_req": { + "top_hits": { + "size": 2, + "sort": [ + { "date": "desc" } + ], + "from": 1, + "docvalue_fields": [ + "invalid_glob_fie*", + ], + } + } + }))?; + + let collector = AggregationCollector::from_aggs(d, Default::default()); + let reader = index.reader()?; + let searcher = reader.searcher(); + + let agg_res = searcher.search(&AllQuery, &collector).unwrap_err(); + assert!(matches!(agg_res, TantivyError::SchemaError(_))); + + Ok(()) + } } From dc570f86f131f46400dc95d92100b9612277354a Mon Sep 17 00:00:00 2001 From: tprevot Date: Wed, 14 Jan 2026 12:03:30 +0100 Subject: [PATCH 2/5] fix(aggs): top_hits panics with unknown fields --- src/aggregation/metric/top_hits.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/aggregation/metric/top_hits.rs b/src/aggregation/metric/top_hits.rs index e80cb731c9..013eeb2d2d 100644 --- a/src/aggregation/metric/top_hits.rs +++ b/src/aggregation/metric/top_hits.rs @@ -149,9 +149,7 @@ impl Serialize for KeyOrder { impl<'de> Deserialize<'de> for KeyOrder { fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { + where D: Deserializer<'de> { let mut key_order = >::deserialize(deserializer)?.into_iter(); let (field, order) = key_order.next().ok_or(serde::de::Error::custom( "Expected exactly one key-value pair in sort parameter of top_hits, found none", From 862b4cfef0eb3d622271e021879cd8e33745e895 Mon Sep 17 00:00:00 2001 From: tprevot Date: Wed, 14 Jan 2026 12:17:38 +0100 Subject: [PATCH 3/5] fix(aggs): top_hits incorrect escaping when matching non glob fields --- src/aggregation/metric/top_hits.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/aggregation/metric/top_hits.rs b/src/aggregation/metric/top_hits.rs index 013eeb2d2d..6a3e8dc4b7 100644 --- a/src/aggregation/metric/top_hits.rs +++ b/src/aggregation/metric/top_hits.rs @@ -221,7 +221,7 @@ impl TopHitsAggregationReq { if !field.contains('*') { return reader .iter_columns()? - .find(|(name, _)| name == field) + .find(|(name, _)| &name.replace(JSON_PATH_SEGMENT_SEP_STR, ".") == field) .map(|_| vec![field.to_owned()]) .ok_or_else(|| { TantivyError::SchemaError(format!( @@ -871,12 +871,12 @@ mod tests { fn test_aggregation_top_hits(merge_segments: bool) -> crate::Result<()> { let docs = vec![ vec![ - r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb", "text2": "bbb", "mixed": { "dyn_arr": [1, "2"] } }"#, - r#"{ "date": "2017-06-15T00:00:00Z", "text": "ccc", "text2": "ddd", "mixed": { "dyn_arr": [3, "4"] } }"#, + r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb", "text2": "bbb", "mixed": { "dyn_arr": [1, "2"], "dyn_str": "foo" } }"#, + r#"{ "date": "2017-06-15T00:00:00Z", "text": "ccc", "text2": "ddd", "mixed": { "dyn_arr": [3, "4"], "dyn_str": "bar" } }"#, ], vec![ - r#"{ "text": "aaa", "text2": "bbb", "date": "2018-01-02T00:00:00Z", "mixed": { "dyn_arr": ["9", 8] } }"#, - r#"{ "text": "aaa", "text2": "bbb", "date": "2016-01-02T00:00:00Z", "mixed": { "dyn_arr": ["7", 6] } }"#, + r#"{ "text": "aaa", "text2": "bbb", "date": "2018-01-02T00:00:00Z", "mixed": { "dyn_arr": ["9", 8], "dyn_str": "baz" } }"#, + r#"{ "text": "aaa", "text2": "bbb", "date": "2016-01-02T00:00:00Z", "mixed": { "dyn_arr": ["7", 6], "dyn_str": "bor" } }"#, ], ]; @@ -894,6 +894,7 @@ mod tests { "date", "tex*", "mixed.*", + "mixed.dyn_str" ], } } @@ -920,6 +921,7 @@ mod tests { "text": [ "ccc" ], "text2": [ "ddd" ], "mixed.dyn_arr": [ 3, "4" ], + "mixed.dyn_str": [ "bar" ], } }, { @@ -929,6 +931,7 @@ mod tests { "text": [ "aaa" ], "text2": [ "bbb" ], "mixed.dyn_arr": [ 6, "7" ], + "mixed.dyn_str": [ "bor" ], } } ] From d0700a876f7e81d85076215488a95ce0e5d18d42 Mon Sep 17 00:00:00 2001 From: tprevot Date: Wed, 14 Jan 2026 13:46:19 +0100 Subject: [PATCH 4/5] fix(aggs): make top hits agg req fields pub --- src/aggregation/metric/top_hits.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/aggregation/metric/top_hits.rs b/src/aggregation/metric/top_hits.rs index 6a3e8dc4b7..0f3b78e687 100644 --- a/src/aggregation/metric/top_hits.rs +++ b/src/aggregation/metric/top_hits.rs @@ -115,13 +115,17 @@ impl TopHitsAggReqData { /// ``` #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)] pub struct TopHitsAggregationReq { - sort: Vec, - size: usize, - from: Option, - + /// The sort criteria to determine the top hits. + pub sort: Vec, + /// The number of top hits to return. + pub size: usize, + /// The number of top hits to skip. + pub from: Option, + + /// The list of fast fields to retrieve for each document. #[serde(rename = "docvalue_fields")] #[serde(default)] - doc_value_fields: Vec, + pub doc_value_fields: Vec, // Not supported _source: Option, @@ -132,9 +136,12 @@ pub struct TopHitsAggregationReq { version: Option, } +/// A field and its associated sort order. #[derive(Debug, Clone, PartialEq, Default)] -struct KeyOrder { +pub struct KeyOrder { + /// The field name. field: String, + /// The sort order. order: Order, } From 4d986ba7ee4d4f59039bbe2096bb0235f91550f4 Mon Sep 17 00:00:00 2001 From: tprevot Date: Wed, 14 Jan 2026 15:46:11 +0100 Subject: [PATCH 5/5] fix(aggs): use `FastFieldReaders::resolve_field` to check if a field exists --- src/aggregation/agg_data.rs | 2 +- src/aggregation/metric/top_hits.rs | 22 +++++++++++----------- src/fastfield/readers.rs | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/aggregation/agg_data.rs b/src/aggregation/agg_data.rs index de521505e3..576c33e48d 100644 --- a/src/aggregation/agg_data.rs +++ b/src/aggregation/agg_data.rs @@ -677,7 +677,7 @@ fn build_nodes( } AggregationVariants::TopHits(top_hits_req) => { let mut top_hits = top_hits_req.clone(); - top_hits.validate_and_resolve_field_names(reader.fast_fields().columnar())?; + top_hits.validate_and_resolve_field_names(reader.fast_fields())?; let accessors: Vec<(Column, ColumnType)> = top_hits .field_names() .iter() diff --git a/src/aggregation/metric/top_hits.rs b/src/aggregation/metric/top_hits.rs index 0f3b78e687..703cba8abe 100644 --- a/src/aggregation/metric/top_hits.rs +++ b/src/aggregation/metric/top_hits.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::net::Ipv6Addr; -use columnar::{Column, ColumnType, ColumnarReader, DynamicColumn}; +use columnar::{Column, ColumnType, DynamicColumn}; use common::json_path_writer::JSON_PATH_SEGMENT_SEP_STR; use common::DateTime; use regex::Regex; @@ -18,6 +18,7 @@ use crate::aggregation::segment_agg_result::SegmentAggregationCollector; use crate::aggregation::{AggregationError, BucketId}; use crate::collector::sort_key::ReverseComparator; use crate::collector::TopNComputer; +use crate::fastfield::FastFieldReaders; use crate::schema::OwnedValue; use crate::{DocAddress, DocId, SegmentOrdinal, TantivyError}; @@ -200,7 +201,7 @@ impl TopHitsAggregationReq { /// Validate and resolve field retrieval parameters pub fn validate_and_resolve_field_names( &mut self, - reader: &ColumnarReader, + reader: &FastFieldReaders, ) -> crate::Result<()> { if self._source.is_some() { use_doc_value_fields_err("_source")?; @@ -226,19 +227,18 @@ impl TopHitsAggregationReq { .iter() .map(|field| { if !field.contains('*') { - return reader - .iter_columns()? - .find(|(name, _)| &name.replace(JSON_PATH_SEGMENT_SEP_STR, ".") == field) - .map(|_| vec![field.to_owned()]) - .ok_or_else(|| { - TantivyError::SchemaError(format!( - "Field '{field}' in docvalue_fields does not exist" - )) - }); + reader.resolve_field(field)?.ok_or_else(|| { + TantivyError::SchemaError(format!( + "Field '{field}' in docvalue_fields does not exist" + )) + })?; + + return Ok(vec![field.to_owned()]); } let pattern = globbed_string_to_regex(field)?; let fields = reader + .columnar() .iter_columns()? .map(|(name, _)| { // normalize path from internal fast field repr diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 083f795322..2416544f64 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -30,7 +30,7 @@ impl FastFieldReaders { Ok(FastFieldReaders { columnar, schema }) } - fn resolve_field(&self, column_name: &str) -> crate::Result> { + pub(crate) fn resolve_field(&self, column_name: &str) -> crate::Result> { let default_field_opt: Option = if cfg!(feature = "quickwit") { self.schema.get_field("_dynamic").ok() } else {