From 93fe202ff4d8d45dc75eaa347b85f25a5f4d324c Mon Sep 17 00:00:00 2001 From: Mohammad Dashti Date: Wed, 14 Jan 2026 22:27:30 -1000 Subject: [PATCH] Added a flag: strict_validation --- src/aggregation/accessor_helpers.rs | 70 ++++++++++++++++------------- src/aggregation/agg_data.rs | 48 ++++++++++++++++---- src/aggregation/agg_tests.rs | 17 ++++--- src/aggregation/mod.rs | 43 +++++++++++++++++- 4 files changed, 132 insertions(+), 46 deletions(-) diff --git a/src/aggregation/accessor_helpers.rs b/src/aggregation/accessor_helpers.rs index cbd24c6b4..42ddb6750 100644 --- a/src/aggregation/accessor_helpers.rs +++ b/src/aggregation/accessor_helpers.rs @@ -55,11 +55,13 @@ pub(crate) fn get_numeric_or_date_column_types() -> &'static [ColumnType] { ] } -/// Get fast field reader or return an error if the field doesn't exist. +/// Get fast field reader or return an error if the field doesn't exist (when strict validation is +/// enabled). pub(crate) fn get_ff_reader( reader: &SegmentReader, field_name: &str, allowed_column_types: Option<&[ColumnType]>, + strict_validation: bool, ) -> crate::Result<(columnar::Column, ColumnType)> { let ff_fields = reader.fast_fields(); let ff_field_with_type = ff_fields.u64_lenient_for_type(allowed_column_types, field_name)?; @@ -67,26 +69,28 @@ pub(crate) fn get_ff_reader( match ff_field_with_type { Some(field) => Ok(field), None => { - // Check if the field exists in the schema but is not a fast field - let schema = reader.schema(); - if let Some((field, _path)) = schema.find_field(field_name) { - let field_type = schema.get_field_entry(field).field_type(); - if !field_type.is_fast() { - return Err(crate::TantivyError::SchemaError(format!( - "Field '{}' is not a fast field. Aggregations require fast fields.", - field_name - ))); + if strict_validation { + // Check if the field exists in the schema but is not a fast field + let schema = reader.schema(); + if let Some((field, _path)) = schema.find_field(field_name) { + let field_type = schema.get_field_entry(field).field_type(); + if !field_type.is_fast() { + return Err(crate::TantivyError::SchemaError(format!( + "Field '{}' is not a fast field. Aggregations require fast fields.", + field_name + ))); + } + } + + // Field doesn't exist at all or has no values in this segment + // Check if it exists in schema to provide a better error message + if schema.find_field(field_name).is_none() { + return Err(crate::TantivyError::FieldNotFound(field_name.to_string())); } } - // Field doesn't exist at all or has no values in this segment - // Check if it exists in schema to provide a better error message - if schema.find_field(field_name).is_none() { - return Err(crate::TantivyError::FieldNotFound(field_name.to_string())); - } - // Field exists in schema and is a fast field, but has no values in this segment - // This is acceptable - return an empty column + // OR strict validation is disabled - return an empty column Ok(( Column::build_empty_column(reader.num_docs()), ColumnType::U64, @@ -111,35 +115,39 @@ pub(crate) fn get_dynamic_columns( /// Get all fast field reader or empty as default. /// /// Is guaranteed to return at least one column. -/// Returns an error if the field doesn't exist in the schema or is not a fast field. +/// Returns an error if the field doesn't exist in the schema or is not a fast field (when strict +/// validation is enabled). pub(crate) fn get_all_ff_reader_or_empty( reader: &SegmentReader, field_name: &str, allowed_column_types: Option<&[ColumnType]>, fallback_type: ColumnType, + strict_validation: bool, ) -> crate::Result, ColumnType)>> { let ff_fields = reader.fast_fields(); let mut ff_field_with_type = ff_fields.u64_lenient_for_type_all(allowed_column_types, field_name)?; if ff_field_with_type.is_empty() { - // Check if the field exists in the schema but is not a fast field - let schema = reader.schema(); - if let Some((field, _path)) = schema.find_field(field_name) { - let field_type = schema.get_field_entry(field).field_type(); - if !field_type.is_fast() { - return Err(crate::TantivyError::SchemaError(format!( - "Field '{}' is not a fast field. Aggregations require fast fields.", - field_name - ))); + if strict_validation { + // Check if the field exists in the schema but is not a fast field + let schema = reader.schema(); + if let Some((field, _path)) = schema.find_field(field_name) { + let field_type = schema.get_field_entry(field).field_type(); + if !field_type.is_fast() { + return Err(crate::TantivyError::SchemaError(format!( + "Field '{}' is not a fast field. Aggregations require fast fields.", + field_name + ))); + } + } else { + // Field doesn't exist in the schema at all + return Err(crate::TantivyError::FieldNotFound(field_name.to_string())); } - } else { - // Field doesn't exist in the schema at all - return Err(crate::TantivyError::FieldNotFound(field_name.to_string())); } // Field exists in schema and is a fast field, but has no values in this segment - // This is acceptable - return an empty column + // OR strict validation is disabled - return an empty column ff_field_with_type.push((Column::build_empty_column(reader.num_docs()), fallback_type)); } Ok(ff_field_with_type) diff --git a/src/aggregation/agg_data.rs b/src/aggregation/agg_data.rs index 156b014f3..e646f6993 100644 --- a/src/aggregation/agg_data.rs +++ b/src/aggregation/agg_data.rs @@ -501,6 +501,7 @@ fn build_nodes( reader, &range_req.field, Some(get_numeric_or_date_column_types()), + data.context.strict_field_validation, )?; let idx_in_req_data = data.push_range_req_data(RangeAggReqData { accessor, @@ -521,6 +522,7 @@ fn build_nodes( reader, &histo_req.field, Some(get_numeric_or_date_column_types()), + data.context.strict_field_validation, )?; let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData { accessor, @@ -542,8 +544,12 @@ fn build_nodes( }]) } DateHistogram(date_req) => { - let (accessor, field_type) = - get_ff_reader(reader, &date_req.field, Some(&[ColumnType::DateTime]))?; + let (accessor, field_type) = get_ff_reader( + reader, + &date_req.field, + Some(&[ColumnType::DateTime]), + data.context.strict_field_validation, + )?; // Convert to histogram request, normalize to ns precision let mut histo_req = date_req.to_histogram_req()?; histo_req.normalize_date_time(); @@ -626,7 +632,12 @@ fn build_nodes( )) } }; - let (accessor, field_type) = get_ff_reader(reader, field, allowed_column_types)?; + let (accessor, field_type) = get_ff_reader( + reader, + field, + allowed_column_types, + data.context.strict_field_validation, + )?; let idx_in_req_data = data.push_metric_req_data(MetricAggReqData { accessor, field_type, @@ -653,6 +664,7 @@ fn build_nodes( reader, percentiles_req.field_name(), Some(get_numeric_or_date_column_types()), + data.context.strict_field_validation, )?; let idx_in_req_data = data.push_metric_req_data(MetricAggReqData { accessor, @@ -681,7 +693,14 @@ fn build_nodes( let accessors: Vec<(Column, ColumnType)> = top_hits .field_names() .iter() - .map(|field| get_ff_reader(reader, field, Some(get_numeric_or_date_column_types()))) + .map(|field| { + get_ff_reader( + reader, + field, + Some(get_numeric_or_date_column_types()), + data.context.strict_field_validation, + ) + }) .collect::>()?; let value_accessors = top_hits @@ -767,6 +786,7 @@ fn get_term_agg_accessors( reader: &SegmentReader, field_name: &str, missing: &Option, + strict_validation: bool, ) -> crate::Result, ColumnType)>> { let allowed_column_types = [ ColumnType::I64, @@ -794,6 +814,7 @@ fn get_term_agg_accessors( field_name, Some(&allowed_column_types), fallback_type, + strict_validation, )?; Ok(column_and_types) @@ -828,7 +849,12 @@ fn build_terms_or_cardinality_nodes( let str_dict_column = reader.fast_fields().str(field_name)?; - let column_and_types = get_term_agg_accessors(reader, field_name, missing)?; + let column_and_types = get_term_agg_accessors( + reader, + field_name, + missing, + data.context.strict_field_validation, + )?; // Special handling when missing + multi column or incompatible type on text/date. let missing_and_more_than_one_col = column_and_types.len() > 1 && missing.is_some(); @@ -850,9 +876,15 @@ fn build_terms_or_cardinality_nodes( Key::U64(_) => ColumnType::U64, }) .unwrap_or(ColumnType::U64); - let all_accessors = get_all_ff_reader_or_empty(reader, field_name, None, fallback_type)? - .into_iter() - .collect::>(); + let all_accessors = get_all_ff_reader_or_empty( + reader, + field_name, + None, + fallback_type, + data.context.strict_field_validation, + )? + .into_iter() + .collect::>(); // This case only happens when we have term aggregation, or we fail let req = req.as_terms().cloned().ok_or_else(|| { crate::TantivyError::InvalidArgument( diff --git a/src/aggregation/agg_tests.rs b/src/aggregation/agg_tests.rs index ba662116d..d02130cdc 100644 --- a/src/aggregation/agg_tests.rs +++ b/src/aggregation/agg_tests.rs @@ -1440,6 +1440,7 @@ fn test_aggregation_on_json_object_mixed_numerical_segments() { #[test] fn test_aggregation_invalid_field_returns_error() { // Test that aggregations return an error when given an invalid field name + // with strict_field_validation enabled let index = get_test_index_2_segments(false).unwrap(); let reader = index.reader().unwrap(); let searcher = reader.searcher(); @@ -1455,7 +1456,9 @@ fn test_aggregation_invalid_field_returns_error() { } }"#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); - let collector = get_collector(agg); + // Enable strict field validation + let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); + let collector = AggregationCollector::from_aggs(agg, context); let result = searcher.search(&AllQuery, &collector); assert!(result.is_err()); @@ -1477,7 +1480,8 @@ fn test_aggregation_invalid_field_returns_error() { } }"#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); - let collector = get_collector(agg); + let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); + let collector = AggregationCollector::from_aggs(agg, context); let result = searcher.search(&AllQuery, &collector); assert!(result.is_err()); @@ -1498,7 +1502,8 @@ fn test_aggregation_invalid_field_returns_error() { } }"#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); - let collector = get_collector(agg); + let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); + let collector = AggregationCollector::from_aggs(agg, context); let result = searcher.search(&AllQuery, &collector); assert!(result.is_err()); @@ -1519,7 +1524,8 @@ fn test_aggregation_invalid_field_returns_error() { } }"#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); - let collector = get_collector(agg); + let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); + let collector = AggregationCollector::from_aggs(agg, context); let result = searcher.search(&AllQuery, &collector); assert!(result.is_err()); @@ -1545,7 +1551,8 @@ fn test_aggregation_invalid_field_returns_error() { } }"#; let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); - let collector = get_collector(agg); + let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); + let collector = AggregationCollector::from_aggs(agg, context); let result = searcher.search(&AllQuery, &collector); assert!(result.is_err()); diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index b4a080d6a..44ba0dead 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -180,18 +180,57 @@ pub type BucketId = u32; /// This struct holds shared resources needed during aggregation execution: /// - `limits`: Memory and bucket limits for the aggregation /// - `tokenizers`: TokenizerManager for parsing query strings in filter aggregations -#[derive(Clone, Default)] +/// - `strict_field_validation`: Whether to enforce strict field validation (default: false) +#[derive(Clone)] pub struct AggContextParams { /// Aggregation limits (memory and bucket count) pub limits: AggregationLimitsGuard, /// Tokenizer manager for query string parsing pub tokenizers: TokenizerManager, + /// If true, aggregations will return an error when a field doesn't exist in the schema. + /// If false (default), aggregations will return empty results for non-existent fields. + /// + /// Set to false for use cases where schema can change over time (e.g., Quickwit splits). + /// Set to true to catch typos and configuration errors early. + pub strict_field_validation: bool, +} + +impl Default for AggContextParams { + fn default() -> Self { + Self { + limits: Default::default(), + tokenizers: Default::default(), + strict_field_validation: false, + } + } } impl AggContextParams { /// Create new aggregation context parameters pub fn new(limits: AggregationLimitsGuard, tokenizers: TokenizerManager) -> Self { - Self { limits, tokenizers } + Self { + limits, + tokenizers, + strict_field_validation: false, + } + } + + /// Create new aggregation context parameters with strict field validation enabled + pub fn with_strict_field_validation( + limits: AggregationLimitsGuard, + tokenizers: TokenizerManager, + ) -> Self { + Self { + limits, + tokenizers, + strict_field_validation: true, + } + } + + /// Enable or disable strict field validation + pub fn set_strict_field_validation(mut self, strict: bool) -> Self { + self.strict_field_validation = strict; + self } }