From 95db7d2e5c176d05c89390a0cda4e811f9e2eb3d Mon Sep 17 00:00:00 2001 From: Mohammad Dashti Date: Thu, 15 Jan 2026 15:37:24 -1000 Subject: [PATCH] Revert "Revert all impl." This reverts commit d5e0991549a05bf80f19f853f7689ad69f96e7e5. --- src/aggregation/accessor_helpers.rs | 64 ++--------- src/aggregation/agg_data.rs | 50 ++------ src/aggregation/agg_tests.rs | 127 --------------------- src/aggregation/bucket/term_missing_agg.rs | 2 - src/aggregation/mod.rs | 19 +-- 5 files changed, 20 insertions(+), 242 deletions(-) diff --git a/src/aggregation/accessor_helpers.rs b/src/aggregation/accessor_helpers.rs index d7f0a8bdd..fa51041e4 100644 --- a/src/aggregation/accessor_helpers.rs +++ b/src/aggregation/accessor_helpers.rs @@ -55,56 +55,22 @@ pub(crate) fn get_numeric_or_date_column_types() -> &'static [ColumnType] { ] } -/// Validates that a field exists in the schema and is configured as a fast field. -/// -/// Returns an error if: -/// - The field doesn't exist in the schema -/// - The field exists but is not a fast field -fn validate_field_for_agg(reader: &SegmentReader, field_name: &str) -> crate::Result<()> { - let schema = reader.schema(); - // Here we check two things: - // - the field is either directly in the schema or could be part of a json field present in the - // schema. - // - the field (or the json field holding it) is a fast field. - if let Some((field, _path)) = schema.find_field(field_name) { - let field_type = schema.get_field_entry(field).field_type(); - if !field_type.is_fast() { - return Err(crate::TantivyError::SchemaError(format!( - "Field '{}' is not a fast field. Aggregations require fast fields.", - field_name - ))); - } - Ok(()) - } else { - Err(crate::TantivyError::FieldNotFound(field_name.to_string())) - } -} - -/// Get fast field reader or return an error if the field doesn't exist (when strict validation is -/// enabled). +/// Get fast field reader or empty as default. pub(crate) fn get_ff_reader( reader: &SegmentReader, field_name: &str, allowed_column_types: Option<&[ColumnType]>, - strict_validation: bool, ) -> crate::Result<(columnar::Column, ColumnType)> { let ff_fields = reader.fast_fields(); - let ff_field_with_type = ff_fields.u64_lenient_for_type(allowed_column_types, field_name)?; - - if let Some(field_with_type) = ff_field_with_type { - return Ok(field_with_type); - } - - if strict_validation { - validate_field_for_agg(reader, field_name)?; - } - - // Field exists in schema and is a fast field, but has no values in this segment - // OR strict validation is disabled - return an empty column - Ok(( - Column::build_empty_column(reader.num_docs()), - ColumnType::U64, - )) + let ff_field_with_type = ff_fields + .u64_lenient_for_type(allowed_column_types, field_name)? + .unwrap_or_else(|| { + ( + Column::build_empty_column(reader.num_docs()), + ColumnType::U64, + ) + }); + Ok(ff_field_with_type) } pub(crate) fn get_dynamic_columns( @@ -123,26 +89,16 @@ pub(crate) fn get_dynamic_columns( /// Get all fast field reader or empty as default. /// /// Is guaranteed to return at least one column. -/// Returns an error if the field doesn't exist in the schema or is not a fast field (when strict -/// validation is enabled). pub(crate) fn get_all_ff_reader_or_empty( reader: &SegmentReader, field_name: &str, allowed_column_types: Option<&[ColumnType]>, fallback_type: ColumnType, - strict_validation: bool, ) -> crate::Result, ColumnType)>> { let ff_fields = reader.fast_fields(); let mut ff_field_with_type = ff_fields.u64_lenient_for_type_all(allowed_column_types, field_name)?; - if ff_field_with_type.is_empty() { - if strict_validation { - validate_field_for_agg(reader, field_name)?; - } - - // Field exists in schema and is a fast field, but has no values in this segment - // OR strict validation is disabled - return an empty column ff_field_with_type.push((Column::build_empty_column(reader.num_docs()), fallback_type)); } Ok(ff_field_with_type) diff --git a/src/aggregation/agg_data.rs b/src/aggregation/agg_data.rs index f0545ee7a..928e352d7 100644 --- a/src/aggregation/agg_data.rs +++ b/src/aggregation/agg_data.rs @@ -578,7 +578,6 @@ fn build_nodes( reader, &range_req.field, Some(get_numeric_or_date_column_types()), - data.context.strict_field_validation, )?; let idx_in_req_data = data.push_range_req_data(RangeAggReqData { accessor, @@ -599,7 +598,6 @@ fn build_nodes( reader, &histo_req.field, Some(get_numeric_or_date_column_types()), - data.context.strict_field_validation, )?; let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData { accessor, @@ -621,12 +619,8 @@ fn build_nodes( }]) } DateHistogram(date_req) => { - let (accessor, field_type) = get_ff_reader( - reader, - &date_req.field, - Some(&[ColumnType::DateTime]), - data.context.strict_field_validation, - )?; + let (accessor, field_type) = + get_ff_reader(reader, &date_req.field, Some(&[ColumnType::DateTime]))?; // Convert to histogram request, normalize to ns precision let mut histo_req = date_req.to_histogram_req()?; histo_req.normalize_date_time(); @@ -709,12 +703,7 @@ fn build_nodes( )) } }; - let (accessor, field_type) = get_ff_reader( - reader, - field, - allowed_column_types, - data.context.strict_field_validation, - )?; + let (accessor, field_type) = get_ff_reader(reader, field, allowed_column_types)?; let idx_in_req_data = data.push_metric_req_data(MetricAggReqData { accessor, field_type, @@ -741,7 +730,6 @@ fn build_nodes( reader, percentiles_req.field_name(), Some(get_numeric_or_date_column_types()), - data.context.strict_field_validation, )?; let idx_in_req_data = data.push_metric_req_data(MetricAggReqData { accessor, @@ -770,14 +758,7 @@ fn build_nodes( let accessors: Vec<(Column, ColumnType)> = top_hits .field_names() .iter() - .map(|field| { - get_ff_reader( - reader, - field, - Some(get_numeric_or_date_column_types()), - data.context.strict_field_validation, - ) - }) + .map(|field| get_ff_reader(reader, field, Some(get_numeric_or_date_column_types()))) .collect::>()?; let value_accessors = top_hits @@ -900,7 +881,6 @@ fn get_term_agg_accessors( reader: &SegmentReader, field_name: &str, missing: &Option, - strict_validation: bool, ) -> crate::Result, ColumnType)>> { let allowed_column_types = [ ColumnType::I64, @@ -928,7 +908,6 @@ fn get_term_agg_accessors( field_name, Some(&allowed_column_types), fallback_type, - strict_validation, )?; Ok(column_and_types) @@ -963,12 +942,7 @@ fn build_terms_or_cardinality_nodes( let str_dict_column = reader.fast_fields().str(field_name)?; - let column_and_types = get_term_agg_accessors( - reader, - field_name, - missing, - data.context.strict_field_validation, - )?; + let column_and_types = get_term_agg_accessors(reader, field_name, missing)?; // Special handling when missing + multi column or incompatible type on text/date. let missing_and_more_than_one_col = column_and_types.len() > 1 && missing.is_some(); @@ -990,15 +964,9 @@ fn build_terms_or_cardinality_nodes( Key::U64(_) => ColumnType::U64, }) .unwrap_or(ColumnType::U64); - let all_accessors = get_all_ff_reader_or_empty( - reader, - field_name, - None, - fallback_type, - data.context.strict_field_validation, - )? - .into_iter() - .collect::>(); + let all_accessors = get_all_ff_reader_or_empty(reader, field_name, None, fallback_type)? + .into_iter() + .collect::>(); // This case only happens when we have term aggregation, or we fail let req = req.as_terms().cloned().ok_or_else(|| { crate::TantivyError::InvalidArgument( @@ -1197,7 +1165,7 @@ mod tests { "avg": {"field": "score"} })); let terms_string_with_child = agg_from_json(json!({ - "terms": {"field": "text"}, + "terms": {"field": "string_id"}, "aggs": { "histo": {"histogram": {"field": "score", "interval": 10.0}} } diff --git a/src/aggregation/agg_tests.rs b/src/aggregation/agg_tests.rs index d02130cdc..49a8afb37 100644 --- a/src/aggregation/agg_tests.rs +++ b/src/aggregation/agg_tests.rs @@ -1436,130 +1436,3 @@ fn test_aggregation_on_json_object_mixed_numerical_segments() { ) ); } - -#[test] -fn test_aggregation_invalid_field_returns_error() { - // Test that aggregations return an error when given an invalid field name - // with strict_field_validation enabled - let index = get_test_index_2_segments(false).unwrap(); - let reader = index.reader().unwrap(); - let searcher = reader.searcher(); - - // Test with a field that doesn't exist at all - let agg_req_str = r#" - { - "date_histogram_test": { - "date_histogram": { - "field": "not_valid_field", - "fixed_interval": "30d" - } - } - }"#; - let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); - // Enable strict field validation - let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); - let collector = AggregationCollector::from_aggs(agg, context); - let result = searcher.search(&AllQuery, &collector); - - assert!(result.is_err()); - match result { - Err(crate::TantivyError::FieldNotFound(field_name)) => { - assert_eq!(field_name, "not_valid_field"); - } - _ => panic!("Expected FieldNotFound error, got: {:?}", result), - } - - // Test with histogram aggregation on invalid field - let agg_req_str = r#" - { - "histogram_test": { - "histogram": { - "field": "invalid_histogram_field", - "interval": 10.0 - } - } - }"#; - let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); - let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); - let collector = AggregationCollector::from_aggs(agg, context); - let result = searcher.search(&AllQuery, &collector); - - assert!(result.is_err()); - match result { - Err(crate::TantivyError::FieldNotFound(field_name)) => { - assert_eq!(field_name, "invalid_histogram_field"); - } - _ => panic!("Expected FieldNotFound error, got: {:?}", result), - } - - // Test with terms aggregation on invalid field - let agg_req_str = r#" - { - "terms_test": { - "terms": { - "field": "invalid_terms_field" - } - } - }"#; - let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); - let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); - let collector = AggregationCollector::from_aggs(agg, context); - let result = searcher.search(&AllQuery, &collector); - - assert!(result.is_err()); - match result { - Err(crate::TantivyError::FieldNotFound(field_name)) => { - assert_eq!(field_name, "invalid_terms_field"); - } - _ => panic!("Expected FieldNotFound error, got: {:?}", result), - } - - // Test with avg metric aggregation on invalid field - let agg_req_str = r#" - { - "avg_test": { - "avg": { - "field": "invalid_avg_field" - } - } - }"#; - let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); - let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); - let collector = AggregationCollector::from_aggs(agg, context); - let result = searcher.search(&AllQuery, &collector); - - assert!(result.is_err()); - match result { - Err(crate::TantivyError::FieldNotFound(field_name)) => { - assert_eq!(field_name, "invalid_avg_field"); - } - _ => panic!("Expected FieldNotFound error, got: {:?}", result), - } - - // Test with range aggregation on invalid field - let agg_req_str = r#" - { - "range_test": { - "range": { - "field": "invalid_range_field", - "ranges": [ - { "to": 10.0 }, - { "from": 10.0, "to": 20.0 }, - { "from": 20.0 } - ] - } - } - }"#; - let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap(); - let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true); - let collector = AggregationCollector::from_aggs(agg, context); - let result = searcher.search(&AllQuery, &collector); - - assert!(result.is_err()); - match result { - Err(crate::TantivyError::FieldNotFound(field_name)) => { - assert_eq!(field_name, "invalid_range_field"); - } - _ => panic!("Expected FieldNotFound error, got: {:?}", result), - } -} diff --git a/src/aggregation/bucket/term_missing_agg.rs b/src/aggregation/bucket/term_missing_agg.rs index 6ff554fc4..173426289 100644 --- a/src/aggregation/bucket/term_missing_agg.rs +++ b/src/aggregation/bucket/term_missing_agg.rs @@ -307,7 +307,6 @@ mod tests { fn terms_aggregation_missing_mult_seg_empty() -> crate::Result<()> { let mut schema_builder = Schema::builder(); let score = schema_builder.add_f64_field("score", FAST); - schema_builder.add_json_field("json", FAST); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); @@ -355,7 +354,6 @@ mod tests { fn terms_aggregation_missing_single_seg_empty() -> crate::Result<()> { let mut schema_builder = Schema::builder(); let score = schema_builder.add_f64_field("score", FAST); - schema_builder.add_json_field("json", FAST); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index 5ea7ae023..66d439e91 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -180,35 +180,18 @@ pub type BucketId = u32; /// This struct holds shared resources needed during aggregation execution: /// - `limits`: Memory and bucket limits for the aggregation /// - `tokenizers`: TokenizerManager for parsing query strings in filter aggregations -/// - `strict_field_validation`: Whether to enforce strict field validation (default: false) #[derive(Clone, Default)] pub struct AggContextParams { /// Aggregation limits (memory and bucket count) pub limits: AggregationLimitsGuard, /// Tokenizer manager for query string parsing pub tokenizers: TokenizerManager, - /// If true, aggregations will return an error when a field doesn't exist in the schema. - /// If false (default), aggregations will return empty results for non-existent fields. - /// - /// Set to false for use cases where schema can change over time (e.g., Quickwit splits). - /// Set to true to catch typos and configuration errors early. - pub strict_field_validation: bool, } impl AggContextParams { /// Create new aggregation context parameters pub fn new(limits: AggregationLimitsGuard, tokenizers: TokenizerManager) -> Self { - Self { - limits, - tokenizers, - strict_field_validation: false, - } - } - - /// Enable or disable strict field validation - pub fn set_strict_field_validation(mut self, strict: bool) -> Self { - self.strict_field_validation = strict; - self + Self { limits, tokenizers } } }