Revert "Revert all impl."

This reverts commit d5e0991549a05bf80f19f853f7689ad69f96e7e5.
This commit is contained in:
Mohammad Dashti
2026-01-15 15:37:24 -10:00
committed by PSeitz
parent fc017c4c74
commit 95db7d2e5c
5 changed files with 20 additions and 242 deletions

View File

@@ -55,56 +55,22 @@ pub(crate) fn get_numeric_or_date_column_types() -> &'static [ColumnType] {
]
}
/// Validates that a field exists in the schema and is configured as a fast field.
///
/// Returns an error if:
/// - The field doesn't exist in the schema
/// - The field exists but is not a fast field
fn validate_field_for_agg(reader: &SegmentReader, field_name: &str) -> crate::Result<()> {
let schema = reader.schema();
// Here we check two things:
// - the field is either directly in the schema or could be part of a json field present in the
// schema.
// - the field (or the json field holding it) is a fast field.
if let Some((field, _path)) = schema.find_field(field_name) {
let field_type = schema.get_field_entry(field).field_type();
if !field_type.is_fast() {
return Err(crate::TantivyError::SchemaError(format!(
"Field '{}' is not a fast field. Aggregations require fast fields.",
field_name
)));
}
Ok(())
} else {
Err(crate::TantivyError::FieldNotFound(field_name.to_string()))
}
}
/// Get fast field reader or return an error if the field doesn't exist (when strict validation is
/// enabled).
/// Get fast field reader or empty as default.
pub(crate) fn get_ff_reader(
reader: &SegmentReader,
field_name: &str,
allowed_column_types: Option<&[ColumnType]>,
strict_validation: bool,
) -> crate::Result<(columnar::Column<u64>, ColumnType)> {
let ff_fields = reader.fast_fields();
let ff_field_with_type = ff_fields.u64_lenient_for_type(allowed_column_types, field_name)?;
if let Some(field_with_type) = ff_field_with_type {
return Ok(field_with_type);
}
if strict_validation {
validate_field_for_agg(reader, field_name)?;
}
// Field exists in schema and is a fast field, but has no values in this segment
// OR strict validation is disabled - return an empty column
Ok((
Column::build_empty_column(reader.num_docs()),
ColumnType::U64,
))
let ff_field_with_type = ff_fields
.u64_lenient_for_type(allowed_column_types, field_name)?
.unwrap_or_else(|| {
(
Column::build_empty_column(reader.num_docs()),
ColumnType::U64,
)
});
Ok(ff_field_with_type)
}
pub(crate) fn get_dynamic_columns(
@@ -123,26 +89,16 @@ pub(crate) fn get_dynamic_columns(
/// Get all fast field reader or empty as default.
///
/// Is guaranteed to return at least one column.
/// Returns an error if the field doesn't exist in the schema or is not a fast field (when strict
/// validation is enabled).
pub(crate) fn get_all_ff_reader_or_empty(
reader: &SegmentReader,
field_name: &str,
allowed_column_types: Option<&[ColumnType]>,
fallback_type: ColumnType,
strict_validation: bool,
) -> crate::Result<Vec<(columnar::Column<u64>, ColumnType)>> {
let ff_fields = reader.fast_fields();
let mut ff_field_with_type =
ff_fields.u64_lenient_for_type_all(allowed_column_types, field_name)?;
if ff_field_with_type.is_empty() {
if strict_validation {
validate_field_for_agg(reader, field_name)?;
}
// Field exists in schema and is a fast field, but has no values in this segment
// OR strict validation is disabled - return an empty column
ff_field_with_type.push((Column::build_empty_column(reader.num_docs()), fallback_type));
}
Ok(ff_field_with_type)

View File

@@ -578,7 +578,6 @@ fn build_nodes(
reader,
&range_req.field,
Some(get_numeric_or_date_column_types()),
data.context.strict_field_validation,
)?;
let idx_in_req_data = data.push_range_req_data(RangeAggReqData {
accessor,
@@ -599,7 +598,6 @@ fn build_nodes(
reader,
&histo_req.field,
Some(get_numeric_or_date_column_types()),
data.context.strict_field_validation,
)?;
let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData {
accessor,
@@ -621,12 +619,8 @@ fn build_nodes(
}])
}
DateHistogram(date_req) => {
let (accessor, field_type) = get_ff_reader(
reader,
&date_req.field,
Some(&[ColumnType::DateTime]),
data.context.strict_field_validation,
)?;
let (accessor, field_type) =
get_ff_reader(reader, &date_req.field, Some(&[ColumnType::DateTime]))?;
// Convert to histogram request, normalize to ns precision
let mut histo_req = date_req.to_histogram_req()?;
histo_req.normalize_date_time();
@@ -709,12 +703,7 @@ fn build_nodes(
))
}
};
let (accessor, field_type) = get_ff_reader(
reader,
field,
allowed_column_types,
data.context.strict_field_validation,
)?;
let (accessor, field_type) = get_ff_reader(reader, field, allowed_column_types)?;
let idx_in_req_data = data.push_metric_req_data(MetricAggReqData {
accessor,
field_type,
@@ -741,7 +730,6 @@ fn build_nodes(
reader,
percentiles_req.field_name(),
Some(get_numeric_or_date_column_types()),
data.context.strict_field_validation,
)?;
let idx_in_req_data = data.push_metric_req_data(MetricAggReqData {
accessor,
@@ -770,14 +758,7 @@ fn build_nodes(
let accessors: Vec<(Column<u64>, ColumnType)> = top_hits
.field_names()
.iter()
.map(|field| {
get_ff_reader(
reader,
field,
Some(get_numeric_or_date_column_types()),
data.context.strict_field_validation,
)
})
.map(|field| get_ff_reader(reader, field, Some(get_numeric_or_date_column_types())))
.collect::<crate::Result<_>>()?;
let value_accessors = top_hits
@@ -900,7 +881,6 @@ fn get_term_agg_accessors(
reader: &SegmentReader,
field_name: &str,
missing: &Option<Key>,
strict_validation: bool,
) -> crate::Result<Vec<(Column<u64>, ColumnType)>> {
let allowed_column_types = [
ColumnType::I64,
@@ -928,7 +908,6 @@ fn get_term_agg_accessors(
field_name,
Some(&allowed_column_types),
fallback_type,
strict_validation,
)?;
Ok(column_and_types)
@@ -963,12 +942,7 @@ fn build_terms_or_cardinality_nodes(
let str_dict_column = reader.fast_fields().str(field_name)?;
let column_and_types = get_term_agg_accessors(
reader,
field_name,
missing,
data.context.strict_field_validation,
)?;
let column_and_types = get_term_agg_accessors(reader, field_name, missing)?;
// Special handling when missing + multi column or incompatible type on text/date.
let missing_and_more_than_one_col = column_and_types.len() > 1 && missing.is_some();
@@ -990,15 +964,9 @@ fn build_terms_or_cardinality_nodes(
Key::U64(_) => ColumnType::U64,
})
.unwrap_or(ColumnType::U64);
let all_accessors = get_all_ff_reader_or_empty(
reader,
field_name,
None,
fallback_type,
data.context.strict_field_validation,
)?
.into_iter()
.collect::<Vec<_>>();
let all_accessors = get_all_ff_reader_or_empty(reader, field_name, None, fallback_type)?
.into_iter()
.collect::<Vec<_>>();
// This case only happens when we have term aggregation, or we fail
let req = req.as_terms().cloned().ok_or_else(|| {
crate::TantivyError::InvalidArgument(
@@ -1197,7 +1165,7 @@ mod tests {
"avg": {"field": "score"}
}));
let terms_string_with_child = agg_from_json(json!({
"terms": {"field": "text"},
"terms": {"field": "string_id"},
"aggs": {
"histo": {"histogram": {"field": "score", "interval": 10.0}}
}

View File

@@ -1436,130 +1436,3 @@ fn test_aggregation_on_json_object_mixed_numerical_segments() {
)
);
}
#[test]
fn test_aggregation_invalid_field_returns_error() {
// Test that aggregations return an error when given an invalid field name
// with strict_field_validation enabled
let index = get_test_index_2_segments(false).unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
// Test with a field that doesn't exist at all
let agg_req_str = r#"
{
"date_histogram_test": {
"date_histogram": {
"field": "not_valid_field",
"fixed_interval": "30d"
}
}
}"#;
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
// Enable strict field validation
let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true);
let collector = AggregationCollector::from_aggs(agg, context);
let result = searcher.search(&AllQuery, &collector);
assert!(result.is_err());
match result {
Err(crate::TantivyError::FieldNotFound(field_name)) => {
assert_eq!(field_name, "not_valid_field");
}
_ => panic!("Expected FieldNotFound error, got: {:?}", result),
}
// Test with histogram aggregation on invalid field
let agg_req_str = r#"
{
"histogram_test": {
"histogram": {
"field": "invalid_histogram_field",
"interval": 10.0
}
}
}"#;
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true);
let collector = AggregationCollector::from_aggs(agg, context);
let result = searcher.search(&AllQuery, &collector);
assert!(result.is_err());
match result {
Err(crate::TantivyError::FieldNotFound(field_name)) => {
assert_eq!(field_name, "invalid_histogram_field");
}
_ => panic!("Expected FieldNotFound error, got: {:?}", result),
}
// Test with terms aggregation on invalid field
let agg_req_str = r#"
{
"terms_test": {
"terms": {
"field": "invalid_terms_field"
}
}
}"#;
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true);
let collector = AggregationCollector::from_aggs(agg, context);
let result = searcher.search(&AllQuery, &collector);
assert!(result.is_err());
match result {
Err(crate::TantivyError::FieldNotFound(field_name)) => {
assert_eq!(field_name, "invalid_terms_field");
}
_ => panic!("Expected FieldNotFound error, got: {:?}", result),
}
// Test with avg metric aggregation on invalid field
let agg_req_str = r#"
{
"avg_test": {
"avg": {
"field": "invalid_avg_field"
}
}
}"#;
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true);
let collector = AggregationCollector::from_aggs(agg, context);
let result = searcher.search(&AllQuery, &collector);
assert!(result.is_err());
match result {
Err(crate::TantivyError::FieldNotFound(field_name)) => {
assert_eq!(field_name, "invalid_avg_field");
}
_ => panic!("Expected FieldNotFound error, got: {:?}", result),
}
// Test with range aggregation on invalid field
let agg_req_str = r#"
{
"range_test": {
"range": {
"field": "invalid_range_field",
"ranges": [
{ "to": 10.0 },
{ "from": 10.0, "to": 20.0 },
{ "from": 20.0 }
]
}
}
}"#;
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true);
let collector = AggregationCollector::from_aggs(agg, context);
let result = searcher.search(&AllQuery, &collector);
assert!(result.is_err());
match result {
Err(crate::TantivyError::FieldNotFound(field_name)) => {
assert_eq!(field_name, "invalid_range_field");
}
_ => panic!("Expected FieldNotFound error, got: {:?}", result),
}
}

View File

@@ -307,7 +307,6 @@ mod tests {
fn terms_aggregation_missing_mult_seg_empty() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let score = schema_builder.add_f64_field("score", FAST);
schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
@@ -355,7 +354,6 @@ mod tests {
fn terms_aggregation_missing_single_seg_empty() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let score = schema_builder.add_f64_field("score", FAST);
schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();

View File

@@ -180,35 +180,18 @@ pub type BucketId = u32;
/// This struct holds shared resources needed during aggregation execution:
/// - `limits`: Memory and bucket limits for the aggregation
/// - `tokenizers`: TokenizerManager for parsing query strings in filter aggregations
/// - `strict_field_validation`: Whether to enforce strict field validation (default: false)
#[derive(Clone, Default)]
pub struct AggContextParams {
/// Aggregation limits (memory and bucket count)
pub limits: AggregationLimitsGuard,
/// Tokenizer manager for query string parsing
pub tokenizers: TokenizerManager,
/// If true, aggregations will return an error when a field doesn't exist in the schema.
/// If false (default), aggregations will return empty results for non-existent fields.
///
/// Set to false for use cases where schema can change over time (e.g., Quickwit splits).
/// Set to true to catch typos and configuration errors early.
pub strict_field_validation: bool,
}
impl AggContextParams {
/// Create new aggregation context parameters
pub fn new(limits: AggregationLimitsGuard, tokenizers: TokenizerManager) -> Self {
Self {
limits,
tokenizers,
strict_field_validation: false,
}
}
/// Enable or disable strict field validation
pub fn set_strict_field_validation(mut self, strict: bool) -> Self {
self.strict_field_validation = strict;
self
Self { limits, tokenizers }
}
}