mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-02 00:20:42 +00:00
Added a flag: strict_validation
This commit is contained in:
@@ -55,11 +55,13 @@ pub(crate) fn get_numeric_or_date_column_types() -> &'static [ColumnType] {
|
||||
]
|
||||
}
|
||||
|
||||
/// Get fast field reader or return an error if the field doesn't exist.
|
||||
/// Get fast field reader or return an error if the field doesn't exist (when strict validation is
|
||||
/// enabled).
|
||||
pub(crate) fn get_ff_reader(
|
||||
reader: &SegmentReader,
|
||||
field_name: &str,
|
||||
allowed_column_types: Option<&[ColumnType]>,
|
||||
strict_validation: bool,
|
||||
) -> crate::Result<(columnar::Column<u64>, ColumnType)> {
|
||||
let ff_fields = reader.fast_fields();
|
||||
let ff_field_with_type = ff_fields.u64_lenient_for_type(allowed_column_types, field_name)?;
|
||||
@@ -67,26 +69,28 @@ pub(crate) fn get_ff_reader(
|
||||
match ff_field_with_type {
|
||||
Some(field) => Ok(field),
|
||||
None => {
|
||||
// Check if the field exists in the schema but is not a fast field
|
||||
let schema = reader.schema();
|
||||
if let Some((field, _path)) = schema.find_field(field_name) {
|
||||
let field_type = schema.get_field_entry(field).field_type();
|
||||
if !field_type.is_fast() {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Field '{}' is not a fast field. Aggregations require fast fields.",
|
||||
field_name
|
||||
)));
|
||||
if strict_validation {
|
||||
// Check if the field exists in the schema but is not a fast field
|
||||
let schema = reader.schema();
|
||||
if let Some((field, _path)) = schema.find_field(field_name) {
|
||||
let field_type = schema.get_field_entry(field).field_type();
|
||||
if !field_type.is_fast() {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Field '{}' is not a fast field. Aggregations require fast fields.",
|
||||
field_name
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
// Field doesn't exist at all or has no values in this segment
|
||||
// Check if it exists in schema to provide a better error message
|
||||
if schema.find_field(field_name).is_none() {
|
||||
return Err(crate::TantivyError::FieldNotFound(field_name.to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
// Field doesn't exist at all or has no values in this segment
|
||||
// Check if it exists in schema to provide a better error message
|
||||
if schema.find_field(field_name).is_none() {
|
||||
return Err(crate::TantivyError::FieldNotFound(field_name.to_string()));
|
||||
}
|
||||
|
||||
// Field exists in schema and is a fast field, but has no values in this segment
|
||||
// This is acceptable - return an empty column
|
||||
// OR strict validation is disabled - return an empty column
|
||||
Ok((
|
||||
Column::build_empty_column(reader.num_docs()),
|
||||
ColumnType::U64,
|
||||
@@ -111,35 +115,39 @@ pub(crate) fn get_dynamic_columns(
|
||||
/// Get all fast field reader or empty as default.
|
||||
///
|
||||
/// Is guaranteed to return at least one column.
|
||||
/// Returns an error if the field doesn't exist in the schema or is not a fast field.
|
||||
/// Returns an error if the field doesn't exist in the schema or is not a fast field (when strict
|
||||
/// validation is enabled).
|
||||
pub(crate) fn get_all_ff_reader_or_empty(
|
||||
reader: &SegmentReader,
|
||||
field_name: &str,
|
||||
allowed_column_types: Option<&[ColumnType]>,
|
||||
fallback_type: ColumnType,
|
||||
strict_validation: bool,
|
||||
) -> crate::Result<Vec<(columnar::Column<u64>, ColumnType)>> {
|
||||
let ff_fields = reader.fast_fields();
|
||||
let mut ff_field_with_type =
|
||||
ff_fields.u64_lenient_for_type_all(allowed_column_types, field_name)?;
|
||||
|
||||
if ff_field_with_type.is_empty() {
|
||||
// Check if the field exists in the schema but is not a fast field
|
||||
let schema = reader.schema();
|
||||
if let Some((field, _path)) = schema.find_field(field_name) {
|
||||
let field_type = schema.get_field_entry(field).field_type();
|
||||
if !field_type.is_fast() {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Field '{}' is not a fast field. Aggregations require fast fields.",
|
||||
field_name
|
||||
)));
|
||||
if strict_validation {
|
||||
// Check if the field exists in the schema but is not a fast field
|
||||
let schema = reader.schema();
|
||||
if let Some((field, _path)) = schema.find_field(field_name) {
|
||||
let field_type = schema.get_field_entry(field).field_type();
|
||||
if !field_type.is_fast() {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Field '{}' is not a fast field. Aggregations require fast fields.",
|
||||
field_name
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
// Field doesn't exist in the schema at all
|
||||
return Err(crate::TantivyError::FieldNotFound(field_name.to_string()));
|
||||
}
|
||||
} else {
|
||||
// Field doesn't exist in the schema at all
|
||||
return Err(crate::TantivyError::FieldNotFound(field_name.to_string()));
|
||||
}
|
||||
|
||||
// Field exists in schema and is a fast field, but has no values in this segment
|
||||
// This is acceptable - return an empty column
|
||||
// OR strict validation is disabled - return an empty column
|
||||
ff_field_with_type.push((Column::build_empty_column(reader.num_docs()), fallback_type));
|
||||
}
|
||||
Ok(ff_field_with_type)
|
||||
|
||||
@@ -501,6 +501,7 @@ fn build_nodes(
|
||||
reader,
|
||||
&range_req.field,
|
||||
Some(get_numeric_or_date_column_types()),
|
||||
data.context.strict_field_validation,
|
||||
)?;
|
||||
let idx_in_req_data = data.push_range_req_data(RangeAggReqData {
|
||||
accessor,
|
||||
@@ -521,6 +522,7 @@ fn build_nodes(
|
||||
reader,
|
||||
&histo_req.field,
|
||||
Some(get_numeric_or_date_column_types()),
|
||||
data.context.strict_field_validation,
|
||||
)?;
|
||||
let idx_in_req_data = data.push_histogram_req_data(HistogramAggReqData {
|
||||
accessor,
|
||||
@@ -542,8 +544,12 @@ fn build_nodes(
|
||||
}])
|
||||
}
|
||||
DateHistogram(date_req) => {
|
||||
let (accessor, field_type) =
|
||||
get_ff_reader(reader, &date_req.field, Some(&[ColumnType::DateTime]))?;
|
||||
let (accessor, field_type) = get_ff_reader(
|
||||
reader,
|
||||
&date_req.field,
|
||||
Some(&[ColumnType::DateTime]),
|
||||
data.context.strict_field_validation,
|
||||
)?;
|
||||
// Convert to histogram request, normalize to ns precision
|
||||
let mut histo_req = date_req.to_histogram_req()?;
|
||||
histo_req.normalize_date_time();
|
||||
@@ -626,7 +632,12 @@ fn build_nodes(
|
||||
))
|
||||
}
|
||||
};
|
||||
let (accessor, field_type) = get_ff_reader(reader, field, allowed_column_types)?;
|
||||
let (accessor, field_type) = get_ff_reader(
|
||||
reader,
|
||||
field,
|
||||
allowed_column_types,
|
||||
data.context.strict_field_validation,
|
||||
)?;
|
||||
let idx_in_req_data = data.push_metric_req_data(MetricAggReqData {
|
||||
accessor,
|
||||
field_type,
|
||||
@@ -653,6 +664,7 @@ fn build_nodes(
|
||||
reader,
|
||||
percentiles_req.field_name(),
|
||||
Some(get_numeric_or_date_column_types()),
|
||||
data.context.strict_field_validation,
|
||||
)?;
|
||||
let idx_in_req_data = data.push_metric_req_data(MetricAggReqData {
|
||||
accessor,
|
||||
@@ -681,7 +693,14 @@ fn build_nodes(
|
||||
let accessors: Vec<(Column<u64>, ColumnType)> = top_hits
|
||||
.field_names()
|
||||
.iter()
|
||||
.map(|field| get_ff_reader(reader, field, Some(get_numeric_or_date_column_types())))
|
||||
.map(|field| {
|
||||
get_ff_reader(
|
||||
reader,
|
||||
field,
|
||||
Some(get_numeric_or_date_column_types()),
|
||||
data.context.strict_field_validation,
|
||||
)
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
|
||||
let value_accessors = top_hits
|
||||
@@ -767,6 +786,7 @@ fn get_term_agg_accessors(
|
||||
reader: &SegmentReader,
|
||||
field_name: &str,
|
||||
missing: &Option<Key>,
|
||||
strict_validation: bool,
|
||||
) -> crate::Result<Vec<(Column<u64>, ColumnType)>> {
|
||||
let allowed_column_types = [
|
||||
ColumnType::I64,
|
||||
@@ -794,6 +814,7 @@ fn get_term_agg_accessors(
|
||||
field_name,
|
||||
Some(&allowed_column_types),
|
||||
fallback_type,
|
||||
strict_validation,
|
||||
)?;
|
||||
|
||||
Ok(column_and_types)
|
||||
@@ -828,7 +849,12 @@ fn build_terms_or_cardinality_nodes(
|
||||
|
||||
let str_dict_column = reader.fast_fields().str(field_name)?;
|
||||
|
||||
let column_and_types = get_term_agg_accessors(reader, field_name, missing)?;
|
||||
let column_and_types = get_term_agg_accessors(
|
||||
reader,
|
||||
field_name,
|
||||
missing,
|
||||
data.context.strict_field_validation,
|
||||
)?;
|
||||
|
||||
// Special handling when missing + multi column or incompatible type on text/date.
|
||||
let missing_and_more_than_one_col = column_and_types.len() > 1 && missing.is_some();
|
||||
@@ -850,9 +876,15 @@ fn build_terms_or_cardinality_nodes(
|
||||
Key::U64(_) => ColumnType::U64,
|
||||
})
|
||||
.unwrap_or(ColumnType::U64);
|
||||
let all_accessors = get_all_ff_reader_or_empty(reader, field_name, None, fallback_type)?
|
||||
.into_iter()
|
||||
.collect::<Vec<_>>();
|
||||
let all_accessors = get_all_ff_reader_or_empty(
|
||||
reader,
|
||||
field_name,
|
||||
None,
|
||||
fallback_type,
|
||||
data.context.strict_field_validation,
|
||||
)?
|
||||
.into_iter()
|
||||
.collect::<Vec<_>>();
|
||||
// This case only happens when we have term aggregation, or we fail
|
||||
let req = req.as_terms().cloned().ok_or_else(|| {
|
||||
crate::TantivyError::InvalidArgument(
|
||||
|
||||
@@ -1440,6 +1440,7 @@ fn test_aggregation_on_json_object_mixed_numerical_segments() {
|
||||
#[test]
|
||||
fn test_aggregation_invalid_field_returns_error() {
|
||||
// Test that aggregations return an error when given an invalid field name
|
||||
// with strict_field_validation enabled
|
||||
let index = get_test_index_2_segments(false).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
@@ -1455,7 +1456,9 @@ fn test_aggregation_invalid_field_returns_error() {
|
||||
}
|
||||
}"#;
|
||||
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
|
||||
let collector = get_collector(agg);
|
||||
// Enable strict field validation
|
||||
let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true);
|
||||
let collector = AggregationCollector::from_aggs(agg, context);
|
||||
let result = searcher.search(&AllQuery, &collector);
|
||||
|
||||
assert!(result.is_err());
|
||||
@@ -1477,7 +1480,8 @@ fn test_aggregation_invalid_field_returns_error() {
|
||||
}
|
||||
}"#;
|
||||
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
|
||||
let collector = get_collector(agg);
|
||||
let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true);
|
||||
let collector = AggregationCollector::from_aggs(agg, context);
|
||||
let result = searcher.search(&AllQuery, &collector);
|
||||
|
||||
assert!(result.is_err());
|
||||
@@ -1498,7 +1502,8 @@ fn test_aggregation_invalid_field_returns_error() {
|
||||
}
|
||||
}"#;
|
||||
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
|
||||
let collector = get_collector(agg);
|
||||
let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true);
|
||||
let collector = AggregationCollector::from_aggs(agg, context);
|
||||
let result = searcher.search(&AllQuery, &collector);
|
||||
|
||||
assert!(result.is_err());
|
||||
@@ -1519,7 +1524,8 @@ fn test_aggregation_invalid_field_returns_error() {
|
||||
}
|
||||
}"#;
|
||||
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
|
||||
let collector = get_collector(agg);
|
||||
let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true);
|
||||
let collector = AggregationCollector::from_aggs(agg, context);
|
||||
let result = searcher.search(&AllQuery, &collector);
|
||||
|
||||
assert!(result.is_err());
|
||||
@@ -1545,7 +1551,8 @@ fn test_aggregation_invalid_field_returns_error() {
|
||||
}
|
||||
}"#;
|
||||
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
|
||||
let collector = get_collector(agg);
|
||||
let context = crate::aggregation::AggContextParams::default().set_strict_field_validation(true);
|
||||
let collector = AggregationCollector::from_aggs(agg, context);
|
||||
let result = searcher.search(&AllQuery, &collector);
|
||||
|
||||
assert!(result.is_err());
|
||||
|
||||
@@ -180,18 +180,57 @@ pub type BucketId = u32;
|
||||
/// This struct holds shared resources needed during aggregation execution:
|
||||
/// - `limits`: Memory and bucket limits for the aggregation
|
||||
/// - `tokenizers`: TokenizerManager for parsing query strings in filter aggregations
|
||||
#[derive(Clone, Default)]
|
||||
/// - `strict_field_validation`: Whether to enforce strict field validation (default: false)
|
||||
#[derive(Clone)]
|
||||
pub struct AggContextParams {
|
||||
/// Aggregation limits (memory and bucket count)
|
||||
pub limits: AggregationLimitsGuard,
|
||||
/// Tokenizer manager for query string parsing
|
||||
pub tokenizers: TokenizerManager,
|
||||
/// If true, aggregations will return an error when a field doesn't exist in the schema.
|
||||
/// If false (default), aggregations will return empty results for non-existent fields.
|
||||
///
|
||||
/// Set to false for use cases where schema can change over time (e.g., Quickwit splits).
|
||||
/// Set to true to catch typos and configuration errors early.
|
||||
pub strict_field_validation: bool,
|
||||
}
|
||||
|
||||
impl Default for AggContextParams {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
limits: Default::default(),
|
||||
tokenizers: Default::default(),
|
||||
strict_field_validation: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AggContextParams {
|
||||
/// Create new aggregation context parameters
|
||||
pub fn new(limits: AggregationLimitsGuard, tokenizers: TokenizerManager) -> Self {
|
||||
Self { limits, tokenizers }
|
||||
Self {
|
||||
limits,
|
||||
tokenizers,
|
||||
strict_field_validation: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create new aggregation context parameters with strict field validation enabled
|
||||
pub fn with_strict_field_validation(
|
||||
limits: AggregationLimitsGuard,
|
||||
tokenizers: TokenizerManager,
|
||||
) -> Self {
|
||||
Self {
|
||||
limits,
|
||||
tokenizers,
|
||||
strict_field_validation: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Enable or disable strict field validation
|
||||
pub fn set_strict_field_validation(mut self, strict: bool) -> Self {
|
||||
self.strict_field_validation = strict;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user