mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-03 17:10:48 +00:00
Compare commits
1 Commits
paul.masur
...
PSeitz-pat
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
042490479a |
@@ -4,7 +4,7 @@ Tantivy 0.26.1
|
||||
## Performance
|
||||
- Fix quadratic runtime in nested term and composite aggregations: memory accounting scanned all parent buckets on every collect instead of just the current parent (@PSeitz @fulmicoton)
|
||||
|
||||
Tantivy 0.26 (Unreleased)
|
||||
Tantivy 0.26
|
||||
================================
|
||||
|
||||
## Bugfixes
|
||||
|
||||
@@ -65,7 +65,7 @@ tantivy-bitpacker = { version = "0.10", path = "./bitpacker" }
|
||||
common = { version = "0.11", path = "./common/", package = "tantivy-common" }
|
||||
tokenizer-api = { version = "0.7", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
|
||||
sketches-ddsketch = { version = "0.4", features = ["use_serde"] }
|
||||
datasketches = { version = "0.3.0", features = ["hll"] }
|
||||
datasketches = { git = "https://github.com/fulmicoton-dd/datasketches-rust", rev = "7635fb8" }
|
||||
futures-util = { version = "0.3.28", optional = true }
|
||||
futures-channel = { version = "0.3.28", optional = true }
|
||||
fnv = "1.0.7"
|
||||
@@ -75,7 +75,7 @@ typetag = "0.2.21"
|
||||
winapi = "0.3.9"
|
||||
|
||||
[dev-dependencies]
|
||||
binggan = "0.17.0"
|
||||
binggan = "0.16.1"
|
||||
rand = "0.9"
|
||||
maplit = "1.0.2"
|
||||
matches = "0.1.9"
|
||||
|
||||
@@ -23,7 +23,7 @@ downcast-rs = "2.0.1"
|
||||
proptest = "1"
|
||||
more-asserts = "0.3.1"
|
||||
rand = "0.9"
|
||||
binggan = "0.17.0"
|
||||
binggan = "0.16.1"
|
||||
|
||||
[[bench]]
|
||||
name = "bench_merge"
|
||||
|
||||
@@ -19,6 +19,6 @@ time = { version = "0.3.47", features = ["serde-well-known"] }
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
|
||||
[dev-dependencies]
|
||||
binggan = "0.17.0"
|
||||
binggan = "0.16.1"
|
||||
proptest = "1.0.0"
|
||||
rand = "0.9"
|
||||
|
||||
@@ -115,71 +115,6 @@ pub fn get_fast_field_names(aggs: &Aggregations) -> HashSet<String> {
|
||||
fast_field_names
|
||||
}
|
||||
|
||||
/// Validates that all fields referenced in the aggregation request exist in the schema
|
||||
/// and are configured as fast fields.
|
||||
///
|
||||
/// This is a convenience function for upfront validation before executing aggregations.
|
||||
/// Returns an error if any field doesn't exist or is not a fast field.
|
||||
///
|
||||
/// Validation is intentionally opt-in rather than baked into aggregation execution: the
|
||||
/// default lenient behavior (returning empty results for missing fields) supports
|
||||
/// schema evolution and federated queries where the same request runs against segments
|
||||
/// or indices with different schemas.
|
||||
///
|
||||
/// # Example
|
||||
/// ```
|
||||
/// use tantivy::aggregation::agg_req::{Aggregations, validate_aggregation_fields_exist};
|
||||
/// use tantivy::schema::{Schema, FAST};
|
||||
/// use tantivy::Index;
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// // Create a simple index
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// schema_builder.add_f64_field("price", FAST);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
///
|
||||
/// // Parse aggregation request
|
||||
/// let agg_req: Aggregations = serde_json::from_str(r#"{
|
||||
/// "avg_price": { "avg": { "field": "price" } }
|
||||
/// }"#)?;
|
||||
///
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// // Validate fields before executing
|
||||
/// for segment_reader in searcher.segment_readers() {
|
||||
/// validate_aggregation_fields_exist(&agg_req, segment_reader)?;
|
||||
/// }
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn validate_aggregation_fields_exist(
|
||||
aggs: &Aggregations,
|
||||
reader: &crate::SegmentReader,
|
||||
) -> crate::Result<()> {
|
||||
let field_names = get_fast_field_names(aggs);
|
||||
let schema = reader.schema();
|
||||
|
||||
for field_name in field_names {
|
||||
// Check if the field is either directly in the schema or could be part of a json field
|
||||
// present in the schema, and verify it's a fast field.
|
||||
if let Some((field, _path)) = schema.find_field(&field_name) {
|
||||
let field_type = schema.get_field_entry(field).field_type();
|
||||
if !field_type.is_fast() {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Field '{}' is not a fast field. Aggregations require fast fields.",
|
||||
field_name
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
return Err(crate::TantivyError::FieldNotFound(field_name));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// All aggregation types.
|
||||
pub enum AggregationVariants {
|
||||
|
||||
@@ -1436,46 +1436,3 @@ fn test_aggregation_on_json_object_mixed_numerical_segments() {
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_field_validation_helper() {
|
||||
// Test the standalone validation helper function for field validation
|
||||
let index = get_test_index_2_segments(false).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
// Test with invalid field
|
||||
let agg_req: Aggregations = serde_json::from_str(
|
||||
r#"{
|
||||
"avg_test": {
|
||||
"avg": { "field": "nonexistent_field" }
|
||||
}
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let result =
|
||||
crate::aggregation::agg_req::validate_aggregation_fields_exist(&agg_req, segment_reader);
|
||||
assert!(result.is_err());
|
||||
match result {
|
||||
Err(crate::TantivyError::FieldNotFound(field_name)) => {
|
||||
assert_eq!(field_name, "nonexistent_field");
|
||||
}
|
||||
_ => panic!("Expected FieldNotFound error, got: {:?}", result),
|
||||
}
|
||||
|
||||
// Test with valid field
|
||||
let agg_req: Aggregations = serde_json::from_str(
|
||||
r#"{
|
||||
"avg_test": {
|
||||
"avg": { "field": "score" }
|
||||
}
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let result =
|
||||
crate::aggregation::agg_req::validate_aggregation_fields_exist(&agg_req, segment_reader);
|
||||
assert!(result.is_ok());
|
||||
}
|
||||
|
||||
@@ -166,11 +166,7 @@ impl CouponCache {
|
||||
let should_use_dense =
|
||||
highest_term_ord < 1_000_000u64 || highest_term_ord < num_terms as u64 * 3u64;
|
||||
if should_use_dense {
|
||||
// We don't really care about the value here. We will populate all the values we will
|
||||
// read anyway.
|
||||
let uninitialized_coupon = Coupon::from_hash(0);
|
||||
let mut coupon_map: Vec<Coupon> =
|
||||
vec![uninitialized_coupon; highest_term_ord as usize + 1];
|
||||
let mut coupon_map: Vec<Coupon> = vec![Coupon::EMPTY; highest_term_ord as usize + 1];
|
||||
for (term_ord, coupon) in term_ords.into_iter().zip(coupons.into_iter()) {
|
||||
coupon_map[term_ord as usize] = coupon;
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ use common::{ByteCount, HasLen};
|
||||
use fnv::FnvHashMap;
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::directory::error::OpenReadError;
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
|
||||
@@ -160,10 +159,12 @@ impl SegmentReader {
|
||||
let postings_file = segment.open_read(SegmentComponent::Postings)?;
|
||||
let postings_composite = CompositeFile::open(&postings_file)?;
|
||||
|
||||
let positions_composite = match segment.open_read(SegmentComponent::Positions) {
|
||||
Ok(positions_file) => CompositeFile::open(&positions_file)?,
|
||||
Err(OpenReadError::FileDoesNotExist(_)) => CompositeFile::empty(),
|
||||
Err(open_read_error) => return Err(open_read_error.into()),
|
||||
let positions_composite = {
|
||||
if let Ok(positions_file) = segment.open_read(SegmentComponent::Positions) {
|
||||
CompositeFile::open(&positions_file)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
|
||||
@@ -27,7 +27,7 @@ rand = "0.9"
|
||||
zipf = "7.0.0"
|
||||
rustc-hash = "2.1.0"
|
||||
proptest = "1.2.0"
|
||||
binggan = { version = "0.17.0" }
|
||||
binggan = { version = "0.16.1" }
|
||||
rand_distr = "0.5"
|
||||
|
||||
[features]
|
||||
|
||||
Reference in New Issue
Block a user