diff --git a/Cargo.toml b/Cargo.toml
index 2a9d45409..af19fefa4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.24.0"
+version = "0.25.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
diff --git a/src/index/index.rs b/src/index/index.rs
index e1a508a79..5495ddced 100644
--- a/src/index/index.rs
+++ b/src/index/index.rs
@@ -216,7 +216,7 @@ impl IndexBuilder {
     /// Opens or creates a new index in the provided directory
     pub fn open_or_create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
-        let dir = dir.into();
+        let dir: Box<dyn Directory> = dir.into();
         if !Index::exists(&*dir)? {
             return self.create(dir);
         }
@@ -494,7 +494,7 @@ impl Index {
             .into_iter()
             .map(|segment| SegmentReader::open(&segment)?.fields_metadata())
             .collect::<crate::Result<Vec<_>>>()?;
-        Ok(merge_field_meta_data(fields_metadata, &self.schema()))
+        Ok(merge_field_meta_data(fields_metadata))
     }
 
     /// Creates a new segment_meta (Advanced user only).
diff --git a/src/index/inverted_index_reader.rs b/src/index/inverted_index_reader.rs
index 4be9d9c51..be29c06df 100644
--- a/src/index/inverted_index_reader.rs
+++ b/src/index/inverted_index_reader.rs
@@ -1,8 +1,7 @@
 use std::io;
 
 use common::json_path_writer::JSON_END_OF_PATH;
-use common::BinarySerializable;
-use fnv::FnvHashSet;
+use common::{BinarySerializable, ByteCount};
 #[cfg(feature = "quickwit")]
 use futures_util::{FutureExt, StreamExt, TryStreamExt};
 #[cfg(feature = "quickwit")]
@@ -36,6 +35,33 @@ pub struct InvertedIndexReader {
     total_num_tokens: u64,
 }
 
+/// Object that records the amount of space used by a field in an inverted index.
+pub(crate) struct InvertedIndexFieldSpace {
+    pub field_name: String,
+    pub field_type: Type,
+    pub postings_size: ByteCount,
+    pub positions_size: ByteCount,
+    pub num_terms: u64,
+}
+
+/// Returns `None` if the term is not a valid JSON path.
+fn extract_field_name_and_field_type_from_json_path(term: &[u8]) -> Option<(String, Type)> {
+    let index = term.iter().position(|&byte| byte == JSON_END_OF_PATH)?;
+    let field_type_code = term.get(index + 1).copied()?;
+    let field_type = Type::from_code(field_type_code)?;
+    let field_name = String::from_utf8_lossy(&term[..index]).to_string();
+    Some((field_name, field_type))
+}
+
+impl InvertedIndexFieldSpace {
+    fn record(&mut self, term_info: &TermInfo) {
+        self.postings_size += ByteCount::from(term_info.posting_num_bytes() as u64);
+        self.positions_size += ByteCount::from(term_info.positions_num_bytes() as u64);
+        self.num_terms += 1;
+    }
+}
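
Aside (not part of the patch): `InvertedIndexFieldSpace` relies on the byte layout of JSON terms in the term dictionary: the UTF-8 path, one `JSON_END_OF_PATH` byte, one type-code byte, then the encoded value. The sketch below is self-contained and illustrative only; the `0` separator value and the `s` type-code byte are assumptions made for the example, not read from tantivy's sources.

```rust
const JSON_END_OF_PATH: u8 = 0; // assumed separator value for this sketch

/// Splits `[ json path | end-of-path byte | type code | encoded value ]`.
fn split_json_term(term: &[u8]) -> Option<(&[u8], u8, &[u8])> {
    let end_of_path = term.iter().position(|&b| b == JSON_END_OF_PATH)?;
    let type_code = *term.get(end_of_path + 1)?;
    Some((&term[..end_of_path], type_code, &term[end_of_path + 2..]))
}

fn main() {
    let term = b"k8s.container.name\0shello";
    let (path, type_code, value) = split_json_term(term).unwrap();
    assert_eq!(path, &b"k8s.container.name"[..]);
    assert_eq!(type_code, b's'); // 's' standing in for the Type::Str code
    assert_eq!(value, &b"hello"[..]);
}
```
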
+
 impl InvertedIndexReader {
     pub(crate) fn new(
         termdict: TermDictionary,
@@ -81,20 +107,56 @@ impl InvertedIndexReader {
     ///
     /// Notice: This requires a full scan and is therefore **very expensive**.
     /// TODO: Move to sstable to use the index.
-    pub fn list_encoded_fields(&self) -> io::Result<Vec<(String, Type)>> {
+    pub(crate) fn list_encoded_json_fields(&self) -> io::Result<Vec<InvertedIndexFieldSpace>> {
         let mut stream = self.termdict.stream()?;
-        let mut fields = Vec::new();
-        let mut fields_set = FnvHashSet::default();
-        while let Some((term, _term_info)) = stream.next() {
-            if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) {
-                if !fields_set.contains(&term[..index + 2]) {
-                    fields_set.insert(term[..index + 2].to_vec());
-                    let typ = Type::from_code(term[index + 1]).unwrap();
-                    fields.push((String::from_utf8_lossy(&term[..index]).to_string(), typ));
+        let mut fields: Vec<InvertedIndexFieldSpace> = Vec::new();
+
+        let mut current_field_opt: Option<InvertedIndexFieldSpace> = None;
+        // Current field bytes, including the JSON_END_OF_PATH.
+        let mut current_field_bytes: Vec<u8> = Vec::new();
+
+        while let Some((term, term_info)) = stream.next() {
+            if let Some(current_field) = &mut current_field_opt {
+                if term.starts_with(&current_field_bytes) {
+                    // We are still in the same field.
+                    current_field.record(term_info);
+                    continue;
                 }
             }
+
+            // This is a new field!
+            // Let's flush the current field.
+            fields.extend(current_field_opt.take());
+            current_field_bytes.clear();
+
+            // And create a new one.
+            let Some((field_name, field_type)) =
+                extract_field_name_and_field_type_from_json_path(term)
+            else {
+                error!(
+                    "invalid term bytes encountered {term:?}. this only happens if the term \
+                     dictionary is corrupted. please report"
+                );
+                continue;
+            };
+            let mut field_space = InvertedIndexFieldSpace {
+                field_name,
+                field_type,
+                postings_size: ByteCount::default(),
+                positions_size: ByteCount::default(),
+                num_terms: 0u64,
+            };
+            field_space.record(&term_info);
+
+            // We include the json type and the json end of path to make sure the prefix check
+            // is meaningful.
+            current_field_bytes.extend_from_slice(&term[..field_space.field_name.len() + 2]);
+            current_field_opt = Some(field_space);
         }
+        // We need to flush the last field as well.
+        fields.extend(current_field_opt.take());
+
         Ok(fields)
     }
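
Aside (not part of the patch): the loop above works because the term dictionary streams terms in lexicographic order, so all terms belonging to one encoded JSON path are contiguous and a single "current field" accumulator suffices for one linear pass. A minimal standalone sketch of that grouping idea; the field names and the `0` separator byte are invented for the example.

```rust
/// Groups a sorted term stream by its `path + end-of-path + type code` prefix,
/// counting terms per group (a stand-in for the ByteCount bookkeeping above).
fn group_sorted(terms: &[&[u8]]) -> Vec<(Vec<u8>, u64)> {
    let mut groups: Vec<(Vec<u8>, u64)> = Vec::new();
    for term in terms {
        let end_of_path = term.iter().position(|&b| b == 0).expect("invalid term");
        let prefix = term[..end_of_path + 2].to_vec();
        match groups.last_mut() {
            // Same prefix as the previous term: keep accumulating.
            Some((current, count)) if *current == prefix => *count += 1,
            // New prefix: open a new group.
            _ => groups.push((prefix, 1)),
        }
    }
    groups
}

fn main() {
    // Lexicographically sorted, as the term dictionary guarantees.
    let terms: [&[u8]; 4] = [b"a\0sbar", b"a\0sfoo", b"b\0i123", b"b\0i456"];
    assert_eq!(
        group_sorted(&terms),
        vec![(b"a\0s".to_vec(), 2), (b"b\0i".to_vec(), 2)]
    );
}
```
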
diff --git a/src/index/segment_reader.rs b/src/index/segment_reader.rs
index afb12bf00..844e32939 100644
--- a/src/index/segment_reader.rs
+++ b/src/index/segment_reader.rs
@@ -1,8 +1,8 @@
 use std::collections::HashMap;
-use std::ops::BitOrAssign;
 use std::sync::{Arc, RwLock};
 use std::{fmt, io};
 
+use common::{ByteCount, HasLen};
 use fnv::FnvHashMap;
 use itertools::Itertools;
 
@@ -304,12 +304,16 @@ impl SegmentReader {
         for (field, field_entry) in self.schema().fields() {
             let field_name = field_entry.name().to_string();
             let is_indexed = field_entry.is_indexed();
-
             if is_indexed {
                 let is_json = field_entry.field_type().value_type() == Type::Json;
                 if is_json {
+                    let term_dictionary_json_field_num_bytes: u64 = self
+                        .termdict_composite
+                        .open_read(field)
+                        .map(|file_slice| file_slice.len() as u64)
+                        .unwrap_or(0u64);
                     let inv_index = self.inverted_index(field)?;
-                    let encoded_fields_in_index = inv_index.list_encoded_fields()?;
+                    let encoded_fields_in_index = inv_index.list_encoded_json_fields()?;
                     let mut build_path = |field_name: &str, mut json_path: String| {
                         // In this case we need to map the potential fast field to the field name
                         // accepted by the query parser.
@@ -328,30 +332,65 @@ impl SegmentReader {
                             format!("{field_name}.{json_path}")
                         }
                     };
-                    indexed_fields.extend(
-                        encoded_fields_in_index
-                            .into_iter()
-                            .map(|(name, typ)| (build_path(&field_name, name), typ))
-                            .map(|(field_name, typ)| FieldMetadata {
-                                indexed: true,
-                                stored: false,
-                                field_name,
-                                fast: false,
-                                typ,
-                            }),
-                    );
+                    let total_num_terms = encoded_fields_in_index
+                        .iter()
+                        .map(|field_space| field_space.num_terms)
+                        .sum();
+                    indexed_fields.extend(encoded_fields_in_index.into_iter().map(|field_space| {
+                        let field_name = build_path(&field_name, field_space.field_name);
+                        // It is complex to attribute the exact number of bytes used by a
+                        // specific path within the json field. Instead, as a proxy, we
+                        // attribute the total number of bytes of the entire json field
+                        // proportionally to the number of terms in each path.
+                        let term_dictionary_size = (term_dictionary_json_field_num_bytes
+                            * field_space.num_terms)
+                            .checked_div(total_num_terms)
+                            .unwrap_or(0);
+                        FieldMetadata {
+                            postings_size: Some(field_space.postings_size),
+                            positions_size: Some(field_space.positions_size),
+                            term_dictionary_size: Some(ByteCount::from(term_dictionary_size)),
+                            fast_size: None,
+                            stored: field_entry.is_stored(),
+                            field_name,
+                            typ: field_space.field_type,
+                        }
+                    }));
                 } else {
+                    let postings_size: ByteCount = self
+                        .postings_composite
+                        .open_read(field)
+                        .map(|posting_fileslice| posting_fileslice.len())
+                        .unwrap_or(0)
+                        .into();
+                    let positions_size: ByteCount = self
+                        .positions_composite
+                        .open_read(field)
+                        .map(|positions_fileslice| positions_fileslice.len())
+                        .unwrap_or(0)
+                        .into();
+                    let term_dictionary_size: ByteCount = self
+                        .termdict_composite
+                        .open_read(field)
+                        .map(|term_dictionary_fileslice| term_dictionary_fileslice.len())
+                        .unwrap_or(0)
+                        .into();
                     indexed_fields.push(FieldMetadata {
-                        indexed: true,
-                        stored: false,
                         field_name: field_name.to_string(),
-                        fast: false,
                         typ: field_entry.field_type().value_type(),
+                        stored: field_entry.is_stored(),
+                        fast_size: None,
+                        term_dictionary_size: Some(term_dictionary_size),
+                        postings_size: Some(postings_size),
+                        positions_size: Some(positions_size),
                     });
                 }
             }
         }
-        let mut fast_fields: Vec<FieldMetadata> = self
+        let fast_fields: Vec<FieldMetadata> = self
             .fast_fields()
             .columnar()
             .iter_columns()?
@@ -363,23 +402,21 @@ impl SegmentReader {
                     .get(&field_name)
                     .unwrap_or(&field_name)
                     .to_string();
+                let stored = is_field_stored(&field_name, &self.schema);
                 FieldMetadata {
-                    indexed: false,
-                    stored: false,
                     field_name,
-                    fast: true,
                     typ: Type::from(handle.column_type()),
+                    stored,
+                    fast_size: Some(handle.num_bytes()),
+                    term_dictionary_size: None,
+                    postings_size: None,
+                    positions_size: None,
                 }
             })
            .collect();
-        // Since the type is encoded differently in the fast field and in the inverted index,
-        // the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
-        // If we are sure that the order is the same, we can remove this sort.
-        indexed_fields.sort_unstable();
-        fast_fields.sort_unstable();
-        let merged = merge_field_meta_data(vec![indexed_fields, fast_fields], &self.schema);
-
-        Ok(merged)
+        let merged_field_metadatas: Vec<FieldMetadata> =
+            merge_field_meta_data(vec![indexed_fields, fast_fields]);
+        Ok(merged_field_metadatas)
     }
 
     /// Returns the segment id
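
Aside (not part of the patch): a worked sketch of the pro-rata attribution used in the hunk above, with invented numbers; a JSON field whose term dictionary occupies 10,000 bytes, split across three encoded paths holding 50, 30 and 20 terms.

```rust
/// Same shape as the attribution in `fields_metadata`: dictionary bytes are
/// split proportionally to each path's term count.
fn attribute(dict_num_bytes: u64, num_terms: u64, total_num_terms: u64) -> u64 {
    (dict_num_bytes * num_terms)
        .checked_div(total_num_terms) // avoids a division by zero on an empty dictionary
        .unwrap_or(0)
}

fn main() {
    let total = 50u64 + 30 + 20;
    assert_eq!(attribute(10_000, 50, total), 5_000);
    assert_eq!(attribute(10_000, 30, total), 3_000);
    assert_eq!(attribute(10_000, 20, total), 2_000);
    // Degenerate case: no terms at all.
    assert_eq!(attribute(10_000, 0, 0), 0);
}
```
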
@@ -443,20 +480,47 @@ pub struct FieldMetadata {
     // Notice: Don't reorder the declaration of 1.field_name 2.typ, as it is used for ordering by
     // field_name then typ.
     pub typ: Type,
-    /// Is the field indexed for search
-    pub indexed: bool,
     /// Is the field stored in the doc store
     pub stored: bool,
-    /// Is the field stored in the columnar storage
-    pub fast: bool,
+    /// Size occupied in the columnar storage (None if not fast)
+    pub fast_size: Option<ByteCount>,
+    /// Size occupied in the term dictionary (None if not indexed)
+    pub term_dictionary_size: Option<ByteCount>,
+    /// Size occupied in the index postings storage (None if not indexed)
+    pub postings_size: Option<ByteCount>,
+    /// Size occupied in the index positions storage (None if positions are not recorded)
+    pub positions_size: Option<ByteCount>,
 }
 
-impl BitOrAssign for FieldMetadata {
-    fn bitor_assign(&mut self, rhs: Self) {
-        assert!(self.field_name == rhs.field_name);
-        assert!(self.typ == rhs.typ);
-        self.indexed |= rhs.indexed;
+
+fn merge_options(left: Option<ByteCount>, right: Option<ByteCount>) -> Option<ByteCount> {
+    match (left, right) {
+        (Some(l), Some(r)) => Some(l + r),
+        (None, right) => right,
+        (left, None) => left,
+    }
+}
+
+impl FieldMetadata {
+    /// Returns true if and only if the field is indexed.
+    pub fn is_indexed(&self) -> bool {
+        self.postings_size.is_some()
+    }
+
+    /// Returns true if and only if the field is a fast field (i.e. recorded in columnar format).
+    pub fn is_fast(&self) -> bool {
+        self.fast_size.is_some()
+    }
+
+    /// Merges two field metadata.
+    pub fn merge(&mut self, rhs: Self) {
+        assert_eq!(self.field_name, rhs.field_name);
+        assert_eq!(self.typ, rhs.typ);
         self.stored |= rhs.stored;
-        self.fast |= rhs.fast;
+        self.fast_size = merge_options(self.fast_size, rhs.fast_size);
+        self.term_dictionary_size =
+            merge_options(self.term_dictionary_size, rhs.term_dictionary_size);
+        self.postings_size = merge_options(self.postings_size, rhs.postings_size);
+        self.positions_size = merge_options(self.positions_size, rhs.positions_size);
     }
 }
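
Aside (not part of the patch): the semantics of `merge_options` in isolation; sizes add up when both segments report one, and a size present on either side survives, so a field indexed in only one segment still reads as indexed after the merge. Standalone copy over `u64` for illustration.

```rust
/// Standalone re-implementation of `merge_options`, generic for the example.
fn merge_options<T: std::ops::Add<Output = T>>(left: Option<T>, right: Option<T>) -> Option<T> {
    match (left, right) {
        (Some(l), Some(r)) => Some(l + r),
        (None, right) => right,
        (left, None) => left,
    }
}

fn main() {
    // Both segments report a size: they accumulate.
    assert_eq!(merge_options(Some(10u64), Some(32u64)), Some(42));
    // Present on one side only: the presence (e.g. "is indexed") survives.
    assert_eq!(merge_options(Some(10u64), None), Some(10));
    // Absent everywhere: stays absent.
    assert_eq!(merge_options(None::<u64>, None), None);
}
```
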
@@ -469,23 +533,29 @@ fn is_field_stored(field_name: &str, schema: &Schema) -> bool {
 }
 
 /// Helper to merge the field metadata from multiple segments.
-pub fn merge_field_meta_data(
-    field_metadatas: Vec<Vec<FieldMetadata>>,
-    schema: &Schema,
-) -> Vec<FieldMetadata> {
+pub fn merge_field_meta_data(mut field_metadatas: Vec<Vec<FieldMetadata>>) -> Vec<FieldMetadata> {
+    // READ BEFORE REMOVING THIS!
+    //
+    // Because we replace the field separator by `.`, fields are not always sorted.
+    // Also, to enforce such an implicit contract, we would have to add an assert here.
+    //
+    // Sorting is linear time on pre-sorted data, so we are simply better off sorting here.
+    for field_metadatas in &mut field_metadatas {
+        field_metadatas.sort_unstable();
+    }
     let mut merged_field_metadata = Vec::new();
     for (_key, mut group) in &field_metadatas
         .into_iter()
-        .kmerge_by(|left, right| left < right)
+        .kmerge()
         // TODO: Remove allocation
         .chunk_by(|el| (el.field_name.to_string(), el.typ))
     {
         let mut merged: FieldMetadata = group.next().unwrap();
         for el in group {
-            merged |= el;
+            merged.merge(el);
         }
-        // Currently is_field_stored is maybe too slow for the high cardinality case
-        merged.stored = is_field_stored(&merged.field_name, schema);
         merged_field_metadata.push(merged);
     }
     merged_field_metadata
 }
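
Aside (not part of the patch): the k-way merge plus `chunk_by` combination above groups equal `(field_name, typ)` keys coming from several pre-sorted per-segment lists. A minimal itertools sketch of the same shape, with plain tuples standing in for `FieldMetadata`.

```rust
use itertools::Itertools;

fn main() {
    // Two pre-sorted per-segment lists: (field name, some size).
    let seg_a = vec![("a", 10u64), ("b", 5)];
    let seg_b = vec![("a", 32), ("c", 7)];
    let mut merged: Vec<(&str, u64)> = Vec::new();
    // kmerge() yields a single sorted stream; chunk_by() then groups
    // consecutive entries sharing the same key, exactly like the helper above.
    for (key, group) in &vec![seg_a, seg_b]
        .into_iter()
        .kmerge()
        .chunk_by(|(name, _)| *name)
    {
        merged.push((key, group.map(|(_, size)| size).sum()));
    }
    assert_eq!(merged, vec![("a", 42), ("b", 5), ("c", 7)]);
}
```
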
@@ -507,7 +577,7 @@ fn intersect_alive_bitset(
 }
 
 impl fmt::Debug for SegmentReader {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(f, "SegmentReader({:?})", self.segment_id)
     }
 }
@@ -516,122 +586,168 @@ impl fmt::Debug for SegmentReader {
 mod test {
     use super::*;
     use crate::index::Index;
-    use crate::schema::{SchemaBuilder, Term, STORED, TEXT};
+    use crate::schema::{Term, STORED, TEXT};
     use crate::IndexWriter;
 
+    #[track_caller]
+    fn assert_merge(fields_metadatas: &[Vec<FieldMetadata>], expected: &[FieldMetadata]) {
+        use itertools::Itertools;
+        let num_els = fields_metadatas.len();
+        for permutation in fields_metadatas.iter().cloned().permutations(num_els) {
+            let res = merge_field_meta_data(permutation);
+            assert_eq!(&res, &expected);
+        }
+    }
+
     #[test]
-    fn test_merge_field_meta_data_same() {
-        let schema = SchemaBuilder::new().build();
+    fn test_merge_field_meta_data_same_field() {
         let field_metadata1 = FieldMetadata {
             field_name: "a".to_string(),
             typ: crate::schema::Type::Str,
-            indexed: true,
             stored: false,
-            fast: true,
+            term_dictionary_size: Some(ByteCount::from(100u64)),
+            postings_size: Some(ByteCount::from(1_000u64)),
+            positions_size: Some(ByteCount::from(2_000u64)),
+            fast_size: Some(ByteCount::from(1_000u64)),
         };
         let field_metadata2 = FieldMetadata {
             field_name: "a".to_string(),
             typ: crate::schema::Type::Str,
-            indexed: true,
             stored: false,
-            fast: true,
+            term_dictionary_size: Some(ByteCount::from(80u64)),
+            postings_size: Some(ByteCount::from(1_500u64)),
+            positions_size: Some(ByteCount::from(2_500u64)),
+            fast_size: Some(ByteCount::from(3_000u64)),
         };
-        let res = merge_field_meta_data(
-            vec![vec![field_metadata1.clone()], vec![field_metadata2]],
-            &schema,
+        let expected = FieldMetadata {
+            field_name: "a".to_string(),
+            typ: crate::schema::Type::Str,
+            stored: false,
+            term_dictionary_size: Some(ByteCount::from(180u64)),
+            postings_size: Some(ByteCount::from(2_500u64)),
+            positions_size: Some(ByteCount::from(4_500u64)),
+            fast_size: Some(ByteCount::from(4_000u64)),
+        };
+        assert_merge(
+            &[vec![field_metadata1.clone()], vec![field_metadata2]],
+            &[expected],
         );
-        assert_eq!(res, vec![field_metadata1]);
     }
 
+    #[track_caller]
     #[test]
     fn test_merge_field_meta_data_different() {
-        let schema = SchemaBuilder::new().build();
         let field_metadata1 = FieldMetadata {
             field_name: "a".to_string(),
             typ: crate::schema::Type::Str,
-            indexed: false,
             stored: false,
-            fast: true,
+            fast_size: Some(1_000u64.into()),
+            term_dictionary_size: Some(100u64.into()),
+            postings_size: Some(2_000u64.into()),
+            positions_size: Some(4_000u64.into()),
         };
         let field_metadata2 = FieldMetadata {
             field_name: "b".to_string(),
             typ: crate::schema::Type::Str,
-            indexed: false,
             stored: false,
-            fast: true,
+            fast_size: Some(1_002u64.into()),
+            term_dictionary_size: None,
+            postings_size: None,
+            positions_size: None,
         };
         let field_metadata3 = FieldMetadata {
             field_name: "a".to_string(),
             typ: crate::schema::Type::Str,
-            indexed: true,
+            term_dictionary_size: Some(101u64.into()),
+            postings_size: Some(2_001u64.into()),
+            positions_size: Some(4_001u64.into()),
             stored: false,
-            fast: false,
+            fast_size: None,
         };
-        let res = merge_field_meta_data(
-            vec![
+        let expected = vec![
+            FieldMetadata {
+                field_name: "a".to_string(),
+                typ: crate::schema::Type::Str,
+                stored: false,
+                term_dictionary_size: Some(201u64.into()),
+                postings_size: Some(4_001u64.into()),
+                positions_size: Some(8_001u64.into()),
+                fast_size: Some(1_000u64.into()),
+            },
+            FieldMetadata {
+                field_name: "b".to_string(),
+                typ: crate::schema::Type::Str,
+                stored: false,
+                term_dictionary_size: None,
+                postings_size: None,
+                positions_size: None,
+                fast_size: Some(1_002u64.into()),
+            },
+        ];
+        assert_merge(
+            &[
                 vec![field_metadata1.clone(), field_metadata2.clone()],
                 vec![field_metadata3],
             ],
-            &schema,
+            &expected,
         );
-        let field_metadata_expected1 = FieldMetadata {
-            field_name: "a".to_string(),
-            typ: crate::schema::Type::Str,
-            indexed: true,
-            stored: false,
-            fast: true,
-        };
-        assert_eq!(res, vec![field_metadata_expected1, field_metadata2.clone()]);
     }
 
     #[test]
     fn test_merge_field_meta_data_merge() {
-        use pretty_assertions::assert_eq;
         let get_meta_data = |name: &str, typ: Type| FieldMetadata {
             field_name: name.to_string(),
             typ,
-            indexed: false,
+            term_dictionary_size: None,
+            postings_size: None,
+            positions_size: None,
             stored: false,
-            fast: true,
+            fast_size: Some(1u64.into()),
         };
-        let schema = SchemaBuilder::new().build();
-        let mut metas = vec![get_meta_data("d", Type::Str), get_meta_data("e", Type::U64)];
-        metas.sort();
-        let res = merge_field_meta_data(vec![vec![get_meta_data("e", Type::Str)], metas], &schema);
-        assert_eq!(
-            res,
-            vec![
+        let metas = vec![get_meta_data("d", Type::Str), get_meta_data("e", Type::U64)];
+        assert_merge(
+            &[vec![get_meta_data("e", Type::Str)], metas],
+            &[
                 get_meta_data("d", Type::Str),
                 get_meta_data("e", Type::Str),
                 get_meta_data("e", Type::U64),
-            ]
+            ],
        );
     }
 
     #[test]
     fn test_merge_field_meta_data_bitxor() {
         let field_metadata1 = FieldMetadata {
             field_name: "a".to_string(),
             typ: crate::schema::Type::Str,
-            indexed: false,
+            term_dictionary_size: None,
+            postings_size: None,
+            positions_size: None,
             stored: false,
-            fast: true,
+            fast_size: Some(10u64.into()),
         };
         let field_metadata2 = FieldMetadata {
             field_name: "a".to_string(),
             typ: crate::schema::Type::Str,
-            indexed: true,
+            term_dictionary_size: Some(10u64.into()),
+            postings_size: Some(11u64.into()),
+            positions_size: Some(12u64.into()),
             stored: false,
-            fast: false,
+            fast_size: None,
         };
         let field_metadata_expected = FieldMetadata {
             field_name: "a".to_string(),
             typ: crate::schema::Type::Str,
-            indexed: true,
+            term_dictionary_size: Some(10u64.into()),
+            postings_size: Some(11u64.into()),
+            positions_size: Some(12u64.into()),
             stored: false,
-            fast: true,
+            fast_size: Some(10u64.into()),
         };
         let mut res1 = field_metadata1.clone();
-        res1 |= field_metadata2.clone();
+        res1.merge(field_metadata2.clone());
         let mut res2 = field_metadata2.clone();
-        res2 |= field_metadata1;
+        res2.merge(field_metadata1);
         assert_eq!(res1, field_metadata_expected);
         assert_eq!(res2, field_metadata_expected);
     }
@@ -662,6 +778,7 @@ mod test {
         assert_eq!(4, searcher.segment_reader(0).max_doc());
         Ok(())
     }
+
     #[test]
     fn test_alive_docs_iterator() -> crate::Result<()> {
         let mut schema_builder = Schema::builder();
diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs
index 278e5aca1..f5e047a7a 100644
--- a/src/indexer/mod.rs
+++ b/src/indexer/mod.rs
@@ -61,6 +61,8 @@ type AddBatchReceiver<D> = channel::Receiver<AddBatch<D>>;
 
 #[cfg(test)]
 mod tests_mmap {
+    use common::ByteCount;
+
     use crate::aggregation::agg_req::Aggregations;
     use crate::aggregation::agg_result::AggregationResults;
     use crate::aggregation::AggregationCollector;
@@ -280,11 +282,14 @@ mod tests_mmap {
             field_name_out
         };
 
-        let mut fields = reader.searcher().segment_readers()[0]
+        let mut fields: Vec<(String, Type)> = reader.searcher().segment_readers()[0]
             .inverted_index(field)
             .unwrap()
-            .list_encoded_fields()
-            .unwrap();
+            .list_encoded_json_fields()
+            .unwrap()
+            .into_iter()
+            .map(|field_space| (field_space.field_name, field_space.field_type))
+            .collect();
         assert_eq!(fields.len(), 8);
         fields.sort();
         let mut expected_fields = vec![
@@ -385,7 +390,12 @@ mod tests_mmap {
         let reader = &searcher.segment_readers()[0];
         let inverted_index = reader.inverted_index(json_field).unwrap();
         assert_eq!(
-            inverted_index.list_encoded_fields().unwrap(),
+            inverted_index
+                .list_encoded_json_fields()
+                .unwrap()
+                .into_iter()
+                .map(|field_space| (field_space.field_name, field_space.field_type))
+                .collect::<Vec<_>>(),
             [
                 ("k8s.container.name".to_string(), Type::Str),
                 ("sub\u{1}a".to_string(), Type::I64),
@@ -402,19 +412,41 @@ mod tests_mmap {
     fn test_json_fields_metadata_expanded_dots_one_segment() {
         test_json_fields_metadata(true, true);
     }
+
     #[test]
     fn test_json_fields_metadata_expanded_dots_multi_segment() {
         test_json_fields_metadata(true, false);
     }
+
     #[test]
     fn test_json_fields_metadata_no_expanded_dots_one_segment() {
         test_json_fields_metadata(false, true);
     }
+
     #[test]
     fn test_json_fields_metadata_no_expanded_dots_multi_segment() {
         test_json_fields_metadata(false, false);
     }
 
+    #[track_caller]
+    fn assert_size_eq(lhs: Option<ByteCount>, rhs: Option<ByteCount>) {
+        let ignore_actual_values =
+            |size_opt: Option<ByteCount>| size_opt.map(|val| val > ByteCount::default());
+        assert_eq!(ignore_actual_values(lhs), ignore_actual_values(rhs));
+    }
+
+    #[track_caller]
+    fn assert_field_metadata_eq_but_ignore_field_size(
+        expected: &FieldMetadata,
+        actual: &FieldMetadata,
+    ) {
+        assert_eq!(&expected.field_name, &actual.field_name);
+        assert_eq!(&expected.typ, &actual.typ);
+        assert_eq!(&expected.stored, &actual.stored);
+        assert_size_eq(expected.postings_size, actual.postings_size);
+        assert_size_eq(expected.positions_size, actual.positions_size);
+        assert_size_eq(expected.fast_size, actual.fast_size);
+    }
+
     fn test_json_fields_metadata(expanded_dots: bool, one_segment: bool) {
         use pretty_assertions::assert_eq;
         let mut schema_builder = Schema::builder();
@@ -453,81 +485,101 @@ mod tests_mmap {
         assert_eq!(searcher.num_docs(), 3);
 
         let fields_metadata = index.fields_metadata().unwrap();
-        assert_eq!(
-            fields_metadata,
-            [
-                FieldMetadata {
-                    field_name: "empty".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::U64
-                },
-                FieldMetadata {
-                    field_name: if expanded_dots {
-                        "json.shadow.k8s.container.name".to_string()
-                    } else {
-                        "json.shadow.k8s\\.container\\.name".to_string()
-                    },
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::Str
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.sub.a".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::I64
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.sub.b".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::I64
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.suber.a".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::I64
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.suber.a".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::Str
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.suber.b".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::I64
-                },
-                FieldMetadata {
-                    field_name: "json.shadow.val".to_string(),
-                    indexed: true,
-                    stored: true,
-                    fast: true,
-                    typ: Type::Str
-                },
-                FieldMetadata {
-                    field_name: "numbers".to_string(),
-                    indexed: false,
-                    stored: false,
-                    fast: true,
-                    typ: Type::U64
-                }
-            ]
-        );
+
+        let expected_fields = &[
+            FieldMetadata {
+                field_name: "empty".to_string(),
+                stored: true,
+                typ: Type::U64,
+                term_dictionary_size: Some(0u64.into()),
+                fast_size: Some(1u64.into()),
+                postings_size: Some(0u64.into()),
+                positions_size: Some(0u64.into()),
+            },
+            FieldMetadata {
+                field_name: if expanded_dots {
+                    "json.shadow.k8s.container.name".to_string()
+                } else {
+                    "json.shadow.k8s\\.container\\.name".to_string()
+                },
+                stored: true,
+                typ: Type::Str,
+                term_dictionary_size: Some(1u64.into()),
+                fast_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.sub.a".to_string(),
+                typ: Type::I64,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.sub.b".to_string(),
+                typ: Type::I64,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.suber.a".to_string(),
+                stored: true,
+                typ: Type::I64,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.suber.a".to_string(),
+                typ: Type::Str,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.suber.b".to_string(),
+                typ: Type::I64,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "json.shadow.val".to_string(),
+                typ: Type::Str,
+                stored: true,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: Some(1u64.into()),
+                postings_size: Some(1u64.into()),
+                positions_size: Some(1u64.into()),
+            },
+            FieldMetadata {
+                field_name: "numbers".to_string(),
+                stored: false,
+                typ: Type::U64,
+                fast_size: Some(1u64.into()),
+                term_dictionary_size: None,
+                postings_size: None,
+                positions_size: None,
+            },
+        ];
+        assert_eq!(fields_metadata.len(), expected_fields.len());
+        for (expected, value) in expected_fields.iter().zip(fields_metadata.iter()) {
+            assert_field_metadata_eq_but_ignore_field_size(expected, value);
+        }
 
         let query_parser = QueryParser::for_index(&index, vec![]);
         // Test if returned field name can be queried
-        for indexed_field in fields_metadata.iter().filter(|meta| meta.indexed) {
+        for indexed_field in fields_metadata.iter().filter(|meta| meta.is_indexed()) {
             let val = if indexed_field.typ == Type::Str {
                 "a"
             } else {
@@ -543,7 +595,10 @@ mod tests_mmap {
             }
         }
         // Test if returned field name can be used for aggregation
-        for fast_field in fields_metadata.iter().filter(|meta| meta.fast) {
+        for fast_field in fields_metadata
+            .iter()
+            .filter(|field_metadata| field_metadata.is_fast())
+        {
             let agg_req_str = json!(
             {
                 "termagg": {
diff --git a/src/lib.rs b/src/lib.rs
index cf4308d55..41ceda695 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -165,7 +165,7 @@ mod macros;
 mod future_result;
 
 // Re-exports
-pub use common::DateTime;
+pub use common::{ByteCount, DateTime};
 pub use {columnar, query_grammar, time};
 
 pub use crate::error::TantivyError;
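
Aside (not part of the patch): a minimal sketch of how the reworked metadata might be consumed once this lands, assuming the usual index-building entry points (`create_in_ram`, `writer`) and the `fields_metadata()` accessor exercised by the tests above.

```rust
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer: IndexWriter = index.writer(50_000_000)?;
    writer.add_document(doc!(title => "hello tantivy"))?;
    writer.commit()?;

    // Per-field space usage, merged across all segments.
    for field in index.fields_metadata()? {
        println!(
            "{}: indexed={} fast={} postings={:?} positions={:?}",
            field.field_name,
            field.is_indexed(),
            field.is_fast(),
            field.postings_size,
            field.positions_size,
        );
    }
    Ok(())
}
```
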