From 7b21b3f25ac86092efaf3a8ad49c0b622ce59849 Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Fri, 25 Oct 2019 09:06:44 +0900
Subject: [PATCH] Refactoring around Field (#673)

* Refactoring around Field

Removing the contract about field ordering and field id allocation.

* Update delete_queue.rs

* Update field.rs
---
 src/collector/facet_collector.rs       |  4 +--
 src/collector/top_score_collector.rs   |  2 +-
 src/common/composite_file.rs           | 12 ++++---
 src/fastfield/readers.rs               |  3 +-
 src/fastfield/writer.rs                |  3 +-
 src/fieldnorm/writer.rs                | 23 +++++++------
 src/indexer/delete_queue.rs            |  2 +-
 src/indexer/merger.rs                  | 16 ++++-----
 src/indexer/segment_writer.rs          | 19 ++++++-----
 src/postings/mod.rs                    | 14 ++++----
 src/postings/postings_writer.rs        | 17 +++++-----
 src/query/query_parser/query_parser.rs | 10 ++++--
 src/query/term_query/mod.rs            |  2 +-
 src/schema/field.rs                    | 14 +++++++-
 src/schema/schema.rs                   | 45 +++++++++++++++++++-------
 src/schema/term.rs                     |  6 ++--
 tests/failpoints/mod.rs                |  1 -
 17 files changed, 117 insertions(+), 76 deletions(-)

diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs
index e2e6c288c..f2f84d92e 100644
--- a/src/collector/facet_collector.rs
+++ b/src/collector/facet_collector.rs
@@ -515,7 +515,7 @@ mod tests {
     #[should_panic(expected = "Tried to add a facet which is a descendant of \
                                an already added facet.")]
     fn test_misused_facet_collector() {
-        let mut facet_collector = FacetCollector::for_field(Field(0));
+        let mut facet_collector = FacetCollector::for_field(Field::from_field_id(0));
         facet_collector.add_facet(Facet::from("/country"));
         facet_collector.add_facet(Facet::from("/country/europe"));
     }
@@ -546,7 +546,7 @@ mod tests {
 
     #[test]
     fn test_non_used_facet_collector() {
-        let mut facet_collector = FacetCollector::for_field(Field(0));
+        let mut facet_collector = FacetCollector::for_field(Field::from_field_id(0));
         facet_collector.add_facet(Facet::from("/country"));
         facet_collector.add_facet(Facet::from("/countryeurope"));
     }
diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs
index b9b9f7bf9..19cc48813 100644
--- a/src/collector/top_score_collector.rs
+++ b/src/collector/top_score_collector.rs
@@ -551,7 +551,7 @@ mod tests {
             ));
         });
         let searcher = index.reader().unwrap().searcher();
-        let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field(2));
+        let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
         let segment_reader = searcher.segment_reader(0u32);
         top_collector
             .for_segment(0, segment_reader)
diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs
index adbef0a11..34cfe2a59 100644
--- a/src/common/composite_file.rs
+++ b/src/common/composite_file.rs
@@ -199,13 +199,13 @@ mod test {
         let w = directory.open_write(path).unwrap();
         let mut composite_write = CompositeWrite::wrap(w);
         {
-            let mut write_0 = composite_write.for_field(Field(0u32));
+            let mut write_0 = composite_write.for_field(Field::from_field_id(0u32));
             VInt(32431123u64).serialize(&mut write_0).unwrap();
             write_0.flush().unwrap();
         }
 
         {
-            let mut write_4 = composite_write.for_field(Field(4u32));
+            let mut write_4 = composite_write.for_field(Field::from_field_id(4u32));
             VInt(2).serialize(&mut write_4).unwrap();
             write_4.flush().unwrap();
         }
@@ -215,14 +215,18 @@ mod test {
         let r = directory.open_read(path).unwrap();
         let composite_file = CompositeFile::open(&r).unwrap();
         {
-            let file0 = composite_file.open_read(Field(0u32)).unwrap();
+            let file0 = composite_file
+                .open_read(Field::from_field_id(0u32))
+                .unwrap();
             let mut file0_buf = file0.as_slice();
             let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0;
             assert_eq!(file0_buf.len(), 0);
             assert_eq!(payload_0, 32431123u64);
         }
         {
-            let file4 = composite_file.open_read(Field(4u32)).unwrap();
+            let file4 = composite_file
+                .open_read(Field::from_field_id(4u32))
+                .unwrap();
             let mut file4_buf = file4.as_slice();
             let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0;
             assert_eq!(file4_buf.len(), 0);
diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs
index 1eb4ca28b..c1cd7a6de 100644
--- a/src/fastfield/readers.rs
+++ b/src/fastfield/readers.rs
@@ -59,8 +59,7 @@ impl FastFieldReaders {
             fast_bytes: Default::default(),
             fast_fields_composite: fast_fields_composite.clone(),
         };
-        for (field_id, field_entry) in schema.fields().iter().enumerate() {
-            let field = Field(field_id as u32);
+        for (field, field_entry) in schema.fields() {
             let field_type = field_entry.field_type();
             if field_type == &FieldType::Bytes {
                 let idx_reader = fast_fields_composite
diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs
index 3afe6af9f..c3c9e1424 100644
--- a/src/fastfield/writer.rs
+++ b/src/fastfield/writer.rs
@@ -24,8 +24,7 @@ impl FastFieldsWriter {
         let mut multi_values_writers = Vec::new();
         let mut bytes_value_writers = Vec::new();
 
-        for (field_id, field_entry) in schema.fields().iter().enumerate() {
-            let field = Field(field_id as u32);
+        for (field, field_entry) in schema.fields() {
             let default_value = match *field_entry.field_type() {
                 FieldType::I64(_) => common::i64_to_u64(0i64),
                 FieldType::F64(_) => common::f64_to_u64(0.0f64),
diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs
index 9df7f94fc..ceeac05d2 100644
--- a/src/fieldnorm/writer.rs
+++ b/src/fieldnorm/writer.rs
@@ -22,11 +22,14 @@ impl FieldNormsWriter {
     pub(crate) fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
         schema
             .fields()
-            .iter()
-            .enumerate()
-            .filter(|&(_, field_entry)| field_entry.is_indexed())
-            .map(|(field, _)| Field(field as u32))
-            .collect::<Vec<Field>>()
+            .filter_map(|(field, field_entry)| {
+                if field_entry.is_indexed() {
+                    Some(field)
+                } else {
+                    None
+                }
+            })
+            .collect::<Vec<Field>>()
     }
 
     /// Initialize with state for tracking the field norm fields
@@ -35,7 +38,7 @@ impl FieldNormsWriter {
         let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
         let max_field = fields
             .iter()
-            .map(|field| field.0)
+            .map(Field::field_id)
             .max()
             .map(|max_field_id| max_field_id as usize + 1)
             .unwrap_or(0);
@@ -50,8 +53,8 @@ impl FieldNormsWriter {
     ///
     /// Will extend with 0-bytes for documents that have not been seen.
     pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
-        for &field in self.fields.iter() {
-            self.fieldnorms_buffer[field.0 as usize].resize(max_doc as usize, 0u8);
+        for field in self.fields.iter() {
+            self.fieldnorms_buffer[field.field_id() as usize].resize(max_doc as usize, 0u8);
         }
     }
@@ -64,7 +67,7 @@ impl FieldNormsWriter {
     /// * field - the field being set
     /// * fieldnorm - the number of terms present in document `doc` in field `field`
     pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
-        let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.0 as usize];
+        let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.field_id() as usize];
         assert!(
             fieldnorm_buffer.len() <= doc as usize,
             "Cannot register a given fieldnorm twice"
         );
@@ -77,7 +80,7 @@ impl FieldNormsWriter {
     }
 
     /// Serialize the seen fieldnorm values to the serializer for all fields.
     pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
         for &field in self.fields.iter() {
-            let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.0 as usize][..];
+            let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
             fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
         }
         Ok(())
diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs
index 7bd780f68..1a867a921 100644
--- a/src/indexer/delete_queue.rs
+++ b/src/indexer/delete_queue.rs
@@ -258,7 +258,7 @@ mod tests {
         let delete_queue = DeleteQueue::new();
 
         let make_op = |i: usize| {
-            let field = Field(1u32);
+            let field = Field::from_field_id(1u32);
             DeleteOperation {
                 opstamp: i as u64,
                 term: Term::from_field_u64(field, i as u64),
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index be38c0a87..199780ffb 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -190,8 +190,7 @@ impl IndexMerger {
         fast_field_serializer: &mut FastFieldSerializer,
         mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
     ) -> Result<()> {
-        for (field_id, field_entry) in self.schema.fields().iter().enumerate() {
-            let field = Field(field_id as u32);
+        for (field, field_entry) in self.schema.fields() {
             let field_type = field_entry.field_type();
             match *field_type {
                 FieldType::HierarchicalFacet => {
@@ -649,15 +648,12 @@ impl IndexMerger {
         serializer: &mut InvertedIndexSerializer,
     ) -> Result<HashMap<Field, TermOrdinalMapping>> {
         let mut term_ordinal_mappings = HashMap::new();
-        for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
+        for (field, field_entry) in self.schema.fields() {
             if field_entry.is_indexed() {
-                let indexed_field = Field(field_ord as u32);
-                if let Some(term_ordinal_mapping) = self.write_postings_for_field(
-                    indexed_field,
-                    field_entry.field_type(),
-                    serializer,
-                )? {
-                    term_ordinal_mappings.insert(indexed_field, term_ordinal_mapping);
+                if let Some(term_ordinal_mapping) =
+                    self.write_postings_for_field(field, field_entry.field_type(), serializer)?
+                {
+                    term_ordinal_mappings.insert(field, term_ordinal_mapping);
                 }
             }
         }
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index 8b9d99c1a..638e9de4d 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -6,11 +6,11 @@ use crate::fieldnorm::FieldNormsWriter;
 use crate::indexer::segment_serializer::SegmentSerializer;
 use crate::postings::compute_table_size;
 use crate::postings::MultiFieldPostingsWriter;
-use crate::schema::FieldEntry;
 use crate::schema::FieldType;
 use crate::schema::Schema;
 use crate::schema::Term;
 use crate::schema::Value;
+use crate::schema::{Field, FieldEntry};
 use crate::tokenizer::BoxedTokenizer;
 use crate::tokenizer::FacetTokenizer;
 use crate::tokenizer::{TokenStream, Tokenizer};
@@ -70,12 +70,10 @@ impl SegmentWriter {
         let table_num_bits = initial_table_size(memory_budget)?;
         let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
         let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
-        let tokenizers =
-            schema
-                .fields()
-                .iter()
-                .map(FieldEntry::field_type)
-                .map(|field_type| match *field_type {
+        let tokenizers = schema
+            .fields()
+            .map(
+                |(_, field_entry): (Field, &FieldEntry)| match field_entry.field_type() {
                     FieldType::Str(ref text_options) => text_options
                         .get_indexing_options()
                         .and_then(|text_index_option| {
                             let tokenizer_name = &text_index_option.tokenizer();
                             segment.index().tokenizers().get(tokenizer_name)
                         }),
                     _ => None,
-                })
-                .collect();
+                },
+            )
+            .collect();
         Ok(SegmentWriter {
             max_doc: 0,
             multifield_postings,
@@ -160,7 +159,7 @@ impl SegmentWriter {
                 }
                 FieldType::Str(_) => {
                     let num_tokens = if let Some(ref mut tokenizer) =
-                        self.tokenizers[field.0 as usize]
+                        self.tokenizers[field.field_id() as usize]
                     {
                         let texts: Vec<&str> = field_values
                             .iter()
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index 9d290f02c..b66beb413 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -356,9 +356,9 @@ pub mod tests {
 
     #[test]
     fn test_skip_next() {
-        let term_0 = Term::from_field_u64(Field(0), 0);
-        let term_1 = Term::from_field_u64(Field(0), 1);
-        let term_2 = Term::from_field_u64(Field(0), 2);
+        let term_0 = Term::from_field_u64(Field::from_field_id(0), 0);
+        let term_1 = Term::from_field_u64(Field::from_field_id(0), 1);
+        let term_2 = Term::from_field_u64(Field::from_field_id(0), 2);
 
         let num_docs = 300u32;
@@ -511,19 +511,19 @@ pub mod tests {
     }
 
     pub static TERM_A: Lazy<Term> = Lazy::new(|| {
-        let field = Field(0);
+        let field = Field::from_field_id(0);
         Term::from_field_text(field, "a")
     });
     pub static TERM_B: Lazy<Term> = Lazy::new(|| {
-        let field = Field(0);
+        let field = Field::from_field_id(0);
         Term::from_field_text(field, "b")
     });
     pub static TERM_C: Lazy<Term> = Lazy::new(|| {
-        let field = Field(0);
+        let field = Field::from_field_id(0);
         Term::from_field_text(field, "c")
     });
     pub static TERM_D: Lazy<Term> = Lazy::new(|| {
-        let field = Field(0);
+        let field = Field::from_field_id(0);
         Term::from_field_text(field, "d")
     });
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index d9167743c..e6801dc2b 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -61,12 +61,12 @@ fn make_field_partition(
         .iter()
         .map(|(key, _, _)| Term::wrap(key).field())
         .enumerate();
-    let mut prev_field = Field(u32::max_value());
+    let mut prev_field_opt = None;
     let mut fields = vec![];
     let mut offsets = vec![];
     for (offset, field) in term_offsets_it {
-        if field != prev_field {
-            prev_field = field;
+        if Some(field) != prev_field_opt {
+            prev_field_opt = Some(field);
             fields.push(field);
             offsets.push(offset);
         }
     }
@@ -86,8 +86,7 @@ impl MultiFieldPostingsWriter {
         let term_index = TermHashMap::new(table_bits);
         let per_field_postings_writers: Vec<_> = schema
             .fields()
-            .iter()
-            .map(|field_entry| posting_from_field_entry(field_entry))
+            .map(|(_, field_entry)| posting_from_field_entry(field_entry))
             .collect();
         MultiFieldPostingsWriter {
             heap: MemoryArena::new(),
@@ -107,7 +106,8 @@ impl MultiFieldPostingsWriter {
         field: Field,
         token_stream: &mut dyn TokenStream,
     ) -> u32 {
-        let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
+        let postings_writer =
+            self.per_field_postings_writers[field.field_id() as usize].deref_mut();
         postings_writer.index_text(
             &mut self.term_index,
             doc,
@@ -118,7 +118,8 @@ impl MultiFieldPostingsWriter {
     }
 
     pub fn subscribe(&mut self, doc: DocId, term: &Term) -> UnorderedTermId {
-        let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
+        let postings_writer =
+            self.per_field_postings_writers[term.field().field_id() as usize].deref_mut();
         postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, &mut self.heap)
     }
@@ -160,7 +161,7 @@ impl MultiFieldPostingsWriter {
             FieldType::Bytes => {}
         }
 
-        let postings_writer = &self.per_field_postings_writers[field.0 as usize];
+        let postings_writer = &self.per_field_postings_writers[field.field_id() as usize];
         let mut field_serializer =
             serializer.new_field(field, postings_writer.total_num_tokens())?;
         postings_writer.serialize(
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
index 7648cdcc3..aad1de652 100644
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -674,13 +674,19 @@ mod test {
 
         test_parse_query_to_logical_ast_helper(
             "signed:-2324",
-            &format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
+            &format!(
+                "{:?}",
+                Term::from_field_i64(Field::from_field_id(2u32), -2324)
+            ),
             false,
         );
 
         test_parse_query_to_logical_ast_helper(
             "float:2.5",
-            &format!("{:?}", Term::from_field_f64(Field(10u32), 2.5)),
+            &format!(
+                "{:?}",
+                Term::from_field_f64(Field::from_field_id(10u32), 2.5)
+            ),
             false,
         );
     }
diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs
index c22c9141e..d5a29f9fd 100644
--- a/src/query/term_query/mod.rs
+++ b/src/query/term_query/mod.rs
@@ -118,7 +118,7 @@ mod tests {
     #[test]
     fn test_term_query_debug() {
         let term_query = TermQuery::new(
-            Term::from_field_text(Field(1), "hello"),
+            Term::from_field_text(Field::from_field_id(1), "hello"),
             IndexRecordOption::WithFreqs,
         );
         assert_eq!(
diff --git a/src/schema/field.rs b/src/schema/field.rs
index 350a15df1..6ab3ec359 100644
--- a/src/schema/field.rs
+++ b/src/schema/field.rs
@@ -6,7 +6,19 @@ use std::io::Write;
 /// `Field` is represented by an unsigned 32-bit integer type
 /// The schema holds the mapping between field names and `Field` objects.
 #[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
-pub struct Field(pub u32);
+pub struct Field(u32);
+
+impl Field {
+    /// Create a new field object for the given FieldId.
+    pub fn from_field_id(field_id: u32) -> Field {
+        Field(field_id)
+    }
+
+    /// Returns a u32 identifying uniquely a field within a schema.
+    pub fn field_id(&self) -> u32 {
+        self.0
+    }
+}
 
 impl BinarySerializable for Field {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
diff --git a/src/schema/schema.rs b/src/schema/schema.rs
index 57da5f361..db10036ae 100644
--- a/src/schema/schema.rs
+++ b/src/schema/schema.rs
@@ -167,7 +167,7 @@ impl SchemaBuilder {
 
     /// Adds a field entry to the schema in build.
     fn add_field(&mut self, field_entry: FieldEntry) -> Field {
-        let field = Field(self.fields.len() as u32);
+        let field = Field::from_field_id(self.fields.len() as u32);
         let field_name = field_entry.name().to_string();
         self.fields.push(field_entry);
         self.fields_map.insert(field_name, field);
@@ -223,7 +223,7 @@ pub struct Schema(Arc<InnerSchema>);
 impl Schema {
     /// Return the `FieldEntry` associated to a `Field`.
     pub fn get_field_entry(&self, field: Field) -> &FieldEntry {
-        &self.0.fields[field.0 as usize]
+        &self.0.fields[field.field_id() as usize]
     }
 
     /// Return the field name for a given `Field`.
@@ -232,8 +232,12 @@ impl Schema {
     }
 
     /// Return the list of all the `Field`s.
-    pub fn fields(&self) -> &[FieldEntry] {
-        &self.0.fields
+    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
+        self.0
+            .fields
+            .iter()
+            .enumerate()
+            .map(|(field_id, field_entry)| (Field::from_field_id(field_id as u32), field_entry))
     }
 
     /// Creates a new builder.
@@ -485,13 +489,32 @@ mod tests {
 
         let schema: Schema = serde_json::from_str(expected).unwrap();
 
-        let mut fields = schema.fields().iter();
-
-        assert_eq!("title", fields.next().unwrap().name());
-        assert_eq!("author", fields.next().unwrap().name());
-        assert_eq!("count", fields.next().unwrap().name());
-        assert_eq!("popularity", fields.next().unwrap().name());
-        assert_eq!("score", fields.next().unwrap().name());
+        let mut fields = schema.fields();
+        {
+            let (field, field_entry) = fields.next().unwrap();
+            assert_eq!("title", field_entry.name());
+            assert_eq!(0, field.field_id());
+        }
+        {
+            let (field, field_entry) = fields.next().unwrap();
+            assert_eq!("author", field_entry.name());
+            assert_eq!(1, field.field_id());
+        }
+        {
+            let (field, field_entry) = fields.next().unwrap();
+            assert_eq!("count", field_entry.name());
+            assert_eq!(2, field.field_id());
+        }
+        {
+            let (field, field_entry) = fields.next().unwrap();
+            assert_eq!("popularity", field_entry.name());
+            assert_eq!(3, field.field_id());
+        }
+        {
+            let (field, field_entry) = fields.next().unwrap();
+            assert_eq!("score", field_entry.name());
+            assert_eq!(4, field.field_id());
+        }
         assert!(fields.next().is_none());
     }
diff --git a/src/schema/term.rs b/src/schema/term.rs
index 2dd1e0c2f..f5425702d 100644
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -105,7 +105,7 @@ impl Term {
         if self.0.len() < 4 {
             self.0.resize(4, 0u8);
         }
-        BigEndian::write_u32(&mut self.0[0..4], field.0);
+        BigEndian::write_u32(&mut self.0[0..4], field.field_id());
     }
 
     /// Sets a u64 value in the term.
@@ -157,7 +157,7 @@ where
 
     /// Returns the field.
     pub fn field(&self) -> Field {
-        Field(BigEndian::read_u32(&self.0.as_ref()[..4]))
+        Field::from_field_id(BigEndian::read_u32(&self.0.as_ref()[..4]))
     }
 
     /// Returns the `u64` value stored in a term.
@@ -227,7 +227,7 @@ impl fmt::Debug for Term {
         write!(
             f,
             "Term(field={},bytes={:?})",
-            self.field().0,
+            self.field().field_id(),
             self.value_bytes()
         )
     }
diff --git a/tests/failpoints/mod.rs b/tests/failpoints/mod.rs
index 509e3759f..311744972 100644
--- a/tests/failpoints/mod.rs
+++ b/tests/failpoints/mod.rs
@@ -1,5 +1,4 @@
 use fail;
-use std::io::Write;
 use std::path::Path;
 use tantivy::directory::{Directory, ManagedDirectory, RAMDirectory, TerminatingWrite};
 use tantivy::doc;
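The net effect of this patch on the public API: `Field` no longer exposes its
inner `u32` as a public tuple field, construction goes through
`Field::from_field_id`, the id is read back with `Field::field_id()`, and
`Schema::fields()` yields `(Field, &FieldEntry)` pairs instead of a
`&[FieldEntry]` slice, so callers stop depending on a field's id matching its
position in a slice. A minimal sketch of what call sites look like after the
change; the schema-builder calls (`Schema::builder()`, `add_text_field`,
`TEXT`) and the `title` field are ordinary tantivy usage chosen for
illustration, not lines from this diff:

    use tantivy::schema::{Field, Schema, TEXT};

    fn main() {
        let mut schema_builder = Schema::builder();
        // Keep the returned `Field` handle; after this refactoring its id
        // is an implementation detail of the schema builder.
        let title: Field = schema_builder.add_text_field("title", TEXT);
        let schema = schema_builder.build();

        // The accessors introduced by this patch replace direct access to
        // the previously public tuple field `Field.0`.
        assert_eq!(title, Field::from_field_id(title.field_id()));

        // `Schema::fields()` now iterates over `(Field, &FieldEntry)` pairs.
        for (field, field_entry) in schema.fields() {
            println!("field {} -> {}", field.field_id(), field_entry.name());
        }
    }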