diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs
index 1aabf3e9c..72110cd82 100644
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -8,7 +8,7 @@ use crate::directory::ReadOnlySource;
 use crate::fastfield::DeleteBitSet;
 use crate::fastfield::FacetReader;
 use crate::fastfield::FastFieldReaders;
-use crate::fieldnorm::FieldNormReader;
+use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
 use crate::schema::Field;
 use crate::schema::FieldType;
 use crate::schema::Schema;
@@ -48,7 +48,7 @@ pub struct SegmentReader {
     positions_composite: CompositeFile,
     positions_idx_composite: CompositeFile,
     fast_fields_readers: Arc<FastFieldReaders>,
-    fieldnorms_composite: CompositeFile,
+    fieldnorm_readers: FieldNormReaders,
     store_source: ReadOnlySource,
     delete_bitset_opt: Option<DeleteBitSet>,
@@ -126,8 +126,8 @@ impl SegmentReader {
     /// They are simply stored as a fast field, serialized in
     /// the `.fieldnorm` file of the segment.
     pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
-        if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) {
-            FieldNormReader::open(fieldnorm_source)
+        if let Some(fieldnorm_reader) = self.fieldnorm_readers.get_field(field) {
+            fieldnorm_reader
         } else {
             let field_name = self.schema.get_field_name(field);
             let err_msg = format!(
@@ -178,8 +178,8 @@ impl SegmentReader {
         let fast_field_readers =
             Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);

-        let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
-        let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
+        let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
+        let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?;

         let delete_bitset_opt = if segment.meta().has_deletes() {
             let delete_data = segment.open_read(SegmentComponent::DELETE)?;
@@ -195,7 +195,7 @@ impl SegmentReader {
             termdict_composite,
             postings_composite,
             fast_fields_readers: fast_field_readers,
-            fieldnorms_composite,
+            fieldnorm_readers,
             segment_id: segment.id(),
             store_source,
             delete_bitset_opt,
@@ -308,7 +308,7 @@ impl SegmentReader {
             self.positions_composite.space_usage(),
             self.positions_idx_composite.space_usage(),
             self.fast_fields_readers.space_usage(),
-            self.fieldnorms_composite.space_usage(),
+            self.fieldnorm_readers.space_usage(),
             self.get_store_reader().space_usage(),
             self.delete_bitset_opt
                 .as_ref()
diff --git a/src/fieldnorm/mod.rs b/src/fieldnorm/mod.rs
index 12370608d..7450376c7 100644
--- a/src/fieldnorm/mod.rs
+++ b/src/fieldnorm/mod.rs
@@ -21,7 +21,7 @@ mod reader;
 mod serializer;
 mod writer;

-pub use self::reader::FieldNormReader;
+pub use self::reader::{FieldNormReader, FieldNormReaders};
 pub use self::serializer::FieldNormsSerializer;
 pub use self::writer::FieldNormsWriter;
diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs
index 8a57739fa..193dda5f8 100644
--- a/src/fieldnorm/reader.rs
+++ b/src/fieldnorm/reader.rs
@@ -1,6 +1,41 @@
 use super::{fieldnorm_to_id, id_to_fieldnorm};
+use crate::common::CompositeFile;
 use crate::directory::ReadOnlySource;
+use crate::schema::Field;
+use crate::space_usage::PerFieldSpaceUsage;
 use crate::DocId;
+use std::sync::Arc;
+
+/// Reader for the fieldnorms (for each document, the number of tokens indexed in the
+/// field) of all indexed fields in the index.
+///
+/// Each fieldnorm is lossily compressed into a single byte. We refer to this byte as
+/// the `fieldnorm_id`. The mapping from `fieldnorm` to `fieldnorm_id` is monotonic.
+#[derive(Clone)]
+pub struct FieldNormReaders {
+    data: Arc<CompositeFile>,
+}
+
+impl FieldNormReaders {
+    /// Creates a field norm reader.
+    pub fn new(source: ReadOnlySource) -> crate::Result<FieldNormReaders> {
+        let data = CompositeFile::open(&source)?;
+        Ok(FieldNormReaders {
+            data: Arc::new(data),
+        })
+    }
+
+    /// Returns the `FieldNormReader` for a specific field.
+    pub fn get_field(&self, field: Field) -> Option<FieldNormReader> {
+        self.data.open_read(field).map(FieldNormReader::open)
+    }
+
+    /// Returns a breakdown of the space usage per field.
+    pub fn space_usage(&self) -> PerFieldSpaceUsage {
+        self.data.space_usage()
+    }
+}

 /// Reads the fieldnorm associated to a document.
 /// The fieldnorm represents the length associated to
@@ -19,6 +54,7 @@ use crate::DocId;
 /// Apart from compression, this scale also makes it possible to
 /// precompute computationally expensive functions of the fieldnorm
 /// in a very short array.
+#[derive(Clone)]
 pub struct FieldNormReader {
     data: ReadOnlySource,
 }
@@ -29,6 +65,11 @@ impl FieldNormReader {
         FieldNormReader { data }
     }

+    /// Returns the number of documents in this segment.
+    pub fn num_docs(&self) -> u32 {
+        self.data.len() as u32
+    }
+
     /// Returns the `fieldnorm` associated to a doc id.
     /// The fieldnorm is a value approximating the number
     /// of tokens in a given field of the `doc_id`.
@@ -65,10 +106,11 @@ impl FieldNormReader {
 }

 #[cfg(test)]
-impl From<Vec<u32>> for FieldNormReader {
-    fn from(field_norms: Vec<u32>) -> FieldNormReader {
+impl From<&[u32]> for FieldNormReader {
+    fn from(field_norms: &[u32]) -> FieldNormReader {
         let field_norms_id = field_norms
-            .into_iter()
+            .iter()
+            .cloned()
             .map(FieldNormReader::fieldnorm_to_id)
             .collect::<Vec<u8>>();
         let field_norms_data = ReadOnlySource::from(field_norms_id);
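
For orientation, here is a minimal sketch of how the new `FieldNormReaders` plumbing is consumed from a `SegmentReader` at query time. The snippet is illustrative, not part of the patch; the field and doc id are assumed to come from the caller.

use tantivy::schema::Field;
use tantivy::{DocId, SegmentReader};

// `FieldNormReaders` wraps the `.fieldnorm` CompositeFile in an `Arc`,
// so handing a clone to each scorer is cheap.
fn fieldnorm_for(segment_reader: &SegmentReader, field: Field, doc: DocId) -> u32 {
    // `get_fieldnorms_reader` now goes through `FieldNormReaders::get_field`
    // internally, and falls back to the error branch above when the field
    // carries no fieldnorms.
    segment_reader.get_fieldnorms_reader(field).fieldnorm(doc)
}
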
diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs
index ceeac05d2..5c72a1362 100644
--- a/src/fieldnorm/writer.rs
+++ b/src/fieldnorm/writer.rs
@@ -78,11 +78,12 @@ impl FieldNormsWriter {
     }

     /// Serialize the seen fieldnorm values to the serializer for all fields.
-    pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
+    pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
         for &field in self.fields.iter() {
             let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
             fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
         }
+        fieldnorms_serializer.close()?;
         Ok(())
     }
 }
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 8e6f75fa1..efb0eb46b 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -167,7 +167,7 @@ impl IndexMerger {

     fn write_fieldnorms(
         &self,
-        fieldnorms_serializer: &mut FieldNormsSerializer,
+        mut fieldnorms_serializer: FieldNormsSerializer,
     ) -> crate::Result<()> {
         let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
         let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
@@ -182,6 +182,7 @@ impl IndexMerger {
             }
             fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?;
         }
+        fieldnorms_serializer.close()?;
         Ok(())
     }
@@ -668,8 +669,10 @@ impl IndexMerger {

 impl SerializableSegment for IndexMerger {
     fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
+        if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
+            self.write_fieldnorms(fieldnorms_serializer)?;
+        }
         let term_ord_mappings = self.write_postings(serializer.get_postings_serializer())?;
-        self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
         self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?;
         self.write_storable_fields(serializer.get_store_writer())?;
         serializer.close()?;
@@ -1504,12 +1507,9 @@ mod tests {
         for i in 0..100 {
             let mut doc = Document::new();
             doc.add_f64(field, 42.0);
-            doc.add_f64(multi_field, 0.24);
             doc.add_f64(multi_field, 0.27);
-
             writer.add_document(doc);
-
             if i % 5 == 0 {
                 writer.commit()?;
             }
@@ -1521,7 +1521,6 @@ mod tests {
         // If a merging thread fails, we should end up with more
         // than one segment here
         assert_eq!(1, index.searchable_segments()?.len());
-
         Ok(())
     }
 }
diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs
index d259b6bb5..f2afdfc59 100644
--- a/src/indexer/segment_serializer.rs
+++ b/src/indexer/segment_serializer.rs
@@ -8,15 +8,16 @@ use crate::store::StoreWriter;

 /// Segment serializer is in charge of laying out on disk
 /// the data accumulated and sorted by the `SegmentWriter`.
 pub struct SegmentSerializer {
+    segment: Segment,
     store_writer: StoreWriter,
     fast_field_serializer: FastFieldSerializer,
-    fieldnorms_serializer: FieldNormsSerializer,
+    fieldnorms_serializer: Option<FieldNormsSerializer>,
     postings_serializer: InvertedIndexSerializer,
 }

 impl SegmentSerializer {
     /// Creates a new `SegmentSerializer`.
-    pub fn for_segment(segment: &mut Segment) -> crate::Result<SegmentSerializer> {
+    pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
         let store_write = segment.open_write(SegmentComponent::STORE)?;

         let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?;
@@ -25,15 +26,21 @@ impl SegmentSerializer {
         let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
         let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;

-        let postings_serializer = InvertedIndexSerializer::open(segment)?;
+        let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
         Ok(SegmentSerializer {
+            segment,
             store_writer: StoreWriter::new(store_write),
             fast_field_serializer,
-            fieldnorms_serializer,
+            fieldnorms_serializer: Some(fieldnorms_serializer),
             postings_serializer,
         })
     }

+    #[allow(dead_code)]
+    pub fn segment(&self) -> &Segment {
+        &self.segment
+    }
+
     /// Accessor to the `PostingsSerializer`.
     pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
         &mut self.postings_serializer
@@ -44,9 +51,11 @@ impl SegmentSerializer {
         &mut self.fast_field_serializer
     }

-    /// Accessor to the field norm serializer.
-    pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
-        &mut self.fieldnorms_serializer
+    /// Extracts the field norm serializer.
+    ///
+    /// Note the fieldnorms serializer can only be extracted once.
+    pub fn extract_fieldnorms_serializer(&mut self) -> Option<FieldNormsSerializer> {
+        self.fieldnorms_serializer.take()
     }

     /// Accessor to the `StoreWriter`.
@@ -55,11 +64,13 @@ impl SegmentSerializer {
     }

     /// Finalize the segment serialization.
-    pub fn close(self) -> crate::Result<()> {
+    pub fn close(mut self) -> crate::Result<()> {
+        if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
+            fieldnorms_serializer.close()?;
+        }
         self.fast_field_serializer.close()?;
         self.postings_serializer.close()?;
         self.store_writer.close()?;
-        self.fieldnorms_serializer.close()?;
         Ok(())
     }
 }
diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
index 46ac26e1f..9205ac30a 100644
--- a/src/indexer/segment_updater.rs
+++ b/src/indexer/segment_updater.rs
@@ -112,7 +112,7 @@ fn merge(
     target_opstamp: Opstamp,
 ) -> crate::Result<SegmentEntry> {
     // first we need to apply deletes to our segment.
-    let mut merged_segment = index.new_segment();
+    let merged_segment = index.new_segment();

     // First we apply all of the deletes to the merged segment, up to the target opstamp.
     for segment_entry in &mut segment_entries {
@@ -131,12 +131,13 @@ fn merge(
     let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;

     // ... we just serialize this index merger in our new segment to merge the two segments.
-    let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)?;
+    let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?;
     let num_docs = merger.write(segment_serializer)?;

-    let segment_meta = index.new_segment_meta(merged_segment.id(), num_docs);
+    let merged_segment_id = merged_segment.id();
+    let segment_meta = index.new_segment_meta(merged_segment_id, num_docs);

     Ok(SegmentEntry::new(segment_meta, delete_cursor, None))
 }
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index 4b2b1f35d..d8ba92157 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -62,11 +62,12 @@ impl SegmentWriter {
     /// - schema
     pub fn for_segment(
         memory_budget: usize,
-        mut segment: Segment,
+        segment: Segment,
         schema: &Schema,
     ) -> crate::Result<SegmentWriter> {
+        let tokenizer_manager = segment.index().tokenizers().clone();
         let table_num_bits = initial_table_size(memory_budget)?;
-        let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
+        let segment_serializer = SegmentSerializer::for_segment(segment)?;
         let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
         let tokenizers = schema
             .fields()
@@ -76,7 +77,7 @@ impl SegmentWriter {
                     .get_indexing_options()
                     .and_then(|text_index_option| {
                         let tokenizer_name = &text_index_option.tokenizer();
-                        segment.index().tokenizers().get(tokenizer_name)
+                        tokenizer_manager.get(tokenizer_name)
                     }),
                 _ => None,
             },
@@ -280,9 +281,11 @@ fn write(
     fieldnorms_writer: &FieldNormsWriter,
     mut serializer: SegmentSerializer,
 ) -> crate::Result<()> {
+    if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
+        fieldnorms_writer.serialize(fieldnorms_serializer)?;
+    }
     let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?;
     fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?;
-    fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;
     serializer.close()?;
     Ok(())
 }
diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs
index 0c3e5f849..1001321ed 100644
--- a/src/schema/field_entry.rs
+++ b/src/schema/field_entry.rs
@@ -14,7 +14,7 @@ use std::fmt;
 /// - a field name
 /// - a field type, itself wrapping up options describing
 ///   how the field should be indexed.
-#[derive(Clone, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug, PartialEq)]
 pub struct FieldEntry {
     name: String,
     field_type: FieldType,
diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs
index d7f05c4c3..42b005c5b 100644
--- a/src/schema/field_type.rs
+++ b/src/schema/field_type.rs
@@ -48,7 +48,7 @@ pub enum Type {

 /// A `FieldType` describes the type (text, u64) of a field as well as
 /// how it should be handled by tantivy.
-#[derive(Clone, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug, PartialEq)]
 pub enum FieldType {
     /// String field type configuration
     Str(TextOptions),
diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs
index 08dc4d80b..16ffe3d21 100644
--- a/src/schema/text_options.rs
+++ b/src/schema/text_options.rs
@@ -6,7 +6,7 @@ use std::borrow::Cow;
 use std::ops::BitOr;

 /// Define how a text field should be handled by tantivy.
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct TextOptions {
     indexing: Option<TextFieldIndexing>,
     stored: bool,
@@ -51,7 +51,7 @@ impl Default for TextOptions {
 /// - the amount of information that should be stored about the presence of a term in a document.
 ///   Essentially, should we store the term frequency and/or the positions (See
 ///   [`IndexRecordOption`](./enum.IndexRecordOption.html)).
 /// - the name of the `Tokenizer` that should be used to process the field.
-#[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)]
+#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
 pub struct TextFieldIndexing {
     record: IndexRecordOption,
     tokenizer: Cow<'static, str>,
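
A note on the schema changes: dropping `Eq` from `FieldEntry`, `FieldType`, `TextOptions`, and `TextFieldIndexing` is presumably groundwork for f64 fields (the merger test above now calls `add_f64`), since `f64` implements `PartialEq` but not `Eq`. A minimal illustration follows; the struct and its field are hypothetical, not tantivy types:

// `f64` is not `Eq` because `NaN != NaN`, so a type embedding an `f64`
// can only derive `PartialEq`; adding `Eq` to this derive would not compile.
#[derive(Clone, Debug, PartialEq)]
struct F64Options {
    default_value: f64, // hypothetical field, for illustration only
}

fn main() {
    let opts = F64Options { default_value: 42.0 };
    assert_eq!(opts, opts.clone());
}
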