diff --git a/src/core/segment.rs b/src/core/segment.rs index 41dc6c91e..90f1139b3 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -3,7 +3,7 @@ use crate::core::Index; use crate::core::SegmentId; use crate::core::SegmentMeta; use crate::directory::error::{OpenReadError, OpenWriteError}; -use crate::directory::Directory; +use crate::directory::{Directory, DirectoryClone}; use crate::directory::{ReadOnlySource, WritePtr}; use crate::indexer::segment_serializer::SegmentSerializer; use crate::schema::Schema; @@ -14,12 +14,22 @@ use std::path::PathBuf; use std::result; /// A segment is a piece of the index. -#[derive(Clone)] pub struct Segment { - index: Index, + schema: Schema, + directory: Box, meta: SegmentMeta, } +impl Clone for Segment { + fn clone(&self) -> Self { + Segment { + schema: self.schema.clone(), + directory: self.directory.box_clone(), + meta: self.meta.clone(), + } + } +} + impl fmt::Debug for Segment { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Segment({:?})", self.id().uuid_string()) @@ -31,18 +41,18 @@ impl fmt::Debug for Segment { /// The function is here to make it private outside `tantivy`. /// #[doc(hidden)] pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment { - Segment { index, meta } + Segment { + directory: index.directory().box_clone(), + schema: index.schema(), + meta, + } } impl Segment { - /// Returns the index the segment belongs to. - pub fn index(&self) -> &Index { - &self.index - } - /// Returns our index's schema. + // TODO return a ref. pub fn schema(&self) -> Schema { - self.index.schema() + self.schema.clone() } /// Returns the segment meta-information @@ -56,7 +66,8 @@ impl Segment { /// as we finalize a fresh new segment. pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment { Segment { - index: self.index, + directory: self.directory, + schema: self.schema, meta: self.meta.with_max_doc(max_doc), } } @@ -64,7 +75,8 @@ impl Segment { #[doc(hidden)] pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment { Segment { - index: self.index, + directory: self.directory, + schema: self.schema, meta: self.meta.with_delete_meta(num_deleted_docs, opstamp), } } @@ -88,7 +100,7 @@ impl Segment { component: SegmentComponent, ) -> result::Result { let path = self.relative_path(component); - let source = self.index.directory().open_read(&path)?; + let source = self.directory.open_read(&path)?; Ok(source) } @@ -98,7 +110,7 @@ impl Segment { component: SegmentComponent, ) -> result::Result { let path = self.relative_path(component); - let write = self.index.directory_mut().open_write(&path)?; + let write = self.directory.open_write(&path)?; Ok(write) } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 40c2b678b..ba597b867 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -23,6 +23,7 @@ use crate::indexer::SegmentWriter; use crate::schema::Document; use crate::schema::IndexRecordOption; use crate::schema::Term; +use crate::tokenizer::TokenizerManager; use crate::Opstamp; use crossbeam::channel; use futures::executor::block_on; @@ -189,11 +190,12 @@ fn index_documents( segment: Segment, grouped_document_iterator: &mut dyn Iterator, segment_updater: &mut SegmentUpdater, + tokenizers: &TokenizerManager, mut delete_cursor: DeleteCursor, ) -> crate::Result { + let mut segment_writer = + SegmentWriter::for_segment(memory_budget, segment.clone(), tokenizers)?; let schema = segment.schema(); - - let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?; for document_group in grouped_document_iterator { for doc in document_group { segment_writer.add_document(doc, &schema)?; @@ -434,6 +436,7 @@ impl IndexWriter { segment, &mut document_iterator, &mut segment_updater, + index.tokenizers(), delete_cursor.clone(), )?; } diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 6df89f1ac..066f8f7b2 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -1,10 +1,8 @@ use super::segment_register::SegmentRegister; use crate::core::SegmentId; use crate::core::SegmentMeta; -use crate::error::TantivyError; use crate::indexer::delete_queue::DeleteCursor; use crate::indexer::SegmentEntry; -use crate::Result as TantivyResult; use std::collections::hash_set::HashSet; use std::fmt::{self, Debug, Formatter}; use std::sync::RwLock; @@ -145,7 +143,7 @@ impl SegmentManager { /// Returns an error if some segments are missing, or if /// the `segment_ids` are not either all committed or all /// uncommitted. - pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult> { + pub fn start_merge(&self, segment_ids: &[SegmentId]) -> crate::Result> { let registers_lock = self.read(); let mut segment_entries = vec![]; if registers_lock.uncommitted.contains_all(segment_ids) { @@ -166,7 +164,7 @@ impl SegmentManager { let error_msg = "Merge operation sent for segments that are not \ all uncommited or commited." .to_string(); - return Err(TantivyError::InvalidArgument(error_msg)); + return Err(crate::Error::InvalidArgument(error_msg)); } Ok(segment_entries) } diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 8ed1025ba..45758221c 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -11,9 +11,9 @@ use crate::schema::Schema; use crate::schema::Term; use crate::schema::Value; use crate::schema::{Field, FieldEntry}; -use crate::tokenizer::BoxedTokenizer; use crate::tokenizer::FacetTokenizer; use crate::tokenizer::PreTokenizedStream; +use crate::tokenizer::{BoxedTokenizer, TokenizerManager}; use crate::tokenizer::{TokenStream, TokenStreamChain, Tokenizer}; use crate::DocId; use crate::Opstamp; @@ -66,11 +66,12 @@ impl SegmentWriter { pub fn for_segment( memory_budget: usize, mut segment: Segment, - schema: &Schema, + tokenizers: &TokenizerManager, ) -> Result { + let schema = segment.schema(); let table_num_bits = initial_table_size(memory_budget)?; let segment_serializer = SegmentSerializer::for_segment(&mut segment)?; - let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits); + let multifield_postings = MultiFieldPostingsWriter::new(&schema, table_num_bits); let tokenizers = schema .fields() .map( @@ -79,7 +80,7 @@ impl SegmentWriter { .get_indexing_options() .and_then(|text_index_option| { let tokenizer_name = &text_index_option.tokenizer(); - segment.index().tokenizers().get(tokenizer_name) + tokenizers.get(tokenizer_name) }), _ => None, }, @@ -88,9 +89,9 @@ impl SegmentWriter { Ok(SegmentWriter { max_doc: 0, multifield_postings, - fieldnorms_writer: FieldNormsWriter::for_schema(schema), + fieldnorms_writer: FieldNormsWriter::for_schema(&schema), segment_serializer, - fast_field_writers: FastFieldsWriter::from_schema(schema), + fast_field_writers: FastFieldsWriter::from_schema(&schema), doc_opstamps: Vec::with_capacity(1_000), tokenizers, }) diff --git a/src/postings/mod.rs b/src/postings/mod.rs index b66beb413..cf6352765 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -220,7 +220,7 @@ pub mod tests { { let mut segment_writer = - SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap(); + SegmentWriter::for_segment(3_000_000, segment.clone(), index.tokenizers()).unwrap(); { let mut doc = Document::default(); // checking that position works if the field has two values