Segments embed their own directory and schema instead of relying on Index.

This commit is contained in:
Paul Masurel
2019-12-28 16:45:38 +09:00
parent d12a06b65b
commit a6a78fa607
5 changed files with 41 additions and 27 deletions

View File

@@ -3,7 +3,7 @@ use crate::core::Index;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::directory::error::{OpenReadError, OpenWriteError};
use crate::directory::Directory;
use crate::directory::{Directory, DirectoryClone};
use crate::directory::{ReadOnlySource, WritePtr};
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::schema::Schema;
@@ -14,12 +14,22 @@ use std::path::PathBuf;
use std::result;
/// A segment is a piece of the index.
#[derive(Clone)]
pub struct Segment {
index: Index,
schema: Schema,
directory: Box<dyn Directory>,
meta: SegmentMeta,
}
// Hand-written `Clone` for `Segment`, building the copy field by field.
// The `directory` field is a `Box<dyn Directory>` and is duplicated through
// `box_clone()`, while `schema` and `meta` use their regular `clone()`.
// NOTE(review): presumably this replaces a derived `Clone` because the boxed
// trait object cannot be cloned via `derive` — confirm against `DirectoryClone`.
impl Clone for Segment {
// Returns a new `Segment` with its own copies of the schema, directory
// handle and segment meta-information.
fn clone(&self) -> Self {
Segment {
schema: self.schema.clone(),
// Clone the boxed directory via the `box_clone` helper.
directory: self.directory.box_clone(),
meta: self.meta.clone(),
}
}
}
impl fmt::Debug for Segment {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Segment({:?})", self.id().uuid_string())
@@ -31,18 +41,18 @@ impl fmt::Debug for Segment {
/// The function is here to make it private outside `tantivy`.
/// #[doc(hidden)]
pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
Segment { index, meta }
Segment {
directory: index.directory().box_clone(),
schema: index.schema(),
meta,
}
}
impl Segment {
/// Returns the index the segment belongs to.
pub fn index(&self) -> &Index {
&self.index
}
/// Returns our index's schema.
// TODO return a ref.
pub fn schema(&self) -> Schema {
self.index.schema()
self.schema.clone()
}
/// Returns the segment meta-information
@@ -56,7 +66,8 @@ impl Segment {
/// as we finalize a fresh new segment.
pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment {
Segment {
index: self.index,
directory: self.directory,
schema: self.schema,
meta: self.meta.with_max_doc(max_doc),
}
}
@@ -64,7 +75,8 @@ impl Segment {
#[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
Segment {
index: self.index,
directory: self.directory,
schema: self.schema,
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),
}
}
@@ -88,7 +100,7 @@ impl Segment {
component: SegmentComponent,
) -> result::Result<ReadOnlySource, OpenReadError> {
let path = self.relative_path(component);
let source = self.index.directory().open_read(&path)?;
let source = self.directory.open_read(&path)?;
Ok(source)
}
@@ -98,7 +110,7 @@ impl Segment {
component: SegmentComponent,
) -> result::Result<WritePtr, OpenWriteError> {
let path = self.relative_path(component);
let write = self.index.directory_mut().open_write(&path)?;
let write = self.directory.open_write(&path)?;
Ok(write)
}
}

View File

@@ -23,6 +23,7 @@ use crate::indexer::SegmentWriter;
use crate::schema::Document;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::tokenizer::TokenizerManager;
use crate::Opstamp;
use crossbeam::channel;
use futures::executor::block_on;
@@ -189,11 +190,12 @@ fn index_documents(
segment: Segment,
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
segment_updater: &mut SegmentUpdater,
tokenizers: &TokenizerManager,
mut delete_cursor: DeleteCursor,
) -> crate::Result<bool> {
let mut segment_writer =
SegmentWriter::for_segment(memory_budget, segment.clone(), tokenizers)?;
let schema = segment.schema();
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
for document_group in grouped_document_iterator {
for doc in document_group {
segment_writer.add_document(doc, &schema)?;
@@ -434,6 +436,7 @@ impl IndexWriter {
segment,
&mut document_iterator,
&mut segment_updater,
index.tokenizers(),
delete_cursor.clone(),
)?;
}

View File

@@ -1,10 +1,8 @@
use super::segment_register::SegmentRegister;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::error::TantivyError;
use crate::indexer::delete_queue::DeleteCursor;
use crate::indexer::SegmentEntry;
use crate::Result as TantivyResult;
use std::collections::hash_set::HashSet;
use std::fmt::{self, Debug, Formatter};
use std::sync::RwLock;
@@ -145,7 +143,7 @@ impl SegmentManager {
/// Returns an error if some segments are missing, or if
/// the `segment_ids` are not either all committed or all
/// uncommitted.
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> crate::Result<Vec<SegmentEntry>> {
let registers_lock = self.read();
let mut segment_entries = vec![];
if registers_lock.uncommitted.contains_all(segment_ids) {
@@ -166,7 +164,7 @@ impl SegmentManager {
let error_msg = "Merge operation sent for segments that are not \
all uncommited or commited."
.to_string();
return Err(TantivyError::InvalidArgument(error_msg));
return Err(crate::Error::InvalidArgument(error_msg));
}
Ok(segment_entries)
}

View File

@@ -11,9 +11,9 @@ use crate::schema::Schema;
use crate::schema::Term;
use crate::schema::Value;
use crate::schema::{Field, FieldEntry};
use crate::tokenizer::BoxedTokenizer;
use crate::tokenizer::FacetTokenizer;
use crate::tokenizer::PreTokenizedStream;
use crate::tokenizer::{BoxedTokenizer, TokenizerManager};
use crate::tokenizer::{TokenStream, TokenStreamChain, Tokenizer};
use crate::DocId;
use crate::Opstamp;
@@ -66,11 +66,12 @@ impl SegmentWriter {
pub fn for_segment(
memory_budget: usize,
mut segment: Segment,
schema: &Schema,
tokenizers: &TokenizerManager,
) -> Result<SegmentWriter> {
let schema = segment.schema();
let table_num_bits = initial_table_size(memory_budget)?;
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
let multifield_postings = MultiFieldPostingsWriter::new(&schema, table_num_bits);
let tokenizers = schema
.fields()
.map(
@@ -79,7 +80,7 @@ impl SegmentWriter {
.get_indexing_options()
.and_then(|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer();
segment.index().tokenizers().get(tokenizer_name)
tokenizers.get(tokenizer_name)
}),
_ => None,
},
@@ -88,9 +89,9 @@ impl SegmentWriter {
Ok(SegmentWriter {
max_doc: 0,
multifield_postings,
fieldnorms_writer: FieldNormsWriter::for_schema(schema),
fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema(schema),
fast_field_writers: FastFieldsWriter::from_schema(&schema),
doc_opstamps: Vec::with_capacity(1_000),
tokenizers,
})

View File

@@ -220,7 +220,7 @@ pub mod tests {
{
let mut segment_writer =
SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap();
SegmentWriter::for_segment(3_000_000, segment.clone(), index.tokenizers()).unwrap();
{
let mut doc = Document::default();
// checking that position works if the field has two values