mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 01:52:54 +00:00
Segment embeds its own directory and schema instead of relying on Index.
This commit is contained in:
@@ -3,7 +3,7 @@ use crate::core::Index;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::directory::error::{OpenReadError, OpenWriteError};
|
||||
use crate::directory::Directory;
|
||||
use crate::directory::{Directory, DirectoryClone};
|
||||
use crate::directory::{ReadOnlySource, WritePtr};
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::schema::Schema;
|
||||
@@ -14,12 +14,22 @@ use std::path::PathBuf;
|
||||
use std::result;
|
||||
|
||||
/// A segment is a piece of the index.
|
||||
#[derive(Clone)]
|
||||
pub struct Segment {
|
||||
index: Index,
|
||||
schema: Schema,
|
||||
directory: Box<dyn Directory>,
|
||||
meta: SegmentMeta,
|
||||
}
|
||||
|
||||
impl Clone for Segment {
|
||||
fn clone(&self) -> Self {
|
||||
Segment {
|
||||
schema: self.schema.clone(),
|
||||
directory: self.directory.box_clone(),
|
||||
meta: self.meta.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Segment {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "Segment({:?})", self.id().uuid_string())
|
||||
@@ -31,18 +41,18 @@ impl fmt::Debug for Segment {
|
||||
/// The function is here to make it private outside `tantivy`.
|
||||
/// #[doc(hidden)]
|
||||
pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
|
||||
Segment { index, meta }
|
||||
Segment {
|
||||
directory: index.directory().box_clone(),
|
||||
schema: index.schema(),
|
||||
meta,
|
||||
}
|
||||
}
|
||||
|
||||
impl Segment {
|
||||
/// Returns the index the segment belongs to.
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.index
|
||||
}
|
||||
|
||||
/// Returns our index's schema.
|
||||
// TODO return a ref.
|
||||
pub fn schema(&self) -> Schema {
|
||||
self.index.schema()
|
||||
self.schema.clone()
|
||||
}
|
||||
|
||||
/// Returns the segment meta-information
|
||||
@@ -56,7 +66,8 @@ impl Segment {
|
||||
/// as we finalize a fresh new segment.
|
||||
pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment {
|
||||
Segment {
|
||||
index: self.index,
|
||||
directory: self.directory,
|
||||
schema: self.schema,
|
||||
meta: self.meta.with_max_doc(max_doc),
|
||||
}
|
||||
}
|
||||
@@ -64,7 +75,8 @@ impl Segment {
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
|
||||
Segment {
|
||||
index: self.index,
|
||||
directory: self.directory,
|
||||
schema: self.schema,
|
||||
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),
|
||||
}
|
||||
}
|
||||
@@ -88,7 +100,7 @@ impl Segment {
|
||||
component: SegmentComponent,
|
||||
) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
let path = self.relative_path(component);
|
||||
let source = self.index.directory().open_read(&path)?;
|
||||
let source = self.directory.open_read(&path)?;
|
||||
Ok(source)
|
||||
}
|
||||
|
||||
@@ -98,7 +110,7 @@ impl Segment {
|
||||
component: SegmentComponent,
|
||||
) -> result::Result<WritePtr, OpenWriteError> {
|
||||
let path = self.relative_path(component);
|
||||
let write = self.index.directory_mut().open_write(&path)?;
|
||||
let write = self.directory.open_write(&path)?;
|
||||
Ok(write)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ use crate::indexer::SegmentWriter;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::Term;
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
use crate::Opstamp;
|
||||
use crossbeam::channel;
|
||||
use futures::executor::block_on;
|
||||
@@ -189,11 +190,12 @@ fn index_documents(
|
||||
segment: Segment,
|
||||
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
tokenizers: &TokenizerManager,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> crate::Result<bool> {
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(memory_budget, segment.clone(), tokenizers)?;
|
||||
let schema = segment.schema();
|
||||
|
||||
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
|
||||
for document_group in grouped_document_iterator {
|
||||
for doc in document_group {
|
||||
segment_writer.add_document(doc, &schema)?;
|
||||
@@ -434,6 +436,7 @@ impl IndexWriter {
|
||||
segment,
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
index.tokenizers(),
|
||||
delete_cursor.clone(),
|
||||
)?;
|
||||
}
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
use super::segment_register::SegmentRegister;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::error::TantivyError;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::indexer::SegmentEntry;
|
||||
use crate::Result as TantivyResult;
|
||||
use std::collections::hash_set::HashSet;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use std::sync::RwLock;
|
||||
@@ -145,7 +143,7 @@ impl SegmentManager {
|
||||
/// Returns an error if some segments are missing, or if
|
||||
/// the `segment_ids` are not either all committed or all
|
||||
/// uncommitted.
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> crate::Result<Vec<SegmentEntry>> {
|
||||
let registers_lock = self.read();
|
||||
let mut segment_entries = vec![];
|
||||
if registers_lock.uncommitted.contains_all(segment_ids) {
|
||||
@@ -166,7 +164,7 @@ impl SegmentManager {
|
||||
let error_msg = "Merge operation sent for segments that are not \
|
||||
all uncommited or commited."
|
||||
.to_string();
|
||||
return Err(TantivyError::InvalidArgument(error_msg));
|
||||
return Err(crate::Error::InvalidArgument(error_msg));
|
||||
}
|
||||
Ok(segment_entries)
|
||||
}
|
||||
|
||||
@@ -11,9 +11,9 @@ use crate::schema::Schema;
|
||||
use crate::schema::Term;
|
||||
use crate::schema::Value;
|
||||
use crate::schema::{Field, FieldEntry};
|
||||
use crate::tokenizer::BoxedTokenizer;
|
||||
use crate::tokenizer::FacetTokenizer;
|
||||
use crate::tokenizer::PreTokenizedStream;
|
||||
use crate::tokenizer::{BoxedTokenizer, TokenizerManager};
|
||||
use crate::tokenizer::{TokenStream, TokenStreamChain, Tokenizer};
|
||||
use crate::DocId;
|
||||
use crate::Opstamp;
|
||||
@@ -66,11 +66,12 @@ impl SegmentWriter {
|
||||
pub fn for_segment(
|
||||
memory_budget: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema,
|
||||
tokenizers: &TokenizerManager,
|
||||
) -> Result<SegmentWriter> {
|
||||
let schema = segment.schema();
|
||||
let table_num_bits = initial_table_size(memory_budget)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(&schema, table_num_bits);
|
||||
let tokenizers = schema
|
||||
.fields()
|
||||
.map(
|
||||
@@ -79,7 +80,7 @@ impl SegmentWriter {
|
||||
.get_indexing_options()
|
||||
.and_then(|text_index_option| {
|
||||
let tokenizer_name = &text_index_option.tokenizer();
|
||||
segment.index().tokenizers().get(tokenizer_name)
|
||||
tokenizers.get(tokenizer_name)
|
||||
}),
|
||||
_ => None,
|
||||
},
|
||||
@@ -88,9 +89,9 @@ impl SegmentWriter {
|
||||
Ok(SegmentWriter {
|
||||
max_doc: 0,
|
||||
multifield_postings,
|
||||
fieldnorms_writer: FieldNormsWriter::for_schema(schema),
|
||||
fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
|
||||
segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
fast_field_writers: FastFieldsWriter::from_schema(&schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
tokenizers,
|
||||
})
|
||||
|
||||
@@ -220,7 +220,7 @@ pub mod tests {
|
||||
|
||||
{
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap();
|
||||
SegmentWriter::for_segment(3_000_000, segment.clone(), index.tokenizers()).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
// checking that position works if the field has two values
|
||||
|
||||
Reference in New Issue
Block a user