diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index 8291cf032..a8f9c50f8 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -239,7 +239,7 @@ impl InnerSegmentMeta { /// /// Contains settings which are applied on the whole /// index, like presort documents. -#[derive(Clone, Debug, Default, Serialize, Deserialize, Eq, PartialEq)] +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub struct IndexSettings { /// Sorts the documents by information /// provided in `IndexSortByField` @@ -248,7 +248,26 @@ pub struct IndexSettings { /// The `Compressor` used to compress the doc store. #[serde(default)] pub docstore_compression: Compressor, + #[serde(default = "default_docstore_blocksize")] + /// Size threshold in bytes (before compression) of a doc store block: once the block being built exceeds it, the block is compressed and written to disk + pub docstore_blocksize: usize, } + +/// Must be a function to be compatible with serde defaults +fn default_docstore_blocksize() -> usize { + 16_384 +} + +impl Default for IndexSettings { + fn default() -> Self { + Self { + sort_by_field: None, + docstore_compression: Compressor::default(), + docstore_blocksize: default_docstore_blocksize(), + } + } +} + /// Settings to presort the documents in an index /// /// Presorting documents can greatly performance diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index 923b5b6dd..554503e66 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -39,9 +39,10 @@ impl SegmentSerializer { let postings_serializer = InvertedIndexSerializer::open(&mut segment)?; let compressor = segment.index().settings().docstore_compression; + let blocksize = segment.index().settings().docstore_blocksize; Ok(SegmentSerializer { segment, - store_writer: StoreWriter::new(store_write, compressor), + store_writer: StoreWriter::new(store_write, compressor, blocksize), fast_field_serializer, fieldnorms_serializer: Some(fieldnorms_serializer), postings_serializer, diff --git 
a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 537b4f502..c1ae1c6e8 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -372,9 +372,10 @@ fn remap_and_write( .segment_mut() .open_write(SegmentComponent::Store)?; let compressor = serializer.segment().index().settings().docstore_compression; + let block_size = serializer.segment().index().settings().docstore_blocksize; let old_store_writer = std::mem::replace( &mut serializer.store_writer, - StoreWriter::new(store_write, compressor), + StoreWriter::new(store_write, compressor, block_size), ); old_store_writer.close()?; let store_read = StoreReader::open( diff --git a/src/store/mod.rs b/src/store/mod.rs index b378d1db5..13f6d76b3 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -69,10 +69,13 @@ pub mod tests { sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt \ mollit anim id est laborum."; + const BLOCK_SIZE: usize = 16_384; + pub fn write_lorem_ipsum_store( writer: WritePtr, num_docs: usize, compressor: Compressor, + blocksize: usize, ) -> Schema { let mut schema_builder = Schema::builder(); let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored()); @@ -80,7 +83,7 @@ pub mod tests { schema_builder.add_text_field("title", TextOptions::default().set_stored()); let schema = schema_builder.build(); { - let mut store_writer = StoreWriter::new(writer, compressor); + let mut store_writer = StoreWriter::new(writer, compressor, blocksize); for i in 0..num_docs { let mut doc = Document::default(); doc.add_field_value(field_body, LOREM.to_string()); @@ -103,7 +106,7 @@ pub mod tests { let path = Path::new("store"); let directory = RamDirectory::create(); let store_wrt = directory.open_write(path)?; - let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::Lz4); + let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::Lz4, BLOCK_SIZE); let field_title = 
schema.get_field("title").unwrap(); let store_file = directory.open_read(path)?; let store = StoreReader::open(store_file)?; @@ -139,11 +142,11 @@ pub mod tests { Ok(()) } - fn test_store(compressor: Compressor) -> crate::Result<()> { + fn test_store(compressor: Compressor, blocksize: usize) -> crate::Result<()> { let path = Path::new("store"); let directory = RamDirectory::create(); let store_wrt = directory.open_write(path)?; - let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor); + let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor, blocksize); let field_title = schema.get_field("title").unwrap(); let store_file = directory.open_read(path)?; let store = StoreReader::open(store_file)?; @@ -169,22 +172,22 @@ pub mod tests { #[test] fn test_store_noop() -> crate::Result<()> { - test_store(Compressor::None) + test_store(Compressor::None, BLOCK_SIZE) } #[cfg(feature = "lz4-compression")] #[test] fn test_store_lz4_block() -> crate::Result<()> { - test_store(Compressor::Lz4) + test_store(Compressor::Lz4, BLOCK_SIZE) } #[cfg(feature = "snappy-compression")] #[test] fn test_store_snap() -> crate::Result<()> { - test_store(Compressor::Snappy) + test_store(Compressor::Snappy, BLOCK_SIZE) } #[cfg(feature = "brotli-compression")] #[test] fn test_store_brotli() -> crate::Result<()> { - test_store(Compressor::Brotli) + test_store(Compressor::Brotli, BLOCK_SIZE) } #[test] diff --git a/src/store/reader.rs b/src/store/reader.rs index f3dadda2a..830396b08 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -304,6 +304,8 @@ mod tests { use crate::store::tests::write_lorem_ipsum_store; use crate::Directory; + const BLOCK_SIZE: usize = 16_384; + fn get_text_field<'a>(doc: &'a Document, field: &'a Field) -> Option<&'a str> { doc.get_first(*field).and_then(|f| f.as_text()) } @@ -313,7 +315,7 @@ mod tests { let directory = RamDirectory::create(); let path = Path::new("store"); let writer = directory.open_write(path)?; - let schema = 
write_lorem_ipsum_store(writer, 500, Compressor::default()); + let schema = write_lorem_ipsum_store(writer, 500, Compressor::default(), BLOCK_SIZE); let title = schema.get_field("title").unwrap(); let store_file = directory.open_read(path)?; let store = StoreReader::open(store_file)?; diff --git a/src/store/writer.rs b/src/store/writer.rs index 0efdb0fc6..96864c75b 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -11,8 +11,6 @@ use crate::schema::Document; use crate::store::index::Checkpoint; use crate::DocId; -const BLOCK_SIZE: usize = 16_384; - /// Write tantivy's [`Store`](./index.html) /// /// Contrary to the other components of `tantivy`, @@ -22,6 +20,7 @@ const BLOCK_SIZE: usize = 16_384; /// The skip list index on the other hand, is built in memory. pub struct StoreWriter { compressor: Compressor, + block_size: usize, doc: DocId, first_doc_in_block: DocId, offset_index_writer: SkipIndexBuilder, @@ -35,9 +34,10 @@ impl StoreWriter { /// /// The store writer will writes blocks on disc as /// document are added. - pub fn new(writer: WritePtr, compressor: Compressor) -> StoreWriter { + pub fn new(writer: WritePtr, compressor: Compressor, block_size: usize) -> StoreWriter { StoreWriter { compressor, + block_size, doc: 0, first_doc_in_block: 0, offset_index_writer: SkipIndexBuilder::new(), @@ -65,7 +65,7 @@ impl StoreWriter { VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?; self.current_block.write_all(serialized_document)?; self.doc += 1; - if self.current_block.len() > BLOCK_SIZE { + if self.current_block.len() > self.block_size { self.write_and_compress_block()?; } Ok(()) @@ -86,7 +86,7 @@ impl StoreWriter { self.current_block .write_all(&self.intermediary_buffer[..])?; self.doc += 1; - if self.current_block.len() > BLOCK_SIZE { + if self.current_block.len() > self.block_size { self.write_and_compress_block()?; } Ok(())