From c23a03ad8191ca928d96b668f8800a4ab22b2e13 Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Thu, 8 Oct 2020 16:36:51 +0900
Subject: [PATCH] Large API Change in the Directory API. (#901)

Tantivy used to assume that all files could be somehow memory mapped.
After this change, the Directory API returns a `FileSlice` that can be
reduced and eventually read into an `OwnedBytes` object. Long and
blocking IO operations are still required, but they no longer span the
entire file.
---
 CHANGELOG.md                             |   1 +
 Cargo.toml                               |   2 -
 examples/iterating_docs_and_positions.rs |   8 +-
 src/collector/facet_collector.rs         |   6 +-
 src/common/bitpacker.rs                  |  24 +--
 src/common/composite_file.rs             |  66 +++---
 src/core/inverted_index_reader.rs        |  96 +++++----
 src/core/searcher.rs                     |  43 ++--
 src/core/segment.rs                      |   7 +-
 src/core/segment_reader.rs               | 117 +++++-----
 src/directory/directory.rs               |  13 +-
 src/directory/error.rs                   |  10 +
 src/directory/file_slice.rs              | 264 +++++++++++++++++++++++
 src/directory/footer.rs                  |  25 ++-
 src/directory/managed_directory.rs       |  68 +++---
 src/directory/mmap_directory.rs          |  77 +++----
 src/directory/mod.rs                     |   7 +-
 src/directory/owned_bytes.rs             | 239 ++++++++++++++++++++
 src/directory/ram_directory.rs           |  25 ++-
 src/directory/read_only_source.rs        | 137 ------------
 src/directory/tests.rs                   |  81 ++++---
 src/fastfield/bytes/mod.rs               |   4 +-
 src/fastfield/bytes/reader.rs            |  17 +-
 src/fastfield/delete.rs                  |  24 ++-
 src/fastfield/mod.rs                     | 122 +++++------
 src/fastfield/reader.rs                  |  37 ++--
 src/fastfield/readers.rs                 |  48 +++--
 src/fieldnorm/reader.rs                  |  30 +--
 src/indexer/index_writer.rs              |  41 +++-
 src/indexer/merger.rs                    |  18 +-
 src/lib.rs                               |  88 ++++----
 src/positions/mod.rs                     |  27 ++-
 src/positions/reader.rs                  |  74 ++++---
 src/postings/block_segment_postings.rs   |  67 +++---
 src/postings/mod.rs                      |  89 ++++----
 src/postings/postings_writer.rs          |   2 +-
 src/postings/segment_postings.rs         |  18 +-
 src/postings/skip.rs                     |  78 +++----
 src/postings/stacker/expull.rs           |   6 +-
 src/query/automaton_weight.rs            |   4 +-
 src/query/bm25.rs                        |  28 +--
 src/query/phrase_query/phrase_query.rs   |   2 +-
 src/query/phrase_query/phrase_weight.rs  |   8 +-
 src/query/range_query.rs                 |   4 +-
 src/query/term_query/term_query.rs       |  18 +-
 src/query/term_query/term_scorer.rs      |   2 +-
 src/query/term_query/term_weight.rs      |  12 +-
 src/reader/mod.rs                        |   6 +-
 src/schema/term.rs                       |   2 +-
 src/snippet/mod.rs                       |  22 +-
 src/space_usage/mod.rs                   |  46 ++--
 src/store/mod.rs                         |  18 +-
 src/store/reader.rs                      |  54 ++---
 src/store/writer.rs                      |   3 +-
 src/termdict/mod.rs                      | 162 ++++++--------
 src/termdict/term_info_store.rs          |  57 +++--
 src/termdict/termdict.rs                 |  47 ++--
 tests/failpoints/mod.rs                  |  13 +-
 58 files changed, 1497 insertions(+), 1117 deletions(-)
 create mode 100644 src/directory/file_slice.rs
 create mode 100644 src/directory/owned_bytes.rs
 delete mode 100644 src/directory/read_only_source.rs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 199e9c215..2f3f0dea7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@ Tantivy 0.14.0
 - Remove dependency on atomicwrites #833. (Implemented by @pmasurel upon suggestion and research from @asafigan.)
 - Migrated tantivy error from the now deprecated `failure` crate to `thiserror` #760. (@hirevo)
 - API Change. Accessing the typed value of a `Schema::Value` now returns an Option instead of panicking if the type does not match.
+- Large API Change in the Directory API. Tantivy used to assume that all files could be somehow memory mapped. After this change, the Directory API returns a `FileSlice` that can be reduced and eventually read into an `OwnedBytes` object. Long and blocking IO operations are still required, but they no longer span the entire file.

 Tantivy 0.13.2
 ===================
diff --git a/Cargo.toml b/Cargo.toml
index efd59bdf5..d44fc2283 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,7 +33,6 @@ notify = {version="4", optional=true}
 uuid = { version = "0.8", features = ["v4", "serde"] }
 crossbeam = "0.7"
 futures = {version = "0.3", features=["thread-pool"] }
-owning_ref = "0.4"
 tantivy-query-grammar = { version="0.14.0-dev", path="./query-grammar" }
 stable_deref_trait = "1"
 rust-stemmers = "1"
@@ -41,7 +40,6 @@ downcast-rs = "1"
 bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
 census = "0.4"
 fnv = "1"
-owned-read = "0.4"
 thiserror = "1.0"
 htmlescape = "0.3"
 fail = "0.4"
diff --git a/examples/iterating_docs_and_positions.rs b/examples/iterating_docs_and_positions.rs
index d55186bf6..3055895e4 100644
--- a/examples/iterating_docs_and_positions.rs
+++ b/examples/iterating_docs_and_positions.rs
@@ -45,7 +45,7 @@ fn main() -> tantivy::Result<()> {
     // Inverted index stands for the combination of
     // - the term dictionary
     // - the inverted lists associated with each term and their positions
-    let inverted_index = segment_reader.inverted_index(title);
+    let inverted_index = segment_reader.inverted_index(title)?;

     // A `Term` is a text token associated with a field.
     // Let's go through all docs containing the term `title:the` and access their position
@@ -58,7 +58,7 @@ fn main() -> tantivy::Result<()> {
     // If you don't need all this information, you may get better performance by decompressing less
     // information.
     if let Some(mut segment_postings) =
-        inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)
+        inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)?
     {
         // this buffer will be used to request for positions
         let mut positions: Vec<u32> = Vec::with_capacity(100);
@@ -106,7 +106,7 @@ fn main() -> tantivy::Result<()> {
     // Inverted index stands for the combination of
     // - the term dictionary
     // - the inverted lists associated with each term and their positions
-    let inverted_index = segment_reader.inverted_index(title);
+    let inverted_index = segment_reader.inverted_index(title)?;

     // This segment posting object is like a cursor over the documents matching the term.
     // The `IndexRecordOption` argument tells tantivy we will be interested in both term frequencies
@@ -115,7 +115,7 @@ fn main() -> tantivy::Result<()> {
     // If you don't need all this information, you may get better performance by decompressing less
     // information.
     if let Some(mut block_segment_postings) =
-        inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)
+        inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)?
{ loop { let docs = block_segment_postings.docs(); diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 98b844fb6..9e86472cb 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -7,7 +7,6 @@ use crate::DocId; use crate::Score; use crate::SegmentLocalId; use crate::SegmentReader; -use crate::TantivyError; use std::cmp::Ordering; use std::collections::btree_map; use std::collections::BTreeMap; @@ -266,10 +265,7 @@ impl Collector for FacetCollector { _: SegmentLocalId, reader: &SegmentReader, ) -> crate::Result { - let field_name = reader.schema().get_field_name(self.field); - let facet_reader = reader.facet_reader(self.field).ok_or_else(|| { - TantivyError::SchemaError(format!("Field {:?} is not a facet field.", field_name)) - })?; + let facet_reader = reader.facet_reader(self.field)?; let mut collapse_mapping = Vec::new(); let mut counts = Vec::new(); diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 22a21bcd5..640d8adcf 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -1,6 +1,7 @@ use byteorder::{ByteOrder, LittleEndian, WriteBytesExt}; use std::io; -use std::ops::Deref; + +use crate::directory::OwnedBytes; pub(crate) struct BitPacker { mini_buffer: u64, @@ -60,20 +61,14 @@ impl BitPacker { } #[derive(Clone)] -pub struct BitUnpacker -where - Data: Deref, -{ +pub struct BitUnpacker { num_bits: u64, mask: u64, - data: Data, + data: OwnedBytes, } -impl BitUnpacker -where - Data: Deref, -{ - pub fn new(data: Data, num_bits: u8) -> BitUnpacker { +impl BitUnpacker { + pub fn new(data: OwnedBytes, num_bits: u8) -> BitUnpacker { let mask: u64 = if num_bits == 64 { !0u64 } else { @@ -90,7 +85,7 @@ where if self.num_bits == 0 { return 0u64; } - let data: &[u8] = &*self.data; + let data: &[u8] = self.data.as_slice(); let num_bits = self.num_bits; let mask = self.mask; let addr_in_bits = idx * num_bits; @@ -109,8 +104,9 @@ where #[cfg(test)] mod test { use super::{BitPacker, BitUnpacker}; + use crate::directory::OwnedBytes; - fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker>, Vec) { + fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec) { let mut data = Vec::new(); let mut bitpacker = BitPacker::new(); let max_val: u64 = (1u64 << num_bits as u64) - 1u64; @@ -122,7 +118,7 @@ mod test { } bitpacker.close(&mut data).unwrap(); assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7); - let bitunpacker = BitUnpacker::new(data, num_bits); + let bitunpacker = BitUnpacker::new(OwnedBytes::new(data), num_bits); (bitunpacker, vals) } diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index 34cfe2a59..775bd62a9 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -1,14 +1,15 @@ use crate::common::BinarySerializable; use crate::common::CountingWriter; use crate::common::VInt; -use crate::directory::ReadOnlySource; +use crate::directory::FileSlice; use crate::directory::{TerminatingWrite, WritePtr}; use crate::schema::Field; use crate::space_usage::FieldUsage; use crate::space_usage::PerFieldSpaceUsage; use std::collections::HashMap; -use std::io::Write; -use std::io::{self, Read}; +use std::io::{self, Read, Write}; + +use super::HasLen; #[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)] pub struct FileAddr { @@ -103,25 +104,26 @@ impl CompositeWrite { /// for each field. 
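
The `CompositeFile` changes below are among the first consumers of the new API, so it may help to see the new read path end to end first. A minimal sketch of the post-change calling convention; the path and file contents are made up, and `RAMDirectory` merely stands in for any `Directory` implementation:

    use std::io::Write;
    use std::path::Path;

    use tantivy::directory::{Directory, RAMDirectory, TerminatingWrite};

    fn main() -> tantivy::Result<()> {
        let mut directory = RAMDirectory::create();
        let path = Path::new("some_file");
        let mut wrt = directory.open_write(path)?;
        wrt.write_all(b"hello, tantivy")?;
        wrt.terminate()?;
        // `open_read` now returns a lazy `FileSlice` instead of a
        // `ReadOnlySource`; no bytes are read at this point.
        let file_slice = directory.open_read(path)?;
        // The slice can be narrowed cheaply before any IO happens...
        let hello = file_slice.slice(0, 5);
        // ...and `read_bytes` performs the only blocking read, limited
        // to the requested range.
        assert_eq!(hello.read_bytes()?.as_slice(), b"hello");
        Ok(())
    }
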
#[derive(Clone)] pub struct CompositeFile { - data: ReadOnlySource, + data: FileSlice, offsets_index: HashMap, } impl CompositeFile { /// Opens a composite file stored in a given - /// `ReadOnlySource`. - pub fn open(data: &ReadOnlySource) -> io::Result { + /// `FileSlice`. + pub fn open(data: &FileSlice) -> io::Result { let end = data.len(); - let footer_len_data = data.slice_from(end - 4); + let footer_len_data = data.slice_from(end - 4).read_bytes()?; let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize; let footer_start = end - 4 - footer_len; - let footer_data = data.slice(footer_start, footer_start + footer_len); + let footer_data = data + .slice(footer_start, footer_start + footer_len) + .read_bytes()?; let mut footer_buffer = footer_data.as_slice(); let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize; let mut file_addrs = vec![]; let mut offsets = vec![]; - let mut field_index = HashMap::new(); let mut offset = 0; @@ -150,19 +152,19 @@ impl CompositeFile { pub fn empty() -> CompositeFile { CompositeFile { offsets_index: HashMap::new(), - data: ReadOnlySource::empty(), + data: FileSlice::empty(), } } - /// Returns the `ReadOnlySource` associated + /// Returns the `FileSlice` associated /// to a given `Field` and stored in a `CompositeFile`. - pub fn open_read(&self, field: Field) -> Option { + pub fn open_read(&self, field: Field) -> Option { self.open_read_with_idx(field, 0) } - /// Returns the `ReadOnlySource` associated + /// Returns the `FileSlice` associated /// to a given `Field` and stored in a `CompositeFile`. - pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option { + pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option { self.offsets_index .get(&FileAddr { field, idx }) .map(|&(from, to)| self.data.slice(from, to)) @@ -192,46 +194,44 @@ mod test { use std::path::Path; #[test] - fn test_composite_file() { + fn test_composite_file() -> crate::Result<()> { let path = Path::new("test_path"); let mut directory = RAMDirectory::create(); { let w = directory.open_write(path).unwrap(); let mut composite_write = CompositeWrite::wrap(w); - { - let mut write_0 = composite_write.for_field(Field::from_field_id(0u32)); - VInt(32431123u64).serialize(&mut write_0).unwrap(); - write_0.flush().unwrap(); - } - - { - let mut write_4 = composite_write.for_field(Field::from_field_id(4u32)); - VInt(2).serialize(&mut write_4).unwrap(); - write_4.flush().unwrap(); - } - composite_write.close().unwrap(); + let mut write_0 = composite_write.for_field(Field::from_field_id(0u32)); + VInt(32431123u64).serialize(&mut write_0)?; + write_0.flush()?; + let mut write_4 = composite_write.for_field(Field::from_field_id(4u32)); + VInt(2).serialize(&mut write_4)?; + write_4.flush()?; + composite_write.close()?; } { - let r = directory.open_read(path).unwrap(); - let composite_file = CompositeFile::open(&r).unwrap(); + let r = directory.open_read(path)?; + let composite_file = CompositeFile::open(&r)?; { let file0 = composite_file .open_read(Field::from_field_id(0u32)) - .unwrap(); + .unwrap() + .read_bytes()?; let mut file0_buf = file0.as_slice(); - let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0; + let payload_0 = VInt::deserialize(&mut file0_buf)?.0; assert_eq!(file0_buf.len(), 0); assert_eq!(payload_0, 32431123u64); } { let file4 = composite_file .open_read(Field::from_field_id(4u32)) - .unwrap(); + .unwrap() + .read_bytes()?; let mut file4_buf = file4.as_slice(); - let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0; 
+ let payload_4 = VInt::deserialize(&mut file4_buf)?.0; assert_eq!(file4_buf.len(), 0); assert_eq!(payload_4, 2u64); } } + Ok(()) } } diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index e62f65cf5..dec2bf684 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -1,5 +1,7 @@ +use std::io; + use crate::common::BinarySerializable; -use crate::directory::ReadOnlySource; +use crate::directory::FileSlice; use crate::positions::PositionReader; use crate::postings::TermInfo; use crate::postings::{BlockSegmentPostings, SegmentPostings}; @@ -14,7 +16,7 @@ use crate::termdict::TermDictionary; /// /// It is safe to delete the segment associated to /// an `InvertedIndexReader`. As long as it is open, -/// the `ReadOnlySource` it is relying on should +/// the `FileSlice` it is relying on should /// stay available. /// /// @@ -22,9 +24,9 @@ use crate::termdict::TermDictionary; /// the `SegmentReader`'s [`.inverted_index(...)`] method pub struct InvertedIndexReader { termdict: TermDictionary, - postings_source: ReadOnlySource, - positions_source: ReadOnlySource, - positions_idx_source: ReadOnlySource, + postings_file_slice: FileSlice, + positions_file_slice: FileSlice, + positions_idx_file_slice: FileSlice, record_option: IndexRecordOption, total_num_tokens: u64, } @@ -33,22 +35,21 @@ impl InvertedIndexReader { #[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symmetry pub(crate) fn new( termdict: TermDictionary, - postings_source: ReadOnlySource, - positions_source: ReadOnlySource, - positions_idx_source: ReadOnlySource, + postings_file_slice: FileSlice, + positions_file_slice: FileSlice, + positions_idx_file_slice: FileSlice, record_option: IndexRecordOption, - ) -> InvertedIndexReader { - let total_num_tokens_data = postings_source.slice(0, 8); - let mut total_num_tokens_cursor = total_num_tokens_data.as_slice(); - let total_num_tokens = u64::deserialize(&mut total_num_tokens_cursor).unwrap_or(0u64); - InvertedIndexReader { + ) -> io::Result { + let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8); + let total_num_tokens = u64::deserialize(&mut total_num_tokens_slice.read_bytes()?)?; + Ok(InvertedIndexReader { termdict, - postings_source: postings_source.slice_from(8), - positions_source, - positions_idx_source, + postings_file_slice: postings_body, + positions_file_slice, + positions_idx_file_slice, record_option, total_num_tokens, - } + }) } /// Creates an empty `InvertedIndexReader` object, which @@ -56,9 +57,9 @@ impl InvertedIndexReader { pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader { InvertedIndexReader { termdict: TermDictionary::empty(), - postings_source: ReadOnlySource::empty(), - positions_source: ReadOnlySource::empty(), - positions_idx_source: ReadOnlySource::empty(), + postings_file_slice: FileSlice::empty(), + positions_file_slice: FileSlice::empty(), + positions_idx_file_slice: FileSlice::empty(), record_option, total_num_tokens: 0u64, } @@ -88,11 +89,12 @@ impl InvertedIndexReader { &self, term_info: &TermInfo, block_postings: &mut BlockSegmentPostings, - ) { - let offset = term_info.postings_offset as usize; - let end_source = self.postings_source.len(); - let postings_slice = self.postings_source.slice(offset, end_source); - block_postings.reset(term_info.doc_freq, postings_slice); + ) -> io::Result<()> { + let postings_slice = self + .postings_file_slice + .slice_from(term_info.postings_offset as usize); + 
block_postings.reset(term_info.doc_freq, postings_slice.read_bytes()?); + Ok(()) } /// Returns a block postings given a `Term`. @@ -103,9 +105,11 @@ impl InvertedIndexReader { &self, term: &Term, option: IndexRecordOption, - ) -> Option { - self.get_term_info(term) + ) -> io::Result> { + Ok(self + .get_term_info(term) .map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option)) + .transpose()?) } /// Returns a block postings given a `term_info`. @@ -116,10 +120,10 @@ impl InvertedIndexReader { &self, term_info: &TermInfo, requested_option: IndexRecordOption, - ) -> BlockSegmentPostings { + ) -> io::Result { let offset = term_info.postings_offset as usize; - let postings_data = self.postings_source.slice_from(offset); - BlockSegmentPostings::from_data( + let postings_data = self.postings_file_slice.slice_from(offset); + BlockSegmentPostings::open( term_info.doc_freq, postings_data, self.record_option, @@ -135,20 +139,23 @@ impl InvertedIndexReader { &self, term_info: &TermInfo, option: IndexRecordOption, - ) -> SegmentPostings { - let block_postings = self.read_block_postings_from_terminfo(term_info, option); + ) -> io::Result { + let block_postings = self.read_block_postings_from_terminfo(term_info, option)?; let position_stream = { if option.has_positions() { - let position_reader = self.positions_source.clone(); - let skip_reader = self.positions_idx_source.clone(); + let position_reader = self.positions_file_slice.clone(); + let skip_reader = self.positions_idx_file_slice.clone(); let position_reader = - PositionReader::new(position_reader, skip_reader, term_info.positions_idx); + PositionReader::new(position_reader, skip_reader, term_info.positions_idx)?; Some(position_reader) } else { None } }; - SegmentPostings::from_block_postings(block_postings, position_stream) + Ok(SegmentPostings::from_block_postings( + block_postings, + position_stream, + )) } /// Returns the total number of tokens recorded for all documents @@ -167,24 +174,31 @@ impl InvertedIndexReader { /// For instance, requesting `IndexRecordOption::Freq` for a /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` /// with `DocId`s and frequencies. - pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option { + pub fn read_postings( + &self, + term: &Term, + option: IndexRecordOption, + ) -> io::Result> { self.get_term_info(term) .map(move |term_info| self.read_postings_from_terminfo(&term_info, option)) + .transpose() } pub(crate) fn read_postings_no_deletes( &self, term: &Term, option: IndexRecordOption, - ) -> Option { + ) -> io::Result> { self.get_term_info(term) .map(|term_info| self.read_postings_from_terminfo(&term_info, option)) + .transpose() } /// Returns the number of documents containing the term. - pub fn doc_freq(&self, term: &Term) -> u32 { - self.get_term_info(term) + pub fn doc_freq(&self, term: &Term) -> io::Result { + Ok(self + .get_term_info(term) .map(|term_info| term_info.doc_freq) - .unwrap_or(0u32) + .unwrap_or(0u32)) } } diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 8e8775efd..925e4c4e7 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -11,8 +11,8 @@ use crate::store::StoreReader; use crate::termdict::TermMerger; use crate::DocAddress; use crate::Index; -use std::fmt; use std::sync::Arc; +use std::{fmt, io}; /// Holds a list of `SegmentReader`s ready for search. 
/// @@ -32,17 +32,17 @@ impl Searcher { schema: Schema, index: Index, segment_readers: Vec, - ) -> Searcher { - let store_readers = segment_readers + ) -> io::Result { + let store_readers: Vec = segment_readers .iter() .map(SegmentReader::get_store_reader) - .collect(); - Searcher { + .collect::>>()?; + Ok(Searcher { schema, index, segment_readers, store_readers, - } + }) } /// Returns the `Index` associated to the `Searcher` @@ -75,13 +75,14 @@ impl Searcher { /// Return the overall number of documents containing /// the given term. - pub fn doc_freq(&self, term: &Term) -> u64 { - self.segment_readers - .iter() - .map(|segment_reader| { - u64::from(segment_reader.inverted_index(term.field()).doc_freq(term)) - }) - .sum::() + pub fn doc_freq(&self, term: &Term) -> crate::Result { + let mut total_doc_freq = 0; + for segment_reader in &self.segment_readers { + let inverted_index = segment_reader.inverted_index(term.field())?; + let doc_freq = inverted_index.doc_freq(term)?; + total_doc_freq += u64::from(doc_freq); + } + Ok(total_doc_freq) } /// Return the list of segment readers @@ -148,22 +149,22 @@ impl Searcher { } /// Return the field searcher associated to a `Field`. - pub fn field(&self, field: Field) -> FieldSearcher { - let inv_index_readers = self + pub fn field(&self, field: Field) -> crate::Result { + let inv_index_readers: Vec> = self .segment_readers .iter() .map(|segment_reader| segment_reader.inverted_index(field)) - .collect::>(); - FieldSearcher::new(inv_index_readers) + .collect::>>()?; + Ok(FieldSearcher::new(inv_index_readers)) } /// Summarize total space usage of this searcher. - pub fn space_usage(&self) -> SearcherSpaceUsage { + pub fn space_usage(&self) -> io::Result { let mut space_usage = SearcherSpaceUsage::new(); - for segment_reader in self.segment_readers.iter() { - space_usage.add_segment(segment_reader.space_usage()); + for segment_reader in &self.segment_readers { + space_usage.add_segment(segment_reader.space_usage()?); } - space_usage + Ok(space_usage) } } diff --git a/src/core/segment.rs b/src/core/segment.rs index 113706e78..efd1bcbb7 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -4,7 +4,7 @@ use crate::core::SegmentId; use crate::core::SegmentMeta; use crate::directory::error::{OpenReadError, OpenWriteError}; use crate::directory::Directory; -use crate::directory::{ReadOnlySource, WritePtr}; +use crate::directory::{FileSlice, WritePtr}; use crate::indexer::segment_serializer::SegmentSerializer; use crate::schema::Schema; use crate::Opstamp; @@ -78,10 +78,9 @@ impl Segment { } /// Open one of the component file for a *regular* read. - pub fn open_read(&self, component: SegmentComponent) -> Result { + pub fn open_read(&self, component: SegmentComponent) -> Result { let path = self.relative_path(component); - let source = self.index.directory().open_read(&path)?; - Ok(source) + self.index.directory().open_read(&path) } /// Open one of the component file for *regular* write. 
diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index d89a3755c..e2ce45291 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -1,10 +1,9 @@ -use crate::common::CompositeFile; use crate::common::HasLen; use crate::core::InvertedIndexReader; use crate::core::Segment; use crate::core::SegmentComponent; use crate::core::SegmentId; -use crate::directory::ReadOnlySource; +use crate::directory::FileSlice; use crate::fastfield::DeleteBitSet; use crate::fastfield::FacetReader; use crate::fastfield::FastFieldReaders; @@ -16,11 +15,12 @@ use crate::space_usage::SegmentSpaceUsage; use crate::store::StoreReader; use crate::termdict::TermDictionary; use crate::DocId; +use crate::{common::CompositeFile, error::DataCorruption}; use fail::fail_point; -use std::collections::HashMap; use std::fmt; use std::sync::Arc; use std::sync::RwLock; +use std::{collections::HashMap, io}; /// Entry point to access all of the datastructures of the `Segment` /// @@ -50,7 +50,7 @@ pub struct SegmentReader { fast_fields_readers: Arc, fieldnorm_readers: FieldNormReaders, - store_source: ReadOnlySource, + store_file: FileSlice, delete_bitset_opt: Option, schema: Schema, } @@ -106,19 +106,26 @@ impl SegmentReader { } /// Accessor to the `FacetReader` associated to a given `Field`. - pub fn facet_reader(&self, field: Field) -> Option { + pub fn facet_reader(&self, field: Field) -> crate::Result { let field_entry = self.schema.get_field_entry(field); if field_entry.field_type() != &FieldType::HierarchicalFacet { - return None; + return Err(crate::TantivyError::InvalidArgument(format!( + "Field {:?} is not a facet field.", + field_entry.name() + ))); } - let term_ords_reader = self.fast_fields().u64s(field)?; + let term_ords_reader = self.fast_fields().u64s(field).ok_or_else(|| { + DataCorruption::comment_only(format!( + "Cannot find data for hierarchical facet {:?}", + field_entry.name() + )) + })?; let termdict = self .termdict_composite .open_read(field) - .map(|source| TermDictionary::from_source(&source)) - .unwrap_or_else(TermDictionary::empty); - let facet_reader = FacetReader::new(term_ords_reader, termdict); - Some(facet_reader) + .map(TermDictionary::open) + .unwrap_or_else(|| Ok(TermDictionary::empty()))?; + Ok(FacetReader::new(term_ords_reader, termdict)) } /// Accessor to the segment's `Field norms`'s reader. @@ -129,7 +136,7 @@ impl SegmentReader { /// They are simply stored as a fast field, serialized in /// the `.fieldnorm` file of the segment. pub fn get_fieldnorms_reader(&self, field: Field) -> crate::Result { - self.fieldnorm_readers.get_field(field).ok_or_else(|| { + self.fieldnorm_readers.get_field(field)?.ok_or_else(|| { let field_name = self.schema.get_field_name(field); let err_msg = format!( "Field norm not found for field {:?}. Was it marked as indexed during indexing?", @@ -140,33 +147,33 @@ impl SegmentReader { } /// Accessor to the segment's `StoreReader`. - pub fn get_store_reader(&self) -> StoreReader { - StoreReader::from_source(self.store_source.clone()) + pub fn get_store_reader(&self) -> io::Result { + StoreReader::open(self.store_file.clone()) } /// Open a new segment for reading. 
pub fn open(segment: &Segment) -> crate::Result { - let termdict_source = segment.open_read(SegmentComponent::TERMS)?; - let termdict_composite = CompositeFile::open(&termdict_source)?; + let termdict_file = segment.open_read(SegmentComponent::TERMS)?; + let termdict_composite = CompositeFile::open(&termdict_file)?; - let store_source = segment.open_read(SegmentComponent::STORE)?; + let store_file = segment.open_read(SegmentComponent::STORE)?; fail_point!("SegmentReader::open#middle"); - let postings_source = segment.open_read(SegmentComponent::POSTINGS)?; - let postings_composite = CompositeFile::open(&postings_source)?; + let postings_file = segment.open_read(SegmentComponent::POSTINGS)?; + let postings_composite = CompositeFile::open(&postings_file)?; let positions_composite = { - if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) { - CompositeFile::open(&source)? + if let Ok(positions_file) = segment.open_read(SegmentComponent::POSITIONS) { + CompositeFile::open(&positions_file)? } else { CompositeFile::empty() } }; let positions_idx_composite = { - if let Ok(source) = segment.open_read(SegmentComponent::POSITIONSSKIP) { - CompositeFile::open(&source)? + if let Ok(positions_skip_file) = segment.open_read(SegmentComponent::POSITIONSSKIP) { + CompositeFile::open(&positions_skip_file)? } else { CompositeFile::empty() } @@ -184,13 +191,14 @@ impl SegmentReader { let delete_bitset_opt = if segment.meta().has_deletes() { let delete_data = segment.open_read(SegmentComponent::DELETE)?; - Some(DeleteBitSet::open(delete_data)) + let delete_bitset = DeleteBitSet::open(delete_data)?; + Some(delete_bitset) } else { None }; Ok(SegmentReader { - inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())), + inv_idx_reader_cache: Default::default(), max_doc: segment.meta().max_doc(), num_docs: segment.meta().num_docs(), termdict_composite, @@ -198,7 +206,7 @@ impl SegmentReader { fast_fields_readers: fast_field_readers, fieldnorm_readers, segment_id: segment.id(), - store_source, + store_file, delete_bitset_opt, positions_composite, positions_idx_composite, @@ -218,14 +226,14 @@ impl SegmentReader { /// is returned. /// Similarly if the field is marked as indexed but no term has been indexed for the given /// index. an empty `InvertedIndexReader` is returned (but no warning is logged). - pub fn inverted_index(&self, field: Field) -> Arc { + pub fn inverted_index(&self, field: Field) -> crate::Result> { if let Some(inv_idx_reader) = self .inv_idx_reader_cache .read() .expect("Lock poisoned. This should never happen") .get(&field) { - return Arc::clone(inv_idx_reader); + return Ok(Arc::clone(inv_idx_reader)); } let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); @@ -235,41 +243,42 @@ impl SegmentReader { warn!("Field {:?} does not seem indexed.", field_entry.name()); } - let postings_source_opt = self.postings_composite.open_read(field); + let postings_file_opt = self.postings_composite.open_read(field); - if postings_source_opt.is_none() || record_option_opt.is_none() { + if postings_file_opt.is_none() || record_option_opt.is_none() { // no documents in the segment contained this field. // As a result, no data is associated to the inverted index. // // Returns an empty inverted index. 
let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic); - return Arc::new(InvertedIndexReader::empty(record_option)); + return Ok(Arc::new(InvertedIndexReader::empty(record_option))); } let record_option = record_option_opt.unwrap(); - let postings_source = postings_source_opt.unwrap(); + let postings_file = postings_file_opt.unwrap(); - let termdict_source = self.termdict_composite.open_read(field).expect( - "Failed to open field term dictionary in composite file. Is the field indexed?", - ); + let termdict_file: FileSlice = self.termdict_composite.open_read(field) + .ok_or_else(|| + DataCorruption::comment_only(format!("Failed to open field {:?}'s term dictionary in the composite file. Has the schema been modified?", field_entry.name())) + )?; - let positions_source = self + let positions_file = self .positions_composite .open_read(field) .expect("Index corrupted. Failed to open field positions in composite file."); - let positions_idx_source = self + let positions_idx_file = self .positions_idx_composite .open_read(field) .expect("Index corrupted. Failed to open field positions in composite file."); let inv_idx_reader = Arc::new(InvertedIndexReader::new( - TermDictionary::from_source(&termdict_source), - postings_source, - positions_source, - positions_idx_source, + TermDictionary::open(termdict_file)?, + postings_file, + positions_file, + positions_idx_file, record_option, - )); + )?); // by releasing the lock in between, we may end up opening the inverting index // twice, but this is fine. @@ -278,7 +287,7 @@ impl SegmentReader { .expect("Field reader cache lock poisoned. This should never happen.") .insert(field, Arc::clone(&inv_idx_reader)); - inv_idx_reader + Ok(inv_idx_reader) } /// Returns the segment id @@ -306,8 +315,8 @@ impl SegmentReader { } /// Summarize total space usage of this segment. 
- pub fn space_usage(&self) -> SegmentSpaceUsage { - SegmentSpaceUsage::new( + pub fn space_usage(&self) -> io::Result { + Ok(SegmentSpaceUsage::new( self.num_docs(), self.termdict_composite.space_usage(), self.postings_composite.space_usage(), @@ -315,12 +324,12 @@ impl SegmentReader { self.positions_idx_composite.space_usage(), self.fast_fields_readers.space_usage(), self.fieldnorm_readers.space_usage(), - self.get_store_reader().space_usage(), + self.get_store_reader()?.space_usage(), self.delete_bitset_opt .as_ref() .map(DeleteBitSet::space_usage) .unwrap_or(0), - ) + )) } } @@ -337,7 +346,7 @@ mod test { use crate::DocId; #[test] - fn test_alive_docs_iterator() { + fn test_alive_docs_iterator() -> crate::Result<()> { let mut schema_builder = Schema::builder(); schema_builder.add_text_field("name", TEXT | STORED); let schema = schema_builder.build(); @@ -345,26 +354,26 @@ mod test { let name = schema.get_field("name").unwrap(); { - let mut index_writer = index.writer_for_tests().unwrap(); + let mut index_writer = index.writer_for_tests()?; index_writer.add_document(doc!(name => "tantivy")); index_writer.add_document(doc!(name => "horse")); index_writer.add_document(doc!(name => "jockey")); index_writer.add_document(doc!(name => "cap")); - // we should now have one segment with two docs - index_writer.commit().unwrap(); + index_writer.commit()?; } { - let mut index_writer2 = index.writer(50_000_000).unwrap(); + let mut index_writer2 = index.writer(50_000_000)?; index_writer2.delete_term(Term::from_field_text(name, "horse")); index_writer2.delete_term(Term::from_field_text(name, "cap")); // ok, now we should have a deleted doc - index_writer2.commit().unwrap(); + index_writer2.commit()?; } - let searcher = index.reader().unwrap().searcher(); + let searcher = index.reader()?.searcher(); let docs: Vec = searcher.segment_reader(0).doc_ids_alive().collect(); assert_eq!(vec![0u32, 2u32], docs); + Ok(()) } } diff --git a/src/directory/directory.rs b/src/directory/directory.rs index 900d398c5..024746dc3 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -3,7 +3,7 @@ use crate::directory::error::LockError; use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError}; use crate::directory::WatchCallback; use crate::directory::WatchHandle; -use crate::directory::{ReadOnlySource, WritePtr}; +use crate::directory::{FileSlice, WritePtr}; use std::fmt; use std::io; use std::io::Write; @@ -11,7 +11,6 @@ use std::marker::Send; use std::marker::Sync; use std::path::Path; use std::path::PathBuf; -use std::result; use std::thread; use std::time::Duration; @@ -117,19 +116,19 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static { /// change. /// /// Specifically, subsequent writes or flushes should - /// have no effect on the returned `ReadOnlySource` object. + /// have no effect on the returned `FileSlice` object. /// /// You should only use this to read files create with [Directory::open_write]. - fn open_read(&self, path: &Path) -> result::Result; + fn open_read(&self, path: &Path) -> Result; /// Removes a file /// /// Removing a file will not affect an eventual - /// existing ReadOnlySource pointing to it. + /// existing FileSlice pointing to it. /// /// Removing a nonexistent file, yields a /// `DeleteError::DoesNotExist`. 
-    fn delete(&self, path: &Path) -> result::Result<(), DeleteError>;
+    fn delete(&self, path: &Path) -> Result<(), DeleteError>;

     /// Returns true iff the file exists
     fn exists(&self, path: &Path) -> bool;
@@ -139,7 +138,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
     ///
     /// Right after this call, the file should be created
     /// and any subsequent call to `open_read` for the
-    /// same path should return a `ReadOnlySource`.
+    /// same path should return a `FileSlice`.
     ///
     /// Write operations may be aggressively buffered.
     /// The client of this trait is responsible for calling flush
diff --git a/src/directory/error.rs b/src/directory/error.rs
index 1f0a7fc9b..f3856288e 100644
--- a/src/directory/error.rs
+++ b/src/directory/error.rs
@@ -57,6 +57,11 @@ pub enum OpenWriteError {
     },
 }

+impl OpenWriteError {
+    pub(crate) fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
+        Self::IOError { io_error, filepath }
+    }
+}
 /// Type of index incompatibility between the library and the index found on disk
 /// Used to catch and provide a hint to solve this incompatibility issue
 pub enum Incompatibility {
@@ -137,6 +142,11 @@ pub enum OpenReadError {
     IncompatibleIndex(Incompatibility),
 }

+impl OpenReadError {
+    pub(crate) fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
+        Self::IOError { io_error, filepath }
+    }
+}
 /// Error that may occur when trying to delete a file
 #[derive(Debug, Error)]
 pub enum DeleteError {
diff --git a/src/directory/file_slice.rs b/src/directory/file_slice.rs
new file mode 100644
index 000000000..f1ea23ecb
--- /dev/null
+++ b/src/directory/file_slice.rs
@@ -0,0 +1,264 @@
+use crate::common::HasLen;
+use crate::directory::OwnedBytes;
+use stable_deref_trait::{CloneStableDeref, StableDeref};
+use std::sync::Arc;
+use std::{io, ops::Deref};
+
+pub type BoxedData = Box<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
+
+/// Objects that represent file sections in tantivy.
+///
+/// These read objects are only in charge of delivering
+/// the data in the form of a constant read-only `&[u8]`.
+/// Whatever happens to the directory file, the data
+/// held by this object should never be altered or destroyed.
+pub trait FileSliceTrait: 'static + Send + Sync + HasLen {
+    fn read_bytes(&self) -> io::Result<OwnedBytes>;
+    fn slice(&self, from: usize, to: usize) -> FileSlice;
+}
+
+impl FileSliceTrait for &'static [u8] {
+    fn read_bytes(&self) -> io::Result<OwnedBytes> {
+        Ok(OwnedBytes::new(*self))
+    }
+
+    fn slice(&self, from: usize, to: usize) -> FileSlice {
+        FileSlice::from(&self[from..to])
+    }
+}
+
+impl HasLen for &'static [u8] {
+    fn len(&self) -> usize {
+        self.as_ref().len()
+    }
+}
+
+/// Logical slice of a read-only file in tantivy.
+///
+/// In other words, it is more or less equivalent to the triplet `(file, start_byteoffset, stop_offset)`.
+///
+/// FileSlice is a simple wrapper over an `Arc<Box<dyn FileSliceTrait>>`. It can
+/// be cloned cheaply.
+///
+/// The underlying behavior is therefore specific to the `Directory` that created it.
+/// Despite its name, a `FileSlice` may or may not directly map to an actual file
+/// on the filesystem.
+#[derive(Clone)]
+pub struct FileSlice(Arc<Box<dyn FileSliceTrait>>);
+
+impl FileSlice {
+    /// Creates a `FileSlice` from an object that dereferences to `&[u8]`.
+    pub fn new<D>(data: D) -> Self
+    where
+        D: Deref<Target = [u8]> + Send + Sync + 'static,
+    {
+        FileSlice::from(SlicedDeref::new(data))
+    }
+
+    /// Creates an empty `FileSlice`.
+    pub fn empty() -> FileSlice {
+        let data: &'static [u8] = &[];
+        FileSlice::from(data)
+    }
+
+    /// Returns an `OwnedBytes` with all of the data in the `FileSlice`.
+    ///
+    /// The behavior is strongly dependent on the implementation of the underlying
+    /// `Directory` and the `FileSliceTrait` it creates.
+    /// In particular, it is up to the `Directory` implementation
+    /// to handle caching if needed.
+    pub fn read_bytes(&self) -> io::Result<OwnedBytes> {
+        self.0.read_bytes()
+    }
+
+    /// Splits the file slice at the given offset and returns two file slices:
+    /// `file_slice[..split_offset]` and `file_slice[split_offset..]`.
+    ///
+    /// This operation is cheap and must not copy any underlying data.
+    pub fn split(self, left_len: usize) -> (FileSlice, FileSlice) {
+        let left = self.slice_to(left_len);
+        let right = self.slice_from(left_len);
+        (left, right)
+    }
+
+    /// Splits the file slice at `self.len() - right_len` and returns two file
+    /// slices, the second one holding the last `right_len` bytes.
+    pub fn split_from_end(self, right_len: usize) -> (FileSlice, FileSlice) {
+        let left_len = self.len() - right_len;
+        self.split(left_len)
+    }
+
+    /// Creates a FileSlice that is just a view over a slice of the data.
+    pub fn slice(&self, start: usize, stop: usize) -> FileSlice {
+        assert!(
+            start <= stop,
+            "Requested negative slice [{}..{}]",
+            start,
+            stop
+        );
+        assert!(stop <= self.len());
+        self.0.slice(start, stop)
+    }
+
+    /// Like `.slice(...)` but enforcing only the `from`
+    /// boundary.
+    ///
+    /// Equivalent to `.slice(from_offset, self.len())`
+    pub fn slice_from(&self, from_offset: usize) -> FileSlice {
+        self.slice(from_offset, self.len())
+    }
+
+    /// Like `.slice(...)` but enforcing only the `to`
+    /// boundary.
+    ///
+    /// Equivalent to `.slice(0, to_offset)`
+    pub fn slice_to(&self, to_offset: usize) -> FileSlice {
+        self.slice(0, to_offset)
+    }
+}
+
+impl HasLen for FileSlice {
+    fn len(&self) -> usize {
+        self.0.len()
+    }
+}
+
+impl<S: FileSliceTrait> From<S> for FileSlice {
+    fn from(file: S) -> Self {
+        FileSlice(Arc::new(Box::new(file)))
+    }
+}
+
+impl From<Arc<BoxedData>> for FileSlice {
+    fn from(data: Arc<BoxedData>) -> Self {
+        let slice_deref: SlicedDeref = SlicedDeref::from(data);
+        FileSlice::from(slice_deref)
+    }
+}
+
+/// `SlicedDeref` wraps an `Arc<BoxedData>` to implement `FileSliceTrait`.
+/// It keeps track of (start, stop) boundaries.
+#[derive(Clone)] +pub struct SlicedDeref { + data: Arc, + start: usize, + stop: usize, +} + +impl SlicedDeref { + /// Wraps a new `Deref` + pub fn new(data: D) -> Self + where + D: Deref + 'static + Send + Sync, + { + let len = data.len(); + SlicedDeref { + data: Arc::new(Box::new(data)), + start: 0, + stop: len, + } + } +} + +impl From> for SlicedDeref { + fn from(data: Arc) -> Self { + let len = data.len(); + SlicedDeref { + data, + start: 0, + stop: len, + } + } +} + +unsafe impl StableDeref for SlicedDeref {} +unsafe impl CloneStableDeref for SlicedDeref {} + +impl FileSliceTrait for SlicedDeref { + fn read_bytes(&self) -> io::Result { + Ok(OwnedBytes::new(self.clone())) + } + + fn slice(&self, from: usize, to: usize) -> FileSlice { + assert!(to <= self.len()); + FileSlice::from(SlicedDeref { + data: self.data.clone(), + start: self.start + from, + stop: self.start + to, + }) + } +} + +impl HasLen for SlicedDeref { + fn len(&self) -> usize { + self.stop - self.start + } +} + +impl Deref for SlicedDeref { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + &self.data.deref()[self.start..self.stop] + } +} + +#[cfg(test)] +mod tests { + use super::{FileSlice, FileSliceTrait, SlicedDeref}; + use crate::common::HasLen; + use std::io; + + #[test] + fn test_file_slice() -> io::Result<()> { + let file_slice = FileSlice::new(b"abcdef".as_ref()); + assert_eq!(file_slice.len(), 6); + assert_eq!(file_slice.slice_from(2).read_bytes()?.as_slice(), b"cdef"); + assert_eq!(file_slice.slice_to(2).read_bytes()?.as_slice(), b"ab"); + assert_eq!( + file_slice + .slice_from(1) + .slice_to(2) + .read_bytes()? + .as_slice(), + b"bc" + ); + { + let (left, right) = file_slice.clone().split(0); + assert_eq!(left.read_bytes()?.as_slice(), b""); + assert_eq!(right.read_bytes()?.as_slice(), b"abcdef"); + } + { + let (left, right) = file_slice.clone().split(2); + assert_eq!(left.read_bytes()?.as_slice(), b"ab"); + assert_eq!(right.read_bytes()?.as_slice(), b"cdef"); + } + { + let (left, right) = file_slice.clone().split_from_end(0); + assert_eq!(left.read_bytes()?.as_slice(), b"abcdef"); + assert_eq!(right.read_bytes()?.as_slice(), b""); + } + { + let (left, right) = file_slice.clone().split_from_end(2); + assert_eq!(left.read_bytes()?.as_slice(), b"abcd"); + assert_eq!(right.read_bytes()?.as_slice(), b"ef"); + } + Ok(()) + } + + #[test] + fn test_file_slice_trait_slice_len() { + let blop: &'static [u8] = b"abc"; + let owned_bytes: Box = Box::new(blop); + assert_eq!(owned_bytes.len(), 3); + } + + #[test] + fn test_slice_deref() -> io::Result<()> { + let slice_deref = SlicedDeref::new(&b"abcdef"[..]); + assert_eq!(slice_deref.len(), 6); + assert_eq!(slice_deref.read_bytes()?.as_ref(), b"abcdef"); + assert_eq!(slice_deref.slice(1, 4).read_bytes()?.as_ref(), b"bcd"); + Ok(()) + } +} diff --git a/src/directory/footer.rs b/src/directory/footer.rs index 33f6d06d4..3a696896f 100644 --- a/src/directory/footer.rs +++ b/src/directory/footer.rs @@ -1,9 +1,8 @@ -use crate::common::{BinarySerializable, CountingWriter, FixedSize, VInt}; +use crate::common::{BinarySerializable, CountingWriter, FixedSize, HasLen, VInt}; use crate::directory::error::Incompatibility; -use crate::directory::read_only_source::ReadOnlySource; +use crate::directory::FileSlice; use crate::directory::{AntiCallToken, TerminatingWrite}; use crate::Version; -use byteorder::{ByteOrder, LittleEndian, WriteBytesExt}; use crc32fast::Hasher; use std::io; use std::io::Write; @@ -64,26 +63,26 @@ impl Footer { let mut counting_write = 
CountingWriter::wrap(&mut write); self.serialize(&mut counting_write)?; let written_len = counting_write.written_bytes(); - write.write_u32::(written_len as u32)?; + (written_len as u32).serialize(write)?; Ok(()) } - pub fn extract_footer(source: ReadOnlySource) -> Result<(Footer, ReadOnlySource), io::Error> { - if source.len() < 4 { + pub fn extract_footer(file: FileSlice) -> io::Result<(Footer, FileSlice)> { + if file.len() < 4 { return Err(io::Error::new( io::ErrorKind::UnexpectedEof, format!( "File corrupted. The file is smaller than 4 bytes (len={}).", - source.len() + file.len() ), )); } - let (body_footer, footer_len_bytes) = source.split_from_end(u32::SIZE_IN_BYTES); - let footer_len = LittleEndian::read_u32(footer_len_bytes.as_slice()) as usize; - let body_len = body_footer.len() - footer_len; - let (body, footer_data) = body_footer.split(body_len); - let mut cursor = footer_data.as_slice(); - let footer = Footer::deserialize(&mut cursor)?; + let (body_footer, footer_len_file) = file.split_from_end(u32::SIZE_IN_BYTES); + let mut footer_len_bytes = footer_len_file.read_bytes()?; + let footer_len = u32::deserialize(&mut footer_len_bytes)? as usize; + let (body, footer) = body_footer.split_from_end(footer_len); + let mut footer_bytes = footer.read_bytes()?; + let footer = Footer::deserialize(&mut footer_bytes)?; Ok((footer, body)) } diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 7d4e6198c..13c6cf762 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -5,7 +5,7 @@ use crate::directory::DirectoryLock; use crate::directory::GarbageCollectionResult; use crate::directory::Lock; use crate::directory::META_LOCK; -use crate::directory::{ReadOnlySource, WritePtr}; +use crate::directory::{FileSlice, WritePtr}; use crate::directory::{WatchCallback, WatchHandle}; use crate::error::DataCorruption; use crate::Directory; @@ -86,12 +86,7 @@ impl ManagedDirectory { directory: Box::new(directory), meta_informations: Arc::default(), }), - Err(OpenReadError::IOError { io_error, filepath }) => { - Err(crate::TantivyError::OpenReadError(OpenReadError::IOError { - io_error, - filepath, - })) - } + io_err @ Err(OpenReadError::IOError { .. }) => Err(io_err.err().unwrap().into()), Err(OpenReadError::IncompatibleIndex(incompatibility)) => { // For the moment, this should never happen `meta.json` // do not have any footer and cannot detect incompatibility. 
@@ -241,8 +236,14 @@ impl ManagedDirectory { io_error, filepath: path.to_path_buf(), })?; + let bytes = data + .read_bytes() + .map_err(|io_error| OpenReadError::IOError { + filepath: path.to_path_buf(), + io_error, + })?; let mut hasher = Hasher::new(); - hasher.update(data.as_slice()); + hasher.update(bytes.as_slice()); let crc = hasher.finalize(); Ok(footer .versioned_footer @@ -273,24 +274,17 @@ impl ManagedDirectory { } impl Directory for ManagedDirectory { - fn open_read(&self, path: &Path) -> result::Result { - let read_only_source = self.directory.open_read(path)?; - let (footer, reader) = Footer::extract_footer(read_only_source).map_err(|io_error| { - OpenReadError::IOError { - io_error, - filepath: path.to_path_buf(), - } - })?; + fn open_read(&self, path: &Path) -> result::Result { + let file_slice = self.directory.open_read(path)?; + let (footer, reader) = Footer::extract_footer(file_slice) + .map_err(|io_error| OpenReadError::wrap_io_error(io_error, path.to_path_buf()))?; footer.is_compatible()?; Ok(reader) } fn open_write(&mut self, path: &Path) -> result::Result { self.register_file_as_managed(path) - .map_err(|io_error| OpenWriteError::IOError { - io_error, - filepath: path.to_path_buf(), - })?; + .map_err(|io_error| OpenWriteError::wrap_io_error(io_error, path.to_path_buf()))?; Ok(io::BufWriter::new(Box::new(FooterProxy::new( self.directory .open_write(path)? @@ -414,39 +408,37 @@ mod tests_mmap_specific { } #[test] - fn test_checksum() { + fn test_checksum() -> crate::Result<()> { let test_path1: &'static Path = Path::new("some_path_for_test"); let test_path2: &'static Path = Path::new("other_test_path"); let tempdir = TempDir::new().unwrap(); let tempdir_path = PathBuf::from(tempdir.path()); - let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); - let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap(); - let mut write = managed_directory.open_write(test_path1).unwrap(); - write.write_all(&[0u8, 1u8]).unwrap(); - write.terminate().unwrap(); + let mmap_directory = MmapDirectory::open(&tempdir_path)?; + let mut managed_directory = ManagedDirectory::wrap(mmap_directory)?; + let mut write = managed_directory.open_write(test_path1)?; + write.write_all(&[0u8, 1u8])?; + write.terminate()?; - let mut write = managed_directory.open_write(test_path2).unwrap(); - write.write_all(&[3u8, 4u8, 5u8]).unwrap(); - write.terminate().unwrap(); + let mut write = managed_directory.open_write(test_path2)?; + write.write_all(&[3u8, 4u8, 5u8])?; + write.terminate()?; - let read_source = managed_directory.open_read(test_path2).unwrap(); - assert_eq!(read_source.as_slice(), &[3u8, 4u8, 5u8]); + let read_file = managed_directory.open_read(test_path2)?.read_bytes()?; + assert_eq!(read_file.as_slice(), &[3u8, 4u8, 5u8]); assert!(managed_directory.list_damaged().unwrap().is_empty()); let mut corrupted_path = tempdir_path.clone(); corrupted_path.push(test_path2); - let mut file = OpenOptions::new() - .write(true) - .open(&corrupted_path) - .unwrap(); - file.write_all(&[255u8]).unwrap(); - file.flush().unwrap(); + let mut file = OpenOptions::new().write(true).open(&corrupted_path)?; + file.write_all(&[255u8])?; + file.flush()?; drop(file); - let damaged = managed_directory.list_damaged().unwrap(); + let damaged = managed_directory.list_damaged()?; assert_eq!(damaged.len(), 1); assert!(damaged.contains(test_path2)); + Ok(()) } } diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index b360eb25b..be6942615 100644 --- 
a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -1,12 +1,12 @@ use crate::core::META_FILEPATH; use crate::directory::error::LockError; use crate::directory::error::{DeleteError, OpenDirectoryError, OpenReadError, OpenWriteError}; -use crate::directory::read_only_source::BoxedData; use crate::directory::AntiCallToken; +use crate::directory::BoxedData; use crate::directory::Directory; use crate::directory::DirectoryLock; +use crate::directory::FileSlice; use crate::directory::Lock; -use crate::directory::ReadOnlySource; use crate::directory::WatchCallback; use crate::directory::WatchCallbackList; use crate::directory::WatchHandle; @@ -42,21 +42,17 @@ pub(crate) fn make_io_err(msg: String) -> io::Error { /// Returns None iff the file exists, can be read, but is empty (and hence /// cannot be mmapped) fn open_mmap(full_path: &Path) -> result::Result, OpenReadError> { - let file = File::open(full_path).map_err(|e| { - if e.kind() == io::ErrorKind::NotFound { - OpenReadError::FileDoesNotExist(full_path.to_owned()) + let file = File::open(full_path).map_err(|io_err| { + if io_err.kind() == io::ErrorKind::NotFound { + OpenReadError::FileDoesNotExist(full_path.to_path_buf()) } else { - OpenReadError::IOError { - io_error: e, - filepath: full_path.to_owned(), - } + OpenReadError::wrap_io_error(io_err, full_path.to_path_buf()) } })?; - let meta_data = file.metadata().map_err(|e| OpenReadError::IOError { - io_error: e, - filepath: full_path.to_owned(), - })?; + let meta_data = file + .metadata() + .map_err(|io_err| OpenReadError::wrap_io_error(io_err, full_path.to_owned()))?; if meta_data.len() == 0 { // if the file size is 0, it will not be possible // to mmap the file, so we return None @@ -66,10 +62,7 @@ fn open_mmap(full_path: &Path) -> result::Result, OpenReadError> { unsafe { memmap::Mmap::map(&file) .map(Some) - .map_err(|e| OpenReadError::IOError { - io_error: e, - filepath: full_path.to_owned(), - }) + .map_err(|io_err| OpenReadError::wrap_io_error(io_err, full_path.to_path_buf())) } } @@ -408,7 +401,7 @@ impl TerminatingWrite for SafeFileWriter { } impl Directory for MmapDirectory { - fn open_read(&self, path: &Path) -> result::Result { + fn open_read(&self, path: &Path) -> result::Result { debug!("Open Read {:?}", path); let full_path = self.resolve_path(path); @@ -418,15 +411,13 @@ impl Directory for MmapDirectory { on mmap cache while reading {:?}", path ); - OpenReadError::IOError { - io_error: make_io_err(msg), - filepath: path.to_owned(), - } + let io_err = make_io_err(msg); + OpenReadError::wrap_io_error(io_err, path.to_path_buf()) })?; Ok(mmap_cache .get_mmap(&full_path)? - .map(ReadOnlySource::from) - .unwrap_or_else(ReadOnlySource::empty)) + .map(FileSlice::from) + .unwrap_or_else(FileSlice::empty)) } /// Any entry associated to the path in the mmap will be @@ -465,29 +456,22 @@ impl Directory for MmapDirectory { .create_new(true) .open(full_path); - let mut file = open_res.map_err(|err| { - if err.kind() == io::ErrorKind::AlreadyExists { - OpenWriteError::FileAlreadyExists(path.to_owned()) + let mut file = open_res.map_err(|io_err| { + if io_err.kind() == io::ErrorKind::AlreadyExists { + OpenWriteError::FileAlreadyExists(path.to_path_buf()) } else { - OpenWriteError::IOError { - io_error: err, - filepath: path.to_owned(), - } + OpenWriteError::wrap_io_error(io_err, path.to_path_buf()) } })?; // making sure the file is created. 
-        file.flush().map_err(|io_error| OpenWriteError::IOError {
-            io_error,
-            filepath: path.to_owned(),
-        })?;
+        file.flush()
+            .map_err(|io_error| OpenWriteError::wrap_io_error(io_error, path.to_path_buf()))?;

         // Apparently, on some filesystems, syncing the parent
         // directory is required.
-        self.sync_directory().map_err(|e| OpenWriteError::IOError {
-            io_error: e,
-            filepath: path.to_owned(),
-        })?;
+        self.sync_directory()
+            .map_err(|io_err| OpenWriteError::wrap_io_error(io_err, path.to_path_buf()))?;

         let writer = SafeFileWriter::new(file);
         Ok(BufWriter::new(Box::new(writer)))
@@ -498,21 +482,16 @@
         let mut buffer = Vec::new();
         match File::open(&full_path) {
             Ok(mut file) => {
-                file.read_to_end(&mut buffer)
-                    .map_err(|io_error| OpenReadError::IOError {
-                        io_error,
-                        filepath: path.to_owned(),
-                    })?;
+                file.read_to_end(&mut buffer).map_err(|io_error| {
+                    OpenReadError::wrap_io_error(io_error, path.to_path_buf())
+                })?;
                 Ok(buffer)
             }
             Err(io_error) => {
                 if io_error.kind() == io::ErrorKind::NotFound {
                     Err(OpenReadError::FileDoesNotExist(path.to_owned()))
                 } else {
-                    Err(OpenReadError::IOError {
-                        io_error,
-                        filepath: path.to_owned(),
-                    })
+                    Err(OpenReadError::wrap_io_error(io_error, path.to_path_buf()))
                 }
             }
         }
@@ -560,10 +539,10 @@ mod tests {

     // The following tests are specific to the MmapDirectory
     use super::*;
-    use crate::indexer::LogMergePolicy;
     use crate::schema::{Schema, SchemaBuilder, TEXT};
     use crate::Index;
     use crate::ReloadPolicy;
+    use crate::{common::HasLen, indexer::LogMergePolicy};
     use std::fs;
     use std::sync::atomic::{AtomicUsize, Ordering};
diff --git a/src/directory/mod.rs b/src/directory/mod.rs
index df5e55d81..09662cab5 100644
--- a/src/directory/mod.rs
+++ b/src/directory/mod.rs
@@ -9,10 +9,11 @@ mod mmap_directory;

 mod directory;
 mod directory_lock;
+mod file_slice;
 mod footer;
 mod managed_directory;
+mod owned_bytes;
 mod ram_directory;
-mod read_only_source;
 mod watch_event_router;

 /// Errors specific to the directory module.
@@ -21,8 +22,10 @@ pub mod error;
 pub use self::directory::DirectoryLock;
 pub use self::directory::{Directory, DirectoryClone};
 pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
+pub(crate) use self::file_slice::BoxedData;
+pub use self::file_slice::FileSlice;
+pub use self::owned_bytes::OwnedBytes;
 pub use self::ram_directory::RAMDirectory;
-pub use self::read_only_source::ReadOnlySource;
 pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
 use std::io::{self, BufWriter, Write};
 use std::path::PathBuf;
diff --git a/src/directory/owned_bytes.rs b/src/directory/owned_bytes.rs
new file mode 100644
index 000000000..7aa366ceb
--- /dev/null
+++ b/src/directory/owned_bytes.rs
@@ -0,0 +1,239 @@
+use stable_deref_trait::StableDeref;
+use std::mem;
+use std::ops::Deref;
+use std::sync::Arc;
+use std::{fmt, io};
+
+/// An `OwnedBytes` simply wraps an object that owns a slice of data and exposes
+/// this data as a static slice.
+///
+/// The backing object is required to be `StableDeref`.
+#[derive(Clone)]
+pub struct OwnedBytes {
+    data: &'static [u8],
+    box_stable_deref: Arc<dyn Deref<Target = [u8]> + Sync + Send>,
+}
+
+impl OwnedBytes {
+    /// Creates an empty `OwnedBytes`.
+    pub fn empty() -> OwnedBytes {
+        OwnedBytes::new(&[][..])
+    }
+
+    /// Creates an `OwnedBytes` instance given a `StableDeref` object.
+    pub fn new<T: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync>(
+        data_holder: T,
+    ) -> OwnedBytes {
+        let box_stable_deref = Arc::new(data_holder);
+        let data = unsafe { mem::transmute::<_, &'static [u8]>(box_stable_deref.deref().deref()) };
+        OwnedBytes {
+            box_stable_deref,
+            data,
+        }
+    }
+
+    /// Returns the underlying slice of data.
+    /// `Deref` and `AsRef` are also available.
+    #[inline(always)]
+    pub fn as_slice(&self) -> &[u8] {
+        self.data
+    }
+
+    /// Returns the length of the slice.
+    #[inline(always)]
+    pub fn len(&self) -> usize {
+        self.data.len()
+    }
+
+    /// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
+    ///
+    /// Left will hold `split_len` bytes.
+    ///
+    /// This operation is cheap and does not require copying any memory.
+    /// On the other hand, both `left` and `right` retain a handle over
+    /// the entire slice of memory. In other words, the memory will only
+    /// be released when both left and right are dropped.
+    pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
+        let right_box_stable_deref = self.box_stable_deref.clone();
+        let left = OwnedBytes {
+            data: &self.data[..split_len],
+            box_stable_deref: self.box_stable_deref,
+        };
+        let right = OwnedBytes {
+            data: &self.data[split_len..],
+            box_stable_deref: right_box_stable_deref,
+        };
+        (left, right)
+    }
+
+    /// Returns true iff this `OwnedBytes` is empty.
+    #[inline(always)]
+    pub fn is_empty(&self) -> bool {
+        self.as_slice().is_empty()
+    }
+
+    /// Drops the leftmost `advance_len` bytes.
+    ///
+    /// See also [.clip(clip_len: usize)](#method.clip).
+    #[inline(always)]
+    pub fn advance(&mut self, advance_len: usize) {
+        self.data = &self.data[advance_len..]
+    }
+}
+
+impl fmt::Debug for OwnedBytes {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // We truncate the bytes in order to make sure the debug string
+        // is not too long.
+        let bytes_truncated: &[u8] = if self.len() > 10 {
+            &self.as_slice()[..10]
+        } else {
+            self.as_slice()
+        };
+        write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
+    }
+}
+
+impl Deref for OwnedBytes {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        self.as_slice()
+    }
+}
+
+impl io::Read for OwnedBytes {
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        let read_len = {
+            let data = self.as_slice();
+            if data.len() >= buf.len() {
+                let buf_len = buf.len();
+                buf.copy_from_slice(&data[..buf_len]);
+                buf.len()
+            } else {
+                let data_len = data.len();
+                buf[..data_len].copy_from_slice(data);
+                data_len
+            }
+        };
+        self.advance(read_len);
+        Ok(read_len)
+    }
+    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
+        let read_len = {
+            let data = self.as_slice();
+            buf.extend(data);
+            data.len()
+        };
+        self.advance(read_len);
+        Ok(read_len)
+    }
+    fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
+        let read_len = self.read(buf)?;
+        if read_len != buf.len() {
+            return Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                "failed to fill whole buffer",
+            ));
+        }
+        Ok(())
+    }
+}
+
+impl AsRef<[u8]> for OwnedBytes {
+    fn as_ref(&self) -> &[u8] {
+        self.as_slice()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io::{self, Read};
+
+    use super::OwnedBytes;
+
+    #[test]
+    fn test_owned_bytes_debug() {
+        let short_bytes = OwnedBytes::new(b"abcd".as_ref());
+        assert_eq!(
+            format!("{:?}", short_bytes),
+            "OwnedBytes([97, 98, 99, 100], len=4)"
+        );
+        let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
+        assert_eq!(
+            format!("{:?}", long_bytes),
+            "OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
+        );
+    }
+
+    #[test]
+    fn test_owned_bytes_read() -> io::Result<()> {
+        let mut bytes = OwnedBytes::new(b"abcdefghiklmnopqrstuvwxyz".as_ref());
+        {
+            let mut buf = [0u8; 5];
+            bytes.read_exact(&mut buf[..]).unwrap();
+            assert_eq!(&buf, b"abcde");
+            assert_eq!(bytes.as_slice(), b"fghiklmnopqrstuvwxyz")
+        }
+        {
+            let mut buf = [0u8; 2];
+            bytes.read_exact(&mut buf[..]).unwrap();
+            assert_eq!(&buf, b"fg");
+            assert_eq!(bytes.as_slice(), b"hiklmnopqrstuvwxyz")
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn test_owned_bytes_read_right_at_the_end() -> io::Result<()> {
+        let mut bytes = OwnedBytes::new(b"abcde".as_ref());
+        let mut buf = [0u8; 5];
+        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
+        assert_eq!(&buf, b"abcde");
+        assert_eq!(bytes.as_slice(), b"");
+        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
+        assert_eq!(&buf, b"abcde");
+        Ok(())
+    }
+    #[test]
+    fn test_owned_bytes_read_incomplete() -> io::Result<()> {
+        let mut bytes = OwnedBytes::new(b"abcde".as_ref());
+        let mut buf = [0u8; 7];
+        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
+        assert_eq!(&buf[..5], b"abcde");
+        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
+        Ok(())
+    }
+
+    #[test]
+    fn test_owned_bytes_read_to_end() -> io::Result<()> {
+        let mut bytes = OwnedBytes::new(b"abcde".as_ref());
+        let mut buf = Vec::new();
+        bytes.read_to_end(&mut buf)?;
+        assert_eq!(buf.as_slice(), b"abcde".as_ref());
+        Ok(())
+    }
+
+    #[test]
+    fn test_owned_bytes_split() {
+        let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
+        let (left, right) = bytes.split(3);
+        assert_eq!(left.as_slice(), b"abc");
+        assert_eq!(right.as_slice(), b"defghi");
+    }
+
+    #[test]
+    fn test_owned_bytes_split_boundary() {
+        let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
+        {
+            let (left, right) = bytes.clone().split(0);
+            assert_eq!(left.as_slice(), b"");
+            assert_eq!(right.as_slice(), b"abcdefghi");
+        }
+        {
+            let (left, right) = bytes.split(9);
+            assert_eq!(left.as_slice(), b"abcdefghi");
+            assert_eq!(right.as_slice(), b"");
+        }
+    }
+}
diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs
index 3b9a58182..0dc7dc572 100644
--- a/src/directory/ram_directory.rs
+++ b/src/directory/ram_directory.rs
@@ -1,9 +1,9 @@
-use crate::core::META_FILEPATH;
 use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
 use crate::directory::AntiCallToken;
 use crate::directory::WatchCallbackList;
-use crate::directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
+use crate::directory::{Directory, FileSlice, WatchCallback, WatchHandle};
 use crate::directory::{TerminatingWrite, WritePtr};
+use crate::{common::HasLen, core::META_FILEPATH};
 use fail::fail_point;
 use std::collections::HashMap;
 use std::fmt;
@@ -80,17 +80,17 @@ impl TerminatingWrite for VecWriter {
 #[derive(Default)]
 struct InnerDirectory {
-    fs: HashMap<PathBuf, ReadOnlySource>,
+    fs: HashMap<PathBuf, FileSlice>,
     watch_router: WatchCallbackList,
 }

 impl InnerDirectory {
     fn write(&mut self, path: PathBuf, data: &[u8]) -> bool {
-        let data = ReadOnlySource::new(Vec::from(data));
+        let data = FileSlice::new(Vec::from(data));
         self.fs.insert(path, data).is_some()
     }

-    fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
+    fn open_read(&self, path: &Path) -> Result<FileSlice, OpenReadError> {
         self.fs
             .get(path)
             .ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
@@ -153,9 +153,9 @@ impl RAMDirectory {
     /// If an error is encountered, files may be persisted partially.
     pub fn persist(&self, dest: &mut dyn Directory) -> crate::Result<()> {
         let wlock = self.fs.write().unwrap();
-        for (path, source) in wlock.fs.iter() {
+        for (path, file) in wlock.fs.iter() {
             let mut dest_wrt = dest.open_write(path)?;
-            dest_wrt.write_all(source.as_slice())?;
+            dest_wrt.write_all(file.read_bytes()?.as_slice())?;
             dest_wrt.terminate()?;
         }
         Ok(())
@@ -163,7 +163,7 @@ impl RAMDirectory {
 }

 impl Directory for RAMDirectory {
-    fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
+    fn open_read(&self, path: &Path) -> result::Result<FileSlice, OpenReadError> {
         self.fs.read().unwrap().open_read(path)
     }

@@ -195,7 +195,14 @@ impl Directory for RAMDirectory {
     }

     fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
-        Ok(self.open_read(path)?.as_slice().to_owned())
+        let bytes =
+            self.open_read(path)?
+                .read_bytes()
+                .map_err(|io_error| OpenReadError::IOError {
+                    io_error,
+                    filepath: path.to_path_buf(),
+                })?;
+        Ok(bytes.as_slice().to_owned())
     }

     fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs
deleted file mode 100644
index 9949b9e77..000000000
--- a/src/directory/read_only_source.rs
+++ /dev/null
@@ -1,137 +0,0 @@
-use crate::common::HasLen;
-use stable_deref_trait::{CloneStableDeref, StableDeref};
-use std::ops::Deref;
-use std::sync::Arc;
-
-pub type BoxedData = Box<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
-
-/// Read object that represents files in tantivy.
-///
-/// These read objects are only in charge to deliver
-/// the data in the form of a constant read-only `&[u8]`.
-/// Whatever happens to the directory file, the data
-/// hold by this object should never be altered or destroyed.
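Since `OwnedBytes` is the type every reader in this patch ends up holding, a short usage sketch may help; it exercises only methods defined in the new file above (`new`, `split`, `advance`, and the `io::Read` impl):

```rust
use std::io::Read;

use tantivy::directory::OwnedBytes;

fn main() -> std::io::Result<()> {
    // Any `StableDeref` owner works; a `Vec<u8>` is the simplest.
    let bytes = OwnedBytes::new(vec![1u8, 2, 3, 4, 5]);

    // Zero-copy split: both halves share the same backing allocation.
    let (header, body) = bytes.split(2);
    assert_eq!(header.as_slice(), &[1, 2]);
    assert_eq!(body.as_slice(), &[3, 4, 5]);

    // `advance` drops bytes from the front without copying.
    let mut cursor = body;
    cursor.advance(1);
    assert_eq!(cursor.as_slice(), &[4, 5]);

    // `OwnedBytes` is also an `io::Read`, which is what lets
    // `BinarySerializable::deserialize` consume it directly.
    let mut buf = [0u8; 2];
    cursor.read_exact(&mut buf)?;
    assert_eq!(&buf, &[4, 5]);
    Ok(())
}
```

Note the caveat from the `split` docs: both halves keep the entire backing allocation alive until both are dropped.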
-pub struct ReadOnlySource { - data: Arc, - start: usize, - stop: usize, -} - -unsafe impl StableDeref for ReadOnlySource {} -unsafe impl CloneStableDeref for ReadOnlySource {} - -impl Deref for ReadOnlySource { - type Target = [u8]; - - fn deref(&self) -> &[u8] { - self.as_slice() - } -} - -impl From> for ReadOnlySource { - fn from(data: Arc) -> Self { - let len = data.len(); - ReadOnlySource { - data, - start: 0, - stop: len, - } - } -} - -impl ReadOnlySource { - pub(crate) fn new(data: D) -> ReadOnlySource - where - D: Deref + Send + Sync + 'static, - { - let len = data.len(); - ReadOnlySource { - data: Arc::new(Box::new(data)), - start: 0, - stop: len, - } - } - - /// Creates an empty ReadOnlySource - pub fn empty() -> ReadOnlySource { - ReadOnlySource::new(&[][..]) - } - - /// Returns the data underlying the ReadOnlySource object. - pub fn as_slice(&self) -> &[u8] { - &self.data[self.start..self.stop] - } - - /// Splits into 2 `ReadOnlySource`, at the offset given - /// as an argument. - pub fn split(self, addr: usize) -> (ReadOnlySource, ReadOnlySource) { - let left = self.slice(0, addr); - let right = self.slice_from(addr); - (left, right) - } - - /// Splits into 2 `ReadOnlySource`, at the offset `end - right_len`. - pub fn split_from_end(self, right_len: usize) -> (ReadOnlySource, ReadOnlySource) { - let left_len = self.len() - right_len; - self.split(left_len) - } - - /// Creates a ReadOnlySource that is just a - /// view over a slice of the data. - /// - /// Keep in mind that any living slice extends - /// the lifetime of the original ReadOnlySource, - /// - /// For instance, if `ReadOnlySource` wraps 500MB - /// worth of data in anonymous memory, and only a - /// 1KB slice is remaining, the whole `500MBs` - /// are retained in memory. - pub fn slice(&self, start: usize, stop: usize) -> ReadOnlySource { - assert!( - start <= stop, - "Requested negative slice [{}..{}]", - start, - stop - ); - assert!(stop <= self.len()); - ReadOnlySource { - data: self.data.clone(), - start: self.start + start, - stop: self.start + stop, - } - } - - /// Like `.slice(...)` but enforcing only the `from` - /// boundary. - /// - /// Equivalent to `.slice(from_offset, self.len())` - pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource { - self.slice(from_offset, self.len()) - } - - /// Like `.slice(...)` but enforcing only the `to` - /// boundary. 
- /// - /// Equivalent to `.slice(0, to_offset)` - pub fn slice_to(&self, to_offset: usize) -> ReadOnlySource { - self.slice(0, to_offset) - } -} - -impl HasLen for ReadOnlySource { - fn len(&self) -> usize { - self.stop - self.start - } -} - -impl Clone for ReadOnlySource { - fn clone(&self) -> Self { - self.slice_from(0) - } -} - -impl From> for ReadOnlySource { - fn from(data: Vec) -> ReadOnlySource { - ReadOnlySource::new(data) - } -} diff --git a/src/directory/tests.rs b/src/directory/tests.rs index 74f2f2840..6767f0bc6 100644 --- a/src/directory/tests.rs +++ b/src/directory/tests.rs @@ -20,9 +20,9 @@ mod mmap_directory_tests { } #[test] - fn test_simple() { + fn test_simple() -> crate::Result<()> { let mut directory = make_directory(); - super::test_simple(&mut directory); + super::test_simple(&mut directory) } #[test] @@ -32,15 +32,17 @@ mod mmap_directory_tests { } #[test] - fn test_rewrite_forbidden() { + fn test_rewrite_forbidden() -> crate::Result<()> { let mut directory = make_directory(); - super::test_rewrite_forbidden(&mut directory); + super::test_rewrite_forbidden(&mut directory)?; + Ok(()) } #[test] - fn test_directory_delete() { + fn test_directory_delete() -> crate::Result<()> { let mut directory = make_directory(); - super::test_directory_delete(&mut directory); + super::test_directory_delete(&mut directory)?; + Ok(()) } #[test] @@ -72,9 +74,9 @@ mod ram_directory_tests { } #[test] - fn test_simple() { + fn test_simple() -> crate::Result<()> { let mut directory = make_directory(); - super::test_simple(&mut directory); + super::test_simple(&mut directory) } #[test] @@ -84,15 +86,17 @@ mod ram_directory_tests { } #[test] - fn test_rewrite_forbidden() { + fn test_rewrite_forbidden() -> crate::Result<()> { let mut directory = make_directory(); - super::test_rewrite_forbidden(&mut directory); + super::test_rewrite_forbidden(&mut directory)?; + Ok(()) } #[test] - fn test_directory_delete() { + fn test_directory_delete() -> crate::Result<()> { let mut directory = make_directory(); - super::test_directory_delete(&mut directory); + super::test_directory_delete(&mut directory)?; + Ok(()) } #[test] @@ -123,35 +127,28 @@ fn ram_directory_panics_if_flush_forgotten() { assert!(write_file.write_all(&[4]).is_ok()); } -fn test_simple(directory: &mut dyn Directory) { +fn test_simple(directory: &mut dyn Directory) -> crate::Result<()> { let test_path: &'static Path = Path::new("some_path_for_test"); - { - let mut write_file = directory.open_write(test_path).unwrap(); - assert!(directory.exists(test_path)); - write_file.write_all(&[4]).unwrap(); - write_file.write_all(&[3]).unwrap(); - write_file.write_all(&[7, 3, 5]).unwrap(); - write_file.flush().unwrap(); - } - { - let read_file = directory.open_read(test_path).unwrap(); - let data: &[u8] = &*read_file; - assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]); - } + let mut write_file = directory.open_write(test_path)?; + assert!(directory.exists(test_path)); + write_file.write_all(&[4])?; + write_file.write_all(&[3])?; + write_file.write_all(&[7, 3, 5])?; + write_file.flush()?; + let read_file = directory.open_read(test_path)?.read_bytes()?; + assert_eq!(read_file.as_slice(), &[4u8, 3u8, 7u8, 3u8, 5u8]); assert!(directory.delete(test_path).is_ok()); assert!(!directory.exists(test_path)); + Ok(()) } -fn test_rewrite_forbidden(directory: &mut dyn Directory) { +fn test_rewrite_forbidden(directory: &mut dyn Directory) -> crate::Result<()> { let test_path: &'static Path = Path::new("some_path_for_test"); - { - 
directory.open_write(test_path).unwrap(); - assert!(directory.exists(test_path)); - } - { - assert!(directory.open_write(test_path).is_err()); - } + directory.open_write(test_path)?; + assert!(directory.exists(test_path)); + assert!(directory.open_write(test_path).is_err()); assert!(directory.delete(test_path).is_ok()); + Ok(()) } fn test_write_create_the_file(directory: &mut dyn Directory) { @@ -165,21 +162,20 @@ fn test_write_create_the_file(directory: &mut dyn Directory) { } } -fn test_directory_delete(directory: &mut dyn Directory) { +fn test_directory_delete(directory: &mut dyn Directory) -> crate::Result<()> { let test_path: &'static Path = Path::new("some_path_for_test"); assert!(directory.open_read(test_path).is_err()); - let mut write_file = directory.open_write(&test_path).unwrap(); - write_file.write_all(&[1, 2, 3, 4]).unwrap(); - write_file.flush().unwrap(); + let mut write_file = directory.open_write(&test_path)?; + write_file.write_all(&[1, 2, 3, 4])?; + write_file.flush()?; { - let read_handle = directory.open_read(&test_path).unwrap(); - assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]); + let read_handle = directory.open_read(&test_path)?.read_bytes()?; + assert_eq!(read_handle.as_slice(), &[1u8, 2u8, 3u8, 4u8]); // Mapped files can't be deleted on Windows if !cfg!(windows) { assert!(directory.delete(&test_path).is_ok()); - assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]); + assert_eq!(read_handle.as_slice(), &[1u8, 2u8, 3u8, 4u8]); } - assert!(directory.delete(Path::new("SomeOtherPath")).is_err()); } @@ -189,6 +185,7 @@ fn test_directory_delete(directory: &mut dyn Directory) { assert!(directory.open_read(&test_path).is_err()); assert!(directory.delete(&test_path).is_err()); + Ok(()) } fn test_watch(directory: &mut dyn Directory) { diff --git a/src/fastfield/bytes/mod.rs b/src/fastfield/bytes/mod.rs index 1ce4fe416..9800881e1 100644 --- a/src/fastfield/bytes/mod.rs +++ b/src/fastfield/bytes/mod.rs @@ -85,7 +85,7 @@ mod tests { let field = searcher.schema().get_field("string_bytes").unwrap(); let term = Term::from_field_bytes(field, b"lucene".as_ref()); let term_query = TermQuery::new(term, IndexRecordOption::Basic); - let term_weight = term_query.specialized_weight(&searcher, true); + let term_weight = term_query.specialized_weight(&searcher, true)?; let term_scorer = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0f32)?; assert_eq!(term_scorer.doc(), 0u32); Ok(()) @@ -98,7 +98,7 @@ mod tests { let field = searcher.schema().get_field("string_bytes").unwrap(); let term = Term::from_field_bytes(field, b"lucene".as_ref()); let term_query = TermQuery::new(term, IndexRecordOption::Basic); - let term_weight = term_query.specialized_weight(&searcher, false); + let term_weight = term_query.specialized_weight(&searcher, false)?; let term_scorer_err = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0f32); assert!(matches!( term_scorer_err, diff --git a/src/fastfield/bytes/reader.rs b/src/fastfield/bytes/reader.rs index bf120152c..123d6a89b 100644 --- a/src/fastfield/bytes/reader.rs +++ b/src/fastfield/bytes/reader.rs @@ -1,6 +1,5 @@ -use owning_ref::OwningRef; - -use crate::directory::ReadOnlySource; +use crate::directory::FileSlice; +use crate::directory::OwnedBytes; use crate::fastfield::FastFieldReader; use crate::DocId; @@ -17,16 +16,16 @@ use crate::DocId; #[derive(Clone)] pub struct BytesFastFieldReader { idx_reader: FastFieldReader, - values: OwningRef, + values: OwnedBytes, } impl BytesFastFieldReader { pub(crate) fn open( idx_reader: 
FastFieldReader, - values_source: ReadOnlySource, - ) -> BytesFastFieldReader { - let values = OwningRef::new(values_source).map(|source| &source[..]); - BytesFastFieldReader { idx_reader, values } + values_file: FileSlice, + ) -> crate::Result { + let values = values_file.read_bytes()?; + Ok(BytesFastFieldReader { idx_reader, values }) } fn range(&self, doc: DocId) -> (usize, usize) { @@ -38,7 +37,7 @@ impl BytesFastFieldReader { /// Returns the bytes associated to the given `doc` pub fn get_bytes(&self, doc: DocId) -> &[u8] { let (start, stop) = self.range(doc); - &self.values[start..stop] + &self.values.as_slice()[start..stop] } /// Returns the overall number of bytes in this bytes fast field. diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 5dfd16056..b726a0e3a 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -1,5 +1,6 @@ use crate::common::{BitSet, HasLen}; -use crate::directory::ReadOnlySource; +use crate::directory::FileSlice; +use crate::directory::OwnedBytes; use crate::directory::WritePtr; use crate::space_usage::ByteCount; use crate::DocId; @@ -39,7 +40,7 @@ pub fn write_delete_bitset( /// Set of deleted `DocId`s. #[derive(Clone)] pub struct DeleteBitSet { - data: ReadOnlySource, + data: OwnedBytes, len: usize, } @@ -58,21 +59,22 @@ impl DeleteBitSet { let mut wrt = directory.open_write(path).unwrap(); write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap(); wrt.terminate().unwrap(); - let source = directory.open_read(path).unwrap(); - Self::open(source) + let file = directory.open_read(path).unwrap(); + Self::open(file).unwrap() } - /// Opens a delete bitset given its data source. - pub fn open(data: ReadOnlySource) -> DeleteBitSet { - let num_deleted: usize = data + /// Opens a delete bitset given its file. + pub fn open(file: FileSlice) -> crate::Result { + let bytes = file.read_bytes()?; + let num_deleted: usize = bytes .as_slice() .iter() .map(|b| b.count_ones() as usize) .sum(); - DeleteBitSet { - data, + Ok(DeleteBitSet { + data: bytes, len: num_deleted, - } + }) } /// Returns true iff the document is still "alive". In other words, if it has not been deleted. 
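The bit layout behind `is_deleted` (shown just below) is one bit per document, least-significant bit first within each byte: document `doc` lives in byte `doc / 8` at bit `doc & 7`. A worked example of that arithmetic, standalone and independent of tantivy:

```rust
// Documents 2 and 9 deleted, max_doc = 10:
//   byte 0 covers docs 0..8  -> bit 2 set -> 0b0000_0100 = 4
//   byte 1 covers docs 8..16 -> bit 1 set -> 0b0000_0010 = 2
fn is_deleted(bitset: &[u8], doc: u32) -> bool {
    let byte = bitset[(doc / 8) as usize];
    let shift = (doc & 7) as u8;
    byte & (1u8 << shift) != 0
}

fn main() {
    let bitset = [4u8, 2u8];
    assert!(is_deleted(&bitset, 2));
    assert!(is_deleted(&bitset, 9));
    assert!(!is_deleted(&bitset, 0));
}
```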
@@ -84,7 +86,7 @@ impl DeleteBitSet { #[inline(always)] pub fn is_deleted(&self, doc: DocId) -> bool { let byte_offset = doc / 8u32; - let b: u8 = (*self.data)[byte_offset as usize]; + let b: u8 = self.data.as_slice()[byte_offset as usize]; let shift = (doc & 7u32) as u8; b & (1u8 << shift) != 0 } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 2e7808225..227cc1e2f 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -209,6 +209,7 @@ mod tests { use crate::schema::FAST; use crate::schema::{Document, IntOptions}; use crate::{Index, SegmentId, SegmentReader}; + use common::HasLen; use once_cell::sync::Lazy; use rand::prelude::SliceRandom; use rand::rngs::StdRng; @@ -239,7 +240,7 @@ mod tests { } #[test] - fn test_intfastfield_small() { + fn test_intfastfield_small() -> crate::Result<()> { let path = Path::new("test"); let mut directory: RAMDirectory = RAMDirectory::create(); { @@ -254,27 +255,24 @@ mod tests { .unwrap(); serializer.close().unwrap(); } - let source = directory.open_read(&path).unwrap(); - { - assert_eq!(source.len(), 36 as usize); - } - { - let composite_file = CompositeFile::open(&source).unwrap(); - let field_source = composite_file.open_read(*FIELD).unwrap(); - let fast_field_reader = FastFieldReader::::open(field_source); - assert_eq!(fast_field_reader.get(0), 13u64); - assert_eq!(fast_field_reader.get(1), 14u64); - assert_eq!(fast_field_reader.get(2), 2u64); - } + let file = directory.open_read(&path).unwrap(); + assert_eq!(file.len(), 36 as usize); + let composite_file = CompositeFile::open(&file)?; + let file = composite_file.open_read(*FIELD).unwrap(); + let fast_field_reader = FastFieldReader::::open(file)?; + assert_eq!(fast_field_reader.get(0), 13u64); + assert_eq!(fast_field_reader.get(1), 14u64); + assert_eq!(fast_field_reader.get(2), 2u64); + Ok(()) } #[test] - fn test_intfastfield_large() { + fn test_intfastfield_large() -> crate::Result<()> { let path = Path::new("test"); let mut directory: RAMDirectory = RAMDirectory::create(); { - let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::from_write(write).unwrap(); + let write: WritePtr = directory.open_write(Path::new("test"))?; + let mut serializer = FastFieldSerializer::from_write(write)?; let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); fast_field_writers.add_document(&doc!(*FIELD=>4u64)); fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64)); @@ -285,19 +283,15 @@ mod tests { fast_field_writers.add_document(&doc!(*FIELD=>1_002u64)); fast_field_writers.add_document(&doc!(*FIELD=>1_501u64)); fast_field_writers.add_document(&doc!(*FIELD=>215u64)); - fast_field_writers - .serialize(&mut serializer, &HashMap::new()) - .unwrap(); - serializer.close().unwrap(); + fast_field_writers.serialize(&mut serializer, &HashMap::new())?; + serializer.close()?; } - let source = directory.open_read(&path).unwrap(); + let file = directory.open_read(&path)?; + assert_eq!(file.len(), 61 as usize); { - assert_eq!(source.len(), 61 as usize); - } - { - let fast_fields_composite = CompositeFile::open(&source).unwrap(); + let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = FastFieldReader::::open(data); + let fast_field_reader = FastFieldReader::::open(data)?; assert_eq!(fast_field_reader.get(0), 4u64); assert_eq!(fast_field_reader.get(1), 14_082_001u64); assert_eq!(fast_field_reader.get(2), 3_052u64); @@ -308,10 +302,11 @@ mod 
tests { assert_eq!(fast_field_reader.get(7), 1_501u64); assert_eq!(fast_field_reader.get(8), 215u64); } + Ok(()) } #[test] - fn test_intfastfield_null_amplitude() { + fn test_intfastfield_null_amplitude() -> crate::Result<()> { let path = Path::new("test"); let mut directory: RAMDirectory = RAMDirectory::create(); @@ -327,22 +322,21 @@ mod tests { .unwrap(); serializer.close().unwrap(); } - let source = directory.open_read(&path).unwrap(); + let file = directory.open_read(&path).unwrap(); + assert_eq!(file.len(), 34 as usize); { - assert_eq!(source.len(), 34 as usize); - } - { - let fast_fields_composite = CompositeFile::open(&source).unwrap(); + let fast_fields_composite = CompositeFile::open(&file).unwrap(); let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = FastFieldReader::::open(data); + let fast_field_reader = FastFieldReader::::open(data)?; for doc in 0..10_000 { assert_eq!(fast_field_reader.get(doc), 100_000u64); } } + Ok(()) } #[test] - fn test_intfastfield_large_numbers() { + fn test_intfastfield_large_numbers() -> crate::Result<()> { let path = Path::new("test"); let mut directory: RAMDirectory = RAMDirectory::create(); @@ -360,14 +354,12 @@ mod tests { .unwrap(); serializer.close().unwrap(); } - let source = directory.open_read(&path).unwrap(); + let file = directory.open_read(&path).unwrap(); + assert_eq!(file.len(), 80042 as usize); { - assert_eq!(source.len(), 80042 as usize); - } - { - let fast_fields_composite = CompositeFile::open(&source).unwrap(); + let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = FastFieldReader::::open(data); + let fast_field_reader = FastFieldReader::::open(data)?; assert_eq!(fast_field_reader.get(0), 0u64); for doc in 1..10_001 { assert_eq!( @@ -376,10 +368,11 @@ mod tests { ); } } + Ok(()) } #[test] - fn test_signed_intfastfield() { + fn test_signed_intfastfield() -> crate::Result<()> { let path = Path::new("test"); let mut directory: RAMDirectory = RAMDirectory::create(); let mut schema_builder = Schema::builder(); @@ -400,14 +393,12 @@ mod tests { .unwrap(); serializer.close().unwrap(); } - let source = directory.open_read(&path).unwrap(); + let file = directory.open_read(&path).unwrap(); + assert_eq!(file.len(), 17709 as usize); { - assert_eq!(source.len(), 17709 as usize); - } - { - let fast_fields_composite = CompositeFile::open(&source).unwrap(); + let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(i64_field).unwrap(); - let fast_field_reader = FastFieldReader::::open(data); + let fast_field_reader = FastFieldReader::::open(data)?; assert_eq!(fast_field_reader.min_value(), -100i64); assert_eq!(fast_field_reader.max_value(), 9_999i64); @@ -420,10 +411,11 @@ mod tests { assert_eq!(buffer[i], -100i64 + 53i64 + i as i64); } } + Ok(()) } #[test] - fn test_signed_intfastfield_default_val() { + fn test_signed_intfastfield_default_val() -> crate::Result<()> { let path = Path::new("test"); let mut directory: RAMDirectory = RAMDirectory::create(); let mut schema_builder = Schema::builder(); @@ -442,13 +434,14 @@ mod tests { serializer.close().unwrap(); } - let source = directory.open_read(&path).unwrap(); + let file = directory.open_read(&path).unwrap(); { - let fast_fields_composite = CompositeFile::open(&source).unwrap(); + let fast_fields_composite = CompositeFile::open(&file).unwrap(); let data = fast_fields_composite.open_read(i64_field).unwrap(); - let fast_field_reader 
= FastFieldReader::::open(data); + let fast_field_reader = FastFieldReader::::open(data)?; assert_eq!(fast_field_reader.get(0u32), 0i64); } + Ok(()) } // Warning: this generates the same permutation at each call @@ -459,28 +452,26 @@ mod tests { } #[test] - fn test_intfastfield_permutation() { + fn test_intfastfield_permutation() -> crate::Result<()> { let path = Path::new("test"); let permutation = generate_permutation(); let n = permutation.len(); let mut directory = RAMDirectory::create(); { - let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = FastFieldSerializer::from_write(write).unwrap(); + let write: WritePtr = directory.open_write(Path::new("test"))?; + let mut serializer = FastFieldSerializer::from_write(write)?; let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA); for &x in &permutation { fast_field_writers.add_document(&doc!(*FIELD=>x)); } - fast_field_writers - .serialize(&mut serializer, &HashMap::new()) - .unwrap(); - serializer.close().unwrap(); + fast_field_writers.serialize(&mut serializer, &HashMap::new())?; + serializer.close()?; } - let source = directory.open_read(&path).unwrap(); + let file = directory.open_read(&path)?; { - let fast_fields_composite = CompositeFile::open(&source).unwrap(); + let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(*FIELD).unwrap(); - let fast_field_reader = FastFieldReader::::open(data); + let fast_field_reader = FastFieldReader::::open(data)?; let mut a = 0u64; for _ in 0..n { @@ -488,6 +479,7 @@ mod tests { a = fast_field_reader.get(a as u32); } } + Ok(()) } #[test] @@ -633,9 +625,9 @@ mod bench { .unwrap(); serializer.close().unwrap(); } - let source = directory.open_read(&path).unwrap(); + let file = directory.open_read(&path).unwrap(); { - let fast_fields_composite = CompositeFile::open(&source).unwrap(); + let fast_fields_composite = CompositeFile::open(&file).unwrap(); let data = fast_fields_composite.open_read(*FIELD).unwrap(); let fast_field_reader = FastFieldReader::::open(data); @@ -667,9 +659,9 @@ mod bench { .unwrap(); serializer.close().unwrap(); } - let source = directory.open_read(&path).unwrap(); + let file = directory.open_read(&path).unwrap(); { - let fast_fields_composite = CompositeFile::open(&source).unwrap(); + let fast_fields_composite = CompositeFile::open(&file).unwrap(); let data = fast_fields_composite.open_read(*FIELD).unwrap(); let fast_field_reader = FastFieldReader::::open(data); diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 867163b99..ab5fa88ba 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -3,13 +3,12 @@ use crate::common::bitpacker::BitUnpacker; use crate::common::compute_num_bits; use crate::common::BinarySerializable; use crate::common::CompositeFile; -use crate::directory::ReadOnlySource; +use crate::directory::FileSlice; use crate::directory::{Directory, RAMDirectory, WritePtr}; use crate::fastfield::{FastFieldSerializer, FastFieldsWriter}; use crate::schema::Schema; use crate::schema::FAST; use crate::DocId; -use owning_ref::OwningRef; use std::collections::HashMap; use std::marker::PhantomData; use std::path::Path; @@ -20,34 +19,27 @@ use std::path::Path; /// fast field is required. #[derive(Clone)] pub struct FastFieldReader { - bit_unpacker: BitUnpacker>, + bit_unpacker: BitUnpacker, min_value_u64: u64, max_value_u64: u64, _phantom: PhantomData, } impl FastFieldReader { - /// Opens a fast field given a source. 
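The replacement `open` on the next lines drops the `OwningRef` dance: because `OwnedBytes` implements `io::Read`, the 16-byte header of a fast field column (`min_value` then `amplitude`, two `u64`s) can be deserialized straight off the front, leaving the cursor on the bit-packed values. A crate-internal sketch of that decode step; the helper function is hypothetical:

```rust
use crate::common::{compute_num_bits, BinarySerializable};
use crate::directory::FileSlice;

/// Hypothetical helper mirroring the header decode in `FastFieldReader::open`.
fn decode_fast_field_header(file: FileSlice) -> crate::Result<(u64, u64, u8)> {
    // One blocking read for the column; `bytes` is an io::Read cursor.
    let mut bytes = file.read_bytes()?;
    let min_value = u64::deserialize(&mut bytes)?; // bytes 0..8
    let amplitude = u64::deserialize(&mut bytes)?; // bytes 8..16
    // Each value is then stored on `compute_num_bits(amplitude)` bits,
    // and decodes as `min_value + unpacked_value`.
    Ok((min_value, amplitude, compute_num_bits(amplitude)))
}
```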
- pub fn open(data: ReadOnlySource) -> Self { - let min_value: u64; - let amplitude: u64; - { - let mut cursor = data.as_slice(); - min_value = - u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field."); - amplitude = - u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field."); - } + /// Opens a fast field given a file. + pub fn open(file: FileSlice) -> crate::Result { + let mut bytes = file.read_bytes()?; + let min_value = u64::deserialize(&mut bytes)?; + let amplitude = u64::deserialize(&mut bytes)?; let max_value = min_value + amplitude; let num_bits = compute_num_bits(amplitude); - let owning_ref = OwningRef::new(data).map(|data| &data[16..]); - let bit_unpacker = BitUnpacker::new(owning_ref, num_bits); - FastFieldReader { + let bit_unpacker = BitUnpacker::new(bytes, num_bits); + Ok(FastFieldReader { min_value_u64: min_value, max_value_u64: max_value, bit_unpacker, _phantom: PhantomData, - } + }) } pub(crate) fn into_u64_reader(self) -> FastFieldReader { @@ -157,12 +149,11 @@ impl From> for FastFieldReader { serializer.close().unwrap(); } - let source = directory.open_read(path).expect("Failed to open the file"); - let composite_file = - CompositeFile::open(&source).expect("Failed to read the composite file"); - let field_source = composite_file + let file = directory.open_read(path).expect("Failed to open the file"); + let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file"); + let field_file = composite_file .open_read(field) .expect("File component not found"); - FastFieldReader::open(field_source) + FastFieldReader::open(field_file).unwrap() } } diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index fe7dc5a22..177f23cd2 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -72,44 +72,48 @@ impl FastFieldReaders { if !bytes_option.is_fast() { continue; } - let idx_reader = fast_fields_composite + let fast_field_idx_file = fast_fields_composite .open_read_with_idx(field, 0) - .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) - .map(FastFieldReader::open)?; + .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?; + let idx_reader = FastFieldReader::open(fast_field_idx_file)?; let data = fast_fields_composite .open_read_with_idx(field, 1) .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?; + let bytes_fast_field_reader = BytesFastFieldReader::open(idx_reader, data)?; fast_field_readers .fast_bytes - .insert(field, BytesFastFieldReader::open(idx_reader, data)); + .insert(field, bytes_fast_field_reader); } else if let Some((fast_type, cardinality)) = type_and_cardinality(field_type) { match cardinality { Cardinality::SingleValue => { if let Some(fast_field_data) = fast_fields_composite.open_read(field) { match fast_type { FastType::U64 => { - let fast_field_reader = FastFieldReader::open(fast_field_data); + let fast_field_reader = FastFieldReader::open(fast_field_data)?; fast_field_readers .fast_field_u64 .insert(field, fast_field_reader); } FastType::I64 => { - fast_field_readers.fast_field_i64.insert( - field, - FastFieldReader::open(fast_field_data.clone()), - ); + let fast_field_reader = + FastFieldReader::open(fast_field_data.clone())?; + fast_field_readers + .fast_field_i64 + .insert(field, fast_field_reader); } FastType::F64 => { - fast_field_readers.fast_field_f64.insert( - field, - FastFieldReader::open(fast_field_data.clone()), - ); + let fast_field_reader = + FastFieldReader::open(fast_field_data.clone())?; + fast_field_readers 
+ .fast_field_f64 + .insert(field, fast_field_reader); } FastType::Date => { - fast_field_readers.fast_field_date.insert( - field, - FastFieldReader::open(fast_field_data.clone()), - ); + let fast_field_reader = + FastFieldReader::open(fast_field_data.clone())?; + fast_field_readers + .fast_field_date + .insert(field, fast_field_reader); } } } else { @@ -120,10 +124,10 @@ impl FastFieldReaders { let idx_opt = fast_fields_composite.open_read_with_idx(field, 0); let data_opt = fast_fields_composite.open_read_with_idx(field, 1); if let (Some(fast_field_idx), Some(fast_field_data)) = (idx_opt, data_opt) { - let idx_reader = FastFieldReader::open(fast_field_idx); + let idx_reader = FastFieldReader::open(fast_field_idx)?; match fast_type { FastType::I64 => { - let vals_reader = FastFieldReader::open(fast_field_data); + let vals_reader = FastFieldReader::open(fast_field_data)?; let multivalued_int_fast_field = MultiValueIntFastFieldReader::open(idx_reader, vals_reader); fast_field_readers @@ -131,7 +135,7 @@ impl FastFieldReaders { .insert(field, multivalued_int_fast_field); } FastType::U64 => { - let vals_reader = FastFieldReader::open(fast_field_data); + let vals_reader = FastFieldReader::open(fast_field_data)?; let multivalued_int_fast_field = MultiValueIntFastFieldReader::open(idx_reader, vals_reader); fast_field_readers @@ -139,7 +143,7 @@ impl FastFieldReaders { .insert(field, multivalued_int_fast_field); } FastType::F64 => { - let vals_reader = FastFieldReader::open(fast_field_data); + let vals_reader = FastFieldReader::open(fast_field_data)?; let multivalued_int_fast_field = MultiValueIntFastFieldReader::open(idx_reader, vals_reader); fast_field_readers @@ -147,7 +151,7 @@ impl FastFieldReaders { .insert(field, multivalued_int_fast_field); } FastType::Date => { - let vals_reader = FastFieldReader::open(fast_field_data); + let vals_reader = FastFieldReader::open(fast_field_data)?; let multivalued_int_fast_field = MultiValueIntFastFieldReader::open(idx_reader, vals_reader); fast_field_readers diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs index d798c8679..bd8ed6628 100644 --- a/src/fieldnorm/reader.rs +++ b/src/fieldnorm/reader.rs @@ -1,6 +1,7 @@ use super::{fieldnorm_to_id, id_to_fieldnorm}; use crate::common::CompositeFile; -use crate::directory::ReadOnlySource; +use crate::directory::FileSlice; +use crate::directory::OwnedBytes; use crate::schema::Field; use crate::space_usage::PerFieldSpaceUsage; use crate::DocId; @@ -19,16 +20,21 @@ pub struct FieldNormReaders { impl FieldNormReaders { /// Creates a field norm reader. - pub fn open(source: ReadOnlySource) -> crate::Result { - let data = CompositeFile::open(&source)?; + pub fn open(file: FileSlice) -> crate::Result { + let data = CompositeFile::open(&file)?; Ok(FieldNormReaders { data: Arc::new(data), }) } /// Returns the FieldNormReader for a specific field. - pub fn get_field(&self, field: Field) -> Option { - self.data.open_read(field).map(FieldNormReader::open) + pub fn get_field(&self, field: Field) -> crate::Result> { + if let Some(file) = self.data.open_read(field) { + let fieldnorm_reader = FieldNormReader::open(file)?; + Ok(Some(fieldnorm_reader)) + } else { + Ok(None) + } } /// Return a break down of the space usage per field. @@ -56,13 +62,14 @@ impl FieldNormReaders { /// in a very short array. #[derive(Clone)] pub struct FieldNormReader { - data: ReadOnlySource, + data: OwnedBytes, } impl FieldNormReader { - /// Opens a field norm reader given its data source. 
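`get_field` above now returns `crate::Result<Option<FieldNormReader>>`: `Err` is an I/O failure while opening the component, while `None` still means the field has no fieldnorms. Call sites layer `?` over the option, as in this crate-internal sketch (the helper is hypothetical, and it assumes `FieldNormReader::fieldnorm` returns the decoded `u32`):

```rust
use crate::fieldnorm::FieldNormReaders;
use crate::schema::Field;
use crate::DocId;

/// Hypothetical helper: fieldnorm for one doc, propagating I/O errors.
fn fieldnorm_for(
    readers: &FieldNormReaders,
    field: Field,
    doc: DocId,
) -> crate::Result<Option<u32>> {
    Ok(readers.get_field(field)?.map(|reader| reader.fieldnorm(doc)))
}
```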
- pub fn open(data: ReadOnlySource) -> Self { - FieldNormReader { data } + /// Opens a field norm reader given its file. + pub fn open(fieldnorm_file: FileSlice) -> crate::Result { + let data = fieldnorm_file.read_bytes()?; + Ok(FieldNormReader { data }) } /// Returns the number of documents in this segment. @@ -87,8 +94,7 @@ impl FieldNormReader { /// Returns the `fieldnorm_id` associated to a document. #[inline(always)] pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 { - let fielnorms_data = self.data.as_slice(); - fielnorms_data[doc_id as usize] + self.data.as_slice()[doc_id as usize] } /// Converts a `fieldnorm_id` into a fieldnorm. @@ -111,7 +117,7 @@ impl FieldNormReader { .cloned() .map(FieldNormReader::fieldnorm_to_id) .collect::>(); - let field_norms_data = ReadOnlySource::from(field_norms_id); + let field_norms_data = OwnedBytes::new(field_norms_id); FieldNormReader { data: field_norms_data, } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index b36ac331c..0ad6a9034 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -108,9 +108,9 @@ fn compute_deleted_bitset( // Limit doc helps identify the first document // that may be affected by the delete operation. let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp); - let inverted_index = segment_reader.inverted_index(delete_op.term.field()); + let inverted_index = segment_reader.inverted_index(delete_op.term.field())?; if let Some(mut docset) = - inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic) + inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)? { let mut deleted_doc = docset.doc(); while deleted_doc != TERMINATED { @@ -979,7 +979,7 @@ mod tests { let num_docs_containing = |s: &str| { let searcher = reader.searcher(); let term = Term::from_field_text(text_field, s); - searcher.doc_freq(&term) + searcher.doc_freq(&term).unwrap() }; { @@ -1015,7 +1015,7 @@ mod tests { .unwrap(); let num_docs_containing = |s: &str| { let term_a = Term::from_field_text(text_field, s); - reader.searcher().doc_freq(&term_a) + reader.searcher().doc_freq(&term_a).unwrap() }; { // writing the segment @@ -1110,6 +1110,7 @@ mod tests { .unwrap() .searcher() .doc_freq(&term_a) + .unwrap() }; assert_eq!(num_docs_containing("a"), 0); assert_eq!(num_docs_containing("b"), 100); @@ -1129,7 +1130,7 @@ mod tests { reader.reload().unwrap(); let searcher = reader.searcher(); let term = Term::from_field_text(text_field, s); - searcher.doc_freq(&term) + searcher.doc_freq(&term).unwrap() }; let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap(); @@ -1180,7 +1181,15 @@ mod tests { // working with an empty index == no documents let term_b = Term::from_field_text(text_field, "b"); - assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_b), 0); + assert_eq!( + index + .reader() + .unwrap() + .searcher() + .doc_freq(&term_b) + .unwrap(), + 0 + ); } #[test] @@ -1200,7 +1209,15 @@ mod tests { let term_a = Term::from_field_text(text_field, "a"); // expect the document with that term to be in the index - assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_a), 1); + assert_eq!( + index + .reader() + .unwrap() + .searcher() + .doc_freq(&term_a) + .unwrap(), + 1 + ); } #[test] @@ -1226,7 +1243,15 @@ mod tests { // Find original docs in the index let term_a = Term::from_field_text(text_field, "a"); // expect the document with that term to be in the index - assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_a), 1); + assert_eq!( + index 
+ .reader() + .unwrap() + .searcher() + .doc_freq(&term_a) + .unwrap(), + 1 + ); } #[test] diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 9acf7f18c..6ad3f61e7 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -38,7 +38,7 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::R count[fieldnorm_id as usize] += 1; } } else { - total_tokens += reader.inverted_index(field).total_num_tokens(); + total_tokens += reader.inverted_index(field)?.total_num_tokens(); } } Ok(total_tokens @@ -510,7 +510,7 @@ impl IndexMerger { .readers .iter() .map(|reader| reader.inverted_index(indexed_field)) - .collect(); + .collect::>>()?; for field_reader in &field_readers { let terms = field_reader.terms(); @@ -583,8 +583,8 @@ impl IndexMerger { let term_info = heap_item.streamer.value(); let segment_reader = &self.readers[heap_item.segment_ord]; let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord]; - let segment_postings = - inverted_index.read_postings_from_terminfo(term_info, segment_postings_option); + let segment_postings = inverted_index + .read_postings_from_terminfo(term_info, segment_postings_option)?; let delete_bitset_opt = segment_reader.delete_bitset(); let doc_freq = if let Some(delete_bitset) = delete_bitset_opt { segment_postings.doc_freq_given_deletes(delete_bitset) @@ -653,7 +653,7 @@ impl IndexMerger { ) -> crate::Result> { let mut term_ordinal_mappings = HashMap::new(); for (field, field_entry) in self.schema.fields() { - let fieldnorm_reader = fieldnorm_readers.get_field(field); + let fieldnorm_reader = fieldnorm_readers.get_field(field)?; if field_entry.is_indexed() { if let Some(term_ordinal_mapping) = self.write_postings_for_field( field, @@ -670,7 +670,7 @@ impl IndexMerger { fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> crate::Result<()> { for reader in &self.readers { - let store_reader = reader.get_store_reader(); + let store_reader = reader.get_store_reader()?; if reader.num_deleted_docs() > 0 { for doc_id in reader.doc_ids_alive() { let doc = store_reader.get(doc_id)?; @@ -1533,7 +1533,7 @@ mod tests { let reader = index.reader()?; let searcher = reader.searcher(); let mut term_scorer = term_query - .specialized_weight(&searcher, true) + .specialized_weight(&searcher, true)? .specialized_scorer(searcher.segment_reader(0u32), 1.0)?; assert_eq!(term_scorer.doc(), 0); assert_nearly_equals!(term_scorer.block_max_score(), 0.0079681855); @@ -1548,7 +1548,7 @@ mod tests { assert_eq!(searcher.segment_readers().len(), 2); for segment_reader in searcher.segment_readers() { let mut term_scorer = term_query - .specialized_weight(&searcher, true) + .specialized_weight(&searcher, true)? .specialized_scorer(segment_reader, 1.0)?; // the difference compared to before is intrinsic to the bm25 formula. no worries there. for doc in segment_reader.doc_ids_alive() { @@ -1572,7 +1572,7 @@ mod tests { let segment_reader = searcher.segment_reader(0u32); let mut term_scorer = term_query - .specialized_weight(&searcher, true) + .specialized_weight(&searcher, true)? .specialized_scorer(segment_reader, 1.0)?; // the difference compared to before is intrinsic to the bm25 formula. no worries there.
for doc in segment_reader.doc_ids_alive() { diff --git a/src/lib.rs b/src/lib.rs index 6fd4ce2ad..0bbd683fd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -383,31 +383,23 @@ mod tests { let text_field = schema_builder.add_text_field("text", TEXT); let index = Index::create_in_ram(schema_builder.build()); let mut index_writer = index.writer_for_tests()?; - { - index_writer.add_document(doc!(text_field=>"a b c")); - index_writer.commit()?; - } - { - index_writer.add_document(doc!(text_field=>"a")); - index_writer.add_document(doc!(text_field=>"a a")); - index_writer.commit()?; - } - { - index_writer.add_document(doc!(text_field=>"c")); - index_writer.commit()?; - } - { - let reader = index.reader()?; - let searcher = reader.searcher(); - let term_a = Term::from_field_text(text_field, "a"); - assert_eq!(searcher.doc_freq(&term_a), 3); - let term_b = Term::from_field_text(text_field, "b"); - assert_eq!(searcher.doc_freq(&term_b), 1); - let term_c = Term::from_field_text(text_field, "c"); - assert_eq!(searcher.doc_freq(&term_c), 2); - let term_d = Term::from_field_text(text_field, "d"); - assert_eq!(searcher.doc_freq(&term_d), 0); - } + index_writer.add_document(doc!(text_field=>"a b c")); + index_writer.commit()?; + index_writer.add_document(doc!(text_field=>"a")); + index_writer.add_document(doc!(text_field=>"a a")); + index_writer.commit()?; + index_writer.add_document(doc!(text_field=>"c")); + index_writer.commit()?; + let reader = index.reader()?; + let searcher = reader.searcher(); + let term_a = Term::from_field_text(text_field, "a"); + assert_eq!(searcher.doc_freq(&term_a)?, 3); + let term_b = Term::from_field_text(text_field, "b"); + assert_eq!(searcher.doc_freq(&term_b)?, 1); + let term_c = Term::from_field_text(text_field, "c"); + assert_eq!(searcher.doc_freq(&term_c)?, 2); + let term_d = Term::from_field_text(text_field, "d"); + assert_eq!(searcher.doc_freq(&term_d)?, 0); Ok(()) } @@ -504,13 +496,13 @@ mod tests { reader.reload()?; let searcher = reader.searcher(); let segment_reader = searcher.segment_reader(0); - let inverted_index = segment_reader.inverted_index(text_field); + let inverted_index = segment_reader.inverted_index(text_field)?; assert!(inverted_index - .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)? .is_none()); { let mut postings = inverted_index - .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert!(advance_undeleted(&mut postings, segment_reader)); assert_eq!(postings.doc(), 5); @@ -518,7 +510,7 @@ mod tests { } { let mut postings = inverted_index - .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert!(advance_undeleted(&mut postings, segment_reader)); assert_eq!(postings.doc(), 3); @@ -540,14 +532,14 @@ mod tests { reader.reload()?; let searcher = reader.searcher(); let seg_reader = searcher.segment_reader(0); - let inverted_index = seg_reader.inverted_index(term_abcd.field()); + let inverted_index = seg_reader.inverted_index(term_abcd.field())?; assert!(inverted_index - .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)? 
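The churn in these tests is mechanical: `Searcher::doc_freq`, `SegmentReader::inverted_index`, and `read_postings` all became fallible in this patch. For a caller, the migration collapses into a `?` chain, sketched here (the helper is hypothetical):

```rust
use tantivy::schema::IndexRecordOption;
use tantivy::{DocSet, Searcher, Term};

/// Hypothetical helper: first matching doc in segment 0, if any.
fn first_doc(searcher: &Searcher, term: &Term) -> tantivy::Result<Option<u32>> {
    let segment_reader = searcher.segment_reader(0);
    // Both calls now surface I/O errors instead of panicking.
    let inverted_index = segment_reader.inverted_index(term.field())?;
    let postings = inverted_index.read_postings(term, IndexRecordOption::Basic)?;
    // A freshly opened postings cursor is positioned on its first doc.
    Ok(postings.map(|postings| postings.doc()))
}
```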
.is_none()); { let mut postings = inverted_index - .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert!(advance_undeleted(&mut postings, seg_reader)); assert_eq!(postings.doc(), 5); @@ -555,7 +547,7 @@ mod tests { } { let mut postings = inverted_index - .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert!(advance_undeleted(&mut postings, seg_reader)); assert_eq!(postings.doc(), 3); @@ -577,19 +569,19 @@ mod tests { reader.reload()?; let searcher = reader.searcher(); let segment_reader = searcher.segment_reader(0); - let inverted_index = segment_reader.inverted_index(term_abcd.field()); + let inverted_index = segment_reader.inverted_index(term_abcd.field())?; assert!(inverted_index - .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)? .is_none()); { let mut postings = inverted_index - .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert!(!advance_undeleted(&mut postings, segment_reader)); } { let mut postings = inverted_index - .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert!(advance_undeleted(&mut postings, segment_reader)); assert_eq!(postings.doc(), 3); @@ -599,7 +591,7 @@ mod tests { } { let mut postings = inverted_index - .read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert!(advance_undeleted(&mut postings, segment_reader)); assert_eq!(postings.doc(), 4); @@ -624,8 +616,8 @@ mod tests { let term = Term::from_field_u64(field, 1u64); let mut postings = searcher .segment_reader(0) - .inverted_index(term.field()) - .read_postings(&term, IndexRecordOption::Basic) + .inverted_index(term.field())? + .read_postings(&term, IndexRecordOption::Basic)? .unwrap(); assert_eq!(postings.doc(), 0); assert_eq!(postings.advance(), TERMINATED); @@ -648,8 +640,8 @@ mod tests { let term = Term::from_field_i64(value_field, negative_val); let mut postings = searcher .segment_reader(0) - .inverted_index(term.field()) - .read_postings(&term, IndexRecordOption::Basic) + .inverted_index(term.field())? + .read_postings(&term, IndexRecordOption::Basic)? .unwrap(); assert_eq!(postings.doc(), 0); assert_eq!(postings.advance(), TERMINATED); @@ -672,8 +664,8 @@ mod tests { let term = Term::from_field_f64(value_field, val); let mut postings = searcher .segment_reader(0) - .inverted_index(term.field()) - .read_postings(&term, IndexRecordOption::Basic) + .inverted_index(term.field())? + .read_postings(&term, IndexRecordOption::Basic)? 
.unwrap(); assert_eq!(postings.doc(), 0); assert_eq!(postings.advance(), TERMINATED); @@ -693,7 +685,7 @@ mod tests { let reader = index.reader()?; let searcher = reader.searcher(); let segment_reader = searcher.segment_reader(0); - let inverted_index = segment_reader.inverted_index(absent_field); //< should not panic + let inverted_index = segment_reader.inverted_index(absent_field)?; assert_eq!(inverted_index.terms().num_terms(), 0); Ok(()) } @@ -743,14 +735,14 @@ mod tests { let index_reader = index.reader()?; let searcher = index_reader.searcher(); let reader = searcher.segment_reader(0); - let inverted_index = reader.inverted_index(text_field); + let inverted_index = reader.inverted_index(text_field)?; let term_abcd = Term::from_field_text(text_field, "abcd"); assert!(inverted_index - .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)? .is_none()); let term_af = Term::from_field_text(text_field, "af"); let mut postings = inverted_index - .read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert_eq!(postings.doc(), 0); assert_eq!(postings.term_freq(), 3); diff --git a/src/positions/mod.rs b/src/positions/mod.rs index 46abdc3b0..8532b343d 100644 --- a/src/positions/mod.rs +++ b/src/positions/mod.rs @@ -38,11 +38,11 @@ const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) a pub mod tests { use super::PositionSerializer; - use crate::directory::ReadOnlySource; use crate::positions::reader::PositionReader; + use crate::{common::HasLen, directory::FileSlice}; use std::iter; - fn create_stream_buffer(vals: &[u32]) -> (ReadOnlySource, ReadOnlySource) { + fn create_stream_buffer(vals: &[u32]) -> (FileSlice, FileSlice) { let mut skip_buffer = vec![]; let mut stream_buffer = vec![]; { @@ -53,10 +53,7 @@ pub mod tests { } serializer.close().unwrap(); } - ( - ReadOnlySource::from(stream_buffer), - ReadOnlySource::from(skip_buffer), - ) + (FileSlice::new(stream_buffer), FileSlice::new(skip_buffer)) } #[test] @@ -65,7 +62,7 @@ pub mod tests { let (stream, skip) = create_stream_buffer(&v[..]); assert_eq!(skip.len(), 12); assert_eq!(stream.len(), 1168); - let mut position_reader = PositionReader::new(stream, skip, 0u64); + let mut position_reader = PositionReader::new(stream, skip, 0u64).unwrap(); for &n in &[1, 10, 127, 128, 130, 312] { let mut v = vec![0u32; n]; position_reader.read(0, &mut v[..]); @@ -81,7 +78,7 @@ pub mod tests { let (stream, skip) = create_stream_buffer(&v[..]); assert_eq!(skip.len(), 12); assert_eq!(stream.len(), 1168); - let mut position_reader = PositionReader::new(stream, skip, 0u64); + let mut position_reader = PositionReader::new(stream, skip, 0u64).unwrap(); for &offset in &[1u64, 10u64, 127u64, 128u64, 130u64, 312u64] { for &len in &[1, 10, 130, 500] { let mut v = vec![0u32; len]; @@ -100,7 +97,7 @@ pub mod tests { assert_eq!(skip.len(), 12); assert_eq!(stream.len(), 1168); - let mut position_reader = PositionReader::new(stream, skip, 0u64); + let mut position_reader = PositionReader::new(stream, skip, 0u64).unwrap(); let mut buf = [0u32; 7]; let mut c = 0; @@ -122,7 +119,7 @@ pub mod tests { let (stream, skip) = create_stream_buffer(&v[..]); assert_eq!(skip.len(), 15_749); assert_eq!(stream.len(), 4_987_872); - let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0); + let mut position_reader = PositionReader::new(stream.clone(), 
skip.clone(), 0).unwrap(); let mut buf = [0u32; 256]; position_reader.read(128, &mut buf); for i in 0..256 { @@ -142,7 +139,8 @@ pub mod tests { assert_eq!(skip.len(), 15_749); assert_eq!(stream.len(), 4_987_872); let mut buf = [0u32; 1]; - let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 200_000); + let mut position_reader = + PositionReader::new(stream.clone(), skip.clone(), 200_000).unwrap(); position_reader.read(230, &mut buf); position_reader.read(9, &mut buf); } @@ -157,7 +155,7 @@ pub mod tests { } let (stream, skip) = create_stream_buffer(&v[..]); let mut buf = Vec::new(); - let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0); + let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0).unwrap(); let mut offset = 0; for i in 1..24 { buf.resize(i, 0); @@ -175,7 +173,7 @@ pub mod tests { let (stream, skip) = create_stream_buffer(&v[..]); assert_eq!(skip.len(), 15_749); assert_eq!(stream.len(), 1_000_000); - let mut position_reader = PositionReader::new(stream, skip, 128 * 1024); + let mut position_reader = PositionReader::new(stream, skip, 128 * 1024).unwrap(); let mut buf = [0u32; 1]; position_reader.read(0, &mut buf); assert_eq!(buf[0], CONST_VAL); @@ -194,7 +192,8 @@ pub mod tests { 128 * 1024 + 7, 128 * 10 * 1024 + 10, ] { - let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), offset); + let mut position_reader = + PositionReader::new(stream.clone(), skip.clone(), offset).unwrap(); let mut buf = [0u32; 1]; position_reader.read(0, &mut buf); assert_eq!(buf[0], offset as u32); diff --git a/src/positions/reader.rs b/src/positions/reader.rs index 1bcbf4d1e..37fdae396 100644 --- a/src/positions/reader.rs +++ b/src/positions/reader.rs @@ -1,8 +1,13 @@ +use std::io; + use crate::common::{BinarySerializable, FixedSize}; -use crate::directory::ReadOnlySource; +use crate::directory::FileSlice; +use crate::directory::OwnedBytes; use crate::positions::COMPRESSION_BLOCK_SIZE; use crate::positions::LONG_SKIP_INTERVAL; use crate::positions::LONG_SKIP_IN_BLOCKS; +use bitpacking::{BitPacker, BitPacker4x}; + /// Positions works as a long sequence of compressed block. /// All terms are chained one after the other. /// @@ -23,28 +28,28 @@ use crate::positions::LONG_SKIP_IN_BLOCKS; /// A given block obviously takes `(128 x num_bit_for_the_block / num_bits_in_a_byte)`, /// so skipping a block without decompressing it is just a matter of advancing that many /// bytes. 
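That last sentence is the key invariant of the positions format: a block of 128 positions packed at `num_bits` bits per value occupies exactly `128 * num_bits / 8` bytes, which is what lets the reader skip whole blocks by advancing a byte offset. A quick check of the arithmetic, using the same expression as the skip logic below:

```rust
const COMPRESSION_BLOCK_SIZE: usize = 128;

// Byte size of one packed block, as computed by the skip logic.
fn block_num_bytes(num_bits: u8) -> usize {
    num_bits as usize * COMPRESSION_BLOCK_SIZE / 8
}

fn main() {
    assert_eq!(block_num_bytes(0), 0); // constant block: nothing stored
    assert_eq!(block_num_bytes(5), 80); // 128 * 5 bits = 640 bits = 80 bytes
    assert_eq!(block_num_bytes(32), 512);
}
```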
-use bitpacking::{BitPacker, BitPacker4x}; -use owned_read::OwnedRead; struct Positions { bit_packer: BitPacker4x, - skip_source: ReadOnlySource, - position_source: ReadOnlySource, - long_skip_source: ReadOnlySource, + skip_file: FileSlice, + position_file: FileSlice, + long_skip_data: OwnedBytes, } impl Positions { - pub fn new(position_source: ReadOnlySource, skip_source: ReadOnlySource) -> Positions { - let (body, footer) = skip_source.split_from_end(u32::SIZE_IN_BYTES); - let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted"); - let (skip_source, long_skip_source) = + pub fn new(position_file: FileSlice, skip_file: FileSlice) -> io::Result<Positions> { + let (body, footer) = skip_file.split_from_end(u32::SIZE_IN_BYTES); + let footer_data = footer.read_bytes()?; + let num_long_skips = u32::deserialize(&mut footer_data.as_slice())?; + let (skip_file, long_skip_file) = body.split_from_end(u64::SIZE_IN_BYTES * (num_long_skips as usize)); - Positions { + let long_skip_data = long_skip_file.read_bytes()?; + Ok(Positions { bit_packer: BitPacker4x::new(), - skip_source, - long_skip_source, - position_source, - } + skip_file, + long_skip_data, + position_file, + }) } /// Returns the offset of the block associated to the given `long_skip_id`. @@ -54,19 +59,23 @@ impl Positions { if long_skip_id == 0 { return 0; } - let long_skip_slice = self.long_skip_source.as_slice(); + let long_skip_slice = self.long_skip_data.as_slice(); let mut long_skip_blocks: &[u8] = &long_skip_slice[(long_skip_id - 1) * 8..][..8]; u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") } - fn reader(&self, offset: u64) -> PositionReader { + fn reader(&self, offset: u64) -> io::Result<PositionReader> { let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize; let offset_num_bytes: u64 = self.long_skip(long_skip_id); - let mut position_read = OwnedRead::new(self.position_source.clone()); - position_read.advance(offset_num_bytes as usize); - let mut skip_read = OwnedRead::new(self.skip_source.clone()); - skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS); - PositionReader { + let position_read = self + .position_file + .slice_from(offset_num_bytes as usize) + .read_bytes()?; + let skip_read = self + .skip_file + .slice_from(long_skip_id * LONG_SKIP_IN_BLOCKS) + .read_bytes()?; + Ok(PositionReader { bit_packer: self.bit_packer, skip_read, position_read, @@ -74,14 +83,14 @@ block_offset: std::i64::MAX as u64, anchor_offset: (long_skip_id as u64) * LONG_SKIP_INTERVAL, abs_offset: offset, - } + }) } } #[derive(Clone)] pub struct PositionReader { - skip_read: OwnedRead, - position_read: OwnedRead, + skip_read: OwnedBytes, + position_read: OwnedBytes, bit_packer: BitPacker4x, buffer: Box<[u32; COMPRESSION_BLOCK_SIZE]>, @@ -93,11 +102,12 @@ pub struct PositionReader { impl PositionReader { pub fn new( - position_source: ReadOnlySource, - skip_source: ReadOnlySource, + position_file: FileSlice, + skip_file: FileSlice, offset: u64, - ) -> PositionReader { - Positions::new(position_source, skip_source).reader(offset) + ) -> io::Result<PositionReader> { + let positions = Positions::new(position_file, skip_file)?; + positions.reader(offset) } fn advance_num_blocks(&mut self, num_blocks: usize) { @@ -131,7 +141,7 @@ impl PositionReader { self.advance_num_blocks(num_blocks_to_skip); self.anchor_offset = offset - (offset % COMPRESSION_BLOCK_SIZE as u64); self.block_offset = self.anchor_offset; - let num_bits = self.skip_read.get(0); + let num_bits = self.skip_read.as_slice()[0]; self.bit_packer
.decompress(self.position_read.as_ref(), self.buffer.as_mut(), num_bits); } else { @@ -141,7 +151,7 @@ impl PositionReader { self.anchor_offset = self.block_offset; } - let mut num_bits = self.skip_read.get(0); + let mut num_bits = self.skip_read.as_slice()[0]; let mut position_data = self.position_read.as_ref(); for i in 1.. { @@ -155,7 +165,7 @@ impl PositionReader { output = &mut output[remaining_in_block..]; offset += remaining_in_block as u64; position_data = &position_data[(num_bits as usize * COMPRESSION_BLOCK_SIZE / 8)..]; - num_bits = self.skip_read.get(i); + num_bits = self.skip_read.as_slice()[i]; self.bit_packer .decompress(position_data, self.buffer.as_mut(), num_bits); self.block_offset += COMPRESSION_BLOCK_SIZE as u64; diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index 87d1714f5..849453af5 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -1,5 +1,8 @@ +use std::io; + use crate::common::{BinarySerializable, VInt}; -use crate::directory::ReadOnlySource; +use crate::directory::FileSlice; +use crate::directory::OwnedBytes; use crate::fieldnorm::FieldNormReader; use crate::postings::compression::{ AlignedBuffer, BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE, @@ -34,7 +37,7 @@ pub struct BlockSegmentPostings { doc_freq: u32, - data: ReadOnlySource, + data: OwnedBytes, pub(crate) skip_reader: SkipReader, } @@ -72,37 +75,34 @@ fn decode_vint_block( fn split_into_skips_and_postings( doc_freq: u32, - data: ReadOnlySource, -) -> (Option<ReadOnlySource>, ReadOnlySource) { + mut bytes: OwnedBytes, +) -> (Option<OwnedBytes>, OwnedBytes) { if doc_freq < COMPRESSION_BLOCK_SIZE as u32 { - return (None, data); + return (None, bytes); } - let mut data_byte_arr = data.as_slice(); - let skip_len = VInt::deserialize(&mut data_byte_arr) - .expect("Data corrupted") - .0 as usize; - let vint_len = data.len() - data_byte_arr.len(); - let (skip_data, postings_data) = data.slice_from(vint_len).split(skip_len); + let skip_len = VInt::deserialize(&mut bytes).expect("Data corrupted").0 as usize; + let (skip_data, postings_data) = bytes.split(skip_len); (Some(skip_data), postings_data) } impl BlockSegmentPostings { - pub(crate) fn from_data( + pub(crate) fn open( doc_freq: u32, - data: ReadOnlySource, + data: FileSlice, record_option: IndexRecordOption, requested_option: IndexRecordOption, - ) -> BlockSegmentPostings { + ) -> io::Result<BlockSegmentPostings> { let freq_reading_option = match (record_option, requested_option) { (IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq, (_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq, (_, _) => FreqReadingOption::ReadFreq, }; - let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data); + let (skip_data_opt, postings_data) = + split_into_skips_and_postings(doc_freq, data.read_bytes()?); let skip_reader = match skip_data_opt { Some(skip_data) => SkipReader::new(skip_data, doc_freq, record_option), - None => SkipReader::new(ReadOnlySource::empty(), doc_freq, record_option), + None => SkipReader::new(OwnedBytes::empty(), doc_freq, record_option), }; let mut block_segment_postings = BlockSegmentPostings { @@ -116,7 +116,7 @@ impl BlockSegmentPostings { skip_reader, }; block_segment_postings.load_block(); - block_segment_postings + Ok(block_segment_postings) } /// Returns the block_max_score for the current block. @@ -172,15 +172,15 @@ impl BlockSegmentPostings { // # Warning // // This does not reset the positions list.
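// NOTE (editor, not part of the patch): a sketch of the access pattern this
// change introduces, assuming the `FileSlice`/`OwnedBytes` API shown in the
// hunks above: a `FileSlice` is first narrowed to the byte range of interest,
// and only that range is loaded with `read_bytes()`, which is where the
// (possibly blocking) io happens.
//
// fn load_range(file: &FileSlice, from: usize, to: usize) -> io::Result<OwnedBytes> {
//     // Narrowing is cheap; no io is performed until `read_bytes()`.
//     file.slice(from, to).read_bytes()
// }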
- pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: ReadOnlySource) { + pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedBytes) { let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data); - self.data = ReadOnlySource::new(postings_data); + self.data = postings_data; self.block_max_score_cache = None; self.loaded_offset = std::usize::MAX; if let Some(skip_data) = skip_data_opt { self.skip_reader.reset(skip_data, doc_freq); } else { - self.skip_reader.reset(ReadOnlySource::empty(), doc_freq); + self.skip_reader.reset(OwnedBytes::empty(), doc_freq); } self.doc_freq = doc_freq; self.load_block(); @@ -344,8 +344,8 @@ impl BlockSegmentPostings { freq_reading_option: FreqReadingOption::NoFreq, block_max_score_cache: None, doc_freq: 0, - data: ReadOnlySource::new(vec![]), - skip_reader: SkipReader::new(ReadOnlySource::new(vec![]), 0, IndexRecordOption::Basic), + data: OwnedBytes::empty(), + skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic), } } } @@ -467,10 +467,12 @@ mod tests { index_writer.commit().unwrap(); let searcher = index.reader().unwrap().searcher(); let segment_reader = searcher.segment_reader(0); - let inverted_index = segment_reader.inverted_index(int_field); + let inverted_index = segment_reader.inverted_index(int_field).unwrap(); let term = Term::from_field_u64(int_field, 0u64); let term_info = inverted_index.get_term_info(&term).unwrap(); - inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic) + inverted_index + .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic) + .unwrap() } #[test] @@ -491,37 +493,38 @@ mod tests { } #[test] - fn test_reset_block_segment_postings() { + fn test_reset_block_segment_postings() -> crate::Result<()> { let mut schema_builder = Schema::builder(); let int_field = schema_builder.add_u64_field("id", INDEXED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_for_tests().unwrap(); + let mut index_writer = index.writer_for_tests()?; // create two postings lists, one containing even numbers, // the other containing odd numbers.
for i in 0..6 { let doc = doc!(int_field=> (i % 2) as u64); index_writer.add_document(doc); } - index_writer.commit().unwrap(); - let searcher = index.reader().unwrap().searcher(); + index_writer.commit()?; + let searcher = index.reader()?.searcher(); let segment_reader = searcher.segment_reader(0); let mut block_segments; { let term = Term::from_field_u64(int_field, 0u64); - let inverted_index = segment_reader.inverted_index(int_field); + let inverted_index = segment_reader.inverted_index(int_field)?; let term_info = inverted_index.get_term_info(&term).unwrap(); block_segments = inverted_index - .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic); + .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?; } assert_eq!(block_segments.docs(), &[0, 2, 4]); { let term = Term::from_field_u64(int_field, 1u64); - let inverted_index = segment_reader.inverted_index(int_field); + let inverted_index = segment_reader.inverted_index(int_field)?; let term_info = inverted_index.get_term_info(&term).unwrap(); - inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments); + inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments)?; } assert_eq!(block_segments.docs(), &[1, 3, 5]); + Ok(()) } } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 3f9cce259..d4c97962d 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -101,12 +101,12 @@ pub mod tests { index_writer.commit()?; let searcher = index.reader()?.searcher(); - let inverted_index = searcher.segment_reader(0u32).inverted_index(title); + let inverted_index = searcher.segment_reader(0u32).inverted_index(title)?; let term = Term::from_field_text(title, "abc"); let mut positions = Vec::new(); { let mut postings = inverted_index - .read_postings(&term, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert_eq!(postings.doc(), 0); postings.positions(&mut positions); @@ -120,7 +120,7 @@ pub mod tests { } { let mut postings = inverted_index - .read_postings(&term, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert_eq!(postings.doc(), 0); assert_eq!(postings.advance(), 1); @@ -129,7 +129,7 @@ pub mod tests { } { let mut postings = inverted_index - .read_postings(&term, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert_eq!(postings.seek(1), 1); assert_eq!(postings.doc(), 1); @@ -138,7 +138,7 @@ pub mod tests { } { let mut postings = inverted_index - .read_postings(&term, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert_eq!(postings.seek(1002), 1002); assert_eq!(postings.doc(), 1002); @@ -147,7 +147,7 @@ pub mod tests { } { let mut postings = inverted_index - .read_postings(&term, IndexRecordOption::WithFreqsAndPositions) + .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)? 
.unwrap(); assert_eq!(postings.seek(100), 100); assert_eq!(postings.seek(1002), 1002); @@ -159,7 +159,7 @@ pub mod tests { } #[test] - pub fn test_drop_token_that_are_too_long() { + pub fn test_drop_token_that_are_too_long() -> crate::Result<()> { let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect(); let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect(); exceeding_token_text.push_str(" hello"); @@ -184,7 +184,7 @@ pub mod tests { reader.reload().unwrap(); let searcher = reader.searcher(); let segment_reader = searcher.segment_reader(0u32); - let inverted_index = segment_reader.inverted_index(text_field); + let inverted_index = segment_reader.inverted_index(text_field)?; assert_eq!(inverted_index.terms().num_terms(), 1); let mut bytes = vec![]; assert!(inverted_index.terms().ord_to_term(0, &mut bytes)); @@ -196,12 +196,13 @@ pub mod tests { reader.reload().unwrap(); let searcher = reader.searcher(); let segment_reader = searcher.segment_reader(1u32); - let inverted_index = segment_reader.inverted_index(text_field); + let inverted_index = segment_reader.inverted_index(text_field)?; assert_eq!(inverted_index.terms().num_terms(), 1); let mut bytes = vec![]; assert!(inverted_index.terms().ord_to_term(0, &mut bytes)); assert_eq!(&bytes[..], ok_token_text.as_bytes()); } + Ok(()) } #[test] @@ -261,15 +262,15 @@ pub mod tests { { let term_a = Term::from_field_text(text_field, "abcdef"); assert!(segment_reader - .inverted_index(term_a.field()) - .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) + .inverted_index(term_a.field())? + .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)? .is_none()); } { let term_a = Term::from_field_text(text_field, "a"); let mut postings_a = segment_reader - .inverted_index(term_a.field()) - .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) + .inverted_index(term_a.field())? + .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert_eq!(postings_a.len(), 1000); assert_eq!(postings_a.doc(), 0); @@ -291,8 +292,8 @@ pub mod tests { { let term_e = Term::from_field_text(text_field, "e"); let mut postings_e = segment_reader - .inverted_index(term_e.field()) - .read_postings(&term_e, IndexRecordOption::WithFreqsAndPositions) + .inverted_index(term_e.field())? + .read_postings(&term_e, IndexRecordOption::WithFreqsAndPositions)? .unwrap(); assert_eq!(postings_e.len(), 1000 - 2); for i in 2u32..1000u32 { @@ -312,7 +313,7 @@ pub mod tests { } #[test] - pub fn test_position_and_fieldnorm2() { + pub fn test_position_and_fieldnorm2() -> crate::Result<()> { let mut positions: Vec = Vec::new(); let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); @@ -328,16 +329,17 @@ pub mod tests { let searcher = index.reader().unwrap().searcher(); let segment_reader = searcher.segment_reader(0); let mut postings = segment_reader - .inverted_index(text_field) - .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) + .inverted_index(text_field)? + .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)? 
.unwrap(); assert_eq!(postings.doc(), 1u32); postings.positions(&mut positions); assert_eq!(&positions[..], &[1u32, 4]); + Ok(()) } #[test] - fn test_skip_next() { + fn test_skip_next() -> crate::Result<()> { let term_0 = Term::from_field_u64(Field::from_field_id(0), 0); let term_1 = Term::from_field_u64(Field::from_field_id(0), 1); let term_2 = Term::from_field_u64(Field::from_field_id(0), 2); @@ -348,10 +350,9 @@ pub mod tests { let mut schema_builder = Schema::builder(); let value_field = schema_builder.add_u64_field("value", INDEXED); let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); { - let mut index_writer = index.writer_for_tests().unwrap(); + let mut index_writer = index.writer_for_tests()?; for i in 0u64..num_docs as u64 { let doc = doc!(value_field => 2u64, value_field => i % 2u64); index_writer.add_document(doc); @@ -360,15 +361,15 @@ pub mod tests { } index }; - let searcher = index.reader().unwrap().searcher(); + let searcher = index.reader()?.searcher(); let segment_reader = searcher.segment_reader(0); // check that the basic usage works for i in 0..num_docs - 1 { for j in i + 1..num_docs { let mut segment_postings = segment_reader - .inverted_index(term_2.field()) - .read_postings(&term_2, IndexRecordOption::Basic) + .inverted_index(term_2.field())? + .read_postings(&term_2, IndexRecordOption::Basic)? .unwrap(); assert_eq!(segment_postings.seek(i), i); assert_eq!(segment_postings.doc(), i); @@ -380,8 +381,8 @@ pub mod tests { { let mut segment_postings = segment_reader - .inverted_index(term_2.field()) - .read_postings(&term_2, IndexRecordOption::Basic) + .inverted_index(term_2.field())? + .read_postings(&term_2, IndexRecordOption::Basic)? .unwrap(); // check that `skip_next` advances the iterator @@ -400,8 +401,8 @@ pub mod tests { // check that filtering works { let mut segment_postings = segment_reader - .inverted_index(term_0.field()) - .read_postings(&term_0, IndexRecordOption::Basic) + .inverted_index(term_0.field())? + .read_postings(&term_0, IndexRecordOption::Basic)? .unwrap(); for i in 0..num_docs / 2 { @@ -410,8 +411,8 @@ pub mod tests { } let mut segment_postings = segment_reader - .inverted_index(term_0.field()) - .read_postings(&term_0, IndexRecordOption::Basic) + .inverted_index(term_0.field())? + .read_postings(&term_0, IndexRecordOption::Basic)? .unwrap(); for i in 0..num_docs / 2 - 1 { @@ -422,19 +423,19 @@ pub mod tests { // delete some of the documents { - let mut index_writer = index.writer_for_tests().unwrap(); + let mut index_writer = index.writer_for_tests()?; index_writer.delete_term(term_0); assert!(index_writer.commit().is_ok()); } - let searcher = index.reader().unwrap().searcher(); + let searcher = index.reader()?.searcher(); assert_eq!(searcher.segment_readers().len(), 1); let segment_reader = searcher.segment_reader(0); // make sure seeking still works for i in 0..num_docs { let mut segment_postings = segment_reader - .inverted_index(term_2.field()) - .read_postings(&term_2, IndexRecordOption::Basic) + .inverted_index(term_2.field())? + .read_postings(&term_2, IndexRecordOption::Basic)? .unwrap(); if i % 2 == 0 { @@ -450,8 +451,8 @@ pub mod tests { // now try with a longer sequence { let mut segment_postings = segment_reader - .inverted_index(term_2.field()) - .read_postings(&term_2, IndexRecordOption::Basic) + .inverted_index(term_2.field())? + .read_postings(&term_2, IndexRecordOption::Basic)? 
.unwrap(); let mut last = 2; // start from 5 to avoid seeking to 3 twice @@ -476,20 +477,19 @@ pub mod tests { // delete everything else { - let mut index_writer = index.writer_for_tests().unwrap(); + let mut index_writer = index.writer_for_tests()?; index_writer.delete_term(term_1); assert!(index_writer.commit().is_ok()); } - let searcher = index.reader().unwrap().searcher(); + let searcher = index.reader()?.searcher(); // finally, check that it's empty { - let searchable_segment_ids = index - .searchable_segment_ids() - .expect("could not get index segment ids"); + let searchable_segment_ids = index.searchable_segment_ids()?; assert!(searchable_segment_ids.is_empty()); assert_eq!(searcher.num_docs(), 0); } + Ok(()) } pub static TERM_A: Lazy = Lazy::new(|| { @@ -621,7 +621,7 @@ mod bench { b.iter(|| { let mut segment_postings = segment_reader .inverted_index(TERM_A.field()) - .read_postings(&*TERM_A, IndexRecordOption::Basic) + .read_postings(&*TERM_A, IndexRecordOption::Basic)? .unwrap(); while segment_postings.advance() != TERMINATED {} }); @@ -636,18 +636,22 @@ mod bench { let segment_postings_a = segment_reader .inverted_index(TERM_A.field()) .read_postings(&*TERM_A, IndexRecordOption::Basic) + .unwrap() .unwrap(); let segment_postings_b = segment_reader .inverted_index(TERM_B.field()) .read_postings(&*TERM_B, IndexRecordOption::Basic) + .unwrap() .unwrap(); let segment_postings_c = segment_reader .inverted_index(TERM_C.field()) .read_postings(&*TERM_C, IndexRecordOption::Basic) + .unwrap() .unwrap(); let segment_postings_d = segment_reader .inverted_index(TERM_D.field()) .read_postings(&*TERM_D, IndexRecordOption::Basic) + .unwrap() .unwrap(); let mut intersection = Intersection::new(vec![ segment_postings_a, @@ -668,6 +672,7 @@ mod bench { let mut segment_postings = segment_reader .inverted_index(TERM_A.field()) .read_postings(&*TERM_A, IndexRecordOption::Basic) + .unwrap() .unwrap(); let mut existing_docs = Vec::new(); diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 401ba0df1..36e46a0c5 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -161,7 +161,7 @@ impl MultiFieldPostingsWriter { } let postings_writer = &self.per_field_postings_writers[field.field_id() as usize]; - let fieldnorm_reader = fieldnorm_readers.get_field(field); + let fieldnorm_reader = fieldnorm_readers.get_field(field)?; let mut field_serializer = serializer.new_field( field, postings_writer.total_num_tokens(), diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 25c7a5ff4..89aa04bb6 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -12,7 +12,7 @@ use crate::postings::Postings; use crate::schema::IndexRecordOption; use crate::{DocId, TERMINATED}; -use crate::directory::ReadOnlySource; +use crate::directory::FileSlice; use crate::fastfield::DeleteBitSet; use crate::postings::BlockSegmentPostings; @@ -86,12 +86,13 @@ impl SegmentPostings { .close_term(docs.len() as u32) .expect("In memory Serialization should never fail."); } - let block_segment_postings = BlockSegmentPostings::from_data( + let block_segment_postings = BlockSegmentPostings::open( docs.len() as u32, - ReadOnlySource::from(buffer), + FileSlice::new(buffer), IndexRecordOption::Basic, IndexRecordOption::Basic, - ); + ) + .unwrap(); SegmentPostings::from_block_postings(block_segment_postings, None) } @@ -131,12 +132,13 @@ impl SegmentPostings { postings_serializer .close_term(doc_and_tfs.len() as u32) .unwrap(); 
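// NOTE (editor, not part of the patch): the construction pattern after the
// rename below, assuming the signature this diff gives `open` (it now returns
// an io::Result, so callers decide whether to propagate or unwrap):
//
// let postings = BlockSegmentPostings::open(
//     doc_freq,
//     FileSlice::new(buffer),        // an in-memory Vec<u8> wrapped as a file
//     IndexRecordOption::WithFreqs,  // what was written
//     IndexRecordOption::WithFreqs,  // what the caller wants decoded
// )?;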
- let block_segment_postings = BlockSegmentPostings::from_data( + let block_segment_postings = BlockSegmentPostings::open( doc_and_tfs.len() as u32, - ReadOnlySource::from(buffer), + FileSlice::new(buffer), IndexRecordOption::WithFreqs, IndexRecordOption::WithFreqs, - ); + ) + .unwrap(); SegmentPostings::from_block_postings(block_segment_postings, None) } @@ -204,7 +206,7 @@ impl DocSet for SegmentPostings { } /// Return the current document's `DocId`. - #[inline] + #[inline(always)] fn doc(&self) -> DocId { self.block_cursor.doc(self.cur) } diff --git a/src/postings/skip.rs b/src/postings/skip.rs index 8b3263886..0f90beff9 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -1,10 +1,9 @@ -use crate::common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable, VInt}; -use crate::directory::ReadOnlySource; +use crate::common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable}; +use crate::directory::OwnedBytes; use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE}; use crate::query::BM25Weight; use crate::schema::IndexRecordOption; use crate::{DocId, Score, TERMINATED}; -use owned_read::OwnedRead; pub struct SkipSerializer { buffer: Vec, @@ -62,7 +61,7 @@ impl SkipSerializer { pub(crate) struct SkipReader { last_doc_in_block: DocId, pub(crate) last_doc_in_previous_block: DocId, - owned_read: OwnedRead, + owned_read: OwnedBytes, skip_info: IndexRecordOption, byte_offset: usize, remaining_docs: u32, // number of docs remaining, including the @@ -93,7 +92,7 @@ impl Default for BlockInfo { } impl SkipReader { - pub fn new(data: ReadOnlySource, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader { + pub fn new(data: OwnedBytes, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader { let mut skip_reader = SkipReader { last_doc_in_block: if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 { 0 @@ -101,7 +100,7 @@ impl SkipReader { TERMINATED }, last_doc_in_previous_block: 0u32, - owned_read: OwnedRead::new(data), + owned_read: data, skip_info, block_info: BlockInfo::VInt { num_docs: doc_freq }, byte_offset: 0, @@ -114,14 +113,14 @@ impl SkipReader { skip_reader } - pub fn reset(&mut self, data: ReadOnlySource, doc_freq: u32) { + pub fn reset(&mut self, data: OwnedBytes, doc_freq: u32) { self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 { 0 } else { TERMINATED }; self.last_doc_in_previous_block = 0u32; - self.owned_read = OwnedRead::new(data); + self.owned_read = data; self.block_info = BlockInfo::VInt { num_docs: doc_freq }; self.byte_offset = 0; self.remaining_docs = doc_freq; @@ -154,17 +153,24 @@ impl SkipReader { self.position_offset } + #[inline(always)] pub fn byte_offset(&self) -> usize { self.byte_offset } fn read_block_info(&mut self) { - let doc_delta = u32::deserialize(&mut self.owned_read).expect("Skip data corrupted"); + let doc_delta = { + let bytes = self.owned_read.as_slice(); + let mut buf = [0; 4]; + buf.copy_from_slice(&bytes[..4]); + u32::from_le_bytes(buf) + }; self.last_doc_in_block += doc_delta as DocId; - let doc_num_bits = self.owned_read.get(0); + let doc_num_bits = self.owned_read.as_slice()[4]; + match self.skip_info { IndexRecordOption::Basic => { - self.owned_read.advance(1); + self.owned_read.advance(5); self.block_info = BlockInfo::BitPacked { doc_num_bits, tf_num_bits: 0, @@ -174,11 +180,11 @@ impl SkipReader { }; } IndexRecordOption::WithFreqs => { - let tf_num_bits = self.owned_read.get(1); - let block_wand_fieldnorm_id = self.owned_read.get(2); - let data = 
&self.owned_read.as_ref()[3..]; - let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(data); - self.owned_read.advance(3 + num_bytes); + let bytes = self.owned_read.as_slice(); + let tf_num_bits = bytes[5]; + let block_wand_fieldnorm_id = bytes[6]; + let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(&bytes[7..]); + self.owned_read.advance(7 + num_bytes); self.block_info = BlockInfo::BitPacked { doc_num_bits, tf_num_bits, @@ -188,13 +194,16 @@ }; } IndexRecordOption::WithFreqsAndPositions => { - let tf_num_bits = self.owned_read.get(1); - self.owned_read.advance(2); - let tf_sum = u32::deserialize(&mut self.owned_read).expect("Failed reading tf_sum"); - let block_wand_fieldnorm_id = self.owned_read.get(0); - self.owned_read.advance(1); - let block_wand_term_freq = - VInt::deserialize_u64(&mut self.owned_read).unwrap() as u32; + let bytes = self.owned_read.as_slice(); + let tf_num_bits = bytes[5]; + let tf_sum = { + let mut buf = [0; 4]; + buf.copy_from_slice(&bytes[6..10]); + u32::from_le_bytes(buf) + }; + let block_wand_fieldnorm_id = bytes[10]; + let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(&bytes[11..]); + self.owned_read.advance(11 + num_bytes); self.block_info = BlockInfo::BitPacked { doc_num_bits, tf_num_bits, @@ -262,7 +271,7 @@ mod tests { use super::BlockInfo; use super::IndexRecordOption; use super::{SkipReader, SkipSerializer}; - use crate::directory::ReadOnlySource; + use crate::directory::OwnedBytes; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; #[test] @@ -278,11 +287,8 @@ skip_serializer.data().to_owned() }; let doc_freq = 3u32 + (COMPRESSION_BLOCK_SIZE * 2) as u32; - let mut skip_reader = SkipReader::new( - ReadOnlySource::new(buf), - doc_freq, - IndexRecordOption::WithFreqs, - ); + let mut skip_reader = + SkipReader::new(OwnedBytes::new(buf), doc_freq, IndexRecordOption::WithFreqs); assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!( skip_reader.block_info, @@ -323,11 +329,8 @@ skip_serializer.data().to_owned() }; let doc_freq = 3u32 + (COMPRESSION_BLOCK_SIZE * 2) as u32; - let mut skip_reader = SkipReader::new( - ReadOnlySource::from(buf), - doc_freq, - IndexRecordOption::Basic, - ); + let mut skip_reader = + SkipReader::new(OwnedBytes::new(buf), doc_freq, IndexRecordOption::Basic); assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!( skip_reader.block_info(), @@ -367,11 +370,8 @@ skip_serializer.data().to_owned() }; let doc_freq = COMPRESSION_BLOCK_SIZE as u32; - let mut skip_reader = SkipReader::new( - ReadOnlySource::from(buf), - doc_freq, - IndexRecordOption::Basic, - ); + let mut skip_reader = + SkipReader::new(OwnedBytes::new(buf), doc_freq, IndexRecordOption::Basic); assert_eq!(skip_reader.last_doc_in_block(), 1u32); assert_eq!( skip_reader.block_info(), diff --git a/src/postings/stacker/expull.rs b/src/postings/stacker/expull.rs index ba44505a2..0510eac64 100644 --- a/src/postings/stacker/expull.rs +++ b/src/postings/stacker/expull.rs @@ -206,8 +206,8 @@ mod tests { fn test_stack_long() { let mut heap = MemoryArena::new(); let mut stack = ExpUnrolledLinkedList::new(); - let source: Vec<u32> = (0..100).collect(); - for &el in &source { + let data: Vec<u32> = (0..100).collect(); + for &el in &data { assert!(stack .writer(&mut heap) .write_u32::<LittleEndian>(el) .is_ok()); @@ -221,7 +221,7 @@ result.push(LittleEndian::read_u32(&remaining[..4])); remaining = &remaining[4..]; } - assert_eq!(&result[..], &source[..]); + assert_eq!(&result[..], &data[..]); }
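// NOTE (editor, not part of the patch): the `WithFreqs` skip-entry layout
// implied by the `read_block_info` changes earlier in this diff, now decoded
// with plain slice indexing on `OwnedBytes`:
//
//   bytes [0..4]   doc_delta, little-endian u32
//   byte  [4]      doc_num_bits
//   byte  [5]      tf_num_bits
//   byte  [6]      block_wand_fieldnorm_id
//   bytes [7..]    block_wand_term_freq, vint-encoded
//
// fn read_doc_delta(entry: &[u8]) -> u32 {
//     let mut buf = [0u8; 4];
//     buf.copy_from_slice(&entry[..4]);
//     u32::from_le_bytes(buf)
// }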
#[test] diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index f25375384..83ee9c88e 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -42,13 +42,13 @@ where fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); - let inverted_index = reader.inverted_index(self.field); + let inverted_index = reader.inverted_index(self.field)?; let term_dict = inverted_index.terms(); let mut term_stream = self.automaton_stream(term_dict); while term_stream.advance() { let term_info = term_stream.value(); let mut block_segment_postings = inverted_index - .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic); + .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?; loop { let docs = block_segment_postings.docs(); if docs.is_empty() { diff --git a/src/query/bm25.rs b/src/query/bm25.rs index 131115bbc..0bb1b4f08 100644 --- a/src/query/bm25.rs +++ b/src/query/bm25.rs @@ -52,7 +52,7 @@ impl BM25Weight { } } - pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> BM25Weight { + pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> crate::Result { assert!(!terms.is_empty(), "BM25 requires at least one term"); let field = terms[0].field(); for term in &terms[1..] { @@ -66,25 +66,27 @@ impl BM25Weight { let mut total_num_tokens = 0u64; let mut total_num_docs = 0u64; for segment_reader in searcher.segment_readers() { - let inverted_index = segment_reader.inverted_index(field); + let inverted_index = segment_reader.inverted_index(field)?; total_num_tokens += inverted_index.total_num_tokens(); total_num_docs += u64::from(segment_reader.max_doc()); } let average_fieldnorm = total_num_tokens as Score / total_num_docs as Score; if terms.len() == 1 { - let term_doc_freq = searcher.doc_freq(&terms[0]); - BM25Weight::for_one_term(term_doc_freq, total_num_docs, average_fieldnorm) + let term_doc_freq = searcher.doc_freq(&terms[0])?; + Ok(BM25Weight::for_one_term( + term_doc_freq, + total_num_docs, + average_fieldnorm, + )) } else { - let idf = terms - .iter() - .map(|term| { - let term_doc_freq = searcher.doc_freq(term); - idf(term_doc_freq, total_num_docs) - }) - .sum::(); - let idf_explain = Explanation::new("idf", idf); - BM25Weight::new(idf_explain, average_fieldnorm) + let mut idf_sum: Score = 0.0; + for term in terms { + let term_doc_freq = searcher.doc_freq(term)?; + idf_sum += idf(term_doc_freq, total_num_docs); + } + let idf_explain = Explanation::new("idf", idf_sum); + Ok(BM25Weight::new(idf_explain, average_fieldnorm)) } } diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index 1032337d4..601b662d7 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -95,7 +95,7 @@ impl PhraseQuery { ))); } let terms = self.phrase_terms(); - let bm25_weight = BM25Weight::for_terms(searcher, &terms); + let bm25_weight = BM25Weight::for_terms(searcher, &terms)?; Ok(PhraseWeight::new( self.phrase_terms.clone(), bm25_weight, diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index 063ad763b..1f60275ea 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -48,8 +48,8 @@ impl PhraseWeight { let mut term_postings_list = Vec::new(); for &(offset, ref term) in &self.phrase_terms { if let Some(postings) = reader - .inverted_index(term.field()) - 
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions) + .inverted_index(term.field())? + .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)? { term_postings_list.push((offset, postings)); } else { @@ -66,8 +66,8 @@ let mut term_postings_list = Vec::new(); for &(offset, ref term) in &self.phrase_terms { if let Some(postings) = reader - .inverted_index(term.field()) - .read_postings_no_deletes(&term, IndexRecordOption::WithFreqsAndPositions) + .inverted_index(term.field())? + .read_postings_no_deletes(&term, IndexRecordOption::WithFreqsAndPositions)? { term_postings_list.push((offset, postings)); } else { diff --git a/src/query/range_query.rs b/src/query/range_query.rs index 936ea33e9..7d78bf4f2 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -296,13 +296,13 @@ impl Weight for RangeWeight { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); - let inverted_index = reader.inverted_index(self.field); + let inverted_index = reader.inverted_index(self.field)?; let term_dict = inverted_index.terms(); let mut term_range = self.term_range(term_dict); while term_range.advance() { let term_info = term_range.value(); let mut block_segment_postings = inverted_index - .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic); + .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?; loop { let docs = block_segment_postings.docs(); if docs.is_empty() { diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index 0fb8c596f..a653be561 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -87,21 +87,31 @@ impl TermQuery { /// While `.weight(...)` returns a boxed trait object, /// this method returns a specific implementation. /// This is useful for optimization purposes.
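// NOTE (editor, not part of the patch): after the change below,
// `specialized_weight` is fallible, so call sites thread the error through
// instead of panicking; a sketch, assuming a `TermQuery` and `Searcher` in
// scope:
//
// let term_weight: TermWeight = term_query.specialized_weight(&searcher, true)?;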
- pub fn specialized_weight(&self, searcher: &Searcher, scoring_enabled: bool) -> TermWeight { + pub fn specialized_weight( + &self, + searcher: &Searcher, + scoring_enabled: bool, + ) -> crate::Result { let term = self.term.clone(); - let bm25_weight = BM25Weight::for_terms(searcher, &[term]); + let bm25_weight = BM25Weight::for_terms(searcher, &[term])?; let index_record_option = if scoring_enabled { self.index_record_option } else { IndexRecordOption::Basic }; - TermWeight::new(self.term.clone(), index_record_option, bm25_weight) + Ok(TermWeight::new( + self.term.clone(), + index_record_option, + bm25_weight, + )) } } impl Query for TermQuery { fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> crate::Result> { - Ok(Box::new(self.specialized_weight(searcher, scoring_enabled))) + Ok(Box::new( + self.specialized_weight(searcher, scoring_enabled)?, + )) } fn query_terms(&self, term_set: &mut BTreeSet) { term_set.insert(self.term.clone()); diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index 583b518fc..86552026e 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -253,7 +253,7 @@ mod tests { } fn test_block_wand_aux(term_query: &TermQuery, searcher: &Searcher) -> crate::Result<()> { - let term_weight = term_query.specialized_weight(&searcher, true); + let term_weight = term_query.specialized_weight(&searcher, true)?; for reader in searcher.segment_readers() { let mut block_max_scores = vec![]; let mut block_max_scores_b = vec![]; diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index adaf38a62..339cf2033 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -36,11 +36,9 @@ impl Weight for TermWeight { Ok(self.scorer(reader, 1.0)?.count(delete_bitset)) } else { let field = self.term.field(); - Ok(reader - .inverted_index(field) - .get_term_info(&self.term) - .map(|term_info| term_info.doc_freq) - .unwrap_or(0)) + let inv_index = reader.inverted_index(field)?; + let term_info = inv_index.get_term_info(&self.term); + Ok(term_info.map(|term_info| term_info.doc_freq).unwrap_or(0)) } } @@ -97,11 +95,11 @@ impl TermWeight { boost: Score, ) -> crate::Result { let field = self.term.field(); - let inverted_index = reader.inverted_index(field); + let inverted_index = reader.inverted_index(field)?; let fieldnorm_reader = reader.get_fieldnorms_reader(field)?; let similarity_weight = self.similarity_weight.boost_by(boost); let postings_opt: Option = - inverted_index.read_postings(&self.term, self.index_record_option); + inverted_index.read_postings(&self.term, self.index_record_option)?; if let Some(segment_postings) = postings_opt { Ok(TermScorer::new( segment_postings, diff --git a/src/reader/mod.rs b/src/reader/mod.rs index ff993de83..0a38c8cbb 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -9,8 +9,8 @@ use crate::directory::META_LOCK; use crate::Index; use crate::Searcher; use crate::SegmentReader; -use std::convert::TryInto; use std::sync::Arc; +use std::{convert::TryInto, io}; /// Defines when a new version of the index should be reloaded. /// @@ -138,11 +138,11 @@ impl InnerIndexReader { .collect::>()? 
}; let schema = self.index.schema(); - let searchers = std::iter::repeat_with(|| { + let searchers: Vec<Searcher> = std::iter::repeat_with(|| { Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone()) }) .take(self.num_searchers) - .collect(); + .collect::<io::Result<_>>()?; self.searcher_pool.publish_new_generation(searchers); Ok(()) } } diff --git a/src/schema/term.rs index 266c7b9fb..2d182caa5 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -152,7 +152,7 @@ impl<B> Term<B> where B: AsRef<[u8]>, { - /// Wraps a source of data + /// Wraps an object holding bytes pub fn wrap(data: B) -> Term<B> { Term(data) } } diff --git a/src/snippet/mod.rs index 59485ef8b..f71de1d9d 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -263,19 +263,17 @@ impl SnippetGenerator { ) -> crate::Result<SnippetGenerator> { let mut terms = BTreeSet::new(); query.query_terms(&mut terms); - let terms_text: BTreeMap<String, Score> = terms - .into_iter() - .filter(|term| term.field() == field) - .flat_map(|term| { - let doc_freq = searcher.doc_freq(&term); + let mut terms_text: BTreeMap<String, Score> = Default::default(); + for term in terms { + if term.field() != field { + continue; + } + let doc_freq = searcher.doc_freq(&term)?; + if doc_freq > 0 { let score = 1.0 / (1.0 + doc_freq as Score); - if doc_freq > 0 { - Some((term.text().to_string(), score)) - } else { - None - } - }) - .collect(); + terms_text.insert(term.text().to_string(), score); + } + } let tokenizer = searcher.index().tokenizer_for_field(field)?; Ok(SnippetGenerator { terms_text, diff --git a/src/space_usage/mod.rs index b09e5ed0a..3bad8f8b0 100644 --- a/src/space_usage/mod.rs +++ b/src/space_usage/mod.rs @@ -307,7 +307,7 @@ mod test { let index = Index::create_in_ram(schema.clone()); let reader = index.reader().unwrap(); let searcher = reader.searcher(); - let searcher_space_usage = searcher.space_usage(); + let searcher_space_usage = searcher.space_usage().unwrap(); assert_eq!(0, searcher_space_usage.total()); } @@ -346,7 +346,7 @@ let reader = index.reader().unwrap(); let searcher = reader.searcher(); - let searcher_space_usage = searcher.space_usage(); + let searcher_space_usage = searcher.space_usage().unwrap(); assert!(searcher_space_usage.total() > 0); assert_eq!(1, searcher_space_usage.segments().len()); @@ -386,7 +386,7 @@ let reader = index.reader().unwrap(); let searcher = reader.searcher(); - let searcher_space_usage = searcher.space_usage(); + let searcher_space_usage = searcher.space_usage().unwrap(); assert!(searcher_space_usage.total() > 0); assert_eq!(1, searcher_space_usage.segments().len()); @@ -425,7 +425,7 @@ } let reader = index.reader().unwrap(); let searcher = reader.searcher(); - let searcher_space_usage = searcher.space_usage(); + let searcher_space_usage = searcher.space_usage().unwrap(); assert!(searcher_space_usage.total() > 0); assert_eq!(1, searcher_space_usage.segments().len()); @@ -446,49 +446,47 @@ } #[test] - fn test_deletes() { + fn test_deletes() -> crate::Result<()> { let mut schema_builder = Schema::builder(); let name = schema_builder.add_u64_field("name", INDEXED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); { - let mut index_writer = index.writer_for_tests().unwrap(); + let mut index_writer = index.writer_for_tests()?; index_writer.add_document(doc!(name => 1u64)); index_writer.add_document(doc!(name => 2u64)); index_writer.add_document(doc!(name => 3u64)); index_writer.add_document(doc!(name =>
4u64)); - index_writer.commit().unwrap(); + index_writer.commit()?; } { - let mut index_writer2 = index.writer(50_000_000).unwrap(); + let mut index_writer2 = index.writer(50_000_000)?; index_writer2.delete_term(Term::from_field_u64(name, 2u64)); index_writer2.delete_term(Term::from_field_u64(name, 3u64)); - // ok, now we should have a deleted doc - index_writer2.commit().unwrap(); + index_writer2.commit()?; } - let reader = index.reader().unwrap(); + let reader = index.reader()?; let searcher = reader.searcher(); - let searcher_space_usage = searcher.space_usage(); + let searcher_space_usage = searcher.space_usage()?; assert!(searcher_space_usage.total() > 0); assert_eq!(1, searcher_space_usage.segments().len()); - let segment = &searcher_space_usage.segments()[0]; - assert!(segment.total() > 0); + let segment_space_usage = &searcher_space_usage.segments()[0]; + assert!(segment_space_usage.total() > 0); - assert_eq!(2, segment.num_docs()); + assert_eq!(2, segment_space_usage.num_docs()); - expect_single_field(segment.termdict(), &name, 1, 512); - expect_single_field(segment.postings(), &name, 1, 512); - assert_eq!(0, segment.positions().total()); - assert_eq!(0, segment.positions_skip_idx().total()); - assert_eq!(0, segment.fast_fields().total()); - expect_single_field(segment.fieldnorms(), &name, 1, 512); - // TODO: understand why the following fails - // assert_eq!(0, segment.store().total()); - assert!(segment.deletes() > 0); + expect_single_field(segment_space_usage.termdict(), &name, 1, 512); + expect_single_field(segment_space_usage.postings(), &name, 1, 512); + assert_eq!(0, segment_space_usage.positions().total()); + assert_eq!(0, segment_space_usage.positions_skip_idx().total()); + assert_eq!(0, segment_space_usage.fast_fields().total()); + expect_single_field(segment_space_usage.fieldnorms(), &name, 1, 512); + assert!(segment_space_usage.deletes() > 0); + Ok(()) } } diff --git a/src/store/mod.rs b/src/store/mod.rs index bb15301b7..ada5a76cd 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -103,19 +103,18 @@ pub mod tests { } #[test] - fn test_store() { + fn test_store() -> crate::Result<()> { let path = Path::new("store"); let mut directory = RAMDirectory::create(); - let store_file = directory.open_write(path).unwrap(); - let schema = write_lorem_ipsum_store(store_file, 1_000); + let store_wrt = directory.open_write(path)?; + let schema = write_lorem_ipsum_store(store_wrt, 1_000); let field_title = schema.get_field("title").unwrap(); - let store_source = directory.open_read(path).unwrap(); - let store = StoreReader::from_source(store_source); + let store_file = directory.open_read(path)?; + let store = StoreReader::open(store_file)?; for i in 0..1_000 { assert_eq!( *store - .get(i) - .unwrap() + .get(i)? 
.get_first(field_title) .unwrap() .text() .unwrap(), format!("Doc {}", i) ); } + Ok(()) } } @@ -152,8 +152,8 @@ mod bench { let mut directory = RAMDirectory::create(); let path = Path::new("store"); write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000); - let store_source = directory.open_read(path).unwrap(); - let store = StoreReader::from_source(store_source); + let store_file = directory.open_read(path).unwrap(); + let store = StoreReader::open(store_file).unwrap(); b.iter(|| { store.get(12).unwrap(); }); diff --git a/src/store/reader.rs index c6d0dc000..cedcfd28a 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -1,8 +1,8 @@ use super::decompress; use super::skiplist::SkipList; -use crate::common::BinarySerializable; use crate::common::VInt; -use crate::directory::ReadOnlySource; +use crate::common::{BinarySerializable, HasLen}; +use crate::directory::{FileSlice, OwnedBytes}; use crate::schema::Document; use crate::space_usage::StoreSpaceUsage; use crate::DocId; @@ -13,8 +13,8 @@ use std::mem::size_of; /// Reads document off tantivy's [`Store`](./index.html) #[derive(Clone)] pub struct StoreReader { - data: ReadOnlySource, - offset_index_source: ReadOnlySource, + data: FileSlice, + offset_index_file: OwnedBytes, current_block_offset: RefCell<usize>, current_block: RefCell<Vec<u8>>, max_doc: DocId, } impl StoreReader { /// Opens a store reader - pub fn from_source(data: ReadOnlySource) -> StoreReader { - let (data_source, offset_index_source, max_doc) = split_source(data); - StoreReader { - data: data_source, - offset_index_source, + pub fn open(store_file: FileSlice) -> io::Result<StoreReader> { + let (data_file, offset_index_file, max_doc) = split_file(store_file)?; + Ok(StoreReader { + data: data_file, + offset_index_file: offset_index_file.read_bytes()?, current_block_offset: RefCell::new(usize::max_value()), current_block: RefCell::new(Vec::new()), max_doc, - } + }) } pub(crate) fn block_index(&self) -> SkipList<'_, u64> { - SkipList::from(self.offset_index_source.as_slice()) + SkipList::from(self.offset_index_file.as_slice()) } fn block_offset(&self, doc_id: DocId) -> (DocId, u64) { @@ -44,23 +45,22 @@ .unwrap_or((0u32, 0u64)) } - pub(crate) fn block_data(&self) -> &[u8] { - self.data.as_slice() + pub(crate) fn block_data(&self) -> io::Result<OwnedBytes> { + self.data.read_bytes() } - fn compressed_block(&self, addr: usize) -> &[u8] { - let total_buffer = self.data.as_slice(); - let mut buffer = &total_buffer[addr..]; - let block_len = u32::deserialize(&mut buffer).expect("") as usize; - &buffer[..block_len] + fn compressed_block(&self, addr: usize) -> io::Result<OwnedBytes> { + let (block_len_bytes, block_body) = self.data.slice_from(addr).split(4); + let block_len = u32::deserialize(&mut block_len_bytes.read_bytes()?)?; + block_body.slice_to(block_len as usize).read_bytes() } fn read_block(&self, block_offset: usize) -> io::Result<()> { if block_offset != *self.current_block_offset.borrow() { let mut current_block_mut = self.current_block.borrow_mut(); current_block_mut.clear(); - let compressed_block = self.compressed_block(block_offset); - decompress(compressed_block, &mut current_block_mut)?; + let compressed_block = self.compressed_block(block_offset)?; + decompress(compressed_block.as_slice(), &mut current_block_mut)?; *self.current_block_offset.borrow_mut() = block_offset; } Ok(()) } @@ -89,21 +89,21 @@ /// Summarize total space usage of this store
reader. pub fn space_usage(&self) -> StoreSpaceUsage { - StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len()) + StoreSpaceUsage::new(self.data.len(), self.offset_index_file.len()) } } -fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) { +fn split_file(data: FileSlice) -> io::Result<(FileSlice, FileSlice, DocId)> { let data_len = data.len(); let footer_offset = data_len - size_of::<u64>() - size_of::<u32>(); - let serialized_offset: ReadOnlySource = data.slice(footer_offset, data_len); + let serialized_offset: OwnedBytes = data.slice(footer_offset, data_len).read_bytes()?; let mut serialized_offset_buf = serialized_offset.as_slice(); - let offset = u64::deserialize(&mut serialized_offset_buf).unwrap(); + let offset = u64::deserialize(&mut serialized_offset_buf)?; let offset = offset as usize; - let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap(); - ( + let max_doc = u32::deserialize(&mut serialized_offset_buf)?; + Ok(( data.slice(0, offset), data.slice(offset, footer_offset), max_doc, - ) + )) } diff --git a/src/store/writer.rs index 5ddda2c7f..ea1b69835 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -75,7 +75,8 @@ impl StoreWriter { let start_offset = self.writer.written_bytes() as u64; // just bulk write all of the block of the given reader. - self.writer.write_all(store_reader.block_data())?; + self.writer + .write_all(store_reader.block_data()?.as_slice())?; // concatenate the index of the `store_reader`, after translating // its start doc id and its start file offset. diff --git a/src/termdict/mod.rs index fd1c4fa19..70136f4d2 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -36,9 +36,9 @@ pub use self::termdict::{TermDictionary, TermDictionaryBuilder}; mod tests { use super::{TermDictionary, TermDictionaryBuilder, TermStreamer}; use crate::core::Index; - use crate::directory::{Directory, RAMDirectory, ReadOnlySource}; + use crate::directory::{Directory, FileSlice, RAMDirectory}; use crate::postings::TermInfo; - use crate::schema::{Document, Schema, TEXT}; + use crate::schema::{Schema, TEXT}; use std::path::PathBuf; use std::str; @@ -59,7 +59,7 @@ } #[test] - fn test_term_ordinals() { + fn test_term_ordinals() -> crate::Result<()> { const COUNTRIES: [&'static str; 7] = [ "San Marino", "Serbia", @@ -72,42 +72,37 @@ let mut directory = RAMDirectory::create(); let path = PathBuf::from("TermDictionary"); { - let write = directory.open_write(&path).unwrap(); - let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap(); + let write = directory.open_write(&path)?; + let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?; for term in COUNTRIES.iter() { - term_dictionary_builder - .insert(term.as_bytes(), &make_term_info(0u64)) - .unwrap(); + term_dictionary_builder.insert(term.as_bytes(), &make_term_info(0u64))?; } - term_dictionary_builder.finish().unwrap(); + term_dictionary_builder.finish()?; } - let source = directory.open_read(&path).unwrap(); - let term_dict: TermDictionary = TermDictionary::from_source(&source); + let term_file = directory.open_read(&path)?; + let term_dict: TermDictionary = TermDictionary::open(term_file)?; for (term_ord, term) in COUNTRIES.iter().enumerate() { assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64); let mut bytes = vec![]; assert!(term_dict.ord_to_term(term_ord as u64, &mut bytes)); assert_eq!(bytes, term.as_bytes()); } + Ok(()) } #[test] - fn
test_term_dictionary_simple() { + fn test_term_dictionary_simple() -> crate::Result<()> { let mut directory = RAMDirectory::create(); let path = PathBuf::from("TermDictionary"); { - let write = directory.open_write(&path).unwrap(); - let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap(); - term_dictionary_builder - .insert("abc".as_bytes(), &make_term_info(34u64)) - .unwrap(); - term_dictionary_builder - .insert("abcd".as_bytes(), &make_term_info(346u64)) - .unwrap(); - term_dictionary_builder.finish().unwrap(); + let write = directory.open_write(&path)?; + let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?; + term_dictionary_builder.insert("abc".as_bytes(), &make_term_info(34u64))?; + term_dictionary_builder.insert("abcd".as_bytes(), &make_term_info(346u64))?; + term_dictionary_builder.finish()?; } - let source = directory.open_read(&path).unwrap(); - let term_dict: TermDictionary = TermDictionary::from_source(&source); + let file = directory.open_read(&path)?; + let term_dict: TermDictionary = TermDictionary::open(file)?; assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32); assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32); let mut stream = term_dict.stream(); @@ -130,43 +125,26 @@ mod tests { assert_eq!(stream.value().doc_freq, 346u32); } assert!(!stream.advance()); + Ok(()) } #[test] - fn test_term_iterator() { + fn test_term_iterator() -> crate::Result<()> { let mut schema_builder = Schema::builder(); let text_field = schema_builder.add_text_field("text", TEXT); let index = Index::create_in_ram(schema_builder.build()); { - let mut index_writer = index.writer_for_tests().unwrap(); - { - { - let mut doc = Document::default(); - doc.add_text(text_field, "a b d f"); - index_writer.add_document(doc); - } - index_writer.commit().unwrap(); - } - { - { - let mut doc = Document::default(); - doc.add_text(text_field, "a b c d f"); - index_writer.add_document(doc); - } - index_writer.commit().unwrap(); - } - { - { - let mut doc = Document::default(); - doc.add_text(text_field, "e f"); - index_writer.add_document(doc); - } - index_writer.commit().unwrap(); - } + let mut index_writer = index.writer_for_tests()?; + index_writer.add_document(doc!(text_field=>"a b d f")); + index_writer.commit()?; + index_writer.add_document(doc!(text_field=>"a b c d f")); + index_writer.commit()?; + index_writer.add_document(doc!(text_field => "e f")); + index_writer.commit()?; } - let searcher = index.reader().unwrap().searcher(); + let searcher = index.reader()?.searcher(); - let field_searcher = searcher.field(text_field); + let field_searcher = searcher.field(text_field)?; let mut term_it = field_searcher.terms(); let mut term_string = String::new(); while term_it.advance() { @@ -174,10 +152,11 @@ mod tests { term_string.push_str(str::from_utf8(term_it.key()).expect("test")); } assert_eq!(&*term_string, "abcdef"); + Ok(()) } #[test] - fn test_term_dictionary_stream() { + fn test_term_dictionary_stream() -> crate::Result<()> { let ids: Vec<_> = (0u32..10_000u32) .map(|i| (format!("doc{:0>6}", i), i)) .collect(); @@ -190,8 +169,8 @@ mod tests { } term_dictionary_builder.finish().unwrap() }; - let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(&source); + let term_file = FileSlice::new(buffer); + let term_dictionary: TermDictionary = TermDictionary::open(term_file)?; { let mut streamer = term_dictionary.stream(); let mut i = 0; @@ -203,28 +182,26 @@ mod tests { } } - let &(ref key, ref _v) = 
&ids[2047]; - term_dictionary.get(key.as_bytes()); + let &(ref key, ref val) = &ids[2047]; + assert_eq!( + term_dictionary.get(key.as_bytes()), + Some(make_term_info(*val as u64)) + ); + Ok(()) } #[test] - fn test_stream_high_range_prefix_suffix() { + fn test_stream_high_range_prefix_suffix() -> crate::Result<()> { let buffer: Vec = { let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); // term requires more than 16bits - term_dictionary_builder - .insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1)) - .unwrap(); - term_dictionary_builder - .insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2)) - .unwrap(); - term_dictionary_builder - .insert("abr", &make_term_info(2)) - .unwrap(); - term_dictionary_builder.finish().unwrap() + term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))?; + term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))?; + term_dictionary_builder.insert("abr", &make_term_info(2))?; + term_dictionary_builder.finish()? }; - let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(&source); + let term_dict_file = FileSlice::new(buffer); + let term_dictionary: TermDictionary = TermDictionary::open(term_dict_file)?; let mut kv_stream = term_dictionary.stream(); assert!(kv_stream.advance()); assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes()); @@ -235,10 +212,11 @@ mod tests { assert!(kv_stream.advance()); assert_eq!(kv_stream.key(), "abr".as_bytes()); assert!(!kv_stream.advance()); + Ok(()) } #[test] - fn test_stream_range() { + fn test_stream_range() -> crate::Result<()> { let ids: Vec<_> = (0u32..10_000u32) .map(|i| (format!("doc{:0>6}", i), i)) .collect(); @@ -252,9 +230,9 @@ mod tests { term_dictionary_builder.finish().unwrap() }; - let source = ReadOnlySource::from(buffer); + let file = FileSlice::new(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(&source); + let term_dictionary: TermDictionary = TermDictionary::open(file)?; { for i in (0..20).chain(6000..8_000) { let &(ref target_key, _) = &ids[i]; @@ -305,10 +283,11 @@ mod tests { } } } + Ok(()) } #[test] - fn test_empty_string() { + fn test_empty_string() -> crate::Result<()> { let buffer: Vec = { let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); term_dictionary_builder @@ -319,30 +298,29 @@ mod tests { .unwrap(); term_dictionary_builder.finish().unwrap() }; - let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(&source); + let file = FileSlice::new(buffer); + let term_dictionary: TermDictionary = TermDictionary::open(file)?; let mut stream = term_dictionary.stream(); assert!(stream.advance()); assert!(stream.key().is_empty()); assert!(stream.advance()); assert_eq!(stream.key(), &[1u8]); assert!(!stream.advance()); + Ok(()) } #[test] - fn test_stream_range_boundaries() { + fn test_stream_range_boundaries() -> crate::Result<()> { let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); + let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?; for i in 0u8..10u8 { let number_arr = [i; 1]; - term_dictionary_builder - .insert(&number_arr, &make_term_info(i as u64)) - .unwrap(); + term_dictionary_builder.insert(&number_arr, &make_term_info(i as u64))?; } - term_dictionary_builder.finish().unwrap() + term_dictionary_builder.finish()? 
         };
-        let source = ReadOnlySource::from(buffer);
-        let term_dictionary: TermDictionary = TermDictionary::from_source(&source);
+        let file = FileSlice::new(buffer);
+        let term_dictionary: TermDictionary = TermDictionary::open(file)?;

         let value_list = |mut streamer: TermStreamer<'_>, backwards: bool| {
             let mut res: Vec<u32> = vec![];
@@ -430,10 +408,11 @@
                 .into_stream();
             assert_eq!(value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
         }
+        Ok(())
     }

     #[test]
-    fn test_automaton_search() {
+    fn test_automaton_search() -> crate::Result<()> {
         use crate::query::DFAWrapper;
         use levenshtein_automata::LevenshteinAutomatonBuilder;

@@ -450,17 +429,15 @@
         let mut directory = RAMDirectory::create();
         let path = PathBuf::from("TermDictionary");
         {
-            let write = directory.open_write(&path).unwrap();
-            let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
+            let write = directory.open_write(&path)?;
+            let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?;
             for term in COUNTRIES.iter() {
-                term_dictionary_builder
-                    .insert(term.as_bytes(), &make_term_info(0u64))
-                    .unwrap();
+                term_dictionary_builder.insert(term.as_bytes(), &make_term_info(0u64))?;
             }
-            term_dictionary_builder.finish().unwrap();
+            term_dictionary_builder.finish()?;
         }
-        let source = directory.open_read(&path).unwrap();
-        let term_dict: TermDictionary = TermDictionary::from_source(&source);
+        let file = directory.open_read(&path)?;
+        let term_dict: TermDictionary = TermDictionary::open(file)?;

         // We can now build an entire dfa.
         let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true);
@@ -472,5 +449,6 @@
         assert!(range.advance());
         assert_eq!("Spain".as_bytes(), range.key());
         assert!(!range.advance());
+        Ok(())
     }
 }
diff --git a/src/termdict/term_info_store.rs b/src/termdict/term_info_store.rs
index cba4de2ca..b61c41251 100644
--- a/src/termdict/term_info_store.rs
+++ b/src/termdict/term_info_store.rs
@@ -1,8 +1,6 @@
-use crate::common::bitpacker::BitPacker;
 use crate::common::compute_num_bits;
-use crate::common::Endianness;
-use crate::common::{BinarySerializable, FixedSize};
-use crate::directory::ReadOnlySource;
+use crate::common::{bitpacker::BitPacker, BinarySerializable, FixedSize};
+use crate::directory::{FileSlice, OwnedBytes};
 use crate::postings::TermInfo;
 use crate::termdict::TermOrdinal;
 use byteorder::{ByteOrder, LittleEndian};
@@ -79,8 +77,8 @@

 pub struct TermInfoStore {
     num_terms: usize,
-    block_meta_source: ReadOnlySource,
-    term_info_source: ReadOnlySource,
+    block_meta_bytes: OwnedBytes,
+    term_info_bytes: OwnedBytes,
 }

 fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
@@ -105,35 +103,35 @@
 }

 impl TermInfoStore {
-    pub fn open(data: &ReadOnlySource) -> TermInfoStore {
-        let buffer = data.as_slice();
-        let len = Endianness::read_u64(&buffer[0..8]) as usize;
-        let num_terms = Endianness::read_u64(&buffer[8..16]) as usize;
-        let block_meta_source = data.slice(16, 16 + len);
-        let term_info_source = data.slice_from(16 + len);
-        TermInfoStore {
+    pub fn open(term_info_store_file: FileSlice) -> crate::Result<TermInfoStore> {
+        let (len_slice, main_slice) = term_info_store_file.split(16);
+        let mut bytes = len_slice.read_bytes()?;
+        let len = u64::deserialize(&mut bytes)? as usize;
+        let num_terms = u64::deserialize(&mut bytes)? as usize;
+        let (block_meta_file, term_info_file) = main_slice.split(len);
+        let term_info_bytes = term_info_file.read_bytes()?;
+        Ok(TermInfoStore {
             num_terms,
-            block_meta_source,
-            term_info_source,
-        }
+            block_meta_bytes: block_meta_file.read_bytes()?,
+            term_info_bytes,
+        })
     }

     pub fn get(&self, term_ord: TermOrdinal) -> TermInfo {
         let block_id = (term_ord as usize) / BLOCK_LEN;
-        let buffer = self.block_meta_source.as_slice();
+        let buffer = self.block_meta_bytes.as_slice();
         let mut block_data: &[u8] = &buffer[block_id * TermInfoBlockMeta::SIZE_IN_BYTES..];
         let term_info_block_data = TermInfoBlockMeta::deserialize(&mut block_data)
             .expect("Failed to deserialize terminfoblockmeta");
         let inner_offset = (term_ord as usize) % BLOCK_LEN;
         if inner_offset == 0 {
-            term_info_block_data.ref_term_info
-        } else {
-            let term_info_data = self.term_info_source.as_slice();
-            term_info_block_data.deserialize_term_info(
-                &term_info_data[term_info_block_data.offset as usize..],
-                inner_offset - 1,
-            )
+            return term_info_block_data.ref_term_info;
         }
+        let term_info_data = self.term_info_bytes.as_slice();
+        term_info_block_data.deserialize_term_info(
+            &term_info_data[term_info_block_data.offset as usize..],
+            inner_offset - 1,
+        )
     }

     pub fn num_terms(&self) -> usize {
@@ -263,7 +261,7 @@ mod tests {
     use crate::common::bitpacker::BitPacker;
     use crate::common::compute_num_bits;
     use crate::common::BinarySerializable;
-    use crate::directory::ReadOnlySource;
+    use crate::directory::FileSlice;
     use crate::postings::TermInfo;

     #[test]
@@ -309,7 +307,7 @@
     }

     #[test]
-    fn test_pack() {
+    fn test_pack() -> crate::Result<()> {
         let mut store_writer = TermInfoStoreWriter::new();
         let mut term_infos = vec![];
         for i in 0..1000 {
@@ -318,14 +316,15 @@
                 postings_offset: (i / 10) as u64,
                 positions_idx: (i * 7) as u64,
             };
-            store_writer.write_term_info(&term_info).unwrap();
+            store_writer.write_term_info(&term_info)?;
             term_infos.push(term_info);
         }
         let mut buffer = Vec::new();
-        store_writer.serialize(&mut buffer).unwrap();
-        let term_info_store = TermInfoStore::open(&ReadOnlySource::from(buffer));
+        store_writer.serialize(&mut buffer)?;
+        let term_info_store = TermInfoStore::open(FileSlice::new(buffer))?;
         for i in 0..1000 {
             assert_eq!(term_info_store.get(i as u64), term_infos[i]);
         }
+        Ok(())
     }
 }
diff --git a/src/termdict/termdict.rs b/src/termdict/termdict.rs
index 1330a11c3..9da4678d9 100644
--- a/src/termdict/termdict.rs
+++ b/src/termdict/termdict.rs
@@ -1,8 +1,8 @@
 use super::term_info_store::{TermInfoStore, TermInfoStoreWriter};
 use super::{TermStreamer, TermStreamerBuilder};
-use crate::common::BinarySerializable;
-use crate::common::CountingWriter;
-use crate::directory::ReadOnlySource;
+use crate::common::{BinarySerializable, CountingWriter};
+use crate::directory::{FileSlice, OwnedBytes};
+use crate::error::DataCorruption;
 use crate::postings::TermInfo;
 use crate::termdict::TermOrdinal;
 use once_cell::sync::Lazy;
@@ -86,17 +86,19 @@ where
     }
 }

-fn open_fst_index(source: ReadOnlySource) -> tantivy_fst::Map<ReadOnlySource> {
-    let fst = Fst::new(source).expect("FST data is corrupted");
-    tantivy_fst::Map::from(fst)
+fn open_fst_index(fst_file: FileSlice) -> crate::Result<tantivy_fst::Map<OwnedBytes>> {
+    let bytes = fst_file.read_bytes()?;
+    let fst = Fst::new(bytes)
+        .map_err(|err| DataCorruption::comment_only(format!("Fst data is corrupted: {:?}", err)))?;
+    Ok(tantivy_fst::Map::from(fst))
 }

-static EMPTY_DATA_SOURCE: Lazy<ReadOnlySource> = Lazy::new(|| {
+static EMPTY_TERM_DICT_FILE: Lazy<FileSlice> = Lazy::new(|| {
     let term_dictionary_data: Vec<u8> = TermDictionaryBuilder::create(Vec::<u8>::new())
         .expect("Creating a TermDictionaryBuilder in a Vec should never fail")
         .finish()
         .expect("Writing in a Vec should never fail");
-    ReadOnlySource::from(term_dictionary_data)
+    FileSlice::new(term_dictionary_data)
 });

 /// The term dictionary contains all of the terms in
@@ -106,31 +108,28 @@
 /// respective `TermOrdinal`. The `TermInfoStore` then makes it
 /// possible to fetch the associated `TermInfo`.
 pub struct TermDictionary {
-    fst_index: tantivy_fst::Map<ReadOnlySource>,
+    fst_index: tantivy_fst::Map<OwnedBytes>,
     term_info_store: TermInfoStore,
 }

 impl TermDictionary {
-    /// Opens a `TermDictionary` given a data source.
-    pub fn from_source(source: &ReadOnlySource) -> Self {
-        let total_len = source.len();
-        let length_offset = total_len - 8;
-        let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
-        let footer_size = u64::deserialize(&mut split_len_buffer)
-            .expect("Deserializing 8 bytes should always work") as usize;
-        let split_len = length_offset - footer_size;
-        let fst_source = source.slice(0, split_len);
-        let values_source = source.slice(split_len, length_offset);
-        let fst_index = open_fst_index(fst_source);
-        TermDictionary {
+    /// Opens a `TermDictionary`.
+    pub fn open(file: FileSlice) -> crate::Result<TermDictionary> {
+        let (main_slice, footer_len_slice) = file.split_from_end(8);
+        let mut footer_len_bytes = footer_len_slice.read_bytes()?;
+        let footer_size = u64::deserialize(&mut footer_len_bytes)?;
+        let (fst_file_slice, values_file_slice) = main_slice.split_from_end(footer_size as usize);
+        let fst_index = open_fst_index(fst_file_slice)?;
+        let term_info_store = TermInfoStore::open(values_file_slice)?;
+        Ok(TermDictionary {
             fst_index,
-            term_info_store: TermInfoStore::open(&values_source),
-        }
+            term_info_store,
+        })
     }

     /// Creates an empty term dictionary which contains no terms.
     pub fn empty() -> Self {
-        TermDictionary::from_source(&*EMPTY_DATA_SOURCE)
+        TermDictionary::open(EMPTY_TERM_DICT_FILE.clone()).unwrap()
     }

     /// Returns the number of terms in the dictionary.
diff --git a/tests/failpoints/mod.rs b/tests/failpoints/mod.rs
index 658fadbc1..f3cfab714 100644
--- a/tests/failpoints/mod.rs
+++ b/tests/failpoints/mod.rs
@@ -40,17 +40,17 @@ fn test_failpoints_managed_directory_gc_if_delete_fails() {
 }

 #[test]
-fn test_write_commit_fails() {
+fn test_write_commit_fails() -> tantivy::Result<()> {
     let _fail_scenario_guard = fail::FailScenario::setup();
     let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let index = Index::create_in_ram(schema_builder.build());
-    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+    let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
     for _ in 0..100 {
         index_writer.add_document(doc!(text_field => "a"));
     }
-    index_writer.commit().unwrap();
+    index_writer.commit()?;
     fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
     for _ in 0..100 {
         index_writer.add_document(doc!(text_field => "b"));
     }
@@ -59,8 +59,9 @@

     let num_docs_containing = |s: &str| {
         let term_a = Term::from_field_text(text_field, s);
-        index.reader().unwrap().searcher().doc_freq(&term_a)
+        index.reader()?.searcher().doc_freq(&term_a)
     };
-    assert_eq!(num_docs_containing("a"), 100);
-    assert_eq!(num_docs_containing("b"), 0);
+    assert_eq!(num_docs_containing("a")?, 100);
+    assert_eq!(num_docs_containing("b")?, 0);
+    Ok(())
 }
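
To make the new contract concrete for readers of this patch: a `FileSlice` can be narrowed any number of times for free (`split`, `split_from_end`), and only `read_bytes()` performs a blocking read, yielding an `OwnedBytes`. The sketch below shows the consumer-side shape that `TermDictionary::open` uses above. It is a minimal illustration, not code introduced by this patch: the function name and the [ body | 8-byte footer ] layout are assumptions made up for the example, and it assumes `FileSlice`/`OwnedBytes` are re-exported from `tantivy::directory` as the diff suggests.

    use tantivy::directory::{FileSlice, OwnedBytes};

    // Illustrative layout only: [ body bytes | 8-byte footer ].
    fn split_body_and_footer(file: FileSlice) -> tantivy::Result<(FileSlice, OwnedBytes)> {
        // Narrowing the slice performs no io at all.
        let (body, footer) = file.split_from_end(8);
        // The single blocking read spans 8 bytes, not the whole file.
        let footer_bytes = footer.read_bytes()?;
        Ok((body, footer_bytes))
    }

The same two-step shape, slice first and read last, appears in `TermInfoStore::open` above, which calls `split(16)` before touching any bytes with `read_bytes()`.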