Large API Change in the Directory API. (#901)

Tantivy used to assume that all files could somehow be memory mapped. After this change, `Directory` returns a `FileSlice` that can be reduced and eventually read into an `OwnedBytes` object. Long and blocking I/O operations are still required, but they do not span over the entire file.
This commit is contained in:
Paul Masurel
2020-10-08 16:36:51 +09:00
committed by GitHub
parent 579e3d1ed8
commit c23a03ad81
58 changed files with 1497 additions and 1117 deletions

View File

@@ -3,6 +3,7 @@ Tantivy 0.14.0
- Remove dependency to atomicwrites #833. (Implemented by @pmasurel upon suggestion and research from @asafigan.)
- Migrated tantivy error from the now deprecated `failure` crate to `thiserror` #760. (@hirevo)
- API Change. Accessing the typed value off a `Schema::Value` now returns an Option instead of panicking if the type does not match.
- Large API Change in the Directory API. Tantivy used to assume that all files could somehow be memory mapped. After this change, `Directory` returns a `FileSlice` that can be reduced and eventually read into an `OwnedBytes` object. Long and blocking I/O operations are still required, but they do not span over the entire file.
Tantivy 0.13.2
===================

View File

@@ -33,7 +33,6 @@ notify = {version="4", optional=true}
uuid = { version = "0.8", features = ["v4", "serde"] }
crossbeam = "0.7"
futures = {version = "0.3", features=["thread-pool"] }
owning_ref = "0.4"
tantivy-query-grammar = { version="0.14.0-dev", path="./query-grammar" }
stable_deref_trait = "1"
rust-stemmers = "1"
@@ -41,7 +40,6 @@ downcast-rs = "1"
bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
census = "0.4"
fnv = "1"
owned-read = "0.4"
thiserror = "1.0"
htmlescape = "0.3"
fail = "0.4"

View File

@@ -45,7 +45,7 @@ fn main() -> tantivy::Result<()> {
// Inverted index stands for the combination of
// - the term dictionary
// - the inverted lists associated to each terms and their positions
let inverted_index = segment_reader.inverted_index(title);
let inverted_index = segment_reader.inverted_index(title)?;
// A `Term` is a text token associated with a field.
// Let's go through all docs containing the term `title:the` and access their position
@@ -58,7 +58,7 @@ fn main() -> tantivy::Result<()> {
// If you don't need all this information, you may get better performance by decompressing less
// information.
if let Some(mut segment_postings) =
inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)
inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)?
{
// this buffer will be used to request for positions
let mut positions: Vec<u32> = Vec::with_capacity(100);
@@ -106,7 +106,7 @@ fn main() -> tantivy::Result<()> {
// Inverted index stands for the combination of
// - the term dictionary
// - the inverted lists associated to each terms and their positions
let inverted_index = segment_reader.inverted_index(title);
let inverted_index = segment_reader.inverted_index(title)?;
// This segment posting object is like a cursor over the documents matching the term.
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies
@@ -115,7 +115,7 @@ fn main() -> tantivy::Result<()> {
// If you don't need all this information, you may get better performance by decompressing less
// information.
if let Some(mut block_segment_postings) =
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)?
{
loop {
let docs = block_segment_postings.docs();

View File

@@ -7,7 +7,6 @@ use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentReader;
use crate::TantivyError;
use std::cmp::Ordering;
use std::collections::btree_map;
use std::collections::BTreeMap;
@@ -266,10 +265,7 @@ impl Collector for FacetCollector {
_: SegmentLocalId,
reader: &SegmentReader,
) -> crate::Result<FacetSegmentCollector> {
let field_name = reader.schema().get_field_name(self.field);
let facet_reader = reader.facet_reader(self.field).ok_or_else(|| {
TantivyError::SchemaError(format!("Field {:?} is not a facet field.", field_name))
})?;
let facet_reader = reader.facet_reader(self.field)?;
let mut collapse_mapping = Vec::new();
let mut counts = Vec::new();

View File

@@ -1,6 +1,7 @@
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
use std::io;
use std::ops::Deref;
use crate::directory::OwnedBytes;
pub(crate) struct BitPacker {
mini_buffer: u64,
@@ -60,20 +61,14 @@ impl BitPacker {
}
#[derive(Clone)]
pub struct BitUnpacker<Data>
where
Data: Deref<Target = [u8]>,
{
pub struct BitUnpacker {
num_bits: u64,
mask: u64,
data: Data,
data: OwnedBytes,
}
impl<Data> BitUnpacker<Data>
where
Data: Deref<Target = [u8]>,
{
pub fn new(data: Data, num_bits: u8) -> BitUnpacker<Data> {
impl BitUnpacker {
pub fn new(data: OwnedBytes, num_bits: u8) -> BitUnpacker {
let mask: u64 = if num_bits == 64 {
!0u64
} else {
@@ -90,7 +85,7 @@ where
if self.num_bits == 0 {
return 0u64;
}
let data: &[u8] = &*self.data;
let data: &[u8] = self.data.as_slice();
let num_bits = self.num_bits;
let mask = self.mask;
let addr_in_bits = idx * num_bits;
@@ -109,8 +104,9 @@ where
#[cfg(test)]
mod test {
use super::{BitPacker, BitUnpacker};
use crate::directory::OwnedBytes;
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker<Vec<u8>>, Vec<u64>) {
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>) {
let mut data = Vec::new();
let mut bitpacker = BitPacker::new();
let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
@@ -122,7 +118,7 @@ mod test {
}
bitpacker.close(&mut data).unwrap();
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
let bitunpacker = BitUnpacker::new(data, num_bits);
let bitunpacker = BitUnpacker::new(OwnedBytes::new(data), num_bits);
(bitunpacker, vals)
}

View File

@@ -1,14 +1,15 @@
use crate::common::BinarySerializable;
use crate::common::CountingWriter;
use crate::common::VInt;
use crate::directory::ReadOnlySource;
use crate::directory::FileSlice;
use crate::directory::{TerminatingWrite, WritePtr};
use crate::schema::Field;
use crate::space_usage::FieldUsage;
use crate::space_usage::PerFieldSpaceUsage;
use std::collections::HashMap;
use std::io::Write;
use std::io::{self, Read};
use std::io::{self, Read, Write};
use super::HasLen;
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
pub struct FileAddr {
@@ -103,25 +104,26 @@ impl<W: TerminatingWrite + Write> CompositeWrite<W> {
/// for each field.
#[derive(Clone)]
pub struct CompositeFile {
data: ReadOnlySource,
data: FileSlice,
offsets_index: HashMap<FileAddr, (usize, usize)>,
}
impl CompositeFile {
/// Opens a composite file stored in a given
/// `ReadOnlySource`.
pub fn open(data: &ReadOnlySource) -> io::Result<CompositeFile> {
/// `FileSlice`.
pub fn open(data: &FileSlice) -> io::Result<CompositeFile> {
let end = data.len();
let footer_len_data = data.slice_from(end - 4);
let footer_len_data = data.slice_from(end - 4).read_bytes()?;
let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize;
let footer_start = end - 4 - footer_len;
let footer_data = data.slice(footer_start, footer_start + footer_len);
let footer_data = data
.slice(footer_start, footer_start + footer_len)
.read_bytes()?;
let mut footer_buffer = footer_data.as_slice();
let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;
let mut file_addrs = vec![];
let mut offsets = vec![];
let mut field_index = HashMap::new();
let mut offset = 0;
@@ -150,19 +152,19 @@ impl CompositeFile {
pub fn empty() -> CompositeFile {
CompositeFile {
offsets_index: HashMap::new(),
data: ReadOnlySource::empty(),
data: FileSlice::empty(),
}
}
/// Returns the `ReadOnlySource` associated
/// Returns the `FileSlice` associated
/// to a given `Field` and stored in a `CompositeFile`.
pub fn open_read(&self, field: Field) -> Option<ReadOnlySource> {
pub fn open_read(&self, field: Field) -> Option<FileSlice> {
self.open_read_with_idx(field, 0)
}
/// Returns the `ReadOnlySource` associated
/// Returns the `FileSlice` associated
/// to a given `Field` and stored in a `CompositeFile`.
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<ReadOnlySource> {
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<FileSlice> {
self.offsets_index
.get(&FileAddr { field, idx })
.map(|&(from, to)| self.data.slice(from, to))
@@ -192,46 +194,44 @@ mod test {
use std::path::Path;
#[test]
fn test_composite_file() {
fn test_composite_file() -> crate::Result<()> {
let path = Path::new("test_path");
let mut directory = RAMDirectory::create();
{
let w = directory.open_write(path).unwrap();
let mut composite_write = CompositeWrite::wrap(w);
{
let mut write_0 = composite_write.for_field(Field::from_field_id(0u32));
VInt(32431123u64).serialize(&mut write_0).unwrap();
write_0.flush().unwrap();
}
{
let mut write_4 = composite_write.for_field(Field::from_field_id(4u32));
VInt(2).serialize(&mut write_4).unwrap();
write_4.flush().unwrap();
}
composite_write.close().unwrap();
let mut write_0 = composite_write.for_field(Field::from_field_id(0u32));
VInt(32431123u64).serialize(&mut write_0)?;
write_0.flush()?;
let mut write_4 = composite_write.for_field(Field::from_field_id(4u32));
VInt(2).serialize(&mut write_4)?;
write_4.flush()?;
composite_write.close()?;
}
{
let r = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&r).unwrap();
let r = directory.open_read(path)?;
let composite_file = CompositeFile::open(&r)?;
{
let file0 = composite_file
.open_read(Field::from_field_id(0u32))
.unwrap();
.unwrap()
.read_bytes()?;
let mut file0_buf = file0.as_slice();
let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0;
let payload_0 = VInt::deserialize(&mut file0_buf)?.0;
assert_eq!(file0_buf.len(), 0);
assert_eq!(payload_0, 32431123u64);
}
{
let file4 = composite_file
.open_read(Field::from_field_id(4u32))
.unwrap();
.unwrap()
.read_bytes()?;
let mut file4_buf = file4.as_slice();
let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0;
let payload_4 = VInt::deserialize(&mut file4_buf)?.0;
assert_eq!(file4_buf.len(), 0);
assert_eq!(payload_4, 2u64);
}
}
Ok(())
}
}

View File

@@ -1,5 +1,7 @@
use std::io;
use crate::common::BinarySerializable;
use crate::directory::ReadOnlySource;
use crate::directory::FileSlice;
use crate::positions::PositionReader;
use crate::postings::TermInfo;
use crate::postings::{BlockSegmentPostings, SegmentPostings};
@@ -14,7 +16,7 @@ use crate::termdict::TermDictionary;
///
/// It is safe to delete the segment associated to
/// an `InvertedIndexReader`. As long as it is open,
/// the `ReadOnlySource` it is relying on should
/// the `FileSlice` it is relying on should
/// stay available.
///
///
@@ -22,9 +24,9 @@ use crate::termdict::TermDictionary;
/// the `SegmentReader`'s [`.inverted_index(...)`] method
pub struct InvertedIndexReader {
termdict: TermDictionary,
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
positions_idx_source: ReadOnlySource,
postings_file_slice: FileSlice,
positions_file_slice: FileSlice,
positions_idx_file_slice: FileSlice,
record_option: IndexRecordOption,
total_num_tokens: u64,
}
@@ -33,22 +35,21 @@ impl InvertedIndexReader {
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symmetry
pub(crate) fn new(
termdict: TermDictionary,
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
positions_idx_source: ReadOnlySource,
postings_file_slice: FileSlice,
positions_file_slice: FileSlice,
positions_idx_file_slice: FileSlice,
record_option: IndexRecordOption,
) -> InvertedIndexReader {
let total_num_tokens_data = postings_source.slice(0, 8);
let mut total_num_tokens_cursor = total_num_tokens_data.as_slice();
let total_num_tokens = u64::deserialize(&mut total_num_tokens_cursor).unwrap_or(0u64);
InvertedIndexReader {
) -> io::Result<InvertedIndexReader> {
let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8);
let total_num_tokens = u64::deserialize(&mut total_num_tokens_slice.read_bytes()?)?;
Ok(InvertedIndexReader {
termdict,
postings_source: postings_source.slice_from(8),
positions_source,
positions_idx_source,
postings_file_slice: postings_body,
positions_file_slice,
positions_idx_file_slice,
record_option,
total_num_tokens,
}
})
}
/// Creates an empty `InvertedIndexReader` object, which
@@ -56,9 +57,9 @@ impl InvertedIndexReader {
pub fn empty(record_option: IndexRecordOption) -> InvertedIndexReader {
InvertedIndexReader {
termdict: TermDictionary::empty(),
postings_source: ReadOnlySource::empty(),
positions_source: ReadOnlySource::empty(),
positions_idx_source: ReadOnlySource::empty(),
postings_file_slice: FileSlice::empty(),
positions_file_slice: FileSlice::empty(),
positions_idx_file_slice: FileSlice::empty(),
record_option,
total_num_tokens: 0u64,
}
@@ -88,11 +89,12 @@ impl InvertedIndexReader {
&self,
term_info: &TermInfo,
block_postings: &mut BlockSegmentPostings,
) {
let offset = term_info.postings_offset as usize;
let end_source = self.postings_source.len();
let postings_slice = self.postings_source.slice(offset, end_source);
block_postings.reset(term_info.doc_freq, postings_slice);
) -> io::Result<()> {
let postings_slice = self
.postings_file_slice
.slice_from(term_info.postings_offset as usize);
block_postings.reset(term_info.doc_freq, postings_slice.read_bytes()?);
Ok(())
}
/// Returns a block postings given a `Term`.
@@ -103,9 +105,11 @@ impl InvertedIndexReader {
&self,
term: &Term,
option: IndexRecordOption,
) -> Option<BlockSegmentPostings> {
self.get_term_info(term)
) -> io::Result<Option<BlockSegmentPostings>> {
Ok(self
.get_term_info(term)
.map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option))
.transpose()?)
}
/// Returns a block postings given a `term_info`.
@@ -116,10 +120,10 @@ impl InvertedIndexReader {
&self,
term_info: &TermInfo,
requested_option: IndexRecordOption,
) -> BlockSegmentPostings {
) -> io::Result<BlockSegmentPostings> {
let offset = term_info.postings_offset as usize;
let postings_data = self.postings_source.slice_from(offset);
BlockSegmentPostings::from_data(
let postings_data = self.postings_file_slice.slice_from(offset);
BlockSegmentPostings::open(
term_info.doc_freq,
postings_data,
self.record_option,
@@ -135,20 +139,23 @@ impl InvertedIndexReader {
&self,
term_info: &TermInfo,
option: IndexRecordOption,
) -> SegmentPostings {
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
) -> io::Result<SegmentPostings> {
let block_postings = self.read_block_postings_from_terminfo(term_info, option)?;
let position_stream = {
if option.has_positions() {
let position_reader = self.positions_source.clone();
let skip_reader = self.positions_idx_source.clone();
let position_reader = self.positions_file_slice.clone();
let skip_reader = self.positions_idx_file_slice.clone();
let position_reader =
PositionReader::new(position_reader, skip_reader, term_info.positions_idx);
PositionReader::new(position_reader, skip_reader, term_info.positions_idx)?;
Some(position_reader)
} else {
None
}
};
SegmentPostings::from_block_postings(block_postings, position_stream)
Ok(SegmentPostings::from_block_postings(
block_postings,
position_stream,
))
}
/// Returns the total number of tokens recorded for all documents
@@ -167,24 +174,31 @@ impl InvertedIndexReader {
/// For instance, requesting `IndexRecordOption::Freq` for a
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
/// with `DocId`s and frequencies.
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
pub fn read_postings(
&self,
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<SegmentPostings>> {
self.get_term_info(term)
.map(move |term_info| self.read_postings_from_terminfo(&term_info, option))
.transpose()
}
pub(crate) fn read_postings_no_deletes(
&self,
term: &Term,
option: IndexRecordOption,
) -> Option<SegmentPostings> {
) -> io::Result<Option<SegmentPostings>> {
self.get_term_info(term)
.map(|term_info| self.read_postings_from_terminfo(&term_info, option))
.transpose()
}
/// Returns the number of documents containing the term.
pub fn doc_freq(&self, term: &Term) -> u32 {
self.get_term_info(term)
pub fn doc_freq(&self, term: &Term) -> io::Result<u32> {
Ok(self
.get_term_info(term)
.map(|term_info| term_info.doc_freq)
.unwrap_or(0u32)
.unwrap_or(0u32))
}
}

View File

@@ -11,8 +11,8 @@ use crate::store::StoreReader;
use crate::termdict::TermMerger;
use crate::DocAddress;
use crate::Index;
use std::fmt;
use std::sync::Arc;
use std::{fmt, io};
/// Holds a list of `SegmentReader`s ready for search.
///
@@ -32,17 +32,17 @@ impl Searcher {
schema: Schema,
index: Index,
segment_readers: Vec<SegmentReader>,
) -> Searcher {
let store_readers = segment_readers
) -> io::Result<Searcher> {
let store_readers: Vec<StoreReader> = segment_readers
.iter()
.map(SegmentReader::get_store_reader)
.collect();
Searcher {
.collect::<io::Result<Vec<_>>>()?;
Ok(Searcher {
schema,
index,
segment_readers,
store_readers,
}
})
}
/// Returns the `Index` associated to the `Searcher`
@@ -75,13 +75,14 @@ impl Searcher {
/// Return the overall number of documents containing
/// the given term.
pub fn doc_freq(&self, term: &Term) -> u64 {
self.segment_readers
.iter()
.map(|segment_reader| {
u64::from(segment_reader.inverted_index(term.field()).doc_freq(term))
})
.sum::<u64>()
pub fn doc_freq(&self, term: &Term) -> crate::Result<u64> {
let mut total_doc_freq = 0;
for segment_reader in &self.segment_readers {
let inverted_index = segment_reader.inverted_index(term.field())?;
let doc_freq = inverted_index.doc_freq(term)?;
total_doc_freq += u64::from(doc_freq);
}
Ok(total_doc_freq)
}
/// Return the list of segment readers
@@ -148,22 +149,22 @@ impl Searcher {
}
/// Return the field searcher associated to a `Field`.
pub fn field(&self, field: Field) -> FieldSearcher {
let inv_index_readers = self
pub fn field(&self, field: Field) -> crate::Result<FieldSearcher> {
let inv_index_readers: Vec<Arc<InvertedIndexReader>> = self
.segment_readers
.iter()
.map(|segment_reader| segment_reader.inverted_index(field))
.collect::<Vec<_>>();
FieldSearcher::new(inv_index_readers)
.collect::<crate::Result<Vec<_>>>()?;
Ok(FieldSearcher::new(inv_index_readers))
}
/// Summarize total space usage of this searcher.
pub fn space_usage(&self) -> SearcherSpaceUsage {
pub fn space_usage(&self) -> io::Result<SearcherSpaceUsage> {
let mut space_usage = SearcherSpaceUsage::new();
for segment_reader in self.segment_readers.iter() {
space_usage.add_segment(segment_reader.space_usage());
for segment_reader in &self.segment_readers {
space_usage.add_segment(segment_reader.space_usage()?);
}
space_usage
Ok(space_usage)
}
}

View File

@@ -4,7 +4,7 @@ use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::directory::error::{OpenReadError, OpenWriteError};
use crate::directory::Directory;
use crate::directory::{ReadOnlySource, WritePtr};
use crate::directory::{FileSlice, WritePtr};
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::schema::Schema;
use crate::Opstamp;
@@ -78,10 +78,9 @@ impl Segment {
}
/// Open one of the component file for a *regular* read.
pub fn open_read(&self, component: SegmentComponent) -> Result<ReadOnlySource, OpenReadError> {
pub fn open_read(&self, component: SegmentComponent) -> Result<FileSlice, OpenReadError> {
let path = self.relative_path(component);
let source = self.index.directory().open_read(&path)?;
Ok(source)
self.index.directory().open_read(&path)
}
/// Open one of the component file for *regular* write.

View File

@@ -1,10 +1,9 @@
use crate::common::CompositeFile;
use crate::common::HasLen;
use crate::core::InvertedIndexReader;
use crate::core::Segment;
use crate::core::SegmentComponent;
use crate::core::SegmentId;
use crate::directory::ReadOnlySource;
use crate::directory::FileSlice;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders;
@@ -16,11 +15,12 @@ use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader;
use crate::termdict::TermDictionary;
use crate::DocId;
use crate::{common::CompositeFile, error::DataCorruption};
use fail::fail_point;
use std::collections::HashMap;
use std::fmt;
use std::sync::Arc;
use std::sync::RwLock;
use std::{collections::HashMap, io};
/// Entry point to access all of the datastructures of the `Segment`
///
@@ -50,7 +50,7 @@ pub struct SegmentReader {
fast_fields_readers: Arc<FastFieldReaders>,
fieldnorm_readers: FieldNormReaders,
store_source: ReadOnlySource,
store_file: FileSlice,
delete_bitset_opt: Option<DeleteBitSet>,
schema: Schema,
}
@@ -106,19 +106,26 @@ impl SegmentReader {
}
/// Accessor to the `FacetReader` associated to a given `Field`.
pub fn facet_reader(&self, field: Field) -> Option<FacetReader> {
pub fn facet_reader(&self, field: Field) -> crate::Result<FacetReader> {
let field_entry = self.schema.get_field_entry(field);
if field_entry.field_type() != &FieldType::HierarchicalFacet {
return None;
return Err(crate::TantivyError::InvalidArgument(format!(
"Field {:?} is not a facet field.",
field_entry.name()
)));
}
let term_ords_reader = self.fast_fields().u64s(field)?;
let term_ords_reader = self.fast_fields().u64s(field).ok_or_else(|| {
DataCorruption::comment_only(format!(
"Cannot find data for hierarchical facet {:?}",
field_entry.name()
))
})?;
let termdict = self
.termdict_composite
.open_read(field)
.map(|source| TermDictionary::from_source(&source))
.unwrap_or_else(TermDictionary::empty);
let facet_reader = FacetReader::new(term_ords_reader, termdict);
Some(facet_reader)
.map(TermDictionary::open)
.unwrap_or_else(|| Ok(TermDictionary::empty()))?;
Ok(FacetReader::new(term_ords_reader, termdict))
}
/// Accessor to the segment's `Field norms`'s reader.
@@ -129,7 +136,7 @@ impl SegmentReader {
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> crate::Result<FieldNormReader> {
self.fieldnorm_readers.get_field(field).ok_or_else(|| {
self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
let field_name = self.schema.get_field_name(field);
let err_msg = format!(
"Field norm not found for field {:?}. Was it marked as indexed during indexing?",
@@ -140,33 +147,33 @@ impl SegmentReader {
}
/// Accessor to the segment's `StoreReader`.
pub fn get_store_reader(&self) -> StoreReader {
StoreReader::from_source(self.store_source.clone())
pub fn get_store_reader(&self) -> io::Result<StoreReader> {
StoreReader::open(self.store_file.clone())
}
/// Open a new segment for reading.
pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
let termdict_source = segment.open_read(SegmentComponent::TERMS)?;
let termdict_composite = CompositeFile::open(&termdict_source)?;
let termdict_file = segment.open_read(SegmentComponent::TERMS)?;
let termdict_composite = CompositeFile::open(&termdict_file)?;
let store_source = segment.open_read(SegmentComponent::STORE)?;
let store_file = segment.open_read(SegmentComponent::STORE)?;
fail_point!("SegmentReader::open#middle");
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
let postings_composite = CompositeFile::open(&postings_source)?;
let postings_file = segment.open_read(SegmentComponent::POSTINGS)?;
let postings_composite = CompositeFile::open(&postings_file)?;
let positions_composite = {
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) {
CompositeFile::open(&source)?
if let Ok(positions_file) = segment.open_read(SegmentComponent::POSITIONS) {
CompositeFile::open(&positions_file)?
} else {
CompositeFile::empty()
}
};
let positions_idx_composite = {
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONSSKIP) {
CompositeFile::open(&source)?
if let Ok(positions_skip_file) = segment.open_read(SegmentComponent::POSITIONSSKIP) {
CompositeFile::open(&positions_skip_file)?
} else {
CompositeFile::empty()
}
@@ -184,13 +191,14 @@ impl SegmentReader {
let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
Some(DeleteBitSet::open(delete_data))
let delete_bitset = DeleteBitSet::open(delete_data)?;
Some(delete_bitset)
} else {
None
};
Ok(SegmentReader {
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
inv_idx_reader_cache: Default::default(),
max_doc: segment.meta().max_doc(),
num_docs: segment.meta().num_docs(),
termdict_composite,
@@ -198,7 +206,7 @@ impl SegmentReader {
fast_fields_readers: fast_field_readers,
fieldnorm_readers,
segment_id: segment.id(),
store_source,
store_file,
delete_bitset_opt,
positions_composite,
positions_idx_composite,
@@ -218,14 +226,14 @@ impl SegmentReader {
/// is returned.
/// Similarly if the field is marked as indexed but no term has been indexed for the given
/// index, an empty `InvertedIndexReader` is returned (but no warning is logged).
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
pub fn inverted_index(&self, field: Field) -> crate::Result<Arc<InvertedIndexReader>> {
if let Some(inv_idx_reader) = self
.inv_idx_reader_cache
.read()
.expect("Lock poisoned. This should never happen")
.get(&field)
{
return Arc::clone(inv_idx_reader);
return Ok(Arc::clone(inv_idx_reader));
}
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
@@ -235,41 +243,42 @@ impl SegmentReader {
warn!("Field {:?} does not seem indexed.", field_entry.name());
}
let postings_source_opt = self.postings_composite.open_read(field);
let postings_file_opt = self.postings_composite.open_read(field);
if postings_source_opt.is_none() || record_option_opt.is_none() {
if postings_file_opt.is_none() || record_option_opt.is_none() {
// no documents in the segment contained this field.
// As a result, no data is associated to the inverted index.
//
// Returns an empty inverted index.
let record_option = record_option_opt.unwrap_or(IndexRecordOption::Basic);
return Arc::new(InvertedIndexReader::empty(record_option));
return Ok(Arc::new(InvertedIndexReader::empty(record_option)));
}
let record_option = record_option_opt.unwrap();
let postings_source = postings_source_opt.unwrap();
let postings_file = postings_file_opt.unwrap();
let termdict_source = self.termdict_composite.open_read(field).expect(
"Failed to open field term dictionary in composite file. Is the field indexed?",
);
let termdict_file: FileSlice = self.termdict_composite.open_read(field)
.ok_or_else(||
DataCorruption::comment_only(format!("Failed to open field {:?}'s term dictionary in the composite file. Has the schema been modified?", field_entry.name()))
)?;
let positions_source = self
let positions_file = self
.positions_composite
.open_read(field)
.expect("Index corrupted. Failed to open field positions in composite file.");
let positions_idx_source = self
let positions_idx_file = self
.positions_idx_composite
.open_read(field)
.expect("Index corrupted. Failed to open field positions in composite file.");
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
TermDictionary::from_source(&termdict_source),
postings_source,
positions_source,
positions_idx_source,
TermDictionary::open(termdict_file)?,
postings_file,
positions_file,
positions_idx_file,
record_option,
));
)?);
// by releasing the lock in between, we may end up opening the inverted index
// twice, but this is fine.
@@ -278,7 +287,7 @@ impl SegmentReader {
.expect("Field reader cache lock poisoned. This should never happen.")
.insert(field, Arc::clone(&inv_idx_reader));
inv_idx_reader
Ok(inv_idx_reader)
}
/// Returns the segment id
@@ -306,8 +315,8 @@ impl SegmentReader {
}
/// Summarize total space usage of this segment.
pub fn space_usage(&self) -> SegmentSpaceUsage {
SegmentSpaceUsage::new(
pub fn space_usage(&self) -> io::Result<SegmentSpaceUsage> {
Ok(SegmentSpaceUsage::new(
self.num_docs(),
self.termdict_composite.space_usage(),
self.postings_composite.space_usage(),
@@ -315,12 +324,12 @@ impl SegmentReader {
self.positions_idx_composite.space_usage(),
self.fast_fields_readers.space_usage(),
self.fieldnorm_readers.space_usage(),
self.get_store_reader().space_usage(),
self.get_store_reader()?.space_usage(),
self.delete_bitset_opt
.as_ref()
.map(DeleteBitSet::space_usage)
.unwrap_or(0),
)
))
}
}
@@ -337,7 +346,7 @@ mod test {
use crate::DocId;
#[test]
fn test_alive_docs_iterator() {
fn test_alive_docs_iterator() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("name", TEXT | STORED);
let schema = schema_builder.build();
@@ -345,26 +354,26 @@ mod test {
let name = schema.get_field("name").unwrap();
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"));
index_writer.add_document(doc!(name => "cap"));
// we should now have one segment with two docs
index_writer.commit().unwrap();
index_writer.commit()?;
}
{
let mut index_writer2 = index.writer(50_000_000).unwrap();
let mut index_writer2 = index.writer(50_000_000)?;
index_writer2.delete_term(Term::from_field_text(name, "horse"));
index_writer2.delete_term(Term::from_field_text(name, "cap"));
// ok, now we should have a deleted doc
index_writer2.commit().unwrap();
index_writer2.commit()?;
}
let searcher = index.reader().unwrap().searcher();
let searcher = index.reader()?.searcher();
let docs: Vec<DocId> = searcher.segment_reader(0).doc_ids_alive().collect();
assert_eq!(vec![0u32, 2u32], docs);
Ok(())
}
}

View File

@@ -3,7 +3,7 @@ use crate::directory::error::LockError;
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
use crate::directory::WatchCallback;
use crate::directory::WatchHandle;
use crate::directory::{ReadOnlySource, WritePtr};
use crate::directory::{FileSlice, WritePtr};
use std::fmt;
use std::io;
use std::io::Write;
@@ -11,7 +11,6 @@ use std::marker::Send;
use std::marker::Sync;
use std::path::Path;
use std::path::PathBuf;
use std::result;
use std::thread;
use std::time::Duration;
@@ -117,19 +116,19 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// change.
///
/// Specifically, subsequent writes or flushes should
/// have no effect on the returned `ReadOnlySource` object.
/// have no effect on the returned `FileSlice` object.
///
/// You should only use this to read files created with [Directory::open_write].
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError>;
fn open_read(&self, path: &Path) -> Result<FileSlice, OpenReadError>;
/// Removes a file
///
/// Removing a file will not affect an eventual
/// existing ReadOnlySource pointing to it.
/// existing FileSlice pointing to it.
///
/// Removing a nonexistent file, yields a
/// `DeleteError::DoesNotExist`.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError>;
fn delete(&self, path: &Path) -> Result<(), DeleteError>;
/// Returns true iff the file exists
fn exists(&self, path: &Path) -> bool;
@@ -139,7 +138,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
///
/// Right after this call, the file should be created
/// and any subsequent call to `open_read` for the
/// same path should return a `ReadOnlySource`.
/// same path should return a `FileSlice`.
///
/// Write operations may be aggressively buffered.
/// The client of this trait is responsible for calling flush

View File

@@ -57,6 +57,11 @@ pub enum OpenWriteError {
},
}
impl OpenWriteError {
pub(crate) fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
Self::IOError { io_error, filepath }
}
}
/// Type of index incompatibility between the library and the index found on disk
/// Used to catch and provide a hint to solve this incompatibility issue
pub enum Incompatibility {
@@ -137,6 +142,11 @@ pub enum OpenReadError {
IncompatibleIndex(Incompatibility),
}
impl OpenReadError {
pub(crate) fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
Self::IOError { io_error, filepath }
}
}
/// Error that may occur when trying to delete a file
#[derive(Debug, Error)]
pub enum DeleteError {

264
src/directory/file_slice.rs Normal file
View File

@@ -0,0 +1,264 @@
use crate::common::HasLen;
use crate::directory::OwnedBytes;
use stable_deref_trait::{CloneStableDeref, StableDeref};
use std::sync::Arc;
use std::{io, ops::Deref};
/// Type-erased, boxed owner of a byte slice: any `Deref<Target = [u8]>`
/// that is also `Send + Sync + 'static`.
pub type BoxedData = Box<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
/// Objects that represent file sections in tantivy.
///
/// These read objects are only in charge of delivering
/// the data in the form of a constant read-only `&[u8]`.
/// Whatever happens to the directory file, the data
/// held by this object should never be altered or destroyed.
pub trait FileSliceTrait: 'static + Send + Sync + HasLen {
/// Reads the data of this section and returns it as an `OwnedBytes`.
///
/// May fail with an `io::Error`, depending on the implementation.
fn read_bytes(&self) -> io::Result<OwnedBytes>;
/// Returns a `FileSlice` restricted to the range `from..to`,
/// where both offsets are relative to the start of this section.
fn slice(&self, from: usize, to: usize) -> FileSlice;
}
impl FileSliceTrait for &'static [u8] {
    /// Hands the static slice itself to `OwnedBytes`; this never fails.
    fn read_bytes(&self) -> io::Result<OwnedBytes> {
        let bytes: &'static [u8] = *self;
        Ok(OwnedBytes::new(bytes))
    }

    /// Returns the sub-slice `from..to` wrapped in a new `FileSlice`.
    ///
    /// Panics on an out-of-bounds range, like regular slice indexing.
    fn slice(&self, from: usize, to: usize) -> FileSlice {
        let bytes: &'static [u8] = *self;
        FileSlice::from(&bytes[from..to])
    }
}
impl HasLen for &'static [u8] {
    /// Number of bytes in the slice.
    fn len(&self) -> usize {
        // The inherent slice method is named explicitly so that the call
        // cannot resolve back to this trait method.
        <[u8]>::len(self)
    }
}
/// Logical slice of read only file in tantivy.
///
/// In other words, it is more or less equivalent to the triplet `(file, start_byteoffset, stop_offset)`.
///
/// FileSlice is a simple wrapper over an `Arc<Box<dyn FileSliceTrait>>`. It can
/// be cloned cheaply.
///
/// The underlying behavior is therefore specific to the `Directory` that created it.
/// Despite its name, a `FileSlice` may or may not directly map to an actual file
/// on the filesystem.
#[derive(Clone)]
pub struct FileSlice(Arc<Box<dyn FileSliceTrait>>);
impl FileSlice {
    /// Creates a `FileSlice` out of any object dereferencing to a byte slice.
    ///
    /// The object is wrapped in a `SlicedDeref` spanning its whole length.
    pub fn new<D>(data: D) -> Self
    where
        D: Deref<Target = [u8]> + Send + Sync + 'static,
    {
        FileSlice::from(SlicedDeref::new(data))
    }

    /// Creates an empty FileSlice
    pub fn empty() -> FileSlice {
        let data: &'static [u8] = &[];
        FileSlice::from(data)
    }

    /// Returns a `OwnedBytes` with all of the data in the `FileSlice`.
    ///
    /// The behavior is strongly dependent on the implementation of the underlying
    /// `Directory` and the `FileSliceTrait` it creates.
    /// In particular, it is up to the `Directory` implementation
    /// to handle caching if needed.
    pub fn read_bytes(&self) -> io::Result<OwnedBytes> {
        self.0.read_bytes()
    }

    /// Splits the file slice at the given offset and returns two file slices:
    /// `file_slice[..left_len]` and `file_slice[left_len..]`.
    ///
    /// This operation is cheap and must not copy any underlying data.
    ///
    /// # Panics
    ///
    /// Panics if `left_len` is larger than the length of the file slice.
    pub fn split(self, left_len: usize) -> (FileSlice, FileSlice) {
        let left = self.slice_to(left_len);
        let right = self.slice_from(left_len);
        (left, right)
    }

    /// Splits the file slice so that the right part holds its last
    /// `right_len` bytes.
    ///
    /// Returns `(file_slice[..len - right_len], file_slice[len - right_len..])`.
    ///
    /// # Panics
    ///
    /// Panics if `right_len` is larger than the length of the file slice.
    pub fn split_from_end(self, right_len: usize) -> (FileSlice, FileSlice) {
        let len = self.len();
        // Checked explicitly: an unchecked `len - right_len` would wrap around
        // in release builds and only fail later on an unrelated assertion.
        assert!(
            right_len <= len,
            "Requested to split {} bytes from the end of a file slice of len {}",
            right_len,
            len
        );
        self.split(len - right_len)
    }

    /// Creates a FileSlice that is just a view over a slice of the data.
    ///
    /// # Panics
    ///
    /// Panics if `start > stop` or if `stop` is larger than the length of
    /// the file slice.
    pub fn slice(&self, start: usize, stop: usize) -> FileSlice {
        assert!(
            start <= stop,
            "Requested negative slice [{}..{}]",
            start,
            stop
        );
        assert!(stop <= self.len());
        self.0.slice(start, stop)
    }

    /// Like `.slice(...)` but enforcing only the `from`
    /// boundary.
    ///
    /// Equivalent to `.slice(from_offset, self.len())`
    pub fn slice_from(&self, from_offset: usize) -> FileSlice {
        self.slice(from_offset, self.len())
    }

    /// Like `.slice(...)` but enforcing only the `to`
    /// boundary.
    ///
    /// Equivalent to `.slice(0, to_offset)`
    pub fn slice_to(&self, to_offset: usize) -> FileSlice {
        self.slice(0, to_offset)
    }
}
impl HasLen for FileSlice {
fn len(&self) -> usize {
self.0.len()
}
}
impl<S: FileSliceTrait> From<S> for FileSlice {
    /// Type-erases `file` behind a boxed trait object held in an `Arc`.
    fn from(file: S) -> Self {
        let boxed: Box<dyn FileSliceTrait> = Box::new(file);
        FileSlice(Arc::new(boxed))
    }
}
impl From<Arc<BoxedData>> for FileSlice {
    /// Wraps already shared data: it is first turned into a `SlicedDeref`
    /// covering all of it, then into a `FileSlice`.
    fn from(data: Arc<BoxedData>) -> Self {
        FileSlice::from(SlicedDeref::from(data))
    }
}
/// `SlicedDeref` wraps an `Arc<BoxedData>` to implement `FileSliceTrait`.
/// It keeps track of (start, stop) boundaries.
#[derive(Clone)]
pub struct SlicedDeref {
// Shared handle over the full underlying data.
data: Arc<BoxedData>,
// Bounds of the visible window; the exposed bytes are `data[start..stop]`.
start: usize,
stop: usize,
}
impl SlicedDeref {
/// Wraps a new `Deref<Target = [u8]>`
pub fn new<D>(data: D) -> Self
where
D: Deref<Target = [u8]> + 'static + Send + Sync,
{
let len = data.len();
SlicedDeref {
data: Arc::new(Box::new(data)),
start: 0,
stop: len,
}
}
}
impl From<Arc<BoxedData>> for SlicedDeref {
    /// Reuses the shared data as is, with a window covering all of it.
    fn from(data: Arc<BoxedData>) -> Self {
        let stop = data.len();
        SlicedDeref {
            data,
            start: 0,
            stop,
        }
    }
}
// NOTE(review): these impls promise that the slice returned by `deref` keeps
// the same address when a `SlicedDeref` is moved, and that clones deref to
// the same slice. The bytes are reached through `Arc<BoxedData>`, but the
// boxed object is only required to be `Deref<Target = [u8]>`, so this
// presumably assumes its `deref` always returns the same slice -- TODO confirm.
unsafe impl StableDeref for SlicedDeref {}
unsafe impl CloneStableDeref for SlicedDeref {}
impl FileSliceTrait for SlicedDeref {
    /// Returns the bytes of the current window.
    ///
    /// No data is copied: the returned `OwnedBytes` holds a clone of this
    /// `SlicedDeref`, which shares the same `Arc`.
    fn read_bytes(&self) -> io::Result<OwnedBytes> {
        Ok(OwnedBytes::new(self.clone()))
    }

    /// Returns the sub-window `from..to`, both offsets being relative to the
    /// start of the current window.
    ///
    /// # Panics
    ///
    /// Panics if `from > to` or if `to` exceeds the length of the window.
    fn slice(&self, from: usize, to: usize) -> FileSlice {
        // Without this check, a reversed range would build a window with
        // `start > stop`, and its `len()` would then underflow.
        assert!(
            from <= to,
            "Requested negative slice [{}..{}]",
            from,
            to
        );
        assert!(to <= self.len());
        FileSlice::from(SlicedDeref {
            data: self.data.clone(),
            start: self.start + from,
            stop: self.start + to,
        })
    }
}
impl HasLen for SlicedDeref {
fn len(&self) -> usize {
self.stop - self.start
}
}
impl Deref for SlicedDeref {
    type Target = [u8];

    /// Exposes the bytes of the window `start..stop` of the underlying data.
    fn deref(&self) -> &Self::Target {
        // Arc -> Box -> boxed object -> byte slice.
        let all_bytes: &[u8] = &***self.data;
        &all_bytes[self.start..self.stop]
    }
}
// Unit tests for `FileSlice`, `FileSliceTrait` and `SlicedDeref`.
#[cfg(test)]
mod tests {
use super::{FileSlice, FileSliceTrait, SlicedDeref};
use crate::common::HasLen;
use std::io;
// Checks slicing and both split flavours on a 6-byte slice,
// including the boundary cases where one side is empty.
#[test]
fn test_file_slice() -> io::Result<()> {
let file_slice = FileSlice::new(b"abcdef".as_ref());
assert_eq!(file_slice.len(), 6);
assert_eq!(file_slice.slice_from(2).read_bytes()?.as_slice(), b"cdef");
assert_eq!(file_slice.slice_to(2).read_bytes()?.as_slice(), b"ab");
// Offsets of a nested slice are relative to the slice it is taken from.
assert_eq!(
file_slice
.slice_from(1)
.slice_to(2)
.read_bytes()?
.as_slice(),
b"bc"
);
{
let (left, right) = file_slice.clone().split(0);
assert_eq!(left.read_bytes()?.as_slice(), b"");
assert_eq!(right.read_bytes()?.as_slice(), b"abcdef");
}
{
let (left, right) = file_slice.clone().split(2);
assert_eq!(left.read_bytes()?.as_slice(), b"ab");
assert_eq!(right.read_bytes()?.as_slice(), b"cdef");
}
{
let (left, right) = file_slice.clone().split_from_end(0);
assert_eq!(left.read_bytes()?.as_slice(), b"abcdef");
assert_eq!(right.read_bytes()?.as_slice(), b"");
}
{
let (left, right) = file_slice.clone().split_from_end(2);
assert_eq!(left.read_bytes()?.as_slice(), b"abcd");
assert_eq!(right.read_bytes()?.as_slice(), b"ef");
}
Ok(())
}
// `len()` must be usable through a `dyn FileSliceTrait` trait object.
#[test]
fn test_file_slice_trait_slice_len() {
let blop: &'static [u8] = b"abc";
let owned_bytes: Box<dyn FileSliceTrait> = Box::new(blop);
assert_eq!(owned_bytes.len(), 3);
}
// Exercises `SlicedDeref` directly, without going through `FileSlice::new`.
#[test]
fn test_slice_deref() -> io::Result<()> {
let slice_deref = SlicedDeref::new(&b"abcdef"[..]);
assert_eq!(slice_deref.len(), 6);
assert_eq!(slice_deref.read_bytes()?.as_ref(), b"abcdef");
assert_eq!(slice_deref.slice(1, 4).read_bytes()?.as_ref(), b"bcd");
Ok(())
}
}

View File

@@ -1,9 +1,8 @@
use crate::common::{BinarySerializable, CountingWriter, FixedSize, VInt};
use crate::common::{BinarySerializable, CountingWriter, FixedSize, HasLen, VInt};
use crate::directory::error::Incompatibility;
use crate::directory::read_only_source::ReadOnlySource;
use crate::directory::FileSlice;
use crate::directory::{AntiCallToken, TerminatingWrite};
use crate::Version;
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
use crc32fast::Hasher;
use std::io;
use std::io::Write;
@@ -64,26 +63,26 @@ impl Footer {
let mut counting_write = CountingWriter::wrap(&mut write);
self.serialize(&mut counting_write)?;
let written_len = counting_write.written_bytes();
write.write_u32::<LittleEndian>(written_len as u32)?;
(written_len as u32).serialize(write)?;
Ok(())
}
pub fn extract_footer(source: ReadOnlySource) -> Result<(Footer, ReadOnlySource), io::Error> {
if source.len() < 4 {
pub fn extract_footer(file: FileSlice) -> io::Result<(Footer, FileSlice)> {
if file.len() < 4 {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
format!(
"File corrupted. The file is smaller than 4 bytes (len={}).",
source.len()
file.len()
),
));
}
let (body_footer, footer_len_bytes) = source.split_from_end(u32::SIZE_IN_BYTES);
let footer_len = LittleEndian::read_u32(footer_len_bytes.as_slice()) as usize;
let body_len = body_footer.len() - footer_len;
let (body, footer_data) = body_footer.split(body_len);
let mut cursor = footer_data.as_slice();
let footer = Footer::deserialize(&mut cursor)?;
let (body_footer, footer_len_file) = file.split_from_end(u32::SIZE_IN_BYTES);
let mut footer_len_bytes = footer_len_file.read_bytes()?;
let footer_len = u32::deserialize(&mut footer_len_bytes)? as usize;
let (body, footer) = body_footer.split_from_end(footer_len);
let mut footer_bytes = footer.read_bytes()?;
let footer = Footer::deserialize(&mut footer_bytes)?;
Ok((footer, body))
}

View File

@@ -5,7 +5,7 @@ use crate::directory::DirectoryLock;
use crate::directory::GarbageCollectionResult;
use crate::directory::Lock;
use crate::directory::META_LOCK;
use crate::directory::{ReadOnlySource, WritePtr};
use crate::directory::{FileSlice, WritePtr};
use crate::directory::{WatchCallback, WatchHandle};
use crate::error::DataCorruption;
use crate::Directory;
@@ -86,12 +86,7 @@ impl ManagedDirectory {
directory: Box::new(directory),
meta_informations: Arc::default(),
}),
Err(OpenReadError::IOError { io_error, filepath }) => {
Err(crate::TantivyError::OpenReadError(OpenReadError::IOError {
io_error,
filepath,
}))
}
io_err @ Err(OpenReadError::IOError { .. }) => Err(io_err.err().unwrap().into()),
Err(OpenReadError::IncompatibleIndex(incompatibility)) => {
// For the moment, this should never happen `meta.json`
// do not have any footer and cannot detect incompatibility.
@@ -241,8 +236,14 @@ impl ManagedDirectory {
io_error,
filepath: path.to_path_buf(),
})?;
let bytes = data
.read_bytes()
.map_err(|io_error| OpenReadError::IOError {
filepath: path.to_path_buf(),
io_error,
})?;
let mut hasher = Hasher::new();
hasher.update(data.as_slice());
hasher.update(bytes.as_slice());
let crc = hasher.finalize();
Ok(footer
.versioned_footer
@@ -273,24 +274,17 @@ impl ManagedDirectory {
}
impl Directory for ManagedDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
let read_only_source = self.directory.open_read(path)?;
let (footer, reader) = Footer::extract_footer(read_only_source).map_err(|io_error| {
OpenReadError::IOError {
io_error,
filepath: path.to_path_buf(),
}
})?;
fn open_read(&self, path: &Path) -> result::Result<FileSlice, OpenReadError> {
let file_slice = self.directory.open_read(path)?;
let (footer, reader) = Footer::extract_footer(file_slice)
.map_err(|io_error| OpenReadError::wrap_io_error(io_error, path.to_path_buf()))?;
footer.is_compatible()?;
Ok(reader)
}
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
self.register_file_as_managed(path)
.map_err(|io_error| OpenWriteError::IOError {
io_error,
filepath: path.to_path_buf(),
})?;
.map_err(|io_error| OpenWriteError::wrap_io_error(io_error, path.to_path_buf()))?;
Ok(io::BufWriter::new(Box::new(FooterProxy::new(
self.directory
.open_write(path)?
@@ -414,39 +408,37 @@ mod tests_mmap_specific {
}
#[test]
fn test_checksum() {
fn test_checksum() -> crate::Result<()> {
let test_path1: &'static Path = Path::new("some_path_for_test");
let test_path2: &'static Path = Path::new("other_test_path");
let tempdir = TempDir::new().unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut write = managed_directory.open_write(test_path1).unwrap();
write.write_all(&[0u8, 1u8]).unwrap();
write.terminate().unwrap();
let mmap_directory = MmapDirectory::open(&tempdir_path)?;
let mut managed_directory = ManagedDirectory::wrap(mmap_directory)?;
let mut write = managed_directory.open_write(test_path1)?;
write.write_all(&[0u8, 1u8])?;
write.terminate()?;
let mut write = managed_directory.open_write(test_path2).unwrap();
write.write_all(&[3u8, 4u8, 5u8]).unwrap();
write.terminate().unwrap();
let mut write = managed_directory.open_write(test_path2)?;
write.write_all(&[3u8, 4u8, 5u8])?;
write.terminate()?;
let read_source = managed_directory.open_read(test_path2).unwrap();
assert_eq!(read_source.as_slice(), &[3u8, 4u8, 5u8]);
let read_file = managed_directory.open_read(test_path2)?.read_bytes()?;
assert_eq!(read_file.as_slice(), &[3u8, 4u8, 5u8]);
assert!(managed_directory.list_damaged().unwrap().is_empty());
let mut corrupted_path = tempdir_path.clone();
corrupted_path.push(test_path2);
let mut file = OpenOptions::new()
.write(true)
.open(&corrupted_path)
.unwrap();
file.write_all(&[255u8]).unwrap();
file.flush().unwrap();
let mut file = OpenOptions::new().write(true).open(&corrupted_path)?;
file.write_all(&[255u8])?;
file.flush()?;
drop(file);
let damaged = managed_directory.list_damaged().unwrap();
let damaged = managed_directory.list_damaged()?;
assert_eq!(damaged.len(), 1);
assert!(damaged.contains(test_path2));
Ok(())
}
}

View File

@@ -1,12 +1,12 @@
use crate::core::META_FILEPATH;
use crate::directory::error::LockError;
use crate::directory::error::{DeleteError, OpenDirectoryError, OpenReadError, OpenWriteError};
use crate::directory::read_only_source::BoxedData;
use crate::directory::AntiCallToken;
use crate::directory::BoxedData;
use crate::directory::Directory;
use crate::directory::DirectoryLock;
use crate::directory::FileSlice;
use crate::directory::Lock;
use crate::directory::ReadOnlySource;
use crate::directory::WatchCallback;
use crate::directory::WatchCallbackList;
use crate::directory::WatchHandle;
@@ -42,21 +42,17 @@ pub(crate) fn make_io_err(msg: String) -> io::Error {
/// Returns None iff the file exists, can be read, but is empty (and hence
/// cannot be mmapped)
fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
let file = File::open(full_path).map_err(|e| {
if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.to_owned())
let file = File::open(full_path).map_err(|io_err| {
if io_err.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.to_path_buf())
} else {
OpenReadError::IOError {
io_error: e,
filepath: full_path.to_owned(),
}
OpenReadError::wrap_io_error(io_err, full_path.to_path_buf())
}
})?;
let meta_data = file.metadata().map_err(|e| OpenReadError::IOError {
io_error: e,
filepath: full_path.to_owned(),
})?;
let meta_data = file
.metadata()
.map_err(|io_err| OpenReadError::wrap_io_error(io_err, full_path.to_owned()))?;
if meta_data.len() == 0 {
// if the file size is 0, it will not be possible
// to mmap the file, so we return None
@@ -66,10 +62,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
unsafe {
memmap::Mmap::map(&file)
.map(Some)
.map_err(|e| OpenReadError::IOError {
io_error: e,
filepath: full_path.to_owned(),
})
.map_err(|io_err| OpenReadError::wrap_io_error(io_err, full_path.to_path_buf()))
}
}
@@ -408,7 +401,7 @@ impl TerminatingWrite for SafeFileWriter {
}
impl Directory for MmapDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
fn open_read(&self, path: &Path) -> result::Result<FileSlice, OpenReadError> {
debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path);
@@ -418,15 +411,13 @@ impl Directory for MmapDirectory {
on mmap cache while reading {:?}",
path
);
OpenReadError::IOError {
io_error: make_io_err(msg),
filepath: path.to_owned(),
}
let io_err = make_io_err(msg);
OpenReadError::wrap_io_error(io_err, path.to_path_buf())
})?;
Ok(mmap_cache
.get_mmap(&full_path)?
.map(ReadOnlySource::from)
.unwrap_or_else(ReadOnlySource::empty))
.map(FileSlice::from)
.unwrap_or_else(FileSlice::empty))
}
/// Any entry associated to the path in the mmap will be
@@ -465,29 +456,22 @@ impl Directory for MmapDirectory {
.create_new(true)
.open(full_path);
let mut file = open_res.map_err(|err| {
if err.kind() == io::ErrorKind::AlreadyExists {
OpenWriteError::FileAlreadyExists(path.to_owned())
let mut file = open_res.map_err(|io_err| {
if io_err.kind() == io::ErrorKind::AlreadyExists {
OpenWriteError::FileAlreadyExists(path.to_path_buf())
} else {
OpenWriteError::IOError {
io_error: err,
filepath: path.to_owned(),
}
OpenWriteError::wrap_io_error(io_err, path.to_path_buf())
}
})?;
// making sure the file is created.
file.flush().map_err(|io_error| OpenWriteError::IOError {
io_error,
filepath: path.to_owned(),
})?;
file.flush()
.map_err(|io_error| OpenWriteError::wrap_io_error(io_error, path.to_path_buf()))?;
// Apparently, on some filesystems, syncing the parent
// directory is required.
self.sync_directory().map_err(|e| OpenWriteError::IOError {
io_error: e,
filepath: path.to_owned(),
})?;
self.sync_directory()
.map_err(|io_err| OpenWriteError::wrap_io_error(io_err, path.to_path_buf()))?;
let writer = SafeFileWriter::new(file);
Ok(BufWriter::new(Box::new(writer)))
@@ -498,21 +482,16 @@ impl Directory for MmapDirectory {
let mut buffer = Vec::new();
match File::open(&full_path) {
Ok(mut file) => {
file.read_to_end(&mut buffer)
.map_err(|io_error| OpenReadError::IOError {
io_error,
filepath: path.to_owned(),
})?;
file.read_to_end(&mut buffer).map_err(|io_error| {
OpenReadError::wrap_io_error(io_error, path.to_path_buf())
})?;
Ok(buffer)
}
Err(io_error) => {
if io_error.kind() == io::ErrorKind::NotFound {
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
} else {
Err(OpenReadError::IOError {
io_error,
filepath: path.to_owned(),
})
Err(OpenReadError::wrap_io_error(io_error, path.to_path_buf()))
}
}
}
@@ -560,10 +539,10 @@ mod tests {
// The following tests are specific to the MmapDirectory
use super::*;
use crate::indexer::LogMergePolicy;
use crate::schema::{Schema, SchemaBuilder, TEXT};
use crate::Index;
use crate::ReloadPolicy;
use crate::{common::HasLen, indexer::LogMergePolicy};
use std::fs;
use std::sync::atomic::{AtomicUsize, Ordering};

View File

@@ -9,10 +9,11 @@ mod mmap_directory;
mod directory;
mod directory_lock;
mod file_slice;
mod footer;
mod managed_directory;
mod owned_bytes;
mod ram_directory;
mod read_only_source;
mod watch_event_router;
/// Errors specific to the directory module.
@@ -21,8 +22,10 @@ pub mod error;
pub use self::directory::DirectoryLock;
pub use self::directory::{Directory, DirectoryClone};
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
pub(crate) use self::file_slice::BoxedData;
pub use self::file_slice::FileSlice;
pub use self::owned_bytes::OwnedBytes;
pub use self::ram_directory::RAMDirectory;
pub use self::read_only_source::ReadOnlySource;
pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
use std::io::{self, BufWriter, Write};
use std::path::PathBuf;

View File

@@ -0,0 +1,239 @@
use stable_deref_trait::StableDeref;
use std::mem;
use std::ops::Deref;
use std::sync::Arc;
use std::{fmt, io};
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
/// this data as a static slice.
///
/// The backing object is required to be `StableDeref`.
#[derive(Clone)]
pub struct OwnedBytes {
// Bytes currently exposed. The `'static` lifetime is only nominal: the
// reference is produced with a `transmute` in `OwnedBytes::new` and points
// into the object held by `box_stable_deref`.
data: &'static [u8],
// Shared handle over the object owning the bytes that `data` points to.
box_stable_deref: Arc<dyn Deref<Target = [u8]> + Sync + Send>,
}
impl OwnedBytes {
    /// Creates an `OwnedBytes` holding no data.
    pub fn empty() -> OwnedBytes {
        let no_bytes: &'static [u8] = &[];
        OwnedBytes::new(no_bytes)
    }

    /// Creates an `OwnedBytes` instance given a `StableDeref` object.
    ///
    /// The object is moved into an `Arc` stored next to the exposed slice.
    pub fn new<T: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync>(
        data_holder: T,
    ) -> OwnedBytes {
        let box_stable_deref = Arc::new(data_holder);
        let bytes: &[u8] = &**box_stable_deref;
        // SAFETY: this relies on the `StableDeref` contract of `T` (the slice
        // keeps its address when the holder is moved) and on the `Arc` stored
        // alongside `data`, which keeps the holder alive.
        let data: &'static [u8] = unsafe { mem::transmute::<&[u8], &'static [u8]>(bytes) };
        OwnedBytes {
            data,
            box_stable_deref,
        }
    }

    /// Returns the underlying slice of data.
    /// `Deref` and `AsRef` are also available.
    #[inline(always)]
    pub fn as_slice(&self) -> &[u8] {
        self.data
    }

    /// Returns the number of bytes currently exposed.
    #[inline(always)]
    pub fn len(&self) -> usize {
        self.as_slice().len()
    }

    /// Splits the OwnedBytes into two OwnedBytes `(left, right)`,
    /// where `left` holds the first `split_len` bytes.
    ///
    /// No memory is copied. Both halves keep a handle over the whole
    /// backing object, so its memory is only released once both `left`
    /// and `right` have been dropped.
    ///
    /// Panics if `split_len` is larger than the length of the data.
    pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
        let OwnedBytes {
            data,
            box_stable_deref,
        } = self;
        let left = OwnedBytes {
            data: &data[..split_len],
            box_stable_deref: Arc::clone(&box_stable_deref),
        };
        let right = OwnedBytes {
            data: &data[split_len..],
            box_stable_deref,
        };
        (left, right)
    }

    /// Returns true iff this `OwnedBytes` is empty.
    #[inline(always)]
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }

    /// Drops the left most `advance_len` bytes.
    ///
    /// Panics if `advance_len` is larger than the number of remaining bytes.
    #[inline(always)]
    pub fn advance(&mut self, advance_len: usize) {
        let remaining: &'static [u8] = &self.data[advance_len..];
        self.data = remaining;
    }
}
impl fmt::Debug for OwnedBytes {
    /// Formats as `OwnedBytes([..], len=N)`, printing at most the first
    /// 10 bytes so that the debug string stays short.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        const MAX_DEBUG_BYTES: usize = 10;
        // The cut-off is clamped to the actual length. The previous guard
        // (`len > 8`) did not match the `[..10]` slice it protected, so a
        // 9-byte value panicked while being formatted.
        let shown_len = self.len().min(MAX_DEBUG_BYTES);
        let bytes_truncated: &[u8] = &self.as_slice()[..shown_len];
        write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
    }
}
impl Deref for OwnedBytes {
    type Target = [u8];

    /// Dereferences to the bytes currently exposed, like `as_slice`.
    fn deref(&self) -> &Self::Target {
        self.data
    }
}
impl io::Read for OwnedBytes {
    /// Copies as many bytes as fit in `buf` and advances past them.
    ///
    /// Returns the number of bytes copied, which is `0` once no data is left.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let read_len = self.data.len().min(buf.len());
        buf[..read_len].copy_from_slice(&self.data[..read_len]);
        self.advance(read_len);
        Ok(read_len)
    }

    /// Appends all remaining bytes to `buf` and leaves `self` empty.
    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
        let remaining: &'static [u8] = self.data;
        buf.extend_from_slice(remaining);
        self.advance(remaining.len());
        Ok(remaining.len())
    }

    /// Fills `buf` entirely, or fails with `UnexpectedEof` when fewer than
    /// `buf.len()` bytes are left. On failure, the bytes that were available
    /// have still been consumed.
    fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
        match self.read(buf)? {
            read_len if read_len == buf.len() => Ok(()),
            _ => Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "failed to fill whole buffer",
            )),
        }
    }
}
impl AsRef<[u8]> for OwnedBytes {
    /// Borrows the bytes currently exposed, like `as_slice`.
    fn as_ref(&self) -> &[u8] {
        self.data
    }
}
// Unit tests for `OwnedBytes`.
#[cfg(test)]
mod tests {
use std::io::{self, Read};
use super::OwnedBytes;
// The debug output prints short values entirely and truncates long ones.
#[test]
fn test_owned_bytes_debug() {
let short_bytes = OwnedBytes::new(b"abcd".as_ref());
assert_eq!(
format!("{:?}", short_bytes),
"OwnedBytes([97, 98, 99, 100], len=4)"
);
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
assert_eq!(
format!("{:?}", long_bytes),
"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
);
}
// Successive `read_exact` calls consume the data from the left.
#[test]
fn test_owned_bytes_read() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcdefghiklmnopqrstuvwxyz".as_ref());
{
let mut buf = [0u8; 5];
bytes.read_exact(&mut buf[..]).unwrap();
assert_eq!(&buf, b"abcde");
assert_eq!(bytes.as_slice(), b"fghiklmnopqrstuvwxyz")
}
{
let mut buf = [0u8; 2];
bytes.read_exact(&mut buf[..]).unwrap();
assert_eq!(&buf, b"fg");
assert_eq!(bytes.as_slice(), b"hiklmnopqrstuvwxyz")
}
Ok(())
}
// Reading exactly the remaining bytes leaves an empty `OwnedBytes`;
// the next `read` returns 0 and does not touch the buffer.
#[test]
fn test_owned_bytes_read_right_at_the_end() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = [0u8; 5];
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
assert_eq!(&buf, b"abcde");
assert_eq!(bytes.as_slice(), b"");
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
assert_eq!(&buf, b"abcde");
Ok(())
}
// A buffer larger than the remaining data gets a short read.
#[test]
fn test_owned_bytes_read_incomplete() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = [0u8; 7];
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
assert_eq!(&buf[..5], b"abcde");
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
Ok(())
}
// `read_to_end` copies all of the data into the vector.
#[test]
fn test_owned_bytes_read_to_end() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = Vec::new();
bytes.read_to_end(&mut buf)?;
assert_eq!(buf.as_slice(), b"abcde".as_ref());
Ok(())
}
// Splitting in the middle yields the two expected halves.
#[test]
fn test_owned_bytes_split() {
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
let (left, right) = bytes.split(3);
assert_eq!(left.as_slice(), b"abc");
assert_eq!(right.as_slice(), b"defghi");
}
// Splitting at offset 0 and at the full length leaves one side empty.
#[test]
fn test_owned_bytes_split_boundary() {
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
{
let (left, right) = bytes.clone().split(0);
assert_eq!(left.as_slice(), b"");
assert_eq!(right.as_slice(), b"abcdefghi");
}
{
let (left, right) = bytes.split(9);
assert_eq!(left.as_slice(), b"abcdefghi");
assert_eq!(right.as_slice(), b"");
}
}
}

View File

@@ -1,9 +1,9 @@
use crate::core::META_FILEPATH;
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
use crate::directory::AntiCallToken;
use crate::directory::WatchCallbackList;
use crate::directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
use crate::directory::{Directory, FileSlice, WatchCallback, WatchHandle};
use crate::directory::{TerminatingWrite, WritePtr};
use crate::{common::HasLen, core::META_FILEPATH};
use fail::fail_point;
use std::collections::HashMap;
use std::fmt;
@@ -80,17 +80,17 @@ impl TerminatingWrite for VecWriter {
#[derive(Default)]
struct InnerDirectory {
fs: HashMap<PathBuf, ReadOnlySource>,
fs: HashMap<PathBuf, FileSlice>,
watch_router: WatchCallbackList,
}
impl InnerDirectory {
fn write(&mut self, path: PathBuf, data: &[u8]) -> bool {
let data = ReadOnlySource::new(Vec::from(data));
let data = FileSlice::new(Vec::from(data));
self.fs.insert(path, data).is_some()
}
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
fn open_read(&self, path: &Path) -> Result<FileSlice, OpenReadError> {
self.fs
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
@@ -153,9 +153,9 @@ impl RAMDirectory {
/// If an error is encountered, files may be persisted partially.
pub fn persist(&self, dest: &mut dyn Directory) -> crate::Result<()> {
let wlock = self.fs.write().unwrap();
for (path, source) in wlock.fs.iter() {
for (path, file) in wlock.fs.iter() {
let mut dest_wrt = dest.open_write(path)?;
dest_wrt.write_all(source.as_slice())?;
dest_wrt.write_all(file.read_bytes()?.as_slice())?;
dest_wrt.terminate()?;
}
Ok(())
@@ -163,7 +163,7 @@ impl RAMDirectory {
}
impl Directory for RAMDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
fn open_read(&self, path: &Path) -> result::Result<FileSlice, OpenReadError> {
self.fs.read().unwrap().open_read(path)
}
@@ -195,7 +195,14 @@ impl Directory for RAMDirectory {
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
Ok(self.open_read(path)?.as_slice().to_owned())
let bytes =
self.open_read(path)?
.read_bytes()
.map_err(|io_error| OpenReadError::IOError {
io_error,
filepath: path.to_path_buf(),
})?;
Ok(bytes.as_slice().to_owned())
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {

View File

@@ -1,137 +0,0 @@
use crate::common::HasLen;
use stable_deref_trait::{CloneStableDeref, StableDeref};
use std::ops::Deref;
use std::sync::Arc;
pub type BoxedData = Box<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
/// Read object that represents files in tantivy.
///
/// These read objects are only in charge to deliver
/// the data in the form of a constant read-only `&[u8]`.
/// Whatever happens to the directory file, the data
/// hold by this object should never be altered or destroyed.
pub struct ReadOnlySource {
data: Arc<BoxedData>,
start: usize,
stop: usize,
}
unsafe impl StableDeref for ReadOnlySource {}
unsafe impl CloneStableDeref for ReadOnlySource {}
impl Deref for ReadOnlySource {
type Target = [u8];
fn deref(&self) -> &[u8] {
self.as_slice()
}
}
impl From<Arc<BoxedData>> for ReadOnlySource {
fn from(data: Arc<BoxedData>) -> Self {
let len = data.len();
ReadOnlySource {
data,
start: 0,
stop: len,
}
}
}
impl ReadOnlySource {
pub(crate) fn new<D>(data: D) -> ReadOnlySource
where
D: Deref<Target = [u8]> + Send + Sync + 'static,
{
let len = data.len();
ReadOnlySource {
data: Arc::new(Box::new(data)),
start: 0,
stop: len,
}
}
/// Creates an empty ReadOnlySource
pub fn empty() -> ReadOnlySource {
ReadOnlySource::new(&[][..])
}
/// Returns the data underlying the ReadOnlySource object.
pub fn as_slice(&self) -> &[u8] {
&self.data[self.start..self.stop]
}
/// Splits into 2 `ReadOnlySource`, at the offset given
/// as an argument.
pub fn split(self, addr: usize) -> (ReadOnlySource, ReadOnlySource) {
let left = self.slice(0, addr);
let right = self.slice_from(addr);
(left, right)
}
/// Splits into 2 `ReadOnlySource`, at the offset `end - right_len`.
pub fn split_from_end(self, right_len: usize) -> (ReadOnlySource, ReadOnlySource) {
let left_len = self.len() - right_len;
self.split(left_len)
}
/// Creates a ReadOnlySource that is just a
/// view over a slice of the data.
///
/// Keep in mind that any living slice extends
/// the lifetime of the original ReadOnlySource,
///
/// For instance, if `ReadOnlySource` wraps 500MB
/// worth of data in anonymous memory, and only a
/// 1KB slice is remaining, the whole `500MBs`
/// are retained in memory.
pub fn slice(&self, start: usize, stop: usize) -> ReadOnlySource {
assert!(
start <= stop,
"Requested negative slice [{}..{}]",
start,
stop
);
assert!(stop <= self.len());
ReadOnlySource {
data: self.data.clone(),
start: self.start + start,
stop: self.start + stop,
}
}
/// Like `.slice(...)` but enforcing only the `from`
/// boundary.
///
/// Equivalent to `.slice(from_offset, self.len())`
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
self.slice(from_offset, self.len())
}
/// Like `.slice(...)` but enforcing only the `to`
/// boundary.
///
/// Equivalent to `.slice(0, to_offset)`
pub fn slice_to(&self, to_offset: usize) -> ReadOnlySource {
self.slice(0, to_offset)
}
}
impl HasLen for ReadOnlySource {
fn len(&self) -> usize {
self.stop - self.start
}
}
impl Clone for ReadOnlySource {
fn clone(&self) -> Self {
self.slice_from(0)
}
}
impl From<Vec<u8>> for ReadOnlySource {
fn from(data: Vec<u8>) -> ReadOnlySource {
ReadOnlySource::new(data)
}
}

View File

@@ -20,9 +20,9 @@ mod mmap_directory_tests {
}
#[test]
fn test_simple() {
fn test_simple() -> crate::Result<()> {
let mut directory = make_directory();
super::test_simple(&mut directory);
super::test_simple(&mut directory)
}
#[test]
@@ -32,15 +32,17 @@ mod mmap_directory_tests {
}
#[test]
fn test_rewrite_forbidden() {
fn test_rewrite_forbidden() -> crate::Result<()> {
let mut directory = make_directory();
super::test_rewrite_forbidden(&mut directory);
super::test_rewrite_forbidden(&mut directory)?;
Ok(())
}
#[test]
fn test_directory_delete() {
fn test_directory_delete() -> crate::Result<()> {
let mut directory = make_directory();
super::test_directory_delete(&mut directory);
super::test_directory_delete(&mut directory)?;
Ok(())
}
#[test]
@@ -72,9 +74,9 @@ mod ram_directory_tests {
}
#[test]
fn test_simple() {
fn test_simple() -> crate::Result<()> {
let mut directory = make_directory();
super::test_simple(&mut directory);
super::test_simple(&mut directory)
}
#[test]
@@ -84,15 +86,17 @@ mod ram_directory_tests {
}
#[test]
fn test_rewrite_forbidden() {
fn test_rewrite_forbidden() -> crate::Result<()> {
let mut directory = make_directory();
super::test_rewrite_forbidden(&mut directory);
super::test_rewrite_forbidden(&mut directory)?;
Ok(())
}
#[test]
fn test_directory_delete() {
fn test_directory_delete() -> crate::Result<()> {
let mut directory = make_directory();
super::test_directory_delete(&mut directory);
super::test_directory_delete(&mut directory)?;
Ok(())
}
#[test]
@@ -123,35 +127,28 @@ fn ram_directory_panics_if_flush_forgotten() {
assert!(write_file.write_all(&[4]).is_ok());
}
fn test_simple(directory: &mut dyn Directory) {
fn test_simple(directory: &mut dyn Directory) -> crate::Result<()> {
let test_path: &'static Path = Path::new("some_path_for_test");
{
let mut write_file = directory.open_write(test_path).unwrap();
assert!(directory.exists(test_path));
write_file.write_all(&[4]).unwrap();
write_file.write_all(&[3]).unwrap();
write_file.write_all(&[7, 3, 5]).unwrap();
write_file.flush().unwrap();
}
{
let read_file = directory.open_read(test_path).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
}
let mut write_file = directory.open_write(test_path)?;
assert!(directory.exists(test_path));
write_file.write_all(&[4])?;
write_file.write_all(&[3])?;
write_file.write_all(&[7, 3, 5])?;
write_file.flush()?;
let read_file = directory.open_read(test_path)?.read_bytes()?;
assert_eq!(read_file.as_slice(), &[4u8, 3u8, 7u8, 3u8, 5u8]);
assert!(directory.delete(test_path).is_ok());
assert!(!directory.exists(test_path));
Ok(())
}
fn test_rewrite_forbidden(directory: &mut dyn Directory) {
fn test_rewrite_forbidden(directory: &mut dyn Directory) -> crate::Result<()> {
let test_path: &'static Path = Path::new("some_path_for_test");
{
directory.open_write(test_path).unwrap();
assert!(directory.exists(test_path));
}
{
assert!(directory.open_write(test_path).is_err());
}
directory.open_write(test_path)?;
assert!(directory.exists(test_path));
assert!(directory.open_write(test_path).is_err());
assert!(directory.delete(test_path).is_ok());
Ok(())
}
fn test_write_create_the_file(directory: &mut dyn Directory) {
@@ -165,21 +162,20 @@ fn test_write_create_the_file(directory: &mut dyn Directory) {
}
}
fn test_directory_delete(directory: &mut dyn Directory) {
fn test_directory_delete(directory: &mut dyn Directory) -> crate::Result<()> {
let test_path: &'static Path = Path::new("some_path_for_test");
assert!(directory.open_read(test_path).is_err());
let mut write_file = directory.open_write(&test_path).unwrap();
write_file.write_all(&[1, 2, 3, 4]).unwrap();
write_file.flush().unwrap();
let mut write_file = directory.open_write(&test_path)?;
write_file.write_all(&[1, 2, 3, 4])?;
write_file.flush()?;
{
let read_handle = directory.open_read(&test_path).unwrap();
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
let read_handle = directory.open_read(&test_path)?.read_bytes()?;
assert_eq!(read_handle.as_slice(), &[1u8, 2u8, 3u8, 4u8]);
// Mapped files can't be deleted on Windows
if !cfg!(windows) {
assert!(directory.delete(&test_path).is_ok());
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
assert_eq!(read_handle.as_slice(), &[1u8, 2u8, 3u8, 4u8]);
}
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
}
@@ -189,6 +185,7 @@ fn test_directory_delete(directory: &mut dyn Directory) {
assert!(directory.open_read(&test_path).is_err());
assert!(directory.delete(&test_path).is_err());
Ok(())
}
fn test_watch(directory: &mut dyn Directory) {

View File

@@ -85,7 +85,7 @@ mod tests {
let field = searcher.schema().get_field("string_bytes").unwrap();
let term = Term::from_field_bytes(field, b"lucene".as_ref());
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
let term_weight = term_query.specialized_weight(&searcher, true);
let term_weight = term_query.specialized_weight(&searcher, true)?;
let term_scorer = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(term_scorer.doc(), 0u32);
Ok(())
@@ -98,7 +98,7 @@ mod tests {
let field = searcher.schema().get_field("string_bytes").unwrap();
let term = Term::from_field_bytes(field, b"lucene".as_ref());
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
let term_weight = term_query.specialized_weight(&searcher, false);
let term_weight = term_query.specialized_weight(&searcher, false)?;
let term_scorer_err = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0f32);
assert!(matches!(
term_scorer_err,

View File

@@ -1,6 +1,5 @@
use owning_ref::OwningRef;
use crate::directory::ReadOnlySource;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::fastfield::FastFieldReader;
use crate::DocId;
@@ -17,16 +16,16 @@ use crate::DocId;
#[derive(Clone)]
pub struct BytesFastFieldReader {
idx_reader: FastFieldReader<u64>,
values: OwningRef<ReadOnlySource, [u8]>,
values: OwnedBytes,
}
impl BytesFastFieldReader {
pub(crate) fn open(
idx_reader: FastFieldReader<u64>,
values_source: ReadOnlySource,
) -> BytesFastFieldReader {
let values = OwningRef::new(values_source).map(|source| &source[..]);
BytesFastFieldReader { idx_reader, values }
values_file: FileSlice,
) -> crate::Result<BytesFastFieldReader> {
let values = values_file.read_bytes()?;
Ok(BytesFastFieldReader { idx_reader, values })
}
fn range(&self, doc: DocId) -> (usize, usize) {
@@ -38,7 +37,7 @@ impl BytesFastFieldReader {
/// Returns the bytes associated to the given `doc`
pub fn get_bytes(&self, doc: DocId) -> &[u8] {
let (start, stop) = self.range(doc);
&self.values[start..stop]
&self.values.as_slice()[start..stop]
}
/// Returns the overall number of bytes in this bytes fast field.

View File

@@ -1,5 +1,6 @@
use crate::common::{BitSet, HasLen};
use crate::directory::ReadOnlySource;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::WritePtr;
use crate::space_usage::ByteCount;
use crate::DocId;
@@ -39,7 +40,7 @@ pub fn write_delete_bitset(
/// Set of deleted `DocId`s.
#[derive(Clone)]
pub struct DeleteBitSet {
data: ReadOnlySource,
data: OwnedBytes,
len: usize,
}
@@ -58,21 +59,22 @@ impl DeleteBitSet {
let mut wrt = directory.open_write(path).unwrap();
write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap();
wrt.terminate().unwrap();
let source = directory.open_read(path).unwrap();
Self::open(source)
let file = directory.open_read(path).unwrap();
Self::open(file).unwrap()
}
/// Opens a delete bitset given its data source.
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
let num_deleted: usize = data
/// Opens a delete bitset given its file.
pub fn open(file: FileSlice) -> crate::Result<DeleteBitSet> {
let bytes = file.read_bytes()?;
let num_deleted: usize = bytes
.as_slice()
.iter()
.map(|b| b.count_ones() as usize)
.sum();
DeleteBitSet {
data,
Ok(DeleteBitSet {
data: bytes,
len: num_deleted,
}
})
}
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
@@ -84,7 +86,7 @@ impl DeleteBitSet {
#[inline(always)]
pub fn is_deleted(&self, doc: DocId) -> bool {
let byte_offset = doc / 8u32;
let b: u8 = (*self.data)[byte_offset as usize];
let b: u8 = self.data.as_slice()[byte_offset as usize];
let shift = (doc & 7u32) as u8;
b & (1u8 << shift) != 0
}

View File

@@ -209,6 +209,7 @@ mod tests {
use crate::schema::FAST;
use crate::schema::{Document, IntOptions};
use crate::{Index, SegmentId, SegmentReader};
use common::HasLen;
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
@@ -239,7 +240,7 @@ mod tests {
}
#[test]
fn test_intfastfield_small() {
fn test_intfastfield_small() -> crate::Result<()> {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
{
@@ -254,27 +255,24 @@ mod tests {
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 36 as usize);
}
{
let composite_file = CompositeFile::open(&source).unwrap();
let field_source = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(field_source);
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
}
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 36 as usize);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(file)?;
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
Ok(())
}
#[test]
fn test_intfastfield_large() {
fn test_intfastfield_large() -> crate::Result<()> {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let write: WritePtr = directory.open_write(Path::new("test"))?;
let mut serializer = FastFieldSerializer::from_write(write)?;
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
fast_field_writers.add_document(&doc!(*FIELD=>4u64));
fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64));
@@ -285,19 +283,15 @@ mod tests {
fast_field_writers.add_document(&doc!(*FIELD=>1_002u64));
fast_field_writers.add_document(&doc!(*FIELD=>1_501u64));
fast_field_writers.add_document(&doc!(*FIELD=>215u64));
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
fast_field_writers.serialize(&mut serializer, &HashMap::new())?;
serializer.close()?;
}
let source = directory.open_read(&path).unwrap();
let file = directory.open_read(&path)?;
assert_eq!(file.len(), 61 as usize);
{
assert_eq!(source.len(), 61 as usize);
}
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
assert_eq!(fast_field_reader.get(0), 4u64);
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
assert_eq!(fast_field_reader.get(2), 3_052u64);
@@ -308,10 +302,11 @@ mod tests {
assert_eq!(fast_field_reader.get(7), 1_501u64);
assert_eq!(fast_field_reader.get(8), 215u64);
}
Ok(())
}
#[test]
fn test_intfastfield_null_amplitude() {
fn test_intfastfield_null_amplitude() -> crate::Result<()> {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
@@ -327,22 +322,21 @@ mod tests {
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 34 as usize);
{
assert_eq!(source.len(), 34 as usize);
}
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get(doc), 100_000u64);
}
}
Ok(())
}
#[test]
fn test_intfastfield_large_numbers() {
fn test_intfastfield_large_numbers() -> crate::Result<()> {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
@@ -360,14 +354,12 @@ mod tests {
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 80042 as usize);
{
assert_eq!(source.len(), 80042 as usize);
}
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
assert_eq!(fast_field_reader.get(0), 0u64);
for doc in 1..10_001 {
assert_eq!(
@@ -376,10 +368,11 @@ mod tests {
);
}
}
Ok(())
}
#[test]
fn test_signed_intfastfield() {
fn test_signed_intfastfield() -> crate::Result<()> {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
let mut schema_builder = Schema::builder();
@@ -400,14 +393,12 @@ mod tests {
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 17709 as usize);
{
assert_eq!(source.len(), 17709 as usize);
}
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(i64_field).unwrap();
let fast_field_reader = FastFieldReader::<i64>::open(data);
let fast_field_reader = FastFieldReader::<i64>::open(data)?;
assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64);
@@ -420,10 +411,11 @@ mod tests {
assert_eq!(buffer[i], -100i64 + 53i64 + i as i64);
}
}
Ok(())
}
#[test]
fn test_signed_intfastfield_default_val() {
fn test_signed_intfastfield_default_val() -> crate::Result<()> {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
let mut schema_builder = Schema::builder();
@@ -442,13 +434,14 @@ mod tests {
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(i64_field).unwrap();
let fast_field_reader = FastFieldReader::<i64>::open(data);
let fast_field_reader = FastFieldReader::<i64>::open(data)?;
assert_eq!(fast_field_reader.get(0u32), 0i64);
}
Ok(())
}
// Warning: this generates the same permutation at each call
@@ -459,28 +452,26 @@ mod tests {
}
#[test]
fn test_intfastfield_permutation() {
fn test_intfastfield_permutation() -> crate::Result<()> {
let path = Path::new("test");
let permutation = generate_permutation();
let n = permutation.len();
let mut directory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let write: WritePtr = directory.open_write(Path::new("test"))?;
let mut serializer = FastFieldSerializer::from_write(write)?;
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
fast_field_writers.serialize(&mut serializer, &HashMap::new())?;
serializer.close()?;
}
let source = directory.open_read(&path).unwrap();
let file = directory.open_read(&path)?;
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
let mut a = 0u64;
for _ in 0..n {
@@ -488,6 +479,7 @@ mod tests {
a = fast_field_reader.get(a as u32);
}
}
Ok(())
}
#[test]
@@ -633,9 +625,9 @@ mod bench {
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
@@ -667,9 +659,9 @@ mod bench {
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);

View File

@@ -3,13 +3,12 @@ use crate::common::bitpacker::BitUnpacker;
use crate::common::compute_num_bits;
use crate::common::BinarySerializable;
use crate::common::CompositeFile;
use crate::directory::ReadOnlySource;
use crate::directory::FileSlice;
use crate::directory::{Directory, RAMDirectory, WritePtr};
use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
use crate::schema::Schema;
use crate::schema::FAST;
use crate::DocId;
use owning_ref::OwningRef;
use std::collections::HashMap;
use std::marker::PhantomData;
use std::path::Path;
@@ -20,34 +19,27 @@ use std::path::Path;
/// fast field is required.
#[derive(Clone)]
pub struct FastFieldReader<Item: FastValue> {
bit_unpacker: BitUnpacker<OwningRef<ReadOnlySource, [u8]>>,
bit_unpacker: BitUnpacker,
min_value_u64: u64,
max_value_u64: u64,
_phantom: PhantomData<Item>,
}
impl<Item: FastValue> FastFieldReader<Item> {
/// Opens a fast field given a source.
pub fn open(data: ReadOnlySource) -> Self {
let min_value: u64;
let amplitude: u64;
{
let mut cursor = data.as_slice();
min_value =
u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
amplitude =
u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
}
/// Opens a fast field given a file.
pub fn open(file: FileSlice) -> crate::Result<Self> {
let mut bytes = file.read_bytes()?;
let min_value = u64::deserialize(&mut bytes)?;
let amplitude = u64::deserialize(&mut bytes)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let owning_ref = OwningRef::new(data).map(|data| &data[16..]);
let bit_unpacker = BitUnpacker::new(owning_ref, num_bits);
FastFieldReader {
let bit_unpacker = BitUnpacker::new(bytes, num_bits);
Ok(FastFieldReader {
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
_phantom: PhantomData,
}
})
}
pub(crate) fn into_u64_reader(self) -> FastFieldReader<u64> {
@@ -157,12 +149,11 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
serializer.close().unwrap();
}
let source = directory.open_read(path).expect("Failed to open the file");
let composite_file =
CompositeFile::open(&source).expect("Failed to read the composite file");
let field_source = composite_file
let file = directory.open_read(path).expect("Failed to open the file");
let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file");
let field_file = composite_file
.open_read(field)
.expect("File component not found");
FastFieldReader::open(field_source)
FastFieldReader::open(field_file).unwrap()
}
}

View File

@@ -72,44 +72,48 @@ impl FastFieldReaders {
if !bytes_option.is_fast() {
continue;
}
let idx_reader = fast_fields_composite
let fast_field_idx_file = fast_fields_composite
.open_read_with_idx(field, 0)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
let idx_reader = FastFieldReader::open(fast_field_idx_file)?;
let data = fast_fields_composite
.open_read_with_idx(field, 1)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
let bytes_fast_field_reader = BytesFastFieldReader::open(idx_reader, data)?;
fast_field_readers
.fast_bytes
.insert(field, BytesFastFieldReader::open(idx_reader, data));
.insert(field, bytes_fast_field_reader);
} else if let Some((fast_type, cardinality)) = type_and_cardinality(field_type) {
match cardinality {
Cardinality::SingleValue => {
if let Some(fast_field_data) = fast_fields_composite.open_read(field) {
match fast_type {
FastType::U64 => {
let fast_field_reader = FastFieldReader::open(fast_field_data);
let fast_field_reader = FastFieldReader::open(fast_field_data)?;
fast_field_readers
.fast_field_u64
.insert(field, fast_field_reader);
}
FastType::I64 => {
fast_field_readers.fast_field_i64.insert(
field,
FastFieldReader::open(fast_field_data.clone()),
);
let fast_field_reader =
FastFieldReader::open(fast_field_data.clone())?;
fast_field_readers
.fast_field_i64
.insert(field, fast_field_reader);
}
FastType::F64 => {
fast_field_readers.fast_field_f64.insert(
field,
FastFieldReader::open(fast_field_data.clone()),
);
let fast_field_reader =
FastFieldReader::open(fast_field_data.clone())?;
fast_field_readers
.fast_field_f64
.insert(field, fast_field_reader);
}
FastType::Date => {
fast_field_readers.fast_field_date.insert(
field,
FastFieldReader::open(fast_field_data.clone()),
);
let fast_field_reader =
FastFieldReader::open(fast_field_data.clone())?;
fast_field_readers
.fast_field_date
.insert(field, fast_field_reader);
}
}
} else {
@@ -120,10 +124,10 @@ impl FastFieldReaders {
let idx_opt = fast_fields_composite.open_read_with_idx(field, 0);
let data_opt = fast_fields_composite.open_read_with_idx(field, 1);
if let (Some(fast_field_idx), Some(fast_field_data)) = (idx_opt, data_opt) {
let idx_reader = FastFieldReader::open(fast_field_idx);
let idx_reader = FastFieldReader::open(fast_field_idx)?;
match fast_type {
FastType::I64 => {
let vals_reader = FastFieldReader::open(fast_field_data);
let vals_reader = FastFieldReader::open(fast_field_data)?;
let multivalued_int_fast_field =
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
fast_field_readers
@@ -131,7 +135,7 @@ impl FastFieldReaders {
.insert(field, multivalued_int_fast_field);
}
FastType::U64 => {
let vals_reader = FastFieldReader::open(fast_field_data);
let vals_reader = FastFieldReader::open(fast_field_data)?;
let multivalued_int_fast_field =
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
fast_field_readers
@@ -139,7 +143,7 @@ impl FastFieldReaders {
.insert(field, multivalued_int_fast_field);
}
FastType::F64 => {
let vals_reader = FastFieldReader::open(fast_field_data);
let vals_reader = FastFieldReader::open(fast_field_data)?;
let multivalued_int_fast_field =
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
fast_field_readers
@@ -147,7 +151,7 @@ impl FastFieldReaders {
.insert(field, multivalued_int_fast_field);
}
FastType::Date => {
let vals_reader = FastFieldReader::open(fast_field_data);
let vals_reader = FastFieldReader::open(fast_field_data)?;
let multivalued_int_fast_field =
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
fast_field_readers

View File

@@ -1,6 +1,7 @@
use super::{fieldnorm_to_id, id_to_fieldnorm};
use crate::common::CompositeFile;
use crate::directory::ReadOnlySource;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::schema::Field;
use crate::space_usage::PerFieldSpaceUsage;
use crate::DocId;
@@ -19,16 +20,21 @@ pub struct FieldNormReaders {
impl FieldNormReaders {
/// Creates a field norm reader.
pub fn open(source: ReadOnlySource) -> crate::Result<FieldNormReaders> {
let data = CompositeFile::open(&source)?;
pub fn open(file: FileSlice) -> crate::Result<FieldNormReaders> {
let data = CompositeFile::open(&file)?;
Ok(FieldNormReaders {
data: Arc::new(data),
})
}
/// Returns the FieldNormReader for a specific field.
pub fn get_field(&self, field: Field) -> Option<FieldNormReader> {
self.data.open_read(field).map(FieldNormReader::open)
pub fn get_field(&self, field: Field) -> crate::Result<Option<FieldNormReader>> {
if let Some(file) = self.data.open_read(field) {
let fieldnorm_reader = FieldNormReader::open(file)?;
Ok(Some(fieldnorm_reader))
} else {
Ok(None)
}
}
/// Return a break down of the space usage per field.
@@ -56,13 +62,14 @@ impl FieldNormReaders {
/// in a very short array.
#[derive(Clone)]
pub struct FieldNormReader {
data: ReadOnlySource,
data: OwnedBytes,
}
impl FieldNormReader {
/// Opens a field norm reader given its data source.
pub fn open(data: ReadOnlySource) -> Self {
FieldNormReader { data }
/// Opens a field norm reader given its file.
pub fn open(fieldnorm_file: FileSlice) -> crate::Result<Self> {
let data = fieldnorm_file.read_bytes()?;
Ok(FieldNormReader { data })
}
/// Returns the number of documents in this segment.
@@ -87,8 +94,7 @@ impl FieldNormReader {
/// Returns the `fieldnorm_id` associated to a document.
#[inline(always)]
pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
let fielnorms_data = self.data.as_slice();
fielnorms_data[doc_id as usize]
self.data.as_slice()[doc_id as usize]
}
/// Converts a `fieldnorm_id` into a fieldnorm.
@@ -111,7 +117,7 @@ impl FieldNormReader {
.cloned()
.map(FieldNormReader::fieldnorm_to_id)
.collect::<Vec<u8>>();
let field_norms_data = ReadOnlySource::from(field_norms_id);
let field_norms_data = OwnedBytes::new(field_norms_id);
FieldNormReader {
data: field_norms_data,
}

View File

@@ -108,9 +108,9 @@ fn compute_deleted_bitset(
// Limit doc helps identify the first document
// that may be affected by the delete operation.
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
let inverted_index = segment_reader.inverted_index(delete_op.term.field());
let inverted_index = segment_reader.inverted_index(delete_op.term.field())?;
if let Some(mut docset) =
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)?
{
let mut deleted_doc = docset.doc();
while deleted_doc != TERMINATED {
@@ -979,7 +979,7 @@ mod tests {
let num_docs_containing = |s: &str| {
let searcher = reader.searcher();
let term = Term::from_field_text(text_field, s);
searcher.doc_freq(&term)
searcher.doc_freq(&term).unwrap()
};
{
@@ -1015,7 +1015,7 @@ mod tests {
.unwrap();
let num_docs_containing = |s: &str| {
let term_a = Term::from_field_text(text_field, s);
reader.searcher().doc_freq(&term_a)
reader.searcher().doc_freq(&term_a).unwrap()
};
{
// writing the segment
@@ -1110,6 +1110,7 @@ mod tests {
.unwrap()
.searcher()
.doc_freq(&term_a)
.unwrap()
};
assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 100);
@@ -1129,7 +1130,7 @@ mod tests {
reader.reload().unwrap();
let searcher = reader.searcher();
let term = Term::from_field_text(text_field, s);
searcher.doc_freq(&term)
searcher.doc_freq(&term).unwrap()
};
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
@@ -1180,7 +1181,15 @@ mod tests {
// working with an empty index == no documents
let term_b = Term::from_field_text(text_field, "b");
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_b), 0);
assert_eq!(
index
.reader()
.unwrap()
.searcher()
.doc_freq(&term_b)
.unwrap(),
0
);
}
#[test]
@@ -1200,7 +1209,15 @@ mod tests {
let term_a = Term::from_field_text(text_field, "a");
// expect the document with that term to be in the index
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_a), 1);
assert_eq!(
index
.reader()
.unwrap()
.searcher()
.doc_freq(&term_a)
.unwrap(),
1
);
}
#[test]
@@ -1226,7 +1243,15 @@ mod tests {
// Find original docs in the index
let term_a = Term::from_field_text(text_field, "a");
// expect the document with that term to be in the index
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_a), 1);
assert_eq!(
index
.reader()
.unwrap()
.searcher()
.doc_freq(&term_a)
.unwrap(),
1
);
}
#[test]

View File

@@ -38,7 +38,7 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::R
count[fieldnorm_id as usize] += 1;
}
} else {
total_tokens += reader.inverted_index(field).total_num_tokens();
total_tokens += reader.inverted_index(field)?.total_num_tokens();
}
}
Ok(total_tokens
@@ -510,7 +510,7 @@ impl IndexMerger {
.readers
.iter()
.map(|reader| reader.inverted_index(indexed_field))
.collect();
.collect::<crate::Result<Vec<_>>>()?;
for field_reader in &field_readers {
let terms = field_reader.terms();
@@ -583,8 +583,8 @@ impl IndexMerger {
let term_info = heap_item.streamer.value();
let segment_reader = &self.readers[heap_item.segment_ord];
let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord];
let segment_postings =
inverted_index.read_postings_from_terminfo(term_info, segment_postings_option);
let segment_postings = inverted_index
.read_postings_from_terminfo(term_info, segment_postings_option)?;
let delete_bitset_opt = segment_reader.delete_bitset();
let doc_freq = if let Some(delete_bitset) = delete_bitset_opt {
segment_postings.doc_freq_given_deletes(delete_bitset)
@@ -653,7 +653,7 @@ impl IndexMerger {
) -> crate::Result<HashMap<Field, TermOrdinalMapping>> {
let mut term_ordinal_mappings = HashMap::new();
for (field, field_entry) in self.schema.fields() {
let fieldnorm_reader = fieldnorm_readers.get_field(field);
let fieldnorm_reader = fieldnorm_readers.get_field(field)?;
if field_entry.is_indexed() {
if let Some(term_ordinal_mapping) = self.write_postings_for_field(
field,
@@ -670,7 +670,7 @@ impl IndexMerger {
fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> crate::Result<()> {
for reader in &self.readers {
let store_reader = reader.get_store_reader();
let store_reader = reader.get_store_reader()?;
if reader.num_deleted_docs() > 0 {
for doc_id in reader.doc_ids_alive() {
let doc = store_reader.get(doc_id)?;
@@ -1533,7 +1533,7 @@ mod tests {
let reader = index.reader()?;
let searcher = reader.searcher();
let mut term_scorer = term_query
.specialized_weight(&searcher, true)
.specialized_weight(&searcher, true)?
.specialized_scorer(searcher.segment_reader(0u32), 1.0)?;
assert_eq!(term_scorer.doc(), 0);
assert_nearly_equals!(term_scorer.block_max_score(), 0.0079681855);
@@ -1548,7 +1548,7 @@ mod tests {
assert_eq!(searcher.segment_readers().len(), 2);
for segment_reader in searcher.segment_readers() {
let mut term_scorer = term_query
.specialized_weight(&searcher, true)
.specialized_weight(&searcher, true)?
.specialized_scorer(segment_reader, 1.0)?;
// the difference compared to before is intrinsic to the bm25 formula. no worries there.
for doc in segment_reader.doc_ids_alive() {
@@ -1572,7 +1572,7 @@ mod tests {
let segment_reader = searcher.segment_reader(0u32);
let mut term_scorer = term_query
.specialized_weight(&searcher, true)
.specialized_weight(&searcher, true)?
.specialized_scorer(segment_reader, 1.0)?;
// the difference compared to before is intrinsic to the bm25 formula. no worries there.
for doc in segment_reader.doc_ids_alive() {

View File

@@ -383,31 +383,23 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
{
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?;
}
{
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a a"));
index_writer.commit()?;
}
{
index_writer.add_document(doc!(text_field=>"c"));
index_writer.commit()?;
}
{
let reader = index.reader()?;
let searcher = reader.searcher();
let term_a = Term::from_field_text(text_field, "a");
assert_eq!(searcher.doc_freq(&term_a), 3);
let term_b = Term::from_field_text(text_field, "b");
assert_eq!(searcher.doc_freq(&term_b), 1);
let term_c = Term::from_field_text(text_field, "c");
assert_eq!(searcher.doc_freq(&term_c), 2);
let term_d = Term::from_field_text(text_field, "d");
assert_eq!(searcher.doc_freq(&term_d), 0);
}
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a a"));
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"c"));
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let term_a = Term::from_field_text(text_field, "a");
assert_eq!(searcher.doc_freq(&term_a)?, 3);
let term_b = Term::from_field_text(text_field, "b");
assert_eq!(searcher.doc_freq(&term_b)?, 1);
let term_c = Term::from_field_text(text_field, "c");
assert_eq!(searcher.doc_freq(&term_c)?, 2);
let term_d = Term::from_field_text(text_field, "d");
assert_eq!(searcher.doc_freq(&term_d)?, 0);
Ok(())
}
@@ -504,13 +496,13 @@ mod tests {
reader.reload()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(text_field);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)?
.is_none());
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert!(advance_undeleted(&mut postings, segment_reader));
assert_eq!(postings.doc(), 5);
@@ -518,7 +510,7 @@ mod tests {
}
{
let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert!(advance_undeleted(&mut postings, segment_reader));
assert_eq!(postings.doc(), 3);
@@ -540,14 +532,14 @@ mod tests {
reader.reload()?;
let searcher = reader.searcher();
let seg_reader = searcher.segment_reader(0);
let inverted_index = seg_reader.inverted_index(term_abcd.field());
let inverted_index = seg_reader.inverted_index(term_abcd.field())?;
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)?
.is_none());
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert!(advance_undeleted(&mut postings, seg_reader));
assert_eq!(postings.doc(), 5);
@@ -555,7 +547,7 @@ mod tests {
}
{
let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert!(advance_undeleted(&mut postings, seg_reader));
assert_eq!(postings.doc(), 3);
@@ -577,19 +569,19 @@ mod tests {
reader.reload()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(term_abcd.field());
let inverted_index = segment_reader.inverted_index(term_abcd.field())?;
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)?
.is_none());
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert!(!advance_undeleted(&mut postings, segment_reader));
}
{
let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert!(advance_undeleted(&mut postings, segment_reader));
assert_eq!(postings.doc(), 3);
@@ -599,7 +591,7 @@ mod tests {
}
{
let mut postings = inverted_index
.read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert!(advance_undeleted(&mut postings, segment_reader));
assert_eq!(postings.doc(), 4);
@@ -624,8 +616,8 @@ mod tests {
let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher
.segment_reader(0)
.inverted_index(term.field())
.read_postings(&term, IndexRecordOption::Basic)
.inverted_index(term.field())?
.read_postings(&term, IndexRecordOption::Basic)?
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED);
@@ -648,8 +640,8 @@ mod tests {
let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher
.segment_reader(0)
.inverted_index(term.field())
.read_postings(&term, IndexRecordOption::Basic)
.inverted_index(term.field())?
.read_postings(&term, IndexRecordOption::Basic)?
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED);
@@ -672,8 +664,8 @@ mod tests {
let term = Term::from_field_f64(value_field, val);
let mut postings = searcher
.segment_reader(0)
.inverted_index(term.field())
.read_postings(&term, IndexRecordOption::Basic)
.inverted_index(term.field())?
.read_postings(&term, IndexRecordOption::Basic)?
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), TERMINATED);
@@ -693,7 +685,7 @@ mod tests {
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(absent_field); //< should not panic
let inverted_index = segment_reader.inverted_index(absent_field)?;
assert_eq!(inverted_index.terms().num_terms(), 0);
Ok(())
}
@@ -743,14 +735,14 @@ mod tests {
let index_reader = index.reader()?;
let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
let inverted_index = reader.inverted_index(text_field)?;
let term_abcd = Term::from_field_text(text_field, "abcd");
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)?
.is_none());
let term_af = Term::from_field_text(text_field, "af");
let mut postings = inverted_index
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.term_freq(), 3);

View File

@@ -38,11 +38,11 @@ const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) a
pub mod tests {
use super::PositionSerializer;
use crate::directory::ReadOnlySource;
use crate::positions::reader::PositionReader;
use crate::{common::HasLen, directory::FileSlice};
use std::iter;
fn create_stream_buffer(vals: &[u32]) -> (ReadOnlySource, ReadOnlySource) {
fn create_stream_buffer(vals: &[u32]) -> (FileSlice, FileSlice) {
let mut skip_buffer = vec![];
let mut stream_buffer = vec![];
{
@@ -53,10 +53,7 @@ pub mod tests {
}
serializer.close().unwrap();
}
(
ReadOnlySource::from(stream_buffer),
ReadOnlySource::from(skip_buffer),
)
(FileSlice::new(stream_buffer), FileSlice::new(skip_buffer))
}
#[test]
@@ -65,7 +62,7 @@ pub mod tests {
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 12);
assert_eq!(stream.len(), 1168);
let mut position_reader = PositionReader::new(stream, skip, 0u64);
let mut position_reader = PositionReader::new(stream, skip, 0u64).unwrap();
for &n in &[1, 10, 127, 128, 130, 312] {
let mut v = vec![0u32; n];
position_reader.read(0, &mut v[..]);
@@ -81,7 +78,7 @@ pub mod tests {
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 12);
assert_eq!(stream.len(), 1168);
let mut position_reader = PositionReader::new(stream, skip, 0u64);
let mut position_reader = PositionReader::new(stream, skip, 0u64).unwrap();
for &offset in &[1u64, 10u64, 127u64, 128u64, 130u64, 312u64] {
for &len in &[1, 10, 130, 500] {
let mut v = vec![0u32; len];
@@ -100,7 +97,7 @@ pub mod tests {
assert_eq!(skip.len(), 12);
assert_eq!(stream.len(), 1168);
let mut position_reader = PositionReader::new(stream, skip, 0u64);
let mut position_reader = PositionReader::new(stream, skip, 0u64).unwrap();
let mut buf = [0u32; 7];
let mut c = 0;
@@ -122,7 +119,7 @@ pub mod tests {
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 15_749);
assert_eq!(stream.len(), 4_987_872);
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0);
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0).unwrap();
let mut buf = [0u32; 256];
position_reader.read(128, &mut buf);
for i in 0..256 {
@@ -142,7 +139,8 @@ pub mod tests {
assert_eq!(skip.len(), 15_749);
assert_eq!(stream.len(), 4_987_872);
let mut buf = [0u32; 1];
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 200_000);
let mut position_reader =
PositionReader::new(stream.clone(), skip.clone(), 200_000).unwrap();
position_reader.read(230, &mut buf);
position_reader.read(9, &mut buf);
}
@@ -157,7 +155,7 @@ pub mod tests {
}
let (stream, skip) = create_stream_buffer(&v[..]);
let mut buf = Vec::new();
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0);
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0).unwrap();
let mut offset = 0;
for i in 1..24 {
buf.resize(i, 0);
@@ -175,7 +173,7 @@ pub mod tests {
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 15_749);
assert_eq!(stream.len(), 1_000_000);
let mut position_reader = PositionReader::new(stream, skip, 128 * 1024);
let mut position_reader = PositionReader::new(stream, skip, 128 * 1024).unwrap();
let mut buf = [0u32; 1];
position_reader.read(0, &mut buf);
assert_eq!(buf[0], CONST_VAL);
@@ -194,7 +192,8 @@ pub mod tests {
128 * 1024 + 7,
128 * 10 * 1024 + 10,
] {
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), offset);
let mut position_reader =
PositionReader::new(stream.clone(), skip.clone(), offset).unwrap();
let mut buf = [0u32; 1];
position_reader.read(0, &mut buf);
assert_eq!(buf[0], offset as u32);

View File

@@ -1,8 +1,13 @@
use std::io;
use crate::common::{BinarySerializable, FixedSize};
use crate::directory::ReadOnlySource;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::positions::COMPRESSION_BLOCK_SIZE;
use crate::positions::LONG_SKIP_INTERVAL;
use crate::positions::LONG_SKIP_IN_BLOCKS;
use bitpacking::{BitPacker, BitPacker4x};
/// Positions work as a long sequence of compressed blocks.
/// All terms are chained one after the other.
///
@@ -23,28 +28,28 @@ use crate::positions::LONG_SKIP_IN_BLOCKS;
/// A given block obviously takes `(128 x num_bit_for_the_block / num_bits_in_a_byte)`,
/// so skipping a block without decompressing it is just a matter of advancing that many
/// bytes.
use bitpacking::{BitPacker, BitPacker4x};
use owned_read::OwnedRead;
/// Handle over the position data and its skip data, from which
/// `PositionReader`s are created (see `Positions::reader`).
///
/// NOTE(review): this listing shows the former `ReadOnlySource` fields
/// followed by the `FileSlice` / `OwnedBytes` fields that replace them.
struct Positions {
bit_packer: BitPacker4x,
skip_source: ReadOnlySource,
position_source: ReadOnlySource,
long_skip_source: ReadOnlySource,
// Skip data that remains once the long-skip entries and the footer have been split off.
skip_file: FileSlice,
// Position data; `reader` slices it from a byte offset given by `long_skip`.
position_file: FileSlice,
// Long-skip entries, read into memory in `new`; `long_skip` decodes them as `u64`s.
long_skip_data: OwnedBytes,
}
impl Positions {
/// Builds a `Positions` from the position data and its skip data.
///
/// The skip data is parsed from its end: the last `u32::SIZE_IN_BYTES`
/// bytes are a footer holding the number of long skips, and the
/// `u64::SIZE_IN_BYTES * num_long_skips` bytes right before it hold the
/// long-skip entries. What remains in front is kept as the skip data.
///
/// # Errors
///
/// In the `io::Result` form, fails if reading the footer or the long-skip
/// entries fails, or if the footer cannot be deserialized as a `u32`.
/// The former form panicked with "Index corrupted" on a bad footer instead.
pub fn new(position_source: ReadOnlySource, skip_source: ReadOnlySource) -> Positions {
let (body, footer) = skip_source.split_from_end(u32::SIZE_IN_BYTES);
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
let (skip_source, long_skip_source) =
pub fn new(position_file: FileSlice, skip_file: FileSlice) -> io::Result<Positions> {
// The footer is the trailing `u32` of the skip file.
let (body, footer) = skip_file.split_from_end(u32::SIZE_IN_BYTES);
let footer_data = footer.read_bytes()?;
let num_long_skips = u32::deserialize(&mut footer_data.as_slice())?;
// One `u64` per long skip, stored right before the footer.
let (skip_file, long_skip_file) =
body.split_from_end(u64::SIZE_IN_BYTES * (num_long_skips as usize));
Positions {
// The long-skip entries are read eagerly; the other two parts stay as file slices.
let long_skip_data = long_skip_file.read_bytes()?;
Ok(Positions {
bit_packer: BitPacker4x::new(),
skip_source,
long_skip_source,
position_source,
}
skip_file,
long_skip_data,
position_file,
})
}
/// Returns the offset of the block associated to the given `long_skip_id`.
@@ -54,19 +59,23 @@ impl Positions {
if long_skip_id == 0 {
return 0;
}
let long_skip_slice = self.long_skip_source.as_slice();
let long_skip_slice = self.long_skip_data.as_slice();
let mut long_skip_blocks: &[u8] = &long_skip_slice[(long_skip_id - 1) * 8..][..8];
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted")
}
fn reader(&self, offset: u64) -> PositionReader {
fn reader(&self, offset: u64) -> io::Result<PositionReader> {
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
let offset_num_bytes: u64 = self.long_skip(long_skip_id);
let mut position_read = OwnedRead::new(self.position_source.clone());
position_read.advance(offset_num_bytes as usize);
let mut skip_read = OwnedRead::new(self.skip_source.clone());
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
PositionReader {
let position_read = self
.position_file
.slice_from(offset_num_bytes as usize)
.read_bytes()?;
let skip_read = self
.skip_file
.slice_from(long_skip_id * LONG_SKIP_IN_BLOCKS)
.read_bytes()?;
Ok(PositionReader {
bit_packer: self.bit_packer,
skip_read,
position_read,
@@ -74,14 +83,14 @@ impl Positions {
block_offset: std::i64::MAX as u64,
anchor_offset: (long_skip_id as u64) * LONG_SKIP_INTERVAL,
abs_offset: offset,
}
})
}
}
#[derive(Clone)]
pub struct PositionReader {
skip_read: OwnedRead,
position_read: OwnedRead,
skip_read: OwnedBytes,
position_read: OwnedBytes,
bit_packer: BitPacker4x,
buffer: Box<[u32; COMPRESSION_BLOCK_SIZE]>,
@@ -93,11 +102,12 @@ pub struct PositionReader {
impl PositionReader {
/// Creates a `PositionReader` over the given position and skip data,
/// starting at `offset`.
///
/// This is a convenience wrapper: it builds a `Positions` and immediately
/// asks it for a reader at `offset`.
///
/// # Errors
///
/// In the `io::Result` form, any error returned by `Positions::new` or
/// `Positions::reader` is propagated to the caller.
pub fn new(
position_source: ReadOnlySource,
skip_source: ReadOnlySource,
position_file: FileSlice,
skip_file: FileSlice,
offset: u64,
) -> PositionReader {
Positions::new(position_source, skip_source).reader(offset)
) -> io::Result<PositionReader> {
let positions = Positions::new(position_file, skip_file)?;
positions.reader(offset)
}
fn advance_num_blocks(&mut self, num_blocks: usize) {
@@ -131,7 +141,7 @@ impl PositionReader {
self.advance_num_blocks(num_blocks_to_skip);
self.anchor_offset = offset - (offset % COMPRESSION_BLOCK_SIZE as u64);
self.block_offset = self.anchor_offset;
let num_bits = self.skip_read.get(0);
let num_bits = self.skip_read.as_slice()[0];
self.bit_packer
.decompress(self.position_read.as_ref(), self.buffer.as_mut(), num_bits);
} else {
@@ -141,7 +151,7 @@ impl PositionReader {
self.anchor_offset = self.block_offset;
}
let mut num_bits = self.skip_read.get(0);
let mut num_bits = self.skip_read.as_slice()[0];
let mut position_data = self.position_read.as_ref();
for i in 1.. {
@@ -155,7 +165,7 @@ impl PositionReader {
output = &mut output[remaining_in_block..];
offset += remaining_in_block as u64;
position_data = &position_data[(num_bits as usize * COMPRESSION_BLOCK_SIZE / 8)..];
num_bits = self.skip_read.get(i);
num_bits = self.skip_read.as_slice()[i];
self.bit_packer
.decompress(position_data, self.buffer.as_mut(), num_bits);
self.block_offset += COMPRESSION_BLOCK_SIZE as u64;

View File

@@ -1,5 +1,8 @@
use std::io;
use crate::common::{BinarySerializable, VInt};
use crate::directory::ReadOnlySource;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::fieldnorm::FieldNormReader;
use crate::postings::compression::{
AlignedBuffer, BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE,
@@ -34,7 +37,7 @@ pub struct BlockSegmentPostings {
doc_freq: u32,
data: ReadOnlySource,
data: OwnedBytes,
pub(crate) skip_reader: SkipReader,
}
@@ -72,37 +75,34 @@ fn decode_vint_block(
/// Splits the serialized data of a term into its optional skip section and
/// its postings section.
///
/// When `doc_freq` is smaller than `COMPRESSION_BLOCK_SIZE`, there is no
/// skip section: the input is returned untouched as the postings data.
/// Otherwise the data starts with a `VInt` giving the length in bytes of
/// the skip section, followed by the skip section, followed by the postings.
///
/// # Panics
///
/// Panics with "Data corrupted" if the leading `VInt` cannot be deserialized.
fn split_into_skips_and_postings(
doc_freq: u32,
data: ReadOnlySource,
) -> (Option<ReadOnlySource>, ReadOnlySource) {
mut bytes: OwnedBytes,
) -> (Option<OwnedBytes>, OwnedBytes) {
if doc_freq < COMPRESSION_BLOCK_SIZE as u32 {
return (None, data);
return (None, bytes);
}
// Former form: measure how many bytes the VInt consumed, then slice past it by hand.
let mut data_byte_arr = data.as_slice();
let skip_len = VInt::deserialize(&mut data_byte_arr)
.expect("Data corrupted")
.0 as usize;
let vint_len = data.len() - data_byte_arr.len();
let (skip_data, postings_data) = data.slice_from(vint_len).split(skip_len);
// New form: deserialize directly from `bytes`.
// NOTE(review): presumably this advances `bytes` past the VInt, so the split
// below starts at the skip section — confirm against `OwnedBytes`.
let skip_len = VInt::deserialize(&mut bytes).expect("Data corrupted").0 as usize;
let (skip_data, postings_data) = bytes.split(skip_len);
(Some(skip_data), postings_data)
}
impl BlockSegmentPostings {
pub(crate) fn from_data(
pub(crate) fn open(
doc_freq: u32,
data: ReadOnlySource,
data: FileSlice,
record_option: IndexRecordOption,
requested_option: IndexRecordOption,
) -> BlockSegmentPostings {
) -> io::Result<BlockSegmentPostings> {
let freq_reading_option = match (record_option, requested_option) {
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
(_, _) => FreqReadingOption::ReadFreq,
};
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
let (skip_data_opt, postings_data) =
split_into_skips_and_postings(doc_freq, data.read_bytes()?);
let skip_reader = match skip_data_opt {
Some(skip_data) => SkipReader::new(skip_data, doc_freq, record_option),
None => SkipReader::new(ReadOnlySource::empty(), doc_freq, record_option),
None => SkipReader::new(OwnedBytes::empty(), doc_freq, record_option),
};
let mut block_segment_postings = BlockSegmentPostings {
@@ -116,7 +116,7 @@ impl BlockSegmentPostings {
skip_reader,
};
block_segment_postings.load_block();
block_segment_postings
Ok(block_segment_postings)
}
/// Returns the block_max_score for the current block.
@@ -172,15 +172,15 @@ impl BlockSegmentPostings {
// # Warning
//
// This does not reset the positions list.
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: ReadOnlySource) {
pub(crate) fn reset(&mut self, doc_freq: u32, postings_data: OwnedBytes) {
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
self.data = ReadOnlySource::new(postings_data);
self.data = postings_data;
self.block_max_score_cache = None;
self.loaded_offset = std::usize::MAX;
if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data, doc_freq);
} else {
self.skip_reader.reset(ReadOnlySource::empty(), doc_freq);
self.skip_reader.reset(OwnedBytes::empty(), doc_freq);
}
self.doc_freq = doc_freq;
self.load_block();
@@ -344,8 +344,8 @@ impl BlockSegmentPostings {
freq_reading_option: FreqReadingOption::NoFreq,
block_max_score_cache: None,
doc_freq: 0,
data: ReadOnlySource::new(vec![]),
skip_reader: SkipReader::new(ReadOnlySource::new(vec![]), 0, IndexRecordOption::Basic),
data: OwnedBytes::empty(),
skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic),
}
}
}
@@ -467,10 +467,12 @@ mod tests {
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field);
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
let term = Term::from_field_u64(int_field, 0u64);
let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
.unwrap()
}
#[test]
@@ -491,37 +493,38 @@ mod tests {
}
// Checks that a `BlockSegmentPostings` opened on one term can be re-pointed
// at another term with `reset_block_postings_from_terminfo`, and then yields
// the documents of the second term.
#[test]
fn test_reset_block_segment_postings() {
fn test_reset_block_segment_postings() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
// Create two postings lists: the term `0` matches the even doc ids,
// the term `1` matches the odd doc ids.
for i in 0..6 {
let doc = doc!(int_field=> (i % 2) as u64);
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
let mut block_segments;
{
// Open the block postings on the term `0`.
let term = Term::from_field_u64(int_field, 0u64);
let inverted_index = segment_reader.inverted_index(int_field);
let inverted_index = segment_reader.inverted_index(int_field)?;
let term_info = inverted_index.get_term_info(&term).unwrap();
block_segments = inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic);
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?;
}
assert_eq!(block_segments.docs(), &[0, 2, 4]);
{
// Reuse the same object for the term `1`.
let term = Term::from_field_u64(int_field, 1u64);
let inverted_index = segment_reader.inverted_index(int_field);
let inverted_index = segment_reader.inverted_index(int_field)?;
let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments)?;
}
assert_eq!(block_segments.docs(), &[1, 3, 5]);
Ok(())
}
}

View File

@@ -101,12 +101,12 @@ pub mod tests {
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
let inverted_index = searcher.segment_reader(0u32).inverted_index(title)?;
let term = Term::from_field_text(title, "abc");
let mut positions = Vec::new();
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert_eq!(postings.doc(), 0);
postings.positions(&mut positions);
@@ -120,7 +120,7 @@ pub mod tests {
}
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.advance(), 1);
@@ -129,7 +129,7 @@ pub mod tests {
}
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert_eq!(postings.seek(1), 1);
assert_eq!(postings.doc(), 1);
@@ -138,7 +138,7 @@ pub mod tests {
}
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert_eq!(postings.seek(1002), 1002);
assert_eq!(postings.doc(), 1002);
@@ -147,7 +147,7 @@ pub mod tests {
}
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert_eq!(postings.seek(100), 100);
assert_eq!(postings.seek(1002), 1002);
@@ -159,7 +159,7 @@ pub mod tests {
}
#[test]
pub fn test_drop_token_that_are_too_long() {
pub fn test_drop_token_that_are_too_long() -> crate::Result<()> {
let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
exceeding_token_text.push_str(" hello");
@@ -184,7 +184,7 @@ pub mod tests {
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inverted_index = segment_reader.inverted_index(text_field);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
@@ -196,12 +196,13 @@ pub mod tests {
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(1u32);
let inverted_index = segment_reader.inverted_index(text_field);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
assert_eq!(&bytes[..], ok_token_text.as_bytes());
}
Ok(())
}
#[test]
@@ -261,15 +262,15 @@ pub mod tests {
{
let term_a = Term::from_field_text(text_field, "abcdef");
assert!(segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.inverted_index(term_a.field())?
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)?
.is_none());
}
{
let term_a = Term::from_field_text(text_field, "a");
let mut postings_a = segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.inverted_index(term_a.field())?
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert_eq!(postings_a.len(), 1000);
assert_eq!(postings_a.doc(), 0);
@@ -291,8 +292,8 @@ pub mod tests {
{
let term_e = Term::from_field_text(text_field, "e");
let mut postings_e = segment_reader
.inverted_index(term_e.field())
.read_postings(&term_e, IndexRecordOption::WithFreqsAndPositions)
.inverted_index(term_e.field())?
.read_postings(&term_e, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert_eq!(postings_e.len(), 1000 - 2);
for i in 2u32..1000u32 {
@@ -312,7 +313,7 @@ pub mod tests {
}
#[test]
pub fn test_position_and_fieldnorm2() {
pub fn test_position_and_fieldnorm2() -> crate::Result<()> {
let mut positions: Vec<u32> = Vec::new();
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
@@ -328,16 +329,17 @@ pub mod tests {
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let mut postings = segment_reader
.inverted_index(text_field)
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.inverted_index(text_field)?
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)?
.unwrap();
assert_eq!(postings.doc(), 1u32);
postings.positions(&mut positions);
assert_eq!(&positions[..], &[1u32, 4]);
Ok(())
}
#[test]
fn test_skip_next() {
fn test_skip_next() -> crate::Result<()> {
let term_0 = Term::from_field_u64(Field::from_field_id(0), 0);
let term_1 = Term::from_field_u64(Field::from_field_id(0), 1);
let term_2 = Term::from_field_u64(Field::from_field_id(0), 2);
@@ -348,10 +350,9 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_u64_field("value", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..num_docs as u64 {
let doc = doc!(value_field => 2u64, value_field => i % 2u64);
index_writer.add_document(doc);
@@ -360,15 +361,15 @@ pub mod tests {
}
index
};
let searcher = index.reader().unwrap().searcher();
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
// check that the basic usage works
for i in 0..num_docs - 1 {
for j in i + 1..num_docs {
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.inverted_index(term_2.field())?
.read_postings(&term_2, IndexRecordOption::Basic)?
.unwrap();
assert_eq!(segment_postings.seek(i), i);
assert_eq!(segment_postings.doc(), i);
@@ -380,8 +381,8 @@ pub mod tests {
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.inverted_index(term_2.field())?
.read_postings(&term_2, IndexRecordOption::Basic)?
.unwrap();
// check that `skip_next` advances the iterator
@@ -400,8 +401,8 @@ pub mod tests {
// check that filtering works
{
let mut segment_postings = segment_reader
.inverted_index(term_0.field())
.read_postings(&term_0, IndexRecordOption::Basic)
.inverted_index(term_0.field())?
.read_postings(&term_0, IndexRecordOption::Basic)?
.unwrap();
for i in 0..num_docs / 2 {
@@ -410,8 +411,8 @@ pub mod tests {
}
let mut segment_postings = segment_reader
.inverted_index(term_0.field())
.read_postings(&term_0, IndexRecordOption::Basic)
.inverted_index(term_0.field())?
.read_postings(&term_0, IndexRecordOption::Basic)?
.unwrap();
for i in 0..num_docs / 2 - 1 {
@@ -422,19 +423,19 @@ pub mod tests {
// delete some of the documents
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok());
}
let searcher = index.reader().unwrap().searcher();
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
// make sure seeking still works
for i in 0..num_docs {
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.inverted_index(term_2.field())?
.read_postings(&term_2, IndexRecordOption::Basic)?
.unwrap();
if i % 2 == 0 {
@@ -450,8 +451,8 @@ pub mod tests {
// now try with a longer sequence
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.inverted_index(term_2.field())?
.read_postings(&term_2, IndexRecordOption::Basic)?
.unwrap();
let mut last = 2; // start from 5 to avoid seeking to 3 twice
@@ -476,20 +477,19 @@ pub mod tests {
// delete everything else
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok());
}
let searcher = index.reader().unwrap().searcher();
let searcher = index.reader()?.searcher();
// finally, check that it's empty
{
let searchable_segment_ids = index
.searchable_segment_ids()
.expect("could not get index segment ids");
let searchable_segment_ids = index.searchable_segment_ids()?;
assert!(searchable_segment_ids.is_empty());
assert_eq!(searcher.num_docs(), 0);
}
Ok(())
}
pub static TERM_A: Lazy<Term> = Lazy::new(|| {
@@ -621,7 +621,7 @@ mod bench {
// Benchmarks a full scan of the postings list of `TERM_A`.
//
// NOTE(review): the `read_postings(..)?` line below sits inside the closure
// passed to `b.iter`. The closure body ends with a `while` loop, so it
// evaluates to `()`, and `?` requires a `Result`/`Option` return type. The
// other benchmarks in this module chain a second `.unwrap()` instead of `?`;
// this one should presumably do the same — confirm.
// NOTE(review): elsewhere in this change the result of `inverted_index(..)`
// is followed by `?`, so it likely needs unwrapping here as well — confirm.
b.iter(|| {
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, IndexRecordOption::Basic)
.read_postings(&*TERM_A, IndexRecordOption::Basic)?
.unwrap();
while segment_postings.advance() != TERMINATED {}
});
@@ -636,18 +636,22 @@ mod bench {
let segment_postings_a = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap()
.unwrap();
let segment_postings_b = segment_reader
.inverted_index(TERM_B.field())
.read_postings(&*TERM_B, IndexRecordOption::Basic)
.unwrap()
.unwrap();
let segment_postings_c = segment_reader
.inverted_index(TERM_C.field())
.read_postings(&*TERM_C, IndexRecordOption::Basic)
.unwrap()
.unwrap();
let segment_postings_d = segment_reader
.inverted_index(TERM_D.field())
.read_postings(&*TERM_D, IndexRecordOption::Basic)
.unwrap()
.unwrap();
let mut intersection = Intersection::new(vec![
segment_postings_a,
@@ -668,6 +672,7 @@ mod bench {
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap()
.unwrap();
let mut existing_docs = Vec::new();

View File

@@ -161,7 +161,7 @@ impl MultiFieldPostingsWriter {
}
let postings_writer = &self.per_field_postings_writers[field.field_id() as usize];
let fieldnorm_reader = fieldnorm_readers.get_field(field);
let fieldnorm_reader = fieldnorm_readers.get_field(field)?;
let mut field_serializer = serializer.new_field(
field,
postings_writer.total_num_tokens(),

View File

@@ -12,7 +12,7 @@ use crate::postings::Postings;
use crate::schema::IndexRecordOption;
use crate::{DocId, TERMINATED};
use crate::directory::ReadOnlySource;
use crate::directory::FileSlice;
use crate::fastfield::DeleteBitSet;
use crate::postings::BlockSegmentPostings;
@@ -86,12 +86,13 @@ impl SegmentPostings {
.close_term(docs.len() as u32)
.expect("In memory Serialization should never fail.");
}
let block_segment_postings = BlockSegmentPostings::from_data(
let block_segment_postings = BlockSegmentPostings::open(
docs.len() as u32,
ReadOnlySource::from(buffer),
FileSlice::new(buffer),
IndexRecordOption::Basic,
IndexRecordOption::Basic,
);
)
.unwrap();
SegmentPostings::from_block_postings(block_segment_postings, None)
}
@@ -131,12 +132,13 @@ impl SegmentPostings {
postings_serializer
.close_term(doc_and_tfs.len() as u32)
.unwrap();
let block_segment_postings = BlockSegmentPostings::from_data(
let block_segment_postings = BlockSegmentPostings::open(
doc_and_tfs.len() as u32,
ReadOnlySource::from(buffer),
FileSlice::new(buffer),
IndexRecordOption::WithFreqs,
IndexRecordOption::WithFreqs,
);
)
.unwrap();
SegmentPostings::from_block_postings(block_segment_postings, None)
}
@@ -204,7 +206,7 @@ impl DocSet for SegmentPostings {
}
/// Return the current document's `DocId`.
#[inline]
#[inline(always)]
fn doc(&self) -> DocId {
self.block_cursor.doc(self.cur)
}

View File

@@ -1,10 +1,9 @@
use crate::common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable, VInt};
use crate::directory::ReadOnlySource;
use crate::common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable};
use crate::directory::OwnedBytes;
use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
use crate::query::BM25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
use owned_read::OwnedRead;
pub struct SkipSerializer {
buffer: Vec<u8>,
@@ -62,7 +61,7 @@ impl SkipSerializer {
pub(crate) struct SkipReader {
last_doc_in_block: DocId,
pub(crate) last_doc_in_previous_block: DocId,
owned_read: OwnedRead,
owned_read: OwnedBytes,
skip_info: IndexRecordOption,
byte_offset: usize,
remaining_docs: u32, // number of docs remaining, including the
@@ -93,7 +92,7 @@ impl Default for BlockInfo {
}
impl SkipReader {
pub fn new(data: ReadOnlySource, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader {
pub fn new(data: OwnedBytes, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader {
let mut skip_reader = SkipReader {
last_doc_in_block: if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
0
@@ -101,7 +100,7 @@ impl SkipReader {
TERMINATED
},
last_doc_in_previous_block: 0u32,
owned_read: OwnedRead::new(data),
owned_read: data,
skip_info,
block_info: BlockInfo::VInt { num_docs: doc_freq },
byte_offset: 0,
@@ -114,14 +113,14 @@ impl SkipReader {
skip_reader
}
pub fn reset(&mut self, data: ReadOnlySource, doc_freq: u32) {
pub fn reset(&mut self, data: OwnedBytes, doc_freq: u32) {
self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
0
} else {
TERMINATED
};
self.last_doc_in_previous_block = 0u32;
self.owned_read = OwnedRead::new(data);
self.owned_read = data;
self.block_info = BlockInfo::VInt { num_docs: doc_freq };
self.byte_offset = 0;
self.remaining_docs = doc_freq;
@@ -154,17 +153,24 @@ impl SkipReader {
self.position_offset
}
#[inline(always)]
pub fn byte_offset(&self) -> usize {
self.byte_offset
}
fn read_block_info(&mut self) {
let doc_delta = u32::deserialize(&mut self.owned_read).expect("Skip data corrupted");
let doc_delta = {
let bytes = self.owned_read.as_slice();
let mut buf = [0; 4];
buf.copy_from_slice(&bytes[..4]);
u32::from_le_bytes(buf)
};
self.last_doc_in_block += doc_delta as DocId;
let doc_num_bits = self.owned_read.get(0);
let doc_num_bits = self.owned_read.as_slice()[4];
match self.skip_info {
IndexRecordOption::Basic => {
self.owned_read.advance(1);
self.owned_read.advance(5);
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits: 0,
@@ -174,11 +180,11 @@ impl SkipReader {
};
}
IndexRecordOption::WithFreqs => {
let tf_num_bits = self.owned_read.get(1);
let block_wand_fieldnorm_id = self.owned_read.get(2);
let data = &self.owned_read.as_ref()[3..];
let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(data);
self.owned_read.advance(3 + num_bytes);
let bytes = self.owned_read.as_slice();
let tf_num_bits = bytes[5];
let block_wand_fieldnorm_id = bytes[6];
let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(&bytes[7..]);
self.owned_read.advance(7 + num_bytes);
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits,
@@ -188,13 +194,16 @@ impl SkipReader {
};
}
IndexRecordOption::WithFreqsAndPositions => {
let tf_num_bits = self.owned_read.get(1);
self.owned_read.advance(2);
let tf_sum = u32::deserialize(&mut self.owned_read).expect("Failed reading tf_sum");
let block_wand_fieldnorm_id = self.owned_read.get(0);
self.owned_read.advance(1);
let block_wand_term_freq =
VInt::deserialize_u64(&mut self.owned_read).unwrap() as u32;
let bytes = self.owned_read.as_slice();
let tf_num_bits = bytes[5];
let tf_sum = {
let mut buf = [0; 4];
buf.copy_from_slice(&bytes[6..10]);
u32::from_le_bytes(buf)
};
let block_wand_fieldnorm_id = bytes[10];
let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(&bytes[11..]);
self.owned_read.advance(11 + num_bytes);
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits,
@@ -262,7 +271,7 @@ mod tests {
use super::BlockInfo;
use super::IndexRecordOption;
use super::{SkipReader, SkipSerializer};
use crate::directory::ReadOnlySource;
use crate::directory::OwnedBytes;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
#[test]
@@ -278,11 +287,8 @@ mod tests {
skip_serializer.data().to_owned()
};
let doc_freq = 3u32 + (COMPRESSION_BLOCK_SIZE * 2) as u32;
let mut skip_reader = SkipReader::new(
ReadOnlySource::new(buf),
doc_freq,
IndexRecordOption::WithFreqs,
);
let mut skip_reader =
SkipReader::new(OwnedBytes::new(buf), doc_freq, IndexRecordOption::WithFreqs);
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info,
@@ -323,11 +329,8 @@ mod tests {
skip_serializer.data().to_owned()
};
let doc_freq = 3u32 + (COMPRESSION_BLOCK_SIZE * 2) as u32;
let mut skip_reader = SkipReader::new(
ReadOnlySource::from(buf),
doc_freq,
IndexRecordOption::Basic,
);
let mut skip_reader =
SkipReader::new(OwnedBytes::new(buf), doc_freq, IndexRecordOption::Basic);
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info(),
@@ -367,11 +370,8 @@ mod tests {
skip_serializer.data().to_owned()
};
let doc_freq = COMPRESSION_BLOCK_SIZE as u32;
let mut skip_reader = SkipReader::new(
ReadOnlySource::from(buf),
doc_freq,
IndexRecordOption::Basic,
);
let mut skip_reader =
SkipReader::new(OwnedBytes::new(buf), doc_freq, IndexRecordOption::Basic);
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info(),

View File

@@ -206,8 +206,8 @@ mod tests {
fn test_stack_long() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new();
let source: Vec<u32> = (0..100).collect();
for &el in &source {
let data: Vec<u32> = (0..100).collect();
for &el in &data {
assert!(stack
.writer(&mut heap)
.write_u32::<LittleEndian>(el)
@@ -221,7 +221,7 @@ mod tests {
result.push(LittleEndian::read_u32(&remaining[..4]));
remaining = &remaining[4..];
}
assert_eq!(&result[..], &source[..]);
assert_eq!(&result[..], &data[..]);
}
#[test]

View File

@@ -42,13 +42,13 @@ where
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field);
let inverted_index = reader.inverted_index(self.field)?;
let term_dict = inverted_index.terms();
let mut term_stream = self.automaton_stream(term_dict);
while term_stream.advance() {
let term_info = term_stream.value();
let mut block_segment_postings = inverted_index
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?;
loop {
let docs = block_segment_postings.docs();
if docs.is_empty() {

View File

@@ -52,7 +52,7 @@ impl BM25Weight {
}
}
pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> BM25Weight {
pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> crate::Result<BM25Weight> {
assert!(!terms.is_empty(), "BM25 requires at least one term");
let field = terms[0].field();
for term in &terms[1..] {
@@ -66,25 +66,27 @@ impl BM25Weight {
let mut total_num_tokens = 0u64;
let mut total_num_docs = 0u64;
for segment_reader in searcher.segment_readers() {
let inverted_index = segment_reader.inverted_index(field);
let inverted_index = segment_reader.inverted_index(field)?;
total_num_tokens += inverted_index.total_num_tokens();
total_num_docs += u64::from(segment_reader.max_doc());
}
let average_fieldnorm = total_num_tokens as Score / total_num_docs as Score;
if terms.len() == 1 {
let term_doc_freq = searcher.doc_freq(&terms[0]);
BM25Weight::for_one_term(term_doc_freq, total_num_docs, average_fieldnorm)
let term_doc_freq = searcher.doc_freq(&terms[0])?;
Ok(BM25Weight::for_one_term(
term_doc_freq,
total_num_docs,
average_fieldnorm,
))
} else {
let idf = terms
.iter()
.map(|term| {
let term_doc_freq = searcher.doc_freq(term);
idf(term_doc_freq, total_num_docs)
})
.sum::<Score>();
let idf_explain = Explanation::new("idf", idf);
BM25Weight::new(idf_explain, average_fieldnorm)
let mut idf_sum: Score = 0.0;
for term in terms {
let term_doc_freq = searcher.doc_freq(term)?;
idf_sum += idf(term_doc_freq, total_num_docs);
}
let idf_explain = Explanation::new("idf", idf_sum);
Ok(BM25Weight::new(idf_explain, average_fieldnorm))
}
}

View File

@@ -95,7 +95,7 @@ impl PhraseQuery {
)));
}
let terms = self.phrase_terms();
let bm25_weight = BM25Weight::for_terms(searcher, &terms);
let bm25_weight = BM25Weight::for_terms(searcher, &terms)?;
Ok(PhraseWeight::new(
self.phrase_terms.clone(),
bm25_weight,

View File

@@ -48,8 +48,8 @@ impl PhraseWeight {
let mut term_postings_list = Vec::new();
for &(offset, ref term) in &self.phrase_terms {
if let Some(postings) = reader
.inverted_index(term.field())
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.inverted_index(term.field())?
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)?
{
term_postings_list.push((offset, postings));
} else {
@@ -66,8 +66,8 @@ impl PhraseWeight {
let mut term_postings_list = Vec::new();
for &(offset, ref term) in &self.phrase_terms {
if let Some(postings) = reader
.inverted_index(term.field())
.read_postings_no_deletes(&term, IndexRecordOption::WithFreqsAndPositions)
.inverted_index(term.field())?
.read_postings_no_deletes(&term, IndexRecordOption::WithFreqsAndPositions)?
{
term_postings_list.push((offset, postings));
} else {

View File

@@ -296,13 +296,13 @@ impl Weight for RangeWeight {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field);
let inverted_index = reader.inverted_index(self.field)?;
let term_dict = inverted_index.terms();
let mut term_range = self.term_range(term_dict);
while term_range.advance() {
let term_info = term_range.value();
let mut block_segment_postings = inverted_index
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?;
loop {
let docs = block_segment_postings.docs();
if docs.is_empty() {

View File

@@ -87,21 +87,31 @@ impl TermQuery {
/// While `.weight(...)` returns a boxed trait object,
/// this method returns a specific implementation.
/// This is useful for optimization purposes.
pub fn specialized_weight(&self, searcher: &Searcher, scoring_enabled: bool) -> TermWeight {
pub fn specialized_weight(
&self,
searcher: &Searcher,
scoring_enabled: bool,
) -> crate::Result<TermWeight> {
let term = self.term.clone();
let bm25_weight = BM25Weight::for_terms(searcher, &[term]);
let bm25_weight = BM25Weight::for_terms(searcher, &[term])?;
let index_record_option = if scoring_enabled {
self.index_record_option
} else {
IndexRecordOption::Basic
};
TermWeight::new(self.term.clone(), index_record_option, bm25_weight)
Ok(TermWeight::new(
self.term.clone(),
index_record_option,
bm25_weight,
))
}
}
impl Query for TermQuery {
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> crate::Result<Box<dyn Weight>> {
Ok(Box::new(self.specialized_weight(searcher, scoring_enabled)))
Ok(Box::new(
self.specialized_weight(searcher, scoring_enabled)?,
))
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
term_set.insert(self.term.clone());

View File

@@ -253,7 +253,7 @@ mod tests {
}
fn test_block_wand_aux(term_query: &TermQuery, searcher: &Searcher) -> crate::Result<()> {
let term_weight = term_query.specialized_weight(&searcher, true);
let term_weight = term_query.specialized_weight(&searcher, true)?;
for reader in searcher.segment_readers() {
let mut block_max_scores = vec![];
let mut block_max_scores_b = vec![];

View File

@@ -36,11 +36,9 @@ impl Weight for TermWeight {
Ok(self.scorer(reader, 1.0)?.count(delete_bitset))
} else {
let field = self.term.field();
Ok(reader
.inverted_index(field)
.get_term_info(&self.term)
.map(|term_info| term_info.doc_freq)
.unwrap_or(0))
let inv_index = reader.inverted_index(field)?;
let term_info = inv_index.get_term_info(&self.term);
Ok(term_info.map(|term_info| term_info.doc_freq).unwrap_or(0))
}
}
@@ -97,11 +95,11 @@ impl TermWeight {
boost: Score,
) -> crate::Result<TermScorer> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field);
let inverted_index = reader.inverted_index(field)?;
let fieldnorm_reader = reader.get_fieldnorms_reader(field)?;
let similarity_weight = self.similarity_weight.boost_by(boost);
let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.index_record_option);
inverted_index.read_postings(&self.term, self.index_record_option)?;
if let Some(segment_postings) = postings_opt {
Ok(TermScorer::new(
segment_postings,

View File

@@ -9,8 +9,8 @@ use crate::directory::META_LOCK;
use crate::Index;
use crate::Searcher;
use crate::SegmentReader;
use std::convert::TryInto;
use std::sync::Arc;
use std::{convert::TryInto, io};
/// Defines when a new version of the index should be reloaded.
///
@@ -138,11 +138,11 @@ impl InnerIndexReader {
.collect::<crate::Result<_>>()?
};
let schema = self.index.schema();
let searchers = std::iter::repeat_with(|| {
let searchers: Vec<Searcher> = std::iter::repeat_with(|| {
Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())
})
.take(self.num_searchers)
.collect();
.collect::<io::Result<_>>()?;
self.searcher_pool.publish_new_generation(searchers);
Ok(())
}

View File

@@ -152,7 +152,7 @@ impl<B> Term<B>
where
B: AsRef<[u8]>,
{
/// Wraps a source of data
/// Wraps an object holding bytes
pub fn wrap(data: B) -> Term<B> {
Term(data)
}

View File

@@ -263,19 +263,17 @@ impl SnippetGenerator {
) -> crate::Result<SnippetGenerator> {
let mut terms = BTreeSet::new();
query.query_terms(&mut terms);
let terms_text: BTreeMap<String, Score> = terms
.into_iter()
.filter(|term| term.field() == field)
.flat_map(|term| {
let doc_freq = searcher.doc_freq(&term);
let mut terms_text: BTreeMap<String, Score> = Default::default();
for term in terms {
if term.field() != field {
continue;
}
let doc_freq = searcher.doc_freq(&term)?;
if doc_freq > 0 {
let score = 1.0 / (1.0 + doc_freq as Score);
if doc_freq > 0 {
Some((term.text().to_string(), score))
} else {
None
}
})
.collect();
terms_text.insert(term.text().to_string(), score);
}
}
let tokenizer = searcher.index().tokenizer_for_field(field)?;
Ok(SnippetGenerator {
terms_text,

View File

@@ -307,7 +307,7 @@ mod test {
let index = Index::create_in_ram(schema.clone());
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let searcher_space_usage = searcher.space_usage();
let searcher_space_usage = searcher.space_usage().unwrap();
assert_eq!(0, searcher_space_usage.total());
}
@@ -346,7 +346,7 @@ mod test {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let searcher_space_usage = searcher.space_usage();
let searcher_space_usage = searcher.space_usage().unwrap();
assert!(searcher_space_usage.total() > 0);
assert_eq!(1, searcher_space_usage.segments().len());
@@ -386,7 +386,7 @@ mod test {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let searcher_space_usage = searcher.space_usage();
let searcher_space_usage = searcher.space_usage().unwrap();
assert!(searcher_space_usage.total() > 0);
assert_eq!(1, searcher_space_usage.segments().len());
@@ -425,7 +425,7 @@ mod test {
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let searcher_space_usage = searcher.space_usage();
let searcher_space_usage = searcher.space_usage().unwrap();
assert!(searcher_space_usage.total() > 0);
assert_eq!(1, searcher_space_usage.segments().len());
@@ -446,49 +446,47 @@ mod test {
}
#[test]
fn test_deletes() {
fn test_deletes() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let name = schema_builder.add_u64_field("name", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => 1u64));
index_writer.add_document(doc!(name => 2u64));
index_writer.add_document(doc!(name => 3u64));
index_writer.add_document(doc!(name => 4u64));
index_writer.commit().unwrap();
index_writer.commit()?;
}
{
let mut index_writer2 = index.writer(50_000_000).unwrap();
let mut index_writer2 = index.writer(50_000_000)?;
index_writer2.delete_term(Term::from_field_u64(name, 2u64));
index_writer2.delete_term(Term::from_field_u64(name, 3u64));
// ok, now we should have a deleted doc
index_writer2.commit().unwrap();
index_writer2.commit()?;
}
let reader = index.reader().unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let searcher_space_usage = searcher.space_usage();
let searcher_space_usage = searcher.space_usage()?;
assert!(searcher_space_usage.total() > 0);
assert_eq!(1, searcher_space_usage.segments().len());
let segment = &searcher_space_usage.segments()[0];
assert!(segment.total() > 0);
let segment_space_usage = &searcher_space_usage.segments()[0];
assert!(segment_space_usage.total() > 0);
assert_eq!(2, segment.num_docs());
assert_eq!(2, segment_space_usage.num_docs());
expect_single_field(segment.termdict(), &name, 1, 512);
expect_single_field(segment.postings(), &name, 1, 512);
assert_eq!(0, segment.positions().total());
assert_eq!(0, segment.positions_skip_idx().total());
assert_eq!(0, segment.fast_fields().total());
expect_single_field(segment.fieldnorms(), &name, 1, 512);
// TODO: understand why the following fails
// assert_eq!(0, segment.store().total());
assert!(segment.deletes() > 0);
expect_single_field(segment_space_usage.termdict(), &name, 1, 512);
expect_single_field(segment_space_usage.postings(), &name, 1, 512);
assert_eq!(0, segment_space_usage.positions().total());
assert_eq!(0, segment_space_usage.positions_skip_idx().total());
assert_eq!(0, segment_space_usage.fast_fields().total());
expect_single_field(segment_space_usage.fieldnorms(), &name, 1, 512);
assert!(segment_space_usage.deletes() > 0);
Ok(())
}
}

View File

@@ -103,19 +103,18 @@ pub mod tests {
}
#[test]
fn test_store() {
fn test_store() -> crate::Result<()> {
let path = Path::new("store");
let mut directory = RAMDirectory::create();
let store_file = directory.open_write(path).unwrap();
let schema = write_lorem_ipsum_store(store_file, 1_000);
let store_wrt = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(store_wrt, 1_000);
let field_title = schema.get_field("title").unwrap();
let store_source = directory.open_read(path).unwrap();
let store = StoreReader::from_source(store_source);
let store_file = directory.open_read(path)?;
let store = StoreReader::open(store_file)?;
for i in 0..1_000 {
assert_eq!(
*store
.get(i)
.unwrap()
.get(i)?
.get_first(field_title)
.unwrap()
.text()
@@ -123,6 +122,7 @@ pub mod tests {
format!("Doc {}", i)
);
}
Ok(())
}
}
@@ -152,8 +152,8 @@ mod bench {
let mut directory = RAMDirectory::create();
let path = Path::new("store");
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
let store_source = directory.open_read(path).unwrap();
let store = StoreReader::from_source(store_source);
let store_file = directory.open_read(path).unwrap();
let store = StoreReader::open(store_file).unwrap();
b.iter(|| {
store.get(12).unwrap();
});

View File

@@ -1,8 +1,8 @@
use super::decompress;
use super::skiplist::SkipList;
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::directory::ReadOnlySource;
use crate::common::{BinarySerializable, HasLen};
use crate::directory::{FileSlice, OwnedBytes};
use crate::schema::Document;
use crate::space_usage::StoreSpaceUsage;
use crate::DocId;
@@ -13,8 +13,8 @@ use std::mem::size_of;
/// Reads documents off tantivy's [`Store`](./index.html)
#[derive(Clone)]
pub struct StoreReader {
data: ReadOnlySource,
offset_index_source: ReadOnlySource,
data: FileSlice,
offset_index_file: OwnedBytes,
current_block_offset: RefCell<usize>,
current_block: RefCell<Vec<u8>>,
max_doc: DocId,
@@ -22,19 +22,20 @@ pub struct StoreReader {
impl StoreReader {
/// Opens a store reader
pub fn from_source(data: ReadOnlySource) -> StoreReader {
let (data_source, offset_index_source, max_doc) = split_source(data);
StoreReader {
data: data_source,
offset_index_source,
// TODO rename open
pub fn open(store_file: FileSlice) -> io::Result<StoreReader> {
let (data_file, offset_index_file, max_doc) = split_file(store_file)?;
Ok(StoreReader {
data: data_file,
offset_index_file: offset_index_file.read_bytes()?,
current_block_offset: RefCell::new(usize::max_value()),
current_block: RefCell::new(Vec::new()),
max_doc,
}
})
}
pub(crate) fn block_index(&self) -> SkipList<'_, u64> {
SkipList::from(self.offset_index_source.as_slice())
SkipList::from(self.offset_index_file.as_slice())
}
fn block_offset(&self, doc_id: DocId) -> (DocId, u64) {
@@ -44,23 +45,22 @@ impl StoreReader {
.unwrap_or((0u32, 0u64))
}
pub(crate) fn block_data(&self) -> &[u8] {
self.data.as_slice()
pub(crate) fn block_data(&self) -> io::Result<OwnedBytes> {
self.data.read_bytes()
}
fn compressed_block(&self, addr: usize) -> &[u8] {
let total_buffer = self.data.as_slice();
let mut buffer = &total_buffer[addr..];
let block_len = u32::deserialize(&mut buffer).expect("") as usize;
&buffer[..block_len]
fn compressed_block(&self, addr: usize) -> io::Result<OwnedBytes> {
let (block_len_bytes, block_body) = self.data.slice_from(addr).split(4);
let block_len = u32::deserialize(&mut block_len_bytes.read_bytes()?)?;
block_body.slice_to(block_len as usize).read_bytes()
}
fn read_block(&self, block_offset: usize) -> io::Result<()> {
if block_offset != *self.current_block_offset.borrow() {
let mut current_block_mut = self.current_block.borrow_mut();
current_block_mut.clear();
let compressed_block = self.compressed_block(block_offset);
decompress(compressed_block, &mut current_block_mut)?;
let compressed_block = self.compressed_block(block_offset)?;
decompress(compressed_block.as_slice(), &mut current_block_mut)?;
*self.current_block_offset.borrow_mut() = block_offset;
}
Ok(())
@@ -89,21 +89,21 @@ impl StoreReader {
/// Summarize total space usage of this store reader.
pub fn space_usage(&self) -> StoreSpaceUsage {
StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len())
StoreSpaceUsage::new(self.data.len(), self.offset_index_file.len())
}
}
fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) {
fn split_file(data: FileSlice) -> io::Result<(FileSlice, FileSlice, DocId)> {
let data_len = data.len();
let footer_offset = data_len - size_of::<u64>() - size_of::<u32>();
let serialized_offset: ReadOnlySource = data.slice(footer_offset, data_len);
let serialized_offset: OwnedBytes = data.slice(footer_offset, data_len).read_bytes()?;
let mut serialized_offset_buf = serialized_offset.as_slice();
let offset = u64::deserialize(&mut serialized_offset_buf).unwrap();
let offset = u64::deserialize(&mut serialized_offset_buf)?;
let offset = offset as usize;
let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap();
(
let max_doc = u32::deserialize(&mut serialized_offset_buf)?;
Ok((
data.slice(0, offset),
data.slice(offset, footer_offset),
max_doc,
)
))
}

View File

@@ -75,7 +75,8 @@ impl StoreWriter {
let start_offset = self.writer.written_bytes() as u64;
// just bulk write all of the blocks of the given reader.
self.writer.write_all(store_reader.block_data())?;
self.writer
.write_all(store_reader.block_data()?.as_slice())?;
// concatenate the index of the `store_reader`, after translating
// its start doc id and its start file offset.

View File

@@ -36,9 +36,9 @@ pub use self::termdict::{TermDictionary, TermDictionaryBuilder};
mod tests {
use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};
use crate::core::Index;
use crate::directory::{Directory, RAMDirectory, ReadOnlySource};
use crate::directory::{Directory, FileSlice, RAMDirectory};
use crate::postings::TermInfo;
use crate::schema::{Document, Schema, TEXT};
use crate::schema::{Schema, TEXT};
use std::path::PathBuf;
use std::str;
@@ -59,7 +59,7 @@ mod tests {
}
#[test]
fn test_term_ordinals() {
fn test_term_ordinals() -> crate::Result<()> {
const COUNTRIES: [&'static str; 7] = [
"San Marino",
"Serbia",
@@ -72,42 +72,37 @@ mod tests {
let mut directory = RAMDirectory::create();
let path = PathBuf::from("TermDictionary");
{
let write = directory.open_write(&path).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
let write = directory.open_write(&path)?;
let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?;
for term in COUNTRIES.iter() {
term_dictionary_builder
.insert(term.as_bytes(), &make_term_info(0u64))
.unwrap();
term_dictionary_builder.insert(term.as_bytes(), &make_term_info(0u64))?;
}
term_dictionary_builder.finish().unwrap();
term_dictionary_builder.finish()?;
}
let source = directory.open_read(&path).unwrap();
let term_dict: TermDictionary = TermDictionary::from_source(&source);
let term_file = directory.open_read(&path)?;
let term_dict: TermDictionary = TermDictionary::open(term_file)?;
for (term_ord, term) in COUNTRIES.iter().enumerate() {
assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64);
let mut bytes = vec![];
assert!(term_dict.ord_to_term(term_ord as u64, &mut bytes));
assert_eq!(bytes, term.as_bytes());
}
Ok(())
}
#[test]
fn test_term_dictionary_simple() {
fn test_term_dictionary_simple() -> crate::Result<()> {
let mut directory = RAMDirectory::create();
let path = PathBuf::from("TermDictionary");
{
let write = directory.open_write(&path).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
term_dictionary_builder
.insert("abc".as_bytes(), &make_term_info(34u64))
.unwrap();
term_dictionary_builder
.insert("abcd".as_bytes(), &make_term_info(346u64))
.unwrap();
term_dictionary_builder.finish().unwrap();
let write = directory.open_write(&path)?;
let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?;
term_dictionary_builder.insert("abc".as_bytes(), &make_term_info(34u64))?;
term_dictionary_builder.insert("abcd".as_bytes(), &make_term_info(346u64))?;
term_dictionary_builder.finish()?;
}
let source = directory.open_read(&path).unwrap();
let term_dict: TermDictionary = TermDictionary::from_source(&source);
let file = directory.open_read(&path)?;
let term_dict: TermDictionary = TermDictionary::open(file)?;
assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32);
assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32);
let mut stream = term_dict.stream();
@@ -130,43 +125,26 @@ mod tests {
assert_eq!(stream.value().doc_freq, 346u32);
}
assert!(!stream.advance());
Ok(())
}
#[test]
fn test_term_iterator() {
fn test_term_iterator() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_for_tests().unwrap();
{
{
let mut doc = Document::default();
doc.add_text(text_field, "a b d f");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
{
{
let mut doc = Document::default();
doc.add_text(text_field, "a b c d f");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
{
{
let mut doc = Document::default();
doc.add_text(text_field, "e f");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b d f"));
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a b c d f"));
index_writer.commit()?;
index_writer.add_document(doc!(text_field => "e f"));
index_writer.commit()?;
}
let searcher = index.reader().unwrap().searcher();
let searcher = index.reader()?.searcher();
let field_searcher = searcher.field(text_field);
let field_searcher = searcher.field(text_field)?;
let mut term_it = field_searcher.terms();
let mut term_string = String::new();
while term_it.advance() {
@@ -174,10 +152,11 @@ mod tests {
term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
}
assert_eq!(&*term_string, "abcdef");
Ok(())
}
#[test]
fn test_term_dictionary_stream() {
fn test_term_dictionary_stream() -> crate::Result<()> {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.collect();
@@ -190,8 +169,8 @@ mod tests {
}
term_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionary = TermDictionary::from_source(&source);
let term_file = FileSlice::new(buffer);
let term_dictionary: TermDictionary = TermDictionary::open(term_file)?;
{
let mut streamer = term_dictionary.stream();
let mut i = 0;
@@ -203,28 +182,26 @@ mod tests {
}
}
let &(ref key, ref _v) = &ids[2047];
term_dictionary.get(key.as_bytes());
let &(ref key, ref val) = &ids[2047];
assert_eq!(
term_dictionary.get(key.as_bytes()),
Some(make_term_info(*val as u64))
);
Ok(())
}
#[test]
fn test_stream_high_range_prefix_suffix() {
fn test_stream_high_range_prefix_suffix() -> crate::Result<()> {
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
// term requires more than 16bits
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
.unwrap();
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))
.unwrap();
term_dictionary_builder
.insert("abr", &make_term_info(2))
.unwrap();
term_dictionary_builder.finish().unwrap()
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))?;
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))?;
term_dictionary_builder.insert("abr", &make_term_info(2))?;
term_dictionary_builder.finish()?
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionary = TermDictionary::from_source(&source);
let term_dict_file = FileSlice::new(buffer);
let term_dictionary: TermDictionary = TermDictionary::open(term_dict_file)?;
let mut kv_stream = term_dictionary.stream();
assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
@@ -235,10 +212,11 @@ mod tests {
assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abr".as_bytes());
assert!(!kv_stream.advance());
Ok(())
}
#[test]
fn test_stream_range() {
fn test_stream_range() -> crate::Result<()> {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.collect();
@@ -252,9 +230,9 @@ mod tests {
term_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let file = FileSlice::new(buffer);
let term_dictionary: TermDictionary = TermDictionary::from_source(&source);
let term_dictionary: TermDictionary = TermDictionary::open(file)?;
{
for i in (0..20).chain(6000..8_000) {
let &(ref target_key, _) = &ids[i];
@@ -305,10 +283,11 @@ mod tests {
}
}
}
Ok(())
}
#[test]
fn test_empty_string() {
fn test_empty_string() -> crate::Result<()> {
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
term_dictionary_builder
@@ -319,30 +298,29 @@ mod tests {
.unwrap();
term_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionary = TermDictionary::from_source(&source);
let file = FileSlice::new(buffer);
let term_dictionary: TermDictionary = TermDictionary::open(file)?;
let mut stream = term_dictionary.stream();
assert!(stream.advance());
assert!(stream.key().is_empty());
assert!(stream.advance());
assert_eq!(stream.key(), &[1u8]);
assert!(!stream.advance());
Ok(())
}
// Builds a dictionary holding the ten single-byte keys `[0]`..`[9]`, each mapped to
// `make_term_info(i)`, reopens it from an in-memory `FileSlice`, and checks the values
// produced by range streams through the `value_list` helper closure.
// NOTE(review): most range assertions are hidden by the diff hunk boundary; only the
// last one is visible here.
#[test]
fn test_stream_range_boundaries() {
fn test_stream_range_boundaries() -> crate::Result<()> {
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?;
for i in 0u8..10u8 {
// One-byte key whose only byte is `i`.
let number_arr = [i; 1];
term_dictionary_builder
.insert(&number_arr, &make_term_info(i as u64))
.unwrap();
term_dictionary_builder.insert(&number_arr, &make_term_info(i as u64))?;
}
term_dictionary_builder.finish().unwrap()
term_dictionary_builder.finish()?
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionary = TermDictionary::from_source(&source);
let file = FileSlice::new(buffer);
let term_dictionary: TermDictionary = TermDictionary::open(file)?;
// Helper that drains a streamer into a `Vec<u32>`; its body continues past the hunk boundary.
let value_list = |mut streamer: TermStreamer<'_>, backwards: bool| {
let mut res: Vec<u32> = vec![];
@@ -430,10 +408,11 @@ mod tests {
.into_stream();
assert_eq!(value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
}
Ok(())
}
// Writes every entry of `COUNTRIES` into a term dictionary stored in a `RAMDirectory`
// under the path "TermDictionary", reopens it through `open_read`, and runs an
// automaton-driven search whose only match must be "Spain".
// NOTE(review): the automaton construction and the search call are partly hidden by the
// diff hunk boundaries inside the body.
#[test]
fn test_automaton_search() {
fn test_automaton_search() -> crate::Result<()> {
use crate::query::DFAWrapper;
use levenshtein_automata::LevenshteinAutomatonBuilder;
@@ -450,17 +429,15 @@ mod tests {
let mut directory = RAMDirectory::create();
let path = PathBuf::from("TermDictionary");
{
// Scope the writer so the dictionary is finished before it is reopened for reading.
let write = directory.open_write(&path).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
let write = directory.open_write(&path)?;
let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?;
for term in COUNTRIES.iter() {
term_dictionary_builder
.insert(term.as_bytes(), &make_term_info(0u64))
.unwrap();
term_dictionary_builder.insert(term.as_bytes(), &make_term_info(0u64))?;
}
term_dictionary_builder.finish().unwrap();
term_dictionary_builder.finish()?;
}
let source = directory.open_read(&path).unwrap();
let term_dict: TermDictionary = TermDictionary::from_source(&source);
let file = directory.open_read(&path)?;
let term_dict: TermDictionary = TermDictionary::open(file)?;
// We can now build an entire dfa.
// NOTE(review): presumably (2, true) means max edit distance 2 with transpositions
// counted as a single edit; confirm against the levenshtein_automata docs.
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true);
@@ -472,5 +449,6 @@ mod tests {
// Exactly one term is expected to match: "Spain".
assert!(range.advance());
assert_eq!("Spain".as_bytes(), range.key());
assert!(!range.advance());
Ok(())
}
}

View File

@@ -1,8 +1,6 @@
use crate::common::bitpacker::BitPacker;
use crate::common::compute_num_bits;
use crate::common::Endianness;
use crate::common::{BinarySerializable, FixedSize};
use crate::directory::ReadOnlySource;
use crate::common::{bitpacker::BitPacker, BinarySerializable, FixedSize};
use crate::directory::{FileSlice, OwnedBytes};
use crate::postings::TermInfo;
use crate::termdict::TermOrdinal;
use byteorder::{ByteOrder, LittleEndian};
@@ -79,8 +77,8 @@ impl TermInfoBlockMeta {
// Read-side view over a serialized term info store: a term count plus two byte
// sections that `get` decodes on demand.
pub struct TermInfoStore {
// Number of terms, as read from the 16-byte header by `open`.
num_terms: usize,
block_meta_source: ReadOnlySource,
term_info_source: ReadOnlySource,
// Serialized `TermInfoBlockMeta` records; `get` locates one with
// `block_id * TermInfoBlockMeta::SIZE_IN_BYTES`.
block_meta_bytes: OwnedBytes,
// Data from which `get` decodes the terms that are not the first of their block.
term_info_bytes: OwnedBytes,
}
fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
@@ -105,35 +103,35 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
}
impl TermInfoStore {
// NOTE(review): this span is a rendered diff. The `ReadOnlySource` signature and body
// look like the previous implementation, and the `FileSlice` one its replacement.
//
// Layout read by both variants:
//   bytes 0..8        length `len` of the block-meta section (u64)
//   bytes 8..16       number of terms (u64)
//   bytes 16..16+len  block-meta section
//   remaining bytes   term-info section
pub fn open(data: &ReadOnlySource) -> TermInfoStore {
let buffer = data.as_slice();
let len = Endianness::read_u64(&buffer[0..8]) as usize;
let num_terms = Endianness::read_u64(&buffer[8..16]) as usize;
let block_meta_source = data.slice(16, 16 + len);
let term_info_source = data.slice_from(16 + len);
TermInfoStore {
/// Opens a `TermInfoStore` from `term_info_store_file`.
///
/// Reads the 16-byte header (block-meta length, then number of terms), then reads
/// the block-meta section and the term-info section into `OwnedBytes`.
///
/// # Errors
///
/// Propagates any error returned while reading a slice or deserializing the header.
pub fn open(term_info_store_file: FileSlice) -> crate::Result<TermInfoStore> {
// Only the 16-byte header is read at this point.
let (len_slice, main_slice) = term_info_store_file.split(16);
let mut bytes = len_slice.read_bytes()?;
let len = u64::deserialize(&mut bytes)? as usize;
let num_terms = u64::deserialize(&mut bytes)? as usize;
let (block_meta_file, term_info_file) = main_slice.split(len);
let term_info_bytes = term_info_file.read_bytes()?;
Ok(TermInfoStore {
num_terms,
block_meta_source,
term_info_source,
}
block_meta_bytes: block_meta_file.read_bytes()?,
term_info_bytes,
})
}
/// Returns the `TermInfo` stored for `term_ord`.
///
/// Terms are grouped in blocks of `BLOCK_LEN`. The block's `TermInfoBlockMeta` is
/// deserialized from the block-meta section. The first term of a block is stored in
/// that metadata as `ref_term_info`; any other term is decoded from the term-info
/// section, starting at the block's `offset`.
///
/// # Panics
///
/// Panics if the block metadata cannot be deserialized. The slice indexing below
/// also panics if a computed start offset lies beyond the end of the section.
pub fn get(&self, term_ord: TermOrdinal) -> TermInfo {
let block_id = (term_ord as usize) / BLOCK_LEN;
let buffer = self.block_meta_source.as_slice();
let buffer = self.block_meta_bytes.as_slice();
// Block metadata records have a fixed size, so the record is found by multiplication.
let mut block_data: &[u8] = &buffer[block_id * TermInfoBlockMeta::SIZE_IN_BYTES..];
let term_info_block_data = TermInfoBlockMeta::deserialize(&mut block_data)
.expect("Failed to deserialize terminfoblockmeta");
// Position of the term inside its block; 0 means the reference term.
let inner_offset = (term_ord as usize) % BLOCK_LEN;
if inner_offset == 0 {
term_info_block_data.ref_term_info
} else {
let term_info_data = self.term_info_source.as_slice();
term_info_block_data.deserialize_term_info(
&term_info_data[term_info_block_data.offset as usize..],
inner_offset - 1,
)
return term_info_block_data.ref_term_info;
}
// The reference term is not part of the term-info section, hence `inner_offset - 1`.
let term_info_data = self.term_info_bytes.as_slice();
term_info_block_data.deserialize_term_info(
&term_info_data[term_info_block_data.offset as usize..],
inner_offset - 1,
)
}
pub fn num_terms(&self) -> usize {
@@ -263,7 +261,7 @@ mod tests {
use crate::common::bitpacker::BitPacker;
use crate::common::compute_num_bits;
use crate::common::BinarySerializable;
use crate::directory::ReadOnlySource;
use crate::directory::FileSlice;
use crate::postings::TermInfo;
#[test]
@@ -309,7 +307,7 @@ mod tests {
}
// Round-trip test: writes 1000 `TermInfo` values through `TermInfoStoreWriter`,
// serializes them into a `Vec`, reopens the bytes with `TermInfoStore::open`, and
// checks that `get(i)` returns the i-th value that was written.
#[test]
fn test_pack() {
fn test_pack() -> crate::Result<()> {
let mut store_writer = TermInfoStoreWriter::new();
let mut term_infos = vec![];
for i in 0..1000 {
@@ -318,14 +316,15 @@ mod tests {
postings_offset: (i / 10) as u64,
positions_idx: (i * 7) as u64,
};
store_writer.write_term_info(&term_info).unwrap();
store_writer.write_term_info(&term_info)?;
// Keep a copy to compare against after the round trip.
term_infos.push(term_info);
}
let mut buffer = Vec::new();
store_writer.serialize(&mut buffer).unwrap();
let term_info_store = TermInfoStore::open(&ReadOnlySource::from(buffer));
store_writer.serialize(&mut buffer)?;
let term_info_store = TermInfoStore::open(FileSlice::new(buffer))?;
for i in 0..1000 {
assert_eq!(term_info_store.get(i as u64), term_infos[i]);
}
Ok(())
}
}

View File

@@ -1,8 +1,8 @@
use super::term_info_store::{TermInfoStore, TermInfoStoreWriter};
use super::{TermStreamer, TermStreamerBuilder};
use crate::common::BinarySerializable;
use crate::common::CountingWriter;
use crate::directory::ReadOnlySource;
use crate::common::{BinarySerializable, CountingWriter};
use crate::directory::{FileSlice, OwnedBytes};
use crate::error::DataCorruption;
use crate::postings::TermInfo;
use crate::termdict::TermOrdinal;
use once_cell::sync::Lazy;
@@ -86,17 +86,19 @@ where
}
}
// NOTE(review): this span is a rendered diff. The `ReadOnlySource` variant panics with
// "FST data is corrupted" on invalid data; the `FileSlice` variant returns an error.
fn open_fst_index(source: ReadOnlySource) -> tantivy_fst::Map<ReadOnlySource> {
let fst = Fst::new(source).expect("FST data is corrupted");
tantivy_fst::Map::from(fst)
/// Reads `fst_file` into `OwnedBytes` and wraps the resulting `Fst` in a
/// `tantivy_fst::Map`.
///
/// # Errors
///
/// Returns the read error, or a `DataCorruption` error carrying the debug
/// representation of the failure when `Fst::new` rejects the bytes.
fn open_fst_index(fst_file: FileSlice) -> crate::Result<tantivy_fst::Map<OwnedBytes>> {
let bytes = fst_file.read_bytes()?;
let fst = Fst::new(bytes)
.map_err(|err| DataCorruption::comment_only(format!("Fst data is corrupted: {:?}", err)))?;
Ok(tantivy_fst::Map::from(fst))
}
// Lazily built serialized form of a term dictionary into which no term was inserted:
// a `TermDictionaryBuilder` is created over an empty `Vec<u8>` and finished immediately.
// `TermDictionary::empty` opens this data.
static EMPTY_DATA_SOURCE: Lazy<ReadOnlySource> = Lazy::new(|| {
static EMPTY_TERM_DICT_FILE: Lazy<FileSlice> = Lazy::new(|| {
let term_dictionary_data: Vec<u8> = TermDictionaryBuilder::create(Vec::<u8>::new())
.expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
.finish()
.expect("Writing in a Vec<u8> should never fail");
ReadOnlySource::from(term_dictionary_data)
FileSlice::new(term_dictionary_data)
});
/// The term dictionary contains all of the terms in
@@ -106,31 +108,28 @@ static EMPTY_DATA_SOURCE: Lazy<ReadOnlySource> = Lazy::new(|| {
/// respective `TermOrdinal`. The `TermInfoStore` then makes it
/// possible to fetch the associated `TermInfo`.
pub struct TermDictionary {
fst_index: tantivy_fst::Map<ReadOnlySource>,
// FST map built by `open_fst_index` from the leading section of the dictionary file.
fst_index: tantivy_fst::Map<OwnedBytes>,
// Store opened from the section located between the FST and the 8-byte footer.
term_info_store: TermInfoStore,
}
impl TermDictionary {
// NOTE(review): this span is a rendered diff. `from_source` looks like the previous,
// infallible constructor and `open` its fallible replacement.
//
// File layout read by both variants, front to back:
//   FST bytes | term info store (`footer_size` bytes) | `footer_size` as u64 (8 bytes)
/// Opens a `TermDictionary` given a data source.
pub fn from_source(source: &ReadOnlySource) -> Self {
let total_len = source.len();
let length_offset = total_len - 8;
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
let footer_size = u64::deserialize(&mut split_len_buffer)
.expect("Deserializing 8 bytes should always work") as usize;
let split_len = length_offset - footer_size;
let fst_source = source.slice(0, split_len);
let values_source = source.slice(split_len, length_offset);
let fst_index = open_fst_index(fst_source);
TermDictionary {
/// Opens a `TermDictionary`.
//
// Errors from reading the footer, opening the FST, or opening the term info store
// are propagated to the caller.
pub fn open(file: FileSlice) -> crate::Result<Self> {
// The last 8 bytes hold the size of the term info store section.
let (main_slice, footer_len_slice) = file.split_from_end(8);
let mut footer_len_bytes = footer_len_slice.read_bytes()?;
let footer_size = u64::deserialize(&mut footer_len_bytes)?;
// NOTE(review): `footer_size` comes from the file and is not validated here;
// confirm how `split_from_end` behaves when it exceeds the remaining length.
let (fst_file_slice, values_file_slice) = main_slice.split_from_end(footer_size as usize);
let fst_index = open_fst_index(fst_file_slice)?;
let term_info_store = TermInfoStore::open(values_file_slice)?;
Ok(TermDictionary {
fst_index,
term_info_store: TermInfoStore::open(&values_source),
}
term_info_store,
})
}
/// Creates an empty term dictionary which contains no terms.
//
// The dictionary is opened from the lazily built, in-memory serialized empty
// dictionary. The `FileSlice`-based variant unwraps the result of `open`, so it
// panics if opening that data fails.
pub fn empty() -> Self {
TermDictionary::from_source(&*EMPTY_DATA_SOURCE)
TermDictionary::open(EMPTY_TERM_DICT_FILE.clone()).unwrap()
}
/// Returns the number of terms in the dictionary.

View File

@@ -40,17 +40,17 @@ fn test_failpoints_managed_directory_gc_if_delete_fails() {
}
// Failpoint test: commits 100 documents containing "a", then configures the
// "RAMDirectory::atomic_write" failpoint to return an error before adding 100
// documents containing "b". Afterwards only the "a" documents must be counted.
// NOTE(review): the statements between adding the "b" documents and the final
// assertions are hidden by the diff hunk boundary.
#[test]
fn test_write_commit_fails() {
fn test_write_commit_fails() -> tantivy::Result<()> {
// Holds the fail scenario for the duration of the test.
let _fail_scenario_guard = fail::FailScenario::setup();
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
for _ in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
}
index_writer.commit().unwrap();
index_writer.commit()?;
// From here on, the configured failpoint makes `RAMDirectory::atomic_write` fail.
fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
for _ in 0..100 {
index_writer.add_document(doc!(text_field => "b"));
@@ -59,8 +59,9 @@ fn test_write_commit_fails() {
// Document frequency of the term `s` in the text field, read through a fresh reader.
let num_docs_containing = |s: &str| {
let term_a = Term::from_field_text(text_field, s);
index.reader().unwrap().searcher().doc_freq(&term_a)
index.reader()?.searcher().doc_freq(&term_a)
};
assert_eq!(num_docs_containing("a"), 100);
assert_eq!(num_docs_containing("b"), 0);
assert_eq!(num_docs_containing("a")?, 100);
assert_eq!(num_docs_containing("b")?, 0);
Ok(())
}