mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-24 20:20:44 +00:00
Merge branch 'main' into indexmeta
This commit is contained in:
@@ -180,7 +180,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||
|
||||
/// Return true iff at least K documents have gone through
|
||||
/// the collector.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub(crate) fn at_capacity(&self) -> bool {
|
||||
self.heap.len() >= self.limit
|
||||
}
|
||||
@@ -189,7 +189,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||
///
|
||||
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it
|
||||
/// will compare the lowest scoring item with the given one and keep whichever is greater.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn collect(&mut self, doc: DocId, feature: T) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
|
||||
@@ -59,19 +59,19 @@ impl TinySet {
|
||||
|
||||
/// Creates a new `TinySet` containing only one element
|
||||
/// within `[0; 64[`
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn singleton(el: u32) -> TinySet {
|
||||
TinySet(1u64 << u64::from(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn insert(self, el: u32) -> TinySet {
|
||||
self.union(TinySet::singleton(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn insert_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
*self = old.insert(el);
|
||||
@@ -79,20 +79,20 @@ impl TinySet {
|
||||
}
|
||||
|
||||
/// Returns the union of two tinysets
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn union(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 | other.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` is empty.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn is_empty(self) -> bool {
|
||||
self.0 == 0u64
|
||||
}
|
||||
|
||||
/// Returns the lowest element in the `TinySet`
|
||||
/// and removes it.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn pop_lowest(&mut self) -> Option<u32> {
|
||||
if self.is_empty() {
|
||||
None
|
||||
|
||||
@@ -190,7 +190,7 @@ mod test {
|
||||
use super::{CompositeFile, CompositeWrite};
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::VInt;
|
||||
use crate::directory::{Directory, RAMDirectory};
|
||||
use crate::directory::{Directory, RamDirectory};
|
||||
use crate::schema::Field;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
@@ -198,7 +198,7 @@ mod test {
|
||||
#[test]
|
||||
fn test_composite_file() -> crate::Result<()> {
|
||||
let path = Path::new("test_path");
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
{
|
||||
let w = directory.open_write(path).unwrap();
|
||||
let mut composite_write = CompositeWrite::wrap(w);
|
||||
|
||||
@@ -99,13 +99,13 @@ const HIGHEST_BIT: u64 = 1 << 63;
|
||||
///
|
||||
/// # See also
|
||||
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn i64_to_u64(val: i64) -> u64 {
|
||||
(val as u64) ^ HIGHEST_BIT
|
||||
}
|
||||
|
||||
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn u64_to_i64(val: u64) -> i64 {
|
||||
(val ^ HIGHEST_BIT) as i64
|
||||
}
|
||||
@@ -127,7 +127,7 @@ pub fn u64_to_i64(val: u64) -> i64 {
|
||||
///
|
||||
/// # See also
|
||||
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn f64_to_u64(val: f64) -> u64 {
|
||||
let bits = val.to_bits();
|
||||
if val.is_sign_positive() {
|
||||
@@ -138,7 +138,7 @@ pub fn f64_to_u64(val: f64) -> u64 {
|
||||
}
|
||||
|
||||
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn u64_to_f64(val: u64) -> f64 {
|
||||
f64::from_bits(if val & HIGHEST_BIT != 0 {
|
||||
val ^ HIGHEST_BIT
|
||||
|
||||
@@ -10,7 +10,7 @@ use crate::directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
|
||||
use crate::directory::MmapDirectory;
|
||||
use crate::directory::INDEX_WRITER_LOCK;
|
||||
use crate::directory::{Directory, RAMDirectory};
|
||||
use crate::directory::{Directory, RamDirectory};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::error::TantivyError;
|
||||
use crate::indexer::index_writer::HEAP_SIZE_MIN;
|
||||
@@ -222,7 +222,7 @@ impl Index {
|
||||
self.set_multithread_executor(default_num_threads)
|
||||
}
|
||||
|
||||
/// Creates a new index using the `RAMDirectory`.
|
||||
/// Creates a new index using the `RamDirectory`.
|
||||
///
|
||||
/// The index will be allocated in anonymous memory.
|
||||
/// This should only be used for unit tests.
|
||||
@@ -256,7 +256,7 @@ impl Index {
|
||||
/// is destroyed.
|
||||
///
|
||||
/// The temp directory is only used for testing the `MmapDirectory`.
|
||||
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
|
||||
/// For other unit tests, prefer the `RamDirectory`, see: `create_in_ram`.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_from_tempdir(schema: Schema) -> crate::Result<Index> {
|
||||
IndexBuilder::new().schema(schema).create_from_tempdir()
|
||||
@@ -390,7 +390,7 @@ impl Index {
|
||||
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
|
||||
///
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IOError`.
|
||||
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
|
||||
///
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
@@ -524,7 +524,7 @@ impl fmt::Debug for Index {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::directory::{RAMDirectory, WatchCallback};
|
||||
use crate::directory::{RamDirectory, WatchCallback};
|
||||
use crate::schema::Field;
|
||||
use crate::schema::{Schema, INDEXED, TEXT};
|
||||
use crate::IndexReader;
|
||||
@@ -548,7 +548,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_index_exists() {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
assert!(!Index::exists(&directory).unwrap());
|
||||
assert!(Index::create(
|
||||
directory.clone(),
|
||||
@@ -561,7 +561,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn open_or_create_should_create() {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
assert!(!Index::exists(&directory).unwrap());
|
||||
assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
|
||||
assert!(Index::exists(&directory).unwrap());
|
||||
@@ -569,7 +569,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn open_or_create_should_open() {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
assert!(Index::create(
|
||||
directory.clone(),
|
||||
throw_away_schema(),
|
||||
@@ -582,7 +582,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn create_should_wipeoff_existing() {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
assert!(Index::create(
|
||||
directory.clone(),
|
||||
throw_away_schema(),
|
||||
@@ -600,7 +600,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn open_or_create_exists_but_schema_does_not_match() {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
assert!(Index::create(
|
||||
directory.clone(),
|
||||
throw_away_schema(),
|
||||
@@ -738,7 +738,7 @@ mod tests {
|
||||
#[cfg(not(target_os = "windows"))]
|
||||
#[test]
|
||||
fn garbage_collect_works_as_intended() {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let index = Index::create(directory.clone(), schema, IndexSettings::default()).unwrap();
|
||||
|
||||
@@ -108,14 +108,13 @@ impl SegmentMeta {
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
let mut path = self.id().uuid_string();
|
||||
path.push_str(&*match component {
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSITIONSSKIP => ".posidx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
|
||||
SegmentComponent::Postings => ".idx".to_string(),
|
||||
SegmentComponent::Positions => ".pos".to_string(),
|
||||
SegmentComponent::Terms => ".term".to_string(),
|
||||
SegmentComponent::Store => ".store".to_string(),
|
||||
SegmentComponent::FastFields => ".fast".to_string(),
|
||||
SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
|
||||
SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
|
||||
});
|
||||
PathBuf::from(path)
|
||||
}
|
||||
|
||||
@@ -26,7 +26,6 @@ pub struct InvertedIndexReader {
|
||||
termdict: TermDictionary,
|
||||
postings_file_slice: FileSlice,
|
||||
positions_file_slice: FileSlice,
|
||||
positions_idx_file_slice: FileSlice,
|
||||
record_option: IndexRecordOption,
|
||||
total_num_tokens: u64,
|
||||
}
|
||||
@@ -37,7 +36,6 @@ impl InvertedIndexReader {
|
||||
termdict: TermDictionary,
|
||||
postings_file_slice: FileSlice,
|
||||
positions_file_slice: FileSlice,
|
||||
positions_idx_file_slice: FileSlice,
|
||||
record_option: IndexRecordOption,
|
||||
) -> io::Result<InvertedIndexReader> {
|
||||
let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8);
|
||||
@@ -46,7 +44,6 @@ impl InvertedIndexReader {
|
||||
termdict,
|
||||
postings_file_slice: postings_body,
|
||||
positions_file_slice,
|
||||
positions_idx_file_slice,
|
||||
record_option,
|
||||
total_num_tokens,
|
||||
})
|
||||
@@ -59,7 +56,6 @@ impl InvertedIndexReader {
|
||||
termdict: TermDictionary::empty(),
|
||||
postings_file_slice: FileSlice::empty(),
|
||||
positions_file_slice: FileSlice::empty(),
|
||||
positions_idx_file_slice: FileSlice::empty(),
|
||||
record_option,
|
||||
total_num_tokens: 0u64,
|
||||
}
|
||||
@@ -141,12 +137,12 @@ impl InvertedIndexReader {
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<SegmentPostings> {
|
||||
let block_postings = self.read_block_postings_from_terminfo(term_info, option)?;
|
||||
let position_stream = {
|
||||
let position_reader = {
|
||||
if option.has_positions() {
|
||||
let position_reader = self.positions_file_slice.clone();
|
||||
let skip_reader = self.positions_idx_file_slice.clone();
|
||||
let position_reader =
|
||||
PositionReader::new(position_reader, skip_reader, term_info.positions_idx)?;
|
||||
let positions_data = self
|
||||
.positions_file_slice
|
||||
.read_bytes_slice(term_info.positions_range.clone())?;
|
||||
let position_reader = PositionReader::open(positions_data)?;
|
||||
Some(position_reader)
|
||||
} else {
|
||||
None
|
||||
@@ -154,7 +150,7 @@ impl InvertedIndexReader {
|
||||
};
|
||||
Ok(SegmentPostings::from_block_postings(
|
||||
block_postings,
|
||||
position_stream,
|
||||
position_reader,
|
||||
))
|
||||
}
|
||||
|
||||
|
||||
@@ -7,39 +7,36 @@ use std::slice;
|
||||
#[derive(Copy, Clone)]
|
||||
pub enum SegmentComponent {
|
||||
/// Postings (or inverted list). Sorted lists of document ids, associated to terms
|
||||
POSTINGS,
|
||||
Postings,
|
||||
/// Positions of terms in each document.
|
||||
POSITIONS,
|
||||
/// Index to seek within the position file
|
||||
POSITIONSSKIP,
|
||||
Positions,
|
||||
/// Column-oriented random-access storage of fields.
|
||||
FASTFIELDS,
|
||||
FastFields,
|
||||
/// Stores the sum of the length (in terms) of each field for each document.
|
||||
/// Field norms are stored as a special u64 fast field.
|
||||
FIELDNORMS,
|
||||
FieldNorms,
|
||||
/// Dictionary associating `Term`s to `TermInfo`s which is
|
||||
/// simply an address into the `postings` file and the `positions` file.
|
||||
TERMS,
|
||||
Terms,
|
||||
/// Row-oriented, compressed storage of the documents.
|
||||
/// Accessing a document from the store is relatively slow, as it
|
||||
/// requires to decompress the entire block it belongs to.
|
||||
STORE,
|
||||
Store,
|
||||
/// Bitset describing which document of the segment is deleted.
|
||||
DELETE,
|
||||
Delete,
|
||||
}
|
||||
|
||||
impl SegmentComponent {
|
||||
/// Iterates through the components.
|
||||
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
|
||||
SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
SegmentComponent::POSITIONSSKIP,
|
||||
SegmentComponent::FASTFIELDS,
|
||||
SegmentComponent::FIELDNORMS,
|
||||
SegmentComponent::TERMS,
|
||||
SegmentComponent::STORE,
|
||||
SegmentComponent::DELETE,
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
|
||||
SegmentComponent::Postings,
|
||||
SegmentComponent::Positions,
|
||||
SegmentComponent::FastFields,
|
||||
SegmentComponent::FieldNorms,
|
||||
SegmentComponent::Terms,
|
||||
SegmentComponent::Store,
|
||||
SegmentComponent::Delete,
|
||||
];
|
||||
SEGMENT_COMPONENTS.iter()
|
||||
}
|
||||
|
||||
@@ -46,7 +46,6 @@ pub struct SegmentReader {
|
||||
termdict_composite: CompositeFile,
|
||||
postings_composite: CompositeFile,
|
||||
positions_composite: CompositeFile,
|
||||
positions_idx_composite: CompositeFile,
|
||||
fast_fields_readers: Arc<FastFieldReaders>,
|
||||
fieldnorm_readers: FieldNormReaders,
|
||||
|
||||
@@ -151,44 +150,36 @@ impl SegmentReader {
|
||||
|
||||
/// Open a new segment for reading.
|
||||
pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
|
||||
let termdict_file = segment.open_read(SegmentComponent::TERMS)?;
|
||||
let termdict_file = segment.open_read(SegmentComponent::Terms)?;
|
||||
let termdict_composite = CompositeFile::open(&termdict_file)?;
|
||||
|
||||
let store_file = segment.open_read(SegmentComponent::STORE)?;
|
||||
let store_file = segment.open_read(SegmentComponent::Store)?;
|
||||
|
||||
fail_point!("SegmentReader::open#middle");
|
||||
|
||||
let postings_file = segment.open_read(SegmentComponent::POSTINGS)?;
|
||||
let postings_file = segment.open_read(SegmentComponent::Postings)?;
|
||||
let postings_composite = CompositeFile::open(&postings_file)?;
|
||||
|
||||
let positions_composite = {
|
||||
if let Ok(positions_file) = segment.open_read(SegmentComponent::POSITIONS) {
|
||||
if let Ok(positions_file) = segment.open_read(SegmentComponent::Positions) {
|
||||
CompositeFile::open(&positions_file)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let positions_idx_composite = {
|
||||
if let Ok(positions_skip_file) = segment.open_read(SegmentComponent::POSITIONSSKIP) {
|
||||
CompositeFile::open(&positions_skip_file)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
let fast_field_readers =
|
||||
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
|
||||
|
||||
let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
|
||||
let delete_bitset_opt = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
let delete_data = segment.open_read(SegmentComponent::Delete)?;
|
||||
let delete_bitset = DeleteBitSet::open(delete_data)?;
|
||||
Some(delete_bitset)
|
||||
} else {
|
||||
@@ -207,7 +198,6 @@ impl SegmentReader {
|
||||
store_file,
|
||||
delete_bitset_opt,
|
||||
positions_composite,
|
||||
positions_idx_composite,
|
||||
schema,
|
||||
})
|
||||
}
|
||||
@@ -263,18 +253,15 @@ impl SegmentReader {
|
||||
let positions_file = self
|
||||
.positions_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
|
||||
let positions_idx_file = self
|
||||
.positions_idx_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
.ok_or_else(|| {
|
||||
let error_msg = format!("Failed to open field {:?}'s positions in the composite file. Has the schema been modified?", field_entry.name());
|
||||
DataCorruption::comment_only(error_msg)
|
||||
})?;
|
||||
|
||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||
TermDictionary::open(termdict_file)?,
|
||||
postings_file,
|
||||
positions_file,
|
||||
positions_idx_file,
|
||||
record_option,
|
||||
)?);
|
||||
|
||||
@@ -319,7 +306,6 @@ impl SegmentReader {
|
||||
self.termdict_composite.space_usage(),
|
||||
self.postings_composite.space_usage(),
|
||||
self.positions_composite.space_usage(),
|
||||
self.positions_idx_composite.space_usage(),
|
||||
self.fast_fields_readers.space_usage(),
|
||||
self.fieldnorm_readers.space_usage(),
|
||||
self.get_store_reader()?.space_usage(),
|
||||
|
||||
@@ -70,7 +70,7 @@ impl Drop for DirectoryLockGuard {
|
||||
|
||||
enum TryAcquireLockError {
|
||||
FileExists,
|
||||
IOError(io::Error),
|
||||
IoError(io::Error),
|
||||
}
|
||||
|
||||
fn try_acquire_lock(
|
||||
@@ -79,9 +79,9 @@ fn try_acquire_lock(
|
||||
) -> Result<DirectoryLock, TryAcquireLockError> {
|
||||
let mut write = directory.open_write(filepath).map_err(|e| match e {
|
||||
OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists,
|
||||
OpenWriteError::IOError { io_error, .. } => TryAcquireLockError::IOError(io_error),
|
||||
OpenWriteError::IoError { io_error, .. } => TryAcquireLockError::IoError(io_error),
|
||||
})?;
|
||||
write.flush().map_err(TryAcquireLockError::IOError)?;
|
||||
write.flush().map_err(TryAcquireLockError::IoError)?;
|
||||
Ok(DirectoryLock::from(Box::new(DirectoryLockGuard {
|
||||
directory: directory.box_clone(),
|
||||
path: filepath.to_owned(),
|
||||
@@ -106,7 +106,7 @@ fn retry_policy(is_blocking: bool) -> RetryPolicy {
|
||||
///
|
||||
/// - The [`MMapDirectory`](struct.MmapDirectory.html), this
|
||||
/// should be your default choice.
|
||||
/// - The [`RAMDirectory`](struct.RAMDirectory.html), which
|
||||
/// - The [`RamDirectory`](struct.RamDirectory.html), which
|
||||
/// should be used mostly for tests.
|
||||
pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// Opens a file and returns a boxed `FileHandle`.
|
||||
@@ -154,7 +154,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// Flush operation should also be persistent.
|
||||
///
|
||||
/// The user shall not rely on `Drop` triggering `flush`.
|
||||
/// Note that `RAMDirectory` will panic! if `flush`
|
||||
/// Note that `RamDirectory` will panic! if `flush`
|
||||
/// was not called.
|
||||
///
|
||||
/// The file may not previously exist.
|
||||
@@ -192,8 +192,8 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
return Err(LockError::LockBusy);
|
||||
}
|
||||
}
|
||||
Err(TryAcquireLockError::IOError(io_error)) => {
|
||||
return Err(LockError::IOError(io_error));
|
||||
Err(TryAcquireLockError::IoError(io_error)) => {
|
||||
return Err(LockError::IoError(io_error));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,9 +12,9 @@ pub enum LockError {
|
||||
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
|
||||
#[error("Could not acquire lock as it is already held, possibly by a different process.")]
|
||||
LockBusy,
|
||||
/// Trying to acquire a lock failed with an `IOError`
|
||||
/// Trying to acquire a lock failed with an `IoError`
|
||||
#[error("Failed to acquire the lock due to an io:Error.")]
|
||||
IOError(io::Error),
|
||||
IoError(io::Error),
|
||||
}
|
||||
|
||||
/// Error that may occur when opening a directory
|
||||
@@ -30,7 +30,7 @@ pub enum OpenDirectoryError {
|
||||
#[error("Failed to create a temporary directory: '{0}'.")]
|
||||
FailedToCreateTempDir(io::Error),
|
||||
/// IoError
|
||||
#[error("IOError '{io_error:?}' while create directory in: '{directory_path:?}'.")]
|
||||
#[error("IoError '{io_error:?}' while create directory in: '{directory_path:?}'.")]
|
||||
IoError {
|
||||
/// underlying io Error.
|
||||
io_error: io::Error,
|
||||
@@ -48,8 +48,8 @@ pub enum OpenWriteError {
|
||||
FileAlreadyExists(PathBuf),
|
||||
/// Any kind of IO error that happens when
|
||||
/// writing in the underlying IO device.
|
||||
#[error("IOError '{io_error:?}' while opening file for write: '{filepath}'.")]
|
||||
IOError {
|
||||
#[error("IoError '{io_error:?}' while opening file for write: '{filepath}'.")]
|
||||
IoError {
|
||||
/// The underlying `io::Error`.
|
||||
io_error: io::Error,
|
||||
/// File path of the file that tantivy failed to open for write.
|
||||
@@ -60,7 +60,7 @@ pub enum OpenWriteError {
|
||||
impl OpenWriteError {
|
||||
/// Wraps an io error.
|
||||
pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
|
||||
Self::IOError { io_error, filepath }
|
||||
Self::IoError { io_error, filepath }
|
||||
}
|
||||
}
|
||||
/// Type of index incompatibility between the library and the index found on disk
|
||||
@@ -130,9 +130,9 @@ pub enum OpenReadError {
|
||||
FileDoesNotExist(PathBuf),
|
||||
/// Any kind of io::Error.
|
||||
#[error(
|
||||
"IOError: '{io_error:?}' happened while opening the following file for Read: {filepath}."
|
||||
"IoError: '{io_error:?}' happened while opening the following file for Read: {filepath}."
|
||||
)]
|
||||
IOError {
|
||||
IoError {
|
||||
/// The underlying `io::Error`.
|
||||
io_error: io::Error,
|
||||
/// File path of the file that tantivy failed to open for read.
|
||||
@@ -146,7 +146,7 @@ pub enum OpenReadError {
|
||||
impl OpenReadError {
|
||||
/// Wraps an io error.
|
||||
pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
|
||||
Self::IOError { io_error, filepath }
|
||||
Self::IoError { io_error, filepath }
|
||||
}
|
||||
}
|
||||
/// Error that may occur when trying to delete a file
|
||||
@@ -158,7 +158,7 @@ pub enum DeleteError {
|
||||
/// Any kind of IO error that happens when
|
||||
/// interacting with the underlying IO device.
|
||||
#[error("The following IO error happened while deleting file '{filepath}': '{io_error:?}'.")]
|
||||
IOError {
|
||||
IoError {
|
||||
/// The underlying `io::Error`.
|
||||
io_error: io::Error,
|
||||
/// File path of the file that tantivy failed to delete.
|
||||
|
||||
@@ -86,7 +86,7 @@ impl ManagedDirectory {
|
||||
directory: Box::new(directory),
|
||||
meta_informations: Arc::default(),
|
||||
}),
|
||||
io_err @ Err(OpenReadError::IOError { .. }) => Err(io_err.err().unwrap().into()),
|
||||
io_err @ Err(OpenReadError::IoError { .. }) => Err(io_err.err().unwrap().into()),
|
||||
Err(OpenReadError::IncompatibleIndex(incompatibility)) => {
|
||||
// For the moment, this should never happen `meta.json`
|
||||
// do not have any footer and cannot detect incompatibility.
|
||||
@@ -168,7 +168,7 @@ impl ManagedDirectory {
|
||||
DeleteError::FileDoesNotExist(_) => {
|
||||
deleted_files.push(file_to_delete.clone());
|
||||
}
|
||||
DeleteError::IOError { .. } => {
|
||||
DeleteError::IoError { .. } => {
|
||||
failed_to_delete_files.push(file_to_delete.clone());
|
||||
if !cfg!(target_os = "windows") {
|
||||
// On windows, delete is expected to fail if the file
|
||||
@@ -232,13 +232,13 @@ impl ManagedDirectory {
|
||||
pub fn validate_checksum(&self, path: &Path) -> result::Result<bool, OpenReadError> {
|
||||
let reader = self.directory.open_read(path)?;
|
||||
let (footer, data) =
|
||||
Footer::extract_footer(reader).map_err(|io_error| OpenReadError::IOError {
|
||||
Footer::extract_footer(reader).map_err(|io_error| OpenReadError::IoError {
|
||||
io_error,
|
||||
filepath: path.to_path_buf(),
|
||||
})?;
|
||||
let bytes = data
|
||||
.read_bytes()
|
||||
.map_err(|io_error| OpenReadError::IOError {
|
||||
.map_err(|io_error| OpenReadError::IoError {
|
||||
filepath: path.to_path_buf(),
|
||||
io_error,
|
||||
})?;
|
||||
|
||||
@@ -185,7 +185,7 @@ impl MmapDirectory {
|
||||
/// Creates a new MmapDirectory in a temporary directory.
|
||||
///
|
||||
/// This is mostly useful to test the MmapDirectory itself.
|
||||
/// For your unit tests, prefer the RAMDirectory.
|
||||
/// For your unit tests, prefer the RamDirectory.
|
||||
pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let tempdir = TempDir::new().map_err(OpenDirectoryError::FailedToCreateTempDir)?;
|
||||
Ok(MmapDirectory::new(
|
||||
@@ -374,7 +374,7 @@ impl Directory for MmapDirectory {
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
let full_path = self.resolve_path(path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => self.sync_directory().map_err(|e| DeleteError::IOError {
|
||||
Ok(_) => self.sync_directory().map_err(|e| DeleteError::IoError {
|
||||
io_error: e,
|
||||
filepath: path.to_path_buf(),
|
||||
}),
|
||||
@@ -382,7 +382,7 @@ impl Directory for MmapDirectory {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Err(DeleteError::FileDoesNotExist(path.to_owned()))
|
||||
} else {
|
||||
Err(DeleteError::IOError {
|
||||
Err(DeleteError::IoError {
|
||||
io_error: e,
|
||||
filepath: path.to_path_buf(),
|
||||
})
|
||||
@@ -460,9 +460,9 @@ impl Directory for MmapDirectory {
|
||||
.write(true)
|
||||
.create(true) //< if the file does not exist yet, create it.
|
||||
.open(&full_path)
|
||||
.map_err(LockError::IOError)?;
|
||||
.map_err(LockError::IoError)?;
|
||||
if lock.is_blocking {
|
||||
file.lock_exclusive().map_err(LockError::IOError)?;
|
||||
file.lock_exclusive().map_err(LockError::IoError)?;
|
||||
} else {
|
||||
file.try_lock_exclusive().map_err(|_| LockError::LockBusy)?
|
||||
}
|
||||
@@ -616,8 +616,9 @@ mod tests {
|
||||
reader.reload().unwrap();
|
||||
let num_segments = reader.searcher().segment_readers().len();
|
||||
assert!(num_segments <= 4);
|
||||
let num_components_except_deletes = crate::core::SegmentComponent::iterator().len() - 1;
|
||||
assert_eq!(
|
||||
num_segments * 7,
|
||||
num_segments * num_components_except_deletes,
|
||||
mmap_directory.get_cache_info().mmapped.len()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -26,7 +26,7 @@ pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
|
||||
pub(crate) use self::file_slice::{ArcBytes, WeakArcBytes};
|
||||
pub use self::file_slice::{FileHandle, FileSlice};
|
||||
pub use self::owned_bytes::OwnedBytes;
|
||||
pub use self::ram_directory::RAMDirectory;
|
||||
pub use self::ram_directory::RamDirectory;
|
||||
pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
|
||||
use std::io::{self, BufWriter, Write};
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -51,13 +51,13 @@ impl OwnedBytes {
|
||||
|
||||
/// Returns the underlying slice of data.
|
||||
/// `Deref` and `AsRef` are also available.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
self.data
|
||||
}
|
||||
|
||||
/// Returns the len of the slice.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
self.data.len()
|
||||
}
|
||||
@@ -84,7 +84,7 @@ impl OwnedBytes {
|
||||
}
|
||||
|
||||
/// Returns true iff this `OwnedBytes` is empty.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.as_slice().is_empty()
|
||||
}
|
||||
@@ -92,7 +92,7 @@ impl OwnedBytes {
|
||||
/// Drops the left most `advance_len` bytes.
|
||||
///
|
||||
/// See also [.clip(clip_len: usize))](#method.clip).
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn advance(&mut self, advance_len: usize) {
|
||||
self.data = &self.data[advance_len..]
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ use std::sync::{Arc, RwLock};
|
||||
|
||||
use super::FileHandle;
|
||||
|
||||
/// Writer associated with the `RAMDirectory`
|
||||
/// Writer associated with the `RamDirectory`
|
||||
///
|
||||
/// The Writer just writes a buffer.
|
||||
///
|
||||
@@ -26,13 +26,13 @@ use super::FileHandle;
|
||||
///
|
||||
struct VecWriter {
|
||||
path: PathBuf,
|
||||
shared_directory: RAMDirectory,
|
||||
shared_directory: RamDirectory,
|
||||
data: Cursor<Vec<u8>>,
|
||||
is_flushed: bool,
|
||||
}
|
||||
|
||||
impl VecWriter {
|
||||
fn new(path_buf: PathBuf, shared_directory: RAMDirectory) -> VecWriter {
|
||||
fn new(path_buf: PathBuf, shared_directory: RamDirectory) -> VecWriter {
|
||||
VecWriter {
|
||||
path: path_buf,
|
||||
data: Cursor::new(Vec::new()),
|
||||
@@ -119,9 +119,9 @@ impl InnerDirectory {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for RAMDirectory {
|
||||
impl fmt::Debug for RamDirectory {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "RAMDirectory")
|
||||
write!(f, "RamDirectory")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -131,23 +131,23 @@ impl fmt::Debug for RAMDirectory {
|
||||
/// Writes are only made visible upon flushing.
|
||||
///
|
||||
#[derive(Clone, Default)]
|
||||
pub struct RAMDirectory {
|
||||
pub struct RamDirectory {
|
||||
fs: Arc<RwLock<InnerDirectory>>,
|
||||
}
|
||||
|
||||
impl RAMDirectory {
|
||||
impl RamDirectory {
|
||||
/// Constructor
|
||||
pub fn create() -> RAMDirectory {
|
||||
pub fn create() -> RamDirectory {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Returns the sum of the size of the different files
|
||||
/// in the RAMDirectory.
|
||||
/// in the RamDirectory.
|
||||
pub fn total_mem_usage(&self) -> usize {
|
||||
self.fs.read().unwrap().total_mem_usage()
|
||||
}
|
||||
|
||||
/// Write a copy of all of the files saved in the RAMDirectory in the target `Directory`.
|
||||
/// Write a copy of all of the files saved in the RamDirectory in the target `Directory`.
|
||||
///
|
||||
/// Files are all written using the `Directory::write` meaning, even if they were
|
||||
/// written using the `atomic_write` api.
|
||||
@@ -164,7 +164,7 @@ impl RAMDirectory {
|
||||
}
|
||||
}
|
||||
|
||||
impl Directory for RAMDirectory {
|
||||
impl Directory for RamDirectory {
|
||||
fn get_file_handle(&self, path: &Path) -> Result<Box<dyn FileHandle>, OpenReadError> {
|
||||
let file_slice = self.open_read(path)?;
|
||||
Ok(Box::new(file_slice))
|
||||
@@ -175,8 +175,8 @@ impl Directory for RAMDirectory {
|
||||
}
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
fail_point!("RAMDirectory::delete", |_| {
|
||||
Err(DeleteError::IOError {
|
||||
fail_point!("RamDirectory::delete", |_| {
|
||||
Err(DeleteError::IoError {
|
||||
io_error: io::Error::from(io::ErrorKind::Other),
|
||||
filepath: path.to_path_buf(),
|
||||
})
|
||||
@@ -188,7 +188,7 @@ impl Directory for RAMDirectory {
|
||||
Ok(self
|
||||
.fs
|
||||
.read()
|
||||
.map_err(|e| OpenReadError::IOError {
|
||||
.map_err(|e| OpenReadError::IoError {
|
||||
io_error: io::Error::new(io::ErrorKind::Other, e.to_string()),
|
||||
filepath: path.to_path_buf(),
|
||||
})?
|
||||
@@ -212,7 +212,7 @@ impl Directory for RAMDirectory {
|
||||
let bytes =
|
||||
self.open_read(path)?
|
||||
.read_bytes()
|
||||
.map_err(|io_error| OpenReadError::IOError {
|
||||
.map_err(|io_error| OpenReadError::IoError {
|
||||
io_error,
|
||||
filepath: path.to_path_buf(),
|
||||
})?;
|
||||
@@ -220,7 +220,7 @@ impl Directory for RAMDirectory {
|
||||
}
|
||||
|
||||
fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
fail_point!("RAMDirectory::atomic_write", |msg| Err(io::Error::new(
|
||||
fail_point!("RamDirectory::atomic_write", |msg| Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
msg.unwrap_or_else(|| "Undefined".to_string())
|
||||
)));
|
||||
@@ -241,7 +241,7 @@ impl Directory for RAMDirectory {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::RAMDirectory;
|
||||
use super::RamDirectory;
|
||||
use crate::Directory;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
@@ -252,12 +252,12 @@ mod tests {
|
||||
let msg_seq: &'static [u8] = b"sequential is the way";
|
||||
let path_atomic: &'static Path = Path::new("atomic");
|
||||
let path_seq: &'static Path = Path::new("seq");
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
assert!(directory.atomic_write(path_atomic, msg_atomic).is_ok());
|
||||
let mut wrt = directory.open_write(path_seq).unwrap();
|
||||
assert!(wrt.write_all(msg_seq).is_ok());
|
||||
assert!(wrt.flush().is_ok());
|
||||
let directory_copy = RAMDirectory::create();
|
||||
let directory_copy = RamDirectory::create();
|
||||
assert!(directory.persist(&directory_copy).is_ok());
|
||||
assert_eq!(directory_copy.atomic_read(path_atomic).unwrap(), msg_atomic);
|
||||
assert_eq!(directory_copy.atomic_read(path_seq).unwrap(), msg_seq);
|
||||
|
||||
@@ -65,12 +65,12 @@ mod mmap_directory_tests {
|
||||
}
|
||||
|
||||
mod ram_directory_tests {
|
||||
use crate::directory::RAMDirectory;
|
||||
use crate::directory::RamDirectory;
|
||||
|
||||
type DirectoryImpl = RAMDirectory;
|
||||
type DirectoryImpl = RamDirectory;
|
||||
|
||||
fn make_directory() -> DirectoryImpl {
|
||||
RAMDirectory::default()
|
||||
RamDirectory::default()
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -122,7 +122,7 @@ mod ram_directory_tests {
|
||||
#[should_panic]
|
||||
fn ram_directory_panics_if_flush_forgotten() {
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
let ram_directory = RAMDirectory::create();
|
||||
let ram_directory = RamDirectory::create();
|
||||
let mut write_file = ram_directory.open_write(test_path).unwrap();
|
||||
assert!(write_file.write_all(&[4]).is_ok());
|
||||
}
|
||||
|
||||
@@ -70,7 +70,7 @@ pub enum TantivyError {
|
||||
LockFailure(LockError, Option<String>),
|
||||
/// IO Error.
|
||||
#[error("An IO error occurred: '{0}'")]
|
||||
IOError(#[from] io::Error),
|
||||
IoError(#[from] io::Error),
|
||||
/// Data corruption.
|
||||
#[error("Data corrupted: '{0:?}'")]
|
||||
DataCorruption(DataCorruption),
|
||||
@@ -139,7 +139,7 @@ impl From<schema::DocParsingError> for TantivyError {
|
||||
|
||||
impl From<serde_json::Error> for TantivyError {
|
||||
fn from(error: serde_json::Error) -> TantivyError {
|
||||
TantivyError::IOError(error.into())
|
||||
TantivyError::IoError(error.into())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -47,14 +47,14 @@ pub struct DeleteBitSet {
|
||||
impl DeleteBitSet {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn for_test(docs: &[DocId], max_doc: u32) -> DeleteBitSet {
|
||||
use crate::directory::{Directory, RAMDirectory, TerminatingWrite};
|
||||
use crate::directory::{Directory, RamDirectory, TerminatingWrite};
|
||||
use std::path::Path;
|
||||
assert!(docs.iter().all(|&doc| doc < max_doc));
|
||||
let mut bitset = BitSet::with_max_value(max_doc);
|
||||
for &doc in docs {
|
||||
bitset.insert(doc);
|
||||
}
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("dummydeletebitset");
|
||||
let mut wrt = directory.open_write(path).unwrap();
|
||||
write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap();
|
||||
@@ -83,7 +83,7 @@ impl DeleteBitSet {
|
||||
}
|
||||
|
||||
/// Returns true iff the document has been marked as deleted.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
let byte_offset = doc / 8u32;
|
||||
let b: u8 = self.data.as_slice()[byte_offset as usize];
|
||||
|
||||
@@ -201,7 +201,7 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::{Directory, RAMDirectory, WritePtr};
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::schema::Field;
|
||||
@@ -242,7 +242,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_intfastfield_small() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
@@ -269,7 +269,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_intfastfield_large() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test"))?;
|
||||
let mut serializer = FastFieldSerializer::from_write(write)?;
|
||||
@@ -308,7 +308,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_intfastfield_null_amplitude() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
@@ -338,7 +338,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_intfastfield_large_numbers() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
@@ -374,7 +374,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_signed_intfastfield() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
let i64_field = schema_builder.add_i64_field("field", FAST);
|
||||
@@ -417,7 +417,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_signed_intfastfield_default_val() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let i64_field = schema_builder.add_i64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
@@ -456,7 +456,7 @@ mod tests {
|
||||
let path = Path::new("test");
|
||||
let permutation = generate_permutation();
|
||||
let n = permutation.len();
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test"))?;
|
||||
let mut serializer = FastFieldSerializer::from_write(write)?;
|
||||
@@ -576,7 +576,7 @@ mod bench {
|
||||
use super::tests::{generate_permutation, SCHEMA};
|
||||
use super::*;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::{Directory, RAMDirectory, WritePtr};
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
@@ -612,7 +612,7 @@ mod bench {
|
||||
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
|
||||
let path = Path::new("test");
|
||||
let permutation = generate_permutation();
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
@@ -646,7 +646,7 @@ mod bench {
|
||||
fn bench_intfastfield_fflookup(b: &mut Bencher) {
|
||||
let path = Path::new("test");
|
||||
let permutation = generate_permutation();
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
|
||||
@@ -77,12 +77,15 @@ mod tests {
|
||||
// add another second
|
||||
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
|
||||
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
|
||||
// add three seconds
|
||||
index_writer
|
||||
.add_document(doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert_eq!(reader.num_docs(), 4);
|
||||
assert_eq!(reader.num_docs(), 5);
|
||||
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
@@ -150,7 +153,7 @@ mod tests {
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
let range_q = format!(
|
||||
"[{} TO {}]",
|
||||
"[{} TO {}}}",
|
||||
(first_time_stamp + Duration::seconds(1)).to_rfc3339(),
|
||||
(first_time_stamp + Duration::seconds(3)).to_rfc3339()
|
||||
);
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::common::compute_num_bits;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::{Directory, RAMDirectory, WritePtr};
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::FAST;
|
||||
@@ -118,18 +118,18 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
|
||||
let field = schema_builder.add_u64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let path = Path::new("__dummy__");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory
|
||||
.open_write(path)
|
||||
.expect("With a RAMDirectory, this should never fail.");
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
let mut serializer = FastFieldSerializer::from_write(write)
|
||||
.expect("With a RAMDirectory, this should never fail.");
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
{
|
||||
let fast_field_writer = fast_field_writers
|
||||
.get_field_writer(field)
|
||||
.expect("With a RAMDirectory, this should never fail.");
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
for val in vals {
|
||||
fast_field_writer.add_val(val.to_u64());
|
||||
}
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn id_to_fieldnorm(id: u8) -> u32 {
|
||||
FIELD_NORMS_TABLE[id as usize]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
FIELD_NORMS_TABLE
|
||||
.binary_search(&fieldnorm)
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
//! precompute computationally expensive functions of the fieldnorm
|
||||
//! in a very short array.
|
||||
//!
|
||||
//! This trick is used by the BM25 similarity.
|
||||
//! This trick is used by the Bm25 similarity.
|
||||
mod code;
|
||||
mod reader;
|
||||
mod serializer;
|
||||
|
||||
@@ -133,7 +133,7 @@ impl FieldNormReader {
|
||||
}
|
||||
|
||||
/// Returns the `fieldnorm_id` associated to a document.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
|
||||
match &self.0 {
|
||||
ReaderImplEnum::FromData(data) => {
|
||||
@@ -145,14 +145,14 @@ impl FieldNormReader {
|
||||
}
|
||||
|
||||
/// Converts a `fieldnorm_id` into a fieldnorm.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn id_to_fieldnorm(id: u8) -> u32 {
|
||||
id_to_fieldnorm(id)
|
||||
}
|
||||
|
||||
/// Converts a `fieldnorm` into a `fieldnorm_id`.
|
||||
/// (This function is not injective).
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
fieldnorm_to_id(fieldnorm)
|
||||
}
|
||||
|
||||
@@ -180,7 +180,7 @@ pub(crate) fn advance_deletes(
|
||||
if num_deleted_docs > num_deleted_docs_before {
|
||||
// There are new deletes. We need to write a new delete file.
|
||||
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
|
||||
let mut delete_file = segment.open_write(SegmentComponent::Delete)?;
|
||||
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
|
||||
delete_file.terminate()?;
|
||||
}
|
||||
|
||||
@@ -57,8 +57,8 @@ impl MergePolicy for LogMergePolicy {
|
||||
let mut size_sorted_tuples = segments
|
||||
.iter()
|
||||
.map(SegmentMeta::num_docs)
|
||||
.filter(|s| s <= &(self.max_merge_size as u32))
|
||||
.enumerate()
|
||||
.filter(|(_, s)| s <= &(self.max_merge_size as u32))
|
||||
.collect::<Vec<(usize, u32)>>();
|
||||
|
||||
size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1)));
|
||||
@@ -216,12 +216,19 @@ mod tests {
|
||||
create_random_segment_meta(1_000_000),
|
||||
create_random_segment_meta(100_001),
|
||||
create_random_segment_meta(100_000),
|
||||
create_random_segment_meta(1_000_001),
|
||||
create_random_segment_meta(100_000),
|
||||
create_random_segment_meta(100_000),
|
||||
create_random_segment_meta(1_500_000),
|
||||
];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
// Do not include large segments
|
||||
assert_eq!(result_list.len(), 1);
|
||||
assert_eq!(result_list[0].0.len(), 3)
|
||||
assert_eq!(result_list[0].0.len(), 3);
|
||||
|
||||
// Making sure merge policy points to the correct index of the original input
|
||||
assert_eq!(result_list[0].0[0], test_input[2].id());
|
||||
assert_eq!(result_list[0].0[1], test_input[4].id());
|
||||
assert_eq!(result_list[0].0[2], test_input[5].id());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -628,7 +628,7 @@ impl IndexMerger {
|
||||
segment_postings.positions(&mut positions_buffer);
|
||||
|
||||
let delta_positions = delta_computer.compute_delta(&positions_buffer);
|
||||
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions)?;
|
||||
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
|
||||
}
|
||||
|
||||
doc = segment_postings.advance();
|
||||
@@ -687,7 +687,7 @@ impl SerializableSegment for IndexMerger {
|
||||
}
|
||||
let fieldnorm_data = serializer
|
||||
.segment()
|
||||
.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
.open_read(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
let term_ord_mappings =
|
||||
self.write_postings(serializer.get_postings_serializer(), fieldnorm_readers)?;
|
||||
|
||||
@@ -18,12 +18,12 @@ pub struct SegmentSerializer {
|
||||
impl SegmentSerializer {
|
||||
/// Creates a new `SegmentSerializer`.
|
||||
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
|
||||
let store_write = segment.open_write(SegmentComponent::STORE)?;
|
||||
let store_write = segment.open_write(SegmentComponent::Store)?;
|
||||
|
||||
let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?;
|
||||
let fast_field_write = segment.open_write(SegmentComponent::FastFields)?;
|
||||
let fast_field_serializer = FastFieldSerializer::from_write(fast_field_write)?;
|
||||
|
||||
let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
|
||||
|
||||
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
|
||||
|
||||
@@ -628,7 +628,7 @@ impl SegmentUpdater {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::merge_segments;
|
||||
use crate::directory::RAMDirectory;
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::indexer::merge_policy::tests::MergeWheneverPossible;
|
||||
use crate::schema::*;
|
||||
use crate::Index;
|
||||
@@ -777,7 +777,7 @@ mod tests {
|
||||
}
|
||||
|
||||
assert_eq!(indices.len(), 3);
|
||||
let output_directory = RAMDirectory::default();
|
||||
let output_directory = RamDirectory::default();
|
||||
let index = merge_segments(&indices, output_directory)?;
|
||||
assert_eq!(index.schema(), schema);
|
||||
|
||||
@@ -792,7 +792,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_merge_empty_indices_array() {
|
||||
let merge_result = merge_segments(&[], RAMDirectory::default());
|
||||
let merge_result = merge_segments(&[], RamDirectory::default());
|
||||
assert!(merge_result.is_err());
|
||||
}
|
||||
|
||||
@@ -819,7 +819,7 @@ mod tests {
|
||||
};
|
||||
|
||||
// mismatched schema index list
|
||||
let result = merge_segments(&[first_index, second_index], RAMDirectory::default());
|
||||
let result = merge_segments(&[first_index, second_index], RamDirectory::default());
|
||||
assert!(result.is_err());
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -311,7 +311,7 @@ fn write(
|
||||
}
|
||||
let fieldnorm_data = serializer
|
||||
.segment()
|
||||
.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
.open_read(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
let term_ord_map =
|
||||
multifield_postings.serialize(serializer.get_postings_serializer(), fieldnorm_readers)?;
|
||||
|
||||
@@ -141,7 +141,7 @@ pub mod collector;
|
||||
pub mod directory;
|
||||
pub mod fastfield;
|
||||
pub mod fieldnorm;
|
||||
pub(crate) mod positions;
|
||||
pub mod positions;
|
||||
pub mod postings;
|
||||
pub mod query;
|
||||
pub mod schema;
|
||||
|
||||
@@ -1,28 +1,29 @@
|
||||
/// Positions are stored in three parts and over two files.
|
||||
//
|
||||
/// The `SegmentComponent::POSITIONS` file contains all of the bitpacked positions delta,
|
||||
/// for all terms of a given field, one term after the other.
|
||||
///
|
||||
/// If the last block is incomplete, it is simply padded with zeros.
|
||||
/// It cannot be read alone, as it actually does not contain the number of bits used for
|
||||
/// each blocks.
|
||||
/// .
|
||||
/// Each block is serialized one after the other.
|
||||
/// If the last block is incomplete, it is simply padded with zeros.
|
||||
///
|
||||
///
|
||||
/// The `SegmentComponent::POSITIONSSKIP` file contains the number of bits used in each block in `u8`
|
||||
/// stream.
|
||||
///
|
||||
/// This makes it possible to rapidly skip over `n positions`.
|
||||
///
|
||||
/// For every block #n where n = k * `LONG_SKIP_INTERVAL` blocks (k>=1), we also store
|
||||
/// in this file the sum of number of bits used for all of the previous block (blocks `[0, n[`).
|
||||
/// That is useful to start reading the positions for a given term: The TermInfo contains
|
||||
/// an address in the positions stream, expressed in "number of positions".
|
||||
/// The long skip structure makes it possible to skip rapidly to the a checkpoint close to this
|
||||
/// value, and then skip normally.
|
||||
///
|
||||
//! Tantivy can (if instructed to do so in the schema) store the term positions in a given field.
|
||||
//! This positions are expressed as token ordinal. For instance,
|
||||
//! In "The beauty and the beast", the term "the" appears in position 0 and position 4.
|
||||
//! This information is useful to run phrase queries.
|
||||
//!
|
||||
//! The `SegmentComponent::POSITIONS` file contains all of the bitpacked positions delta,
|
||||
//! for all terms of a given field, one term after the other.
|
||||
//!
|
||||
//! Each terms is encoded independently.
|
||||
//! Like for positing lists, tantivy rely on simd bitpacking to encode the positions delta in blocks of 128 deltas.
|
||||
//! Because we rarely have a multiple of 128, a final block may encode the remaining values variable byte encoding.
|
||||
//!
|
||||
//! In order to make reading possible, the term delta positions first encodes the number of bitpacked blocks,
|
||||
//! then the bitwidth for each blocks, then the actual bitpacked block and finally the final variable int encoded block.
|
||||
//!
|
||||
//! Contrary to postings list, the reader does not have access on the number of positions that is encoded, and instead
|
||||
//! stops decoding the last block when its byte slice has been entirely read.
|
||||
//!
|
||||
//! More formally:
|
||||
//! * *Positions* := *NumBitPackedBlocks* *BitPackedPositionBlock*^(P/128) *BitPackedPositionsDeltaBitWidth* *VIntPosDeltas*?
|
||||
//! * *NumBitPackedBlocks**: := *P* / 128 encoded as a variable byte integer.
|
||||
//! * *BitPackedPositionBlock* := bit width encoded block of 128 positions delta
|
||||
//! * *BitPackedPositionsDeltaBitWidth* := (*BitWidth*: u8)^*NumBitPackedBlocks*
|
||||
//! * *VIntPosDeltas* := *VIntPosDelta*^(*P* % 128).
|
||||
//!
|
||||
//! The skip widths encoded separately makes it easy and fast to rapidly skip over n positions.
|
||||
mod reader;
|
||||
mod serializer;
|
||||
|
||||
@@ -31,54 +32,103 @@ pub use self::serializer::PositionSerializer;
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
|
||||
const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
|
||||
const LONG_SKIP_IN_BLOCKS: usize = 1_024;
|
||||
const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use super::PositionSerializer;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::positions::reader::PositionReader;
|
||||
use crate::{common::HasLen, directory::FileSlice};
|
||||
use proptest::prelude::*;
|
||||
use proptest::sample::select;
|
||||
use std::iter;
|
||||
|
||||
fn create_stream_buffer(vals: &[u32]) -> (FileSlice, FileSlice) {
|
||||
let mut skip_buffer = vec![];
|
||||
let mut stream_buffer = vec![];
|
||||
{
|
||||
let mut serializer = PositionSerializer::new(&mut stream_buffer, &mut skip_buffer);
|
||||
for (i, &val) in vals.iter().enumerate() {
|
||||
assert_eq!(serializer.positions_idx(), i as u64);
|
||||
serializer.write_all(&[val]).unwrap();
|
||||
fn create_positions_data(vals: &[u32]) -> crate::Result<OwnedBytes> {
|
||||
let mut positions_buffer = vec![];
|
||||
let mut serializer = PositionSerializer::new(&mut positions_buffer);
|
||||
serializer.write_positions_delta(&vals);
|
||||
serializer.close_term()?;
|
||||
serializer.close()?;
|
||||
Ok(OwnedBytes::new(positions_buffer))
|
||||
}
|
||||
|
||||
fn gen_delta_positions() -> BoxedStrategy<Vec<u32>> {
|
||||
select(&[0, 1, 70, 127, 128, 129, 200, 255, 256, 257, 270][..])
|
||||
.prop_flat_map(|num_delta_positions| {
|
||||
proptest::collection::vec(
|
||||
select(&[1u32, 2u32, 4u32, 8u32, 16u32][..]),
|
||||
num_delta_positions,
|
||||
)
|
||||
})
|
||||
.boxed()
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn test_position_delta(delta_positions in gen_delta_positions()) {
|
||||
let delta_positions_data = create_positions_data(&delta_positions).unwrap();
|
||||
let mut position_reader = PositionReader::open(delta_positions_data).unwrap();
|
||||
let mut minibuf = [0u32; 1];
|
||||
for (offset, &delta_position) in delta_positions.iter().enumerate() {
|
||||
position_reader.read(offset as u64, &mut minibuf[..]);
|
||||
assert_eq!(delta_position, minibuf[0]);
|
||||
}
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
(FileSlice::from(stream_buffer), FileSlice::from(skip_buffer))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_position_read() {
|
||||
let v: Vec<u32> = (0..1000).collect();
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
assert_eq!(skip.len(), 12);
|
||||
assert_eq!(stream.len(), 1168);
|
||||
let mut position_reader = PositionReader::new(stream, skip, 0u64).unwrap();
|
||||
fn test_position_read() -> crate::Result<()> {
|
||||
let position_deltas: Vec<u32> = (0..1000).collect();
|
||||
let positions_data = create_positions_data(&position_deltas[..])?;
|
||||
assert_eq!(positions_data.len(), 1224);
|
||||
let mut position_reader = PositionReader::open(positions_data)?;
|
||||
for &n in &[1, 10, 127, 128, 130, 312] {
|
||||
let mut v = vec![0u32; n];
|
||||
position_reader.read(0, &mut v[..]);
|
||||
for i in 0..n {
|
||||
assert_eq!(v[i], i as u32);
|
||||
assert_eq!(position_deltas[i], i as u32);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_position_read_with_offset() {
|
||||
let v: Vec<u32> = (0..1000).collect();
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
assert_eq!(skip.len(), 12);
|
||||
assert_eq!(stream.len(), 1168);
|
||||
let mut position_reader = PositionReader::new(stream, skip, 0u64).unwrap();
|
||||
fn test_empty_position() -> crate::Result<()> {
|
||||
let mut positions_buffer = vec![];
|
||||
let mut serializer = PositionSerializer::new(&mut positions_buffer);
|
||||
serializer.close_term()?;
|
||||
serializer.close()?;
|
||||
let position_delta = OwnedBytes::new(positions_buffer);
|
||||
assert!(PositionReader::open(position_delta).is_ok());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multiple_write_positions() -> crate::Result<()> {
|
||||
let mut positions_buffer = vec![];
|
||||
let mut serializer = PositionSerializer::new(&mut positions_buffer);
|
||||
serializer.write_positions_delta(&[1u32, 12u32]);
|
||||
serializer.write_positions_delta(&[4u32, 17u32]);
|
||||
serializer.write_positions_delta(&[443u32]);
|
||||
serializer.close_term()?;
|
||||
serializer.close()?;
|
||||
let position_delta = OwnedBytes::new(positions_buffer);
|
||||
let mut output_delta_pos_buffer = vec![0u32; 5];
|
||||
let mut position_reader = PositionReader::open(position_delta)?;
|
||||
position_reader.read(0, &mut output_delta_pos_buffer[..]);
|
||||
assert_eq!(
|
||||
&output_delta_pos_buffer[..],
|
||||
&[1u32, 12u32, 4u32, 17u32, 443u32]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_position_read_with_offset() -> crate::Result<()> {
|
||||
let position_deltas: Vec<u32> = (0..1000).collect();
|
||||
let positions_data = create_positions_data(&position_deltas[..])?;
|
||||
assert_eq!(positions_data.len(), 1224);
|
||||
let mut position_reader = PositionReader::open(positions_data)?;
|
||||
for &offset in &[1u64, 10u64, 127u64, 128u64, 130u64, 312u64] {
|
||||
for &len in &[1, 10, 130, 500] {
|
||||
let mut v = vec![0u32; len];
|
||||
@@ -88,16 +138,16 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_position_read_after_skip() {
|
||||
let v: Vec<u32> = (0..1_000).collect();
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
assert_eq!(skip.len(), 12);
|
||||
assert_eq!(stream.len(), 1168);
|
||||
fn test_position_read_after_skip() -> crate::Result<()> {
|
||||
let position_deltas: Vec<u32> = (0..1_000).collect();
|
||||
let positions_data = create_positions_data(&position_deltas[..])?;
|
||||
assert_eq!(positions_data.len(), 1224);
|
||||
|
||||
let mut position_reader = PositionReader::new(stream, skip, 0u64).unwrap();
|
||||
let mut position_reader = PositionReader::open(positions_data)?;
|
||||
let mut buf = [0u32; 7];
|
||||
let mut c = 0;
|
||||
|
||||
@@ -111,15 +161,15 @@ pub mod tests {
|
||||
c += 1;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_position_reread_anchor_different_than_block() {
|
||||
let v: Vec<u32> = (0..2_000_000).collect();
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
assert_eq!(skip.len(), 15_749);
|
||||
assert_eq!(stream.len(), 4_987_872);
|
||||
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0).unwrap();
|
||||
fn test_position_reread_anchor_different_than_block() -> crate::Result<()> {
|
||||
let positions_delta: Vec<u32> = (0..2_000_000).collect();
|
||||
let positions_data = create_positions_data(&positions_delta[..])?;
|
||||
assert_eq!(positions_data.len(), 5003499);
|
||||
let mut position_reader = PositionReader::open(positions_data.clone())?;
|
||||
let mut buf = [0u32; 256];
|
||||
position_reader.read(128, &mut buf);
|
||||
for i in 0..256 {
|
||||
@@ -129,62 +179,41 @@ pub mod tests {
|
||||
for i in 0..256 {
|
||||
assert_eq!(buf[i], (128 + i) as u32);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "offset arguments should be increasing.")]
|
||||
fn test_position_panic_if_called_previous_anchor() {
|
||||
let v: Vec<u32> = (0..2_000_000).collect();
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
assert_eq!(skip.len(), 15_749);
|
||||
assert_eq!(stream.len(), 4_987_872);
|
||||
fn test_position_requesting_passed_block() -> crate::Result<()> {
|
||||
let positions_delta: Vec<u32> = (0..512).collect();
|
||||
let positions_data = create_positions_data(&positions_delta[..])?;
|
||||
assert_eq!(positions_data.len(), 533);
|
||||
let mut buf = [0u32; 1];
|
||||
let mut position_reader =
|
||||
PositionReader::new(stream.clone(), skip.clone(), 200_000).unwrap();
|
||||
let mut position_reader = PositionReader::open(positions_data)?;
|
||||
position_reader.read(230, &mut buf);
|
||||
assert_eq!(buf[0], 230);
|
||||
position_reader.read(9, &mut buf);
|
||||
assert_eq!(buf[0], 9);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_positions_bug() {
|
||||
let mut v: Vec<u32> = vec![];
|
||||
for i in 1..200 {
|
||||
for j in 0..i {
|
||||
v.push(j);
|
||||
}
|
||||
}
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
let mut buf = Vec::new();
|
||||
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0).unwrap();
|
||||
let mut offset = 0;
|
||||
for i in 1..24 {
|
||||
buf.resize(i, 0);
|
||||
position_reader.read(offset, &mut buf[..]);
|
||||
offset += i as u64;
|
||||
let r: Vec<u32> = (0..i).map(|el| el as u32).collect();
|
||||
assert_eq!(buf, &r[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_position_long_skip_const() {
|
||||
fn test_position() -> crate::Result<()> {
|
||||
const CONST_VAL: u32 = 9u32;
|
||||
let v: Vec<u32> = iter::repeat(CONST_VAL).take(2_000_000).collect();
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
assert_eq!(skip.len(), 15_749);
|
||||
assert_eq!(stream.len(), 1_000_000);
|
||||
let mut position_reader = PositionReader::new(stream, skip, 128 * 1024).unwrap();
|
||||
let positions_delta: Vec<u32> = iter::repeat(CONST_VAL).take(2_000_000).collect();
|
||||
let positions_data = create_positions_data(&positions_delta[..])?;
|
||||
assert_eq!(positions_data.len(), 1_015_627);
|
||||
let mut position_reader = PositionReader::open(positions_data)?;
|
||||
let mut buf = [0u32; 1];
|
||||
position_reader.read(0, &mut buf);
|
||||
assert_eq!(buf[0], CONST_VAL);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_position_long_skip_2() {
|
||||
let v: Vec<u32> = (0..2_000_000).collect();
|
||||
let (stream, skip) = create_stream_buffer(&v[..]);
|
||||
assert_eq!(skip.len(), 15_749);
|
||||
assert_eq!(stream.len(), 4_987_872);
|
||||
fn test_position_advance() -> crate::Result<()> {
|
||||
let positions_delta: Vec<u32> = (0..2_000_000).collect();
|
||||
let positions_data = create_positions_data(&positions_delta[..])?;
|
||||
assert_eq!(positions_data.len(), 5_003_499);
|
||||
for &offset in &[
|
||||
10,
|
||||
128 * 1024,
|
||||
@@ -192,11 +221,11 @@ pub mod tests {
|
||||
128 * 1024 + 7,
|
||||
128 * 10 * 1024 + 10,
|
||||
] {
|
||||
let mut position_reader =
|
||||
PositionReader::new(stream.clone(), skip.clone(), offset).unwrap();
|
||||
let mut position_reader = PositionReader::open(positions_data.clone())?;
|
||||
let mut buf = [0u32; 1];
|
||||
position_reader.read(0, &mut buf);
|
||||
position_reader.read(offset, &mut buf);
|
||||
assert_eq!(buf[0], offset as u32);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,174 +1,148 @@
|
||||
use std::io;
|
||||
|
||||
use crate::common::{BinarySerializable, FixedSize};
|
||||
use crate::directory::FileSlice;
|
||||
use crate::common::{BinarySerializable, VInt};
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::positions::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::positions::LONG_SKIP_INTERVAL;
|
||||
use crate::positions::LONG_SKIP_IN_BLOCKS;
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
use crate::postings::compression::{BlockDecoder, VIntDecoder};
|
||||
|
||||
/// Positions works as a long sequence of compressed block.
|
||||
/// All terms are chained one after the other.
|
||||
///
|
||||
/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
|
||||
/// This means we need to skip to the `nth` positions efficiently.
|
||||
///
|
||||
/// This is done thanks to two levels of skiping that we refer to in the code
|
||||
/// as `long_skip` and `short_skip`.
|
||||
///
|
||||
/// The `long_skip` makes it possible to skip every 1_024 compression blocks (= 131_072 positions).
|
||||
/// Skipping offset are simply stored one after as an offset stored over 8 bytes.
|
||||
///
|
||||
/// We find the number of long skips, as `n / long_skip`.
|
||||
///
|
||||
/// Blocks are compressed using bitpacking, so `skip_read` contains the number of bytes
|
||||
/// (values can go from 0bit to 32 bits) required to decompressed every block.
|
||||
/// Blocks are compressed using bitpacking, so `skip_read` contains the number of bits
|
||||
/// (values can go from 0bit to 32 bits) required to decompress every block.
|
||||
///
|
||||
/// A given block obviously takes `(128 x num_bit_for_the_block / num_bits_in_a_byte)`,
|
||||
/// so skipping a block without decompressing it is just a matter of advancing that many
|
||||
/// bytes.
|
||||
|
||||
struct Positions {
|
||||
bit_packer: BitPacker4x,
|
||||
skip_file: FileSlice,
|
||||
position_file: FileSlice,
|
||||
long_skip_data: OwnedBytes,
|
||||
}
|
||||
|
||||
impl Positions {
|
||||
pub fn new(position_file: FileSlice, skip_file: FileSlice) -> io::Result<Positions> {
|
||||
let (body, footer) = skip_file.split_from_end(u32::SIZE_IN_BYTES);
|
||||
let footer_data = footer.read_bytes()?;
|
||||
let num_long_skips = u32::deserialize(&mut footer_data.as_slice())?;
|
||||
let (skip_file, long_skip_file) =
|
||||
body.split_from_end(u64::SIZE_IN_BYTES * (num_long_skips as usize));
|
||||
let long_skip_data = long_skip_file.read_bytes()?;
|
||||
Ok(Positions {
|
||||
bit_packer: BitPacker4x::new(),
|
||||
skip_file,
|
||||
long_skip_data,
|
||||
position_file,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the offset of the block associated to the given `long_skip_id`.
|
||||
///
|
||||
/// One `long_skip_id` means `LONG_SKIP_IN_BLOCKS` blocks.
|
||||
fn long_skip(&self, long_skip_id: usize) -> u64 {
|
||||
if long_skip_id == 0 {
|
||||
return 0;
|
||||
}
|
||||
let long_skip_slice = self.long_skip_data.as_slice();
|
||||
let mut long_skip_blocks: &[u8] = &long_skip_slice[(long_skip_id - 1) * 8..][..8];
|
||||
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted")
|
||||
}
|
||||
|
||||
fn reader(&self, offset: u64) -> io::Result<PositionReader> {
|
||||
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
|
||||
let offset_num_bytes: u64 = self.long_skip(long_skip_id);
|
||||
let position_read = self
|
||||
.position_file
|
||||
.slice_from(offset_num_bytes as usize)
|
||||
.read_bytes()?;
|
||||
let skip_read = self
|
||||
.skip_file
|
||||
.slice_from(long_skip_id * LONG_SKIP_IN_BLOCKS)
|
||||
.read_bytes()?;
|
||||
Ok(PositionReader {
|
||||
bit_packer: self.bit_packer,
|
||||
skip_read,
|
||||
position_read,
|
||||
buffer: Box::new([0u32; 128]),
|
||||
block_offset: std::i64::MAX as u64,
|
||||
anchor_offset: (long_skip_id as u64) * LONG_SKIP_INTERVAL,
|
||||
abs_offset: offset,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct PositionReader {
|
||||
skip_read: OwnedBytes,
|
||||
position_read: OwnedBytes,
|
||||
bit_packer: BitPacker4x,
|
||||
buffer: Box<[u32; COMPRESSION_BLOCK_SIZE]>,
|
||||
bit_widths: OwnedBytes,
|
||||
positions: OwnedBytes,
|
||||
|
||||
block_decoder: BlockDecoder,
|
||||
|
||||
// offset, expressed in positions, for the first position of the block currently loaded
|
||||
// block_offset is a multiple of COMPRESSION_BLOCK_SIZE.
|
||||
block_offset: u64,
|
||||
// offset, expressed in positions, for the position of the first block encoded
|
||||
// in the `self.positions` bytes, and if bitpacked, compressed using the bitwidth in
|
||||
// `self.bit_widths`.
|
||||
//
|
||||
// As we advance, anchor increases simultaneously with bit_widths and positions get consumed.
|
||||
anchor_offset: u64,
|
||||
|
||||
abs_offset: u64,
|
||||
// These are just copies used for .reset().
|
||||
original_bit_widths: OwnedBytes,
|
||||
original_positions: OwnedBytes,
|
||||
}
|
||||
|
||||
impl PositionReader {
|
||||
pub fn new(
|
||||
position_file: FileSlice,
|
||||
skip_file: FileSlice,
|
||||
offset: u64,
|
||||
) -> io::Result<PositionReader> {
|
||||
let positions = Positions::new(position_file, skip_file)?;
|
||||
positions.reader(offset)
|
||||
/// Open and reads the term positions encoded into the positions_data owned bytes.
|
||||
pub fn open(mut positions_data: OwnedBytes) -> io::Result<PositionReader> {
|
||||
let num_positions_bitpacked_blocks = VInt::deserialize(&mut positions_data)?.0 as usize;
|
||||
let (bit_widths, positions) = positions_data.split(num_positions_bitpacked_blocks);
|
||||
Ok(PositionReader {
|
||||
bit_widths: bit_widths.clone(),
|
||||
positions: positions.clone(),
|
||||
block_decoder: BlockDecoder::default(),
|
||||
block_offset: std::i64::MAX as u64,
|
||||
anchor_offset: 0u64,
|
||||
original_bit_widths: bit_widths,
|
||||
original_positions: positions,
|
||||
})
|
||||
}
|
||||
|
||||
fn reset(&mut self) {
|
||||
self.positions = self.original_positions.clone();
|
||||
self.bit_widths = self.original_bit_widths.clone();
|
||||
self.block_offset = std::i64::MAX as u64;
|
||||
self.anchor_offset = 0u64;
|
||||
}
|
||||
|
||||
/// Advance from num_blocks bitpacked blocks.
|
||||
///
|
||||
/// Panics if there are not that many remaining blocks.
|
||||
fn advance_num_blocks(&mut self, num_blocks: usize) {
|
||||
let num_bits: usize = self.skip_read.as_ref()[..num_blocks]
|
||||
let num_bits: usize = self.bit_widths.as_ref()[..num_blocks]
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|num_bits| num_bits as usize)
|
||||
.sum();
|
||||
let num_bytes_to_skip = num_bits * COMPRESSION_BLOCK_SIZE / 8;
|
||||
self.skip_read.advance(num_blocks as usize);
|
||||
self.position_read.advance(num_bytes_to_skip);
|
||||
self.bit_widths.advance(num_blocks as usize);
|
||||
self.positions.advance(num_bytes_to_skip);
|
||||
self.anchor_offset += (num_blocks * COMPRESSION_BLOCK_SIZE) as u64;
|
||||
}
|
||||
|
||||
/// block_rel_id is counted relatively to the anchor.
|
||||
/// block_rel_id = 0 means the anchor block.
|
||||
/// block_rel_id = i means the ith block after the anchor block.
|
||||
fn load_block(&mut self, block_rel_id: usize) {
|
||||
let bit_widths = self.bit_widths.as_slice();
|
||||
let byte_offset: usize = bit_widths[0..block_rel_id]
|
||||
.iter()
|
||||
.map(|&b| b as usize)
|
||||
.sum::<usize>()
|
||||
* COMPRESSION_BLOCK_SIZE
|
||||
/ 8;
|
||||
let compressed_data = &self.positions.as_slice()[byte_offset..];
|
||||
if bit_widths.len() > block_rel_id {
|
||||
// that block is bitpacked.
|
||||
let bit_width = bit_widths[block_rel_id];
|
||||
self.block_decoder
|
||||
.uncompress_block_unsorted(compressed_data, bit_width);
|
||||
} else {
|
||||
// that block is vint encoded.
|
||||
self.block_decoder
|
||||
.uncompress_vint_unsorted_until_end(compressed_data);
|
||||
}
|
||||
self.block_offset = self.anchor_offset + (block_rel_id * COMPRESSION_BLOCK_SIZE) as u64;
|
||||
}
|
||||
|
||||
/// Fills a buffer with the positions `[offset..offset+output.len())` integers.
|
||||
///
|
||||
/// `offset` is required to have a value >= to the offsets given in previous calls
|
||||
/// for the given `PositionReaderAbsolute` instance.
|
||||
/// This function is optimized to be called with increasing values of `offset`.
|
||||
pub fn read(&mut self, mut offset: u64, mut output: &mut [u32]) {
|
||||
offset += self.abs_offset;
|
||||
assert!(
|
||||
offset >= self.anchor_offset,
|
||||
"offset arguments should be increasing."
|
||||
);
|
||||
if offset < self.anchor_offset {
|
||||
self.reset();
|
||||
}
|
||||
let delta_to_block_offset = offset as i64 - self.block_offset as i64;
|
||||
if !(0..128).contains(&delta_to_block_offset) {
|
||||
// The first position is not within the first block.
|
||||
// We need to decompress the first block.
|
||||
// (Note that it could be before or after)
|
||||
// We need to possibly skip a few blocks, and decompress the first relevant block.
|
||||
let delta_to_anchor_offset = offset - self.anchor_offset;
|
||||
let num_blocks_to_skip =
|
||||
(delta_to_anchor_offset / (COMPRESSION_BLOCK_SIZE as u64)) as usize;
|
||||
self.advance_num_blocks(num_blocks_to_skip);
|
||||
self.anchor_offset = offset - (offset % COMPRESSION_BLOCK_SIZE as u64);
|
||||
self.block_offset = self.anchor_offset;
|
||||
let num_bits = self.skip_read.as_slice()[0];
|
||||
self.bit_packer
|
||||
.decompress(self.position_read.as_ref(), self.buffer.as_mut(), num_bits);
|
||||
self.load_block(0);
|
||||
} else {
|
||||
// The request offset is within the loaded block.
|
||||
// We still need to advance anchor_offset to our current block.
|
||||
let num_blocks_to_skip =
|
||||
((self.block_offset - self.anchor_offset) / COMPRESSION_BLOCK_SIZE as u64) as usize;
|
||||
self.advance_num_blocks(num_blocks_to_skip);
|
||||
self.anchor_offset = self.block_offset;
|
||||
}
|
||||
|
||||
let mut num_bits = self.skip_read.as_slice()[0];
|
||||
let mut position_data = self.position_read.as_ref();
|
||||
|
||||
// At this point, the block containing offset is loaded, and anchor has
|
||||
// been updated to point to it as well.
|
||||
for i in 1.. {
|
||||
// we copy the part from block i - 1 that is relevant.
|
||||
let offset_in_block = (offset as usize) % COMPRESSION_BLOCK_SIZE;
|
||||
let remaining_in_block = COMPRESSION_BLOCK_SIZE - offset_in_block;
|
||||
if remaining_in_block >= output.len() {
|
||||
output.copy_from_slice(&self.buffer[offset_in_block..][..output.len()]);
|
||||
output.copy_from_slice(
|
||||
&self.block_decoder.output_array()[offset_in_block..][..output.len()],
|
||||
);
|
||||
break;
|
||||
}
|
||||
output[..remaining_in_block].copy_from_slice(&self.buffer[offset_in_block..]);
|
||||
output[..remaining_in_block]
|
||||
.copy_from_slice(&self.block_decoder.output_array()[offset_in_block..]);
|
||||
output = &mut output[remaining_in_block..];
|
||||
// we load block #i if necessary.
|
||||
offset += remaining_in_block as u64;
|
||||
position_data = &position_data[(num_bits as usize * COMPRESSION_BLOCK_SIZE / 8)..];
|
||||
num_bits = self.skip_read.as_slice()[i];
|
||||
self.bit_packer
|
||||
.decompress(position_data, self.buffer.as_mut(), num_bits);
|
||||
self.block_offset += COMPRESSION_BLOCK_SIZE as u64;
|
||||
self.load_block(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,80 +1,91 @@
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CountingWriter;
|
||||
use crate::positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
|
||||
use bitpacking::BitPacker;
|
||||
use bitpacking::BitPacker4x;
|
||||
use crate::common::{BinarySerializable, CountingWriter, VInt};
|
||||
use crate::positions::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::postings::compression::BlockEncoder;
|
||||
use crate::postings::compression::VIntEncoder;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// The PositionSerializer is in charge of serializing all of the positions
|
||||
/// of all of the terms of a given field.
|
||||
///
|
||||
/// It is valid to call write_position_delta more than once per term.
|
||||
pub struct PositionSerializer<W: io::Write> {
|
||||
bit_packer: BitPacker4x,
|
||||
write_stream: CountingWriter<W>,
|
||||
write_skip_index: W,
|
||||
block_encoder: BlockEncoder,
|
||||
positions_wrt: CountingWriter<W>,
|
||||
positions_buffer: Vec<u8>,
|
||||
block: Vec<u32>,
|
||||
buffer: Vec<u8>,
|
||||
num_ints: u64,
|
||||
long_skips: Vec<u64>,
|
||||
bit_widths: Vec<u8>,
|
||||
}
|
||||
|
||||
impl<W: io::Write> PositionSerializer<W> {
|
||||
pub fn new(write_stream: W, write_skip_index: W) -> PositionSerializer<W> {
|
||||
/// Creates a new PositionSerializer writing into the given positions_wrt.
|
||||
pub fn new(positions_wrt: W) -> PositionSerializer<W> {
|
||||
PositionSerializer {
|
||||
bit_packer: BitPacker4x::new(),
|
||||
write_stream: CountingWriter::wrap(write_stream),
|
||||
write_skip_index,
|
||||
block_encoder: BlockEncoder::new(),
|
||||
positions_wrt: CountingWriter::wrap(positions_wrt),
|
||||
positions_buffer: Vec::with_capacity(128_000),
|
||||
block: Vec::with_capacity(128),
|
||||
buffer: vec![0u8; 128 * 4],
|
||||
num_ints: 0u64,
|
||||
long_skips: Vec::new(),
|
||||
bit_widths: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn positions_idx(&self) -> u64 {
|
||||
self.num_ints
|
||||
/// Returns the number of bytes written in the positions write object
|
||||
/// at this point.
|
||||
/// When called before writing the positions of a term, this value is used as
|
||||
/// start offset.
|
||||
/// When called after writing the positions of a term, this value is used as a
|
||||
/// end offset.
|
||||
pub fn written_bytes(&self) -> u64 {
|
||||
self.positions_wrt.written_bytes()
|
||||
}
|
||||
|
||||
fn remaining_block_len(&self) -> usize {
|
||||
COMPRESSION_BLOCK_SIZE - self.block.len()
|
||||
}
|
||||
|
||||
pub fn write_all(&mut self, mut vals: &[u32]) -> io::Result<()> {
|
||||
while !vals.is_empty() {
|
||||
/// Writes all of the given positions delta.
|
||||
pub fn write_positions_delta(&mut self, mut positions_delta: &[u32]) {
|
||||
while !positions_delta.is_empty() {
|
||||
let remaining_block_len = self.remaining_block_len();
|
||||
let num_to_write = remaining_block_len.min(vals.len());
|
||||
self.block.extend(&vals[..num_to_write]);
|
||||
self.num_ints += num_to_write as u64;
|
||||
vals = &vals[num_to_write..];
|
||||
let num_to_write = remaining_block_len.min(positions_delta.len());
|
||||
self.block.extend(&positions_delta[..num_to_write]);
|
||||
positions_delta = &positions_delta[num_to_write..];
|
||||
if self.remaining_block_len() == 0 {
|
||||
self.flush_block()?;
|
||||
self.flush_block();
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush_block(&mut self) -> io::Result<()> {
|
||||
let num_bits = self.bit_packer.num_bits(&self.block[..]);
|
||||
self.write_skip_index.write_all(&[num_bits])?;
|
||||
let written_len = self
|
||||
.bit_packer
|
||||
.compress(&self.block[..], &mut self.buffer, num_bits);
|
||||
self.write_stream.write_all(&self.buffer[..written_len])?;
|
||||
fn flush_block(&mut self) {
|
||||
// encode the positions in the block
|
||||
if self.block.is_empty() {
|
||||
return;
|
||||
}
|
||||
if self.block.len() == COMPRESSION_BLOCK_SIZE {
|
||||
let (bit_width, block_encoded): (u8, &[u8]) =
|
||||
self.block_encoder.compress_block_unsorted(&self.block[..]);
|
||||
self.bit_widths.push(bit_width);
|
||||
self.positions_buffer.extend(block_encoded);
|
||||
} else {
|
||||
debug_assert!(self.block.len() < COMPRESSION_BLOCK_SIZE);
|
||||
let block_vint_encoded = self.block_encoder.compress_vint_unsorted(&self.block[..]);
|
||||
self.positions_buffer.extend_from_slice(block_vint_encoded);
|
||||
}
|
||||
self.block.clear();
|
||||
if (self.num_ints % LONG_SKIP_INTERVAL) == 0u64 {
|
||||
self.long_skips.push(self.write_stream.written_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
/// Close the positions for the given term.
|
||||
pub fn close_term(&mut self) -> io::Result<()> {
|
||||
self.flush_block();
|
||||
VInt(self.bit_widths.len() as u64).serialize(&mut self.positions_wrt)?;
|
||||
self.positions_wrt.write_all(&self.bit_widths[..])?;
|
||||
self.positions_wrt.write_all(&self.positions_buffer)?;
|
||||
self.bit_widths.clear();
|
||||
self.positions_buffer.clear();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Close the positions for this term and flushes the data.
|
||||
pub fn close(mut self) -> io::Result<()> {
|
||||
if !self.block.is_empty() {
|
||||
self.block.resize(COMPRESSION_BLOCK_SIZE, 0u32);
|
||||
self.flush_block()?;
|
||||
}
|
||||
for &long_skip in &self.long_skips {
|
||||
long_skip.serialize(&mut self.write_skip_index)?;
|
||||
}
|
||||
(self.long_skips.len() as u32).serialize(&mut self.write_skip_index)?;
|
||||
self.write_skip_index.flush()?;
|
||||
self.write_stream.flush()?;
|
||||
Ok(())
|
||||
self.positions_wrt.flush()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -100,7 +100,7 @@ fn galloping(block_docs: &[u32], target: u32) -> usize {
|
||||
#[derive(Clone, Copy, PartialEq)]
|
||||
pub enum BlockSearcher {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
SSE2,
|
||||
Sse2,
|
||||
Scalar,
|
||||
}
|
||||
|
||||
@@ -139,7 +139,7 @@ impl BlockSearcher {
|
||||
pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, target: u32) -> usize {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if self == BlockSearcher::SSE2 {
|
||||
if self == BlockSearcher::Sse2 {
|
||||
return sse2::linear_search_sse2_128(block_docs, target);
|
||||
}
|
||||
}
|
||||
@@ -152,7 +152,7 @@ impl Default for BlockSearcher {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if is_x86_feature_detected!("sse2") {
|
||||
return BlockSearcher::SSE2;
|
||||
return BlockSearcher::Sse2;
|
||||
}
|
||||
}
|
||||
BlockSearcher::Scalar
|
||||
@@ -236,6 +236,6 @@ mod tests {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[test]
|
||||
fn test_search_in_block_sse2() {
|
||||
test_search_in_block_util(BlockSearcher::SSE2);
|
||||
test_search_in_block_util(BlockSearcher::Sse2);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ use crate::postings::compression::{
|
||||
AlignedBuffer, BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE,
|
||||
};
|
||||
use crate::postings::{BlockInfo, FreqReadingOption, SkipReader};
|
||||
use crate::query::BM25Weight;
|
||||
use crate::query::Bm25Weight;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::{DocId, Score, TERMINATED};
|
||||
|
||||
@@ -127,7 +127,7 @@ impl BlockSegmentPostings {
|
||||
pub fn block_max_score(
|
||||
&mut self,
|
||||
fieldnorm_reader: &FieldNormReader,
|
||||
bm25_weight: &BM25Weight,
|
||||
bm25_weight: &Bm25Weight,
|
||||
) -> Score {
|
||||
if let Some(score) = self.block_max_score_cache {
|
||||
return score;
|
||||
@@ -212,14 +212,14 @@ impl BlockSegmentPostings {
|
||||
/// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits.
|
||||
///
|
||||
/// This method is useful to run SSE2 linear search.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub(crate) fn docs_aligned(&self) -> &AlignedBuffer {
|
||||
debug_assert!(self.block_is_loaded());
|
||||
self.doc_decoder.output_aligned()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
self.doc_decoder.output(idx)
|
||||
}
|
||||
|
||||
@@ -167,6 +167,8 @@ pub trait VIntDecoder {
|
||||
num_els: usize,
|
||||
padding: u32,
|
||||
) -> usize;
|
||||
|
||||
fn uncompress_vint_unsorted_until_end(&mut self, compressed_data: &[u8]);
|
||||
}
|
||||
|
||||
impl VIntEncoder for BlockEncoder {
|
||||
@@ -202,6 +204,11 @@ impl VIntDecoder for BlockDecoder {
|
||||
self.output.0.iter_mut().for_each(|el| *el = padding);
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted_until_end(&mut self, compressed_data: &[u8]) {
|
||||
let num_els = vint::uncompress_unsorted_until_end(compressed_data, &mut self.output.0);
|
||||
self.output_len = num_els;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
@@ -20,7 +20,7 @@ pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32)
|
||||
&output[..byte_written]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
@@ -41,7 +41,7 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a
|
||||
&output[..byte_written]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
|
||||
let mut read_byte = 0;
|
||||
let mut result = offset;
|
||||
@@ -61,15 +61,15 @@ pub fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32
|
||||
read_byte
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub(crate) fn uncompress_unsorted(compressed_data: &[u8], output_arr: &mut [u32]) -> usize {
|
||||
let mut read_byte = 0;
|
||||
let mut num_read_bytes = 0;
|
||||
for output_mut in output_arr.iter_mut() {
|
||||
let mut result = 0u32;
|
||||
let mut shift = 0u32;
|
||||
loop {
|
||||
let cur_byte = compressed_data[read_byte];
|
||||
read_byte += 1;
|
||||
let cur_byte = compressed_data[num_read_bytes];
|
||||
num_read_bytes += 1;
|
||||
result += u32::from(cur_byte % 128u8) << shift;
|
||||
if cur_byte & 128u8 != 0u8 {
|
||||
break;
|
||||
@@ -78,5 +78,31 @@ pub(crate) fn uncompress_unsorted(compressed_data: &[u8], output_arr: &mut [u32]
|
||||
}
|
||||
*output_mut = result;
|
||||
}
|
||||
read_byte
|
||||
num_read_bytes
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn uncompress_unsorted_until_end(
|
||||
compressed_data: &[u8],
|
||||
output_arr: &mut [u32],
|
||||
) -> usize {
|
||||
let mut num_read_bytes = 0;
|
||||
for (num_ints_written, output_mut) in output_arr.iter_mut().enumerate() {
|
||||
if compressed_data.len() == num_read_bytes {
|
||||
return num_ints_written;
|
||||
}
|
||||
let mut result = 0u32;
|
||||
let mut shift = 0u32;
|
||||
loop {
|
||||
let cur_byte = compressed_data[num_read_bytes];
|
||||
num_read_bytes += 1;
|
||||
result += u32::from(cur_byte % 128u8) << shift;
|
||||
if cur_byte & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
*output_mut = result;
|
||||
}
|
||||
output_arr.len()
|
||||
}
|
||||
|
||||
@@ -68,13 +68,13 @@ pub mod tests {
|
||||
field_serializer.new_term("abc".as_bytes(), 12u32)?;
|
||||
for doc_id in 0u32..120u32 {
|
||||
let delta_positions = vec![1, 2, 3, 2];
|
||||
field_serializer.write_doc(doc_id, 4, &delta_positions)?;
|
||||
field_serializer.write_doc(doc_id, 4, &delta_positions);
|
||||
}
|
||||
field_serializer.close_term()?;
|
||||
mem::drop(field_serializer);
|
||||
posting_serializer.close()?;
|
||||
let read = segment.open_read(SegmentComponent::POSITIONS)?;
|
||||
assert!(read.len() <= 140);
|
||||
let read = segment.open_read(SegmentComponent::Positions)?;
|
||||
assert_eq!(read.len(), 207);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ use super::stacker::{Addr, MemoryArena, TermHashMap};
|
||||
|
||||
use crate::fieldnorm::FieldNormReaders;
|
||||
use crate::postings::recorder::{
|
||||
BufferLender, NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder,
|
||||
BufferLender, NothingRecorder, Recorder, TermFrequencyRecorder, TfAndPositionRecorder,
|
||||
};
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::postings::{FieldSerializer, InvertedIndexSerializer};
|
||||
@@ -30,7 +30,7 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter>
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed()
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<TfAndPositionRecorder>::new_boxed()
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
|
||||
@@ -313,7 +313,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
||||
let recorder: Rec = termdict_heap.read(addr);
|
||||
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
||||
serializer.new_term(&term_bytes[4..], term_doc_freq)?;
|
||||
recorder.serialize(&mut buffer_lender, serializer, heap)?;
|
||||
recorder.serialize(&mut buffer_lender, serializer, heap);
|
||||
serializer.close_term()?;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -2,7 +2,6 @@ use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
|
||||
use crate::common::{read_u32_vint, write_u32_vint};
|
||||
use crate::postings::FieldSerializer;
|
||||
use crate::DocId;
|
||||
use std::io;
|
||||
|
||||
const POSITION_END: u32 = 0;
|
||||
|
||||
@@ -74,7 +73,7 @@ pub(crate) trait Recorder: Copy + 'static {
|
||||
buffer_lender: &mut BufferLender,
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()>;
|
||||
);
|
||||
/// Returns the number of document containing this term.
|
||||
///
|
||||
/// Returns `None` if not available.
|
||||
@@ -114,14 +113,13 @@ impl Recorder for NothingRecorder {
|
||||
buffer_lender: &mut BufferLender,
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()> {
|
||||
) {
|
||||
let buffer = buffer_lender.lend_u8();
|
||||
self.stack.read_to_end(heap, buffer);
|
||||
// TODO avoid reading twice.
|
||||
for doc in VInt32Reader::new(&buffer[..]) {
|
||||
serializer.write_doc(doc as u32, 0u32, &[][..])?;
|
||||
serializer.write_doc(doc as u32, 0u32, &[][..]);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn term_doc_freq(&self) -> Option<u32> {
|
||||
@@ -173,16 +171,14 @@ impl Recorder for TermFrequencyRecorder {
|
||||
buffer_lender: &mut BufferLender,
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()> {
|
||||
) {
|
||||
let buffer = buffer_lender.lend_u8();
|
||||
self.stack.read_to_end(heap, buffer);
|
||||
let mut u32_it = VInt32Reader::new(&buffer[..]);
|
||||
while let Some(doc) = u32_it.next() {
|
||||
let term_freq = u32_it.next().unwrap_or(self.current_tf);
|
||||
serializer.write_doc(doc as u32, term_freq, &[][..])?;
|
||||
serializer.write_doc(doc as u32, term_freq, &[][..]);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn term_doc_freq(&self) -> Option<u32> {
|
||||
@@ -192,14 +188,14 @@ impl Recorder for TermFrequencyRecorder {
|
||||
|
||||
/// Recorder encoding term frequencies as well as positions.
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct TFAndPositionRecorder {
|
||||
pub struct TfAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
term_doc_freq: u32,
|
||||
}
|
||||
impl Recorder for TFAndPositionRecorder {
|
||||
impl Recorder for TfAndPositionRecorder {
|
||||
fn new() -> Self {
|
||||
TFAndPositionRecorder {
|
||||
TfAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(),
|
||||
current_doc: u32::max_value(),
|
||||
term_doc_freq: 0u32,
|
||||
@@ -229,7 +225,7 @@ impl Recorder for TFAndPositionRecorder {
|
||||
buffer_lender: &mut BufferLender,
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()> {
|
||||
) {
|
||||
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
|
||||
self.stack.read_to_end(heap, buffer_u8);
|
||||
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
|
||||
@@ -248,9 +244,8 @@ impl Recorder for TFAndPositionRecorder {
|
||||
}
|
||||
}
|
||||
}
|
||||
serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions)?;
|
||||
serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn term_doc_freq(&self) -> Option<u32> {
|
||||
|
||||
@@ -204,7 +204,7 @@ impl DocSet for SegmentPostings {
|
||||
}
|
||||
|
||||
/// Return the current document's `DocId`.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
fn doc(&self) -> DocId {
|
||||
self.block_cursor.doc(self.cur)
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ use crate::fieldnorm::FieldNormReader;
|
||||
use crate::positions::PositionSerializer;
|
||||
use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE};
|
||||
use crate::postings::skip::SkipSerializer;
|
||||
use crate::query::BM25Weight;
|
||||
use crate::query::Bm25Weight;
|
||||
use crate::schema::{Field, FieldEntry, FieldType};
|
||||
use crate::schema::{IndexRecordOption, Schema};
|
||||
use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
|
||||
@@ -50,19 +50,17 @@ pub struct InvertedIndexSerializer {
|
||||
terms_write: CompositeWrite<WritePtr>,
|
||||
postings_write: CompositeWrite<WritePtr>,
|
||||
positions_write: CompositeWrite<WritePtr>,
|
||||
positionsidx_write: CompositeWrite<WritePtr>,
|
||||
schema: Schema,
|
||||
}
|
||||
|
||||
impl InvertedIndexSerializer {
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> {
|
||||
use crate::SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
|
||||
use crate::SegmentComponent::{Positions, Postings, Terms};
|
||||
let inv_index_serializer = InvertedIndexSerializer {
|
||||
terms_write: CompositeWrite::wrap(segment.open_write(TERMS)?),
|
||||
postings_write: CompositeWrite::wrap(segment.open_write(POSTINGS)?),
|
||||
positions_write: CompositeWrite::wrap(segment.open_write(POSITIONS)?),
|
||||
positionsidx_write: CompositeWrite::wrap(segment.open_write(POSITIONSSKIP)?),
|
||||
terms_write: CompositeWrite::wrap(segment.open_write(Terms)?),
|
||||
postings_write: CompositeWrite::wrap(segment.open_write(Postings)?),
|
||||
positions_write: CompositeWrite::wrap(segment.open_write(Positions)?),
|
||||
schema: segment.schema(),
|
||||
};
|
||||
Ok(inv_index_serializer)
|
||||
@@ -82,7 +80,6 @@ impl InvertedIndexSerializer {
|
||||
let term_dictionary_write = self.terms_write.for_field(field);
|
||||
let postings_write = self.postings_write.for_field(field);
|
||||
let positions_write = self.positions_write.for_field(field);
|
||||
let positionsidx_write = self.positionsidx_write.for_field(field);
|
||||
let field_type: FieldType = (*field_entry.field_type()).clone();
|
||||
FieldSerializer::create(
|
||||
&field_type,
|
||||
@@ -90,7 +87,6 @@ impl InvertedIndexSerializer {
|
||||
term_dictionary_write,
|
||||
postings_write,
|
||||
positions_write,
|
||||
positionsidx_write,
|
||||
fieldnorm_reader,
|
||||
)
|
||||
}
|
||||
@@ -100,7 +96,6 @@ impl InvertedIndexSerializer {
|
||||
self.terms_write.close()?;
|
||||
self.postings_write.close()?;
|
||||
self.positions_write.close()?;
|
||||
self.positionsidx_write.close()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -123,7 +118,6 @@ impl<'a> FieldSerializer<'a> {
|
||||
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
|
||||
postings_write: &'a mut CountingWriter<WritePtr>,
|
||||
positions_write: &'a mut CountingWriter<WritePtr>,
|
||||
positionsidx_write: &'a mut CountingWriter<WritePtr>,
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
) -> io::Result<FieldSerializer<'a>> {
|
||||
total_num_tokens.serialize(postings_write)?;
|
||||
@@ -145,7 +139,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
let postings_serializer =
|
||||
PostingsSerializer::new(postings_write, average_fieldnorm, mode, fieldnorm_reader);
|
||||
let positions_serializer_opt = if mode.has_positions() {
|
||||
Some(PositionSerializer::new(positions_write, positionsidx_write))
|
||||
Some(PositionSerializer::new(positions_write))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -161,17 +155,17 @@ impl<'a> FieldSerializer<'a> {
|
||||
}
|
||||
|
||||
fn current_term_info(&self) -> TermInfo {
|
||||
let positions_idx =
|
||||
let positions_start =
|
||||
if let Some(positions_serializer) = self.positions_serializer_opt.as_ref() {
|
||||
positions_serializer.positions_idx()
|
||||
positions_serializer.written_bytes()
|
||||
} else {
|
||||
0u64
|
||||
};
|
||||
let addr = self.postings_serializer.addr() as usize;
|
||||
} as usize;
|
||||
let addr = self.postings_serializer.written_bytes() as usize;
|
||||
TermInfo {
|
||||
doc_freq: 0,
|
||||
postings_range: addr..addr,
|
||||
positions_idx,
|
||||
positions_range: positions_start..positions_start,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -204,18 +198,12 @@ impl<'a> FieldSerializer<'a> {
|
||||
///
|
||||
/// Term frequencies and positions may be ignored by the serializer depending
|
||||
/// on the configuration of the field in the `Schema`.
|
||||
pub fn write_doc(
|
||||
&mut self,
|
||||
doc_id: DocId,
|
||||
term_freq: u32,
|
||||
position_deltas: &[u32],
|
||||
) -> io::Result<()> {
|
||||
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) {
|
||||
self.current_term_info.doc_freq += 1;
|
||||
self.postings_serializer.write_doc(doc_id, term_freq);
|
||||
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
|
||||
positions_serializer.write_all(position_deltas)?;
|
||||
positions_serializer.write_positions_delta(position_deltas);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Finish the serialization for this term postings.
|
||||
@@ -226,7 +214,14 @@ impl<'a> FieldSerializer<'a> {
|
||||
if self.term_open {
|
||||
self.postings_serializer
|
||||
.close_term(self.current_term_info.doc_freq)?;
|
||||
self.current_term_info.postings_range.end = self.postings_serializer.addr() as usize;
|
||||
self.current_term_info.postings_range.end =
|
||||
self.postings_serializer.written_bytes() as usize;
|
||||
|
||||
if let Some(positions_serializer) = self.positions_serializer_opt.as_mut() {
|
||||
positions_serializer.close_term()?;
|
||||
self.current_term_info.positions_range.end =
|
||||
positions_serializer.written_bytes() as usize;
|
||||
}
|
||||
self.term_dictionary_builder
|
||||
.insert_value(&self.current_term_info)?;
|
||||
self.term_open = false;
|
||||
@@ -307,7 +302,7 @@ pub struct PostingsSerializer<W: Write> {
|
||||
mode: IndexRecordOption,
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
|
||||
bm25_weight: Option<BM25Weight>,
|
||||
bm25_weight: Option<Bm25Weight>,
|
||||
|
||||
num_docs: u32, // Number of docs in the segment
|
||||
avg_fieldnorm: Score, // Average number of term in the field for that segment.
|
||||
@@ -347,7 +342,7 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
|
||||
pub fn new_term(&mut self, term_doc_freq: u32) {
|
||||
if self.mode.has_freq() && self.num_docs > 0 {
|
||||
let bm25_weight = BM25Weight::for_one_term(
|
||||
let bm25_weight = Bm25Weight::for_one_term(
|
||||
term_doc_freq as u64,
|
||||
self.num_docs as u64,
|
||||
self.avg_fieldnorm,
|
||||
@@ -457,7 +452,13 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn addr(&self) -> u64 {
|
||||
/// Returns the number of bytes written in the postings write object
|
||||
/// at this point.
|
||||
/// When called before writing the postings of a term, this value is used as
|
||||
/// start offset.
|
||||
/// When called after writing the postings of a term, this value is used as a
|
||||
/// end offset.
|
||||
fn written_bytes(&self) -> u64 {
|
||||
self.output_write.written_bytes() as u64
|
||||
}
|
||||
|
||||
|
||||
@@ -2,16 +2,16 @@ use std::convert::TryInto;
|
||||
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
|
||||
use crate::query::BM25Weight;
|
||||
use crate::query::Bm25Weight;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::{DocId, Score, TERMINATED};
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
fn encode_block_wand_max_tf(max_tf: u32) -> u8 {
|
||||
max_tf.min(u8::MAX as u32) as u8
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
fn decode_block_wand_max_tf(max_tf_code: u8) -> u32 {
|
||||
if max_tf_code == u8::MAX {
|
||||
u32::MAX
|
||||
@@ -20,12 +20,12 @@ fn decode_block_wand_max_tf(max_tf_code: u8) -> u32 {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
fn read_u32(data: &[u8]) -> u32 {
|
||||
u32::from_le_bytes(data[..4].try_into().unwrap())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
fn write_u32(val: u32, buf: &mut Vec<u8>) {
|
||||
buf.extend_from_slice(&val.to_le_bytes());
|
||||
}
|
||||
@@ -144,7 +144,7 @@ impl SkipReader {
|
||||
//
|
||||
// The block max score is available for all full bitpacked block,
|
||||
// but no available for the last VInt encoded incomplete block.
|
||||
pub fn block_max_score(&self, bm25_weight: &BM25Weight) -> Option<Score> {
|
||||
pub fn block_max_score(&self, bm25_weight: &Bm25Weight) -> Option<Score> {
|
||||
match self.block_info {
|
||||
BlockInfo::BitPacked {
|
||||
block_wand_fieldnorm_id,
|
||||
@@ -163,7 +163,7 @@ impl SkipReader {
|
||||
self.position_offset
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn byte_offset(&self) -> usize {
|
||||
self.byte_offset
|
||||
}
|
||||
|
||||
@@ -147,7 +147,7 @@ impl ExpUnrolledLinkedList {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn writer<'a>(&'a mut self, heap: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> {
|
||||
ExpUnrolledLinkedListWriter { eull: self, heap }
|
||||
}
|
||||
|
||||
@@ -132,7 +132,7 @@ impl MemoryArena {
|
||||
self.pages[addr.page_id()].slice_from(addr.page_local_addr())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn slice_mut(&mut self, addr: Addr, len: usize) -> &mut [u8] {
|
||||
self.pages[addr.page_id()].slice_mut(addr.page_local_addr(), len)
|
||||
}
|
||||
@@ -162,7 +162,7 @@ impl Page {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
fn is_available(&self, len: usize) -> bool {
|
||||
len + self.len <= PAGE_SIZE
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ impl TermHashMap {
|
||||
self.table.len() < self.occupied.len() * 3
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
|
||||
let data = self.heap.slice_from(addr);
|
||||
let key_bytes_len = NativeEndian::read_u16(data) as usize;
|
||||
@@ -126,7 +126,7 @@ impl TermHashMap {
|
||||
(key_bytes, addr.offset(2u32 + key_bytes_len as u32))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
|
||||
let (stored_key, value_addr) = self.get_key_value(addr);
|
||||
if stored_key == target_key {
|
||||
|
||||
@@ -11,8 +11,8 @@ pub struct TermInfo {
|
||||
pub doc_freq: u32,
|
||||
/// Byte range of the posting list within the postings (`.idx`) file.
|
||||
pub postings_range: Range<usize>,
|
||||
/// Start offset of the first block within the position (`.pos`) file.
|
||||
pub positions_idx: u64,
|
||||
/// Byte range of the positions of this terms in the positions (`.pos`) file.
|
||||
pub positions_range: Range<usize>,
|
||||
}
|
||||
|
||||
impl TermInfo {
|
||||
@@ -21,6 +21,12 @@ impl TermInfo {
|
||||
assert!(num_bytes <= std::u32::MAX as usize);
|
||||
num_bytes as u32
|
||||
}
|
||||
|
||||
pub(crate) fn positions_num_bytes(&self) -> u32 {
|
||||
let num_bytes = self.positions_range.len();
|
||||
assert!(num_bytes <= std::u32::MAX as usize);
|
||||
num_bytes as u32
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for TermInfo {
|
||||
@@ -28,7 +34,7 @@ impl FixedSize for TermInfo {
|
||||
/// This is large, but in practise, `TermInfo` are encoded in blocks and
|
||||
/// only the first `TermInfo` of a block is serialized uncompressed.
|
||||
/// The subsequent `TermInfo` are delta encoded and bitpacked.
|
||||
const SIZE_IN_BYTES: usize = 2 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES;
|
||||
const SIZE_IN_BYTES: usize = 3 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES;
|
||||
}
|
||||
|
||||
impl BinarySerializable for TermInfo {
|
||||
@@ -36,20 +42,23 @@ impl BinarySerializable for TermInfo {
|
||||
self.doc_freq.serialize(writer)?;
|
||||
(self.postings_range.start as u64).serialize(writer)?;
|
||||
self.posting_num_bytes().serialize(writer)?;
|
||||
self.positions_idx.serialize(writer)?;
|
||||
(self.positions_range.start as u64).serialize(writer)?;
|
||||
self.positions_num_bytes().serialize(writer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let doc_freq = u32::deserialize(reader)?;
|
||||
let postings_start_offset = u64::deserialize(reader)? as usize;
|
||||
let postings_num_bytes = u32::deserialize(reader)?;
|
||||
let postings_end_offset = postings_start_offset + u64::from(postings_num_bytes) as usize;
|
||||
let positions_idx = u64::deserialize(reader)?;
|
||||
let postings_num_bytes = u32::deserialize(reader)? as usize;
|
||||
let postings_end_offset = postings_start_offset + postings_num_bytes;
|
||||
let positions_start_offset = u64::deserialize(reader)? as usize;
|
||||
let positions_num_bytes = u32::deserialize(reader)? as usize;
|
||||
let positions_end_offset = positions_start_offset + positions_num_bytes;
|
||||
Ok(TermInfo {
|
||||
doc_freq,
|
||||
postings_range: postings_start_offset..postings_end_offset,
|
||||
positions_idx,
|
||||
positions_range: positions_start_offset..positions_end_offset,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -60,6 +69,8 @@ mod tests {
|
||||
use super::TermInfo;
|
||||
use crate::common::test::fixed_size_test;
|
||||
|
||||
// TODO add serialize/deserialize test for terminfo
|
||||
|
||||
#[test]
|
||||
fn test_fixed_size() {
|
||||
fixed_size_test::<TermInfo>();
|
||||
|
||||
@@ -29,22 +29,22 @@ fn compute_tf_cache(average_fieldnorm: Score) -> [Score; 256] {
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
|
||||
pub struct BM25Params {
|
||||
pub struct Bm25Params {
|
||||
pub idf: Score,
|
||||
pub avg_fieldnorm: Score,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BM25Weight {
|
||||
pub struct Bm25Weight {
|
||||
idf_explain: Explanation,
|
||||
weight: Score,
|
||||
cache: [Score; 256],
|
||||
average_fieldnorm: Score,
|
||||
}
|
||||
|
||||
impl BM25Weight {
|
||||
pub fn boost_by(&self, boost: Score) -> BM25Weight {
|
||||
BM25Weight {
|
||||
impl Bm25Weight {
|
||||
pub fn boost_by(&self, boost: Score) -> Bm25Weight {
|
||||
Bm25Weight {
|
||||
idf_explain: self.idf_explain.clone(),
|
||||
weight: self.weight * boost,
|
||||
cache: self.cache,
|
||||
@@ -52,8 +52,8 @@ impl BM25Weight {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> crate::Result<BM25Weight> {
|
||||
assert!(!terms.is_empty(), "BM25 requires at least one term");
|
||||
pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> crate::Result<Bm25Weight> {
|
||||
assert!(!terms.is_empty(), "Bm25 requires at least one term");
|
||||
let field = terms[0].field();
|
||||
for term in &terms[1..] {
|
||||
assert_eq!(
|
||||
@@ -74,7 +74,7 @@ impl BM25Weight {
|
||||
|
||||
if terms.len() == 1 {
|
||||
let term_doc_freq = searcher.doc_freq(&terms[0])?;
|
||||
Ok(BM25Weight::for_one_term(
|
||||
Ok(Bm25Weight::for_one_term(
|
||||
term_doc_freq,
|
||||
total_num_docs,
|
||||
average_fieldnorm,
|
||||
@@ -86,7 +86,7 @@ impl BM25Weight {
|
||||
idf_sum += idf(term_doc_freq, total_num_docs);
|
||||
}
|
||||
let idf_explain = Explanation::new("idf", idf_sum);
|
||||
Ok(BM25Weight::new(idf_explain, average_fieldnorm))
|
||||
Ok(Bm25Weight::new(idf_explain, average_fieldnorm))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,7 +94,7 @@ impl BM25Weight {
|
||||
term_doc_freq: u64,
|
||||
total_num_docs: u64,
|
||||
avg_fieldnorm: Score,
|
||||
) -> BM25Weight {
|
||||
) -> Bm25Weight {
|
||||
let idf = idf(term_doc_freq, total_num_docs);
|
||||
let mut idf_explain =
|
||||
Explanation::new("idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))", idf);
|
||||
@@ -103,12 +103,12 @@ impl BM25Weight {
|
||||
term_doc_freq as Score,
|
||||
);
|
||||
idf_explain.add_const("N, total number of docs", total_num_docs as Score);
|
||||
BM25Weight::new(idf_explain, avg_fieldnorm)
|
||||
Bm25Weight::new(idf_explain, avg_fieldnorm)
|
||||
}
|
||||
|
||||
pub(crate) fn new(idf_explain: Explanation, average_fieldnorm: Score) -> BM25Weight {
|
||||
pub(crate) fn new(idf_explain: Explanation, average_fieldnorm: Score) -> Bm25Weight {
|
||||
let weight = idf_explain.value() * (1.0 + K1);
|
||||
BM25Weight {
|
||||
Bm25Weight {
|
||||
idf_explain,
|
||||
weight,
|
||||
cache: compute_tf_cache(average_fieldnorm),
|
||||
@@ -116,7 +116,7 @@ impl BM25Weight {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn score(&self, fieldnorm_id: u8, term_freq: u32) -> Score {
|
||||
self.weight * self.tf_factor(fieldnorm_id, term_freq)
|
||||
}
|
||||
@@ -125,7 +125,7 @@ impl BM25Weight {
|
||||
self.score(255u8, 2_013_265_944)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub(crate) fn tf_factor(&self, fieldnorm_id: u8, term_freq: u32) -> Score {
|
||||
let term_freq = term_freq as Score;
|
||||
let norm = self.cache[fieldnorm_id as usize];
|
||||
|
||||
@@ -238,7 +238,7 @@ mod tests {
|
||||
use crate::query::score_combiner::SumCombiner;
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::Union;
|
||||
use crate::query::{BM25Weight, Scorer};
|
||||
use crate::query::{Bm25Weight, Scorer};
|
||||
use crate::{DocId, DocSet, Score, TERMINATED};
|
||||
use proptest::prelude::*;
|
||||
use std::cmp::Ordering;
|
||||
@@ -393,7 +393,7 @@ mod tests {
|
||||
let term_scorers: Vec<TermScorer> = postings_lists_expanded
|
||||
.iter()
|
||||
.map(|postings| {
|
||||
let bm25_weight = BM25Weight::for_one_term(
|
||||
let bm25_weight = Bm25Weight::for_one_term(
|
||||
postings.len() as u64,
|
||||
max_doc as u64,
|
||||
average_fieldnorm,
|
||||
|
||||
@@ -3,7 +3,7 @@ use crate::query::Scorer;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
fn is_within<TDocSetExclude: DocSet>(docset: &mut TDocSetExclude, doc: DocId) -> bool {
|
||||
docset.doc() <= doc && docset.seek(doc) == doc
|
||||
}
|
||||
|
||||
@@ -8,9 +8,9 @@ use std::collections::HashMap;
|
||||
use std::ops::Range;
|
||||
use tantivy_fst::Automaton;
|
||||
|
||||
pub(crate) struct DFAWrapper(pub DFA);
|
||||
pub(crate) struct DfaWrapper(pub DFA);
|
||||
|
||||
impl Automaton for DFAWrapper {
|
||||
impl Automaton for DfaWrapper {
|
||||
type State = u32;
|
||||
|
||||
fn start(&self) -> Self::State {
|
||||
@@ -127,7 +127,7 @@ impl FuzzyTermQuery {
|
||||
}
|
||||
}
|
||||
|
||||
fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DFAWrapper>> {
|
||||
fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DfaWrapper>> {
|
||||
// LEV_BUILDER is a HashMap, whose `get` method returns an Option
|
||||
match LEV_BUILDER.get(&(self.distance, false)) {
|
||||
// Unwrap the option and build the Ok(AutomatonWeight)
|
||||
@@ -139,7 +139,7 @@ impl FuzzyTermQuery {
|
||||
};
|
||||
Ok(AutomatonWeight::new(
|
||||
self.term.field(),
|
||||
DFAWrapper(automaton),
|
||||
DfaWrapper(automaton),
|
||||
))
|
||||
}
|
||||
None => Err(InvalidArgument(format!(
|
||||
|
||||
@@ -26,7 +26,7 @@ mod weight;
|
||||
mod vec_docset;
|
||||
|
||||
pub(crate) mod score_combiner;
|
||||
pub(crate) use self::bm25::BM25Weight;
|
||||
pub(crate) use self::bm25::Bm25Weight;
|
||||
pub use self::intersection::Intersection;
|
||||
pub use self::union::Union;
|
||||
|
||||
@@ -42,7 +42,7 @@ pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
|
||||
pub use self::exclude::Exclude;
|
||||
pub use self::explanation::Explanation;
|
||||
#[cfg(test)]
|
||||
pub(crate) use self::fuzzy_query::DFAWrapper;
|
||||
pub(crate) use self::fuzzy_query::DfaWrapper;
|
||||
pub use self::fuzzy_query::FuzzyTermQuery;
|
||||
pub use self::intersection::intersect_scorers;
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use super::PhraseWeight;
|
||||
use crate::core::searcher::Searcher;
|
||||
use crate::query::bm25::BM25Weight;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::Query;
|
||||
use crate::query::Weight;
|
||||
use crate::schema::IndexRecordOption;
|
||||
@@ -95,7 +95,7 @@ impl PhraseQuery {
|
||||
)));
|
||||
}
|
||||
let terms = self.phrase_terms();
|
||||
let bm25_weight = BM25Weight::for_terms(searcher, &terms)?;
|
||||
let bm25_weight = Bm25Weight::for_terms(searcher, &terms)?;
|
||||
Ok(PhraseWeight::new(
|
||||
self.phrase_terms.clone(),
|
||||
bm25_weight,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::bm25::BM25Weight;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::{Intersection, Scorer};
|
||||
use crate::{DocId, Score};
|
||||
use std::cmp::Ordering;
|
||||
@@ -49,7 +49,7 @@ pub struct PhraseScorer<TPostings: Postings> {
|
||||
right: Vec<u32>,
|
||||
phrase_count: u32,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
similarity_weight: BM25Weight,
|
||||
similarity_weight: Bm25Weight,
|
||||
score_needed: bool,
|
||||
}
|
||||
|
||||
@@ -133,7 +133,7 @@ fn intersection(left: &mut [u32], right: &[u32]) -> usize {
|
||||
impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
pub fn new(
|
||||
term_postings: Vec<(usize, TPostings)>,
|
||||
similarity_weight: BM25Weight,
|
||||
similarity_weight: Bm25Weight,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
score_needed: bool,
|
||||
) -> PhraseScorer<TPostings> {
|
||||
|
||||
@@ -2,7 +2,7 @@ use super::PhraseScorer;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::SegmentPostings;
|
||||
use crate::query::bm25::BM25Weight;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::Scorer;
|
||||
use crate::query::Weight;
|
||||
@@ -14,7 +14,7 @@ use crate::{DocId, DocSet};
|
||||
|
||||
pub struct PhraseWeight {
|
||||
phrase_terms: Vec<(usize, Term)>,
|
||||
similarity_weight: BM25Weight,
|
||||
similarity_weight: Bm25Weight,
|
||||
score_needed: bool,
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ impl PhraseWeight {
|
||||
/// Creates a new phrase weight.
|
||||
pub fn new(
|
||||
phrase_terms: Vec<(usize, Term)>,
|
||||
similarity_weight: BM25Weight,
|
||||
similarity_weight: Bm25Weight,
|
||||
score_needed: bool,
|
||||
) -> PhraseWeight {
|
||||
PhraseWeight {
|
||||
|
||||
@@ -19,18 +19,18 @@ pub enum LogicalLiteral {
|
||||
All,
|
||||
}
|
||||
|
||||
pub enum LogicalAST {
|
||||
Clause(Vec<(Occur, LogicalAST)>),
|
||||
pub enum LogicalAst {
|
||||
Clause(Vec<(Occur, LogicalAst)>),
|
||||
Leaf(Box<LogicalLiteral>),
|
||||
Boost(Box<LogicalAST>, Score),
|
||||
Boost(Box<LogicalAst>, Score),
|
||||
}
|
||||
|
||||
impl LogicalAST {
|
||||
pub fn boost(self, boost: Score) -> LogicalAST {
|
||||
impl LogicalAst {
|
||||
pub fn boost(self, boost: Score) -> LogicalAst {
|
||||
if (boost - 1.0).abs() < Score::EPSILON {
|
||||
self
|
||||
} else {
|
||||
LogicalAST::Boost(Box::new(self), boost)
|
||||
LogicalAst::Boost(Box::new(self), boost)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -43,10 +43,10 @@ fn occur_letter(occur: Occur) -> &'static str {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for LogicalAST {
|
||||
impl fmt::Debug for LogicalAst {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
LogicalAST::Clause(ref clause) => {
|
||||
LogicalAst::Clause(ref clause) => {
|
||||
if clause.is_empty() {
|
||||
write!(formatter, "<emptyclause>")?;
|
||||
} else {
|
||||
@@ -59,15 +59,15 @@ impl fmt::Debug for LogicalAST {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
LogicalAST::Boost(ref ast, boost) => write!(formatter, "{:?}^{}", ast, boost),
|
||||
LogicalAST::Leaf(ref literal) => write!(formatter, "{:?}", literal),
|
||||
LogicalAst::Boost(ref ast, boost) => write!(formatter, "{:?}^{}", ast, boost),
|
||||
LogicalAst::Leaf(ref literal) => write!(formatter, "{:?}", literal),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<LogicalLiteral> for LogicalAST {
|
||||
fn from(literal: LogicalLiteral) -> LogicalAST {
|
||||
LogicalAST::Leaf(Box::new(literal))
|
||||
impl From<LogicalLiteral> for LogicalAst {
|
||||
fn from(literal: LogicalLiteral) -> LogicalAst {
|
||||
LogicalAst::Leaf(Box::new(literal))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ use std::collections::HashMap;
|
||||
use std::num::{ParseFloatError, ParseIntError};
|
||||
use std::ops::Bound;
|
||||
use std::str::FromStr;
|
||||
use tantivy_query_grammar::{UserInputAST, UserInputBound, UserInputLeaf};
|
||||
use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf};
|
||||
|
||||
/// Possible error that may happen when parsing a query.
|
||||
#[derive(Debug, PartialEq, Eq, Error)]
|
||||
@@ -28,7 +28,7 @@ pub enum QueryParserError {
|
||||
SyntaxError,
|
||||
/// `FieldDoesNotExist(field_name: String)`
|
||||
/// The query references a field that is not in the schema
|
||||
#[error("File does not exists: '{0:?}'")]
|
||||
#[error("Field does not exists: '{0:?}'")]
|
||||
FieldDoesNotExist(String),
|
||||
/// The query contains a term for a `u64` or `i64`-field, but the value
|
||||
/// is neither.
|
||||
@@ -91,9 +91,9 @@ impl From<chrono::ParseError> for QueryParserError {
|
||||
/// Recursively remove empty clause from the AST
|
||||
///
|
||||
/// Returns `None` iff the `logical_ast` ended up being empty.
|
||||
fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
|
||||
fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
|
||||
match logical_ast {
|
||||
LogicalAST::Clause(children) => {
|
||||
LogicalAst::Clause(children) => {
|
||||
let trimmed_children = children
|
||||
.into_iter()
|
||||
.flat_map(|(occur, child)| {
|
||||
@@ -103,7 +103,7 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
|
||||
if trimmed_children.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(LogicalAST::Clause(trimmed_children))
|
||||
Some(LogicalAst::Clause(trimmed_children))
|
||||
}
|
||||
}
|
||||
_ => Some(logical_ast),
|
||||
@@ -178,11 +178,11 @@ pub struct QueryParser {
|
||||
boost: HashMap<Field, Score>,
|
||||
}
|
||||
|
||||
fn all_negative(ast: &LogicalAST) -> bool {
|
||||
fn all_negative(ast: &LogicalAst) -> bool {
|
||||
match ast {
|
||||
LogicalAST::Leaf(_) => false,
|
||||
LogicalAST::Boost(ref child_ast, _) => all_negative(&*child_ast),
|
||||
LogicalAST::Clause(children) => children
|
||||
LogicalAst::Leaf(_) => false,
|
||||
LogicalAst::Boost(ref child_ast, _) => all_negative(&*child_ast),
|
||||
LogicalAst::Clause(children) => children
|
||||
.iter()
|
||||
.all(|(ref occur, child)| (*occur == Occur::MustNot) || all_negative(child)),
|
||||
}
|
||||
@@ -251,7 +251,7 @@ impl QueryParser {
|
||||
}
|
||||
|
||||
/// Parse the user query into an AST.
|
||||
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
|
||||
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAst, QueryParserError> {
|
||||
let user_input_ast =
|
||||
tantivy_query_grammar::parse_query(query).map_err(|_| QueryParserError::SyntaxError)?;
|
||||
self.compute_logical_ast(user_input_ast)
|
||||
@@ -265,10 +265,10 @@ impl QueryParser {
|
||||
|
||||
fn compute_logical_ast(
|
||||
&self,
|
||||
user_input_ast: UserInputAST,
|
||||
) -> Result<LogicalAST, QueryParserError> {
|
||||
user_input_ast: UserInputAst,
|
||||
) -> Result<LogicalAst, QueryParserError> {
|
||||
let ast = self.compute_logical_ast_with_occur(user_input_ast)?;
|
||||
if let LogicalAST::Clause(children) = &ast {
|
||||
if let LogicalAst::Clause(children) = &ast {
|
||||
if children.is_empty() {
|
||||
return Ok(ast);
|
||||
}
|
||||
@@ -429,24 +429,24 @@ impl QueryParser {
|
||||
|
||||
fn compute_logical_ast_with_occur(
|
||||
&self,
|
||||
user_input_ast: UserInputAST,
|
||||
) -> Result<LogicalAST, QueryParserError> {
|
||||
user_input_ast: UserInputAst,
|
||||
) -> Result<LogicalAst, QueryParserError> {
|
||||
match user_input_ast {
|
||||
UserInputAST::Clause(sub_queries) => {
|
||||
UserInputAst::Clause(sub_queries) => {
|
||||
let default_occur = self.default_occur();
|
||||
let mut logical_sub_queries: Vec<(Occur, LogicalAST)> = Vec::new();
|
||||
let mut logical_sub_queries: Vec<(Occur, LogicalAst)> = Vec::new();
|
||||
for (occur_opt, sub_ast) in sub_queries {
|
||||
let sub_ast = self.compute_logical_ast_with_occur(sub_ast)?;
|
||||
let occur = occur_opt.unwrap_or(default_occur);
|
||||
logical_sub_queries.push((occur, sub_ast));
|
||||
}
|
||||
Ok(LogicalAST::Clause(logical_sub_queries))
|
||||
Ok(LogicalAst::Clause(logical_sub_queries))
|
||||
}
|
||||
UserInputAST::Boost(ast, boost) => {
|
||||
UserInputAst::Boost(ast, boost) => {
|
||||
let ast = self.compute_logical_ast_with_occur(*ast)?;
|
||||
Ok(ast.boost(boost as Score))
|
||||
}
|
||||
UserInputAST::Leaf(leaf) => self.compute_logical_ast_from_leaf(*leaf),
|
||||
UserInputAst::Leaf(leaf) => self.compute_logical_ast_from_leaf(*leaf),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -457,7 +457,7 @@ impl QueryParser {
|
||||
fn compute_logical_ast_from_leaf(
|
||||
&self,
|
||||
leaf: UserInputLeaf,
|
||||
) -> Result<LogicalAST, QueryParserError> {
|
||||
) -> Result<LogicalAst, QueryParserError> {
|
||||
match leaf {
|
||||
UserInputLeaf::Literal(literal) => {
|
||||
let term_phrases: Vec<(Field, String)> = match literal.field_name {
|
||||
@@ -476,22 +476,22 @@ impl QueryParser {
|
||||
}
|
||||
}
|
||||
};
|
||||
let mut asts: Vec<LogicalAST> = Vec::new();
|
||||
let mut asts: Vec<LogicalAst> = Vec::new();
|
||||
for (field, phrase) in term_phrases {
|
||||
if let Some(ast) = self.compute_logical_ast_for_leaf(field, &phrase)? {
|
||||
// Apply some field specific boost defined at the query parser level.
|
||||
let boost = self.field_boost(field);
|
||||
asts.push(LogicalAST::Leaf(Box::new(ast)).boost(boost));
|
||||
asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost));
|
||||
}
|
||||
}
|
||||
let result_ast: LogicalAST = if asts.len() == 1 {
|
||||
let result_ast: LogicalAst = if asts.len() == 1 {
|
||||
asts.into_iter().next().unwrap()
|
||||
} else {
|
||||
LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
|
||||
LogicalAst::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
|
||||
};
|
||||
Ok(result_ast)
|
||||
}
|
||||
UserInputLeaf::All => Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::All))),
|
||||
UserInputLeaf::All => Ok(LogicalAst::Leaf(Box::new(LogicalLiteral::All))),
|
||||
UserInputLeaf::Range {
|
||||
field,
|
||||
lower,
|
||||
@@ -504,7 +504,7 @@ impl QueryParser {
|
||||
let boost = self.field_boost(field);
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let value_type = field_entry.field_type().value_type();
|
||||
let logical_ast = LogicalAST::Leaf(Box::new(LogicalLiteral::Range {
|
||||
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
|
||||
field,
|
||||
value_type,
|
||||
lower: self.resolve_bound(field, &lower)?,
|
||||
@@ -516,7 +516,7 @@ impl QueryParser {
|
||||
let result_ast = if clauses.len() == 1 {
|
||||
clauses.pop().unwrap()
|
||||
} else {
|
||||
LogicalAST::Clause(
|
||||
LogicalAst::Clause(
|
||||
clauses
|
||||
.into_iter()
|
||||
.map(|clause| (Occur::Should, clause))
|
||||
@@ -547,9 +547,9 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<dyn Query> {
|
||||
}
|
||||
}
|
||||
|
||||
fn convert_to_query(logical_ast: LogicalAST) -> Box<dyn Query> {
|
||||
fn convert_to_query(logical_ast: LogicalAst) -> Box<dyn Query> {
|
||||
match trim_ast(logical_ast) {
|
||||
Some(LogicalAST::Clause(trimmed_clause)) => {
|
||||
Some(LogicalAst::Clause(trimmed_clause)) => {
|
||||
let occur_subqueries = trimmed_clause
|
||||
.into_iter()
|
||||
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
|
||||
@@ -560,10 +560,10 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<dyn Query> {
|
||||
);
|
||||
Box::new(BooleanQuery::new(occur_subqueries))
|
||||
}
|
||||
Some(LogicalAST::Leaf(trimmed_logical_literal)) => {
|
||||
Some(LogicalAst::Leaf(trimmed_logical_literal)) => {
|
||||
convert_literal_to_query(*trimmed_logical_literal)
|
||||
}
|
||||
Some(LogicalAST::Boost(ast, boost)) => {
|
||||
Some(LogicalAst::Boost(ast, boost)) => {
|
||||
let query = convert_to_query(*ast);
|
||||
let boosted_query = BoostQuery::new(query, boost);
|
||||
Box::new(boosted_query)
|
||||
@@ -632,7 +632,7 @@ mod test {
|
||||
fn parse_query_to_logical_ast(
|
||||
query: &str,
|
||||
default_conjunction: bool,
|
||||
) -> Result<LogicalAST, QueryParserError> {
|
||||
) -> Result<LogicalAst, QueryParserError> {
|
||||
let mut query_parser = make_query_parser();
|
||||
if default_conjunction {
|
||||
query_parser.set_conjunction_by_default();
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use super::term_weight::TermWeight;
|
||||
use crate::query::bm25::BM25Weight;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::Weight;
|
||||
use crate::query::{Explanation, Query};
|
||||
use crate::schema::IndexRecordOption;
|
||||
@@ -102,10 +102,10 @@ impl TermQuery {
|
||||
}
|
||||
let bm25_weight;
|
||||
if scoring_enabled {
|
||||
bm25_weight = BM25Weight::for_terms(searcher, &[term])?;
|
||||
bm25_weight = Bm25Weight::for_terms(searcher, &[term])?;
|
||||
} else {
|
||||
bm25_weight =
|
||||
BM25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32);
|
||||
Bm25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32);
|
||||
}
|
||||
let index_record_option = if scoring_enabled {
|
||||
self.index_record_option
|
||||
|
||||
@@ -6,20 +6,20 @@ use crate::Score;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::SegmentPostings;
|
||||
use crate::postings::{FreqReadingOption, Postings};
|
||||
use crate::query::bm25::BM25Weight;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct TermScorer {
|
||||
postings: SegmentPostings,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
similarity_weight: BM25Weight,
|
||||
similarity_weight: Bm25Weight,
|
||||
}
|
||||
|
||||
impl TermScorer {
|
||||
pub fn new(
|
||||
postings: SegmentPostings,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
similarity_weight: BM25Weight,
|
||||
similarity_weight: Bm25Weight,
|
||||
) -> TermScorer {
|
||||
TermScorer {
|
||||
postings,
|
||||
@@ -36,7 +36,7 @@ impl TermScorer {
|
||||
pub fn create_for_test(
|
||||
doc_and_tfs: &[(DocId, u32)],
|
||||
fieldnorms: &[u32],
|
||||
similarity_weight: BM25Weight,
|
||||
similarity_weight: Bm25Weight,
|
||||
) -> TermScorer {
|
||||
assert!(!doc_and_tfs.is_empty());
|
||||
assert!(
|
||||
@@ -131,7 +131,7 @@ mod tests {
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::{BM25Weight, Scorer, TermQuery};
|
||||
use crate::query::{Bm25Weight, Scorer, TermQuery};
|
||||
use crate::schema::{IndexRecordOption, Schema, TEXT};
|
||||
use crate::Score;
|
||||
use crate::{assert_nearly_equals, Index, Searcher, SegmentId, Term};
|
||||
@@ -141,7 +141,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_term_scorer_max_score() -> crate::Result<()> {
|
||||
let bm25_weight = BM25Weight::for_one_term(3, 6, 10.0);
|
||||
let bm25_weight = Bm25Weight::for_one_term(3, 6, 10.0);
|
||||
let mut term_scorer = TermScorer::create_for_test(
|
||||
&[(2, 3), (3, 12), (7, 8)],
|
||||
&[0, 0, 10, 12, 0, 0, 0, 100],
|
||||
@@ -167,7 +167,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_term_scorer_shallow_advance() -> crate::Result<()> {
|
||||
let bm25_weight = BM25Weight::for_one_term(300, 1024, 10.0);
|
||||
let bm25_weight = Bm25Weight::for_one_term(300, 1024, 10.0);
|
||||
let mut doc_and_tfs = vec![];
|
||||
for i in 0u32..300u32 {
|
||||
let doc = i * 10;
|
||||
@@ -205,7 +205,7 @@ mod tests {
|
||||
// Average fieldnorm is over the entire index,
|
||||
// not necessarily the docs that are in the posting list.
|
||||
// For this reason we multiply by 1.1 to make a realistic value.
|
||||
let bm25_weight = BM25Weight::for_one_term(term_doc_freq as u64,
|
||||
let bm25_weight = Bm25Weight::for_one_term(term_doc_freq as u64,
|
||||
term_doc_freq as u64 * 10u64,
|
||||
average_fieldnorm);
|
||||
|
||||
@@ -240,7 +240,7 @@ mod tests {
|
||||
doc_tfs.push((258, 1u32));
|
||||
|
||||
let fieldnorms: Vec<u32> = std::iter::repeat(20u32).take(300).collect();
|
||||
let bm25_weight = BM25Weight::for_one_term(10, 129, 20.0);
|
||||
let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
|
||||
let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
|
||||
assert_nearly_equals!(docs.block_max_score(), 2.5161593);
|
||||
docs.shallow_seek(135);
|
||||
|
||||
@@ -3,7 +3,7 @@ use crate::core::SegmentReader;
|
||||
use crate::docset::DocSet;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::SegmentPostings;
|
||||
use crate::query::bm25::BM25Weight;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::weight::for_each_scorer;
|
||||
use crate::query::Weight;
|
||||
@@ -15,7 +15,7 @@ use crate::{DocId, Score};
|
||||
pub struct TermWeight {
|
||||
term: Term,
|
||||
index_record_option: IndexRecordOption,
|
||||
similarity_weight: BM25Weight,
|
||||
similarity_weight: Bm25Weight,
|
||||
scoring_enabled: bool,
|
||||
}
|
||||
|
||||
@@ -88,7 +88,7 @@ impl TermWeight {
|
||||
pub fn new(
|
||||
term: Term,
|
||||
index_record_option: IndexRecordOption,
|
||||
similarity_weight: BM25Weight,
|
||||
similarity_weight: Bm25Weight,
|
||||
scoring_enabled: bool,
|
||||
) -> TermWeight {
|
||||
TermWeight {
|
||||
|
||||
@@ -309,7 +309,7 @@ impl Schema {
|
||||
} else {
|
||||
format!("{:?}...", &doc_json[0..20])
|
||||
};
|
||||
DocParsingError::NotJSON(doc_json_sample)
|
||||
DocParsingError::NotJson(doc_json_sample)
|
||||
})?;
|
||||
|
||||
let mut doc = Document::default();
|
||||
@@ -394,7 +394,7 @@ impl<'de> Deserialize<'de> for Schema {
|
||||
pub enum DocParsingError {
|
||||
/// The payload given is not valid JSON.
|
||||
#[error("The provided string is not valid JSON")]
|
||||
NotJSON(String),
|
||||
NotJson(String),
|
||||
/// One of the value node could not be parsed.
|
||||
#[error("The field '{0:?}' could not be parsed: {1:?}")]
|
||||
ValueError(String, ValueParsingError),
|
||||
@@ -408,7 +408,7 @@ mod tests {
|
||||
|
||||
use crate::schema::field_type::ValueParsingError;
|
||||
use crate::schema::int_options::Cardinality::SingleValue;
|
||||
use crate::schema::schema::DocParsingError::NotJSON;
|
||||
use crate::schema::schema::DocParsingError::NotJson;
|
||||
use crate::schema::*;
|
||||
use matches::{assert_matches, matches};
|
||||
use serde_json;
|
||||
@@ -737,7 +737,7 @@ mod tests {
|
||||
"count": 50,
|
||||
}"#,
|
||||
);
|
||||
assert_matches!(json_err, Err(NotJSON(_)));
|
||||
assert_matches!(json_err, Err(NotJson(_)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -69,7 +69,6 @@ pub struct SegmentSpaceUsage {
|
||||
termdict: PerFieldSpaceUsage,
|
||||
postings: PerFieldSpaceUsage,
|
||||
positions: PerFieldSpaceUsage,
|
||||
positions_idx: PerFieldSpaceUsage,
|
||||
fast_fields: PerFieldSpaceUsage,
|
||||
fieldnorms: PerFieldSpaceUsage,
|
||||
|
||||
@@ -87,7 +86,6 @@ impl SegmentSpaceUsage {
|
||||
termdict: PerFieldSpaceUsage,
|
||||
postings: PerFieldSpaceUsage,
|
||||
positions: PerFieldSpaceUsage,
|
||||
positions_idx: PerFieldSpaceUsage,
|
||||
fast_fields: PerFieldSpaceUsage,
|
||||
fieldnorms: PerFieldSpaceUsage,
|
||||
store: StoreSpaceUsage,
|
||||
@@ -105,7 +103,6 @@ impl SegmentSpaceUsage {
|
||||
termdict,
|
||||
postings,
|
||||
positions,
|
||||
positions_idx,
|
||||
fast_fields,
|
||||
fieldnorms,
|
||||
store,
|
||||
@@ -122,14 +119,13 @@ impl SegmentSpaceUsage {
|
||||
use self::ComponentSpaceUsage::*;
|
||||
use crate::SegmentComponent::*;
|
||||
match component {
|
||||
POSTINGS => PerField(self.postings().clone()),
|
||||
POSITIONS => PerField(self.positions().clone()),
|
||||
POSITIONSSKIP => PerField(self.positions_skip_idx().clone()),
|
||||
FASTFIELDS => PerField(self.fast_fields().clone()),
|
||||
FIELDNORMS => PerField(self.fieldnorms().clone()),
|
||||
TERMS => PerField(self.termdict().clone()),
|
||||
STORE => Store(self.store().clone()),
|
||||
DELETE => Basic(self.deletes()),
|
||||
Postings => PerField(self.postings().clone()),
|
||||
Positions => PerField(self.positions().clone()),
|
||||
FastFields => PerField(self.fast_fields().clone()),
|
||||
FieldNorms => PerField(self.fieldnorms().clone()),
|
||||
Terms => PerField(self.termdict().clone()),
|
||||
SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()),
|
||||
Delete => Basic(self.deletes()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -153,11 +149,6 @@ impl SegmentSpaceUsage {
|
||||
&self.positions
|
||||
}
|
||||
|
||||
/// Space usage for positions skip idx
|
||||
pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage {
|
||||
&self.positions_idx
|
||||
}
|
||||
|
||||
/// Space usage for fast fields
|
||||
pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
|
||||
&self.fast_fields
|
||||
@@ -358,7 +349,6 @@ mod test {
|
||||
expect_single_field(segment.termdict(), &name, 1, 512);
|
||||
expect_single_field(segment.postings(), &name, 1, 512);
|
||||
assert_eq!(0, segment.positions().total());
|
||||
assert_eq!(0, segment.positions_skip_idx().total());
|
||||
expect_single_field(segment.fast_fields(), &name, 1, 512);
|
||||
expect_single_field(segment.fieldnorms(), &name, 1, 512);
|
||||
// TODO: understand why the following fails
|
||||
@@ -398,7 +388,6 @@ mod test {
|
||||
expect_single_field(segment.termdict(), &name, 1, 512);
|
||||
expect_single_field(segment.postings(), &name, 1, 512);
|
||||
expect_single_field(segment.positions(), &name, 1, 512);
|
||||
expect_single_field(segment.positions_skip_idx(), &name, 1, 512);
|
||||
assert_eq!(0, segment.fast_fields().total());
|
||||
expect_single_field(segment.fieldnorms(), &name, 1, 512);
|
||||
// TODO: understand why the following fails
|
||||
@@ -437,7 +426,6 @@ mod test {
|
||||
assert_eq!(0, segment.termdict().total());
|
||||
assert_eq!(0, segment.postings().total());
|
||||
assert_eq!(0, segment.positions().total());
|
||||
assert_eq!(0, segment.positions_skip_idx().total());
|
||||
assert_eq!(0, segment.fast_fields().total());
|
||||
assert_eq!(0, segment.fieldnorms().total());
|
||||
assert!(segment.store().total() > 0);
|
||||
@@ -483,7 +471,6 @@ mod test {
|
||||
expect_single_field(segment_space_usage.termdict(), &name, 1, 512);
|
||||
expect_single_field(segment_space_usage.postings(), &name, 1, 512);
|
||||
assert_eq!(0, segment_space_usage.positions().total());
|
||||
assert_eq!(0, segment_space_usage.positions_skip_idx().total());
|
||||
assert_eq!(0, segment_space_usage.fast_fields().total());
|
||||
expect_single_field(segment_space_usage.fieldnorms(), &name, 1, 512);
|
||||
assert!(segment_space_usage.deletes() > 0);
|
||||
|
||||
@@ -98,7 +98,7 @@ use self::compression_snap::{compress, decompress};
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::directory::{Directory, RAMDirectory, WritePtr};
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::schema::Document;
|
||||
use crate::schema::FieldValue;
|
||||
use crate::schema::Schema;
|
||||
@@ -146,7 +146,7 @@ pub mod tests {
|
||||
#[test]
|
||||
fn test_store() -> crate::Result<()> {
|
||||
let path = Path::new("store");
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
let store_wrt = directory.open_write(path)?;
|
||||
let schema = write_lorem_ipsum_store(store_wrt, 1_000);
|
||||
let field_title = schema.get_field("title").unwrap();
|
||||
@@ -172,7 +172,7 @@ mod bench {
|
||||
|
||||
use super::tests::write_lorem_ipsum_store;
|
||||
use crate::directory::Directory;
|
||||
use crate::directory::RAMDirectory;
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::store::StoreReader;
|
||||
use std::path::Path;
|
||||
use test::Bencher;
|
||||
@@ -180,7 +180,7 @@ mod bench {
|
||||
#[bench]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn bench_store_encode(b: &mut Bencher) {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("store");
|
||||
b.iter(|| {
|
||||
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
|
||||
@@ -190,7 +190,7 @@ mod bench {
|
||||
|
||||
#[bench]
|
||||
fn bench_store_decode(b: &mut Bencher) {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("store");
|
||||
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
|
||||
let store_file = directory.open_read(path).unwrap();
|
||||
|
||||
@@ -124,7 +124,7 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::Field;
|
||||
use crate::{directory::RAMDirectory, store::tests::write_lorem_ipsum_store, Directory};
|
||||
use crate::{directory::RamDirectory, store::tests::write_lorem_ipsum_store, Directory};
|
||||
use std::path::Path;
|
||||
|
||||
fn get_text_field<'a>(doc: &'a Document, field: &'a Field) -> Option<&'a str> {
|
||||
@@ -133,7 +133,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_store_lru_cache() -> crate::Result<()> {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("store");
|
||||
let writer = directory.open_write(path)?;
|
||||
let schema = write_lorem_ipsum_store(writer, 500);
|
||||
|
||||
@@ -15,7 +15,7 @@ struct TermInfoBlockMeta {
|
||||
ref_term_info: TermInfo,
|
||||
doc_freq_nbits: u8,
|
||||
postings_offset_nbits: u8,
|
||||
positions_idx_nbits: u8,
|
||||
positions_offset_nbits: u8,
|
||||
}
|
||||
|
||||
impl BinarySerializable for TermInfoBlockMeta {
|
||||
@@ -25,7 +25,7 @@ impl BinarySerializable for TermInfoBlockMeta {
|
||||
write.write_all(&[
|
||||
self.doc_freq_nbits,
|
||||
self.postings_offset_nbits,
|
||||
self.positions_idx_nbits,
|
||||
self.positions_offset_nbits,
|
||||
])?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -40,7 +40,7 @@ impl BinarySerializable for TermInfoBlockMeta {
|
||||
ref_term_info,
|
||||
doc_freq_nbits: buffer[0],
|
||||
postings_offset_nbits: buffer[1],
|
||||
positions_idx_nbits: buffer[2],
|
||||
positions_offset_nbits: buffer[2],
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -52,7 +52,7 @@ impl FixedSize for TermInfoBlockMeta {
|
||||
|
||||
impl TermInfoBlockMeta {
|
||||
fn num_bits(&self) -> u8 {
|
||||
self.doc_freq_nbits + self.postings_offset_nbits + self.positions_idx_nbits
|
||||
self.doc_freq_nbits + self.postings_offset_nbits + self.positions_offset_nbits
|
||||
}
|
||||
|
||||
// Here inner_offset is the offset within the block, WITHOUT the first term_info.
|
||||
@@ -63,23 +63,30 @@ impl TermInfoBlockMeta {
|
||||
let num_bits = self.num_bits() as usize;
|
||||
|
||||
let posting_start_addr = num_bits * inner_offset;
|
||||
// the stop offset is the start offset of the next term info.
|
||||
let posting_stop_addr = posting_start_addr + num_bits;
|
||||
let doc_freq_addr = posting_start_addr + self.postings_offset_nbits as usize;
|
||||
let positions_idx_addr = doc_freq_addr + self.doc_freq_nbits as usize;
|
||||
// the posting_start is the posting_start of the next term info.
|
||||
let posting_end_addr = posting_start_addr + num_bits;
|
||||
let positions_start_addr = posting_start_addr + self.postings_offset_nbits as usize;
|
||||
// the position_end is the positions_start of the next term info.
|
||||
let positions_end_addr = positions_start_addr + num_bits as usize;
|
||||
|
||||
let doc_freq_addr = positions_start_addr + self.positions_offset_nbits as usize;
|
||||
|
||||
let postings_start_offset = self.ref_term_info.postings_range.start
|
||||
+ extract_bits(data, posting_start_addr, self.postings_offset_nbits) as usize;
|
||||
let postings_end_offset = self.ref_term_info.postings_range.start
|
||||
+ extract_bits(data, posting_stop_addr, self.postings_offset_nbits) as usize;
|
||||
+ extract_bits(data, posting_end_addr, self.postings_offset_nbits) as usize;
|
||||
|
||||
let positions_start_offset = self.ref_term_info.positions_range.start
|
||||
+ extract_bits(data, positions_start_addr, self.positions_offset_nbits) as usize;
|
||||
let positions_end_offset = self.ref_term_info.positions_range.start
|
||||
+ extract_bits(data, positions_end_addr, self.positions_offset_nbits) as usize;
|
||||
|
||||
let doc_freq = extract_bits(data, doc_freq_addr, self.doc_freq_nbits) as u32;
|
||||
let positions_idx = self.ref_term_info.positions_idx
|
||||
+ extract_bits(data, positions_idx_addr, self.positions_idx_nbits);
|
||||
|
||||
TermInfo {
|
||||
doc_freq,
|
||||
postings_range: postings_start_offset..postings_end_offset,
|
||||
positions_idx,
|
||||
positions_range: positions_start_offset..positions_end_offset,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -167,14 +174,13 @@ fn bitpack_serialize<W: Write>(
|
||||
write,
|
||||
)?;
|
||||
bit_packer.write(
|
||||
u64::from(term_info.doc_freq),
|
||||
term_info_block_meta.doc_freq_nbits,
|
||||
term_info.positions_range.start as u64,
|
||||
term_info_block_meta.positions_offset_nbits,
|
||||
write,
|
||||
)?;
|
||||
|
||||
bit_packer.write(
|
||||
term_info.positions_idx,
|
||||
term_info_block_meta.positions_idx_nbits,
|
||||
u64::from(term_info.doc_freq),
|
||||
term_info_block_meta.doc_freq_nbits,
|
||||
write,
|
||||
)?;
|
||||
Ok(())
|
||||
@@ -201,29 +207,29 @@ impl TermInfoStoreWriter {
|
||||
};
|
||||
let postings_end_offset =
|
||||
last_term_info.postings_range.end - ref_term_info.postings_range.start;
|
||||
let positions_end_offset =
|
||||
last_term_info.positions_range.end - ref_term_info.positions_range.start;
|
||||
for term_info in &mut self.term_infos[1..] {
|
||||
term_info.postings_range.start -= ref_term_info.postings_range.start;
|
||||
term_info.positions_idx -= ref_term_info.positions_idx;
|
||||
term_info.positions_range.start -= ref_term_info.positions_range.start;
|
||||
}
|
||||
|
||||
let mut max_doc_freq: u32 = 0u32;
|
||||
let max_postings_offset: usize = postings_end_offset;
|
||||
let max_positions_idx: u64 = last_term_info.positions_idx;
|
||||
|
||||
for term_info in &self.term_infos[1..] {
|
||||
max_doc_freq = cmp::max(max_doc_freq, term_info.doc_freq);
|
||||
}
|
||||
|
||||
let max_doc_freq_nbits: u8 = compute_num_bits(u64::from(max_doc_freq));
|
||||
let max_postings_offset_nbits = compute_num_bits(max_postings_offset as u64);
|
||||
let max_positions_idx_nbits = compute_num_bits(max_positions_idx);
|
||||
let max_postings_offset_nbits = compute_num_bits(postings_end_offset as u64);
|
||||
let max_positions_offset_nbits = compute_num_bits(positions_end_offset as u64);
|
||||
|
||||
let term_info_block_meta = TermInfoBlockMeta {
|
||||
offset: self.buffer_term_infos.len() as u64,
|
||||
ref_term_info,
|
||||
doc_freq_nbits: max_doc_freq_nbits,
|
||||
postings_offset_nbits: max_postings_offset_nbits,
|
||||
positions_idx_nbits: max_positions_idx_nbits,
|
||||
positions_offset_nbits: max_positions_offset_nbits,
|
||||
};
|
||||
|
||||
term_info_block_meta.serialize(&mut self.buffer_block_metas)?;
|
||||
@@ -236,11 +242,17 @@ impl TermInfoStoreWriter {
|
||||
)?;
|
||||
}
|
||||
|
||||
// We still need to serialize the end offset for postings & positions.
|
||||
bit_packer.write(
|
||||
postings_end_offset as u64,
|
||||
term_info_block_meta.postings_offset_nbits,
|
||||
&mut self.buffer_term_infos,
|
||||
)?;
|
||||
bit_packer.write(
|
||||
positions_end_offset as u64,
|
||||
term_info_block_meta.positions_offset_nbits,
|
||||
&mut self.buffer_term_infos,
|
||||
)?;
|
||||
|
||||
// Block need end up at the end of a byte.
|
||||
bit_packer.flush(&mut self.buffer_term_infos)?;
|
||||
@@ -313,11 +325,11 @@ mod tests {
|
||||
ref_term_info: TermInfo {
|
||||
doc_freq: 512,
|
||||
postings_range: 51..57,
|
||||
positions_idx: 3584,
|
||||
positions_range: 110..134,
|
||||
},
|
||||
doc_freq_nbits: 10,
|
||||
postings_offset_nbits: 5,
|
||||
positions_idx_nbits: 11,
|
||||
positions_offset_nbits: 8,
|
||||
};
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
term_info_block_meta.serialize(&mut buffer).unwrap();
|
||||
@@ -335,7 +347,7 @@ mod tests {
|
||||
let term_info = TermInfo {
|
||||
doc_freq: i as u32,
|
||||
postings_range: offset(i)..offset(i + 1),
|
||||
positions_idx: (i * 7) as u64,
|
||||
positions_range: offset(i) * 3..offset(i + 1) * 3,
|
||||
};
|
||||
store_writer.write_term_info(&term_info)?;
|
||||
term_infos.push(term_info);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};
|
||||
|
||||
use crate::directory::{Directory, FileSlice, RAMDirectory, TerminatingWrite};
|
||||
use crate::directory::{Directory, FileSlice, RamDirectory, TerminatingWrite};
|
||||
use crate::postings::TermInfo;
|
||||
|
||||
use std::path::PathBuf;
|
||||
@@ -13,7 +13,7 @@ fn make_term_info(term_ord: u64) -> TermInfo {
|
||||
TermInfo {
|
||||
doc_freq: term_ord as u32,
|
||||
postings_range: offset(term_ord)..offset(term_ord + 1),
|
||||
positions_idx: offset(term_ord) as u64 * 2u64,
|
||||
positions_range: offset(term_ord) * 2..offset(term_ord + 1) * 2,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,7 +34,7 @@ fn test_term_ordinals() -> crate::Result<()> {
|
||||
"Sweden",
|
||||
"Switzerland",
|
||||
];
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
let path = PathBuf::from("TermDictionary");
|
||||
{
|
||||
let write = directory.open_write(&path)?;
|
||||
@@ -57,7 +57,7 @@ fn test_term_ordinals() -> crate::Result<()> {
|
||||
|
||||
#[test]
|
||||
fn test_term_dictionary_simple() -> crate::Result<()> {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
let path = PathBuf::from("TermDictionary");
|
||||
{
|
||||
let write = directory.open_write(&path)?;
|
||||
@@ -380,7 +380,7 @@ fn test_stream_term_ord() -> crate::Result<()> {
|
||||
let termdict = stream_range_test_dict()?;
|
||||
let mut stream = termdict.stream()?;
|
||||
for b in 0u8..10u8 {
|
||||
assert!(stream.advance(), true);
|
||||
assert!(stream.advance());
|
||||
assert_eq!(stream.term_ord(), b as u64);
|
||||
assert_eq!(stream.key(), &[b]);
|
||||
}
|
||||
@@ -390,7 +390,7 @@ fn test_stream_term_ord() -> crate::Result<()> {
|
||||
|
||||
#[test]
|
||||
fn test_automaton_search() -> crate::Result<()> {
|
||||
use crate::query::DFAWrapper;
|
||||
use crate::query::DfaWrapper;
|
||||
use levenshtein_automata::LevenshteinAutomatonBuilder;
|
||||
|
||||
const COUNTRIES: [&'static str; 7] = [
|
||||
@@ -403,7 +403,7 @@ fn test_automaton_search() -> crate::Result<()> {
|
||||
"Switzerland",
|
||||
];
|
||||
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
let path = PathBuf::from("TermDictionary");
|
||||
{
|
||||
let write = directory.open_write(&path)?;
|
||||
@@ -418,7 +418,7 @@ fn test_automaton_search() -> crate::Result<()> {
|
||||
|
||||
// We can now build an entire dfa.
|
||||
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true);
|
||||
let automaton = DFAWrapper(lev_automaton_builder.build_dfa("Spaen"));
|
||||
let automaton = DfaWrapper(lev_automaton_builder.build_dfa("Spaen"));
|
||||
|
||||
let mut range = term_dict.search(automaton).into_stream()?;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user