Merge branch 'main' into indexmeta

This commit is contained in:
Paul Masurel
2021-04-26 14:34:58 +09:00
committed by GitHub
74 changed files with 839 additions and 797 deletions

View File

@@ -180,7 +180,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
/// Return true iff at least K documents have gone through
/// the collector.
#[inline(always)]
#[inline]
pub(crate) fn at_capacity(&self) -> bool {
self.heap.len() >= self.limit
}
@@ -189,7 +189,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
///
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it
/// will compare the lowest scoring item with the given one and keep whichever is greater.
#[inline(always)]
#[inline]
pub fn collect(&mut self, doc: DocId, feature: T) {
if self.at_capacity() {
// It's ok to unwrap as long as a limit of 0 is forbidden.

View File

@@ -59,19 +59,19 @@ impl TinySet {
/// Creates a new `TinySet` containing only one element
/// within `[0; 64[`
#[inline(always)]
#[inline]
pub fn singleton(el: u32) -> TinySet {
TinySet(1u64 << u64::from(el))
}
/// Insert a new element within [0..64[
#[inline(always)]
#[inline]
pub fn insert(self, el: u32) -> TinySet {
self.union(TinySet::singleton(el))
}
/// Insert a new element within [0..64[
#[inline(always)]
#[inline]
pub fn insert_mut(&mut self, el: u32) -> bool {
let old = *self;
*self = old.insert(el);
@@ -79,20 +79,20 @@ impl TinySet {
}
/// Returns the union of two tinysets
#[inline(always)]
#[inline]
pub fn union(self, other: TinySet) -> TinySet {
TinySet(self.0 | other.0)
}
/// Returns true iff the `TinySet` is empty.
#[inline(always)]
#[inline]
pub fn is_empty(self) -> bool {
self.0 == 0u64
}
/// Returns the lowest element in the `TinySet`
/// and removes it.
#[inline(always)]
#[inline]
pub fn pop_lowest(&mut self) -> Option<u32> {
if self.is_empty() {
None

View File

@@ -190,7 +190,7 @@ mod test {
use super::{CompositeFile, CompositeWrite};
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::directory::{Directory, RAMDirectory};
use crate::directory::{Directory, RamDirectory};
use crate::schema::Field;
use std::io::Write;
use std::path::Path;
@@ -198,7 +198,7 @@ mod test {
#[test]
fn test_composite_file() -> crate::Result<()> {
let path = Path::new("test_path");
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
{
let w = directory.open_write(path).unwrap();
let mut composite_write = CompositeWrite::wrap(w);

View File

@@ -99,13 +99,13 @@ const HIGHEST_BIT: u64 = 1 << 63;
///
/// # See also
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
#[inline(always)]
#[inline]
pub fn i64_to_u64(val: i64) -> u64 {
(val as u64) ^ HIGHEST_BIT
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline(always)]
#[inline]
pub fn u64_to_i64(val: u64) -> i64 {
(val ^ HIGHEST_BIT) as i64
}
@@ -127,7 +127,7 @@ pub fn u64_to_i64(val: u64) -> i64 {
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline(always)]
#[inline]
pub fn f64_to_u64(val: f64) -> u64 {
let bits = val.to_bits();
if val.is_sign_positive() {
@@ -138,7 +138,7 @@ pub fn f64_to_u64(val: f64) -> u64 {
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline(always)]
#[inline]
pub fn u64_to_f64(val: u64) -> f64 {
f64::from_bits(if val & HIGHEST_BIT != 0 {
val ^ HIGHEST_BIT

View File

@@ -10,7 +10,7 @@ use crate::directory::ManagedDirectory;
#[cfg(feature = "mmap")]
use crate::directory::MmapDirectory;
use crate::directory::INDEX_WRITER_LOCK;
use crate::directory::{Directory, RAMDirectory};
use crate::directory::{Directory, RamDirectory};
use crate::error::DataCorruption;
use crate::error::TantivyError;
use crate::indexer::index_writer::HEAP_SIZE_MIN;
@@ -222,7 +222,7 @@ impl Index {
self.set_multithread_executor(default_num_threads)
}
/// Creates a new index using the `RAMDirectory`.
/// Creates a new index using the `RamDirectory`.
///
/// The index will be allocated in anonymous memory.
/// This should only be used for unit tests.
@@ -256,7 +256,7 @@ impl Index {
/// is destroyed.
///
/// The temp directory is only used for testing the `MmapDirectory`.
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
/// For other unit tests, prefer the `RamDirectory`, see: `create_in_ram`.
#[cfg(feature = "mmap")]
pub fn create_from_tempdir(schema: Schema) -> crate::Result<Index> {
IndexBuilder::new().schema(schema).create_from_tempdir()
@@ -390,7 +390,7 @@ impl Index {
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
///
/// # Errors
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IOError`.
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
///
/// # Panics
/// If the heap size per thread is too small, panics.
@@ -524,7 +524,7 @@ impl fmt::Debug for Index {
#[cfg(test)]
mod tests {
use crate::directory::{RAMDirectory, WatchCallback};
use crate::directory::{RamDirectory, WatchCallback};
use crate::schema::Field;
use crate::schema::{Schema, INDEXED, TEXT};
use crate::IndexReader;
@@ -548,7 +548,7 @@ mod tests {
#[test]
fn test_index_exists() {
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
assert!(!Index::exists(&directory).unwrap());
assert!(Index::create(
directory.clone(),
@@ -561,7 +561,7 @@ mod tests {
#[test]
fn open_or_create_should_create() {
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
assert!(!Index::exists(&directory).unwrap());
assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory).unwrap());
@@ -569,7 +569,7 @@ mod tests {
#[test]
fn open_or_create_should_open() {
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
assert!(Index::create(
directory.clone(),
throw_away_schema(),
@@ -582,7 +582,7 @@ mod tests {
#[test]
fn create_should_wipeoff_existing() {
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
assert!(Index::create(
directory.clone(),
throw_away_schema(),
@@ -600,7 +600,7 @@ mod tests {
#[test]
fn open_or_create_exists_but_schema_does_not_match() {
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
assert!(Index::create(
directory.clone(),
throw_away_schema(),
@@ -738,7 +738,7 @@ mod tests {
#[cfg(not(target_os = "windows"))]
#[test]
fn garbage_collect_works_as_intended() {
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create(directory.clone(), schema, IndexSettings::default()).unwrap();

View File

@@ -108,14 +108,13 @@ impl SegmentMeta {
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
let mut path = self.id().uuid_string();
path.push_str(&*match component {
SegmentComponent::POSTINGS => ".idx".to_string(),
SegmentComponent::POSITIONS => ".pos".to_string(),
SegmentComponent::POSITIONSSKIP => ".posidx".to_string(),
SegmentComponent::TERMS => ".term".to_string(),
SegmentComponent::STORE => ".store".to_string(),
SegmentComponent::FASTFIELDS => ".fast".to_string(),
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
SegmentComponent::Postings => ".idx".to_string(),
SegmentComponent::Positions => ".pos".to_string(),
SegmentComponent::Terms => ".term".to_string(),
SegmentComponent::Store => ".store".to_string(),
SegmentComponent::FastFields => ".fast".to_string(),
SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
});
PathBuf::from(path)
}

View File

@@ -26,7 +26,6 @@ pub struct InvertedIndexReader {
termdict: TermDictionary,
postings_file_slice: FileSlice,
positions_file_slice: FileSlice,
positions_idx_file_slice: FileSlice,
record_option: IndexRecordOption,
total_num_tokens: u64,
}
@@ -37,7 +36,6 @@ impl InvertedIndexReader {
termdict: TermDictionary,
postings_file_slice: FileSlice,
positions_file_slice: FileSlice,
positions_idx_file_slice: FileSlice,
record_option: IndexRecordOption,
) -> io::Result<InvertedIndexReader> {
let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8);
@@ -46,7 +44,6 @@ impl InvertedIndexReader {
termdict,
postings_file_slice: postings_body,
positions_file_slice,
positions_idx_file_slice,
record_option,
total_num_tokens,
})
@@ -59,7 +56,6 @@ impl InvertedIndexReader {
termdict: TermDictionary::empty(),
postings_file_slice: FileSlice::empty(),
positions_file_slice: FileSlice::empty(),
positions_idx_file_slice: FileSlice::empty(),
record_option,
total_num_tokens: 0u64,
}
@@ -141,12 +137,12 @@ impl InvertedIndexReader {
option: IndexRecordOption,
) -> io::Result<SegmentPostings> {
let block_postings = self.read_block_postings_from_terminfo(term_info, option)?;
let position_stream = {
let position_reader = {
if option.has_positions() {
let position_reader = self.positions_file_slice.clone();
let skip_reader = self.positions_idx_file_slice.clone();
let position_reader =
PositionReader::new(position_reader, skip_reader, term_info.positions_idx)?;
let positions_data = self
.positions_file_slice
.read_bytes_slice(term_info.positions_range.clone())?;
let position_reader = PositionReader::open(positions_data)?;
Some(position_reader)
} else {
None
@@ -154,7 +150,7 @@ impl InvertedIndexReader {
};
Ok(SegmentPostings::from_block_postings(
block_postings,
position_stream,
position_reader,
))
}

View File

@@ -7,39 +7,36 @@ use std::slice;
#[derive(Copy, Clone)]
pub enum SegmentComponent {
/// Postings (or inverted list). Sorted lists of document ids, associated to terms
POSTINGS,
Postings,
/// Positions of terms in each document.
POSITIONS,
/// Index to seek within the position file
POSITIONSSKIP,
Positions,
/// Column-oriented random-access storage of fields.
FASTFIELDS,
FastFields,
/// Stores the sum of the length (in terms) of each field for each document.
/// Field norms are stored as a special u64 fast field.
FIELDNORMS,
FieldNorms,
/// Dictionary associating `Term`s to `TermInfo`s which is
/// simply an address into the `postings` file and the `positions` file.
TERMS,
Terms,
/// Row-oriented, compressed storage of the documents.
/// Accessing a document from the store is relatively slow, as it
/// requires to decompress the entire block it belongs to.
STORE,
Store,
/// Bitset describing which document of the segment is deleted.
DELETE,
Delete,
}
impl SegmentComponent {
/// Iterates through the components.
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,
SegmentComponent::POSITIONSSKIP,
SegmentComponent::FASTFIELDS,
SegmentComponent::FIELDNORMS,
SegmentComponent::TERMS,
SegmentComponent::STORE,
SegmentComponent::DELETE,
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
SegmentComponent::Postings,
SegmentComponent::Positions,
SegmentComponent::FastFields,
SegmentComponent::FieldNorms,
SegmentComponent::Terms,
SegmentComponent::Store,
SegmentComponent::Delete,
];
SEGMENT_COMPONENTS.iter()
}

View File

@@ -46,7 +46,6 @@ pub struct SegmentReader {
termdict_composite: CompositeFile,
postings_composite: CompositeFile,
positions_composite: CompositeFile,
positions_idx_composite: CompositeFile,
fast_fields_readers: Arc<FastFieldReaders>,
fieldnorm_readers: FieldNormReaders,
@@ -151,44 +150,36 @@ impl SegmentReader {
/// Open a new segment for reading.
pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
let termdict_file = segment.open_read(SegmentComponent::TERMS)?;
let termdict_file = segment.open_read(SegmentComponent::Terms)?;
let termdict_composite = CompositeFile::open(&termdict_file)?;
let store_file = segment.open_read(SegmentComponent::STORE)?;
let store_file = segment.open_read(SegmentComponent::Store)?;
fail_point!("SegmentReader::open#middle");
let postings_file = segment.open_read(SegmentComponent::POSTINGS)?;
let postings_file = segment.open_read(SegmentComponent::Postings)?;
let postings_composite = CompositeFile::open(&postings_file)?;
let positions_composite = {
if let Ok(positions_file) = segment.open_read(SegmentComponent::POSITIONS) {
if let Ok(positions_file) = segment.open_read(SegmentComponent::Positions) {
CompositeFile::open(&positions_file)?
} else {
CompositeFile::empty()
}
};
let positions_idx_composite = {
if let Ok(positions_skip_file) = segment.open_read(SegmentComponent::POSITIONSSKIP) {
CompositeFile::open(&positions_skip_file)?
} else {
CompositeFile::empty()
}
};
let schema = segment.schema();
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_field_readers =
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
let delete_data = segment.open_read(SegmentComponent::Delete)?;
let delete_bitset = DeleteBitSet::open(delete_data)?;
Some(delete_bitset)
} else {
@@ -207,7 +198,6 @@ impl SegmentReader {
store_file,
delete_bitset_opt,
positions_composite,
positions_idx_composite,
schema,
})
}
@@ -263,18 +253,15 @@ impl SegmentReader {
let positions_file = self
.positions_composite
.open_read(field)
.expect("Index corrupted. Failed to open field positions in composite file.");
let positions_idx_file = self
.positions_idx_composite
.open_read(field)
.expect("Index corrupted. Failed to open field positions in composite file.");
.ok_or_else(|| {
let error_msg = format!("Failed to open field {:?}'s positions in the composite file. Has the schema been modified?", field_entry.name());
DataCorruption::comment_only(error_msg)
})?;
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
TermDictionary::open(termdict_file)?,
postings_file,
positions_file,
positions_idx_file,
record_option,
)?);
@@ -319,7 +306,6 @@ impl SegmentReader {
self.termdict_composite.space_usage(),
self.postings_composite.space_usage(),
self.positions_composite.space_usage(),
self.positions_idx_composite.space_usage(),
self.fast_fields_readers.space_usage(),
self.fieldnorm_readers.space_usage(),
self.get_store_reader()?.space_usage(),

View File

@@ -70,7 +70,7 @@ impl Drop for DirectoryLockGuard {
enum TryAcquireLockError {
FileExists,
IOError(io::Error),
IoError(io::Error),
}
fn try_acquire_lock(
@@ -79,9 +79,9 @@ fn try_acquire_lock(
) -> Result<DirectoryLock, TryAcquireLockError> {
let mut write = directory.open_write(filepath).map_err(|e| match e {
OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists,
OpenWriteError::IOError { io_error, .. } => TryAcquireLockError::IOError(io_error),
OpenWriteError::IoError { io_error, .. } => TryAcquireLockError::IoError(io_error),
})?;
write.flush().map_err(TryAcquireLockError::IOError)?;
write.flush().map_err(TryAcquireLockError::IoError)?;
Ok(DirectoryLock::from(Box::new(DirectoryLockGuard {
directory: directory.box_clone(),
path: filepath.to_owned(),
@@ -106,7 +106,7 @@ fn retry_policy(is_blocking: bool) -> RetryPolicy {
///
/// - The [`MMapDirectory`](struct.MmapDirectory.html), this
/// should be your default choice.
/// - The [`RAMDirectory`](struct.RAMDirectory.html), which
/// - The [`RamDirectory`](struct.RamDirectory.html), which
/// should be used mostly for tests.
pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// Opens a file and returns a boxed `FileHandle`.
@@ -154,7 +154,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// Flush operation should also be persistent.
///
/// The user shall not rely on `Drop` triggering `flush`.
/// Note that `RAMDirectory` will panic! if `flush`
/// Note that `RamDirectory` will panic! if `flush`
/// was not called.
///
/// The file may not previously exist.
@@ -192,8 +192,8 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
return Err(LockError::LockBusy);
}
}
Err(TryAcquireLockError::IOError(io_error)) => {
return Err(LockError::IOError(io_error));
Err(TryAcquireLockError::IoError(io_error)) => {
return Err(LockError::IoError(io_error));
}
}
}

View File

@@ -12,9 +12,9 @@ pub enum LockError {
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
#[error("Could not acquire lock as it is already held, possibly by a different process.")]
LockBusy,
/// Trying to acquire a lock failed with an `IOError`
/// Trying to acquire a lock failed with an `IoError`
#[error("Failed to acquire the lock due to an io:Error.")]
IOError(io::Error),
IoError(io::Error),
}
/// Error that may occur when opening a directory
@@ -30,7 +30,7 @@ pub enum OpenDirectoryError {
#[error("Failed to create a temporary directory: '{0}'.")]
FailedToCreateTempDir(io::Error),
/// IoError
#[error("IOError '{io_error:?}' while create directory in: '{directory_path:?}'.")]
#[error("IoError '{io_error:?}' while create directory in: '{directory_path:?}'.")]
IoError {
/// underlying io Error.
io_error: io::Error,
@@ -48,8 +48,8 @@ pub enum OpenWriteError {
FileAlreadyExists(PathBuf),
/// Any kind of IO error that happens when
/// writing in the underlying IO device.
#[error("IOError '{io_error:?}' while opening file for write: '{filepath}'.")]
IOError {
#[error("IoError '{io_error:?}' while opening file for write: '{filepath}'.")]
IoError {
/// The underlying `io::Error`.
io_error: io::Error,
/// File path of the file that tantivy failed to open for write.
@@ -60,7 +60,7 @@ pub enum OpenWriteError {
impl OpenWriteError {
/// Wraps an io error.
pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
Self::IOError { io_error, filepath }
Self::IoError { io_error, filepath }
}
}
/// Type of index incompatibility between the library and the index found on disk
@@ -130,9 +130,9 @@ pub enum OpenReadError {
FileDoesNotExist(PathBuf),
/// Any kind of io::Error.
#[error(
"IOError: '{io_error:?}' happened while opening the following file for Read: {filepath}."
"IoError: '{io_error:?}' happened while opening the following file for Read: {filepath}."
)]
IOError {
IoError {
/// The underlying `io::Error`.
io_error: io::Error,
/// File path of the file that tantivy failed to open for read.
@@ -146,7 +146,7 @@ pub enum OpenReadError {
impl OpenReadError {
/// Wraps an io error.
pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
Self::IOError { io_error, filepath }
Self::IoError { io_error, filepath }
}
}
/// Error that may occur when trying to delete a file
@@ -158,7 +158,7 @@ pub enum DeleteError {
/// Any kind of IO error that happens when
/// interacting with the underlying IO device.
#[error("The following IO error happened while deleting file '{filepath}': '{io_error:?}'.")]
IOError {
IoError {
/// The underlying `io::Error`.
io_error: io::Error,
/// File path of the file that tantivy failed to delete.

View File

@@ -86,7 +86,7 @@ impl ManagedDirectory {
directory: Box::new(directory),
meta_informations: Arc::default(),
}),
io_err @ Err(OpenReadError::IOError { .. }) => Err(io_err.err().unwrap().into()),
io_err @ Err(OpenReadError::IoError { .. }) => Err(io_err.err().unwrap().into()),
Err(OpenReadError::IncompatibleIndex(incompatibility)) => {
// For the moment, this should never happen `meta.json`
// do not have any footer and cannot detect incompatibility.
@@ -168,7 +168,7 @@ impl ManagedDirectory {
DeleteError::FileDoesNotExist(_) => {
deleted_files.push(file_to_delete.clone());
}
DeleteError::IOError { .. } => {
DeleteError::IoError { .. } => {
failed_to_delete_files.push(file_to_delete.clone());
if !cfg!(target_os = "windows") {
// On windows, delete is expected to fail if the file
@@ -232,13 +232,13 @@ impl ManagedDirectory {
pub fn validate_checksum(&self, path: &Path) -> result::Result<bool, OpenReadError> {
let reader = self.directory.open_read(path)?;
let (footer, data) =
Footer::extract_footer(reader).map_err(|io_error| OpenReadError::IOError {
Footer::extract_footer(reader).map_err(|io_error| OpenReadError::IoError {
io_error,
filepath: path.to_path_buf(),
})?;
let bytes = data
.read_bytes()
.map_err(|io_error| OpenReadError::IOError {
.map_err(|io_error| OpenReadError::IoError {
filepath: path.to_path_buf(),
io_error,
})?;

View File

@@ -185,7 +185,7 @@ impl MmapDirectory {
/// Creates a new MmapDirectory in a temporary directory.
///
/// This is mostly useful to test the MmapDirectory itself.
/// For your unit tests, prefer the RAMDirectory.
/// For your unit tests, prefer the RamDirectory.
pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
let tempdir = TempDir::new().map_err(OpenDirectoryError::FailedToCreateTempDir)?;
Ok(MmapDirectory::new(
@@ -374,7 +374,7 @@ impl Directory for MmapDirectory {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
let full_path = self.resolve_path(path);
match fs::remove_file(&full_path) {
Ok(_) => self.sync_directory().map_err(|e| DeleteError::IOError {
Ok(_) => self.sync_directory().map_err(|e| DeleteError::IoError {
io_error: e,
filepath: path.to_path_buf(),
}),
@@ -382,7 +382,7 @@ impl Directory for MmapDirectory {
if e.kind() == io::ErrorKind::NotFound {
Err(DeleteError::FileDoesNotExist(path.to_owned()))
} else {
Err(DeleteError::IOError {
Err(DeleteError::IoError {
io_error: e,
filepath: path.to_path_buf(),
})
@@ -460,9 +460,9 @@ impl Directory for MmapDirectory {
.write(true)
.create(true) //< if the file does not exist yet, create it.
.open(&full_path)
.map_err(LockError::IOError)?;
.map_err(LockError::IoError)?;
if lock.is_blocking {
file.lock_exclusive().map_err(LockError::IOError)?;
file.lock_exclusive().map_err(LockError::IoError)?;
} else {
file.try_lock_exclusive().map_err(|_| LockError::LockBusy)?
}
@@ -616,8 +616,9 @@ mod tests {
reader.reload().unwrap();
let num_segments = reader.searcher().segment_readers().len();
assert!(num_segments <= 4);
let num_components_except_deletes = crate::core::SegmentComponent::iterator().len() - 1;
assert_eq!(
num_segments * 7,
num_segments * num_components_except_deletes,
mmap_directory.get_cache_info().mmapped.len()
);
}

View File

@@ -26,7 +26,7 @@ pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
pub(crate) use self::file_slice::{ArcBytes, WeakArcBytes};
pub use self::file_slice::{FileHandle, FileSlice};
pub use self::owned_bytes::OwnedBytes;
pub use self::ram_directory::RAMDirectory;
pub use self::ram_directory::RamDirectory;
pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
use std::io::{self, BufWriter, Write};
use std::path::PathBuf;

View File

@@ -51,13 +51,13 @@ impl OwnedBytes {
/// Returns the underlying slice of data.
/// `Deref` and `AsRef` are also available.
#[inline(always)]
#[inline]
pub fn as_slice(&self) -> &[u8] {
self.data
}
/// Returns the len of the slice.
#[inline(always)]
#[inline]
pub fn len(&self) -> usize {
self.data.len()
}
@@ -84,7 +84,7 @@ impl OwnedBytes {
}
/// Returns true iff this `OwnedBytes` is empty.
#[inline(always)]
#[inline]
pub fn is_empty(&self) -> bool {
self.as_slice().is_empty()
}
@@ -92,7 +92,7 @@ impl OwnedBytes {
/// Drops the left most `advance_len` bytes.
///
/// See also [.clip(clip_len: usize))](#method.clip).
#[inline(always)]
#[inline]
pub fn advance(&mut self, advance_len: usize) {
self.data = &self.data[advance_len..]
}

View File

@@ -14,7 +14,7 @@ use std::sync::{Arc, RwLock};
use super::FileHandle;
/// Writer associated with the `RAMDirectory`
/// Writer associated with the `RamDirectory`
///
/// The Writer just writes a buffer.
///
@@ -26,13 +26,13 @@ use super::FileHandle;
///
struct VecWriter {
path: PathBuf,
shared_directory: RAMDirectory,
shared_directory: RamDirectory,
data: Cursor<Vec<u8>>,
is_flushed: bool,
}
impl VecWriter {
fn new(path_buf: PathBuf, shared_directory: RAMDirectory) -> VecWriter {
fn new(path_buf: PathBuf, shared_directory: RamDirectory) -> VecWriter {
VecWriter {
path: path_buf,
data: Cursor::new(Vec::new()),
@@ -119,9 +119,9 @@ impl InnerDirectory {
}
}
impl fmt::Debug for RAMDirectory {
impl fmt::Debug for RamDirectory {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "RAMDirectory")
write!(f, "RamDirectory")
}
}
@@ -131,23 +131,23 @@ impl fmt::Debug for RAMDirectory {
/// Writes are only made visible upon flushing.
///
#[derive(Clone, Default)]
pub struct RAMDirectory {
pub struct RamDirectory {
fs: Arc<RwLock<InnerDirectory>>,
}
impl RAMDirectory {
impl RamDirectory {
/// Constructor
pub fn create() -> RAMDirectory {
pub fn create() -> RamDirectory {
Self::default()
}
/// Returns the sum of the size of the different files
/// in the RAMDirectory.
/// in the RamDirectory.
pub fn total_mem_usage(&self) -> usize {
self.fs.read().unwrap().total_mem_usage()
}
/// Write a copy of all of the files saved in the RAMDirectory in the target `Directory`.
/// Write a copy of all of the files saved in the RamDirectory in the target `Directory`.
///
/// Files are all written using the `Directory::write` meaning, even if they were
/// written using the `atomic_write` api.
@@ -164,7 +164,7 @@ impl RAMDirectory {
}
}
impl Directory for RAMDirectory {
impl Directory for RamDirectory {
fn get_file_handle(&self, path: &Path) -> Result<Box<dyn FileHandle>, OpenReadError> {
let file_slice = self.open_read(path)?;
Ok(Box::new(file_slice))
@@ -175,8 +175,8 @@ impl Directory for RAMDirectory {
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
fail_point!("RAMDirectory::delete", |_| {
Err(DeleteError::IOError {
fail_point!("RamDirectory::delete", |_| {
Err(DeleteError::IoError {
io_error: io::Error::from(io::ErrorKind::Other),
filepath: path.to_path_buf(),
})
@@ -188,7 +188,7 @@ impl Directory for RAMDirectory {
Ok(self
.fs
.read()
.map_err(|e| OpenReadError::IOError {
.map_err(|e| OpenReadError::IoError {
io_error: io::Error::new(io::ErrorKind::Other, e.to_string()),
filepath: path.to_path_buf(),
})?
@@ -212,7 +212,7 @@ impl Directory for RAMDirectory {
let bytes =
self.open_read(path)?
.read_bytes()
.map_err(|io_error| OpenReadError::IOError {
.map_err(|io_error| OpenReadError::IoError {
io_error,
filepath: path.to_path_buf(),
})?;
@@ -220,7 +220,7 @@ impl Directory for RAMDirectory {
}
fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> {
fail_point!("RAMDirectory::atomic_write", |msg| Err(io::Error::new(
fail_point!("RamDirectory::atomic_write", |msg| Err(io::Error::new(
io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())
)));
@@ -241,7 +241,7 @@ impl Directory for RAMDirectory {
#[cfg(test)]
mod tests {
use super::RAMDirectory;
use super::RamDirectory;
use crate::Directory;
use std::io::Write;
use std::path::Path;
@@ -252,12 +252,12 @@ mod tests {
let msg_seq: &'static [u8] = b"sequential is the way";
let path_atomic: &'static Path = Path::new("atomic");
let path_seq: &'static Path = Path::new("seq");
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
assert!(directory.atomic_write(path_atomic, msg_atomic).is_ok());
let mut wrt = directory.open_write(path_seq).unwrap();
assert!(wrt.write_all(msg_seq).is_ok());
assert!(wrt.flush().is_ok());
let directory_copy = RAMDirectory::create();
let directory_copy = RamDirectory::create();
assert!(directory.persist(&directory_copy).is_ok());
assert_eq!(directory_copy.atomic_read(path_atomic).unwrap(), msg_atomic);
assert_eq!(directory_copy.atomic_read(path_seq).unwrap(), msg_seq);

View File

@@ -65,12 +65,12 @@ mod mmap_directory_tests {
}
mod ram_directory_tests {
use crate::directory::RAMDirectory;
use crate::directory::RamDirectory;
type DirectoryImpl = RAMDirectory;
type DirectoryImpl = RamDirectory;
fn make_directory() -> DirectoryImpl {
RAMDirectory::default()
RamDirectory::default()
}
#[test]
@@ -122,7 +122,7 @@ mod ram_directory_tests {
#[should_panic]
fn ram_directory_panics_if_flush_forgotten() {
let test_path: &'static Path = Path::new("some_path_for_test");
let ram_directory = RAMDirectory::create();
let ram_directory = RamDirectory::create();
let mut write_file = ram_directory.open_write(test_path).unwrap();
assert!(write_file.write_all(&[4]).is_ok());
}

View File

@@ -70,7 +70,7 @@ pub enum TantivyError {
LockFailure(LockError, Option<String>),
/// IO Error.
#[error("An IO error occurred: '{0}'")]
IOError(#[from] io::Error),
IoError(#[from] io::Error),
/// Data corruption.
#[error("Data corrupted: '{0:?}'")]
DataCorruption(DataCorruption),
@@ -139,7 +139,7 @@ impl From<schema::DocParsingError> for TantivyError {
impl From<serde_json::Error> for TantivyError {
fn from(error: serde_json::Error) -> TantivyError {
TantivyError::IOError(error.into())
TantivyError::IoError(error.into())
}
}

View File

@@ -47,14 +47,14 @@ pub struct DeleteBitSet {
impl DeleteBitSet {
#[cfg(test)]
pub(crate) fn for_test(docs: &[DocId], max_doc: u32) -> DeleteBitSet {
use crate::directory::{Directory, RAMDirectory, TerminatingWrite};
use crate::directory::{Directory, RamDirectory, TerminatingWrite};
use std::path::Path;
assert!(docs.iter().all(|&doc| doc < max_doc));
let mut bitset = BitSet::with_max_value(max_doc);
for &doc in docs {
bitset.insert(doc);
}
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
let path = Path::new("dummydeletebitset");
let mut wrt = directory.open_write(path).unwrap();
write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap();
@@ -83,7 +83,7 @@ impl DeleteBitSet {
}
/// Returns true iff the document has been marked as deleted.
#[inline(always)]
#[inline]
pub fn is_deleted(&self, doc: DocId) -> bool {
let byte_offset = doc / 8u32;
let b: u8 = self.data.as_slice()[byte_offset as usize];

View File

@@ -201,7 +201,7 @@ mod tests {
use super::*;
use crate::common::CompositeFile;
use crate::directory::{Directory, RAMDirectory, WritePtr};
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::FastFieldReader;
use crate::merge_policy::NoMergePolicy;
use crate::schema::Field;
@@ -242,7 +242,7 @@ mod tests {
#[test]
fn test_intfastfield_small() -> crate::Result<()> {
let path = Path::new("test");
let directory: RAMDirectory = RAMDirectory::create();
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
@@ -269,7 +269,7 @@ mod tests {
#[test]
fn test_intfastfield_large() -> crate::Result<()> {
let path = Path::new("test");
let directory: RAMDirectory = RAMDirectory::create();
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let mut serializer = FastFieldSerializer::from_write(write)?;
@@ -308,7 +308,7 @@ mod tests {
#[test]
fn test_intfastfield_null_amplitude() -> crate::Result<()> {
let path = Path::new("test");
let directory: RAMDirectory = RAMDirectory::create();
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
@@ -338,7 +338,7 @@ mod tests {
#[test]
fn test_intfastfield_large_numbers() -> crate::Result<()> {
let path = Path::new("test");
let directory: RAMDirectory = RAMDirectory::create();
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
@@ -374,7 +374,7 @@ mod tests {
#[test]
fn test_signed_intfastfield() -> crate::Result<()> {
let path = Path::new("test");
let directory: RAMDirectory = RAMDirectory::create();
let directory: RamDirectory = RamDirectory::create();
let mut schema_builder = Schema::builder();
let i64_field = schema_builder.add_i64_field("field", FAST);
@@ -417,7 +417,7 @@ mod tests {
#[test]
fn test_signed_intfastfield_default_val() -> crate::Result<()> {
let path = Path::new("test");
let directory: RAMDirectory = RAMDirectory::create();
let directory: RamDirectory = RamDirectory::create();
let mut schema_builder = Schema::builder();
let i64_field = schema_builder.add_i64_field("field", FAST);
let schema = schema_builder.build();
@@ -456,7 +456,7 @@ mod tests {
let path = Path::new("test");
let permutation = generate_permutation();
let n = permutation.len();
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let mut serializer = FastFieldSerializer::from_write(write)?;
@@ -576,7 +576,7 @@ mod bench {
use super::tests::{generate_permutation, SCHEMA};
use super::*;
use crate::common::CompositeFile;
use crate::directory::{Directory, RAMDirectory, WritePtr};
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::FastFieldReader;
use std::collections::HashMap;
use std::path::Path;
@@ -612,7 +612,7 @@ mod bench {
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation();
let directory: RAMDirectory = RAMDirectory::create();
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
@@ -646,7 +646,7 @@ mod bench {
fn bench_intfastfield_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation();
let directory: RAMDirectory = RAMDirectory::create();
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();

View File

@@ -77,12 +77,15 @@ mod tests {
// add another second
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
// add three seconds
index_writer
.add_document(doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64));
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let reader = searcher.segment_reader(0);
assert_eq!(reader.num_docs(), 4);
assert_eq!(reader.num_docs(), 5);
{
let parser = QueryParser::for_index(&index, vec![date_field]);
@@ -150,7 +153,7 @@ mod tests {
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let range_q = format!(
"[{} TO {}]",
"[{} TO {}}}",
(first_time_stamp + Duration::seconds(1)).to_rfc3339(),
(first_time_stamp + Duration::seconds(3)).to_rfc3339()
);

View File

@@ -4,7 +4,7 @@ use crate::common::compute_num_bits;
use crate::common::BinarySerializable;
use crate::common::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::{Directory, RAMDirectory, WritePtr};
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
use crate::schema::Schema;
use crate::schema::FAST;
@@ -118,18 +118,18 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("__dummy__");
let directory: RAMDirectory = RAMDirectory::create();
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory
.open_write(path)
.expect("With a RAMDirectory, this should never fail.");
.expect("With a RamDirectory, this should never fail.");
let mut serializer = FastFieldSerializer::from_write(write)
.expect("With a RAMDirectory, this should never fail.");
.expect("With a RamDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
let fast_field_writer = fast_field_writers
.get_field_writer(field)
.expect("With a RAMDirectory, this should never fail.");
.expect("With a RamDirectory, this should never fail.");
for val in vals {
fast_field_writer.add_val(val.to_u64());
}

View File

@@ -1,9 +1,9 @@
#[inline(always)]
#[inline]
pub fn id_to_fieldnorm(id: u8) -> u32 {
FIELD_NORMS_TABLE[id as usize]
}
#[inline(always)]
#[inline]
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
FIELD_NORMS_TABLE
.binary_search(&fieldnorm)

View File

@@ -15,7 +15,7 @@
//! precompute computationally expensive functions of the fieldnorm
//! in a very short array.
//!
//! This trick is used by the BM25 similarity.
//! This trick is used by the Bm25 similarity.
mod code;
mod reader;
mod serializer;

View File

@@ -133,7 +133,7 @@ impl FieldNormReader {
}
/// Returns the `fieldnorm_id` associated to a document.
#[inline(always)]
#[inline]
pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
match &self.0 {
ReaderImplEnum::FromData(data) => {
@@ -145,14 +145,14 @@ impl FieldNormReader {
}
/// Converts a `fieldnorm_id` into a fieldnorm.
#[inline(always)]
#[inline]
pub fn id_to_fieldnorm(id: u8) -> u32 {
id_to_fieldnorm(id)
}
/// Converts a `fieldnorm` into a `fieldnorm_id`.
/// (This function is not injective).
#[inline(always)]
#[inline]
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
fieldnorm_to_id(fieldnorm)
}

View File

@@ -180,7 +180,7 @@ pub(crate) fn advance_deletes(
if num_deleted_docs > num_deleted_docs_before {
// There are new deletes. We need to write a new delete file.
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
let mut delete_file = segment.open_write(SegmentComponent::Delete)?;
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
delete_file.terminate()?;
}

View File

@@ -57,8 +57,8 @@ impl MergePolicy for LogMergePolicy {
let mut size_sorted_tuples = segments
.iter()
.map(SegmentMeta::num_docs)
.filter(|s| s <= &(self.max_merge_size as u32))
.enumerate()
.filter(|(_, s)| s <= &(self.max_merge_size as u32))
.collect::<Vec<(usize, u32)>>();
size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1)));
@@ -216,12 +216,19 @@ mod tests {
create_random_segment_meta(1_000_000),
create_random_segment_meta(100_001),
create_random_segment_meta(100_000),
create_random_segment_meta(1_000_001),
create_random_segment_meta(100_000),
create_random_segment_meta(100_000),
create_random_segment_meta(1_500_000),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
// Do not include large segments
assert_eq!(result_list.len(), 1);
assert_eq!(result_list[0].0.len(), 3)
assert_eq!(result_list[0].0.len(), 3);
// Making sure merge policy points to the correct index of the original input
assert_eq!(result_list[0].0[0], test_input[2].id());
assert_eq!(result_list[0].0[1], test_input[4].id());
assert_eq!(result_list[0].0[2], test_input[5].id());
}
}

View File

@@ -628,7 +628,7 @@ impl IndexMerger {
segment_postings.positions(&mut positions_buffer);
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions)?;
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
}
doc = segment_postings.advance();
@@ -687,7 +687,7 @@ impl SerializableSegment for IndexMerger {
}
let fieldnorm_data = serializer
.segment()
.open_read(SegmentComponent::FIELDNORMS)?;
.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let term_ord_mappings =
self.write_postings(serializer.get_postings_serializer(), fieldnorm_readers)?;

View File

@@ -18,12 +18,12 @@ pub struct SegmentSerializer {
impl SegmentSerializer {
/// Creates a new `SegmentSerializer`.
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
let store_write = segment.open_write(SegmentComponent::STORE)?;
let store_write = segment.open_write(SegmentComponent::Store)?;
let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?;
let fast_field_write = segment.open_write(SegmentComponent::FastFields)?;
let fast_field_serializer = FastFieldSerializer::from_write(fast_field_write)?;
let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;

View File

@@ -628,7 +628,7 @@ impl SegmentUpdater {
#[cfg(test)]
mod tests {
use super::merge_segments;
use crate::directory::RAMDirectory;
use crate::directory::RamDirectory;
use crate::indexer::merge_policy::tests::MergeWheneverPossible;
use crate::schema::*;
use crate::Index;
@@ -777,7 +777,7 @@ mod tests {
}
assert_eq!(indices.len(), 3);
let output_directory = RAMDirectory::default();
let output_directory = RamDirectory::default();
let index = merge_segments(&indices, output_directory)?;
assert_eq!(index.schema(), schema);
@@ -792,7 +792,7 @@ mod tests {
#[test]
fn test_merge_empty_indices_array() {
let merge_result = merge_segments(&[], RAMDirectory::default());
let merge_result = merge_segments(&[], RamDirectory::default());
assert!(merge_result.is_err());
}
@@ -819,7 +819,7 @@ mod tests {
};
// mismatched schema index list
let result = merge_segments(&[first_index, second_index], RAMDirectory::default());
let result = merge_segments(&[first_index, second_index], RamDirectory::default());
assert!(result.is_err());
Ok(())

View File

@@ -311,7 +311,7 @@ fn write(
}
let fieldnorm_data = serializer
.segment()
.open_read(SegmentComponent::FIELDNORMS)?;
.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let term_ord_map =
multifield_postings.serialize(serializer.get_postings_serializer(), fieldnorm_readers)?;

View File

@@ -141,7 +141,7 @@ pub mod collector;
pub mod directory;
pub mod fastfield;
pub mod fieldnorm;
pub(crate) mod positions;
pub mod positions;
pub mod postings;
pub mod query;
pub mod schema;

View File

@@ -1,28 +1,29 @@
/// Positions are stored in three parts and over two files.
//
/// The `SegmentComponent::POSITIONS` file contains all of the bitpacked positions delta,
/// for all terms of a given field, one term after the other.
///
/// If the last block is incomplete, it is simply padded with zeros.
/// It cannot be read alone, as it actually does not contain the number of bits used for
/// each blocks.
/// .
/// Each block is serialized one after the other.
/// If the last block is incomplete, it is simply padded with zeros.
///
///
/// The `SegmentComponent::POSITIONSSKIP` file contains the number of bits used in each block in `u8`
/// stream.
///
/// This makes it possible to rapidly skip over `n positions`.
///
/// For every block #n where n = k * `LONG_SKIP_INTERVAL` blocks (k>=1), we also store
/// in this file the sum of number of bits used for all of the previous block (blocks `[0, n[`).
/// That is useful to start reading the positions for a given term: The TermInfo contains
/// an address in the positions stream, expressed in "number of positions".
/// The long skip structure makes it possible to skip rapidly to the a checkpoint close to this
/// value, and then skip normally.
///
//! Tantivy can (if instructed to do so in the schema) store the term positions in a given field.
//! This positions are expressed as token ordinal. For instance,
//! In "The beauty and the beast", the term "the" appears in position 0 and position 4.
//! This information is useful to run phrase queries.
//!
//! The `SegmentComponent::POSITIONS` file contains all of the bitpacked positions delta,
//! for all terms of a given field, one term after the other.
//!
//! Each terms is encoded independently.
//! Like for positing lists, tantivy rely on simd bitpacking to encode the positions delta in blocks of 128 deltas.
//! Because we rarely have a multiple of 128, a final block may encode the remaining values variable byte encoding.
//!
//! In order to make reading possible, the term delta positions first encodes the number of bitpacked blocks,
//! then the bitwidth for each blocks, then the actual bitpacked block and finally the final variable int encoded block.
//!
//! Contrary to postings list, the reader does not have access on the number of positions that is encoded, and instead
//! stops decoding the last block when its byte slice has been entirely read.
//!
//! More formally:
//! * *Positions* := *NumBitPackedBlocks* *BitPackedPositionBlock*^(P/128) *BitPackedPositionsDeltaBitWidth* *VIntPosDeltas*?
//! * *NumBitPackedBlocks**: := *P* / 128 encoded as a variable byte integer.
//! * *BitPackedPositionBlock* := bit width encoded block of 128 positions delta
//! * *BitPackedPositionsDeltaBitWidth* := (*BitWidth*: u8)^*NumBitPackedBlocks*
//! * *VIntPosDeltas* := *VIntPosDelta*^(*P* % 128).
//!
//! The skip widths encoded separately makes it easy and fast to rapidly skip over n positions.
mod reader;
mod serializer;
@@ -31,54 +32,103 @@ pub use self::serializer::PositionSerializer;
use bitpacking::{BitPacker, BitPacker4x};
const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
const LONG_SKIP_IN_BLOCKS: usize = 1_024;
const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64;
#[cfg(test)]
pub mod tests {
use super::PositionSerializer;
use crate::directory::OwnedBytes;
use crate::positions::reader::PositionReader;
use crate::{common::HasLen, directory::FileSlice};
use proptest::prelude::*;
use proptest::sample::select;
use std::iter;
fn create_stream_buffer(vals: &[u32]) -> (FileSlice, FileSlice) {
let mut skip_buffer = vec![];
let mut stream_buffer = vec![];
{
let mut serializer = PositionSerializer::new(&mut stream_buffer, &mut skip_buffer);
for (i, &val) in vals.iter().enumerate() {
assert_eq!(serializer.positions_idx(), i as u64);
serializer.write_all(&[val]).unwrap();
fn create_positions_data(vals: &[u32]) -> crate::Result<OwnedBytes> {
let mut positions_buffer = vec![];
let mut serializer = PositionSerializer::new(&mut positions_buffer);
serializer.write_positions_delta(&vals);
serializer.close_term()?;
serializer.close()?;
Ok(OwnedBytes::new(positions_buffer))
}
fn gen_delta_positions() -> BoxedStrategy<Vec<u32>> {
select(&[0, 1, 70, 127, 128, 129, 200, 255, 256, 257, 270][..])
.prop_flat_map(|num_delta_positions| {
proptest::collection::vec(
select(&[1u32, 2u32, 4u32, 8u32, 16u32][..]),
num_delta_positions,
)
})
.boxed()
}
proptest! {
#[test]
fn test_position_delta(delta_positions in gen_delta_positions()) {
let delta_positions_data = create_positions_data(&delta_positions).unwrap();
let mut position_reader = PositionReader::open(delta_positions_data).unwrap();
let mut minibuf = [0u32; 1];
for (offset, &delta_position) in delta_positions.iter().enumerate() {
position_reader.read(offset as u64, &mut minibuf[..]);
assert_eq!(delta_position, minibuf[0]);
}
serializer.close().unwrap();
}
(FileSlice::from(stream_buffer), FileSlice::from(skip_buffer))
}
#[test]
fn test_position_read() {
let v: Vec<u32> = (0..1000).collect();
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 12);
assert_eq!(stream.len(), 1168);
let mut position_reader = PositionReader::new(stream, skip, 0u64).unwrap();
fn test_position_read() -> crate::Result<()> {
let position_deltas: Vec<u32> = (0..1000).collect();
let positions_data = create_positions_data(&position_deltas[..])?;
assert_eq!(positions_data.len(), 1224);
let mut position_reader = PositionReader::open(positions_data)?;
for &n in &[1, 10, 127, 128, 130, 312] {
let mut v = vec![0u32; n];
position_reader.read(0, &mut v[..]);
for i in 0..n {
assert_eq!(v[i], i as u32);
assert_eq!(position_deltas[i], i as u32);
}
}
Ok(())
}
#[test]
fn test_position_read_with_offset() {
let v: Vec<u32> = (0..1000).collect();
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 12);
assert_eq!(stream.len(), 1168);
let mut position_reader = PositionReader::new(stream, skip, 0u64).unwrap();
fn test_empty_position() -> crate::Result<()> {
let mut positions_buffer = vec![];
let mut serializer = PositionSerializer::new(&mut positions_buffer);
serializer.close_term()?;
serializer.close()?;
let position_delta = OwnedBytes::new(positions_buffer);
assert!(PositionReader::open(position_delta).is_ok());
Ok(())
}
#[test]
fn test_multiple_write_positions() -> crate::Result<()> {
let mut positions_buffer = vec![];
let mut serializer = PositionSerializer::new(&mut positions_buffer);
serializer.write_positions_delta(&[1u32, 12u32]);
serializer.write_positions_delta(&[4u32, 17u32]);
serializer.write_positions_delta(&[443u32]);
serializer.close_term()?;
serializer.close()?;
let position_delta = OwnedBytes::new(positions_buffer);
let mut output_delta_pos_buffer = vec![0u32; 5];
let mut position_reader = PositionReader::open(position_delta)?;
position_reader.read(0, &mut output_delta_pos_buffer[..]);
assert_eq!(
&output_delta_pos_buffer[..],
&[1u32, 12u32, 4u32, 17u32, 443u32]
);
Ok(())
}
#[test]
fn test_position_read_with_offset() -> crate::Result<()> {
let position_deltas: Vec<u32> = (0..1000).collect();
let positions_data = create_positions_data(&position_deltas[..])?;
assert_eq!(positions_data.len(), 1224);
let mut position_reader = PositionReader::open(positions_data)?;
for &offset in &[1u64, 10u64, 127u64, 128u64, 130u64, 312u64] {
for &len in &[1, 10, 130, 500] {
let mut v = vec![0u32; len];
@@ -88,16 +138,16 @@ pub mod tests {
}
}
}
Ok(())
}
#[test]
fn test_position_read_after_skip() {
let v: Vec<u32> = (0..1_000).collect();
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 12);
assert_eq!(stream.len(), 1168);
fn test_position_read_after_skip() -> crate::Result<()> {
let position_deltas: Vec<u32> = (0..1_000).collect();
let positions_data = create_positions_data(&position_deltas[..])?;
assert_eq!(positions_data.len(), 1224);
let mut position_reader = PositionReader::new(stream, skip, 0u64).unwrap();
let mut position_reader = PositionReader::open(positions_data)?;
let mut buf = [0u32; 7];
let mut c = 0;
@@ -111,15 +161,15 @@ pub mod tests {
c += 1;
}
}
Ok(())
}
#[test]
fn test_position_reread_anchor_different_than_block() {
let v: Vec<u32> = (0..2_000_000).collect();
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 15_749);
assert_eq!(stream.len(), 4_987_872);
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0).unwrap();
fn test_position_reread_anchor_different_than_block() -> crate::Result<()> {
let positions_delta: Vec<u32> = (0..2_000_000).collect();
let positions_data = create_positions_data(&positions_delta[..])?;
assert_eq!(positions_data.len(), 5003499);
let mut position_reader = PositionReader::open(positions_data.clone())?;
let mut buf = [0u32; 256];
position_reader.read(128, &mut buf);
for i in 0..256 {
@@ -129,62 +179,41 @@ pub mod tests {
for i in 0..256 {
assert_eq!(buf[i], (128 + i) as u32);
}
Ok(())
}
#[test]
#[should_panic(expected = "offset arguments should be increasing.")]
fn test_position_panic_if_called_previous_anchor() {
let v: Vec<u32> = (0..2_000_000).collect();
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 15_749);
assert_eq!(stream.len(), 4_987_872);
fn test_position_requesting_passed_block() -> crate::Result<()> {
let positions_delta: Vec<u32> = (0..512).collect();
let positions_data = create_positions_data(&positions_delta[..])?;
assert_eq!(positions_data.len(), 533);
let mut buf = [0u32; 1];
let mut position_reader =
PositionReader::new(stream.clone(), skip.clone(), 200_000).unwrap();
let mut position_reader = PositionReader::open(positions_data)?;
position_reader.read(230, &mut buf);
assert_eq!(buf[0], 230);
position_reader.read(9, &mut buf);
assert_eq!(buf[0], 9);
Ok(())
}
#[test]
fn test_positions_bug() {
let mut v: Vec<u32> = vec![];
for i in 1..200 {
for j in 0..i {
v.push(j);
}
}
let (stream, skip) = create_stream_buffer(&v[..]);
let mut buf = Vec::new();
let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), 0).unwrap();
let mut offset = 0;
for i in 1..24 {
buf.resize(i, 0);
position_reader.read(offset, &mut buf[..]);
offset += i as u64;
let r: Vec<u32> = (0..i).map(|el| el as u32).collect();
assert_eq!(buf, &r[..]);
}
}
#[test]
fn test_position_long_skip_const() {
fn test_position() -> crate::Result<()> {
const CONST_VAL: u32 = 9u32;
let v: Vec<u32> = iter::repeat(CONST_VAL).take(2_000_000).collect();
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 15_749);
assert_eq!(stream.len(), 1_000_000);
let mut position_reader = PositionReader::new(stream, skip, 128 * 1024).unwrap();
let positions_delta: Vec<u32> = iter::repeat(CONST_VAL).take(2_000_000).collect();
let positions_data = create_positions_data(&positions_delta[..])?;
assert_eq!(positions_data.len(), 1_015_627);
let mut position_reader = PositionReader::open(positions_data)?;
let mut buf = [0u32; 1];
position_reader.read(0, &mut buf);
assert_eq!(buf[0], CONST_VAL);
Ok(())
}
#[test]
fn test_position_long_skip_2() {
let v: Vec<u32> = (0..2_000_000).collect();
let (stream, skip) = create_stream_buffer(&v[..]);
assert_eq!(skip.len(), 15_749);
assert_eq!(stream.len(), 4_987_872);
fn test_position_advance() -> crate::Result<()> {
let positions_delta: Vec<u32> = (0..2_000_000).collect();
let positions_data = create_positions_data(&positions_delta[..])?;
assert_eq!(positions_data.len(), 5_003_499);
for &offset in &[
10,
128 * 1024,
@@ -192,11 +221,11 @@ pub mod tests {
128 * 1024 + 7,
128 * 10 * 1024 + 10,
] {
let mut position_reader =
PositionReader::new(stream.clone(), skip.clone(), offset).unwrap();
let mut position_reader = PositionReader::open(positions_data.clone())?;
let mut buf = [0u32; 1];
position_reader.read(0, &mut buf);
position_reader.read(offset, &mut buf);
assert_eq!(buf[0], offset as u32);
}
Ok(())
}
}

View File

@@ -1,174 +1,148 @@
use std::io;
use crate::common::{BinarySerializable, FixedSize};
use crate::directory::FileSlice;
use crate::common::{BinarySerializable, VInt};
use crate::directory::OwnedBytes;
use crate::positions::COMPRESSION_BLOCK_SIZE;
use crate::positions::LONG_SKIP_INTERVAL;
use crate::positions::LONG_SKIP_IN_BLOCKS;
use bitpacking::{BitPacker, BitPacker4x};
use crate::postings::compression::{BlockDecoder, VIntDecoder};
/// Positions works as a long sequence of compressed block.
/// All terms are chained one after the other.
///
/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
/// This means we need to skip to the `nth` positions efficiently.
///
/// This is done thanks to two levels of skiping that we refer to in the code
/// as `long_skip` and `short_skip`.
///
/// The `long_skip` makes it possible to skip every 1_024 compression blocks (= 131_072 positions).
/// Skipping offset are simply stored one after as an offset stored over 8 bytes.
///
/// We find the number of long skips, as `n / long_skip`.
///
/// Blocks are compressed using bitpacking, so `skip_read` contains the number of bytes
/// (values can go from 0bit to 32 bits) required to decompressed every block.
/// Blocks are compressed using bitpacking, so `skip_read` contains the number of bits
/// (values can go from 0bit to 32 bits) required to decompress every block.
///
/// A given block obviously takes `(128 x num_bit_for_the_block / num_bits_in_a_byte)`,
/// so skipping a block without decompressing it is just a matter of advancing that many
/// bytes.
struct Positions {
bit_packer: BitPacker4x,
skip_file: FileSlice,
position_file: FileSlice,
long_skip_data: OwnedBytes,
}
impl Positions {
pub fn new(position_file: FileSlice, skip_file: FileSlice) -> io::Result<Positions> {
let (body, footer) = skip_file.split_from_end(u32::SIZE_IN_BYTES);
let footer_data = footer.read_bytes()?;
let num_long_skips = u32::deserialize(&mut footer_data.as_slice())?;
let (skip_file, long_skip_file) =
body.split_from_end(u64::SIZE_IN_BYTES * (num_long_skips as usize));
let long_skip_data = long_skip_file.read_bytes()?;
Ok(Positions {
bit_packer: BitPacker4x::new(),
skip_file,
long_skip_data,
position_file,
})
}
/// Returns the offset of the block associated to the given `long_skip_id`.
///
/// One `long_skip_id` means `LONG_SKIP_IN_BLOCKS` blocks.
fn long_skip(&self, long_skip_id: usize) -> u64 {
if long_skip_id == 0 {
return 0;
}
let long_skip_slice = self.long_skip_data.as_slice();
let mut long_skip_blocks: &[u8] = &long_skip_slice[(long_skip_id - 1) * 8..][..8];
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted")
}
fn reader(&self, offset: u64) -> io::Result<PositionReader> {
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
let offset_num_bytes: u64 = self.long_skip(long_skip_id);
let position_read = self
.position_file
.slice_from(offset_num_bytes as usize)
.read_bytes()?;
let skip_read = self
.skip_file
.slice_from(long_skip_id * LONG_SKIP_IN_BLOCKS)
.read_bytes()?;
Ok(PositionReader {
bit_packer: self.bit_packer,
skip_read,
position_read,
buffer: Box::new([0u32; 128]),
block_offset: std::i64::MAX as u64,
anchor_offset: (long_skip_id as u64) * LONG_SKIP_INTERVAL,
abs_offset: offset,
})
}
}
#[derive(Clone)]
pub struct PositionReader {
skip_read: OwnedBytes,
position_read: OwnedBytes,
bit_packer: BitPacker4x,
buffer: Box<[u32; COMPRESSION_BLOCK_SIZE]>,
bit_widths: OwnedBytes,
positions: OwnedBytes,
block_decoder: BlockDecoder,
// offset, expressed in positions, for the first position of the block currently loaded
// block_offset is a multiple of COMPRESSION_BLOCK_SIZE.
block_offset: u64,
// offset, expressed in positions, for the position of the first block encoded
// in the `self.positions` bytes, and if bitpacked, compressed using the bitwidth in
// `self.bit_widths`.
//
// As we advance, anchor increases simultaneously with bit_widths and positions get consumed.
anchor_offset: u64,
abs_offset: u64,
// These are just copies used for .reset().
original_bit_widths: OwnedBytes,
original_positions: OwnedBytes,
}
impl PositionReader {
pub fn new(
position_file: FileSlice,
skip_file: FileSlice,
offset: u64,
) -> io::Result<PositionReader> {
let positions = Positions::new(position_file, skip_file)?;
positions.reader(offset)
/// Open and reads the term positions encoded into the positions_data owned bytes.
pub fn open(mut positions_data: OwnedBytes) -> io::Result<PositionReader> {
let num_positions_bitpacked_blocks = VInt::deserialize(&mut positions_data)?.0 as usize;
let (bit_widths, positions) = positions_data.split(num_positions_bitpacked_blocks);
Ok(PositionReader {
bit_widths: bit_widths.clone(),
positions: positions.clone(),
block_decoder: BlockDecoder::default(),
block_offset: std::i64::MAX as u64,
anchor_offset: 0u64,
original_bit_widths: bit_widths,
original_positions: positions,
})
}
fn reset(&mut self) {
self.positions = self.original_positions.clone();
self.bit_widths = self.original_bit_widths.clone();
self.block_offset = std::i64::MAX as u64;
self.anchor_offset = 0u64;
}
/// Advance from num_blocks bitpacked blocks.
///
/// Panics if there are not that many remaining blocks.
fn advance_num_blocks(&mut self, num_blocks: usize) {
let num_bits: usize = self.skip_read.as_ref()[..num_blocks]
let num_bits: usize = self.bit_widths.as_ref()[..num_blocks]
.iter()
.cloned()
.map(|num_bits| num_bits as usize)
.sum();
let num_bytes_to_skip = num_bits * COMPRESSION_BLOCK_SIZE / 8;
self.skip_read.advance(num_blocks as usize);
self.position_read.advance(num_bytes_to_skip);
self.bit_widths.advance(num_blocks as usize);
self.positions.advance(num_bytes_to_skip);
self.anchor_offset += (num_blocks * COMPRESSION_BLOCK_SIZE) as u64;
}
/// block_rel_id is counted relatively to the anchor.
/// block_rel_id = 0 means the anchor block.
/// block_rel_id = i means the ith block after the anchor block.
fn load_block(&mut self, block_rel_id: usize) {
let bit_widths = self.bit_widths.as_slice();
let byte_offset: usize = bit_widths[0..block_rel_id]
.iter()
.map(|&b| b as usize)
.sum::<usize>()
* COMPRESSION_BLOCK_SIZE
/ 8;
let compressed_data = &self.positions.as_slice()[byte_offset..];
if bit_widths.len() > block_rel_id {
// that block is bitpacked.
let bit_width = bit_widths[block_rel_id];
self.block_decoder
.uncompress_block_unsorted(compressed_data, bit_width);
} else {
// that block is vint encoded.
self.block_decoder
.uncompress_vint_unsorted_until_end(compressed_data);
}
self.block_offset = self.anchor_offset + (block_rel_id * COMPRESSION_BLOCK_SIZE) as u64;
}
/// Fills a buffer with the positions `[offset..offset+output.len())` integers.
///
/// `offset` is required to have a value >= to the offsets given in previous calls
/// for the given `PositionReaderAbsolute` instance.
/// This function is optimized to be called with increasing values of `offset`.
pub fn read(&mut self, mut offset: u64, mut output: &mut [u32]) {
offset += self.abs_offset;
assert!(
offset >= self.anchor_offset,
"offset arguments should be increasing."
);
if offset < self.anchor_offset {
self.reset();
}
let delta_to_block_offset = offset as i64 - self.block_offset as i64;
if !(0..128).contains(&delta_to_block_offset) {
// The first position is not within the first block.
// We need to decompress the first block.
// (Note that it could be before or after)
// We need to possibly skip a few blocks, and decompress the first relevant block.
let delta_to_anchor_offset = offset - self.anchor_offset;
let num_blocks_to_skip =
(delta_to_anchor_offset / (COMPRESSION_BLOCK_SIZE as u64)) as usize;
self.advance_num_blocks(num_blocks_to_skip);
self.anchor_offset = offset - (offset % COMPRESSION_BLOCK_SIZE as u64);
self.block_offset = self.anchor_offset;
let num_bits = self.skip_read.as_slice()[0];
self.bit_packer
.decompress(self.position_read.as_ref(), self.buffer.as_mut(), num_bits);
self.load_block(0);
} else {
// The request offset is within the loaded block.
// We still need to advance anchor_offset to our current block.
let num_blocks_to_skip =
((self.block_offset - self.anchor_offset) / COMPRESSION_BLOCK_SIZE as u64) as usize;
self.advance_num_blocks(num_blocks_to_skip);
self.anchor_offset = self.block_offset;
}
let mut num_bits = self.skip_read.as_slice()[0];
let mut position_data = self.position_read.as_ref();
// At this point, the block containing offset is loaded, and anchor has
// been updated to point to it as well.
for i in 1.. {
// we copy the part from block i - 1 that is relevant.
let offset_in_block = (offset as usize) % COMPRESSION_BLOCK_SIZE;
let remaining_in_block = COMPRESSION_BLOCK_SIZE - offset_in_block;
if remaining_in_block >= output.len() {
output.copy_from_slice(&self.buffer[offset_in_block..][..output.len()]);
output.copy_from_slice(
&self.block_decoder.output_array()[offset_in_block..][..output.len()],
);
break;
}
output[..remaining_in_block].copy_from_slice(&self.buffer[offset_in_block..]);
output[..remaining_in_block]
.copy_from_slice(&self.block_decoder.output_array()[offset_in_block..]);
output = &mut output[remaining_in_block..];
// we load block #i if necessary.
offset += remaining_in_block as u64;
position_data = &position_data[(num_bits as usize * COMPRESSION_BLOCK_SIZE / 8)..];
num_bits = self.skip_read.as_slice()[i];
self.bit_packer
.decompress(position_data, self.buffer.as_mut(), num_bits);
self.block_offset += COMPRESSION_BLOCK_SIZE as u64;
self.load_block(i);
}
}
}

View File

@@ -1,80 +1,91 @@
use crate::common::BinarySerializable;
use crate::common::CountingWriter;
use crate::positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
use bitpacking::BitPacker;
use bitpacking::BitPacker4x;
use crate::common::{BinarySerializable, CountingWriter, VInt};
use crate::positions::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::BlockEncoder;
use crate::postings::compression::VIntEncoder;
use std::io::{self, Write};
/// The PositionSerializer is in charge of serializing all of the positions
/// of all of the terms of a given field.
///
/// It is valid to call write_position_delta more than once per term.
pub struct PositionSerializer<W: io::Write> {
bit_packer: BitPacker4x,
write_stream: CountingWriter<W>,
write_skip_index: W,
block_encoder: BlockEncoder,
positions_wrt: CountingWriter<W>,
positions_buffer: Vec<u8>,
block: Vec<u32>,
buffer: Vec<u8>,
num_ints: u64,
long_skips: Vec<u64>,
bit_widths: Vec<u8>,
}
impl<W: io::Write> PositionSerializer<W> {
pub fn new(write_stream: W, write_skip_index: W) -> PositionSerializer<W> {
/// Creates a new PositionSerializer writing into the given positions_wrt.
pub fn new(positions_wrt: W) -> PositionSerializer<W> {
PositionSerializer {
bit_packer: BitPacker4x::new(),
write_stream: CountingWriter::wrap(write_stream),
write_skip_index,
block_encoder: BlockEncoder::new(),
positions_wrt: CountingWriter::wrap(positions_wrt),
positions_buffer: Vec::with_capacity(128_000),
block: Vec::with_capacity(128),
buffer: vec![0u8; 128 * 4],
num_ints: 0u64,
long_skips: Vec::new(),
bit_widths: Vec::new(),
}
}
pub fn positions_idx(&self) -> u64 {
self.num_ints
/// Returns the number of bytes written in the positions write object
/// at this point.
/// When called before writing the positions of a term, this value is used as
/// start offset.
/// When called after writing the positions of a term, this value is used as a
/// end offset.
pub fn written_bytes(&self) -> u64 {
self.positions_wrt.written_bytes()
}
fn remaining_block_len(&self) -> usize {
COMPRESSION_BLOCK_SIZE - self.block.len()
}
pub fn write_all(&mut self, mut vals: &[u32]) -> io::Result<()> {
while !vals.is_empty() {
/// Writes all of the given positions delta.
pub fn write_positions_delta(&mut self, mut positions_delta: &[u32]) {
while !positions_delta.is_empty() {
let remaining_block_len = self.remaining_block_len();
let num_to_write = remaining_block_len.min(vals.len());
self.block.extend(&vals[..num_to_write]);
self.num_ints += num_to_write as u64;
vals = &vals[num_to_write..];
let num_to_write = remaining_block_len.min(positions_delta.len());
self.block.extend(&positions_delta[..num_to_write]);
positions_delta = &positions_delta[num_to_write..];
if self.remaining_block_len() == 0 {
self.flush_block()?;
self.flush_block();
}
}
Ok(())
}
fn flush_block(&mut self) -> io::Result<()> {
let num_bits = self.bit_packer.num_bits(&self.block[..]);
self.write_skip_index.write_all(&[num_bits])?;
let written_len = self
.bit_packer
.compress(&self.block[..], &mut self.buffer, num_bits);
self.write_stream.write_all(&self.buffer[..written_len])?;
fn flush_block(&mut self) {
// encode the positions in the block
if self.block.is_empty() {
return;
}
if self.block.len() == COMPRESSION_BLOCK_SIZE {
let (bit_width, block_encoded): (u8, &[u8]) =
self.block_encoder.compress_block_unsorted(&self.block[..]);
self.bit_widths.push(bit_width);
self.positions_buffer.extend(block_encoded);
} else {
debug_assert!(self.block.len() < COMPRESSION_BLOCK_SIZE);
let block_vint_encoded = self.block_encoder.compress_vint_unsorted(&self.block[..]);
self.positions_buffer.extend_from_slice(block_vint_encoded);
}
self.block.clear();
if (self.num_ints % LONG_SKIP_INTERVAL) == 0u64 {
self.long_skips.push(self.write_stream.written_bytes());
}
}
/// Close the positions for the given term.
pub fn close_term(&mut self) -> io::Result<()> {
self.flush_block();
VInt(self.bit_widths.len() as u64).serialize(&mut self.positions_wrt)?;
self.positions_wrt.write_all(&self.bit_widths[..])?;
self.positions_wrt.write_all(&self.positions_buffer)?;
self.bit_widths.clear();
self.positions_buffer.clear();
Ok(())
}
/// Close the positions for this term and flushes the data.
pub fn close(mut self) -> io::Result<()> {
if !self.block.is_empty() {
self.block.resize(COMPRESSION_BLOCK_SIZE, 0u32);
self.flush_block()?;
}
for &long_skip in &self.long_skips {
long_skip.serialize(&mut self.write_skip_index)?;
}
(self.long_skips.len() as u32).serialize(&mut self.write_skip_index)?;
self.write_skip_index.flush()?;
self.write_stream.flush()?;
Ok(())
self.positions_wrt.flush()
}
}

View File

@@ -100,7 +100,7 @@ fn galloping(block_docs: &[u32], target: u32) -> usize {
#[derive(Clone, Copy, PartialEq)]
pub enum BlockSearcher {
#[cfg(target_arch = "x86_64")]
SSE2,
Sse2,
Scalar,
}
@@ -139,7 +139,7 @@ impl BlockSearcher {
pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, target: u32) -> usize {
#[cfg(target_arch = "x86_64")]
{
if self == BlockSearcher::SSE2 {
if self == BlockSearcher::Sse2 {
return sse2::linear_search_sse2_128(block_docs, target);
}
}
@@ -152,7 +152,7 @@ impl Default for BlockSearcher {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("sse2") {
return BlockSearcher::SSE2;
return BlockSearcher::Sse2;
}
}
BlockSearcher::Scalar
@@ -236,6 +236,6 @@ mod tests {
#[cfg(target_arch = "x86_64")]
#[test]
fn test_search_in_block_sse2() {
test_search_in_block_util(BlockSearcher::SSE2);
test_search_in_block_util(BlockSearcher::Sse2);
}
}

View File

@@ -8,7 +8,7 @@ use crate::postings::compression::{
AlignedBuffer, BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE,
};
use crate::postings::{BlockInfo, FreqReadingOption, SkipReader};
use crate::query::BM25Weight;
use crate::query::Bm25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
@@ -127,7 +127,7 @@ impl BlockSegmentPostings {
pub fn block_max_score(
&mut self,
fieldnorm_reader: &FieldNormReader,
bm25_weight: &BM25Weight,
bm25_weight: &Bm25Weight,
) -> Score {
if let Some(score) = self.block_max_score_cache {
return score;
@@ -212,14 +212,14 @@ impl BlockSegmentPostings {
/// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits.
///
/// This method is useful to run SSE2 linear search.
#[inline(always)]
#[inline]
pub(crate) fn docs_aligned(&self) -> &AlignedBuffer {
debug_assert!(self.block_is_loaded());
self.doc_decoder.output_aligned()
}
/// Return the document at index `idx` of the block.
#[inline(always)]
#[inline]
pub fn doc(&self, idx: usize) -> u32 {
self.doc_decoder.output(idx)
}

View File

@@ -167,6 +167,8 @@ pub trait VIntDecoder {
num_els: usize,
padding: u32,
) -> usize;
fn uncompress_vint_unsorted_until_end(&mut self, compressed_data: &[u8]);
}
impl VIntEncoder for BlockEncoder {
@@ -202,6 +204,11 @@ impl VIntDecoder for BlockDecoder {
self.output.0.iter_mut().for_each(|el| *el = padding);
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
}
fn uncompress_vint_unsorted_until_end(&mut self, compressed_data: &[u8]) {
let num_els = vint::uncompress_unsorted_until_end(compressed_data, &mut self.output.0);
self.output_len = num_els;
}
}
#[cfg(test)]

View File

@@ -1,4 +1,4 @@
#[inline(always)]
#[inline]
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
@@ -20,7 +20,7 @@ pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32)
&output[..byte_written]
}
#[inline(always)]
#[inline]
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
@@ -41,7 +41,7 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a
&output[..byte_written]
}
#[inline(always)]
#[inline]
pub fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
let mut read_byte = 0;
let mut result = offset;
@@ -61,15 +61,15 @@ pub fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32
read_byte
}
#[inline(always)]
#[inline]
pub(crate) fn uncompress_unsorted(compressed_data: &[u8], output_arr: &mut [u32]) -> usize {
let mut read_byte = 0;
let mut num_read_bytes = 0;
for output_mut in output_arr.iter_mut() {
let mut result = 0u32;
let mut shift = 0u32;
loop {
let cur_byte = compressed_data[read_byte];
read_byte += 1;
let cur_byte = compressed_data[num_read_bytes];
num_read_bytes += 1;
result += u32::from(cur_byte % 128u8) << shift;
if cur_byte & 128u8 != 0u8 {
break;
@@ -78,5 +78,31 @@ pub(crate) fn uncompress_unsorted(compressed_data: &[u8], output_arr: &mut [u32]
}
*output_mut = result;
}
read_byte
num_read_bytes
}
#[inline]
pub(crate) fn uncompress_unsorted_until_end(
compressed_data: &[u8],
output_arr: &mut [u32],
) -> usize {
let mut num_read_bytes = 0;
for (num_ints_written, output_mut) in output_arr.iter_mut().enumerate() {
if compressed_data.len() == num_read_bytes {
return num_ints_written;
}
let mut result = 0u32;
let mut shift = 0u32;
loop {
let cur_byte = compressed_data[num_read_bytes];
num_read_bytes += 1;
result += u32::from(cur_byte % 128u8) << shift;
if cur_byte & 128u8 != 0u8 {
break;
}
shift += 7;
}
*output_mut = result;
}
output_arr.len()
}

View File

@@ -68,13 +68,13 @@ pub mod tests {
field_serializer.new_term("abc".as_bytes(), 12u32)?;
for doc_id in 0u32..120u32 {
let delta_positions = vec![1, 2, 3, 2];
field_serializer.write_doc(doc_id, 4, &delta_positions)?;
field_serializer.write_doc(doc_id, 4, &delta_positions);
}
field_serializer.close_term()?;
mem::drop(field_serializer);
posting_serializer.close()?;
let read = segment.open_read(SegmentComponent::POSITIONS)?;
assert!(read.len() <= 140);
let read = segment.open_read(SegmentComponent::Positions)?;
assert_eq!(read.len(), 207);
Ok(())
}

View File

@@ -2,7 +2,7 @@ use super::stacker::{Addr, MemoryArena, TermHashMap};
use crate::fieldnorm::FieldNormReaders;
use crate::postings::recorder::{
BufferLender, NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder,
BufferLender, NothingRecorder, Recorder, TermFrequencyRecorder, TfAndPositionRecorder,
};
use crate::postings::UnorderedTermId;
use crate::postings::{FieldSerializer, InvertedIndexSerializer};
@@ -30,7 +30,7 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter>
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed()
}
IndexRecordOption::WithFreqsAndPositions => {
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
SpecializedPostingsWriter::<TfAndPositionRecorder>::new_boxed()
}
})
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
@@ -313,7 +313,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
let recorder: Rec = termdict_heap.read(addr);
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
serializer.new_term(&term_bytes[4..], term_doc_freq)?;
recorder.serialize(&mut buffer_lender, serializer, heap)?;
recorder.serialize(&mut buffer_lender, serializer, heap);
serializer.close_term()?;
}
Ok(())

View File

@@ -2,7 +2,6 @@ use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
use crate::common::{read_u32_vint, write_u32_vint};
use crate::postings::FieldSerializer;
use crate::DocId;
use std::io;
const POSITION_END: u32 = 0;
@@ -74,7 +73,7 @@ pub(crate) trait Recorder: Copy + 'static {
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer<'_>,
heap: &MemoryArena,
) -> io::Result<()>;
);
/// Returns the number of document containing this term.
///
/// Returns `None` if not available.
@@ -114,14 +113,13 @@ impl Recorder for NothingRecorder {
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer<'_>,
heap: &MemoryArena,
) -> io::Result<()> {
) {
let buffer = buffer_lender.lend_u8();
self.stack.read_to_end(heap, buffer);
// TODO avoid reading twice.
for doc in VInt32Reader::new(&buffer[..]) {
serializer.write_doc(doc as u32, 0u32, &[][..])?;
serializer.write_doc(doc as u32, 0u32, &[][..]);
}
Ok(())
}
fn term_doc_freq(&self) -> Option<u32> {
@@ -173,16 +171,14 @@ impl Recorder for TermFrequencyRecorder {
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer<'_>,
heap: &MemoryArena,
) -> io::Result<()> {
) {
let buffer = buffer_lender.lend_u8();
self.stack.read_to_end(heap, buffer);
let mut u32_it = VInt32Reader::new(&buffer[..]);
while let Some(doc) = u32_it.next() {
let term_freq = u32_it.next().unwrap_or(self.current_tf);
serializer.write_doc(doc as u32, term_freq, &[][..])?;
serializer.write_doc(doc as u32, term_freq, &[][..]);
}
Ok(())
}
fn term_doc_freq(&self) -> Option<u32> {
@@ -192,14 +188,14 @@ impl Recorder for TermFrequencyRecorder {
/// Recorder encoding term frequencies as well as positions.
#[derive(Clone, Copy)]
pub struct TFAndPositionRecorder {
pub struct TfAndPositionRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
term_doc_freq: u32,
}
impl Recorder for TFAndPositionRecorder {
impl Recorder for TfAndPositionRecorder {
fn new() -> Self {
TFAndPositionRecorder {
TfAndPositionRecorder {
stack: ExpUnrolledLinkedList::new(),
current_doc: u32::max_value(),
term_doc_freq: 0u32,
@@ -229,7 +225,7 @@ impl Recorder for TFAndPositionRecorder {
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer<'_>,
heap: &MemoryArena,
) -> io::Result<()> {
) {
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
self.stack.read_to_end(heap, buffer_u8);
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
@@ -248,9 +244,8 @@ impl Recorder for TFAndPositionRecorder {
}
}
}
serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions)?;
serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions);
}
Ok(())
}
fn term_doc_freq(&self) -> Option<u32> {

View File

@@ -204,7 +204,7 @@ impl DocSet for SegmentPostings {
}
/// Return the current document's `DocId`.
#[inline(always)]
#[inline]
fn doc(&self) -> DocId {
self.block_cursor.doc(self.cur)
}

View File

@@ -7,7 +7,7 @@ use crate::fieldnorm::FieldNormReader;
use crate::positions::PositionSerializer;
use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE};
use crate::postings::skip::SkipSerializer;
use crate::query::BM25Weight;
use crate::query::Bm25Weight;
use crate::schema::{Field, FieldEntry, FieldType};
use crate::schema::{IndexRecordOption, Schema};
use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
@@ -50,19 +50,17 @@ pub struct InvertedIndexSerializer {
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
positionsidx_write: CompositeWrite<WritePtr>,
schema: Schema,
}
impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> {
use crate::SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
use crate::SegmentComponent::{Positions, Postings, Terms};
let inv_index_serializer = InvertedIndexSerializer {
terms_write: CompositeWrite::wrap(segment.open_write(TERMS)?),
postings_write: CompositeWrite::wrap(segment.open_write(POSTINGS)?),
positions_write: CompositeWrite::wrap(segment.open_write(POSITIONS)?),
positionsidx_write: CompositeWrite::wrap(segment.open_write(POSITIONSSKIP)?),
terms_write: CompositeWrite::wrap(segment.open_write(Terms)?),
postings_write: CompositeWrite::wrap(segment.open_write(Postings)?),
positions_write: CompositeWrite::wrap(segment.open_write(Positions)?),
schema: segment.schema(),
};
Ok(inv_index_serializer)
@@ -82,7 +80,6 @@ impl InvertedIndexSerializer {
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
let positions_write = self.positions_write.for_field(field);
let positionsidx_write = self.positionsidx_write.for_field(field);
let field_type: FieldType = (*field_entry.field_type()).clone();
FieldSerializer::create(
&field_type,
@@ -90,7 +87,6 @@ impl InvertedIndexSerializer {
term_dictionary_write,
postings_write,
positions_write,
positionsidx_write,
fieldnorm_reader,
)
}
@@ -100,7 +96,6 @@ impl InvertedIndexSerializer {
self.terms_write.close()?;
self.postings_write.close()?;
self.positions_write.close()?;
self.positionsidx_write.close()?;
Ok(())
}
}
@@ -123,7 +118,6 @@ impl<'a> FieldSerializer<'a> {
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>,
positions_write: &'a mut CountingWriter<WritePtr>,
positionsidx_write: &'a mut CountingWriter<WritePtr>,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'a>> {
total_num_tokens.serialize(postings_write)?;
@@ -145,7 +139,7 @@ impl<'a> FieldSerializer<'a> {
let postings_serializer =
PostingsSerializer::new(postings_write, average_fieldnorm, mode, fieldnorm_reader);
let positions_serializer_opt = if mode.has_positions() {
Some(PositionSerializer::new(positions_write, positionsidx_write))
Some(PositionSerializer::new(positions_write))
} else {
None
};
@@ -161,17 +155,17 @@ impl<'a> FieldSerializer<'a> {
}
fn current_term_info(&self) -> TermInfo {
let positions_idx =
let positions_start =
if let Some(positions_serializer) = self.positions_serializer_opt.as_ref() {
positions_serializer.positions_idx()
positions_serializer.written_bytes()
} else {
0u64
};
let addr = self.postings_serializer.addr() as usize;
} as usize;
let addr = self.postings_serializer.written_bytes() as usize;
TermInfo {
doc_freq: 0,
postings_range: addr..addr,
positions_idx,
positions_range: positions_start..positions_start,
}
}
@@ -204,18 +198,12 @@ impl<'a> FieldSerializer<'a> {
///
/// Term frequencies and positions may be ignored by the serializer depending
/// on the configuration of the field in the `Schema`.
pub fn write_doc(
&mut self,
doc_id: DocId,
term_freq: u32,
position_deltas: &[u32],
) -> io::Result<()> {
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) {
self.current_term_info.doc_freq += 1;
self.postings_serializer.write_doc(doc_id, term_freq);
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
positions_serializer.write_all(position_deltas)?;
positions_serializer.write_positions_delta(position_deltas);
}
Ok(())
}
/// Finish the serialization for this term postings.
@@ -226,7 +214,14 @@ impl<'a> FieldSerializer<'a> {
if self.term_open {
self.postings_serializer
.close_term(self.current_term_info.doc_freq)?;
self.current_term_info.postings_range.end = self.postings_serializer.addr() as usize;
self.current_term_info.postings_range.end =
self.postings_serializer.written_bytes() as usize;
if let Some(positions_serializer) = self.positions_serializer_opt.as_mut() {
positions_serializer.close_term()?;
self.current_term_info.positions_range.end =
positions_serializer.written_bytes() as usize;
}
self.term_dictionary_builder
.insert_value(&self.current_term_info)?;
self.term_open = false;
@@ -307,7 +302,7 @@ pub struct PostingsSerializer<W: Write> {
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
bm25_weight: Option<BM25Weight>,
bm25_weight: Option<Bm25Weight>,
num_docs: u32, // Number of docs in the segment
avg_fieldnorm: Score, // Average number of term in the field for that segment.
@@ -347,7 +342,7 @@ impl<W: Write> PostingsSerializer<W> {
pub fn new_term(&mut self, term_doc_freq: u32) {
if self.mode.has_freq() && self.num_docs > 0 {
let bm25_weight = BM25Weight::for_one_term(
let bm25_weight = Bm25Weight::for_one_term(
term_doc_freq as u64,
self.num_docs as u64,
self.avg_fieldnorm,
@@ -457,7 +452,13 @@ impl<W: Write> PostingsSerializer<W> {
Ok(())
}
fn addr(&self) -> u64 {
/// Returns the number of bytes written in the postings write object
/// at this point.
/// When called before writing the postings of a term, this value is used as
/// start offset.
/// When called after writing the postings of a term, this value is used as a
/// end offset.
fn written_bytes(&self) -> u64 {
self.output_write.written_bytes() as u64
}

View File

@@ -2,16 +2,16 @@ use std::convert::TryInto;
use crate::directory::OwnedBytes;
use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
use crate::query::BM25Weight;
use crate::query::Bm25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
#[inline(always)]
#[inline]
fn encode_block_wand_max_tf(max_tf: u32) -> u8 {
max_tf.min(u8::MAX as u32) as u8
}
#[inline(always)]
#[inline]
fn decode_block_wand_max_tf(max_tf_code: u8) -> u32 {
if max_tf_code == u8::MAX {
u32::MAX
@@ -20,12 +20,12 @@ fn decode_block_wand_max_tf(max_tf_code: u8) -> u32 {
}
}
#[inline(always)]
#[inline]
fn read_u32(data: &[u8]) -> u32 {
u32::from_le_bytes(data[..4].try_into().unwrap())
}
#[inline(always)]
#[inline]
fn write_u32(val: u32, buf: &mut Vec<u8>) {
buf.extend_from_slice(&val.to_le_bytes());
}
@@ -144,7 +144,7 @@ impl SkipReader {
//
// The block max score is available for all full bitpacked block,
// but no available for the last VInt encoded incomplete block.
pub fn block_max_score(&self, bm25_weight: &BM25Weight) -> Option<Score> {
pub fn block_max_score(&self, bm25_weight: &Bm25Weight) -> Option<Score> {
match self.block_info {
BlockInfo::BitPacked {
block_wand_fieldnorm_id,
@@ -163,7 +163,7 @@ impl SkipReader {
self.position_offset
}
#[inline(always)]
#[inline]
pub fn byte_offset(&self) -> usize {
self.byte_offset
}

View File

@@ -147,7 +147,7 @@ impl ExpUnrolledLinkedList {
}
}
#[inline(always)]
#[inline]
pub fn writer<'a>(&'a mut self, heap: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> {
ExpUnrolledLinkedListWriter { eull: self, heap }
}

View File

@@ -132,7 +132,7 @@ impl MemoryArena {
self.pages[addr.page_id()].slice_from(addr.page_local_addr())
}
#[inline(always)]
#[inline]
pub fn slice_mut(&mut self, addr: Addr, len: usize) -> &mut [u8] {
self.pages[addr.page_id()].slice_mut(addr.page_local_addr(), len)
}
@@ -162,7 +162,7 @@ impl Page {
}
}
#[inline(always)]
#[inline]
fn is_available(&self, len: usize) -> bool {
len + self.len <= PAGE_SIZE
}

View File

@@ -118,7 +118,7 @@ impl TermHashMap {
self.table.len() < self.occupied.len() * 3
}
#[inline(always)]
#[inline]
fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
let data = self.heap.slice_from(addr);
let key_bytes_len = NativeEndian::read_u16(data) as usize;
@@ -126,7 +126,7 @@ impl TermHashMap {
(key_bytes, addr.offset(2u32 + key_bytes_len as u32))
}
#[inline(always)]
#[inline]
fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
let (stored_key, value_addr) = self.get_key_value(addr);
if stored_key == target_key {

View File

@@ -11,8 +11,8 @@ pub struct TermInfo {
pub doc_freq: u32,
/// Byte range of the posting list within the postings (`.idx`) file.
pub postings_range: Range<usize>,
/// Start offset of the first block within the position (`.pos`) file.
pub positions_idx: u64,
/// Byte range of the positions of this terms in the positions (`.pos`) file.
pub positions_range: Range<usize>,
}
impl TermInfo {
@@ -21,6 +21,12 @@ impl TermInfo {
assert!(num_bytes <= std::u32::MAX as usize);
num_bytes as u32
}
pub(crate) fn positions_num_bytes(&self) -> u32 {
let num_bytes = self.positions_range.len();
assert!(num_bytes <= std::u32::MAX as usize);
num_bytes as u32
}
}
impl FixedSize for TermInfo {
@@ -28,7 +34,7 @@ impl FixedSize for TermInfo {
/// This is large, but in practise, `TermInfo` are encoded in blocks and
/// only the first `TermInfo` of a block is serialized uncompressed.
/// The subsequent `TermInfo` are delta encoded and bitpacked.
const SIZE_IN_BYTES: usize = 2 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES;
const SIZE_IN_BYTES: usize = 3 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES;
}
impl BinarySerializable for TermInfo {
@@ -36,20 +42,23 @@ impl BinarySerializable for TermInfo {
self.doc_freq.serialize(writer)?;
(self.postings_range.start as u64).serialize(writer)?;
self.posting_num_bytes().serialize(writer)?;
self.positions_idx.serialize(writer)?;
(self.positions_range.start as u64).serialize(writer)?;
self.positions_num_bytes().serialize(writer)?;
Ok(())
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let doc_freq = u32::deserialize(reader)?;
let postings_start_offset = u64::deserialize(reader)? as usize;
let postings_num_bytes = u32::deserialize(reader)?;
let postings_end_offset = postings_start_offset + u64::from(postings_num_bytes) as usize;
let positions_idx = u64::deserialize(reader)?;
let postings_num_bytes = u32::deserialize(reader)? as usize;
let postings_end_offset = postings_start_offset + postings_num_bytes;
let positions_start_offset = u64::deserialize(reader)? as usize;
let positions_num_bytes = u32::deserialize(reader)? as usize;
let positions_end_offset = positions_start_offset + positions_num_bytes;
Ok(TermInfo {
doc_freq,
postings_range: postings_start_offset..postings_end_offset,
positions_idx,
positions_range: positions_start_offset..positions_end_offset,
})
}
}
@@ -60,6 +69,8 @@ mod tests {
use super::TermInfo;
use crate::common::test::fixed_size_test;
// TODO add serialize/deserialize test for terminfo
#[test]
fn test_fixed_size() {
fixed_size_test::<TermInfo>();

View File

@@ -29,22 +29,22 @@ fn compute_tf_cache(average_fieldnorm: Score) -> [Score; 256] {
}
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
pub struct BM25Params {
pub struct Bm25Params {
pub idf: Score,
pub avg_fieldnorm: Score,
}
#[derive(Clone)]
pub struct BM25Weight {
pub struct Bm25Weight {
idf_explain: Explanation,
weight: Score,
cache: [Score; 256],
average_fieldnorm: Score,
}
impl BM25Weight {
pub fn boost_by(&self, boost: Score) -> BM25Weight {
BM25Weight {
impl Bm25Weight {
pub fn boost_by(&self, boost: Score) -> Bm25Weight {
Bm25Weight {
idf_explain: self.idf_explain.clone(),
weight: self.weight * boost,
cache: self.cache,
@@ -52,8 +52,8 @@ impl BM25Weight {
}
}
pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> crate::Result<BM25Weight> {
assert!(!terms.is_empty(), "BM25 requires at least one term");
pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> crate::Result<Bm25Weight> {
assert!(!terms.is_empty(), "Bm25 requires at least one term");
let field = terms[0].field();
for term in &terms[1..] {
assert_eq!(
@@ -74,7 +74,7 @@ impl BM25Weight {
if terms.len() == 1 {
let term_doc_freq = searcher.doc_freq(&terms[0])?;
Ok(BM25Weight::for_one_term(
Ok(Bm25Weight::for_one_term(
term_doc_freq,
total_num_docs,
average_fieldnorm,
@@ -86,7 +86,7 @@ impl BM25Weight {
idf_sum += idf(term_doc_freq, total_num_docs);
}
let idf_explain = Explanation::new("idf", idf_sum);
Ok(BM25Weight::new(idf_explain, average_fieldnorm))
Ok(Bm25Weight::new(idf_explain, average_fieldnorm))
}
}
@@ -94,7 +94,7 @@ impl BM25Weight {
term_doc_freq: u64,
total_num_docs: u64,
avg_fieldnorm: Score,
) -> BM25Weight {
) -> Bm25Weight {
let idf = idf(term_doc_freq, total_num_docs);
let mut idf_explain =
Explanation::new("idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))", idf);
@@ -103,12 +103,12 @@ impl BM25Weight {
term_doc_freq as Score,
);
idf_explain.add_const("N, total number of docs", total_num_docs as Score);
BM25Weight::new(idf_explain, avg_fieldnorm)
Bm25Weight::new(idf_explain, avg_fieldnorm)
}
pub(crate) fn new(idf_explain: Explanation, average_fieldnorm: Score) -> BM25Weight {
pub(crate) fn new(idf_explain: Explanation, average_fieldnorm: Score) -> Bm25Weight {
let weight = idf_explain.value() * (1.0 + K1);
BM25Weight {
Bm25Weight {
idf_explain,
weight,
cache: compute_tf_cache(average_fieldnorm),
@@ -116,7 +116,7 @@ impl BM25Weight {
}
}
#[inline(always)]
#[inline]
pub fn score(&self, fieldnorm_id: u8, term_freq: u32) -> Score {
self.weight * self.tf_factor(fieldnorm_id, term_freq)
}
@@ -125,7 +125,7 @@ impl BM25Weight {
self.score(255u8, 2_013_265_944)
}
#[inline(always)]
#[inline]
pub(crate) fn tf_factor(&self, fieldnorm_id: u8, term_freq: u32) -> Score {
let term_freq = term_freq as Score;
let norm = self.cache[fieldnorm_id as usize];

View File

@@ -238,7 +238,7 @@ mod tests {
use crate::query::score_combiner::SumCombiner;
use crate::query::term_query::TermScorer;
use crate::query::Union;
use crate::query::{BM25Weight, Scorer};
use crate::query::{Bm25Weight, Scorer};
use crate::{DocId, DocSet, Score, TERMINATED};
use proptest::prelude::*;
use std::cmp::Ordering;
@@ -393,7 +393,7 @@ mod tests {
let term_scorers: Vec<TermScorer> = postings_lists_expanded
.iter()
.map(|postings| {
let bm25_weight = BM25Weight::for_one_term(
let bm25_weight = Bm25Weight::for_one_term(
postings.len() as u64,
max_doc as u64,
average_fieldnorm,

View File

@@ -3,7 +3,7 @@ use crate::query::Scorer;
use crate::DocId;
use crate::Score;
#[inline(always)]
#[inline]
fn is_within<TDocSetExclude: DocSet>(docset: &mut TDocSetExclude, doc: DocId) -> bool {
docset.doc() <= doc && docset.seek(doc) == doc
}

View File

@@ -8,9 +8,9 @@ use std::collections::HashMap;
use std::ops::Range;
use tantivy_fst::Automaton;
pub(crate) struct DFAWrapper(pub DFA);
pub(crate) struct DfaWrapper(pub DFA);
impl Automaton for DFAWrapper {
impl Automaton for DfaWrapper {
type State = u32;
fn start(&self) -> Self::State {
@@ -127,7 +127,7 @@ impl FuzzyTermQuery {
}
}
fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DFAWrapper>> {
fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DfaWrapper>> {
// LEV_BUILDER is a HashMap, whose `get` method returns an Option
match LEV_BUILDER.get(&(self.distance, false)) {
// Unwrap the option and build the Ok(AutomatonWeight)
@@ -139,7 +139,7 @@ impl FuzzyTermQuery {
};
Ok(AutomatonWeight::new(
self.term.field(),
DFAWrapper(automaton),
DfaWrapper(automaton),
))
}
None => Err(InvalidArgument(format!(

View File

@@ -26,7 +26,7 @@ mod weight;
mod vec_docset;
pub(crate) mod score_combiner;
pub(crate) use self::bm25::BM25Weight;
pub(crate) use self::bm25::Bm25Weight;
pub use self::intersection::Intersection;
pub use self::union::Union;
@@ -42,7 +42,7 @@ pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
pub use self::exclude::Exclude;
pub use self::explanation::Explanation;
#[cfg(test)]
pub(crate) use self::fuzzy_query::DFAWrapper;
pub(crate) use self::fuzzy_query::DfaWrapper;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::intersect_scorers;
pub use self::phrase_query::PhraseQuery;

View File

@@ -1,6 +1,6 @@
use super::PhraseWeight;
use crate::core::searcher::Searcher;
use crate::query::bm25::BM25Weight;
use crate::query::bm25::Bm25Weight;
use crate::query::Query;
use crate::query::Weight;
use crate::schema::IndexRecordOption;
@@ -95,7 +95,7 @@ impl PhraseQuery {
)));
}
let terms = self.phrase_terms();
let bm25_weight = BM25Weight::for_terms(searcher, &terms)?;
let bm25_weight = Bm25Weight::for_terms(searcher, &terms)?;
Ok(PhraseWeight::new(
self.phrase_terms.clone(),
bm25_weight,

View File

@@ -1,7 +1,7 @@
use crate::docset::{DocSet, TERMINATED};
use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::query::bm25::BM25Weight;
use crate::query::bm25::Bm25Weight;
use crate::query::{Intersection, Scorer};
use crate::{DocId, Score};
use std::cmp::Ordering;
@@ -49,7 +49,7 @@ pub struct PhraseScorer<TPostings: Postings> {
right: Vec<u32>,
phrase_count: u32,
fieldnorm_reader: FieldNormReader,
similarity_weight: BM25Weight,
similarity_weight: Bm25Weight,
score_needed: bool,
}
@@ -133,7 +133,7 @@ fn intersection(left: &mut [u32], right: &[u32]) -> usize {
impl<TPostings: Postings> PhraseScorer<TPostings> {
pub fn new(
term_postings: Vec<(usize, TPostings)>,
similarity_weight: BM25Weight,
similarity_weight: Bm25Weight,
fieldnorm_reader: FieldNormReader,
score_needed: bool,
) -> PhraseScorer<TPostings> {

View File

@@ -2,7 +2,7 @@ use super::PhraseScorer;
use crate::core::SegmentReader;
use crate::fieldnorm::FieldNormReader;
use crate::postings::SegmentPostings;
use crate::query::bm25::BM25Weight;
use crate::query::bm25::Bm25Weight;
use crate::query::explanation::does_not_match;
use crate::query::Scorer;
use crate::query::Weight;
@@ -14,7 +14,7 @@ use crate::{DocId, DocSet};
pub struct PhraseWeight {
phrase_terms: Vec<(usize, Term)>,
similarity_weight: BM25Weight,
similarity_weight: Bm25Weight,
score_needed: bool,
}
@@ -22,7 +22,7 @@ impl PhraseWeight {
/// Creates a new phrase weight.
pub fn new(
phrase_terms: Vec<(usize, Term)>,
similarity_weight: BM25Weight,
similarity_weight: Bm25Weight,
score_needed: bool,
) -> PhraseWeight {
PhraseWeight {

View File

@@ -19,18 +19,18 @@ pub enum LogicalLiteral {
All,
}
pub enum LogicalAST {
Clause(Vec<(Occur, LogicalAST)>),
pub enum LogicalAst {
Clause(Vec<(Occur, LogicalAst)>),
Leaf(Box<LogicalLiteral>),
Boost(Box<LogicalAST>, Score),
Boost(Box<LogicalAst>, Score),
}
impl LogicalAST {
pub fn boost(self, boost: Score) -> LogicalAST {
impl LogicalAst {
pub fn boost(self, boost: Score) -> LogicalAst {
if (boost - 1.0).abs() < Score::EPSILON {
self
} else {
LogicalAST::Boost(Box::new(self), boost)
LogicalAst::Boost(Box::new(self), boost)
}
}
}
@@ -43,10 +43,10 @@ fn occur_letter(occur: Occur) -> &'static str {
}
}
impl fmt::Debug for LogicalAST {
impl fmt::Debug for LogicalAst {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
match *self {
LogicalAST::Clause(ref clause) => {
LogicalAst::Clause(ref clause) => {
if clause.is_empty() {
write!(formatter, "<emptyclause>")?;
} else {
@@ -59,15 +59,15 @@ impl fmt::Debug for LogicalAST {
}
Ok(())
}
LogicalAST::Boost(ref ast, boost) => write!(formatter, "{:?}^{}", ast, boost),
LogicalAST::Leaf(ref literal) => write!(formatter, "{:?}", literal),
LogicalAst::Boost(ref ast, boost) => write!(formatter, "{:?}^{}", ast, boost),
LogicalAst::Leaf(ref literal) => write!(formatter, "{:?}", literal),
}
}
}
impl From<LogicalLiteral> for LogicalAST {
fn from(literal: LogicalLiteral) -> LogicalAST {
LogicalAST::Leaf(Box::new(literal))
impl From<LogicalLiteral> for LogicalAst {
fn from(literal: LogicalLiteral) -> LogicalAst {
LogicalAst::Leaf(Box::new(literal))
}
}

View File

@@ -18,7 +18,7 @@ use std::collections::HashMap;
use std::num::{ParseFloatError, ParseIntError};
use std::ops::Bound;
use std::str::FromStr;
use tantivy_query_grammar::{UserInputAST, UserInputBound, UserInputLeaf};
use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf};
/// Possible error that may happen when parsing a query.
#[derive(Debug, PartialEq, Eq, Error)]
@@ -28,7 +28,7 @@ pub enum QueryParserError {
SyntaxError,
/// `FieldDoesNotExist(field_name: String)`
/// The query references a field that is not in the schema
#[error("File does not exists: '{0:?}'")]
#[error("Field does not exists: '{0:?}'")]
FieldDoesNotExist(String),
/// The query contains a term for a `u64` or `i64`-field, but the value
/// is neither.
@@ -91,9 +91,9 @@ impl From<chrono::ParseError> for QueryParserError {
/// Recursively remove empty clause from the AST
///
/// Returns `None` iff the `logical_ast` ended up being empty.
fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
match logical_ast {
LogicalAST::Clause(children) => {
LogicalAst::Clause(children) => {
let trimmed_children = children
.into_iter()
.flat_map(|(occur, child)| {
@@ -103,7 +103,7 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
if trimmed_children.is_empty() {
None
} else {
Some(LogicalAST::Clause(trimmed_children))
Some(LogicalAst::Clause(trimmed_children))
}
}
_ => Some(logical_ast),
@@ -178,11 +178,11 @@ pub struct QueryParser {
boost: HashMap<Field, Score>,
}
fn all_negative(ast: &LogicalAST) -> bool {
fn all_negative(ast: &LogicalAst) -> bool {
match ast {
LogicalAST::Leaf(_) => false,
LogicalAST::Boost(ref child_ast, _) => all_negative(&*child_ast),
LogicalAST::Clause(children) => children
LogicalAst::Leaf(_) => false,
LogicalAst::Boost(ref child_ast, _) => all_negative(&*child_ast),
LogicalAst::Clause(children) => children
.iter()
.all(|(ref occur, child)| (*occur == Occur::MustNot) || all_negative(child)),
}
@@ -251,7 +251,7 @@ impl QueryParser {
}
/// Parse the user query into an AST.
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAst, QueryParserError> {
let user_input_ast =
tantivy_query_grammar::parse_query(query).map_err(|_| QueryParserError::SyntaxError)?;
self.compute_logical_ast(user_input_ast)
@@ -265,10 +265,10 @@ impl QueryParser {
fn compute_logical_ast(
&self,
user_input_ast: UserInputAST,
) -> Result<LogicalAST, QueryParserError> {
user_input_ast: UserInputAst,
) -> Result<LogicalAst, QueryParserError> {
let ast = self.compute_logical_ast_with_occur(user_input_ast)?;
if let LogicalAST::Clause(children) = &ast {
if let LogicalAst::Clause(children) = &ast {
if children.is_empty() {
return Ok(ast);
}
@@ -429,24 +429,24 @@ impl QueryParser {
fn compute_logical_ast_with_occur(
&self,
user_input_ast: UserInputAST,
) -> Result<LogicalAST, QueryParserError> {
user_input_ast: UserInputAst,
) -> Result<LogicalAst, QueryParserError> {
match user_input_ast {
UserInputAST::Clause(sub_queries) => {
UserInputAst::Clause(sub_queries) => {
let default_occur = self.default_occur();
let mut logical_sub_queries: Vec<(Occur, LogicalAST)> = Vec::new();
let mut logical_sub_queries: Vec<(Occur, LogicalAst)> = Vec::new();
for (occur_opt, sub_ast) in sub_queries {
let sub_ast = self.compute_logical_ast_with_occur(sub_ast)?;
let occur = occur_opt.unwrap_or(default_occur);
logical_sub_queries.push((occur, sub_ast));
}
Ok(LogicalAST::Clause(logical_sub_queries))
Ok(LogicalAst::Clause(logical_sub_queries))
}
UserInputAST::Boost(ast, boost) => {
UserInputAst::Boost(ast, boost) => {
let ast = self.compute_logical_ast_with_occur(*ast)?;
Ok(ast.boost(boost as Score))
}
UserInputAST::Leaf(leaf) => self.compute_logical_ast_from_leaf(*leaf),
UserInputAst::Leaf(leaf) => self.compute_logical_ast_from_leaf(*leaf),
}
}
@@ -457,7 +457,7 @@ impl QueryParser {
fn compute_logical_ast_from_leaf(
&self,
leaf: UserInputLeaf,
) -> Result<LogicalAST, QueryParserError> {
) -> Result<LogicalAst, QueryParserError> {
match leaf {
UserInputLeaf::Literal(literal) => {
let term_phrases: Vec<(Field, String)> = match literal.field_name {
@@ -476,22 +476,22 @@ impl QueryParser {
}
}
};
let mut asts: Vec<LogicalAST> = Vec::new();
let mut asts: Vec<LogicalAst> = Vec::new();
for (field, phrase) in term_phrases {
if let Some(ast) = self.compute_logical_ast_for_leaf(field, &phrase)? {
// Apply some field specific boost defined at the query parser level.
let boost = self.field_boost(field);
asts.push(LogicalAST::Leaf(Box::new(ast)).boost(boost));
asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost));
}
}
let result_ast: LogicalAST = if asts.len() == 1 {
let result_ast: LogicalAst = if asts.len() == 1 {
asts.into_iter().next().unwrap()
} else {
LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
LogicalAst::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
};
Ok(result_ast)
}
UserInputLeaf::All => Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::All))),
UserInputLeaf::All => Ok(LogicalAst::Leaf(Box::new(LogicalLiteral::All))),
UserInputLeaf::Range {
field,
lower,
@@ -504,7 +504,7 @@ impl QueryParser {
let boost = self.field_boost(field);
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let logical_ast = LogicalAST::Leaf(Box::new(LogicalLiteral::Range {
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
field,
value_type,
lower: self.resolve_bound(field, &lower)?,
@@ -516,7 +516,7 @@ impl QueryParser {
let result_ast = if clauses.len() == 1 {
clauses.pop().unwrap()
} else {
LogicalAST::Clause(
LogicalAst::Clause(
clauses
.into_iter()
.map(|clause| (Occur::Should, clause))
@@ -547,9 +547,9 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<dyn Query> {
}
}
fn convert_to_query(logical_ast: LogicalAST) -> Box<dyn Query> {
fn convert_to_query(logical_ast: LogicalAst) -> Box<dyn Query> {
match trim_ast(logical_ast) {
Some(LogicalAST::Clause(trimmed_clause)) => {
Some(LogicalAst::Clause(trimmed_clause)) => {
let occur_subqueries = trimmed_clause
.into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
@@ -560,10 +560,10 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<dyn Query> {
);
Box::new(BooleanQuery::new(occur_subqueries))
}
Some(LogicalAST::Leaf(trimmed_logical_literal)) => {
Some(LogicalAst::Leaf(trimmed_logical_literal)) => {
convert_literal_to_query(*trimmed_logical_literal)
}
Some(LogicalAST::Boost(ast, boost)) => {
Some(LogicalAst::Boost(ast, boost)) => {
let query = convert_to_query(*ast);
let boosted_query = BoostQuery::new(query, boost);
Box::new(boosted_query)
@@ -632,7 +632,7 @@ mod test {
fn parse_query_to_logical_ast(
query: &str,
default_conjunction: bool,
) -> Result<LogicalAST, QueryParserError> {
) -> Result<LogicalAst, QueryParserError> {
let mut query_parser = make_query_parser();
if default_conjunction {
query_parser.set_conjunction_by_default();

View File

@@ -1,5 +1,5 @@
use super::term_weight::TermWeight;
use crate::query::bm25::BM25Weight;
use crate::query::bm25::Bm25Weight;
use crate::query::Weight;
use crate::query::{Explanation, Query};
use crate::schema::IndexRecordOption;
@@ -102,10 +102,10 @@ impl TermQuery {
}
let bm25_weight;
if scoring_enabled {
bm25_weight = BM25Weight::for_terms(searcher, &[term])?;
bm25_weight = Bm25Weight::for_terms(searcher, &[term])?;
} else {
bm25_weight =
BM25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32);
Bm25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32);
}
let index_record_option = if scoring_enabled {
self.index_record_option

View File

@@ -6,20 +6,20 @@ use crate::Score;
use crate::fieldnorm::FieldNormReader;
use crate::postings::SegmentPostings;
use crate::postings::{FreqReadingOption, Postings};
use crate::query::bm25::BM25Weight;
use crate::query::bm25::Bm25Weight;
#[derive(Clone)]
pub struct TermScorer {
postings: SegmentPostings,
fieldnorm_reader: FieldNormReader,
similarity_weight: BM25Weight,
similarity_weight: Bm25Weight,
}
impl TermScorer {
pub fn new(
postings: SegmentPostings,
fieldnorm_reader: FieldNormReader,
similarity_weight: BM25Weight,
similarity_weight: Bm25Weight,
) -> TermScorer {
TermScorer {
postings,
@@ -36,7 +36,7 @@ impl TermScorer {
pub fn create_for_test(
doc_and_tfs: &[(DocId, u32)],
fieldnorms: &[u32],
similarity_weight: BM25Weight,
similarity_weight: Bm25Weight,
) -> TermScorer {
assert!(!doc_and_tfs.is_empty());
assert!(
@@ -131,7 +131,7 @@ mod tests {
use crate::merge_policy::NoMergePolicy;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::term_query::TermScorer;
use crate::query::{BM25Weight, Scorer, TermQuery};
use crate::query::{Bm25Weight, Scorer, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TEXT};
use crate::Score;
use crate::{assert_nearly_equals, Index, Searcher, SegmentId, Term};
@@ -141,7 +141,7 @@ mod tests {
#[test]
fn test_term_scorer_max_score() -> crate::Result<()> {
let bm25_weight = BM25Weight::for_one_term(3, 6, 10.0);
let bm25_weight = Bm25Weight::for_one_term(3, 6, 10.0);
let mut term_scorer = TermScorer::create_for_test(
&[(2, 3), (3, 12), (7, 8)],
&[0, 0, 10, 12, 0, 0, 0, 100],
@@ -167,7 +167,7 @@ mod tests {
#[test]
fn test_term_scorer_shallow_advance() -> crate::Result<()> {
let bm25_weight = BM25Weight::for_one_term(300, 1024, 10.0);
let bm25_weight = Bm25Weight::for_one_term(300, 1024, 10.0);
let mut doc_and_tfs = vec![];
for i in 0u32..300u32 {
let doc = i * 10;
@@ -205,7 +205,7 @@ mod tests {
// Average fieldnorm is over the entire index,
// not necessarily the docs that are in the posting list.
// For this reason we multiply by 1.1 to make a realistic value.
let bm25_weight = BM25Weight::for_one_term(term_doc_freq as u64,
let bm25_weight = Bm25Weight::for_one_term(term_doc_freq as u64,
term_doc_freq as u64 * 10u64,
average_fieldnorm);
@@ -240,7 +240,7 @@ mod tests {
doc_tfs.push((258, 1u32));
let fieldnorms: Vec<u32> = std::iter::repeat(20u32).take(300).collect();
let bm25_weight = BM25Weight::for_one_term(10, 129, 20.0);
let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
assert_nearly_equals!(docs.block_max_score(), 2.5161593);
docs.shallow_seek(135);

View File

@@ -3,7 +3,7 @@ use crate::core::SegmentReader;
use crate::docset::DocSet;
use crate::fieldnorm::FieldNormReader;
use crate::postings::SegmentPostings;
use crate::query::bm25::BM25Weight;
use crate::query::bm25::Bm25Weight;
use crate::query::explanation::does_not_match;
use crate::query::weight::for_each_scorer;
use crate::query::Weight;
@@ -15,7 +15,7 @@ use crate::{DocId, Score};
pub struct TermWeight {
term: Term,
index_record_option: IndexRecordOption,
similarity_weight: BM25Weight,
similarity_weight: Bm25Weight,
scoring_enabled: bool,
}
@@ -88,7 +88,7 @@ impl TermWeight {
pub fn new(
term: Term,
index_record_option: IndexRecordOption,
similarity_weight: BM25Weight,
similarity_weight: Bm25Weight,
scoring_enabled: bool,
) -> TermWeight {
TermWeight {

View File

@@ -309,7 +309,7 @@ impl Schema {
} else {
format!("{:?}...", &doc_json[0..20])
};
DocParsingError::NotJSON(doc_json_sample)
DocParsingError::NotJson(doc_json_sample)
})?;
let mut doc = Document::default();
@@ -394,7 +394,7 @@ impl<'de> Deserialize<'de> for Schema {
pub enum DocParsingError {
/// The payload given is not valid JSON.
#[error("The provided string is not valid JSON")]
NotJSON(String),
NotJson(String),
/// One of the value node could not be parsed.
#[error("The field '{0:?}' could not be parsed: {1:?}")]
ValueError(String, ValueParsingError),
@@ -408,7 +408,7 @@ mod tests {
use crate::schema::field_type::ValueParsingError;
use crate::schema::int_options::Cardinality::SingleValue;
use crate::schema::schema::DocParsingError::NotJSON;
use crate::schema::schema::DocParsingError::NotJson;
use crate::schema::*;
use matches::{assert_matches, matches};
use serde_json;
@@ -737,7 +737,7 @@ mod tests {
"count": 50,
}"#,
);
assert_matches!(json_err, Err(NotJSON(_)));
assert_matches!(json_err, Err(NotJson(_)));
}
}

View File

@@ -69,7 +69,6 @@ pub struct SegmentSpaceUsage {
termdict: PerFieldSpaceUsage,
postings: PerFieldSpaceUsage,
positions: PerFieldSpaceUsage,
positions_idx: PerFieldSpaceUsage,
fast_fields: PerFieldSpaceUsage,
fieldnorms: PerFieldSpaceUsage,
@@ -87,7 +86,6 @@ impl SegmentSpaceUsage {
termdict: PerFieldSpaceUsage,
postings: PerFieldSpaceUsage,
positions: PerFieldSpaceUsage,
positions_idx: PerFieldSpaceUsage,
fast_fields: PerFieldSpaceUsage,
fieldnorms: PerFieldSpaceUsage,
store: StoreSpaceUsage,
@@ -105,7 +103,6 @@ impl SegmentSpaceUsage {
termdict,
postings,
positions,
positions_idx,
fast_fields,
fieldnorms,
store,
@@ -122,14 +119,13 @@ impl SegmentSpaceUsage {
use self::ComponentSpaceUsage::*;
use crate::SegmentComponent::*;
match component {
POSTINGS => PerField(self.postings().clone()),
POSITIONS => PerField(self.positions().clone()),
POSITIONSSKIP => PerField(self.positions_skip_idx().clone()),
FASTFIELDS => PerField(self.fast_fields().clone()),
FIELDNORMS => PerField(self.fieldnorms().clone()),
TERMS => PerField(self.termdict().clone()),
STORE => Store(self.store().clone()),
DELETE => Basic(self.deletes()),
Postings => PerField(self.postings().clone()),
Positions => PerField(self.positions().clone()),
FastFields => PerField(self.fast_fields().clone()),
FieldNorms => PerField(self.fieldnorms().clone()),
Terms => PerField(self.termdict().clone()),
SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()),
Delete => Basic(self.deletes()),
}
}
@@ -153,11 +149,6 @@ impl SegmentSpaceUsage {
&self.positions
}
/// Space usage for positions skip idx
pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage {
&self.positions_idx
}
/// Space usage for fast fields
pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
&self.fast_fields
@@ -358,7 +349,6 @@ mod test {
expect_single_field(segment.termdict(), &name, 1, 512);
expect_single_field(segment.postings(), &name, 1, 512);
assert_eq!(0, segment.positions().total());
assert_eq!(0, segment.positions_skip_idx().total());
expect_single_field(segment.fast_fields(), &name, 1, 512);
expect_single_field(segment.fieldnorms(), &name, 1, 512);
// TODO: understand why the following fails
@@ -398,7 +388,6 @@ mod test {
expect_single_field(segment.termdict(), &name, 1, 512);
expect_single_field(segment.postings(), &name, 1, 512);
expect_single_field(segment.positions(), &name, 1, 512);
expect_single_field(segment.positions_skip_idx(), &name, 1, 512);
assert_eq!(0, segment.fast_fields().total());
expect_single_field(segment.fieldnorms(), &name, 1, 512);
// TODO: understand why the following fails
@@ -437,7 +426,6 @@ mod test {
assert_eq!(0, segment.termdict().total());
assert_eq!(0, segment.postings().total());
assert_eq!(0, segment.positions().total());
assert_eq!(0, segment.positions_skip_idx().total());
assert_eq!(0, segment.fast_fields().total());
assert_eq!(0, segment.fieldnorms().total());
assert!(segment.store().total() > 0);
@@ -483,7 +471,6 @@ mod test {
expect_single_field(segment_space_usage.termdict(), &name, 1, 512);
expect_single_field(segment_space_usage.postings(), &name, 1, 512);
assert_eq!(0, segment_space_usage.positions().total());
assert_eq!(0, segment_space_usage.positions_skip_idx().total());
assert_eq!(0, segment_space_usage.fast_fields().total());
expect_single_field(segment_space_usage.fieldnorms(), &name, 1, 512);
assert!(segment_space_usage.deletes() > 0);

View File

@@ -98,7 +98,7 @@ use self::compression_snap::{compress, decompress};
pub mod tests {
use super::*;
use crate::directory::{Directory, RAMDirectory, WritePtr};
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::schema::Document;
use crate::schema::FieldValue;
use crate::schema::Schema;
@@ -146,7 +146,7 @@ pub mod tests {
#[test]
fn test_store() -> crate::Result<()> {
let path = Path::new("store");
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
let store_wrt = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(store_wrt, 1_000);
let field_title = schema.get_field("title").unwrap();
@@ -172,7 +172,7 @@ mod bench {
use super::tests::write_lorem_ipsum_store;
use crate::directory::Directory;
use crate::directory::RAMDirectory;
use crate::directory::RamDirectory;
use crate::store::StoreReader;
use std::path::Path;
use test::Bencher;
@@ -180,7 +180,7 @@ mod bench {
#[bench]
#[cfg(feature = "mmap")]
fn bench_store_encode(b: &mut Bencher) {
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
let path = Path::new("store");
b.iter(|| {
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
@@ -190,7 +190,7 @@ mod bench {
#[bench]
fn bench_store_decode(b: &mut Bencher) {
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
let path = Path::new("store");
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
let store_file = directory.open_read(path).unwrap();

View File

@@ -124,7 +124,7 @@ mod tests {
use super::*;
use crate::schema::Document;
use crate::schema::Field;
use crate::{directory::RAMDirectory, store::tests::write_lorem_ipsum_store, Directory};
use crate::{directory::RamDirectory, store::tests::write_lorem_ipsum_store, Directory};
use std::path::Path;
fn get_text_field<'a>(doc: &'a Document, field: &'a Field) -> Option<&'a str> {
@@ -133,7 +133,7 @@ mod tests {
#[test]
fn test_store_lru_cache() -> crate::Result<()> {
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
let path = Path::new("store");
let writer = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(writer, 500);

View File

@@ -15,7 +15,7 @@ struct TermInfoBlockMeta {
ref_term_info: TermInfo,
doc_freq_nbits: u8,
postings_offset_nbits: u8,
positions_idx_nbits: u8,
positions_offset_nbits: u8,
}
impl BinarySerializable for TermInfoBlockMeta {
@@ -25,7 +25,7 @@ impl BinarySerializable for TermInfoBlockMeta {
write.write_all(&[
self.doc_freq_nbits,
self.postings_offset_nbits,
self.positions_idx_nbits,
self.positions_offset_nbits,
])?;
Ok(())
}
@@ -40,7 +40,7 @@ impl BinarySerializable for TermInfoBlockMeta {
ref_term_info,
doc_freq_nbits: buffer[0],
postings_offset_nbits: buffer[1],
positions_idx_nbits: buffer[2],
positions_offset_nbits: buffer[2],
})
}
}
@@ -52,7 +52,7 @@ impl FixedSize for TermInfoBlockMeta {
impl TermInfoBlockMeta {
fn num_bits(&self) -> u8 {
self.doc_freq_nbits + self.postings_offset_nbits + self.positions_idx_nbits
self.doc_freq_nbits + self.postings_offset_nbits + self.positions_offset_nbits
}
// Here inner_offset is the offset within the block, WITHOUT the first term_info.
@@ -63,23 +63,30 @@ impl TermInfoBlockMeta {
let num_bits = self.num_bits() as usize;
let posting_start_addr = num_bits * inner_offset;
// the stop offset is the start offset of the next term info.
let posting_stop_addr = posting_start_addr + num_bits;
let doc_freq_addr = posting_start_addr + self.postings_offset_nbits as usize;
let positions_idx_addr = doc_freq_addr + self.doc_freq_nbits as usize;
// the posting_start is the posting_start of the next term info.
let posting_end_addr = posting_start_addr + num_bits;
let positions_start_addr = posting_start_addr + self.postings_offset_nbits as usize;
// the position_end is the positions_start of the next term info.
let positions_end_addr = positions_start_addr + num_bits as usize;
let doc_freq_addr = positions_start_addr + self.positions_offset_nbits as usize;
let postings_start_offset = self.ref_term_info.postings_range.start
+ extract_bits(data, posting_start_addr, self.postings_offset_nbits) as usize;
let postings_end_offset = self.ref_term_info.postings_range.start
+ extract_bits(data, posting_stop_addr, self.postings_offset_nbits) as usize;
+ extract_bits(data, posting_end_addr, self.postings_offset_nbits) as usize;
let positions_start_offset = self.ref_term_info.positions_range.start
+ extract_bits(data, positions_start_addr, self.positions_offset_nbits) as usize;
let positions_end_offset = self.ref_term_info.positions_range.start
+ extract_bits(data, positions_end_addr, self.positions_offset_nbits) as usize;
let doc_freq = extract_bits(data, doc_freq_addr, self.doc_freq_nbits) as u32;
let positions_idx = self.ref_term_info.positions_idx
+ extract_bits(data, positions_idx_addr, self.positions_idx_nbits);
TermInfo {
doc_freq,
postings_range: postings_start_offset..postings_end_offset,
positions_idx,
positions_range: positions_start_offset..positions_end_offset,
}
}
}
@@ -167,14 +174,13 @@ fn bitpack_serialize<W: Write>(
write,
)?;
bit_packer.write(
u64::from(term_info.doc_freq),
term_info_block_meta.doc_freq_nbits,
term_info.positions_range.start as u64,
term_info_block_meta.positions_offset_nbits,
write,
)?;
bit_packer.write(
term_info.positions_idx,
term_info_block_meta.positions_idx_nbits,
u64::from(term_info.doc_freq),
term_info_block_meta.doc_freq_nbits,
write,
)?;
Ok(())
@@ -201,29 +207,29 @@ impl TermInfoStoreWriter {
};
let postings_end_offset =
last_term_info.postings_range.end - ref_term_info.postings_range.start;
let positions_end_offset =
last_term_info.positions_range.end - ref_term_info.positions_range.start;
for term_info in &mut self.term_infos[1..] {
term_info.postings_range.start -= ref_term_info.postings_range.start;
term_info.positions_idx -= ref_term_info.positions_idx;
term_info.positions_range.start -= ref_term_info.positions_range.start;
}
let mut max_doc_freq: u32 = 0u32;
let max_postings_offset: usize = postings_end_offset;
let max_positions_idx: u64 = last_term_info.positions_idx;
for term_info in &self.term_infos[1..] {
max_doc_freq = cmp::max(max_doc_freq, term_info.doc_freq);
}
let max_doc_freq_nbits: u8 = compute_num_bits(u64::from(max_doc_freq));
let max_postings_offset_nbits = compute_num_bits(max_postings_offset as u64);
let max_positions_idx_nbits = compute_num_bits(max_positions_idx);
let max_postings_offset_nbits = compute_num_bits(postings_end_offset as u64);
let max_positions_offset_nbits = compute_num_bits(positions_end_offset as u64);
let term_info_block_meta = TermInfoBlockMeta {
offset: self.buffer_term_infos.len() as u64,
ref_term_info,
doc_freq_nbits: max_doc_freq_nbits,
postings_offset_nbits: max_postings_offset_nbits,
positions_idx_nbits: max_positions_idx_nbits,
positions_offset_nbits: max_positions_offset_nbits,
};
term_info_block_meta.serialize(&mut self.buffer_block_metas)?;
@@ -236,11 +242,17 @@ impl TermInfoStoreWriter {
)?;
}
// We still need to serialize the end offset for postings & positions.
bit_packer.write(
postings_end_offset as u64,
term_info_block_meta.postings_offset_nbits,
&mut self.buffer_term_infos,
)?;
bit_packer.write(
positions_end_offset as u64,
term_info_block_meta.positions_offset_nbits,
&mut self.buffer_term_infos,
)?;
// Block need end up at the end of a byte.
bit_packer.flush(&mut self.buffer_term_infos)?;
@@ -313,11 +325,11 @@ mod tests {
ref_term_info: TermInfo {
doc_freq: 512,
postings_range: 51..57,
positions_idx: 3584,
positions_range: 110..134,
},
doc_freq_nbits: 10,
postings_offset_nbits: 5,
positions_idx_nbits: 11,
positions_offset_nbits: 8,
};
let mut buffer: Vec<u8> = Vec::new();
term_info_block_meta.serialize(&mut buffer).unwrap();
@@ -335,7 +347,7 @@ mod tests {
let term_info = TermInfo {
doc_freq: i as u32,
postings_range: offset(i)..offset(i + 1),
positions_idx: (i * 7) as u64,
positions_range: offset(i) * 3..offset(i + 1) * 3,
};
store_writer.write_term_info(&term_info)?;
term_infos.push(term_info);

View File

@@ -1,6 +1,6 @@
use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};
use crate::directory::{Directory, FileSlice, RAMDirectory, TerminatingWrite};
use crate::directory::{Directory, FileSlice, RamDirectory, TerminatingWrite};
use crate::postings::TermInfo;
use std::path::PathBuf;
@@ -13,7 +13,7 @@ fn make_term_info(term_ord: u64) -> TermInfo {
TermInfo {
doc_freq: term_ord as u32,
postings_range: offset(term_ord)..offset(term_ord + 1),
positions_idx: offset(term_ord) as u64 * 2u64,
positions_range: offset(term_ord) * 2..offset(term_ord + 1) * 2,
}
}
@@ -34,7 +34,7 @@ fn test_term_ordinals() -> crate::Result<()> {
"Sweden",
"Switzerland",
];
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
let path = PathBuf::from("TermDictionary");
{
let write = directory.open_write(&path)?;
@@ -57,7 +57,7 @@ fn test_term_ordinals() -> crate::Result<()> {
#[test]
fn test_term_dictionary_simple() -> crate::Result<()> {
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
let path = PathBuf::from("TermDictionary");
{
let write = directory.open_write(&path)?;
@@ -380,7 +380,7 @@ fn test_stream_term_ord() -> crate::Result<()> {
let termdict = stream_range_test_dict()?;
let mut stream = termdict.stream()?;
for b in 0u8..10u8 {
assert!(stream.advance(), true);
assert!(stream.advance());
assert_eq!(stream.term_ord(), b as u64);
assert_eq!(stream.key(), &[b]);
}
@@ -390,7 +390,7 @@ fn test_stream_term_ord() -> crate::Result<()> {
#[test]
fn test_automaton_search() -> crate::Result<()> {
use crate::query::DFAWrapper;
use crate::query::DfaWrapper;
use levenshtein_automata::LevenshteinAutomatonBuilder;
const COUNTRIES: [&'static str; 7] = [
@@ -403,7 +403,7 @@ fn test_automaton_search() -> crate::Result<()> {
"Switzerland",
];
let directory = RAMDirectory::create();
let directory = RamDirectory::create();
let path = PathBuf::from("TermDictionary");
{
let write = directory.open_write(&path)?;
@@ -418,7 +418,7 @@ fn test_automaton_search() -> crate::Result<()> {
// We can now build an entire dfa.
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true);
let automaton = DFAWrapper(lev_automaton_builder.build_dfa("Spaen"));
let automaton = DfaWrapper(lev_automaton_builder.build_dfa("Spaen"));
let mut range = term_dict.search(automaton).into_stream()?;