Compare commits

...

11 Commits

Author SHA1 Message Date
Paul Masurel
b3a201e665 Revert "Fix non english stemmers (#521)"
This reverts commit 2cd31bcda2.
2019-03-27 08:54:55 +09:00
Panagiotis Ktistakis
2cd31bcda2 Fix non english stemmers (#521) 2019-03-27 08:54:16 +09:00
Paul Masurel
99870de55c 0.10.0-dev 2019-03-25 08:58:26 +09:00
Paul Masurel
cad2d91845 Disabled tests for android 2019-03-24 22:58:46 +09:00
Paul Masurel
79f3cd6cf4 Added instructions to update 2019-03-24 09:10:31 +09:00
Paul Masurel
e3abb4481b broken link 2019-03-22 09:58:28 +09:00
Paul Masurel
bfa61d2f2f Added patreon button 2019-03-22 09:51:00 +09:00
Paul Masurel
6c0e621fdb Added bench info in README 2019-03-21 09:35:04 +09:00
Paul Masurel
a8cc5208f1 Linear simd (#519)
* linear simd search within block
2019-03-20 22:10:05 +09:00
Paul Masurel
83eb0d0cb7 Disabling tests on Android 2019-03-20 10:24:17 +09:00
Paul Masurel
ee6e273365 cleanup for nodefaultfeatures 2019-03-20 10:04:42 +09:00
17 changed files with 549 additions and 340 deletions

View File

@@ -29,7 +29,7 @@ addons:
matrix:
include:
# Android
- env: TARGET=aarch64-linux-android
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
#- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
#- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
#- env: TARGET=i686-linux-android DISABLE_TESTS=1
@@ -77,4 +77,4 @@ before_cache:
notifications:
email:
on_success: never
on_success: never

View File

@@ -15,6 +15,35 @@ previous index format.*
for int fields. (@fulmicoton)
- Added DateTime field (@barrotsteindev)
- Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
- SIMD linear search within blocks (@fulmicoton)
## How to update ?
tantivy 0.9 brought some API breaking change.
To update from tantivy 0.8, you will need to go through the following steps.
- `schema::INT_INDEXED` and `schema::INT_STORED` should be replaced by `schema::INDEXED` and `schema::INT_STORED`.
- The index now does not hold the pool of searcher anymore. You are required to create an intermediary object called
`IndexReader` for this.
```rust
// create the reader. You typically need to create 1 reader for the entire
// lifetime of you program.
let reader = index.reader()?;
// Acquire a searcher (previously `index.searcher()`) is now written:
let searcher = reader.searcher();
// With the default setting of the reader, you are not required to
// call `index.load_searchers()` anymore.
//
// The IndexReader will pick up that change automatically, regardless
// of whether the update was done in a different process or not.
// If this behavior is not wanted, you can create your reader with
// the `ReloadPolicy::Manual`, and manually decide when to reload the index
// by calling `reader.reload()?`.
```
Tantivy 0.8.2

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.9.0"
version = "0.10.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

View File

@@ -17,6 +17,7 @@
[![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/6)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
[![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/7)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
[![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
**Tantivy** is a **full text search engine library** written in rust.
@@ -27,6 +28,14 @@ to build such a search engine.
Tantivy is, in fact, strongly inspired by Lucene's design.
# Benchmark
Tantivy is typically faster than Lucene, but the results will depend on
the nature of the queries in your workload.
The following [benchmark](https://tantivy-search.github.io/bench/) break downs
performance for different type of queries / collection.
# Features
- Full-text search
@@ -87,6 +96,14 @@ To check out and run tests, you can simply run :
Some tests will not run with just `cargo test` because of `fail-rs`.
To run the tests exhaustively, run `./run-tests.sh`.
# Contribute
# How can I support this project ?
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
There are many ways to support this project.
- If you use tantivy, tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
- Report bugs
- Write a blog post
- Complete documentation
- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
- Talk about tantivy around you
- Drop a word on on [![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/fulmicoton) or even [![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)

View File

@@ -13,7 +13,11 @@ pub use self::serialize::{BinarySerializable, FixedSize};
pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
pub use byteorder::LittleEndian as Endianness;
use std::io;
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
/// Computes the number of bits that will be used for bitpacking.
///
@@ -52,11 +56,6 @@ pub(crate) fn is_power_of_2(n: usize) -> bool {
(n > 0) && (n & (n - 1) == 0)
}
/// Create a default io error given a string.
pub(crate) fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}
/// Has length trait
pub trait HasLen {
/// Return length
@@ -134,4 +133,11 @@ pub(crate) mod test {
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
#[test]
fn test_max_doc() {
// this is the first time I write a unit test for a constant.
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
assert!((super::MAX_DOC_LIMIT as i32) < 0);
}
}

View File

@@ -24,6 +24,7 @@ use schema::Schema;
use serde_json;
use std::borrow::BorrowMut;
use std::fmt;
#[cfg(feature = "mmap")]
use std::path::Path;
use std::sync::Arc;
use tokenizer::BoxedTokenizer;
@@ -355,10 +356,8 @@ mod tests {
use directory::RAMDirectory;
use schema::Field;
use schema::{Schema, INDEXED, TEXT};
use std::path::PathBuf;
use std::thread;
use std::time::Duration;
use tempdir::TempDir;
use Index;
use IndexReader;
use IndexWriter;
@@ -444,61 +443,69 @@ mod tests {
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
#[test]
fn test_index_on_commit_reload_policy_mmap() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
writer.commit().unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
#[cfg(feature = "mmap")]
mod mmap_specific {
#[test]
fn test_index_manual_policy_mmap() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create_from_tempdir(schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
writer.commit().unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.commit().unwrap();
thread::sleep(Duration::from_millis(500));
assert_eq!(reader.searcher().num_docs(), 0);
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 1);
}
use super::*;
use std::path::PathBuf;
use tempdir::TempDir;
#[test]
fn test_index_on_commit_reload_policy_different_directories() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
let reader = read_index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
#[test]
fn test_index_on_commit_reload_policy_mmap() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
writer.commit().unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
#[test]
fn test_index_manual_policy_mmap() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create_from_tempdir(schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
writer.commit().unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.commit().unwrap();
thread::sleep(Duration::from_millis(500));
assert_eq!(reader.searcher().num_docs(), 0);
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 1);
}
#[test]
fn test_index_on_commit_reload_policy_different_directories() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
let reader = read_index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
}
fn test_index_on_commit_reload_policy_aux(

View File

@@ -260,95 +260,98 @@ impl Clone for ManagedDirectory {
#[cfg(test)]
mod tests {
use super::*;
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use std::io::Write;
use std::path::Path;
use tempdir::TempDir;
mod mmap_specific {
lazy_static! {
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
}
use super::super::*;
use std::path::Path;
use tempdir::TempDir;
#[test]
#[cfg(feature = "mmap")]
fn test_managed_directory() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
lazy_static! {
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
}
use directory::MmapDirectory;
use std::io::Write;
#[test]
fn test_managed_directory() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();
}
{
managed_directory
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
.unwrap();
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> =
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
{
managed_directory
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
.unwrap();
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> =
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
}
{
#[test]
fn test_managed_directory_gc_while_mmapped() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
}
#[test]
#[cfg(feature = "mmap ")]
fn test_managed_directory_gc_while_mmapped() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(|| living_files.clone());
if cfg!(target_os = "windows") {
// On Windows, gc should try and fail the file as it is mmapped.
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
// unmap should happen here.
drop(_mmap_read);
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
managed_directory.garbage_collect(|| living_files);
assert!(!managed_directory.exists(*TEST_PATH1));
} else {
assert!(!managed_directory.exists(*TEST_PATH1));
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(|| living_files.clone());
if cfg!(target_os = "windows") {
// On Windows, gc should try and fail the file as it is mmapped.
assert!(managed_directory.exists(*TEST_PATH1));
// unmap should happen here.
drop(_mmap_read);
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
managed_directory.garbage_collect(|| living_files);
assert!(!managed_directory.exists(*TEST_PATH1));
} else {
assert!(!managed_directory.exists(*TEST_PATH1));
}
}
}
}

View File

@@ -6,7 +6,6 @@ use self::notify::RawEvent;
use self::notify::RecursiveMode;
use self::notify::Watcher;
use atomicwrites;
use common::make_io_err;
use core::META_FILEPATH;
use directory::error::LockError;
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
@@ -37,6 +36,11 @@ use std::sync::Weak;
use std::thread;
use tempdir::TempDir;
/// Create a default io error given a string.
pub(crate) fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}
/// Returns None iff the file exists, can be read, but is empty (and hence
/// cannot be mmapped)
fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {

View File

@@ -13,7 +13,6 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
#[test]
#[ignore]
#[cfg(feature = "mmap")]
fn test_indexing() {
let mut schema_builder = Schema::builder();

View File

@@ -1,3 +1,4 @@
use common::MAX_DOC_LIMIT;
use core::Segment;
use core::SegmentReader;
use core::SerializableSegment;
@@ -23,6 +24,7 @@ use termdict::TermMerger;
use termdict::TermOrdinal;
use DocId;
use Result;
use TantivyError;
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
let mut total_tokens = 0u64;
@@ -150,6 +152,14 @@ impl IndexMerger {
readers.push(reader);
}
}
if max_doc >= MAX_DOC_LIMIT {
let err_msg = format!(
"The segment resulting from this merge would have {} docs,\
which exceeds the limit {}.",
max_doc, MAX_DOC_LIMIT
);
return Err(TantivyError::InvalidArgument(err_msg));
}
Ok(IndexMerger {
schema,
readers,

View File

@@ -420,6 +420,7 @@ impl SegmentUpdater {
})
.collect::<Vec<_>>();
merge_candidates.extend(committed_merge_candidates.into_iter());
for merge_operation in merge_candidates {
match self.start_merge_impl(merge_operation) {
Ok(merge_future) => {

View File

@@ -174,6 +174,7 @@ extern crate downcast_rs;
#[macro_use]
extern crate fail;
#[cfg(feature = "mmap")]
#[cfg(test)]
mod functional_test;

View File

@@ -0,0 +1,229 @@
/// This modules define the logic used to search for a doc in a given
/// block. (at most 128 docs)
///
/// Searching within a block is a hotspot when running intersection.
/// so it was worth defining it in its own module.
#[cfg(target_arch = "x86_64")]
mod sse2 {
use postings::compression::COMPRESSION_BLOCK_SIZE;
use std::arch::x86_64::__m128i as DataType;
use std::arch::x86_64::_mm_add_epi32 as op_add;
use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
use std::arch::x86_64::_mm_set1_epi32 as set1;
use std::arch::x86_64::_mm_setzero_si128 as set0;
use std::arch::x86_64::_mm_sub_epi32 as op_sub;
use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
const MASK1: i32 = 78;
const MASK2: i32 = 177;
/// Performs an exhaustive linear search over the
///
/// There is no early exit here. We simply count the
/// number of elements that are `< target`.
pub fn linear_search_sse2_128(arr: &[u32], target: u32) -> usize {
unsafe {
let ptr = arr.as_ptr() as *const DataType;
let vkey = set1(target as i32);
let mut cnt = set0();
// We work over 4 `__m128i` at a time.
// A single `__m128i` actual contains 4 `u32`.
for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
cnt = op_sub(cnt, sum);
}
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
_mm_cvtsi128_si32(cnt) as usize
}
}
#[cfg(test)]
mod test {
use super::linear_search_sse2_128;
#[test]
fn test_linear_search_sse2_128_u32() {
for i in 0..23 {
dbg!(i);
let arr: Vec<u32> = (0..128).map(|el| el * 2 + 1 << 18).collect();
assert_eq!(linear_search_sse2_128(&arr, arr[64] + 1), 65);
}
}
}
}
/// This `linear search` browser exhaustively through the array.
/// but the early exit is very difficult to predict.
///
/// Coupled with `exponential search` this function is likely
/// to be called with the same `len`
fn linear_search(arr: &[u32], target: u32) -> usize {
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
let end = arr.len();
let mut begin = 0;
for &pivot in &[1, 3, 7, 15, 31, 63] {
if pivot >= end {
break;
}
if arr[pivot] > target {
return (begin, pivot);
}
begin = pivot;
}
(begin, end)
}
fn galloping(block_docs: &[u32], target: u32) -> usize {
let (start, end) = exponential_search(&block_docs, target);
start + linear_search(&block_docs[start..end], target)
}
/// Tantivy may rely on SIMD instructions to search for a specific document within
/// a given block.
#[derive(Clone, Copy, PartialEq)]
pub enum BlockSearcher {
#[cfg(target_arch = "x86_64")]
SSE2,
Scalar,
}
impl BlockSearcher {
/// Search the first index containing an element greater or equal to
/// the target.
///
/// The results should be equivalent to
/// ```ignore
/// block[..]
// .iter()
// .take_while(|&&val| val < target)
// .count()
/// ```
///
/// The `start` argument is just used to hint that the response is
/// greater than beyond `start`. The implementation may or may not use
/// it for optimization.
///
/// # Assumption
///
/// The array len is > start.
/// The block is sorted
/// The target is assumed greater or equal to the `arr[start]`.
/// The target is assumed smaller or equal to the last element of the block.
///
/// Currently the scalar implementation starts by an exponential search, and
/// then operates a linear search in the result subarray.
///
/// If SSE2 instructions are available in the `(platform, running CPU)`,
/// then we use a different implementation that does an exhaustive linear search over
/// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
/// of branch.
pub fn search_in_block(&self, block_docs: &[u32], start: usize, target: u32) -> usize {
#[cfg(target_arch = "x86_64")]
{
use postings::compression::COMPRESSION_BLOCK_SIZE;
if *self == BlockSearcher::SSE2 {
if block_docs.len() == COMPRESSION_BLOCK_SIZE {
return sse2::linear_search_sse2_128(block_docs, target);
}
}
}
start + galloping(&block_docs[start..], target)
}
}
impl Default for BlockSearcher {
fn default() -> BlockSearcher {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("sse2") {
return BlockSearcher::SSE2;
}
}
BlockSearcher::Scalar
}
}
#[cfg(test)]
mod tests {
use super::exponential_search;
use super::linear_search;
use super::BlockSearcher;
#[test]
fn test_linear_search() {
let len: usize = 50;
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
for target in 1..*arr.last().unwrap() {
let res = linear_search(&arr[..], target);
if res > 0 {
assert!(arr[res - 1] < target);
}
if res < len {
assert!(arr[res] >= target);
}
}
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
assert_eq!(
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
(3, 7)
);
}
fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
let cursor = search_in_block_trivial_but_slow(block, target);
for i in 0..cursor {
assert_eq!(block_searcher.search_in_block(block, i, target), cursor);
}
}
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
use std::collections::HashSet;
let mut targets = HashSet::new();
for (i, val) in block.iter().cloned().enumerate() {
if i > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_in_block(block_searcher, block, target);
}
}
fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
block.iter().take_while(|&&val| val < target).count()
}
fn test_search_in_block_util(block_searcher: BlockSearcher) {
for len in 1u32..128u32 {
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
util_test_search_in_block_all(block_searcher, &v[..]);
}
}
#[test]
fn test_search_in_block_scalar() {
test_search_in_block_util(BlockSearcher::Scalar);
}
#[cfg(target_arch = "x86_64")]
#[test]
fn test_search_in_block_sse2() {
test_search_in_block_util(BlockSearcher::SSE2);
}
}

View File

@@ -43,9 +43,14 @@ impl BlockEncoder {
}
}
/// We ensure that the OutputBuffer is align on 128 bits
/// in order to run SSE2 linear search on it.
#[repr(align(128))]
struct OutputBuffer([u32; COMPRESSION_BLOCK_SIZE + 1]);
pub struct BlockDecoder {
bitpacker: BitPacker4x,
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
output: OutputBuffer,
pub output_len: usize,
}
@@ -59,7 +64,7 @@ impl BlockDecoder {
output[COMPRESSION_BLOCK_SIZE] = 0u32;
BlockDecoder {
bitpacker: BitPacker4x::new(),
output,
output: OutputBuffer(output),
output_len: 0,
}
}
@@ -72,23 +77,23 @@ impl BlockDecoder {
) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
.decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
}
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress(&compressed_data, &mut self.output, num_bits)
.decompress(&compressed_data, &mut self.output.0, num_bits)
}
#[inline]
pub fn output_array(&self) -> &[u32] {
&self.output[..self.output_len]
&self.output.0[..self.output_len]
}
#[inline]
pub fn output(&self, idx: usize) -> u32 {
self.output[idx]
self.output.0[idx]
}
}
@@ -159,12 +164,12 @@ impl VIntDecoder for BlockDecoder {
num_els: usize,
) -> usize {
self.output_len = num_els;
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
}
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
self.output_len = num_els;
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
}
}

View File

@@ -2,6 +2,7 @@
Postings module (also called inverted index)
*/
mod block_search;
pub(crate) mod compression;
/// Postings module
///
@@ -16,6 +17,8 @@ mod skip;
mod stacker;
mod term_info;
pub(crate) use self::block_search::BlockSearcher;
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
@@ -104,9 +107,7 @@ pub mod tests {
let searcher = index.reader().unwrap().searcher();
let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
let term = Term::from_field_text(title, "abc");
let mut positions = Vec::new();
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)

View File

@@ -7,6 +7,7 @@ use positions::PositionReader;
use postings::compression::compressed_block_size;
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use postings::serializer::PostingsSerializer;
use postings::BlockSearcher;
use postings::FreqReadingOption;
use postings::Postings;
use postings::SkipReader;
@@ -60,6 +61,7 @@ pub struct SegmentPostings {
block_cursor: BlockSegmentPostings,
cur: usize,
position_computer: Option<PositionComputer>,
block_searcher: BlockSearcher,
}
impl SegmentPostings {
@@ -70,6 +72,7 @@ impl SegmentPostings {
block_cursor: empty_block_cursor,
cur: COMPRESSION_BLOCK_SIZE,
position_computer: None,
block_searcher: BlockSearcher::default(),
}
}
@@ -117,42 +120,31 @@ impl SegmentPostings {
block_cursor: segment_block_postings,
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
position_computer: positions_stream_opt.map(PositionComputer::new),
block_searcher: BlockSearcher::default(),
}
}
}
fn linear_search(arr: &[u32], target: u32) -> usize {
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
let end = arr.len();
let mut begin = 0;
for &pivot in &[1, 3, 7, 15, 31, 63] {
if pivot >= end {
break;
}
if arr[pivot] > target {
return (begin, pivot);
}
begin = pivot;
}
(begin, end)
}
/// Search the first index containing an element greater or equal to the target.
///
/// # Assumption
///
/// The array is assumed non empty.
/// The target is assumed greater or equal to the first element.
/// The target is assumed smaller or equal to the last element.
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
let (start, end) = exponential_search(block_docs, target);
start + linear_search(&block_docs[start..end], target)
}
impl DocSet for SegmentPostings {
// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> bool {
if self.position_computer.is_some() {
let term_freq = self.term_freq() as usize;
self.position_computer.as_mut().unwrap().add_skip(term_freq);
}
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
}
}
true
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
if !self.advance() {
return SkipResult::End;
@@ -211,9 +203,8 @@ impl DocSet for SegmentPostings {
// we're in the right block now, start with an exponential search
let block_docs = self.block_cursor.docs();
let new_cur = self
.cur
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
.block_searcher
.search_in_block(&block_docs, self.cur, target);
if need_positions {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
.iter()
@@ -235,29 +226,6 @@ impl DocSet for SegmentPostings {
}
}
// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> bool {
if self.position_computer.is_some() {
let term_freq = self.term_freq() as usize;
self.position_computer.as_mut().unwrap().add_skip(term_freq);
}
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
}
}
true
}
fn size_hint(&self) -> u32 {
self.len() as u32
}
/// Return the current document's `DocId`.
#[inline]
fn doc(&self) -> DocId {
@@ -269,6 +237,10 @@ impl DocSet for SegmentPostings {
docs[self.cur]
}
fn size_hint(&self) -> u32 {
self.len() as u32
}
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
// finish the current block
if self.advance() {
@@ -614,10 +586,6 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
#[cfg(test)]
mod tests {
use super::exponential_search;
use super::linear_search;
use super::search_within_block;
use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult;
use super::SegmentPostings;
@@ -632,21 +600,6 @@ mod tests {
use DocId;
use SkipResult;
#[test]
fn test_linear_search() {
let len: usize = 50;
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
for target in 1..*arr.last().unwrap() {
let res = linear_search(&arr[..], target);
if res > 0 {
assert!(arr[res - 1] < target);
}
if res < len {
assert!(arr[res] >= target);
}
}
}
#[test]
fn test_empty_segment_postings() {
let mut postings = SegmentPostings::empty();
@@ -662,56 +615,6 @@ mod tests {
assert_eq!(postings.doc_freq(), 0);
}
fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
block
.iter()
.cloned()
.enumerate()
.filter(|&(_, ref val)| *val >= target)
.next()
.unwrap()
.0
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
assert_eq!(
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
(3, 7)
);
}
fn util_test_search_within_block(block: &[u32], target: u32) {
assert_eq!(
search_within_block(block, target),
search_within_block_trivial_but_slow(block, target)
);
}
fn util_test_search_within_block_all(block: &[u32]) {
use std::collections::HashSet;
let mut targets = HashSet::new();
for (i, val) in block.iter().cloned().enumerate() {
if i > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_within_block(block, target);
}
}
#[test]
fn test_search_within_block() {
for len in 1u32..128u32 {
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
util_test_search_within_block_all(&v[..]);
}
}
#[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());

View File

@@ -14,41 +14,35 @@ use Score;
/// specialized implementation if the two
/// shortest scorers are `TermScorer`s.
pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
if scorers.is_empty() {
return Box::new(EmptyScorer);
}
if scorers.len() == 1 {
return scorers.pop().unwrap();
}
// We know that we have at least 2 elements.
let num_docsets = scorers.len();
scorers.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
let rarest_opt = scorers.pop();
let second_rarest_opt = scorers.pop();
let left = scorers.pop().unwrap();
let right = scorers.pop().unwrap();
scorers.reverse();
match (rarest_opt, second_rarest_opt) {
(None, None) => Box::new(EmptyScorer),
(Some(single_docset), None) => single_docset,
(Some(left), Some(right)) => {
{
let all_term_scorers = [&left, &right]
.iter()
.all(|&scorer| scorer.is::<TermScorer>());
if all_term_scorers {
let left = *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap());
let right = *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap());
return Box::new(Intersection {
left,
right,
others: scorers,
num_docsets,
});
}
}
Box::new(Intersection {
left,
right,
others: scorers,
num_docsets,
})
}
_ => {
unreachable!();
}
let all_term_scorers = [&left, &right]
.iter()
.all(|&scorer| scorer.is::<TermScorer>());
if all_term_scorers {
return Box::new(Intersection {
left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
others: scorers,
num_docsets,
});
}
Box::new(Intersection {
left,
right,
others: scorers,
num_docsets,
})
}
/// Creates a `DocSet` that iterator through the intersection of two `DocSet`s.
@@ -124,7 +118,6 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
return false;
}
}
match left.skip_next(candidate) {
SkipResult::Reached => {
break;
@@ -140,35 +133,36 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
}
// test the remaining scorers;
for (ord, docset) in self.others.iter_mut().enumerate() {
if ord != other_candidate_ord {
// `candidate_ord` is already at the
// right position.
//
// Calling `skip_next` would advance this docset
// and miss it.
match docset.skip_next(candidate) {
SkipResult::Reached => {}
SkipResult::OverStep => {
// this is not in the intersection,
// let's update our candidate.
candidate = docset.doc();
match left.skip_next(candidate) {
SkipResult::Reached => {
other_candidate_ord = ord;
}
SkipResult::OverStep => {
candidate = left.doc();
other_candidate_ord = usize::max_value();
}
SkipResult::End => {
return false;
}
if ord == other_candidate_ord {
continue;
}
// `candidate_ord` is already at the
// right position.
//
// Calling `skip_next` would advance this docset
// and miss it.
match docset.skip_next(candidate) {
SkipResult::Reached => {}
SkipResult::OverStep => {
// this is not in the intersection,
// let's update our candidate.
candidate = docset.doc();
match left.skip_next(candidate) {
SkipResult::Reached => {
other_candidate_ord = ord;
}
SkipResult::OverStep => {
candidate = left.doc();
other_candidate_ord = usize::max_value();
}
SkipResult::End => {
return false;
}
continue 'outer;
}
SkipResult::End => {
return false;
}
continue 'outer;
}
SkipResult::End => {
return false;
}
}
}