Revert "Fix non english stemmers (#521 )"

This reverts commit 2cd31bcda2.
Fix non english stemmers (#521 )
2026-01-07 01:32:53 +00:00 · 2019-03-27 08:54:55 +09:00 · 2019-03-27 08:54:16 +09:00 · 2019-03-25 08:58:26 +09:00 · 2019-03-24 22:58:46 +09:00 · 2019-03-24 09:10:31 +09:00
17 changed files with 549 additions and 340 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -29,7 +29,7 @@ addons:
 matrix:
  include:
    # Android
-    - env: TARGET=aarch64-linux-android
+    - env: TARGET=aarch64-linux-android DISABLE_TESTS=1
    #- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
    #- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
    #- env: TARGET=i686-linux-android DISABLE_TESTS=1
@@ -77,4 +77,4 @@ before_cache:

 notifications:
  email:
-    on_success: never
+    on_success: never
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,35 @@ previous index format.*
  for int fields. (@fulmicoton)
 - Added DateTime field (@barrotsteindev)
 - Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
+- SIMD linear search within blocks (@fulmicoton)
+
+## How to update ?
+
+Tantivy 0.9 brought some API-breaking changes.
+To update from tantivy 0.8, you will need to go through the following steps.
+
+- `schema::INT_INDEXED` and `schema::INT_STORED` should be replaced by `schema::INDEXED` and `schema::STORED`.
+- The index no longer holds the pool of searchers. You are required to create an intermediary object called
+`IndexReader` for this.
+    
+    ```rust
+    // create the reader. You typically need to create 1 reader for the entire
+    // lifetime of your program.
+    let reader = index.reader()?;
+    
+    // Acquiring a searcher (previously `index.searcher()`) is now written:
+    let searcher = reader.searcher();
+    
+    // With the default setting of the reader, you are not required to 
+    // call `index.load_searchers()` anymore.
+    //
+    // The IndexReader will pick up that change automatically, regardless
+    // of whether the update was done in a different process or not.
+    // If this behavior is not wanted, you can create your reader with 
+    // the `ReloadPolicy::Manual`, and manually decide when to reload the index
+    // by calling `reader.reload()?`.
+  
+    ```


 Tantivy 0.8.2
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.9.0"
+version = "0.10.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@
 [![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/6)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
 [![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/7)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)

+[![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)


 **Tantivy** is a **full text search engine library** written in rust.
@@ -27,6 +28,14 @@ to build such a search engine.

 Tantivy is, in fact, strongly inspired by Lucene's design.

+# Benchmark
+
+Tantivy is typically faster than Lucene, but the results will depend on 
+the nature of the queries in your workload.
+
+The following [benchmark](https://tantivy-search.github.io/bench/) breaks down
+performance for different types of queries / collection.
+
 # Features

 - Full-text search
@@ -87,6 +96,14 @@ To check out and run tests, you can simply run :
 Some tests will not run with just `cargo test` because of `fail-rs`.
 To run the tests exhaustively, run `./run-tests.sh`. 

-# Contribute
+# How can I support this project ?

-Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
+There are many ways to support this project. 
+
+- If you use tantivy, tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
+- Report bugs
+- Write a blog post
+- Complete documentation
+- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
+- Talk about tantivy around you
+- Drop a word on [![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/fulmicoton) or even [![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
--- a/src/common/mod.rs
+++ b/src/common/mod.rs
@@ -13,7 +13,11 @@ pub use self::serialize::{BinarySerializable, FixedSize};
 pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
 pub use byteorder::LittleEndian as Endianness;

-use std::io;
+
+/// Segment's max doc must be `< MAX_DOC_LIMIT`.
+///
+/// We do not allow segments with more than `MAX_DOC_LIMIT - 1` docs, so that doc ids stay non-negative when cast to `i32`.
+pub const MAX_DOC_LIMIT: u32 = 1 << 31;

 /// Computes the number of bits that will be used for bitpacking.
 ///
@@ -52,11 +56,6 @@ pub(crate) fn is_power_of_2(n: usize) -> bool {
    (n > 0) && (n & (n - 1) == 0)
 }

-/// Create a default io error given a string.
-pub(crate) fn make_io_err(msg: String) -> io::Error {
-    io::Error::new(io::ErrorKind::Other, msg)
-}
-
 /// Has length trait
 pub trait HasLen {
    /// Return length
@@ -134,4 +133,11 @@ pub(crate) mod test {
        assert_eq!(compute_num_bits(256), 9u8);
        assert_eq!(compute_num_bits(5_000_000_000), 33u8);
    }
+
+    #[test]
+    fn test_max_doc() {
+        // this is the first time I write a unit test for a constant.
+        assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
+        assert!((super::MAX_DOC_LIMIT as i32) < 0);
+    }
 }
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -24,6 +24,7 @@ use schema::Schema;
 use serde_json;
 use std::borrow::BorrowMut;
 use std::fmt;
+#[cfg(feature = "mmap")]
 use std::path::Path;
 use std::sync::Arc;
 use tokenizer::BoxedTokenizer;
@@ -355,10 +356,8 @@ mod tests {
    use directory::RAMDirectory;
    use schema::Field;
    use schema::{Schema, INDEXED, TEXT};
-    use std::path::PathBuf;
    use std::thread;
    use std::time::Duration;
-    use tempdir::TempDir;
    use Index;
    use IndexReader;
    use IndexWriter;
@@ -444,61 +443,69 @@ mod tests {
        test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
    }

-    #[test]
-    fn test_index_on_commit_reload_policy_mmap() {
-        let schema = throw_away_schema();
-        let field = schema.get_field("num_likes").unwrap();
-        let tempdir = TempDir::new("index").unwrap();
-        let tempdir_path = PathBuf::from(tempdir.path());
-        let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
-        let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-        writer.commit().unwrap();
-        let reader = index
-            .reader_builder()
-            .reload_policy(ReloadPolicy::OnCommit)
-            .try_into()
-            .unwrap();
-        assert_eq!(reader.searcher().num_docs(), 0);
-        test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
-    }
+    #[cfg(feature = "mmap")]
+    mod mmap_specific {

-    #[test]
-    fn test_index_manual_policy_mmap() {
-        let schema = throw_away_schema();
-        let field = schema.get_field("num_likes").unwrap();
-        let index = Index::create_from_tempdir(schema).unwrap();
-        let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-        writer.commit().unwrap();
-        let reader = index
-            .reader_builder()
-            .reload_policy(ReloadPolicy::Manual)
-            .try_into()
-            .unwrap();
-        assert_eq!(reader.searcher().num_docs(), 0);
-        writer.add_document(doc!(field=>1u64));
-        writer.commit().unwrap();
-        thread::sleep(Duration::from_millis(500));
-        assert_eq!(reader.searcher().num_docs(), 0);
-        reader.reload().unwrap();
-        assert_eq!(reader.searcher().num_docs(), 1);
-    }
+        use super::*;
+        use std::path::PathBuf;
+        use tempdir::TempDir;

-    #[test]
-    fn test_index_on_commit_reload_policy_different_directories() {
-        let schema = throw_away_schema();
-        let field = schema.get_field("num_likes").unwrap();
-        let tempdir = TempDir::new("index").unwrap();
-        let tempdir_path = PathBuf::from(tempdir.path());
-        let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
-        let read_index = Index::open_in_dir(&tempdir_path).unwrap();
-        let reader = read_index
-            .reader_builder()
-            .reload_policy(ReloadPolicy::OnCommit)
-            .try_into()
-            .unwrap();
-        assert_eq!(reader.searcher().num_docs(), 0);
-        let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
-        test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
+        #[test]
+        fn test_index_on_commit_reload_policy_mmap() {
+            let schema = throw_away_schema();
+            let field = schema.get_field("num_likes").unwrap();
+            let tempdir = TempDir::new("index").unwrap();
+            let tempdir_path = PathBuf::from(tempdir.path());
+            let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
+            let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            writer.commit().unwrap();
+            let reader = index
+                .reader_builder()
+                .reload_policy(ReloadPolicy::OnCommit)
+                .try_into()
+                .unwrap();
+            assert_eq!(reader.searcher().num_docs(), 0);
+            test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
+        }
+
+        #[test]
+        fn test_index_manual_policy_mmap() {
+            let schema = throw_away_schema();
+            let field = schema.get_field("num_likes").unwrap();
+            let index = Index::create_from_tempdir(schema).unwrap();
+            let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            writer.commit().unwrap();
+            let reader = index
+                .reader_builder()
+                .reload_policy(ReloadPolicy::Manual)
+                .try_into()
+                .unwrap();
+            assert_eq!(reader.searcher().num_docs(), 0);
+            writer.add_document(doc!(field=>1u64));
+            writer.commit().unwrap();
+            thread::sleep(Duration::from_millis(500));
+            assert_eq!(reader.searcher().num_docs(), 0);
+            reader.reload().unwrap();
+            assert_eq!(reader.searcher().num_docs(), 1);
+        }
+
+        #[test]
+        fn test_index_on_commit_reload_policy_different_directories() {
+            let schema = throw_away_schema();
+            let field = schema.get_field("num_likes").unwrap();
+            let tempdir = TempDir::new("index").unwrap();
+            let tempdir_path = PathBuf::from(tempdir.path());
+            let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
+            let read_index = Index::open_in_dir(&tempdir_path).unwrap();
+            let reader = read_index
+                .reader_builder()
+                .reload_policy(ReloadPolicy::OnCommit)
+                .try_into()
+                .unwrap();
+            assert_eq!(reader.searcher().num_docs(), 0);
+            let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
+            test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
+        }
    }

    fn test_index_on_commit_reload_policy_aux(
--- a/src/directory/managed_directory.rs
+++ b/src/directory/managed_directory.rs
@@ -260,95 +260,98 @@ impl Clone for ManagedDirectory {
 #[cfg(test)]
 mod tests {

-    use super::*;
    #[cfg(feature = "mmap")]
-    use directory::MmapDirectory;
-    use std::io::Write;
-    use std::path::Path;
-    use tempdir::TempDir;
+    mod mmap_specific {

-    lazy_static! {
-        static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
-        static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
-    }
+        use super::super::*;
+        use std::path::Path;
+        use tempdir::TempDir;

-    #[test]
-    #[cfg(feature = "mmap")]
-    fn test_managed_directory() {
-        let tempdir = TempDir::new("index").unwrap();
-        let tempdir_path = PathBuf::from(tempdir.path());
-        {
-            let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-            let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
+        lazy_static! {
+            static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
+            static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
+        }
+
+        use directory::MmapDirectory;
+        use std::io::Write;
+
+        #[test]
+        fn test_managed_directory() {
+            let tempdir = TempDir::new("index").unwrap();
+            let tempdir_path = PathBuf::from(tempdir.path());
            {
-                let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
-                write_file.flush().unwrap();
+                let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
+                let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
+                {
+                    let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
+                    write_file.flush().unwrap();
+                }
+                {
+                    managed_directory
+                        .atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
+                        .unwrap();
+                }
+                {
+                    assert!(managed_directory.exists(*TEST_PATH1));
+                    assert!(managed_directory.exists(*TEST_PATH2));
+                }
+                {
+                    let living_files: HashSet<PathBuf> =
+                        [TEST_PATH1.to_owned()].into_iter().cloned().collect();
+                    managed_directory.garbage_collect(|| living_files);
+                }
+                {
+                    assert!(managed_directory.exists(*TEST_PATH1));
+                    assert!(!managed_directory.exists(*TEST_PATH2));
+                }
            }
            {
-                managed_directory
-                    .atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
-                    .unwrap();
-            }
-            {
-                assert!(managed_directory.exists(*TEST_PATH1));
-                assert!(managed_directory.exists(*TEST_PATH2));
-            }
-            {
-                let living_files: HashSet<PathBuf> =
-                    [TEST_PATH1.to_owned()].into_iter().cloned().collect();
-                managed_directory.garbage_collect(|| living_files);
-            }
-            {
-                assert!(managed_directory.exists(*TEST_PATH1));
-                assert!(!managed_directory.exists(*TEST_PATH2));
+                let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
+                let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
+                {
+                    assert!(managed_directory.exists(*TEST_PATH1));
+                    assert!(!managed_directory.exists(*TEST_PATH2));
+                }
+                {
+                    let living_files: HashSet<PathBuf> = HashSet::new();
+                    managed_directory.garbage_collect(|| living_files);
+                }
+                {
+                    assert!(!managed_directory.exists(*TEST_PATH1));
+                    assert!(!managed_directory.exists(*TEST_PATH2));
+                }
            }
        }
-        {
+
+        #[test]
+        fn test_managed_directory_gc_while_mmapped() {
+            let tempdir = TempDir::new("index").unwrap();
+            let tempdir_path = PathBuf::from(tempdir.path());
+            let living_files = HashSet::new();
+
            let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
            let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
-            {
-                assert!(managed_directory.exists(*TEST_PATH1));
-                assert!(!managed_directory.exists(*TEST_PATH2));
-            }
-            {
-                let living_files: HashSet<PathBuf> = HashSet::new();
-                managed_directory.garbage_collect(|| living_files);
-            }
-            {
-                assert!(!managed_directory.exists(*TEST_PATH1));
-                assert!(!managed_directory.exists(*TEST_PATH2));
-            }
-        }
-    }
-
-    #[test]
-    #[cfg(feature = "mmap ")]
-    fn test_managed_directory_gc_while_mmapped() {
-        let tempdir = TempDir::new("index").unwrap();
-        let tempdir_path = PathBuf::from(tempdir.path());
-        let living_files = HashSet::new();
-
-        let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-        let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
-        managed_directory
-            .atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
-            .unwrap();
-        assert!(managed_directory.exists(*TEST_PATH1));
-
-        let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
-        managed_directory.garbage_collect(|| living_files.clone());
-        if cfg!(target_os = "windows") {
-            // On Windows, gc should try and fail the file as it is mmapped.
+            managed_directory
+                .atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
+                .unwrap();
            assert!(managed_directory.exists(*TEST_PATH1));
-            // unmap should happen here.
-            drop(_mmap_read);
-            // The file should still be in the list of managed file and
-            // eventually be deleted once mmap is released.
-            managed_directory.garbage_collect(|| living_files);
-            assert!(!managed_directory.exists(*TEST_PATH1));
-        } else {
-            assert!(!managed_directory.exists(*TEST_PATH1));
+
+            let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
+            managed_directory.garbage_collect(|| living_files.clone());
+            if cfg!(target_os = "windows") {
+                // On Windows, gc should try and fail to delete the file as it is mmapped.
+                assert!(managed_directory.exists(*TEST_PATH1));
+                // unmap should happen here.
+                drop(_mmap_read);
+                // The file should still be in the list of managed files and
+                // eventually be deleted once the mmap is released.
+                managed_directory.garbage_collect(|| living_files);
+                assert!(!managed_directory.exists(*TEST_PATH1));
+            } else {
+                assert!(!managed_directory.exists(*TEST_PATH1));
+            }
        }
+
    }

 }
--- a/src/directory/mmap_directory.rs
+++ b/src/directory/mmap_directory.rs
@@ -6,7 +6,6 @@ use self::notify::RawEvent;
 use self::notify::RecursiveMode;
 use self::notify::Watcher;
 use atomicwrites;
-use common::make_io_err;
 use core::META_FILEPATH;
 use directory::error::LockError;
 use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
@@ -37,6 +36,11 @@ use std::sync::Weak;
 use std::thread;
 use tempdir::TempDir;

+/// Create a default io error given a string.
+pub(crate) fn make_io_err(msg: String) -> io::Error {
+    io::Error::new(io::ErrorKind::Other, msg)
+}
+
 /// Returns None iff the file exists, can be read, but is empty (and hence
 /// cannot be mmapped)
 fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
--- a/src/functional_test.rs
+++ b/src/functional_test.rs
@@ -13,7 +13,6 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {

 #[test]
 #[ignore]
-#[cfg(feature = "mmap")]
 fn test_indexing() {
    let mut schema_builder = Schema::builder();

--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -1,3 +1,4 @@
+use common::MAX_DOC_LIMIT;
 use core::Segment;
 use core::SegmentReader;
 use core::SerializableSegment;
@@ -23,6 +24,7 @@ use termdict::TermMerger;
 use termdict::TermOrdinal;
 use DocId;
 use Result;
+use TantivyError;

 fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
    let mut total_tokens = 0u64;
@@ -150,6 +152,14 @@ impl IndexMerger {
                readers.push(reader);
            }
        }
+        if max_doc >= MAX_DOC_LIMIT {
+            let err_msg = format!(
+                "The segment resulting from this merge would have {} docs,\
+                 which exceeds the limit {}.",
+                max_doc, MAX_DOC_LIMIT
+            );
+            return Err(TantivyError::InvalidArgument(err_msg));
+        }
        Ok(IndexMerger {
            schema,
            readers,
--- a/src/indexer/segment_updater.rs
+++ b/src/indexer/segment_updater.rs
@@ -420,6 +420,7 @@ impl SegmentUpdater {
            })
            .collect::<Vec<_>>();
        merge_candidates.extend(committed_merge_candidates.into_iter());
+
        for merge_operation in merge_candidates {
            match self.start_merge_impl(merge_operation) {
                Ok(merge_future) => {
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -174,6 +174,7 @@ extern crate downcast_rs;
 #[macro_use]
 extern crate fail;

+#[cfg(feature = "mmap")]
 #[cfg(test)]
 mod functional_test;

--- a/src/postings/block_search.rs
+++ b/src/postings/block_search.rs
@@ -0,0 +1,229 @@
+//! This module defines the logic used to search for a doc in a given
+//! block (at most 128 docs).
+//!
+//! Searching within a block is a hotspot when running intersections,
+//! so it was worth defining it in its own module.
+
+#[cfg(target_arch = "x86_64")]
+mod sse2 {
+    use postings::compression::COMPRESSION_BLOCK_SIZE;
+    use std::arch::x86_64::__m128i as DataType;
+    use std::arch::x86_64::_mm_add_epi32 as op_add;
+    use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
+    use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
+    use std::arch::x86_64::_mm_set1_epi32 as set1;
+    use std::arch::x86_64::_mm_setzero_si128 as set0;
+    use std::arch::x86_64::_mm_sub_epi32 as op_sub;
+    use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
+
+    const MASK1: i32 = 78;
+    const MASK2: i32 = 177;
+
+    /// Performs an exhaustive linear search over the whole block.
+    ///
+    /// There is no early exit here. We simply count the
+    /// number of elements that are `< target`.
+    pub fn linear_search_sse2_128(arr: &[u32], target: u32) -> usize {
+        unsafe {
+            let ptr = arr.as_ptr() as *const DataType;
+            let vkey = set1(target as i32);
+            let mut cnt = set0();
+            // We work over 4 `__m128i` at a time.
+            // A single `__m128i` actually contains 4 `u32`.
+            for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
+                let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
+                let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
+                let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
+                let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
+                let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
+                cnt = op_sub(cnt, sum);
+            }
+            cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
+            cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
+            _mm_cvtsi128_si32(cnt) as usize
+        }
+    }
+
+    #[cfg(test)]
+    mod test {
+        use super::linear_search_sse2_128;
+
+        #[test]
+        fn test_linear_search_sse2_128_u32() {
+            for i in 0..23 {
+                dbg!(i);
+                let arr: Vec<u32> = (0..128).map(|el| el * 2 + 1 << 18).collect();
+                assert_eq!(linear_search_sse2_128(&arr, arr[64] + 1), 65);
+            }
+        }
+    }
+}
+
+/// This `linear search` browses exhaustively through the array,
+/// as an early exit is very difficult to predict.
+///
+/// Coupled with `exponential search`, this function is likely
+/// to be called with the same `len`.
+fn linear_search(arr: &[u32], target: u32) -> usize {
+    arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
+}
+
+fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
+    let end = arr.len();
+    let mut begin = 0;
+    for &pivot in &[1, 3, 7, 15, 31, 63] {
+        if pivot >= end {
+            break;
+        }
+        if arr[pivot] > target {
+            return (begin, pivot);
+        }
+        begin = pivot;
+    }
+    (begin, end)
+}
+
+fn galloping(block_docs: &[u32], target: u32) -> usize {
+    let (start, end) = exponential_search(&block_docs, target);
+    start + linear_search(&block_docs[start..end], target)
+}
+
+/// Tantivy may rely on SIMD instructions to search for a specific document within
+/// a given block.
+#[derive(Clone, Copy, PartialEq)]
+pub enum BlockSearcher {
+    #[cfg(target_arch = "x86_64")]
+    SSE2,
+    Scalar,
+}
+
+impl BlockSearcher {
+    /// Search the first index containing an element greater or equal to
+    /// the target.
+    ///
+    /// The results should be equivalent to
+    /// ```ignore
+    /// block[..]
+    ///       .iter()
+    ///       .take_while(|&&val| val < target)
+    ///       .count()
+    /// ```
+    ///
+    /// The `start` argument is just used to hint that the response is
+    /// at or beyond `start`. The implementation may or may not use
+    /// it for optimization.
+    ///
+    /// # Assumption
+    ///
+    /// The array len is > start.
+    /// The block is sorted
+    /// The target is assumed greater or equal to the `arr[start]`.
+    /// The target is assumed smaller or equal to the last element of the block.
+    ///
+    /// Currently the scalar implementation starts by an exponential search, and
+    /// then operates a linear search in the result subarray.
+    ///
+    /// If SSE2 instructions are available in the `(platform, running CPU)`,
+    /// then we use a different implementation that does an exhaustive linear search over
+    /// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
+    /// of branches.
+    pub fn search_in_block(&self, block_docs: &[u32], start: usize, target: u32) -> usize {
+        #[cfg(target_arch = "x86_64")]
+        {
+            use postings::compression::COMPRESSION_BLOCK_SIZE;
+            if *self == BlockSearcher::SSE2 {
+                if block_docs.len() == COMPRESSION_BLOCK_SIZE {
+                    return sse2::linear_search_sse2_128(block_docs, target);
+                }
+            }
+        }
+        start + galloping(&block_docs[start..], target)
+    }
+}
+
+impl Default for BlockSearcher {
+    fn default() -> BlockSearcher {
+        #[cfg(target_arch = "x86_64")]
+        {
+            if is_x86_feature_detected!("sse2") {
+                return BlockSearcher::SSE2;
+            }
+        }
+        BlockSearcher::Scalar
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::exponential_search;
+    use super::linear_search;
+    use super::BlockSearcher;
+
+    #[test]
+    fn test_linear_search() {
+        let len: usize = 50;
+        let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
+        for target in 1..*arr.last().unwrap() {
+            let res = linear_search(&arr[..], target);
+            if res > 0 {
+                assert!(arr[res - 1] < target);
+            }
+            if res < len {
+                assert!(arr[res] >= target);
+            }
+        }
+    }
+
+    #[test]
+    fn test_exponentiel_search() {
+        assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
+        assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
+        assert_eq!(
+            exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
+            (3, 7)
+        );
+    }
+
+    fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
+        let cursor = search_in_block_trivial_but_slow(block, target);
+        for i in 0..cursor {
+            assert_eq!(block_searcher.search_in_block(block, i, target), cursor);
+        }
+    }
+
+    fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
+        use std::collections::HashSet;
+        let mut targets = HashSet::new();
+        for (i, val) in block.iter().cloned().enumerate() {
+            if i > 0 {
+                targets.insert(val - 1);
+            }
+            targets.insert(val);
+        }
+        for target in targets {
+            util_test_search_in_block(block_searcher, block, target);
+        }
+    }
+
+    fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
+        block.iter().take_while(|&&val| val < target).count()
+    }
+
+    fn test_search_in_block_util(block_searcher: BlockSearcher) {
+        for len in 1u32..128u32 {
+            let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
+            util_test_search_in_block_all(block_searcher, &v[..]);
+        }
+    }
+
+    #[test]
+    fn test_search_in_block_scalar() {
+        test_search_in_block_util(BlockSearcher::Scalar);
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    #[test]
+    fn test_search_in_block_sse2() {
+        test_search_in_block_util(BlockSearcher::SSE2);
+    }
+}
--- a/src/postings/compression/mod.rs
+++ b/src/postings/compression/mod.rs
@@ -43,9 +43,14 @@ impl BlockEncoder {
    }
 }

+/// We ensure that the OutputBuffer is aligned on at least 128 bits (16 bytes)
+/// in order to run SSE2 linear search on it. Note that `align(128)` is in bytes, hence stricter.
+#[repr(align(128))]
+struct OutputBuffer([u32; COMPRESSION_BLOCK_SIZE + 1]);
+
 pub struct BlockDecoder {
    bitpacker: BitPacker4x,
-    pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
+    output: OutputBuffer,
    pub output_len: usize,
 }

@@ -59,7 +64,7 @@ impl BlockDecoder {
        output[COMPRESSION_BLOCK_SIZE] = 0u32;
        BlockDecoder {
            bitpacker: BitPacker4x::new(),
-            output,
+            output: OutputBuffer(output),
            output_len: 0,
        }
    }
@@ -72,23 +77,23 @@ impl BlockDecoder {
    ) -> usize {
        self.output_len = COMPRESSION_BLOCK_SIZE;
        self.bitpacker
-            .decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
+            .decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
    }

    pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
        self.output_len = COMPRESSION_BLOCK_SIZE;
        self.bitpacker
-            .decompress(&compressed_data, &mut self.output, num_bits)
+            .decompress(&compressed_data, &mut self.output.0, num_bits)
    }

    #[inline]
    pub fn output_array(&self) -> &[u32] {
-        &self.output[..self.output_len]
+        &self.output.0[..self.output_len]
    }

    #[inline]
    pub fn output(&self, idx: usize) -> u32 {
-        self.output[idx]
+        self.output.0[idx]
    }
 }

@@ -159,12 +164,12 @@ impl VIntDecoder for BlockDecoder {
        num_els: usize,
    ) -> usize {
        self.output_len = num_els;
-        vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
+        vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
    }

    fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
        self.output_len = num_els;
-        vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
+        vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
    }
 }

--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -2,6 +2,7 @@
 Postings module (also called inverted index)
 */

+mod block_search;
 pub(crate) mod compression;
 /// Postings module
 ///
@@ -16,6 +17,8 @@ mod skip;
 mod stacker;
 mod term_info;

+pub(crate) use self::block_search::BlockSearcher;
+
 pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
 pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};

@@ -104,9 +107,7 @@ pub mod tests {
        let searcher = index.reader().unwrap().searcher();
        let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
        let term = Term::from_field_text(title, "abc");
-
        let mut positions = Vec::new();
-
        {
            let mut postings = inverted_index
                .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -7,6 +7,7 @@ use positions::PositionReader;
 use postings::compression::compressed_block_size;
 use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
 use postings::serializer::PostingsSerializer;
+use postings::BlockSearcher;
 use postings::FreqReadingOption;
 use postings::Postings;
 use postings::SkipReader;
@@ -60,6 +61,7 @@ pub struct SegmentPostings {
    block_cursor: BlockSegmentPostings,
    cur: usize,
    position_computer: Option<PositionComputer>,
+    block_searcher: BlockSearcher,
 }

 impl SegmentPostings {
@@ -70,6 +72,7 @@ impl SegmentPostings {
            block_cursor: empty_block_cursor,
            cur: COMPRESSION_BLOCK_SIZE,
            position_computer: None,
+            block_searcher: BlockSearcher::default(),
        }
    }

@@ -117,42 +120,31 @@ impl SegmentPostings {
            block_cursor: segment_block_postings,
            cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
            position_computer: positions_stream_opt.map(PositionComputer::new),
+            block_searcher: BlockSearcher::default(),
        }
    }
 }

-fn linear_search(arr: &[u32], target: u32) -> usize {
-    arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
-}
-
-fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
-    let end = arr.len();
-    let mut begin = 0;
-    for &pivot in &[1, 3, 7, 15, 31, 63] {
-        if pivot >= end {
-            break;
-        }
-        if arr[pivot] > target {
-            return (begin, pivot);
-        }
-        begin = pivot;
-    }
-    (begin, end)
-}
-
-/// Search the first index containing an element greater or equal to the target.
-///
-/// # Assumption
-///
-/// The array is assumed non empty.
-/// The target is assumed greater or equal to the first element.
-/// The target is assumed smaller or equal to the last element.
-fn search_within_block(block_docs: &[u32], target: u32) -> usize {
-    let (start, end) = exponential_search(block_docs, target);
-    start + linear_search(&block_docs[start..end], target)
-}
-
 impl DocSet for SegmentPostings {
+    // goes to the next element.
+    // next needs to be called a first time to point to the correct element.
+    #[inline]
+    fn advance(&mut self) -> bool {
+        if self.position_computer.is_some() {
+            let term_freq = self.term_freq() as usize;
+            self.position_computer.as_mut().unwrap().add_skip(term_freq);
+        }
+        self.cur += 1;
+        if self.cur >= self.block_cursor.block_len() {
+            self.cur = 0;
+            if !self.block_cursor.advance() {
+                self.cur = COMPRESSION_BLOCK_SIZE;
+                return false;
+            }
+        }
+        true
+    }
+
    fn skip_next(&mut self, target: DocId) -> SkipResult {
        if !self.advance() {
            return SkipResult::End;
@@ -211,9 +203,8 @@ impl DocSet for SegmentPostings {
        // we're in the right block now, start with an exponential search
        let block_docs = self.block_cursor.docs();
        let new_cur = self
-            .cur
-            .wrapping_add(search_within_block(&block_docs[self.cur..], target));
-
+            .block_searcher
+            .search_in_block(&block_docs, self.cur, target);
        if need_positions {
            sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
                .iter()
@@ -235,29 +226,6 @@ impl DocSet for SegmentPostings {
        }
    }

-    // goes to the next element.
-    // next needs to be called a first time to point to the correct element.
-    #[inline]
-    fn advance(&mut self) -> bool {
-        if self.position_computer.is_some() {
-            let term_freq = self.term_freq() as usize;
-            self.position_computer.as_mut().unwrap().add_skip(term_freq);
-        }
-        self.cur += 1;
-        if self.cur >= self.block_cursor.block_len() {
-            self.cur = 0;
-            if !self.block_cursor.advance() {
-                self.cur = COMPRESSION_BLOCK_SIZE;
-                return false;
-            }
-        }
-        true
-    }
-
-    fn size_hint(&self) -> u32 {
-        self.len() as u32
-    }
-
    /// Return the current document's `DocId`.
    #[inline]
    fn doc(&self) -> DocId {
@@ -269,6 +237,10 @@ impl DocSet for SegmentPostings {
        docs[self.cur]
    }

+    fn size_hint(&self) -> u32 {
+        self.len() as u32
+    }
+
    fn append_to_bitset(&mut self, bitset: &mut BitSet) {
        // finish the current block
        if self.advance() {
@@ -614,10 +586,6 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {

 #[cfg(test)]
 mod tests {
-
-    use super::exponential_search;
-    use super::linear_search;
-    use super::search_within_block;
    use super::BlockSegmentPostings;
    use super::BlockSegmentPostingsSkipResult;
    use super::SegmentPostings;
@@ -632,21 +600,6 @@ mod tests {
    use DocId;
    use SkipResult;

-    #[test]
-    fn test_linear_search() {
-        let len: usize = 50;
-        let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
-        for target in 1..*arr.last().unwrap() {
-            let res = linear_search(&arr[..], target);
-            if res > 0 {
-                assert!(arr[res - 1] < target);
-            }
-            if res < len {
-                assert!(arr[res] >= target);
-            }
-        }
-    }
-
    #[test]
    fn test_empty_segment_postings() {
        let mut postings = SegmentPostings::empty();
@@ -662,56 +615,6 @@ mod tests {
        assert_eq!(postings.doc_freq(), 0);
    }

-    fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
-        block
-            .iter()
-            .cloned()
-            .enumerate()
-            .filter(|&(_, ref val)| *val >= target)
-            .next()
-            .unwrap()
-            .0
-    }
-
-    #[test]
-    fn test_exponentiel_search() {
-        assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
-        assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
-        assert_eq!(
-            exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
-            (3, 7)
-        );
-    }
-
-    fn util_test_search_within_block(block: &[u32], target: u32) {
-        assert_eq!(
-            search_within_block(block, target),
-            search_within_block_trivial_but_slow(block, target)
-        );
-    }
-
-    fn util_test_search_within_block_all(block: &[u32]) {
-        use std::collections::HashSet;
-        let mut targets = HashSet::new();
-        for (i, val) in block.iter().cloned().enumerate() {
-            if i > 0 {
-                targets.insert(val - 1);
-            }
-            targets.insert(val);
-        }
-        for target in targets {
-            util_test_search_within_block(block, target);
-        }
-    }
-
-    #[test]
-    fn test_search_within_block() {
-        for len in 1u32..128u32 {
-            let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
-            util_test_search_within_block_all(&v[..]);
-        }
-    }
-
    #[test]
    fn test_block_segment_postings() {
        let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
--- a/src/query/intersection.rs
+++ b/src/query/intersection.rs
@@ -14,41 +14,35 @@ use Score;
 /// specialized implementation if the two
 /// shortest scorers are `TermScorer`s.
 pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
+    if scorers.is_empty() {
+        return Box::new(EmptyScorer);
+    }
+    if scorers.len() == 1 {
+        return scorers.pop().unwrap();
+    }
+    // We know that we have at least 2 elements.
    let num_docsets = scorers.len();
    scorers.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
-    let rarest_opt = scorers.pop();
-    let second_rarest_opt = scorers.pop();
+    let left = scorers.pop().unwrap();
+    let right = scorers.pop().unwrap();
    scorers.reverse();
-    match (rarest_opt, second_rarest_opt) {
-        (None, None) => Box::new(EmptyScorer),
-        (Some(single_docset), None) => single_docset,
-        (Some(left), Some(right)) => {
-            {
-                let all_term_scorers = [&left, &right]
-                    .iter()
-                    .all(|&scorer| scorer.is::<TermScorer>());
-                if all_term_scorers {
-                    let left = *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap());
-                    let right = *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap());
-                    return Box::new(Intersection {
-                        left,
-                        right,
-                        others: scorers,
-                        num_docsets,
-                    });
-                }
-            }
-            Box::new(Intersection {
-                left,
-                right,
-                others: scorers,
-                num_docsets,
-            })
-        }
-        _ => {
-            unreachable!();
-        }
+    let all_term_scorers = [&left, &right]
+        .iter()
+        .all(|&scorer| scorer.is::<TermScorer>());
+    if all_term_scorers {
+        return Box::new(Intersection {
+            left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
+            right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
+            others: scorers,
+            num_docsets,
+        });
    }
+    Box::new(Intersection {
+        left,
+        right,
+        others: scorers,
+        num_docsets,
+    })
 }

 /// Creates a `DocSet` that iterator through the intersection of two `DocSet`s.
@@ -124,7 +118,6 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
                        return false;
                    }
                }
-
                match left.skip_next(candidate) {
                    SkipResult::Reached => {
                        break;
@@ -140,35 +133,36 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
            }
            // test the remaining scorers;
            for (ord, docset) in self.others.iter_mut().enumerate() {
-                if ord != other_candidate_ord {
-                    // `candidate_ord` is already at the
-                    // right position.
-                    //
-                    // Calling `skip_next` would advance this docset
-                    // and miss it.
-                    match docset.skip_next(candidate) {
-                        SkipResult::Reached => {}
-                        SkipResult::OverStep => {
-                            // this is not in the intersection,
-                            // let's update our candidate.
-                            candidate = docset.doc();
-                            match left.skip_next(candidate) {
-                                SkipResult::Reached => {
-                                    other_candidate_ord = ord;
-                                }
-                                SkipResult::OverStep => {
-                                    candidate = left.doc();
-                                    other_candidate_ord = usize::max_value();
-                                }
-                                SkipResult::End => {
-                                    return false;
-                                }
+                if ord == other_candidate_ord {
+                    continue;
+                }
+                // `candidate_ord` is already at the
+                // right position.
+                //
+                // Calling `skip_next` would advance this docset
+                // and miss it.
+                match docset.skip_next(candidate) {
+                    SkipResult::Reached => {}
+                    SkipResult::OverStep => {
+                        // this is not in the intersection,
+                        // let's update our candidate.
+                        candidate = docset.doc();
+                        match left.skip_next(candidate) {
+                            SkipResult::Reached => {
+                                other_candidate_ord = ord;
+                            }
+                            SkipResult::OverStep => {
+                                candidate = left.doc();
+                                other_candidate_ord = usize::max_value();
+                            }
+                            SkipResult::End => {
+                                return false;
                            }
-                            continue 'outer;
-                        }
-                        SkipResult::End => {
-                            return false;
                        }
+                        continue 'outer;
+                    }
+                    SkipResult::End => {
+                        return false;
                    }
                }
            }
Author	SHA1	Message	Date
Paul Masurel	b3a201e665	Revert "Fix non english stemmers (#521 )" This reverts commit `2cd31bcda2`.	2019-03-27 08:54:55 +09:00
Panagiotis Ktistakis	2cd31bcda2	Fix non english stemmers (#521 )	2019-03-27 08:54:16 +09:00
Paul Masurel	99870de55c	0.10.0-dev	2019-03-25 08:58:26 +09:00
Paul Masurel	cad2d91845	Disabled tests for android	2019-03-24 22:58:46 +09:00
Paul Masurel	79f3cd6cf4	Added instructions to update	2019-03-24 09:10:31 +09:00
Paul Masurel	e3abb4481b	broken link	2019-03-22 09:58:28 +09:00
Paul Masurel	bfa61d2f2f	Added patreon button	2019-03-22 09:51:00 +09:00
Paul Masurel	6c0e621fdb	Added bench info in README	2019-03-21 09:35:04 +09:00
Paul Masurel	a8cc5208f1	Linear simd (#519 ) * linear simd search within block	2019-03-20 22:10:05 +09:00
Paul Masurel	83eb0d0cb7	Disabling tests on Android	2019-03-20 10:24:17 +09:00
Paul Masurel	ee6e273365	cleanup for nodefaultfeatures	2019-03-20 10:04:42 +09:00