mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 10:02:55 +00:00

Compare commits: issue/526b ... 0.9.1 (2 commits)
Commits in this comparison: 2b28e491c2, 1d4fa4547c
@@ -29,7 +29,7 @@ addons:
 matrix:
 include:
 # Android
-- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
+- env: TARGET=aarch64-linux-android
 #- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
 #- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
 #- env: TARGET=i686-linux-android DISABLE_TESTS=1

@@ -68,11 +68,6 @@ cache: cargo
 before_cache:
 # Travis can't cache files that are not readable by "others"
 - chmod -R a+r $HOME/.cargo
-- find ./target/debug -type f -maxdepth 1 -delete
-- rm -f ./target/.rustc_info.json
-- rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
-- rm -r target/debug/examples/
-- ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete

 #branches:
 # only:

@@ -82,4 +77,4 @@ before_cache:

 notifications:
 email:
 on_success: never
CHANGELOG.md (42 changed lines)
@@ -1,14 +1,7 @@
-Tantivy 0.10.0
+Tantivy 0.9.1
-====================
+=====================


-Minor
----------
-- Small simplification of the code.
-Calling .freq() or .doc() when .advance() has never
-on segment postings should panic from now on.
-- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.

+Hotfix: The english stemmer was actually used for all languages.

 Tantivy 0.9.0
 =====================

@@ -27,35 +20,6 @@ previous index format.*
 for int fields. (@fulmicoton)
 - Added DateTime field (@barrotsteindev)
 - Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
-- SIMD linear search within blocks (@fulmicoton)
-
-## How to update ?
-
-tantivy 0.9 brought some API breaking change.
-To update from tantivy 0.8, you will need to go through the following steps.
-
-- `schema::INT_INDEXED` and `schema::INT_STORED` should be replaced by `schema::INDEXED` and `schema::INT_STORED`.
-- The index now does not hold the pool of searcher anymore. You are required to create an intermediary object called
-`IndexReader` for this.
-
-```rust
-// create the reader. You typically need to create 1 reader for the entire
-// lifetime of you program.
-let reader = index.reader()?;
-
-// Acquire a searcher (previously `index.searcher()`) is now written:
-let searcher = reader.searcher();
-
-// With the default setting of the reader, you are not required to
-// call `index.load_searchers()` anymore.
-//
-// The IndexReader will pick up that change automatically, regardless
-// of whether the update was done in a different process or not.
-// If this behavior is not wanted, you can create your reader with
-// the `ReloadPolicy::Manual`, and manually decide when to reload the index
-// by calling `reader.reload()?`.
-
-```

 Tantivy 0.8.2
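The upgrade notes removed above describe the `IndexReader` workflow introduced in tantivy 0.9. Below is a minimal consolidated sketch of that workflow, condensed from the removed notes and the reader tests further down in this comparison; it assumes `Index`, `ReloadPolicy` and the `Result` alias are re-exported at the crate root.

```rust
use tantivy::{Index, ReloadPolicy};

fn open_reader(index: &Index) -> tantivy::Result<()> {
    // One reader for the whole lifetime of the program; this replaces
    // index.searcher() and index.load_searchers() from tantivy 0.8.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::Manual) // the default is ReloadPolicy::OnCommit
        .try_into()?;

    // With ReloadPolicy::Manual, new commits are picked up explicitly.
    reader.reload()?;
    let searcher = reader.searcher();
    let _num_docs = searcher.num_docs();
    Ok(())
}
```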
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.10.0-dev"
+version = "0.9.1"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]

@@ -23,7 +23,7 @@ snap = {version="0.2"}
 atomicwrites = {version="0.2.2", optional=true}
 tempfile = "3.0"
 log = "0.4"
-combine = ">=3.6.0,<4.0.0"
+combine = "3"
 tempdir = "0.3"
 serde = "1.0"
 serde_derive = "1.0"
README.md (21 changed lines)

@@ -17,7 +17,6 @@
 [](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
 [](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)

-[](https://www.patreon.com/fulmicoton)


 **Tantivy** is a **full text search engine library** written in rust.

@@ -28,14 +27,6 @@ to build such a search engine.

 Tantivy is, in fact, strongly inspired by Lucene's design.

-# Benchmark
-
-Tantivy is typically faster than Lucene, but the results will depend on
-the nature of the queries in your workload.
-
-The following [benchmark](https://tantivy-search.github.io/bench/) break downs
-performance for different type of queries / collection.
-
 # Features

 - Full-text search

@@ -96,14 +87,6 @@ To check out and run tests, you can simply run :
 Some tests will not run with just `cargo test` because of `fail-rs`.
 To run the tests exhaustively, run `./run-tests.sh`.

-# How can I support this project ?
+# Contribute

-There are many ways to support this project.
+Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.

-- If you use tantivy, tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
-- Report bugs
-- Write a blog post
-- Complete documentation
-- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
-- Talk about tantivy around you
-- Drop a word on on [](https://saythanks.io/to/fulmicoton) or even [](https://www.patreon.com/fulmicoton)
@@ -13,10 +13,7 @@ pub use self::serialize::{BinarySerializable, FixedSize};
 pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
 pub use byteorder::LittleEndian as Endianness;

-/// Segment's max doc must be `< MAX_DOC_LIMIT`.
+use std::io;
-///
-/// We do not allow segments with more than
-pub const MAX_DOC_LIMIT: u32 = 1 << 31;

 /// Computes the number of bits that will be used for bitpacking.
 ///

@@ -55,6 +52,11 @@ pub(crate) fn is_power_of_2(n: usize) -> bool {
 (n > 0) && (n & (n - 1) == 0)
 }

+/// Create a default io error given a string.
+pub(crate) fn make_io_err(msg: String) -> io::Error {
+io::Error::new(io::ErrorKind::Other, msg)
+}

 /// Has length trait
 pub trait HasLen {
 /// Return length

@@ -132,11 +134,4 @@ pub(crate) mod test {
 assert_eq!(compute_num_bits(256), 9u8);
 assert_eq!(compute_num_bits(5_000_000_000), 33u8);
 }
-
-#[test]
-fn test_max_doc() {
-// this is the first time I write a unit test for a constant.
-assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
-assert!((super::MAX_DOC_LIMIT as i32) < 0);
-}
 }
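The `MAX_DOC_LIMIT` constant and the `test_max_doc` test removed above rest on a small two's-complement fact. A standalone sketch in plain Rust, mirroring the removed assertions, of why the limit sits exactly at `1 << 31`:

```rust
// MAX_DOC_LIMIT = 1 << 31 = 2_147_483_648 is the first u32 whose sign bit is
// set when reinterpreted as an i32, so every doc id below the limit still
// fits in a non-negative i32.
const MAX_DOC_LIMIT: u32 = 1 << 31;

fn main() {
    assert!(((MAX_DOC_LIMIT - 1) as i32) >= 0); // 0x7FFF_FFFF -> i32::MAX
    assert!((MAX_DOC_LIMIT as i32) < 0);        // 0x8000_0000 -> i32::MIN
}
```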
@@ -24,7 +24,6 @@ use schema::Schema;
 use serde_json;
 use std::borrow::BorrowMut;
 use std::fmt;
-#[cfg(feature = "mmap")]
 use std::path::Path;
 use std::sync::Arc;
 use tokenizer::BoxedTokenizer;

@@ -356,8 +355,10 @@ mod tests {
 use directory::RAMDirectory;
 use schema::Field;
 use schema::{Schema, INDEXED, TEXT};
+use std::path::PathBuf;
 use std::thread;
 use std::time::Duration;
+use tempdir::TempDir;
 use Index;
 use IndexReader;
 use IndexWriter;

@@ -443,69 +444,61 @@ mod tests {
 test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
 }

-#[cfg(feature = "mmap")]
+#[test]
-mod mmap_specific {
+fn test_index_on_commit_reload_policy_mmap() {
+let schema = throw_away_schema();
+let field = schema.get_field("num_likes").unwrap();
+let tempdir = TempDir::new("index").unwrap();
+let tempdir_path = PathBuf::from(tempdir.path());
+let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
+let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+writer.commit().unwrap();
+let reader = index
+.reader_builder()
+.reload_policy(ReloadPolicy::OnCommit)
+.try_into()
+.unwrap();
+assert_eq!(reader.searcher().num_docs(), 0);
+test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
+}

-use super::*;
+#[test]
-use std::path::PathBuf;
+fn test_index_manual_policy_mmap() {
-use tempdir::TempDir;
+let schema = throw_away_schema();
+let field = schema.get_field("num_likes").unwrap();
+let index = Index::create_from_tempdir(schema).unwrap();
+let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+writer.commit().unwrap();
+let reader = index
+.reader_builder()
+.reload_policy(ReloadPolicy::Manual)
+.try_into()
+.unwrap();
+assert_eq!(reader.searcher().num_docs(), 0);
+writer.add_document(doc!(field=>1u64));
+writer.commit().unwrap();
+thread::sleep(Duration::from_millis(500));
+assert_eq!(reader.searcher().num_docs(), 0);
+reader.reload().unwrap();
+assert_eq!(reader.searcher().num_docs(), 1);
+}

 #[test]
-fn test_index_on_commit_reload_policy_mmap() {
+fn test_index_on_commit_reload_policy_different_directories() {
 let schema = throw_away_schema();
 let field = schema.get_field("num_likes").unwrap();
 let tempdir = TempDir::new("index").unwrap();
 let tempdir_path = PathBuf::from(tempdir.path());
-let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
+let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
-let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+let read_index = Index::open_in_dir(&tempdir_path).unwrap();
-writer.commit().unwrap();
+let reader = read_index
-let reader = index
+.reader_builder()
-.reader_builder()
+.reload_policy(ReloadPolicy::OnCommit)
-.reload_policy(ReloadPolicy::OnCommit)
+.try_into()
-.try_into()
+.unwrap();
-.unwrap();
+assert_eq!(reader.searcher().num_docs(), 0);
-assert_eq!(reader.searcher().num_docs(), 0);
+let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
 test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
-}
-
-#[test]
-fn test_index_manual_policy_mmap() {
-let schema = throw_away_schema();
-let field = schema.get_field("num_likes").unwrap();
-let index = Index::create_from_tempdir(schema).unwrap();
-let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-writer.commit().unwrap();
-let reader = index
-.reader_builder()
-.reload_policy(ReloadPolicy::Manual)
-.try_into()
-.unwrap();
-assert_eq!(reader.searcher().num_docs(), 0);
-writer.add_document(doc!(field=>1u64));
-writer.commit().unwrap();
-thread::sleep(Duration::from_millis(500));
-assert_eq!(reader.searcher().num_docs(), 0);
-reader.reload().unwrap();
-assert_eq!(reader.searcher().num_docs(), 1);
-}
-
-#[test]
-fn test_index_on_commit_reload_policy_different_directories() {
-let schema = throw_away_schema();
-let field = schema.get_field("num_likes").unwrap();
-let tempdir = TempDir::new("index").unwrap();
-let tempdir_path = PathBuf::from(tempdir.path());
-let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
-let read_index = Index::open_in_dir(&tempdir_path).unwrap();
-let reader = read_index
-.reader_builder()
-.reload_policy(ReloadPolicy::OnCommit)
-.try_into()
-.unwrap();
-assert_eq!(reader.searcher().num_docs(), 0);
-let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
-test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
-}
 }

 fn test_index_on_commit_reload_policy_aux(
@@ -260,98 +260,95 @@ impl Clone for ManagedDirectory {
 #[cfg(test)]
 mod tests {

+use super::*;
 #[cfg(feature = "mmap")]
-mod mmap_specific {
+use directory::MmapDirectory;
+use std::io::Write;
+use std::path::Path;
+use tempdir::TempDir;

-use super::super::*;
+lazy_static! {
-use std::path::Path;
+static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
-use tempdir::TempDir;
+static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
+}
-lazy_static! {
-static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
-static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
-}

-use directory::MmapDirectory;
-use std::io::Write;

-#[test]
-fn test_managed_directory() {
-let tempdir = TempDir::new("index").unwrap();
-let tempdir_path = PathBuf::from(tempdir.path());
-{
-let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
-{
-let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
-write_file.flush().unwrap();
-}
-{
-managed_directory
-.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
-.unwrap();
-}
-{
-assert!(managed_directory.exists(*TEST_PATH1));
-assert!(managed_directory.exists(*TEST_PATH2));
-}
-{
-let living_files: HashSet<PathBuf> =
-[TEST_PATH1.to_owned()].into_iter().cloned().collect();
-managed_directory.garbage_collect(|| living_files);
-}
-{
-assert!(managed_directory.exists(*TEST_PATH1));
-assert!(!managed_directory.exists(*TEST_PATH2));
-}
-}
-{
-let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
-{
-assert!(managed_directory.exists(*TEST_PATH1));
-assert!(!managed_directory.exists(*TEST_PATH2));
-}
-{
-let living_files: HashSet<PathBuf> = HashSet::new();
-managed_directory.garbage_collect(|| living_files);
-}
-{
-assert!(!managed_directory.exists(*TEST_PATH1));
-assert!(!managed_directory.exists(*TEST_PATH2));
-}
-}
-}

-#[test]
-fn test_managed_directory_gc_while_mmapped() {
-let tempdir = TempDir::new("index").unwrap();
-let tempdir_path = PathBuf::from(tempdir.path());
-let living_files = HashSet::new();

+#[test]
+#[cfg(feature = "mmap")]
+fn test_managed_directory() {
+let tempdir = TempDir::new("index").unwrap();
+let tempdir_path = PathBuf::from(tempdir.path());
+{
 let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
 let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
-managed_directory
+{
-.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
+let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
-.unwrap();
+write_file.flush().unwrap();
-assert!(managed_directory.exists(*TEST_PATH1));
+}
+{
-let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
+managed_directory
-managed_directory.garbage_collect(|| living_files.clone());
+.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
-if cfg!(target_os = "windows") {
+.unwrap();
-// On Windows, gc should try and fail the file as it is mmapped.
+}
+{
 assert!(managed_directory.exists(*TEST_PATH1));
-// unmap should happen here.
+assert!(managed_directory.exists(*TEST_PATH2));
-drop(_mmap_read);
+}
-// The file should still be in the list of managed file and
+{
-// eventually be deleted once mmap is released.
+let living_files: HashSet<PathBuf> =
+[TEST_PATH1.to_owned()].into_iter().cloned().collect();
 managed_directory.garbage_collect(|| living_files);
-assert!(!managed_directory.exists(*TEST_PATH1));
+}
-} else {
+{
-assert!(!managed_directory.exists(*TEST_PATH1));
+assert!(managed_directory.exists(*TEST_PATH1));
+assert!(!managed_directory.exists(*TEST_PATH2));
 }
 }
+{
+let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
+let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
+{
+assert!(managed_directory.exists(*TEST_PATH1));
+assert!(!managed_directory.exists(*TEST_PATH2));
+}
+{
+let living_files: HashSet<PathBuf> = HashSet::new();
+managed_directory.garbage_collect(|| living_files);
+}
+{
+assert!(!managed_directory.exists(*TEST_PATH1));
+assert!(!managed_directory.exists(*TEST_PATH2));
+}
+}
+}

+#[test]
+#[cfg(feature = "mmap ")]
+fn test_managed_directory_gc_while_mmapped() {
+let tempdir = TempDir::new("index").unwrap();
+let tempdir_path = PathBuf::from(tempdir.path());
+let living_files = HashSet::new();

+let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
+let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
+managed_directory
+.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
+.unwrap();
+assert!(managed_directory.exists(*TEST_PATH1));

+let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
+managed_directory.garbage_collect(|| living_files.clone());
+if cfg!(target_os = "windows") {
+// On Windows, gc should try and fail the file as it is mmapped.
+assert!(managed_directory.exists(*TEST_PATH1));
+// unmap should happen here.
+drop(_mmap_read);
+// The file should still be in the list of managed file and
+// eventually be deleted once mmap is released.
+managed_directory.garbage_collect(|| living_files);
+assert!(!managed_directory.exists(*TEST_PATH1));
+} else {
+assert!(!managed_directory.exists(*TEST_PATH1));
+}
 }

 }
@@ -6,6 +6,7 @@ use self::notify::RawEvent;
 use self::notify::RecursiveMode;
 use self::notify::Watcher;
 use atomicwrites;
+use common::make_io_err;
 use core::META_FILEPATH;
 use directory::error::LockError;
 use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};

@@ -36,11 +37,6 @@ use std::sync::Weak;
 use std::thread;
 use tempdir::TempDir;

-/// Create a default io error given a string.
-pub(crate) fn make_io_err(msg: String) -> io::Error {
-io::Error::new(io::ErrorKind::Other, msg)
-}
-
 /// Returns None iff the file exists, can be read, but is empty (and hence
 /// cannot be mmapped)
 fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
@@ -1,5 +1,4 @@
 use common::BitSet;
-use fastfield::DeleteBitSet;
 use std::borrow::Borrow;
 use std::borrow::BorrowMut;
 use std::cmp::Ordering;

@@ -96,23 +95,9 @@ pub trait DocSet {
 }

 /// Returns the number documents matching.
-/// Calling this method consumes the `DocSet`.
-fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
-let mut count = 0u32;
-while self.advance() {
-if !delete_bitset.is_deleted(self.doc()) {
-count += 1u32;
-}
-}
-count
-}
-
-/// Returns the count of documents, deleted or not.
-/// Calling this method consumes the `DocSet`.
 ///
-/// Of course, the result is an upper bound of the result
+/// Calling this method consumes the `DocSet`.
-/// given by `count()`.
+fn count(&mut self) -> u32 {
-fn count_including_deleted(&mut self) -> u32 {
 let mut count = 0u32;
 while self.advance() {
 count += 1u32;

@@ -142,14 +127,9 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
 unboxed.size_hint()
 }

-fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
+fn count(&mut self) -> u32 {
 let unboxed: &mut TDocSet = self.borrow_mut();
-unboxed.count(delete_bitset)
+unboxed.count()
-}
-
-fn count_including_deleted(&mut self) -> u32 {
-let unboxed: &mut TDocSet = self.borrow_mut();
-unboxed.count_including_deleted()
 }

 fn append_to_bitset(&mut self, bitset: &mut BitSet) {
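After this refactor, `count` simply drains the `DocSet`, and the contract for callers is unchanged: `advance()` must return true before `doc()` or `freq()` is read, which the 0.10.0-dev changelog entry removed earlier in this comparison turns into a hard panic for segment postings. A small consumer-side sketch, assuming `DocSet` and `DocId` are re-exported at the crate root as in recent tantivy releases:

```rust
use tantivy::{DocId, DocSet};

// Drains a DocSet into a Vec. advance() must succeed before the first doc();
// calling doc() on a fresh, never-advanced DocSet is what now panics.
fn collect_docs<D: DocSet>(mut docset: D) -> Vec<DocId> {
    let mut docs = Vec::new();
    while docset.advance() {
        docs.push(docset.doc());
    }
    docs
}
```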
@@ -13,6 +13,7 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {

 #[test]
 #[ignore]
+#[cfg(feature = "mmap")]
 fn test_indexing() {
 let mut schema_builder = Schema::builder();
@@ -1,4 +1,3 @@
-use common::MAX_DOC_LIMIT;
 use core::Segment;
 use core::SegmentReader;
 use core::SerializableSegment;

@@ -24,7 +23,6 @@ use termdict::TermMerger;
 use termdict::TermOrdinal;
 use DocId;
 use Result;
-use TantivyError;

 fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
 let mut total_tokens = 0u64;

@@ -152,14 +150,6 @@ impl IndexMerger {
 readers.push(reader);
 }
 }
-if max_doc >= MAX_DOC_LIMIT {
-let err_msg = format!(
-"The segment resulting from this merge would have {} docs,\
-which exceeds the limit {}.",
-max_doc, MAX_DOC_LIMIT
-);
-return Err(TantivyError::InvalidArgument(err_msg));
-}
 Ok(IndexMerger {
 schema,
 readers,
@@ -420,7 +420,6 @@ impl SegmentUpdater {
 })
 .collect::<Vec<_>>();
 merge_candidates.extend(committed_merge_candidates.into_iter());
-
 for merge_operation in merge_candidates {
 match self.start_merge_impl(merge_operation) {
 Ok(merge_future) => {
@@ -174,7 +174,6 @@ extern crate downcast_rs;
 #[macro_use]
 extern crate fail;

-#[cfg(feature = "mmap")]
 #[cfg(test)]
 mod functional_test;
@@ -1,249 +0,0 @@
-use postings::compression::AlignedBuffer;
-
-/// This modules define the logic used to search for a doc in a given
-/// block. (at most 128 docs)
-///
-/// Searching within a block is a hotspot when running intersection.
-/// so it was worth defining it in its own module.
-
-#[cfg(target_arch = "x86_64")]
-mod sse2 {
-use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
-use std::arch::x86_64::__m128i as DataType;
-use std::arch::x86_64::_mm_add_epi32 as op_add;
-use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
-use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
-use std::arch::x86_64::_mm_set1_epi32 as set1;
-use std::arch::x86_64::_mm_setzero_si128 as set0;
-use std::arch::x86_64::_mm_sub_epi32 as op_sub;
-use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
-
-const MASK1: i32 = 78;
-const MASK2: i32 = 177;
-
-/// Performs an exhaustive linear search over the
-///
-/// There is no early exit here. We simply count the
-/// number of elements that are `< target`.
-pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
-unsafe {
-let ptr = arr as *const AlignedBuffer as *const DataType;
-let vkey = set1(target as i32);
-let mut cnt = set0();
-// We work over 4 `__m128i` at a time.
-// A single `__m128i` actual contains 4 `u32`.
-for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
-let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
-let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
-let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
-let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
-let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
-cnt = op_sub(cnt, sum);
-}
-cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
-cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
-_mm_cvtsi128_si32(cnt) as usize
-}
-}
-
-#[cfg(test)]
-mod test {
-use super::linear_search_sse2_128;
-use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
-
-#[test]
-fn test_linear_search_sse2_128_u32() {
-let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
-for el in 0u32..128u32 {
-block[el as usize] = el * 2 + 1 << 18;
-}
-let target = block[64] + 1;
-assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
-}
-}
-}
-
-/// This `linear search` browser exhaustively through the array.
-/// but the early exit is very difficult to predict.
-///
-/// Coupled with `exponential search` this function is likely
-/// to be called with the same `len`
-fn linear_search(arr: &[u32], target: u32) -> usize {
-arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
-}
-
-fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
-let end = arr.len();
-let mut begin = 0;
-for &pivot in &[1, 3, 7, 15, 31, 63] {
-if pivot >= end {
-break;
-}
-if arr[pivot] > target {
-return (begin, pivot);
-}
-begin = pivot;
-}
-(begin, end)
-}
-
-fn galloping(block_docs: &[u32], target: u32) -> usize {
-let (start, end) = exponential_search(&block_docs, target);
-start + linear_search(&block_docs[start..end], target)
-}
-
-/// Tantivy may rely on SIMD instructions to search for a specific document within
-/// a given block.
-#[derive(Clone, Copy, PartialEq)]
-pub enum BlockSearcher {
-#[cfg(target_arch = "x86_64")]
-SSE2,
-Scalar,
-}
-
-impl BlockSearcher {
-/// Search the first index containing an element greater or equal to
-/// the target.
-///
-/// The results should be equivalent to
-/// ```ignore
-/// block[..]
-// .iter()
-// .take_while(|&&val| val < target)
-// .count()
-/// ```
-///
-/// The `start` argument is just used to hint that the response is
-/// greater than beyond `start`. The implementation may or may not use
-/// it for optimization.
-///
-/// # Assumption
-///
-/// The array len is > start.
-/// The block is sorted
-/// The target is assumed greater or equal to the `arr[start]`.
-/// The target is assumed smaller or equal to the last element of the block.
-///
-/// Currently the scalar implementation starts by an exponential search, and
-/// then operates a linear search in the result subarray.
-///
-/// If SSE2 instructions are available in the `(platform, running CPU)`,
-/// then we use a different implementation that does an exhaustive linear search over
-/// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
-/// of branch.
-pub(crate) fn search_in_block(
-self,
-block_docs: &AlignedBuffer,
-len: usize,
-start: usize,
-target: u32,
-) -> usize {
-#[cfg(target_arch = "x86_64")]
-{
-use postings::compression::COMPRESSION_BLOCK_SIZE;
-if self == BlockSearcher::SSE2 && len == COMPRESSION_BLOCK_SIZE {
-return sse2::linear_search_sse2_128(block_docs, target);
-}
-}
-start + galloping(&block_docs.0[start..len], target)
-}
-}
-
-impl Default for BlockSearcher {
-fn default() -> BlockSearcher {
-#[cfg(target_arch = "x86_64")]
-{
-if is_x86_feature_detected!("sse2") {
-return BlockSearcher::SSE2;
-}
-}
-BlockSearcher::Scalar
-}
-}
-
-#[cfg(test)]
-mod tests {
-use super::exponential_search;
-use super::linear_search;
-use super::BlockSearcher;
-use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
-
-#[test]
-fn test_linear_search() {
-let len: usize = 50;
-let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
-for target in 1..*arr.last().unwrap() {
-let res = linear_search(&arr[..], target);
-if res > 0 {
-assert!(arr[res - 1] < target);
-}
-if res < len {
-assert!(arr[res] >= target);
-}
-}
-}
-
-#[test]
-fn test_exponentiel_search() {
-assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
-assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
-assert_eq!(
-exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
-(3, 7)
-);
-}
-
-fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
-let cursor = search_in_block_trivial_but_slow(block, target);
-assert!(block.len() < COMPRESSION_BLOCK_SIZE);
-let mut output_buffer = [u32::max_value(); COMPRESSION_BLOCK_SIZE];
-output_buffer[..block.len()].copy_from_slice(block);
-for i in 0..cursor {
-assert_eq!(
-block_searcher.search_in_block(
-&AlignedBuffer(output_buffer),
-block.len(),
-i,
-target
-),
-cursor
-);
-}
-}
-
-fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
-use std::collections::HashSet;
-let mut targets = HashSet::new();
-for (i, val) in block.iter().cloned().enumerate() {
-if i > 0 {
-targets.insert(val - 1);
-}
-targets.insert(val);
-}
-for target in targets {
-util_test_search_in_block(block_searcher, block, target);
-}
-}
-
-fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
-block.iter().take_while(|&&val| val < target).count()
-}
-
-fn test_search_in_block_util(block_searcher: BlockSearcher) {
-for len in 1u32..128u32 {
-let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
-util_test_search_in_block_all(block_searcher, &v[..]);
-}
-}
-
-#[test]
-fn test_search_in_block_scalar() {
-test_search_in_block_util(BlockSearcher::Scalar);
-}
-
-#[cfg(target_arch = "x86_64")]
-#[test]
-fn test_search_in_block_sse2() {
-test_search_in_block_util(BlockSearcher::SSE2);
-}
-}
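The scalar path of the module deleted above (an exponential probe followed by a linear count) is re-added lower in this comparison as free functions next to `impl DocSet for SegmentPostings`. Below is a self-contained sketch working through the probe sequence with the values from the removed `test_exponentiel_search` test; the two functions are copied from the patch, the `main` is illustrative only.

```rust
// Copied from the patch: counts elements < target, with no early exit.
fn linear_search(arr: &[u32], target: u32) -> usize {
    arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}

// Copied from the patch: probes indices 1, 3, 7, 15, 31, 63 and returns the
// window (begin, end) that must contain the first element >= target.
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
    let end = arr.len();
    let mut begin = 0;
    for &pivot in &[1, 3, 7, 15, 31, 63] {
        if pivot >= end {
            break;
        }
        if arr[pivot] > target {
            return (begin, pivot);
        }
        begin = pivot;
    }
    (begin, end)
}

fn main() {
    // arr = [1, 2, ..., 11], target = 7: arr[1] = 2 and arr[3] = 4 are not
    // greater than 7, arr[7] = 8 is, so the window is (3, 7).
    let arr: Vec<u32> = (1..=11).collect();
    assert_eq!(exponential_search(&arr, 7), (3, 7));
    // Within arr[3..7] = [4, 5, 6, 7], three elements are < 7, so the first
    // index holding a value >= 7 is 3 + 3 = 6 (arr[6] == 7).
    assert_eq!(3 + linear_search(&arr[3..7], 7), 6);
}
```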
@@ -43,14 +43,9 @@ impl BlockEncoder {
 }
 }

-/// We ensure that the OutputBuffer is align on 128 bits
-/// in order to run SSE2 linear search on it.
-#[repr(align(128))]
-pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
-
 pub struct BlockDecoder {
 bitpacker: BitPacker4x,
-output: AlignedBuffer,
+pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
 pub output_len: usize,
 }

@@ -60,9 +55,11 @@ impl BlockDecoder {
 }

 pub fn with_val(val: u32) -> BlockDecoder {
+let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
+output[COMPRESSION_BLOCK_SIZE] = 0u32;
 BlockDecoder {
 bitpacker: BitPacker4x::new(),
-output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
+output,
 output_len: 0,
 }
 }

@@ -75,28 +72,23 @@ impl BlockDecoder {
 ) -> usize {
 self.output_len = COMPRESSION_BLOCK_SIZE;
 self.bitpacker
-.decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
+.decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
 }

 pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
 self.output_len = COMPRESSION_BLOCK_SIZE;
 self.bitpacker
-.decompress(&compressed_data, &mut self.output.0, num_bits)
+.decompress(&compressed_data, &mut self.output, num_bits)
 }

 #[inline]
 pub fn output_array(&self) -> &[u32] {
-&self.output.0[..self.output_len]
+&self.output[..self.output_len]
-}
-
-#[inline]
-pub(crate) fn output_aligned(&self) -> (&AlignedBuffer, usize) {
-(&self.output, self.output_len)
 }

 #[inline]
 pub fn output(&self, idx: usize) -> u32 {
-self.output.0[idx]
+self.output[idx]
 }
 }

@@ -167,12 +159,12 @@ impl VIntDecoder for BlockDecoder {
 num_els: usize,
 ) -> usize {
 self.output_len = num_els;
-vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
+vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
 }

 fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
 self.output_len = num_els;
-vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
+vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
 }
 }
@@ -2,7 +2,6 @@
 Postings module (also called inverted index)
 */

-mod block_search;
 pub(crate) mod compression;
 /// Postings module
 ///

@@ -17,8 +16,6 @@ mod skip;
 mod stacker;
 mod term_info;

-pub(crate) use self::block_search::BlockSearcher;
-
 pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
 pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};

@@ -55,15 +52,13 @@ pub mod tests {
 use fieldnorm::FieldNormReader;
 use indexer::operation::AddOperation;
 use indexer::SegmentWriter;
-use merge_policy::NoMergePolicy;
 use query::Scorer;
 use rand::rngs::StdRng;
 use rand::{Rng, SeedableRng};
+use schema::Field;
+use schema::IndexRecordOption;
 use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
-use schema::{Field, TextOptions};
-use schema::{IndexRecordOption, TextFieldIndexing};
 use std::iter;
-use tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
 use DocId;
 use Score;

@@ -109,7 +104,9 @@ pub mod tests {
 let searcher = index.reader().unwrap().searcher();
 let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
 let term = Term::from_field_text(title, "abc");
+
 let mut positions = Vec::new();
+
 {
 let mut postings = inverted_index
 .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)

@@ -162,52 +159,6 @@ pub mod tests {
 }
 }

-#[test]
-pub fn test_drop_token_that_are_too_long() {
-let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
-let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
-exceeding_token_text.push_str(" hello");
-let mut schema_builder = Schema::builder();
-let text_options = TextOptions::default().set_indexing_options(
-TextFieldIndexing::default()
-.set_index_option(IndexRecordOption::WithFreqsAndPositions)
-.set_tokenizer("simple_no_truncation"),
-);
-let text_field = schema_builder.add_text_field("text", text_options);
-let schema = schema_builder.build();
-let index = Index::create_in_ram(schema.clone());
-index
-.tokenizers()
-.register("simple_no_truncation", SimpleTokenizer);
-let reader = index.reader().unwrap();
-let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-index_writer.set_merge_policy(Box::new(NoMergePolicy));
-{
-index_writer.add_document(doc!(text_field=>exceeding_token_text));
-index_writer.commit().unwrap();
-reader.reload().unwrap();
-let searcher = reader.searcher();
-let segment_reader = searcher.segment_reader(0u32);
-let inverted_index = segment_reader.inverted_index(text_field);
-assert_eq!(inverted_index.terms().num_terms(), 1);
-let mut bytes = vec![];
-assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
-assert_eq!(&bytes, b"hello");
-}
-{
-index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
-index_writer.commit().unwrap();
-reader.reload().unwrap();
-let searcher = reader.searcher();
-let segment_reader = searcher.segment_reader(1u32);
-let inverted_index = segment_reader.inverted_index(text_field);
-assert_eq!(inverted_index.terms().num_terms(), 1);
-let mut bytes = vec![];
-assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
-assert_eq!(&bytes[..], ok_token_text.as_bytes());
-}
-}
-
 #[test]
 pub fn test_position_and_fieldnorm1() {
 let mut positions = Vec::new();
@@ -12,8 +12,8 @@ use std::io;
 use std::marker::PhantomData;
 use std::ops::DerefMut;
 use termdict::TermOrdinal;
+use tokenizer::Token;
 use tokenizer::TokenStream;
-use tokenizer::{Token, MAX_TOKEN_LEN};
 use DocId;
 use Result;

@@ -210,18 +210,8 @@ pub trait PostingsWriter {
 ) -> u32 {
 let mut term = Term::for_field(field);
 let mut sink = |token: &Token| {
-// We skip all tokens with a len greater than u16.
+term.set_text(token.text.as_str());
-if token.text.len() <= MAX_TOKEN_LEN {
+self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
-term.set_text(token.text.as_str());
-self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
-} else {
-info!(
-"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
-MAX_TOKEN_LEN in the documentation for more information.",
-token.text.len(),
-MAX_TOKEN_LEN
-);
-}
 };
 token_stream.process(&mut sink)
 }
@@ -4,10 +4,9 @@ use common::{BinarySerializable, VInt};
|
|||||||
use docset::{DocSet, SkipResult};
|
use docset::{DocSet, SkipResult};
|
||||||
use owned_read::OwnedRead;
|
use owned_read::OwnedRead;
|
||||||
use positions::PositionReader;
|
use positions::PositionReader;
|
||||||
use postings::compression::{compressed_block_size, AlignedBuffer};
|
use postings::compression::compressed_block_size;
|
||||||
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
||||||
use postings::serializer::PostingsSerializer;
|
use postings::serializer::PostingsSerializer;
|
||||||
use postings::BlockSearcher;
|
|
||||||
use postings::FreqReadingOption;
|
use postings::FreqReadingOption;
|
||||||
use postings::Postings;
|
use postings::Postings;
|
||||||
use postings::SkipReader;
|
use postings::SkipReader;
|
||||||
@@ -61,7 +60,6 @@ pub struct SegmentPostings {
|
|||||||
block_cursor: BlockSegmentPostings,
|
block_cursor: BlockSegmentPostings,
|
||||||
cur: usize,
|
cur: usize,
|
||||||
position_computer: Option<PositionComputer>,
|
position_computer: Option<PositionComputer>,
|
||||||
block_searcher: BlockSearcher,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentPostings {
|
impl SegmentPostings {
|
||||||
@@ -72,7 +70,6 @@ impl SegmentPostings {
|
|||||||
block_cursor: empty_block_cursor,
|
block_cursor: empty_block_cursor,
|
||||||
cur: COMPRESSION_BLOCK_SIZE,
|
cur: COMPRESSION_BLOCK_SIZE,
|
||||||
position_computer: None,
|
position_computer: None,
|
||||||
block_searcher: BlockSearcher::default(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -120,33 +117,42 @@ impl SegmentPostings {
|
|||||||
block_cursor: segment_block_postings,
|
block_cursor: segment_block_postings,
|
||||||
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
|
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
|
||||||
position_computer: positions_stream_opt.map(PositionComputer::new),
|
position_computer: positions_stream_opt.map(PositionComputer::new),
|
||||||
block_searcher: BlockSearcher::default(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DocSet for SegmentPostings {
|
fn linear_search(arr: &[u32], target: u32) -> usize {
|
||||||
// goes to the next element.
|
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
|
||||||
// next needs to be called a first time to point to the correct element.
|
}
|
||||||
#[inline]
|
|
||||||
fn advance(&mut self) -> bool {
|
|
||||||
if self.position_computer.is_some() && self.cur < COMPRESSION_BLOCK_SIZE {
|
|
||||||
let term_freq = self.term_freq() as usize;
|
|
||||||
if let Some(position_computer) = self.position_computer.as_mut() {
|
|
||||||
position_computer.add_skip(term_freq);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self.cur += 1;
|
|
||||||
if self.cur >= self.block_cursor.block_len() {
|
|
||||||
self.cur = 0;
|
|
||||||
if !self.block_cursor.advance() {
|
|
||||||
self.cur = COMPRESSION_BLOCK_SIZE;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
true
|
|
||||||
}
|
|
||||||
|
|
||||||
|
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
|
||||||
|
let end = arr.len();
|
||||||
|
let mut begin = 0;
|
||||||
|
for &pivot in &[1, 3, 7, 15, 31, 63] {
|
||||||
|
if pivot >= end {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if arr[pivot] > target {
|
||||||
|
return (begin, pivot);
|
||||||
|
}
|
||||||
|
begin = pivot;
|
||||||
|
}
|
||||||
|
(begin, end)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Search the first index containing an element greater or equal to the target.
|
||||||
|
///
|
||||||
|
/// # Assumption
|
||||||
|
///
|
||||||
|
/// The array is assumed non empty.
|
||||||
|
/// The target is assumed greater or equal to the first element.
|
||||||
|
/// The target is assumed smaller or equal to the last element.
|
||||||
|
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
|
||||||
|
let (start, end) = exponential_search(block_docs, target);
|
||||||
|
start + linear_search(&block_docs[start..end], target)
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DocSet for SegmentPostings {
|
||||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||||
if !self.advance() {
|
if !self.advance() {
|
||||||
return SkipResult::End;
|
return SkipResult::End;
|
||||||
@@ -169,6 +175,7 @@ impl DocSet for SegmentPostings {
|
|||||||
|
|
||||||
// skip blocks until one that might contain the target
|
// skip blocks until one that might contain the target
|
||||||
// check if we need to go to the next block
|
// check if we need to go to the next block
|
||||||
|
let need_positions = self.position_computer.is_some();
|
||||||
let mut sum_freqs_skipped: u32 = 0;
|
let mut sum_freqs_skipped: u32 = 0;
|
||||||
if !self
|
if !self
|
||||||
.block_cursor
|
.block_cursor
|
||||||
@@ -182,7 +189,7 @@ impl DocSet for SegmentPostings {
|
|||||||
// we are not in the right block.
|
// we are not in the right block.
|
||||||
//
|
//
|
||||||
// First compute all of the freqs skipped from the current block.
|
// First compute all of the freqs skipped from the current block.
|
||||||
if self.position_computer.is_some() {
|
if need_positions {
|
||||||
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
|
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
|
||||||
match self.block_cursor.skip_to(target) {
|
match self.block_cursor.skip_to(target) {
|
||||||
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
|
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
|
||||||
@@ -201,21 +208,25 @@ impl DocSet for SegmentPostings {
|
|||||||
self.cur = 0;
|
self.cur = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
let cur = self.cur;
|
|
||||||
|
|
||||||
// we're in the right block now, start with an exponential search
|
// we're in the right block now, start with an exponential search
|
||||||
let (output, len) = self.block_cursor.docs_aligned();
|
let block_docs = self.block_cursor.docs();
|
||||||
let new_cur = self
|
let new_cur = self
|
||||||
.block_searcher
|
.cur
|
||||||
.search_in_block(&output, len, cur, target);
|
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
|
||||||
if let Some(position_computer) = self.position_computer.as_mut() {
|
|
||||||
sum_freqs_skipped += self.block_cursor.freqs()[cur..new_cur].iter().sum::<u32>();
|
if need_positions {
|
||||||
position_computer.add_skip(sum_freqs_skipped as usize);
|
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
|
||||||
|
.iter()
|
||||||
|
.sum::<u32>();
|
||||||
|
self.position_computer
|
||||||
|
.as_mut()
|
||||||
|
.unwrap()
|
||||||
|
.add_skip(sum_freqs_skipped as usize);
|
||||||
}
|
}
|
||||||
self.cur = new_cur;
|
self.cur = new_cur;
|
||||||
|
|
||||||
// `doc` is now the first element >= `target`
|
// `doc` is now the first element >= `target`
|
||||||
let doc = output.0[new_cur];
|
let doc = block_docs[new_cur];
|
||||||
debug_assert!(doc >= target);
|
debug_assert!(doc >= target);
|
||||||
if doc == target {
|
if doc == target {
|
||||||
SkipResult::Reached
|
SkipResult::Reached
|
||||||
@@ -224,25 +235,40 @@ impl DocSet for SegmentPostings
 }
 }
 
+// goes to the next element.
+// next needs to be called a first time to point to the correct element.
+#[inline]
+fn advance(&mut self) -> bool {
+if self.position_computer.is_some() {
+let term_freq = self.term_freq() as usize;
+self.position_computer.as_mut().unwrap().add_skip(term_freq);
+}
+self.cur += 1;
+if self.cur >= self.block_cursor.block_len() {
+self.cur = 0;
+if !self.block_cursor.advance() {
+self.cur = COMPRESSION_BLOCK_SIZE;
+return false;
+}
+}
+true
+}
+
+fn size_hint(&self) -> u32 {
+self.len() as u32
+}
+
 /// Return the current document's `DocId`.
-///
-/// # Panics
-///
-/// Will panics if called without having called advance before.
 #[inline]
 fn doc(&self) -> DocId {
 let docs = self.block_cursor.docs();
 debug_assert!(
 self.cur < docs.len(),
-"Have you forgotten to call `.advance()` at least once before calling `.doc()` ."
+"Have you forgotten to call `.advance()` at least once before calling .doc()."
 );
 docs[self.cur]
 }
 
-fn size_hint(&self) -> u32 {
-self.len() as u32
-}
-
 fn append_to_bitset(&mut self, bitset: &mut BitSet) {
 // finish the current block
 if self.advance() {
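
The `advance` method added above spells out the cursor contract that the `doc()` assertion relies on: `advance()` must be called once before reading, and it signals exhaustion by returning false. A self-contained toy model of that contract follows; the `BlockCursor` type and its start-before-first trick are illustrative, not tantivy's actual types.

```rust
struct BlockCursor {
    docs: Vec<u32>,
    cur: usize,
}

impl BlockCursor {
    fn new(docs: Vec<u32>) -> BlockCursor {
        // Start one step "before" the first element, so the first advance()
        // lands on index 0.
        BlockCursor { docs, cur: usize::max_value() }
    }

    // Returns false once the docs are exhausted.
    fn advance(&mut self) -> bool {
        self.cur = self.cur.wrapping_add(1);
        self.cur < self.docs.len()
    }

    // Only valid after advance() has returned true at least once.
    fn doc(&self) -> u32 {
        self.docs[self.cur]
    }
}

fn main() {
    let mut cursor = BlockCursor::new(vec![2, 3, 7]);
    let mut collected = Vec::new();
    while cursor.advance() {
        collected.push(cursor.doc());
    }
    assert_eq!(collected, vec![2, 3, 7]);
}
```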
@@ -266,33 +292,17 @@ impl HasLen for SegmentPostings
 }
 
 impl Postings for SegmentPostings {
-/// Returns the frequency associated to the current document.
-/// If the schema is set up so that no frequency have been encoded,
-/// this method should always return 1.
-///
-/// # Panics
-///
-/// Will panics if called without having called advance before.
 fn term_freq(&self) -> u32 {
-debug_assert!(
-// Here we do not use the len of `freqs()`
-// because it is actually ok to request for the freq of doc
-// even if no frequency were encoded for the field.
-//
-// In that case we hit the block just as if the frequency had been
-// decoded. The block is simply prefilled by the value 1.
-self.cur < COMPRESSION_BLOCK_SIZE,
-"Have you forgotten to call `.advance()` at least once before calling \
-`.term_freq()`."
-);
 self.block_cursor.freq(self.cur)
 }
 
 fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
-let term_freq = self.term_freq() as usize;
-if let Some(position_comp) = self.position_computer.as_mut() {
-output.resize(term_freq, 0u32);
-position_comp.positions_with_offset(offset, &mut output[..]);
+if self.position_computer.is_some() {
+output.resize(self.term_freq() as usize, 0u32);
+self.position_computer
+.as_mut()
+.unwrap()
+.positions_with_offset(offset, &mut output[..])
 } else {
 output.clear();
 }
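
A rough, self-contained sketch of the `positions_with_offset` contract visible in this hunk, under the assumption (suggested by the resize-to-term-freq call) that each stored position is emitted shifted by `offset`; the `raw_positions` argument is a stand-in for the position computer, not tantivy's API.

```rust
fn positions_with_offset(raw_positions: Option<&[u32]>, offset: u32, output: &mut Vec<u32>) {
    output.clear();
    if let Some(positions) = raw_positions {
        // One entry per occurrence of the term, shifted by `offset`.
        output.extend(positions.iter().map(|p| p + offset));
    }
    // With no positions indexed, the output stays empty, mirroring the
    // `output.clear()` branch above.
}

fn main() {
    let mut out = Vec::new();
    positions_with_offset(Some(&[0, 4, 9]), 10, &mut out);
    assert_eq!(out, vec![10, 14, 19]);
    positions_with_offset(None, 10, &mut out);
    assert!(out.is_empty());
}
```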
@@ -414,10 +424,6 @@ impl BlockSegmentPostings
 self.doc_decoder.output_array()
 }
 
-pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
-self.doc_decoder.output_aligned()
-}
-
 /// Return the document at index `idx` of the block.
 #[inline]
 pub fn doc(&self, idx: usize) -> u32 {
@@ -608,13 +614,16 @@ impl<'b> Streamer<'b> for BlockSegmentPostings
 
 #[cfg(test)]
 mod tests {
+
+use super::exponential_search;
+use super::linear_search;
+use super::search_within_block;
 use super::BlockSegmentPostings;
 use super::BlockSegmentPostingsSkipResult;
 use super::SegmentPostings;
 use common::HasLen;
 use core::Index;
 use docset::DocSet;
-use postings::postings::Postings;
 use schema::IndexRecordOption;
 use schema::Schema;
 use schema::Term;
@@ -623,6 +632,21 @@ mod tests {
 use DocId;
 use SkipResult;
 
+#[test]
+fn test_linear_search() {
+let len: usize = 50;
+let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
+for target in 1..*arr.last().unwrap() {
+let res = linear_search(&arr[..], target);
+if res > 0 {
+assert!(arr[res - 1] < target);
+}
+if res < len {
+assert!(arr[res] >= target);
+}
+}
+}
+
 #[test]
 fn test_empty_segment_postings() {
 let mut postings = SegmentPostings::empty();
@@ -631,18 +655,6 @@ mod tests {
 assert_eq!(postings.len(), 0);
 }
 
-#[test]
-#[should_panic(expected = "Have you forgotten to call `.advance()`")]
-fn test_panic_if_doc_called_before_advance() {
-SegmentPostings::empty().doc();
-}
-
-#[test]
-#[should_panic(expected = "Have you forgotten to call `.advance()`")]
-fn test_panic_if_freq_called_before_advance() {
-SegmentPostings::empty().term_freq();
-}
-
 #[test]
 fn test_empty_block_segment_postings() {
 let mut postings = BlockSegmentPostings::empty();
@@ -650,6 +662,56 @@ mod tests {
 assert_eq!(postings.doc_freq(), 0);
 }
 
+fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
+block
+.iter()
+.cloned()
+.enumerate()
+.filter(|&(_, ref val)| *val >= target)
+.next()
+.unwrap()
+.0
+}
+
+#[test]
+fn test_exponentiel_search() {
+assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
+assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
+assert_eq!(
+exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
+(3, 7)
+);
+}
+
+fn util_test_search_within_block(block: &[u32], target: u32) {
+assert_eq!(
+search_within_block(block, target),
+search_within_block_trivial_but_slow(block, target)
+);
+}
+
+fn util_test_search_within_block_all(block: &[u32]) {
+use std::collections::HashSet;
+let mut targets = HashSet::new();
+for (i, val) in block.iter().cloned().enumerate() {
+if i > 0 {
+targets.insert(val - 1);
+}
+targets.insert(val);
+}
+for target in targets {
+util_test_search_within_block(block, target);
+}
+}
+
+#[test]
+fn test_search_within_block() {
+for len in 1u32..128u32 {
+let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
+util_test_search_within_block_all(&v[..]);
+}
+}
+
 #[test]
 fn test_block_segment_postings() {
 let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
@@ -14,7 +14,7 @@ use termdict::{TermDictionaryBuilder, TermOrdinal};
 use DocId;
 use Result;
 
-/// `InvertedIndexSerializer` is in charge of serializing
+/// `PostingsSerializer` is in charge of serializing
 /// postings on disk, in the
 /// * `.idx` (inverted index)
 /// * `.pos` (positions file)
@@ -54,7 +54,7 @@ pub struct InvertedIndexSerializer {
 }
 
 impl InvertedIndexSerializer {
-/// Open a new `InvertedIndexSerializer` for the given segment
+/// Open a new `PostingsSerializer` for the given segment
 fn create(
 terms_write: CompositeWrite<WritePtr>,
 postings_write: CompositeWrite<WritePtr>,
@@ -14,35 +14,41 @@ use Score;
 /// specialized implementation if the two
 /// shortest scorers are `TermScorer`s.
 pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
-if scorers.is_empty() {
-return Box::new(EmptyScorer);
-}
-if scorers.len() == 1 {
-return scorers.pop().unwrap();
-}
-// We know that we have at least 2 elements.
 let num_docsets = scorers.len();
 scorers.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
-let left = scorers.pop().unwrap();
-let right = scorers.pop().unwrap();
+let rarest_opt = scorers.pop();
+let second_rarest_opt = scorers.pop();
 scorers.reverse();
-let all_term_scorers = [&left, &right]
-.iter()
-.all(|&scorer| scorer.is::<TermScorer>());
-if all_term_scorers {
-return Box::new(Intersection {
-left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
-right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
-others: scorers,
-num_docsets,
-});
+match (rarest_opt, second_rarest_opt) {
+(None, None) => Box::new(EmptyScorer),
+(Some(single_docset), None) => single_docset,
+(Some(left), Some(right)) => {
+{
+let all_term_scorers = [&left, &right]
+.iter()
+.all(|&scorer| scorer.is::<TermScorer>());
+if all_term_scorers {
+let left = *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap());
+let right = *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap());
+return Box::new(Intersection {
+left,
+right,
+others: scorers,
+num_docsets,
+});
+}
+}
+Box::new(Intersection {
+left,
+right,
+others: scorers,
+num_docsets,
+})
+}
+_ => {
+unreachable!();
+}
 }
-Box::new(Intersection {
-left,
-right,
-others: scorers,
-num_docsets,
-})
 }
 
 /// Creates a `DocSet` that iterator through the intersection of two `DocSet`s.
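
The rewritten `intersect_scorers` keeps the same driving idea as before: sort so that the rarest docset leads, then verify each of its candidates against the remaining docsets. Below is a self-contained sketch of that idea over plain sorted `Vec`s, which only stand in for the `Scorer` trait objects.

```rust
fn intersect_sorted(mut lists: Vec<Vec<u32>>) -> Vec<u32> {
    if lists.is_empty() {
        return Vec::new();
    }
    // Rarest (shortest) list first: it drives the candidates, like the
    // `size_hint`-based sort in the scorer code.
    lists.sort_by_key(|list| list.len());
    let (driver, others) = lists.split_first().unwrap();
    driver
        .iter()
        .cloned()
        .filter(|candidate| {
            // Keep a candidate only if every other list also contains it.
            others.iter().all(|list| list.binary_search(candidate).is_ok())
        })
        .collect()
}

fn main() {
    let lists = vec![vec![1, 3, 5, 7, 9], vec![3, 7], vec![2, 3, 5, 7, 11]];
    assert_eq!(intersect_sorted(lists), vec![3, 7]);
}
```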
@@ -118,6 +124,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
 return false;
 }
 }
+
 match left.skip_next(candidate) {
 SkipResult::Reached => {
 break;
@@ -133,36 +140,35 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
 }
 // test the remaining scorers;
 for (ord, docset) in self.others.iter_mut().enumerate() {
-if ord == other_candidate_ord {
-continue;
-}
-// `candidate_ord` is already at the
-// right position.
-//
-// Calling `skip_next` would advance this docset
-// and miss it.
-match docset.skip_next(candidate) {
-SkipResult::Reached => {}
-SkipResult::OverStep => {
-// this is not in the intersection,
-// let's update our candidate.
-candidate = docset.doc();
-match left.skip_next(candidate) {
-SkipResult::Reached => {
-other_candidate_ord = ord;
-}
-SkipResult::OverStep => {
-candidate = left.doc();
-other_candidate_ord = usize::max_value();
-}
-SkipResult::End => {
-return false;
+if ord != other_candidate_ord {
+// `candidate_ord` is already at the
+// right position.
+//
+// Calling `skip_next` would advance this docset
+// and miss it.
+match docset.skip_next(candidate) {
+SkipResult::Reached => {}
+SkipResult::OverStep => {
+// this is not in the intersection,
+// let's update our candidate.
+candidate = docset.doc();
+match left.skip_next(candidate) {
+SkipResult::Reached => {
+other_candidate_ord = ord;
+}
+SkipResult::OverStep => {
+candidate = left.doc();
+other_candidate_ord = usize::max_value();
+}
+SkipResult::End => {
+return false;
+}
 }
+continue 'outer;
+}
+SkipResult::End => {
+return false;
 }
-continue 'outer;
-}
-SkipResult::End => {
-return false;
-}
 }
 }
 }
@@ -98,20 +98,4 @@ mod tests {
 }
 }
 
-#[test]
-fn test_term_query_count_when_there_are_deletes() {
-let mut schema_builder = Schema::builder();
-let text_field = schema_builder.add_text_field("text", TEXT);
-let schema = schema_builder.build();
-let index = Index::create_in_ram(schema);
-let mut index_writer = index.writer_with_num_threads(1, 5_000_000).unwrap();
-index_writer.add_document(doc!(text_field=>"a b"));
-index_writer.add_document(doc!(text_field=>"a c"));
-index_writer.delete_term(Term::from_field_text(text_field, "b"));
-index_writer.commit().unwrap();
-let term_a = Term::from_field_text(text_field, "a");
-let term_query = TermQuery::new(term_a, IndexRecordOption::Basic);
-let reader = index.reader().unwrap();
-assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1);
-}
 }
@@ -39,15 +39,15 @@ impl Weight for TermWeight {
 }
 
 fn count(&self, reader: &SegmentReader) -> Result<u32> {
-if let Some(delete_bitset) = reader.delete_bitset() {
-Ok(self.scorer(reader)?.count(delete_bitset))
-} else {
+if reader.num_deleted_docs() == 0 {
 let field = self.term.field();
 Ok(reader
 .inverted_index(field)
 .get_term_info(&self.term)
 .map(|term_info| term_info.doc_freq)
 .unwrap_or(0))
+} else {
+Ok(self.scorer(reader)?.count())
 }
 }
 }
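
The hunk above reorders the two counting strategies: when the segment holds no deleted documents, the term dictionary's `doc_freq` is returned directly, and only segments with deletes pay for running the scorer. A hedged, self-contained sketch of that decision with illustrative names (plain slices instead of tantivy's bitsets):

```rust
fn count_matches(doc_freq: u32, matching: &[u32], deleted: &[u32]) -> u32 {
    if deleted.is_empty() {
        // No deletes: the precomputed document frequency is already exact.
        doc_freq
    } else {
        // With deletes, walk the matches and keep only live documents.
        matching
            .iter()
            .copied()
            .filter(|doc| !deleted.contains(doc))
            .count() as u32
    }
}

fn main() {
    assert_eq!(count_matches(2, &[0, 1], &[]), 2);
    assert_eq!(count_matches(2, &[0, 1], &[1]), 1);
}
```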
@@ -145,7 +145,7 @@ where
 }
 }
 
-fn count_including_deleted(&mut self) -> u32 {
+fn count(&mut self) -> u32 {
 let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
 .iter()
 .map(|bitset| bitset.len())
@@ -163,8 +163,6 @@ where
 count
 }
 
-// TODO implement `count` efficiently.
-
 fn skip_next(&mut self, target: DocId) -> SkipResult {
 if !self.advance() {
 return SkipResult::End;
@@ -302,7 +300,7 @@ mod tests {
 count += 1;
 }
 assert!(!union_expected.advance());
-assert_eq!(count, make_union().count_including_deleted());
+assert_eq!(count, make_union().count());
 }
 
 #[test]
@@ -13,11 +13,6 @@ pub trait Weight: Send + Sync + 'static {
 
 /// Returns the number documents within the given `SegmentReader`.
 fn count(&self, reader: &SegmentReader) -> Result<u32> {
-let mut scorer = self.scorer(reader)?;
-if let Some(delete_bitset) = reader.delete_bitset() {
-Ok(scorer.count(delete_bitset))
-} else {
-Ok(scorer.count_including_deleted())
-}
+Ok(self.scorer(reader)?.count())
 }
 }
@@ -97,8 +97,6 @@
 //! If you built your schema programmatically, a complete example
 //! could like this for instance.
 //!
-//! Note that tokens with a len greater or equal to [`MAX_TOKEN_LEN`](./constant.MAX_TOKEN_LEN.html).
-//!
 //! # Example
 //!
 //! ```
@@ -159,13 +157,6 @@ pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
 
-/// Maximum authorized len (in bytes) for a token.
-///
-/// Tokenizer are in charge of not emitting tokens larger than this value.
-/// Currently, if a faulty tokenizer implementation emits tokens with a length larger than
-/// `2^16 - 1 - 4`, the token will simply be ignored downstream.
-pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
-
 #[cfg(test)]
 pub mod tests {
 use super::{