Added logging when token is dropped.

Closes #526 (#535 )
Merge branch 'master' of github.com:tantivy-search/tantivy
2026-02-25 09:10:37 +00:00 · 2019-04-26 09:22:01 +09:00 · 2019-04-24 20:59:48 +09:00 · 2019-04-24 12:31:47 +09:00 · 2019-04-24 12:31:32 +09:00 · 2019-04-23 09:55:55 +09:00
17 changed files with 286 additions and 74 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -29,7 +29,7 @@ addons:
 matrix:
  include:
    # Android
-    - env: TARGET=aarch64-linux-android DISABLE_TESTS
+    - env: TARGET=aarch64-linux-android DISABLE_TESTS=1
    #- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
    #- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
    #- env: TARGET=i686-linux-android DISABLE_TESTS=1
@@ -68,6 +68,11 @@ cache: cargo
 before_cache:
  # Travis can't cache files that are not readable by "others"
  - chmod -R a+r $HOME/.cargo
+  - find ./target/debug -type f -maxdepth 1 -delete
+  - rm -f  ./target/.rustc_info.json
+  - rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
+  - rm -r target/debug/examples/
+  - ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete

 #branches:
 #  only:
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,15 @@
+Tantivy 0.10.0
+====================
+
+
+Minor
+---------
+- Small simplification of the code. 
+Calling .freq() or .doc() when .advance() has never been called
+on segment postings should panic from now on.
+- Tokens exceeding `u16::max_value() - 4` bytes are discarded silently instead of panicking.
+
+
 Tantivy 0.9.0
 =====================
 *0.9.0 index format is not compatible with the 
@@ -17,6 +29,35 @@ previous index format.*
 - Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
 - SIMD linear search within blocks (@fulmicoton)

+## How to update ?
+
+tantivy 0.9 brought some breaking API changes.
+To update from tantivy 0.8, you will need to go through the following steps.
+
+- `schema::INT_INDEXED` and `schema::INT_STORED` should be replaced by `schema::INDEXED` and `schema::STORED`.
+- The index now does not hold the pool of searcher anymore. You are required to create an intermediary object called 
+`IndexReader` for this. 
+    
+    ```rust
+    // Create the reader. You typically need to create one reader for the entire
+    // lifetime of your program.
+    let reader = index.reader()?;
+    
+    // Acquiring a searcher (previously `index.searcher()`) is now written:
+    let searcher = reader.searcher();
+    
+    // With the default setting of the reader, you are not required to 
+    // call `index.load_searchers()` anymore.
+    //
+    // The IndexReader will pick up that change automatically, regardless
+    // of whether the update was done in a different process or not.
+    // If this behavior is not wanted, you can create your reader with 
+    // the `ReloadPolicy::Manual`, and manually decide when to reload the index
+    // by calling `reader.reload()?`.
+  
+    ```
+
+
 Tantivy 0.8.2
 =====================
 Fixing build for x86_64 platforms. (#496)
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.9.0"
+version = "0.10.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -23,7 +23,7 @@ snap = {version="0.2"}
 atomicwrites = {version="0.2.2", optional=true}
 tempfile = "3.0"
 log = "0.4"
-combine = "3"
+combine = ">=3.6.0,<4.0.0"
 tempdir = "0.3"
 serde = "1.0"
 serde_derive = "1.0"
--- a/src/common/mod.rs
+++ b/src/common/mod.rs
@@ -13,7 +13,6 @@ pub use self::serialize::{BinarySerializable, FixedSize};
 pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
 pub use byteorder::LittleEndian as Endianness;

-
 /// Segment's max doc must be `< MAX_DOC_LIMIT`.
 ///
 /// We do not allow segments with more than
--- a/src/docset.rs
+++ b/src/docset.rs
@@ -1,4 +1,5 @@
 use common::BitSet;
+use fastfield::DeleteBitSet;
 use std::borrow::Borrow;
 use std::borrow::BorrowMut;
 use std::cmp::Ordering;
@@ -95,9 +96,23 @@ pub trait DocSet {
    }

    /// Returns the number documents matching.
-    ///
    /// Calling this method consumes the `DocSet`.
-    fn count(&mut self) -> u32 {
+    fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
+        let mut count = 0u32;
+        while self.advance() {
+            if !delete_bitset.is_deleted(self.doc()) {
+                count += 1u32;
+            }
+        }
+        count
+    }
+
+    /// Returns the count of documents, deleted or not.
+    /// Calling this method consumes the `DocSet`.
+    ///
+    /// Of course, the result is an upper bound of the result
+    /// given by `count()`.
+    fn count_including_deleted(&mut self) -> u32 {
        let mut count = 0u32;
        while self.advance() {
            count += 1u32;
@@ -127,9 +142,14 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
        unboxed.size_hint()
    }

-    fn count(&mut self) -> u32 {
+    fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
        let unboxed: &mut TDocSet = self.borrow_mut();
-        unboxed.count()
+        unboxed.count(delete_bitset)
+    }
+
+    fn count_including_deleted(&mut self) -> u32 {
+        let unboxed: &mut TDocSet = self.borrow_mut();
+        unboxed.count_including_deleted()
    }

    fn append_to_bitset(&mut self, bitset: &mut BitSet) {
--- a/src/postings/block_search.rs
+++ b/src/postings/block_search.rs
@@ -1,3 +1,5 @@
+use postings::compression::AlignedBuffer;
+
 /// This modules define the logic used to search for a doc in a given
 /// block. (at most 128 docs)
 ///
@@ -6,7 +8,7 @@

 #[cfg(target_arch = "x86_64")]
 mod sse2 {
-    use postings::compression::COMPRESSION_BLOCK_SIZE;
+    use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
    use std::arch::x86_64::__m128i as DataType;
    use std::arch::x86_64::_mm_add_epi32 as op_add;
    use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
@@ -23,9 +25,9 @@ mod sse2 {
    ///
    /// There is no early exit here. We simply count the
    /// number of elements that are `< target`.
-    pub fn linear_search_sse2_128(arr: &[u32], target: u32) -> usize {
+    pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
        unsafe {
-            let ptr = arr.as_ptr() as *const DataType;
+            let ptr = arr as *const AlignedBuffer as *const DataType;
            let vkey = set1(target as i32);
            let mut cnt = set0();
            // We work over 4 `__m128i` at a time.
@@ -47,14 +49,16 @@ mod sse2 {
    #[cfg(test)]
    mod test {
        use super::linear_search_sse2_128;
+        use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};

        #[test]
        fn test_linear_search_sse2_128_u32() {
-            for i in 0..23 {
-                dbg!(i);
-                let arr: Vec<u32> = (0..128).map(|el| el * 2 + 1 << 18).collect();
-                assert_eq!(linear_search_sse2_128(&arr, arr[64] + 1), 65);
+            let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
+            for el in 0u32..128u32 {
+                block[el as usize] = el * 2 + 1 << 18;
            }
+            let target = block[64] + 1;
+            assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
        }
    }
 }
@@ -127,17 +131,21 @@ impl BlockSearcher {
    /// then we use a different implementation that does an exhaustive linear search over
    /// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
    /// of branch.
-    pub fn search_in_block(&self, block_docs: &[u32], start: usize, target: u32) -> usize {
+    pub(crate) fn search_in_block(
+        self,
+        block_docs: &AlignedBuffer,
+        len: usize,
+        start: usize,
+        target: u32,
+    ) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            use postings::compression::COMPRESSION_BLOCK_SIZE;
-            if *self == BlockSearcher::SSE2 {
-                if block_docs.len() == COMPRESSION_BLOCK_SIZE {
-                    return sse2::linear_search_sse2_128(block_docs, target);
-                }
+            if self == BlockSearcher::SSE2 && len == COMPRESSION_BLOCK_SIZE {
+                return sse2::linear_search_sse2_128(block_docs, target);
            }
        }
-        start + galloping(&block_docs[start..], target)
+        start + galloping(&block_docs.0[start..len], target)
    }
 }

@@ -158,6 +166,7 @@ mod tests {
    use super::exponential_search;
    use super::linear_search;
    use super::BlockSearcher;
+    use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};

    #[test]
    fn test_linear_search() {
@@ -186,8 +195,19 @@ mod tests {

    fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
        let cursor = search_in_block_trivial_but_slow(block, target);
+        assert!(block.len() < COMPRESSION_BLOCK_SIZE);
+        let mut output_buffer = [u32::max_value(); COMPRESSION_BLOCK_SIZE];
+        output_buffer[..block.len()].copy_from_slice(block);
        for i in 0..cursor {
-            assert_eq!(block_searcher.search_in_block(block, i, target), cursor);
+            assert_eq!(
+                block_searcher.search_in_block(
+                    &AlignedBuffer(output_buffer),
+                    block.len(),
+                    i,
+                    target
+                ),
+                cursor
+            );
        }
    }

--- a/src/postings/compression/mod.rs
+++ b/src/postings/compression/mod.rs
@@ -46,11 +46,11 @@ impl BlockEncoder {
 /// We ensure that the OutputBuffer is align on 128 bits
 /// in order to run SSE2 linear search on it.
 #[repr(align(128))]
-struct OutputBuffer([u32; COMPRESSION_BLOCK_SIZE + 1]);
+pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);

 pub struct BlockDecoder {
    bitpacker: BitPacker4x,
-    output: OutputBuffer,
+    output: AlignedBuffer,
    pub output_len: usize,
 }

@@ -60,11 +60,9 @@ impl BlockDecoder {
    }

    pub fn with_val(val: u32) -> BlockDecoder {
-        let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
-        output[COMPRESSION_BLOCK_SIZE] = 0u32;
        BlockDecoder {
            bitpacker: BitPacker4x::new(),
-            output: OutputBuffer(output),
+            output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
            output_len: 0,
        }
    }
@@ -91,6 +89,11 @@ impl BlockDecoder {
        &self.output.0[..self.output_len]
    }

+    #[inline]
+    pub(crate) fn output_aligned(&self) -> (&AlignedBuffer, usize) {
+        (&self.output, self.output_len)
+    }
+
    #[inline]
    pub fn output(&self, idx: usize) -> u32 {
        self.output.0[idx]
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -55,13 +55,15 @@ pub mod tests {
    use fieldnorm::FieldNormReader;
    use indexer::operation::AddOperation;
    use indexer::SegmentWriter;
+    use merge_policy::NoMergePolicy;
    use query::Scorer;
    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};
-    use schema::Field;
-    use schema::IndexRecordOption;
    use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
+    use schema::{Field, TextOptions};
+    use schema::{IndexRecordOption, TextFieldIndexing};
    use std::iter;
+    use tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
    use DocId;
    use Score;

@@ -160,6 +162,52 @@ pub mod tests {
        }
    }

+    #[test]
+    pub fn test_drop_token_that_are_too_long() {
+        let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
+        let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
+        exceeding_token_text.push_str(" hello");
+        let mut schema_builder = Schema::builder();
+        let text_options = TextOptions::default().set_indexing_options(
+            TextFieldIndexing::default()
+                .set_index_option(IndexRecordOption::WithFreqsAndPositions)
+                .set_tokenizer("simple_no_truncation"),
+        );
+        let text_field = schema_builder.add_text_field("text", text_options);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema.clone());
+        index
+            .tokenizers()
+            .register("simple_no_truncation", SimpleTokenizer);
+        let reader = index.reader().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        index_writer.set_merge_policy(Box::new(NoMergePolicy));
+        {
+            index_writer.add_document(doc!(text_field=>exceeding_token_text));
+            index_writer.commit().unwrap();
+            reader.reload().unwrap();
+            let searcher = reader.searcher();
+            let segment_reader = searcher.segment_reader(0u32);
+            let inverted_index = segment_reader.inverted_index(text_field);
+            assert_eq!(inverted_index.terms().num_terms(), 1);
+            let mut bytes = vec![];
+            assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
+            assert_eq!(&bytes, b"hello");
+        }
+        {
+            index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
+            index_writer.commit().unwrap();
+            reader.reload().unwrap();
+            let searcher = reader.searcher();
+            let segment_reader = searcher.segment_reader(1u32);
+            let inverted_index = segment_reader.inverted_index(text_field);
+            assert_eq!(inverted_index.terms().num_terms(), 1);
+            let mut bytes = vec![];
+            assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
+            assert_eq!(&bytes[..], ok_token_text.as_bytes());
+        }
+    }
+
    #[test]
    pub fn test_position_and_fieldnorm1() {
        let mut positions = Vec::new();
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -12,8 +12,8 @@ use std::io;
 use std::marker::PhantomData;
 use std::ops::DerefMut;
 use termdict::TermOrdinal;
-use tokenizer::Token;
 use tokenizer::TokenStream;
+use tokenizer::{Token, MAX_TOKEN_LEN};
 use DocId;
 use Result;

@@ -210,8 +210,18 @@ pub trait PostingsWriter {
    ) -> u32 {
        let mut term = Term::for_field(field);
        let mut sink = |token: &Token| {
-            term.set_text(token.text.as_str());
-            self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
+            // We skip all tokens whose length (in bytes) exceeds `MAX_TOKEN_LEN`.
+            if token.text.len() <= MAX_TOKEN_LEN {
+                term.set_text(token.text.as_str());
+                self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
+            } else {
+                info!(
+                    "A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
+                     MAX_TOKEN_LEN in the documentation for more information.",
+                    token.text.len(),
+                    MAX_TOKEN_LEN
+                );
+            }
        };
        token_stream.process(&mut sink)
    }
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -4,7 +4,7 @@ use common::{BinarySerializable, VInt};
 use docset::{DocSet, SkipResult};
 use owned_read::OwnedRead;
 use positions::PositionReader;
-use postings::compression::compressed_block_size;
+use postings::compression::{compressed_block_size, AlignedBuffer};
 use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
 use postings::serializer::PostingsSerializer;
 use postings::BlockSearcher;
@@ -130,9 +130,11 @@ impl DocSet for SegmentPostings {
    // next needs to be called a first time to point to the correct element.
    #[inline]
    fn advance(&mut self) -> bool {
-        if self.position_computer.is_some() {
+        if self.position_computer.is_some() && self.cur < COMPRESSION_BLOCK_SIZE {
            let term_freq = self.term_freq() as usize;
-            self.position_computer.as_mut().unwrap().add_skip(term_freq);
+            if let Some(position_computer) = self.position_computer.as_mut() {
+                position_computer.add_skip(term_freq);
+            }
        }
        self.cur += 1;
        if self.cur >= self.block_cursor.block_len() {
@@ -167,7 +169,6 @@ impl DocSet for SegmentPostings {

        // skip blocks until one that might contain the target
        // check if we need to go to the next block
-        let need_positions = self.position_computer.is_some();
        let mut sum_freqs_skipped: u32 = 0;
        if !self
            .block_cursor
@@ -181,7 +182,7 @@ impl DocSet for SegmentPostings {
            // we are not in the right block.
            //
            // First compute all of the freqs skipped from the current block.
-            if need_positions {
+            if self.position_computer.is_some() {
                sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
                match self.block_cursor.skip_to(target) {
                    BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
@@ -200,24 +201,21 @@ impl DocSet for SegmentPostings {
            self.cur = 0;
        }

+        let cur = self.cur;
+
        // we're in the right block now, start with an exponential search
-        let block_docs = self.block_cursor.docs();
+        let (output, len) = self.block_cursor.docs_aligned();
        let new_cur = self
            .block_searcher
-            .search_in_block(&block_docs, self.cur, target);
-        if need_positions {
-            sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
-                .iter()
-                .sum::<u32>();
-            self.position_computer
-                .as_mut()
-                .unwrap()
-                .add_skip(sum_freqs_skipped as usize);
+            .search_in_block(&output, len, cur, target);
+        if let Some(position_computer) = self.position_computer.as_mut() {
+            sum_freqs_skipped += self.block_cursor.freqs()[cur..new_cur].iter().sum::<u32>();
+            position_computer.add_skip(sum_freqs_skipped as usize);
        }
        self.cur = new_cur;

        // `doc` is now the first element >= `target`
-        let doc = block_docs[new_cur];
+        let doc = output.0[new_cur];
        debug_assert!(doc >= target);
        if doc == target {
            SkipResult::Reached
@@ -227,12 +225,16 @@ impl DocSet for SegmentPostings {
    }

    /// Return the current document's `DocId`.
+    ///
+    /// # Panics
+    ///
+    /// Will panic if called without having called `advance` before.
    #[inline]
    fn doc(&self) -> DocId {
        let docs = self.block_cursor.docs();
        debug_assert!(
            self.cur < docs.len(),
-            "Have you forgotten to call `.advance()` at least once before calling .doc()."
+            "Have you forgotten to call `.advance()` at least once before calling `.doc()`                                      ."
        );
        docs[self.cur]
    }
@@ -264,17 +266,33 @@ impl HasLen for SegmentPostings {
 }

 impl Postings for SegmentPostings {
+    /// Returns the frequency associated to the current document.
+    /// If the schema is set up so that no frequency have been encoded,
+    /// this method should always return 1.
+    ///
+    /// # Panics
+    ///
+    /// Will panic if called without having called `advance` before.
    fn term_freq(&self) -> u32 {
+        debug_assert!(
+            // Here we do not use the len of `freqs()`
+            // because it is actually ok to request for the freq of doc
+            // even if no frequency were encoded for the field.
+            //
+            // In that case we hit the block just as if the frequency had been
+            // decoded. The block is simply prefilled by the value 1.
+            self.cur < COMPRESSION_BLOCK_SIZE,
+            "Have you forgotten to call `.advance()` at least once before calling \
+             `.term_freq()`."
+        );
        self.block_cursor.freq(self.cur)
    }

    fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
-        if self.position_computer.is_some() {
-            output.resize(self.term_freq() as usize, 0u32);
-            self.position_computer
-                .as_mut()
-                .unwrap()
-                .positions_with_offset(offset, &mut output[..])
+        let term_freq = self.term_freq() as usize;
+        if let Some(position_comp) = self.position_computer.as_mut() {
+            output.resize(term_freq, 0u32);
+            position_comp.positions_with_offset(offset, &mut output[..]);
        } else {
            output.clear();
        }
@@ -396,6 +414,10 @@ impl BlockSegmentPostings {
        self.doc_decoder.output_array()
    }

+    pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
+        self.doc_decoder.output_aligned()
+    }
+
    /// Return the document at index `idx` of the block.
    #[inline]
    pub fn doc(&self, idx: usize) -> u32 {
@@ -592,6 +614,7 @@ mod tests {
    use common::HasLen;
    use core::Index;
    use docset::DocSet;
+    use postings::postings::Postings;
    use schema::IndexRecordOption;
    use schema::Schema;
    use schema::Term;
@@ -608,6 +631,18 @@ mod tests {
        assert_eq!(postings.len(), 0);
    }

+    #[test]
+    #[should_panic(expected = "Have you forgotten to call `.advance()`")]
+    fn test_panic_if_doc_called_before_advance() {
+        SegmentPostings::empty().doc();
+    }
+
+    #[test]
+    #[should_panic(expected = "Have you forgotten to call `.advance()`")]
+    fn test_panic_if_freq_called_before_advance() {
+        SegmentPostings::empty().term_freq();
+    }
+
    #[test]
    fn test_empty_block_segment_postings() {
        let mut postings = BlockSegmentPostings::empty();
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -14,7 +14,7 @@ use termdict::{TermDictionaryBuilder, TermOrdinal};
 use DocId;
 use Result;

-/// `PostingsSerializer` is in charge of serializing
+/// `InvertedIndexSerializer` is in charge of serializing
 /// postings on disk, in the
 /// * `.idx` (inverted index)
 /// * `.pos` (positions file)
@@ -54,7 +54,7 @@ pub struct InvertedIndexSerializer {
 }

 impl InvertedIndexSerializer {
-    /// Open a new `PostingsSerializer` for the given segment
+    /// Open a new `InvertedIndexSerializer` for the given segment
    fn create(
        terms_write: CompositeWrite<WritePtr>,
        postings_write: CompositeWrite<WritePtr>,
--- a/src/query/term_query/mod.rs
+++ b/src/query/term_query/mod.rs
@@ -98,4 +98,20 @@ mod tests {
        }
    }

+    #[test]
+    fn test_term_query_count_when_there_are_deletes() {
+        let mut schema_builder = Schema::builder();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_with_num_threads(1, 5_000_000).unwrap();
+        index_writer.add_document(doc!(text_field=>"a b"));
+        index_writer.add_document(doc!(text_field=>"a c"));
+        index_writer.delete_term(Term::from_field_text(text_field, "b"));
+        index_writer.commit().unwrap();
+        let term_a = Term::from_field_text(text_field, "a");
+        let term_query = TermQuery::new(term_a, IndexRecordOption::Basic);
+        let reader = index.reader().unwrap();
+        assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1);
+    }
 }
--- a/src/query/term_query/term_weight.rs
+++ b/src/query/term_query/term_weight.rs
@@ -39,15 +39,15 @@ impl Weight for TermWeight {
    }

    fn count(&self, reader: &SegmentReader) -> Result<u32> {
-        if reader.num_deleted_docs() == 0 {
+        if let Some(delete_bitset) = reader.delete_bitset() {
+            Ok(self.scorer(reader)?.count(delete_bitset))
+        } else {
            let field = self.term.field();
            Ok(reader
                .inverted_index(field)
                .get_term_info(&self.term)
                .map(|term_info| term_info.doc_freq)
                .unwrap_or(0))
-        } else {
-            Ok(self.scorer(reader)?.count())
        }
    }
 }
--- a/src/query/union.rs
+++ b/src/query/union.rs
@@ -145,7 +145,7 @@ where
        }
    }

-    fn count(&mut self) -> u32 {
+    fn count_including_deleted(&mut self) -> u32 {
        let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
            .iter()
            .map(|bitset| bitset.len())
@@ -163,6 +163,8 @@ where
        count
    }

+    // TODO implement `count` efficiently.
+
    fn skip_next(&mut self, target: DocId) -> SkipResult {
        if !self.advance() {
            return SkipResult::End;
@@ -300,7 +302,7 @@ mod tests {
            count += 1;
        }
        assert!(!union_expected.advance());
-        assert_eq!(count, make_union().count());
+        assert_eq!(count, make_union().count_including_deleted());
    }

    #[test]
--- a/src/query/weight.rs
+++ b/src/query/weight.rs
@@ -13,6 +13,11 @@ pub trait Weight: Send + Sync + 'static {

    /// Returns the number documents within the given `SegmentReader`.
    fn count(&self, reader: &SegmentReader) -> Result<u32> {
-        Ok(self.scorer(reader)?.count())
+        let mut scorer = self.scorer(reader)?;
+        if let Some(delete_bitset) = reader.delete_bitset() {
+            Ok(scorer.count(delete_bitset))
+        } else {
+            Ok(scorer.count_including_deleted())
+        }
    }
 }
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -97,6 +97,8 @@
 //! If you built your schema programmatically, a complete example
 //! could like this for instance.
 //!
+//! Note that tokens with a length greater than [`MAX_TOKEN_LEN`](./constant.MAX_TOKEN_LEN.html)
+//! are simply ignored downstream.
+//!
 //! # Example
 //!
 //! ```
@@ -157,6 +159,13 @@ pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;

+/// Maximum authorized len (in bytes) for a token.
+///
+/// Tokenizers are in charge of not emitting tokens larger than this value.
+/// Currently, if a faulty tokenizer implementation emits tokens with a length larger than
+/// `2^16 - 1 - 4`, the token will simply be ignored downstream.
+pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
+
 #[cfg(test)]
 pub mod tests {
    use super::{
@@ -228,27 +237,27 @@ pub mod tests {
    fn test_non_en_tokenizer() {
        let tokenizer_manager = TokenizerManager::default();
        tokenizer_manager.register(
-            "es_stem",
+            "el_stem",
            SimpleTokenizer
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser)
-                .filter(Stemmer::new(Language::Spanish)),
+                .filter(Stemmer::new(Language::Greek)),
        );
-        let en_tokenizer = tokenizer_manager.get("es_stem").unwrap();
+        let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
        let mut tokens: Vec<Token> = vec![];
        {
            let mut add_token = |token: &Token| {
                tokens.push(token.clone());
            };
            en_tokenizer
-                .token_stream("Hola, feliz contribuyente!")
+                .token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
                .process(&mut add_token);
        }

        assert_eq!(tokens.len(), 3);
-        assert_token(&tokens[0], 0, "hola", 0, 4);
-        assert_token(&tokens[1], 1, "feliz", 6, 11);
-        assert_token(&tokens[2], 2, "contribuyent", 12, 25);
+        assert_token(&tokens[0], 0, "καλημερ", 0, 16);
+        assert_token(&tokens[1], 1, "χαρουμεν", 18, 36);
+        assert_token(&tokens[2], 2, "φορολογουμεν", 37, 63);
    }

    #[test]
--- a/src/tokenizer/stemmer.rs
+++ b/src/tokenizer/stemmer.rs
@@ -2,7 +2,6 @@

 use super::{Token, TokenFilter, TokenStream};
 use rust_stemmers::{self, Algorithm};
-use std::sync::Arc;

 /// Available stemmer languages.
 #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
@@ -57,14 +56,14 @@ impl Language {
 /// Tokens are expected to be lowercased beforehand.
 #[derive(Clone)]
 pub struct Stemmer {
-    stemmer_algorithm: Arc<Algorithm>,
+    stemmer_algorithm: Algorithm,
 }

 impl Stemmer {
    /// Creates a new Stemmer `TokenFilter` for a given language algorithm.
    pub fn new(language: Language) -> Stemmer {
        Stemmer {
-            stemmer_algorithm: Arc::new(language.algorithm()),
+            stemmer_algorithm: language.algorithm(),
        }
    }
 }
@@ -83,7 +82,7 @@ where
    type ResultTokenStream = StemmerTokenStream<TailTokenStream>;

    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
-        let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
+        let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
        StemmerTokenStream::wrap(inner_stemmer, token_stream)
    }
 }
Author	SHA1	Message	Date
Paul Masurel	edcfa915ff	Added logging when token is dropped.	2019-04-26 09:22:01 +09:00
Paul Masurel	96a4f503ec	Closes #526 (#535 )	2019-04-24 20:59:48 +09:00
Paul Masurel	9df288b0c9	Merge branch 'master' of github.com:tantivy-search/tantivy	2019-04-24 12:31:47 +09:00
Paul Masurel	b7c2d0de97	Clippy2 (#534 ) * Clippy comments Clippy complaints that about the cast of &[u32] to a const __m128i, because of the lack of alignment constraints. This commit passes the OutputBuffer object (which enforces proper alignment) instead of `&[u32]`. Clippy. Block alignment * Code simplification * Added comment. Code simplification * Removed the extraneous freq block len hack.	2019-04-24 12:31:32 +09:00
Paul Masurel	62445e0ec8	Merge branch 'master' of github.com:tantivy-search/tantivy	2019-04-23 09:55:55 +09:00
Paul Masurel	a228825462	Clippy comments (#532 ) Clippy complaints that about the cast of &[u32] to a *const __m128i, because of the lack of alignment constraints. This commit passes the OutputBuffer object (which enforces proper alignment) instead of `&[u32]`.	2019-04-23 09:54:02 +09:00
Paul Masurel	d3eabd14bc	Clippy comments Clippy complaints that about the cast of &[u32] to a *const __m128i, because of the lack of alignment constraints. This commit passes the OutputBuffer object (which enforces proper alignment) instead of `&[u32]`.	2019-04-22 11:16:21 +09:00
petr-tik	c967031d21	Delete files from target/ dir to avoid caching them on CI (#531 ) * Delete files from target/ dir to avoid caching them on CI idea from here https://github.com/rust-lang/cargo/issues/5885#issuecomment-432723546 * Delete examples	2019-04-21 08:02:27 +09:00
Paul Masurel	d823163d52	Closes #527 . (#529 ) Fixing the bug that affects the result of `query.count()` in presence of deletes.	2019-04-19 09:19:50 +09:00
Paul Masurel	c4f59f202d	Bumped combine version	2019-04-11 08:33:56 +09:00
Paul Masurel	acd29b535d	Fix comment	2019-04-02 10:05:14 +09:00
Panagiotis Ktistakis	2cd31bcda2	Fix non english stemmers (#521 )	2019-03-27 08:54:16 +09:00
Paul Masurel	99870de55c	0.10.0-dev	2019-03-25 08:58:26 +09:00
Paul Masurel	cad2d91845	Disabled tests for android	2019-03-24 22:58:46 +09:00
Paul Masurel	79f3cd6cf4	Added instructions to update	2019-03-24 09:10:31 +09:00