Compare commits

...

13 Commits

Author SHA1 Message Date
Paul Masurel
063ed30f66 Added field norm readers 2020-07-20 11:59:43 +09:00
Paul Masurel
6db8bb49d6 Assert nearly equals macro (#853)
* Assert nearly equals macro

* Renamed specialized_scorer in TermScorer
2020-07-17 16:40:41 +09:00
lyj
410aed0176 Update segment_updater.rs (#848) 2020-07-16 12:33:11 +09:00
aptend
00a239a712 fix typo in index_meta.rs (#851) 2020-07-16 12:32:45 +09:00
Paul Masurel
68fe406924 Removed asserts (#850) 2020-07-16 12:24:55 +09:00
Paul Masurel
f71b04acb0 Bugfix. (#849)
go_to_first_doc was typically calling seek with a target smaller than
doc.

Since SegmentPostings typically does a linear search on the full block,
regardless of the current position, this could cause the segment postings to
go backward.
2020-07-16 10:57:51 +09:00
lyj
1ab7f660a4 Update index.rs (#846) 2020-07-02 15:11:38 +09:00
Sean Stangl
0ebbc4cb5a Fix incorrect SimpleTokenizer link in documentation (#844) 2020-07-01 10:26:36 +09:00
lyj
5300cb5da0 Update mod.rs (#845) 2020-07-01 10:25:26 +09:00
Ype Kingma
7d773abc92 Boolean query: do not combine excluded scores. (#840)
* Do nothing when combining score values of excluded scores.

* Add test case for two excluded.

* Test score for two excluded terms.

* Use TopDocs in test_boolean_query_two_excluded
2020-06-08 20:01:19 +09:00
Paul Masurel
c34541ccce Alive doc iterator. (#837) 2020-06-05 19:42:51 +09:00
Paul Masurel
1cc5bd706c Fixes build for no-default-features (#839) 2020-06-05 19:41:55 +09:00
Paul Masurel
4026d183bc Small readability change 2020-06-03 09:04:57 +09:00
38 changed files with 442 additions and 249 deletions

View File

@@ -117,11 +117,16 @@ fn main() -> tantivy::Result<()> {
if let Some(mut block_segment_postings) =
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)
{
while block_segment_postings.advance() {
loop {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
// Once again these docs MAY contain deleted documents as well.
let docs = block_segment_postings.docs();
// Prints `Docs [0, 2].`
println!("Docs {:?}", docs);
block_segment_postings.advance();
}
}
}
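
The example above now loops until `docs()` returns an empty block instead of testing the return value of `advance()`. Below is a minimal standalone sketch of that contract, using a hypothetical `ToyBlockCursor` rather than tantivy's real `BlockSegmentPostings`:

struct ToyBlockCursor {
    blocks: Vec<Vec<u32>>, // pre-decoded blocks of doc ids
    pos: usize,
}

impl ToyBlockCursor {
    fn docs(&self) -> &[u32] {
        self.blocks.get(self.pos).map(Vec::as_slice).unwrap_or(&[])
    }
    fn advance(&mut self) {
        self.pos += 1;
    }
}

fn main() {
    let mut cursor = ToyBlockCursor { blocks: vec![vec![0, 2], vec![5, 7]], pos: 0 };
    loop {
        let docs = cursor.docs();
        if docs.is_empty() {
            break; // an empty block signals exhaustion
        }
        println!("Docs {:?}", docs);
        cursor.advance();
    }
}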

View File

@@ -24,8 +24,10 @@ use crate::IndexWriter;
use std::borrow::BorrowMut;
use std::collections::HashSet;
use std::fmt;
#[cfg(feature = "mmap")]
use std::path::{Path, PathBuf};
use std::path::Path;
use std::path::PathBuf;
use std::sync::Arc;
fn load_metas(
@@ -281,7 +283,7 @@ impl Index {
TantivyError::LockFailure(
err,
Some(
"Failed to acquire index lock. If you are using\
"Failed to acquire index lock. If you are using \
a regular directory, this means there is already an \
`IndexWriter` working on this `Directory`, in this process \
or in a different process."

View File

@@ -213,7 +213,7 @@ pub struct IndexMeta {
#[serde(skip_serializing_if = "Option::is_none")]
/// Payload associated to the last commit.
///
/// Upon commit, clients can optionally add a small `Striing` payload to their commit
/// Upon commit, clients can optionally add a small `String` payload to their commit
/// to help identify this commit.
/// This payload is entirely unused by tantivy.
pub payload: Option<String>,

View File

@@ -8,7 +8,7 @@ use crate::directory::ReadOnlySource;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders;
use crate::fieldnorm::FieldNormReader;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::schema::Field;
use crate::schema::FieldType;
use crate::schema::Schema;
@@ -48,7 +48,7 @@ pub struct SegmentReader {
positions_composite: CompositeFile,
positions_idx_composite: CompositeFile,
fast_fields_readers: Arc<FastFieldReaders>,
fieldnorms_composite: CompositeFile,
fieldnorm_readers: FieldNormReaders,
store_source: ReadOnlySource,
delete_bitset_opt: Option<DeleteBitSet>,
@@ -126,8 +126,8 @@ impl SegmentReader {
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) {
FieldNormReader::open(fieldnorm_source)
if let Some(fieldnorm_reader) = self.fieldnorm_readers.get_field(field) {
fieldnorm_reader
} else {
let field_name = self.schema.get_field_name(field);
let err_msg = format!(
@@ -178,8 +178,8 @@ impl SegmentReader {
let fast_field_readers =
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?;
let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
@@ -195,7 +195,7 @@ impl SegmentReader {
termdict_composite,
postings_composite,
fast_fields_readers: fast_field_readers,
fieldnorms_composite,
fieldnorm_readers,
segment_id: segment.id(),
store_source,
delete_bitset_opt,
@@ -295,8 +295,8 @@ impl SegmentReader {
}
/// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator<'_> {
SegmentReaderAliveDocsIterator::new(&self)
pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a {
(0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
}
/// Summarize total space usage of this segment.
@@ -308,7 +308,7 @@ impl SegmentReader {
self.positions_composite.space_usage(),
self.positions_idx_composite.space_usage(),
self.fast_fields_readers.space_usage(),
self.fieldnorms_composite.space_usage(),
self.fieldnorm_readers.space_usage(),
self.get_store_reader().space_usage(),
self.delete_bitset_opt
.as_ref()
@@ -324,52 +324,6 @@ impl fmt::Debug for SegmentReader {
}
}
/// Implements the iterator trait to allow easy iteration
/// over non-deleted ("alive") DocIds in a SegmentReader
pub struct SegmentReaderAliveDocsIterator<'a> {
reader: &'a SegmentReader,
max_doc: DocId,
current: DocId,
}
impl<'a> SegmentReaderAliveDocsIterator<'a> {
pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
SegmentReaderAliveDocsIterator {
reader,
max_doc: reader.max_doc(),
current: 0,
}
}
}
impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
type Item = DocId;
fn next(&mut self) -> Option<Self::Item> {
// TODO: Use TinySet (like in BitSetDocSet) to speed this process up
if self.current >= self.max_doc {
return None;
}
// find the next alive doc id
while self.reader.is_deleted(self.current) {
self.current += 1;
if self.current >= self.max_doc {
return None;
}
}
// capture the current alive DocId
let result = Some(self.current);
// move down the chain
self.current += 1;
result
}
}
#[cfg(test)]
mod test {
use crate::core::Index;
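
The hand-written `SegmentReaderAliveDocsIterator` above is replaced by a plain range-plus-filter iterator. A self-contained sketch of the same pattern, with a `HashSet` standing in for tantivy's delete bitset:

use std::collections::HashSet;

// Alive doc ids are produced by filtering 0..max_doc against the delete set.
fn doc_ids_alive<'a>(max_doc: u32, deleted: &'a HashSet<u32>) -> impl Iterator<Item = u32> + 'a {
    (0u32..max_doc).filter(move |doc| !deleted.contains(doc))
}

fn main() {
    let deleted: HashSet<u32> = [1u32, 3].iter().copied().collect();
    let alive: Vec<u32> = doc_ids_alive(5, &deleted).collect();
    assert_eq!(alive, vec![0, 2, 4]);
}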

View File

@@ -38,6 +38,7 @@ pub trait DocSet {
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a DocSet.
fn seek(&mut self, target: DocId) -> DocId {
let mut doc = self.doc();
debug_assert!(doc <= target);
while doc < target {
doc = self.advance();
}
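
The new `debug_assert!(doc <= target)` documents that the default `seek` only moves forward. A toy sketch of that contract, using a simplified `ToyDocSet` trait rather than tantivy's:

const TERMINATED: u32 = u32::MAX;

trait ToyDocSet {
    fn doc(&self) -> u32;
    fn advance(&mut self) -> u32;
    fn seek(&mut self, target: u32) -> u32 {
        let mut doc = self.doc();
        // Seeking behind the cursor is a bug; surface it in debug builds.
        debug_assert!(doc <= target, "seek target must not be behind the cursor");
        while doc < target {
            doc = self.advance();
        }
        doc
    }
}

struct ToyVecDocSet { docs: Vec<u32>, cursor: usize }

impl ToyDocSet for ToyVecDocSet {
    fn doc(&self) -> u32 {
        self.docs.get(self.cursor).copied().unwrap_or(TERMINATED)
    }
    fn advance(&mut self) -> u32 {
        self.cursor += 1;
        self.doc()
    }
}

fn main() {
    let mut docset = ToyVecDocSet { docs: vec![1, 4, 9], cursor: 0 };
    assert_eq!(docset.seek(4), 4);
    assert_eq!(docset.seek(5), 9); // lands on the first doc >= target
}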

View File

@@ -21,7 +21,7 @@ mod reader;
mod serializer;
mod writer;
pub use self::reader::FieldNormReader;
pub use self::reader::{FieldNormReader, FieldNormReaders};
pub use self::serializer::FieldNormsSerializer;
pub use self::writer::FieldNormsWriter;

View File

@@ -1,6 +1,41 @@
use super::{fieldnorm_to_id, id_to_fieldnorm};
use crate::common::CompositeFile;
use crate::directory::ReadOnlySource;
use crate::schema::Field;
use crate::space_usage::PerFieldSpaceUsage;
use crate::DocId;
use std::sync::Arc;
/// Reader for the fieldnorm (for each document, the number of tokens indexed in the
/// field) of all indexed fields in the index.
///
Each fieldnorm is compressed to approximately one byte. We refer to this byte as
the `fieldnorm_id`.
The mapping from `fieldnorm` to `fieldnorm_id` is given by a monotonic function.
#[derive(Clone)]
pub struct FieldNormReaders {
data: Arc<CompositeFile>,
}
impl FieldNormReaders {
/// Creates a field norm reader.
pub fn new(source: ReadOnlySource) -> crate::Result<FieldNormReaders> {
let data = CompositeFile::open(&source)?;
Ok(FieldNormReaders {
data: Arc::new(data),
})
}
/// Returns the FieldNormReader for a specific field.
pub fn get_field(&self, field: Field) -> Option<FieldNormReader> {
self.data.open_read(field).map(FieldNormReader::open)
}
/// Return a break down of the space usage per field.
pub fn space_usage(&self) -> PerFieldSpaceUsage {
self.data.space_usage()
}
}
/// Reads the fieldnorm associated to a document.
/// The fieldnorm represents the length associated to
@@ -19,6 +54,7 @@ use crate::DocId;
/// Apart from compression, this scale also makes it possible to
/// precompute computationally expensive functions of the fieldnorm
/// in a very short array.
#[derive(Clone)]
pub struct FieldNormReader {
data: ReadOnlySource,
}
@@ -29,6 +65,11 @@ impl FieldNormReader {
FieldNormReader { data }
}
/// Returns the number of documents in this segment.
pub fn num_docs(&self) -> u32 {
self.data.len() as u32
}
/// Returns the `fieldnorm` associated to a doc id.
/// The fieldnorm is a value approximating the number
/// of tokens in a given field of the `doc_id`.
@@ -65,10 +106,11 @@ impl FieldNormReader {
}
#[cfg(test)]
impl From<Vec<u32>> for FieldNormReader {
fn from(field_norms: Vec<u32>) -> FieldNormReader {
impl From<&[u32]> for FieldNormReader {
fn from(field_norms: &[u32]) -> FieldNormReader {
let field_norms_id = field_norms
.into_iter()
.iter()
.cloned()
.map(FieldNormReader::fieldnorm_to_id)
.collect::<Vec<u8>>();
let field_norms_data = ReadOnlySource::from(field_norms_id);
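
To illustrate the docstring above, here is a toy sketch of a monotonic one-byte fieldnorm encoding; the lookup table is made up and far smaller than the one tantivy actually uses:

// Each fieldnorm (token count) is quantized to a single byte through a
// monotonic mapping, so comparisons on ids agree with comparisons on values.
const TOY_TABLE: [u32; 6] = [0, 1, 2, 4, 8, 16];

fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
    // Largest id whose decoded value does not exceed the fieldnorm.
    match TOY_TABLE.binary_search(&fieldnorm) {
        Ok(idx) => idx as u8,
        Err(idx) => (idx - 1) as u8,
    }
}

fn id_to_fieldnorm(id: u8) -> u32 {
    TOY_TABLE[id as usize]
}

fn main() {
    assert_eq!(fieldnorm_to_id(5), 3); // 5 tokens rounds down to 4
    assert_eq!(id_to_fieldnorm(3), 4);
    // Monotonic: larger fieldnorms never map to smaller ids.
    assert!(fieldnorm_to_id(3) <= fieldnorm_to_id(9));
}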

View File

@@ -78,11 +78,12 @@ impl FieldNormsWriter {
}
/// Serialize the seen fieldnorm values to the serializer for all fields.
pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
for &field in self.fields.iter() {
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
}
fieldnorms_serializer.close()?;
Ok(())
}
}

View File

@@ -167,7 +167,7 @@ impl IndexMerger {
fn write_fieldnorms(
&self,
fieldnorms_serializer: &mut FieldNormsSerializer,
mut fieldnorms_serializer: FieldNormsSerializer,
) -> crate::Result<()> {
let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
@@ -182,6 +182,7 @@ impl IndexMerger {
}
fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?;
}
fieldnorms_serializer.close()?;
Ok(())
}
@@ -589,48 +590,45 @@ impl IndexMerger {
// of all of the segments containing the given term.
//
// These segments are non-empty and advance has already been called.
if !segment_postings.is_empty() {
// If not, the `term` will be entirely removed.
// We know that there is at least one document containing
// the term, so we add it.
let to_term_ord = field_serializer.new_term(term_bytes)?;
if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
}
}
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
let mut doc = segment_postings.doc();
while doc != TERMINATED {
// deleted doc are skipped as they do not have a `remapped_doc_id`.
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
// we make sure to only write the term iff
// there is at least one document.
let term_freq = segment_postings.term_freq();
segment_postings.positions(&mut positions_buffer);
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(
remapped_doc_id,
term_freq,
delta_positions,
)?;
}
doc = segment_postings.advance();
}
}
// closing the term.
field_serializer.close_term()?;
if segment_postings.is_empty() {
continue;
}
// If not, the `term` will be entirely removed.
// We know that there is at least one document containing
// the term, so we add it.
let to_term_ord = field_serializer.new_term(term_bytes)?;
if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
}
}
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
let mut doc = segment_postings.doc();
while doc != TERMINATED {
// deleted doc are skipped as they do not have a `remapped_doc_id`.
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
// we make sure to only write the term iff
// there is at least one document.
let term_freq = segment_postings.term_freq();
segment_postings.positions(&mut positions_buffer);
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions)?;
}
doc = segment_postings.advance();
}
}
// closing the term.
field_serializer.close_term()?;
}
field_serializer.close()?;
Ok(term_ord_mapping_opt)
@@ -671,8 +669,10 @@ impl IndexMerger {
impl SerializableSegment for IndexMerger {
fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
self.write_fieldnorms(fieldnorms_serializer)?;
}
let term_ord_mappings = self.write_postings(serializer.get_postings_serializer())?;
self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?;
self.write_storable_fields(serializer.get_store_writer())?;
serializer.close()?;
@@ -1507,12 +1507,9 @@ mod tests {
for i in 0..100 {
let mut doc = Document::new();
doc.add_f64(field, 42.0);
doc.add_f64(multi_field, 0.24);
doc.add_f64(multi_field, 0.27);
writer.add_document(doc);
if i % 5 == 0 {
writer.commit()?;
}
@@ -1524,7 +1521,6 @@ mod tests {
// If a merging thread fails, we should end up with more
// than one segment here
assert_eq!(1, index.searchable_segments()?.len());
Ok(())
}
}

View File

@@ -8,15 +8,16 @@ use crate::store::StoreWriter;
/// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`.
pub struct SegmentSerializer {
segment: Segment,
store_writer: StoreWriter,
fast_field_serializer: FastFieldSerializer,
fieldnorms_serializer: FieldNormsSerializer,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
}
impl SegmentSerializer {
/// Creates a new `SegmentSerializer`.
pub fn for_segment(segment: &mut Segment) -> crate::Result<SegmentSerializer> {
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
let store_write = segment.open_write(SegmentComponent::STORE)?;
let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?;
@@ -25,15 +26,21 @@ impl SegmentSerializer {
let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
let postings_serializer = InvertedIndexSerializer::open(segment)?;
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
Ok(SegmentSerializer {
segment,
store_writer: StoreWriter::new(store_write),
fast_field_serializer,
fieldnorms_serializer,
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,
})
}
#[allow(dead_code)]
pub fn segment(&self) -> &Segment {
&self.segment
}
/// Accessor to the `PostingsSerializer`.
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
&mut self.postings_serializer
@@ -44,9 +51,11 @@ impl SegmentSerializer {
&mut self.fast_field_serializer
}
/// Accessor to the field norm serializer.
pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
&mut self.fieldnorms_serializer
/// Extract the field norm serializer.
///
/// Note the fieldnorms serializer can only be extracted once.
pub fn extract_fieldnorms_serializer(&mut self) -> Option<FieldNormsSerializer> {
self.fieldnorms_serializer.take()
}
/// Accessor to the `StoreWriter`.
@@ -55,11 +64,13 @@ impl SegmentSerializer {
}
/// Finalize the segment serialization.
pub fn close(self) -> crate::Result<()> {
pub fn close(mut self) -> crate::Result<()> {
if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
fieldnorms_serializer.close()?;
}
self.fast_field_serializer.close()?;
self.postings_serializer.close()?;
self.store_writer.close()?;
self.fieldnorms_serializer.close()?;
Ok(())
}
}
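
The fieldnorms serializer is now stored as an `Option` and handed out at most once via `Option::take`. A self-contained sketch of that ownership pattern, with stand-in types (`ToySegmentSerializer`, `ToyFieldNormsSerializer`) rather than tantivy's:

struct ToyFieldNormsSerializer;

impl ToyFieldNormsSerializer {
    fn close(self) -> std::io::Result<()> {
        println!("fieldnorms closed");
        Ok(())
    }
}

struct ToySegmentSerializer {
    fieldnorms_serializer: Option<ToyFieldNormsSerializer>,
}

impl ToySegmentSerializer {
    fn extract_fieldnorms_serializer(&mut self) -> Option<ToyFieldNormsSerializer> {
        self.fieldnorms_serializer.take() // can only succeed once
    }
    fn close(mut self) -> std::io::Result<()> {
        // Only finalize the fieldnorms if nobody extracted them earlier.
        if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
            fieldnorms_serializer.close()?;
        }
        Ok(())
    }
}

fn main() -> std::io::Result<()> {
    let mut serializer = ToySegmentSerializer { fieldnorms_serializer: Some(ToyFieldNormsSerializer) };
    // The writer path takes the serializer by value and is responsible for closing it.
    if let Some(fieldnorms) = serializer.extract_fieldnorms_serializer() {
        fieldnorms.close()?;
    }
    assert!(serializer.extract_fieldnorms_serializer().is_none());
    serializer.close()
}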

View File

@@ -112,7 +112,7 @@ fn merge(
target_opstamp: Opstamp,
) -> crate::Result<SegmentEntry> {
// first we need to apply deletes to our segment.
let mut merged_segment = index.new_segment();
let merged_segment = index.new_segment();
// First we apply all of the deletes to the merged segment, up to the target opstamp.
for segment_entry in &mut segment_entries {
@@ -131,12 +131,13 @@ fn merge(
let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
// ... we just serialize this index merger in our new segment to merge the two segments.
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?;
let num_docs = merger.write(segment_serializer)?;
let segment_meta = index.new_segment_meta(merged_segment.id(), num_docs);
let merged_segment_id = merged_segment.id();
let segment_meta = index.new_segment_meta(merged_segment_id, num_docs);
Ok(SegmentEntry::new(segment_meta, delete_cursor, None))
}
@@ -521,7 +522,7 @@ impl SegmentUpdater {
///
/// Upon termination of the current merging threads,
/// merge opportunity may appear.
//
///
/// We keep waiting until the merge policy judges that
/// no opportunity is available.
///

View File

@@ -62,11 +62,12 @@ impl SegmentWriter {
/// - schema
pub fn for_segment(
memory_budget: usize,
mut segment: Segment,
segment: Segment,
schema: &Schema,
) -> crate::Result<SegmentWriter> {
let tokenizer_manager = segment.index().tokenizers().clone();
let table_num_bits = initial_table_size(memory_budget)?;
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
let segment_serializer = SegmentSerializer::for_segment(segment)?;
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
let tokenizers = schema
.fields()
@@ -76,7 +77,7 @@ impl SegmentWriter {
.get_indexing_options()
.and_then(|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer();
segment.index().tokenizers().get(tokenizer_name)
tokenizer_manager.get(tokenizer_name)
}),
_ => None,
},
@@ -280,9 +281,11 @@ fn write(
fieldnorms_writer: &FieldNormsWriter,
mut serializer: SegmentSerializer,
) -> crate::Result<()> {
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
fieldnorms_writer.serialize(fieldnorms_serializer)?;
}
let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?;
fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?;
fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;
serializer.close()?;
Ok(())
}

View File

@@ -298,17 +298,26 @@ mod tests {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
pub fn assert_nearly_equals(expected: f32, val: f32) {
assert!(
nearly_equals(val, expected),
"Got {}, expected {}.",
val,
expected
);
}
pub fn nearly_equals(a: f32, b: f32) -> bool {
(a - b).abs() < 0.0005 * (a + b).abs()
/// Checks if left and right are close to each other.
/// Panics if the two values are more than 0.5% apart.
#[macro_export]
macro_rules! assert_nearly_equals {
($left:expr, $right:expr) => {{
match (&$left, &$right) {
(left_val, right_val) => {
let diff = (left_val - right_val).abs();
let add = left_val.abs() + right_val.abs();
if diff > 0.0005 * add {
panic!(
r#"assertion failed: `(left ~= right)`
left: `{:?}`,
right: `{:?}`"#,
&*left_val, &*right_val
)
}
}
}
}};
}
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
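
For call sites, the change is from the `assert_nearly_equals(a, b)` helper function to the `assert_nearly_equals!(a, b)` macro. A standalone sketch of the same relative-tolerance check and how it is used:

// Simplified standalone copy of the check above, for illustration only.
macro_rules! assert_nearly_equals {
    ($left:expr, $right:expr) => {{
        let (left_val, right_val) = ($left, $right);
        let diff = (left_val - right_val).abs();
        let add = left_val.abs() + right_val.abs();
        if diff > 0.0005 * add {
            panic!("assertion failed: `(left ~= right)` left: `{:?}`, right: `{:?}`", left_val, right_val);
        }
    }};
}

fn main() {
    let score: f32 = 0.6931473;
    assert_nearly_equals!(score, 0.6931472f32); // passes: well within the macro's tolerance
}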

View File

@@ -47,7 +47,6 @@ fn decode_vint_block(
doc_offset: DocId,
num_vint_docs: usize,
) {
doc_decoder.clear();
let num_consumed_bytes = doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs);
if let Some(freq_decoder) = freq_decoder_opt {
freq_decoder.uncompress_vint_unsorted(&data[num_consumed_bytes..], num_vint_docs);
@@ -99,7 +98,7 @@ impl BlockSegmentPostings {
data: postings_data,
skip_reader,
};
block_segment_postings.advance();
block_segment_postings.load_block();
block_segment_postings
}
@@ -117,13 +116,13 @@ impl BlockSegmentPostings {
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, postings_data);
self.data = ReadOnlySource::new(postings_data);
self.loaded_offset = std::usize::MAX;
self.loaded_offset = std::usize::MAX;
if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data, doc_freq);
} else {
self.skip_reader.reset(ReadOnlySource::empty(), doc_freq);
}
self.doc_freq = doc_freq as usize;
self.load_block();
}
/// Returns the document frequency associated to this block postings.
@@ -215,6 +214,10 @@ impl BlockSegmentPostings {
);
}
BlockInfo::VInt(num_vint_docs) => {
self.doc_decoder.clear();
if num_vint_docs == 0 {
return;
}
decode_vint_block(
&mut self.doc_decoder,
if let FreqReadingOption::ReadFreq = self.freq_reading_option {
@@ -233,12 +236,9 @@ impl BlockSegmentPostings {
/// Advance to the next block.
///
/// Returns false iff there was no remaining blocks.
pub fn advance(&mut self) -> bool {
if !self.skip_reader.advance() {
return false;
}
pub fn advance(&mut self) {
self.skip_reader.advance();
self.load_block();
true
}
/// Returns an empty segment postings object
@@ -294,7 +294,8 @@ mod tests {
#[test]
fn test_empty_block_segment_postings() {
let mut postings = BlockSegmentPostings::empty();
assert!(!postings.advance());
postings.advance();
assert!(postings.docs().is_empty());
assert_eq!(postings.doc_freq(), 0);
}
@@ -306,13 +307,14 @@ mod tests {
assert_eq!(block_segments.doc_freq(), 100_000);
loop {
let block = block_segments.docs();
if block.is_empty() {
break;
}
for (i, doc) in block.iter().cloned().enumerate() {
assert_eq!(offset + (i as u32), doc);
}
offset += block.len() as u32;
if block_segments.advance() {
break;
}
block_segments.advance();
}
}
@@ -421,7 +423,6 @@ mod tests {
let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
}
assert!(block_segments.advance());
assert_eq!(block_segments.docs(), &[1, 3, 5]);
}
}

View File

@@ -109,6 +109,7 @@ impl BlockDecoder {
}
pub fn clear(&mut self) {
self.output_len = 0;
self.output.0.iter_mut().for_each(|el| *el = TERMINATED);
}
}
@@ -244,6 +245,19 @@ pub mod tests {
}
}
#[test]
fn test_clearing() {
let mut encoder = BlockEncoder::new();
let vals = (0u32..128u32).map(|i| i * 3).collect::<Vec<_>>();
let (num_bits, compressed) = encoder.compress_block_sorted(&vals[..], 0u32);
let mut decoder = BlockDecoder::default();
decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
assert_eq!(decoder.output_len, 128);
assert_eq!(decoder.output_array(), &vals[..]);
decoder.clear();
assert!(decoder.output_array().is_empty());
}
#[test]
fn test_encode_unsorted_block_with_junk() {
let mut compressed: Vec<u8> = Vec::new();
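
`clear()` now also fills the decoder's output buffer with `TERMINATED`. A toy sketch of why that padding matters: a scan for the first doc greater than or equal to a target always stops inside the buffer (sizes and types here are simplified stand-ins):

const TERMINATED: u32 = u32::MAX;
const BLOCK_SIZE: usize = 8; // tantivy uses 128; shrunk here for the example

struct ToyBlockDecoder {
    output: [u32; BLOCK_SIZE],
    output_len: usize,
}

impl ToyBlockDecoder {
    fn new() -> Self {
        ToyBlockDecoder { output: [TERMINATED; BLOCK_SIZE], output_len: 0 }
    }
    fn clear(&mut self) {
        self.output_len = 0;
        self.output.iter_mut().for_each(|el| *el = TERMINATED);
    }
    fn load(&mut self, docs: &[u32]) {
        self.clear();
        self.output[..docs.len()].copy_from_slice(docs);
        self.output_len = docs.len();
    }
    fn search(&self, target: u32) -> usize {
        // Always finds an index because the padding is TERMINATED (= u32::MAX).
        self.output.iter().position(|&doc| doc >= target).unwrap()
    }
}

fn main() {
    let mut decoder = ToyBlockDecoder::new();
    decoder.load(&[3, 7, 12]);
    assert_eq!(decoder.search(8), 2);   // lands on 12
    assert_eq!(decoder.search(100), 3); // lands on the TERMINATED padding
    decoder.clear();
    assert_eq!(decoder.output_len, 0);
}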

View File

@@ -582,6 +582,9 @@ pub mod tests {
) {
for target in targets {
let mut postings_opt = postings_factory();
if target < postings_opt.doc() {
continue;
}
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
let skip_result_opt = postings_opt.seek(target);
let skip_result_unopt = postings_unopt.seek(target);

View File

@@ -100,14 +100,15 @@ impl DocSet for SegmentPostings {
}
fn seek(&mut self, target: DocId) -> DocId {
if self.doc() == target {
return target;
debug_assert!(self.doc() <= target);
if self.doc() >= target {
return self.doc();
}
self.block_cursor.seek(target);
// At this point we are on the block, that might contain our document.
let output = self.block_cursor.docs_aligned();
self.cur = self.block_searcher.search_in_block(&output, target);
// The last block is not full and padded with the value TERMINATED,
@@ -123,6 +124,7 @@ impl DocSet for SegmentPostings {
// After the search, the cursor should point to the first value of TERMINATED.
let doc = output.0[self.cur];
debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc());
doc
}

View File

@@ -81,25 +81,41 @@ impl Default for BlockInfo {
impl SkipReader {
pub fn new(data: ReadOnlySource, doc_freq: u32, skip_info: IndexRecordOption) -> SkipReader {
SkipReader {
last_doc_in_block: 0u32,
let mut skip_reader = SkipReader {
last_doc_in_block: if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
0
} else {
TERMINATED
},
last_doc_in_previous_block: 0u32,
owned_read: OwnedRead::new(data),
skip_info,
block_info: BlockInfo::default(),
block_info: BlockInfo::VInt(doc_freq),
byte_offset: 0,
remaining_docs: doc_freq,
position_offset: 0u64,
};
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
skip_reader.read_block_info();
}
skip_reader
}
pub fn reset(&mut self, data: ReadOnlySource, doc_freq: u32) {
self.last_doc_in_block = 0u32;
self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
0
} else {
TERMINATED
};
self.last_doc_in_previous_block = 0u32;
self.owned_read = OwnedRead::new(data);
self.block_info = BlockInfo::default();
self.block_info = BlockInfo::VInt(doc_freq);
self.byte_offset = 0;
self.remaining_docs = doc_freq;
self.position_offset = 0u64;
if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
self.read_block_info();
}
}
#[cfg(test)]
@@ -165,7 +181,7 @@ impl SkipReader {
}
}
pub fn advance(&mut self) -> bool {
pub fn advance(&mut self) {
match self.block_info {
BlockInfo::BitPacked {
doc_num_bits,
@@ -177,17 +193,17 @@ impl SkipReader {
self.position_offset += tf_sum as u64;
}
BlockInfo::VInt(num_vint_docs) => {
self.remaining_docs -= num_vint_docs;
debug_assert_eq!(num_vint_docs, self.remaining_docs);
self.remaining_docs = 0;
self.byte_offset = std::usize::MAX;
}
}
self.last_doc_in_previous_block = self.last_doc_in_block;
if self.remaining_docs >= COMPRESSION_BLOCK_SIZE as u32 {
self.read_block_info();
true
} else {
self.last_doc_in_block = TERMINATED;
self.block_info = BlockInfo::VInt(self.remaining_docs);
self.remaining_docs > 0
}
}
}
@@ -217,7 +233,6 @@ mod tests {
doc_freq,
IndexRecordOption::WithFreqs,
);
assert!(skip_reader.advance());
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info(),
@@ -227,7 +242,7 @@ mod tests {
tf_sum: 0
}
);
assert!(skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.last_doc_in_block(), 5u32);
assert_eq!(
skip_reader.block_info(),
@@ -237,9 +252,12 @@ mod tests {
tf_sum: 0
}
);
assert!(skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(3u32));
assert!(!skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
}
#[test]
@@ -256,7 +274,6 @@ mod tests {
doc_freq,
IndexRecordOption::Basic,
);
assert!(skip_reader.advance());
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info(),
@@ -266,7 +283,7 @@ mod tests {
tf_sum: 0u32
}
);
assert!(skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.last_doc_in_block(), 5u32);
assert_eq!(
skip_reader.block_info(),
@@ -276,9 +293,12 @@ mod tests {
tf_sum: 0u32
}
);
assert!(skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(3u32));
assert!(!skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
}
#[test]
@@ -294,7 +314,6 @@ mod tests {
doc_freq,
IndexRecordOption::Basic,
);
assert!(skip_reader.advance());
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info(),
@@ -304,6 +323,7 @@ mod tests {
tf_sum: 0u32
}
);
assert!(!skip_reader.advance());
skip_reader.advance();
assert_eq!(skip_reader.block_info(), BlockInfo::VInt(0u32));
}
}
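
The `SkipReader` is now positioned on its first block at construction time, so the tests above no longer begin with an `advance()` call. A rough, simplified model of that state machine (toy block size, no real decoding):

const BLOCK: u32 = 4; // stands in for COMPRESSION_BLOCK_SIZE (128)

#[derive(Debug, PartialEq)]
enum BlockInfo {
    BitPacked { docs: u32 },
    VInt(u32),
}

struct ToySkipReader {
    remaining_docs: u32,
    block_info: BlockInfo,
}

impl ToySkipReader {
    fn new(doc_freq: u32) -> ToySkipReader {
        // Pre-positioned on the first block instead of a neutral "before start" state.
        let block_info = if doc_freq >= BLOCK {
            BlockInfo::BitPacked { docs: BLOCK }
        } else {
            BlockInfo::VInt(doc_freq)
        };
        ToySkipReader { remaining_docs: doc_freq, block_info }
    }
    fn advance(&mut self) {
        match self.block_info {
            BlockInfo::BitPacked { docs } => self.remaining_docs -= docs,
            BlockInfo::VInt(docs) => self.remaining_docs -= docs,
        }
        self.block_info = if self.remaining_docs >= BLOCK {
            BlockInfo::BitPacked { docs: BLOCK }
        } else {
            BlockInfo::VInt(self.remaining_docs)
        };
    }
}

fn main() {
    let mut reader = ToySkipReader::new(9); // 4 + 4 packed docs, then 1 doc as a VInt tail
    assert_eq!(reader.block_info, BlockInfo::BitPacked { docs: BLOCK });
    reader.advance();
    assert_eq!(reader.block_info, BlockInfo::BitPacked { docs: BLOCK });
    reader.advance();
    assert_eq!(reader.block_info, BlockInfo::VInt(1));
    reader.advance();
    assert_eq!(reader.block_info, BlockInfo::VInt(0)); // stays empty, like the new tests
}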

View File

@@ -43,7 +43,6 @@ where
fn scorer(&self, reader: &SegmentReader, boost: f32) -> Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field);
let term_dict = inverted_index.terms();
let mut term_stream = self.automaton_stream(term_dict);
@@ -52,12 +51,14 @@ where
let mut block_segment_postings = inverted_index
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
loop {
for &doc in block_segment_postings.docs() {
doc_bitset.insert(doc);
}
if !block_segment_postings.advance() {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
for &doc in docs {
doc_bitset.insert(doc);
}
block_segment_postings.advance();
}
}
let doc_bitset = BitSetDocSet::from(doc_bitset);

View File

@@ -139,10 +139,10 @@ impl BM25Weight {
mod tests {
use super::idf;
use crate::tests::assert_nearly_equals;
use crate::assert_nearly_equals;
#[test]
fn test_idf() {
assert_nearly_equals(idf(1, 2), 0.6931472);
assert_nearly_equals!(idf(1, 2), 0.6931472);
}
}

View File

@@ -94,7 +94,7 @@ impl BooleanWeight {
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::MustNot)
.map(scorer_union::<TScoreCombiner>)
.map(scorer_union::<DoNothingCombiner>)
.map(Into::into);
let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
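
The one-line change above routes `Occur::MustNot` scorers through a combiner that ignores score contributions, so excluded terms can filter documents but never change the score. A self-contained sketch of the idea, with simplified stand-ins for tantivy's `ScoreCombiner` implementations:

trait ScoreCombiner: Default {
    fn update(&mut self, score: f32);
    fn score(&self) -> f32;
}

#[derive(Default)]
struct SumCombiner { sum: f32 }

impl ScoreCombiner for SumCombiner {
    fn update(&mut self, score: f32) { self.sum += score; }
    fn score(&self) -> f32 { self.sum }
}

#[derive(Default)]
struct DoNothingCombiner;

impl ScoreCombiner for DoNothingCombiner {
    fn update(&mut self, _score: f32) {} // excluded scorers contribute nothing
    fn score(&self) -> f32 { 0.0 }
}

fn combine<C: ScoreCombiner>(scores: &[f32]) -> f32 {
    let mut combiner = C::default();
    for &score in scores {
        combiner.update(score);
    }
    combiner.score()
}

fn main() {
    let summed = combine::<SumCombiner>(&[0.4, 0.2]);
    assert!((summed - 0.6).abs() < 1e-6);
    assert_eq!(combine::<DoNothingCombiner>(&[0.4, 0.2]), 0.0);
}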

View File

@@ -7,7 +7,9 @@ pub use self::boolean_query::BooleanQuery;
mod tests {
use super::*;
use crate::assert_nearly_equals;
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::collector::TopDocs;
use crate::query::score_combiner::SumWithCoordsCombiner;
use crate::query::term_query::TermScorer;
use crate::query::Intersection;
@@ -18,9 +20,8 @@ mod tests {
use crate::query::Scorer;
use crate::query::TermQuery;
use crate::schema::*;
use crate::tests::assert_nearly_equals;
use crate::Index;
use crate::{DocAddress, DocId};
use crate::{DocAddress, DocId, Score};
fn aux_test_helper() -> (Index, Field) {
let mut schema_builder = Schema::builder();
@@ -140,7 +141,6 @@ mod tests {
.map(|doc| doc.1)
.collect::<Vec<DocId>>()
};
{
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a"))]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
@@ -177,6 +177,54 @@ mod tests {
}
}
#[test]
pub fn test_boolean_query_two_excluded() {
let (index, text_field) = aux_test_helper();
let make_term_query = |text: &str| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, text),
IndexRecordOption::Basic,
);
let query: Box<dyn Query> = Box::new(term_query);
query
};
let reader = index.reader().unwrap();
let matching_topdocs = |query: &dyn Query| {
reader
.searcher()
.search(query, &TopDocs::with_limit(3))
.unwrap()
};
let score_doc_4: Score; // score of doc 4 should not be influenced by exclusion
{
let boolean_query_no_excluded =
BooleanQuery::from(vec![(Occur::Must, make_term_query("d"))]);
let topdocs_no_excluded = matching_topdocs(&boolean_query_no_excluded);
assert_eq!(topdocs_no_excluded.len(), 2);
let (top_score, top_doc) = topdocs_no_excluded[0];
assert_eq!(top_doc, DocAddress(0, 4));
assert_eq!(topdocs_no_excluded[1].1, DocAddress(0, 3)); // ignore score of doc 3.
score_doc_4 = top_score;
}
{
let boolean_query_two_excluded = BooleanQuery::from(vec![
(Occur::Must, make_term_query("d")),
(Occur::MustNot, make_term_query("a")),
(Occur::MustNot, make_term_query("b")),
]);
let topdocs_excluded = matching_topdocs(&boolean_query_two_excluded);
assert_eq!(topdocs_excluded.len(), 1);
let (top_score, top_doc) = topdocs_excluded[0];
assert_eq!(top_doc, DocAddress(0, 4));
assert_eq!(top_score, score_doc_4);
}
}
#[test]
pub fn test_boolean_query_with_weight() {
let mut schema_builder = Schema::builder();
@@ -208,14 +256,14 @@ mod tests {
.scorer(searcher.segment_reader(0u32), 1.0f32)
.unwrap();
assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals(boolean_scorer.score(), 0.84163445f32);
assert_nearly_equals!(boolean_scorer.score(), 0.84163445f32);
}
{
let mut boolean_scorer = boolean_weight
.scorer(searcher.segment_reader(0u32), 2.0f32)
.unwrap();
assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals(boolean_scorer.score(), 1.6832689f32);
assert_nearly_equals!(boolean_scorer.score(), 1.6832689f32);
}
}
@@ -274,7 +322,7 @@ mod tests {
index_writer.add_document(doc!(
// tf = 1 1
title => "PDF Мастер Класс \"Морячок\" (Оксана Лифенко)",
// tf = 0 0
// tf = 0 0
text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
));
for _ in 0..1_000 {

View File

@@ -3,6 +3,11 @@ use crate::query::Scorer;
use crate::DocId;
use crate::Score;
#[inline(always)]
fn is_within<TDocSetExclude: DocSet>(docset: &mut TDocSetExclude, doc: DocId) -> bool {
docset.doc() <= doc && docset.seek(doc) == doc
}
/// Filters a given `DocSet` by removing the docs from a given `DocSet`.
///
/// The excluding docset has no impact on scoring.
@@ -23,8 +28,7 @@ where
) -> Exclude<TDocSet, TDocSetExclude> {
while underlying_docset.doc() != TERMINATED {
let target = underlying_docset.doc();
if excluding_docset.seek(target) != target {
// this document is not excluded.
if !is_within(&mut excluding_docset, target) {
break;
}
underlying_docset.advance();
@@ -36,42 +40,30 @@ where
}
}
impl<TDocSet, TDocSetExclude> Exclude<TDocSet, TDocSetExclude>
where
TDocSet: DocSet,
TDocSetExclude: DocSet,
{
/// Returns true iff the doc is not removed.
///
/// The method has to be called with non strictly
/// increasing `doc`.
fn accept(&mut self) -> bool {
let doc = self.underlying_docset.doc();
self.excluding_docset.seek(doc) != doc
}
}
impl<TDocSet, TDocSetExclude> DocSet for Exclude<TDocSet, TDocSetExclude>
where
TDocSet: DocSet,
TDocSetExclude: DocSet,
{
fn advance(&mut self) -> DocId {
while self.underlying_docset.advance() != TERMINATED {
if self.accept() {
return self.doc();
loop {
let candidate = self.underlying_docset.advance();
if candidate == TERMINATED {
return TERMINATED;
}
if !is_within(&mut self.excluding_docset, candidate) {
return candidate;
}
}
TERMINATED
}
fn seek(&mut self, target: DocId) -> DocId {
let underlying_seek_result = self.underlying_docset.seek(target);
if underlying_seek_result == TERMINATED {
let candidate = self.underlying_docset.seek(target);
if candidate == TERMINATED {
return TERMINATED;
}
if self.accept() {
return underlying_seek_result;
if !is_within(&mut self.excluding_docset, candidate) {
return candidate;
}
self.advance()
}
@@ -129,7 +121,7 @@ mod tests {
VecDocSet::from(vec![1, 2, 3, 10, 16, 24]),
))
},
vec![1, 2, 5, 8, 10, 15, 24],
vec![5, 8, 10, 15, 24],
);
}
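
The new `is_within` helper checks the excluding docset's current position before seeking, because `seek` must never be called with a target behind the cursor (the same contract enforced by the new `debug_assert` in `DocSet::seek`). A self-contained sketch using a toy sorted-Vec docset:

const TERMINATED: u32 = u32::MAX;

struct SortedDocs { docs: Vec<u32>, cursor: usize }

impl SortedDocs {
    fn doc(&self) -> u32 {
        self.docs.get(self.cursor).copied().unwrap_or(TERMINATED)
    }
    fn seek(&mut self, target: u32) -> u32 {
        debug_assert!(self.doc() <= target); // the contract the guard protects
        while self.doc() < target {
            self.cursor += 1;
        }
        self.doc()
    }
}

fn is_within(docset: &mut SortedDocs, doc: u32) -> bool {
    docset.doc() <= doc && docset.seek(doc) == doc
}

fn main() {
    let mut excluded = SortedDocs { docs: vec![2, 5, 9], cursor: 0 };
    assert!(is_within(&mut excluded, 5));  // 5 is excluded
    assert!(!is_within(&mut excluded, 3)); // 3 is behind the cursor, so seek is skipped
    assert!(!is_within(&mut excluded, 7)); // 7 is not in the excluding set
}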

View File

@@ -163,10 +163,10 @@ impl Query for FuzzyTermQuery {
#[cfg(test)]
mod test {
use super::FuzzyTermQuery;
use crate::assert_nearly_equals;
use crate::collector::TopDocs;
use crate::schema::Schema;
use crate::schema::TEXT;
use crate::tests::assert_nearly_equals;
use crate::Index;
use crate::Term;
@@ -199,7 +199,7 @@ mod test {
.unwrap();
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
let (score, _) = top_docs[0];
assert_nearly_equals(1f32, score);
assert_nearly_equals!(1f32, score);
}
// fails because non-prefix Levenshtein distance is more than 1 (add 'a' and 'n')
@@ -223,7 +223,7 @@ mod test {
.unwrap();
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
let (score, _) = top_docs[0];
assert_nearly_equals(1f32, score);
assert_nearly_equals!(1f32, score);
}
}
}

View File

@@ -53,7 +53,8 @@ pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<dyn Scorer>>
}
fn go_to_first_doc<TDocSet: DocSet>(docsets: &mut [TDocSet]) -> DocId {
let mut candidate = 0;
assert!(!docsets.is_empty());
let mut candidate = docsets.iter().map(TDocSet::doc).max().unwrap();
'outer: loop {
for docset in docsets.iter_mut() {
let seek_doc = docset.seek(candidate);
@@ -119,6 +120,9 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
}
}
debug_assert_eq!(candidate, self.left.doc());
debug_assert_eq!(candidate, self.right.doc());
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
return candidate;
}
}
@@ -129,7 +133,10 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
for docset in &mut self.others {
docsets.push(docset);
}
go_to_first_doc(&mut docsets[..])
let doc = go_to_first_doc(&mut docsets[..]);
debug_assert!(docsets.iter().all(|docset| docset.doc() == doc));
debug_assert!(doc >= target);
doc
}
fn doc(&self) -> DocId {
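
`go_to_first_doc` now starts from the maximum of the docsets' current positions instead of 0, so none of them is ever asked to seek backward. A toy sketch of the leapfrog loop with simplified docsets:

const TERMINATED: u32 = u32::MAX;

struct SortedDocs { docs: Vec<u32>, cursor: usize }

impl SortedDocs {
    fn doc(&self) -> u32 {
        self.docs.get(self.cursor).copied().unwrap_or(TERMINATED)
    }
    fn seek(&mut self, target: u32) -> u32 {
        debug_assert!(self.doc() <= target);
        while self.doc() < target {
            self.cursor += 1;
        }
        self.doc()
    }
}

fn go_to_first_doc(docsets: &mut [SortedDocs]) -> u32 {
    assert!(!docsets.is_empty());
    // Start from the furthest cursor instead of 0.
    let mut candidate = docsets.iter().map(SortedDocs::doc).max().unwrap();
    'outer: loop {
        for docset in docsets.iter_mut() {
            let seek_doc = docset.seek(candidate);
            if seek_doc > candidate {
                candidate = seek_doc;
                continue 'outer; // restart with the new, larger candidate
            }
        }
        return candidate; // every docset agreed on the candidate
    }
}

fn main() {
    let mut docsets = vec![
        SortedDocs { docs: vec![1, 4, 6, 9], cursor: 1 }, // already positioned on 4
        SortedDocs { docs: vec![2, 4, 9], cursor: 0 },
    ];
    assert_eq!(go_to_first_doc(&mut docsets), 4);
}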

View File

@@ -10,12 +10,13 @@ pub use self::phrase_weight::PhraseWeight;
pub mod tests {
use super::*;
use crate::assert_nearly_equals;
use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE};
use crate::core::Index;
use crate::query::Weight;
use crate::schema::{Schema, Term, TEXT};
use crate::tests::assert_nearly_equals;
use crate::DocAddress;
use crate::DocId;
use crate::{DocAddress, TERMINATED};
pub fn create_index(texts: &[&'static str]) -> Index {
let mut schema_builder = Schema::builder();
@@ -67,6 +68,23 @@ pub mod tests {
assert!(test_query(vec!["g", "a"]).is_empty());
}
#[test]
pub fn test_phrase_query_simple() -> crate::Result<()> {
let index = create_index(&["a b b d c g c", "a b a b c"]);
let text_field = index.schema().get_field("text").unwrap();
let searcher = index.reader()?.searcher();
let terms: Vec<Term> = vec!["a", "b", "c"]
.iter()
.map(|text| Term::from_field_text(text_field, text))
.collect();
let phrase_query = PhraseQuery::new(terms);
let phrase_weight = phrase_query.phrase_weight(&searcher, false)?;
let mut phrase_scorer = phrase_weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED);
Ok(())
}
#[test]
pub fn test_phrase_query_no_score() {
let index = create_index(&[
@@ -157,8 +175,8 @@ pub mod tests {
.to_vec()
};
let scores = test_query(vec!["a", "b"]);
assert_nearly_equals(scores[0], 0.40618482);
assert_nearly_equals(scores[1], 0.46844664);
assert_nearly_equals!(scores[0], 0.40618482);
assert_nearly_equals!(scores[1], 0.46844664);
}
#[test] // motivated by #234

View File

@@ -239,6 +239,7 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
}
fn seek(&mut self, target: DocId) -> DocId {
debug_assert!(target >= self.doc());
let doc = self.intersection_docset.seek(target);
if doc == TERMINATED || self.phrase_match() {
return doc;
@@ -266,7 +267,6 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
#[cfg(test)]
mod tests {
use super::{intersection, intersection_count};
fn test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) {

View File

@@ -113,7 +113,7 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
/// The language covered by the current parser is extremely simple.
///
/// * simple terms: "e.g.: `Barack Obama` are simply tokenized using
/// tantivy's [`SimpleTokenizer`](tantivy::tokenizer::SimpleTokenizer), hence
/// tantivy's [`SimpleTokenizer`](../tokenizer/struct.SimpleTokenizer.html), hence
/// becoming `["barack", "obama"]`. The terms are then searched within
/// the default terms of the query parser.
///

View File

@@ -301,12 +301,14 @@ impl Weight for RangeWeight {
let mut block_segment_postings = inverted_index
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
loop {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
for &doc in block_segment_postings.docs() {
doc_bitset.insert(doc);
}
if !block_segment_postings.advance() {
break;
}
block_segment_postings.advance();
}
}
let doc_bitset = BitSetDocSet::from(doc_bitset);

View File

@@ -89,10 +89,10 @@ impl Query for RegexQuery {
#[cfg(test)]
mod test {
use super::RegexQuery;
use crate::assert_nearly_equals;
use crate::collector::TopDocs;
use crate::schema::TEXT;
use crate::schema::{Field, Schema};
use crate::tests::assert_nearly_equals;
use crate::{Index, IndexReader};
use std::sync::Arc;
use tantivy_fst::Regex;
@@ -129,7 +129,7 @@ mod test {
.unwrap();
assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
let (score, _) = scored_docs[0];
assert_nearly_equals(1f32, score);
assert_nearly_equals!(1f32, score);
}
let top_docs = searcher
.search(&query_matching_zero, &TopDocs::with_limit(2))

View File

@@ -72,7 +72,7 @@ where
let doc = self.doc();
let mut score_combiner = TScoreCombiner::default();
score_combiner.update(&mut self.req_scorer);
if self.opt_scorer.seek(doc) == doc {
if self.opt_scorer.doc() <= doc && self.opt_scorer.seek(doc) == doc {
score_combiner.update(&mut self.opt_scorer);
}
let score = score_combiner.score();

View File

@@ -9,13 +9,14 @@ pub use self::term_weight::TermWeight;
#[cfg(test)]
mod tests {
use crate::assert_nearly_equals;
use crate::collector::TopDocs;
use crate::docset::DocSet;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::{Query, QueryParser, Scorer, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
use crate::tests::assert_nearly_equals;
use crate::Index;
use crate::Term;
use crate::{Index, TERMINATED};
#[test]
pub fn test_term_query_no_freq() {
@@ -42,6 +43,41 @@ mod tests {
assert_eq!(term_scorer.score(), 0.28768212);
}
#[test]
pub fn test_term_query_multiple_of_block_len() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
for _ in 0..COMPRESSION_BLOCK_SIZE {
let doc = doc!(text_field => "a");
index_writer.add_document(doc);
}
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "a"),
IndexRecordOption::Basic,
);
let term_weight = term_query.weight(&searcher, true)?;
let segment_reader = searcher.segment_reader(0);
let mut term_scorer = term_weight.scorer(segment_reader, 1.0f32)?;
for i in 0u32..COMPRESSION_BLOCK_SIZE as u32 {
assert_eq!(term_scorer.doc(), i);
if i == COMPRESSION_BLOCK_SIZE as u32 - 1u32 {
assert_eq!(term_scorer.advance(), TERMINATED);
} else {
assert_eq!(term_scorer.advance(), i + 1);
}
}
assert_eq!(term_scorer.doc(), TERMINATED);
Ok(())
}
#[test]
pub fn test_term_weight() {
let mut schema_builder = Schema::builder();
@@ -69,7 +105,7 @@ mod tests {
.unwrap();
assert_eq!(topdocs.len(), 1);
let (score, _) = topdocs[0];
assert_nearly_equals(0.77802235, score);
assert_nearly_equals!(0.77802235, score);
}
{
let term = Term::from_field_text(left_field, "left1");
@@ -79,9 +115,9 @@ mod tests {
.unwrap();
assert_eq!(top_docs.len(), 2);
let (score1, _) = top_docs[0];
assert_nearly_equals(0.27101856, score1);
assert_nearly_equals!(0.27101856, score1);
let (score2, _) = top_docs[1];
assert_nearly_equals(0.13736556, score2);
assert_nearly_equals!(0.13736556, score2);
}
{
let query_parser = QueryParser::for_index(&index, vec![]);
@@ -89,9 +125,9 @@ mod tests {
let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
assert_eq!(top_docs.len(), 2);
let (score1, _) = top_docs[0];
assert_nearly_equals(0.9153879, score1);
assert_nearly_equals!(0.9153879, score1);
let (score2, _) = top_docs[1];
assert_nearly_equals(0.27101856, score2);
assert_nearly_equals!(0.27101856, score2);
}
}
@@ -112,6 +148,27 @@ mod tests {
assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1);
}
#[test]
fn test_term_query_simple_seek() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a"));
index_writer.commit()?;
let term_a = Term::from_field_text(text_field, "a");
let term_query = TermQuery::new(term_a, IndexRecordOption::Basic);
let searcher = index.reader()?.searcher();
let term_weight = term_query.weight(&searcher, false)?;
let mut term_scorer = term_weight.scorer(searcher.segment_reader(0u32), 1.0f32)?;
assert_eq!(term_scorer.doc(), 0u32);
term_scorer.seek(1u32);
assert_eq!(term_scorer.doc(), 1u32);
Ok(())
}
#[test]
fn test_term_query_debug() {
let term_query = TermQuery::new(

View File

@@ -20,12 +20,12 @@ pub struct TermWeight {
impl Weight for TermWeight {
fn scorer(&self, reader: &SegmentReader, boost: f32) -> Result<Box<dyn Scorer>> {
let term_scorer = self.scorer_specialized(reader, boost)?;
let term_scorer = self.specialized_scorer(reader, boost)?;
Ok(Box::new(term_scorer))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer_specialized(reader, 1.0f32)?;
let mut scorer = self.specialized_scorer(reader, 1.0f32)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
@@ -52,7 +52,7 @@ impl Weight for TermWeight {
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> {
let mut scorer = self.scorer_specialized(reader, 1.0f32)?;
let mut scorer = self.specialized_scorer(reader, 1.0f32)?;
for_each_scorer(&mut scorer, callback);
Ok(())
}
@@ -92,7 +92,7 @@ impl TermWeight {
}
}
fn scorer_specialized(&self, reader: &SegmentReader, boost: f32) -> Result<TermScorer> {
fn specialized_scorer(&self, reader: &SegmentReader, boost: f32) -> Result<TermScorer> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field);

View File

@@ -183,7 +183,10 @@ where
// advance all docsets to a doc >= to the target.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))]
unordered_drain_filter(&mut self.docsets, |docset| {
docset.seek(target) == TERMINATED
if docset.doc() < target {
docset.seek(target);
}
docset.doc() == TERMINATED
});
// at this point all of the docsets

View File

@@ -22,7 +22,7 @@ pub enum ReloadPolicy {
/// The index is entirely reloaded manually.
/// All updates of the index should be manual.
///
/// No change is reflected automatically. You are required to call `.load_seacher()` manually.
/// No change is reflected automatically. You are required to call `IndexReader::reload()` manually.
Manual,
/// The index is reloaded within milliseconds after a new commit is available.
/// This is made possible by watching changes in the `meta.json` file.

View File

@@ -14,7 +14,7 @@ use std::fmt;
/// - a field name
/// - a field type, itself wrapping up options describing
/// how the field should be indexed.
#[derive(Clone, Debug, Eq, PartialEq)]
#[derive(Clone, Debug, PartialEq)]
pub struct FieldEntry {
name: String,
field_type: FieldType,

View File

@@ -48,7 +48,7 @@ pub enum Type {
/// A `FieldType` describes the type (text, u64) of a field as well as
/// how it should be handled by tantivy.
#[derive(Clone, Debug, Eq, PartialEq)]
#[derive(Clone, Debug, PartialEq)]
pub enum FieldType {
/// String field type configuration
Str(TextOptions),

View File

@@ -6,7 +6,7 @@ use std::borrow::Cow;
use std::ops::BitOr;
/// Define how a text field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TextOptions {
indexing: Option<TextFieldIndexing>,
stored: bool,
@@ -51,7 +51,7 @@ impl Default for TextOptions {
/// - the amount of information that should be stored about the presence of a term in a document.
/// Essentially, should we store the term frequency and/or the positions (See [`IndexRecordOption`](./enum.IndexRecordOption.html)).
/// - the name of the `Tokenizer` that should be used to process the field.
#[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)]
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
pub struct TextFieldIndexing {
record: IndexRecordOption,
tokenizer: Cow<'static, str>,