Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-08 10:02:55 +00:00

Compare commits: nodeffeatf ... removedali (1 commit)

| Author | SHA1 | Date |
|---|---|---|
| | 8861919d5f | |
```diff
@@ -24,10 +24,8 @@ use crate::IndexWriter;
 use std::borrow::BorrowMut;
 use std::collections::HashSet;
 use std::fmt;
-
 #[cfg(feature = "mmap")]
-use std::path::Path;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 use std::sync::Arc;
 
 fn load_metas(
```
```diff
@@ -295,8 +295,8 @@ impl SegmentReader {
     }
 
     /// Returns an iterator that will iterate over the alive document ids
-    pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator<'_> {
-        SegmentReaderAliveDocsIterator::new(&self)
+    pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a {
+        (0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
     }
 
     /// Summarize total space usage of this segment.
```
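The new method body is just a filtered range. Below is a minimal, self-contained sketch of that shape; `FakeSegmentReader` is a hypothetical stand-in (the real `SegmentReader` answers `is_deleted` from its delete bitset, and `DocId` is tantivy's `u32` alias):

```rust
use std::collections::HashSet;

type DocId = u32;

/// Hypothetical stand-in for tantivy's SegmentReader: only the two
/// pieces of state that doc_ids_alive needs.
struct FakeSegmentReader {
    max_doc: DocId,
    deleted: HashSet<DocId>,
}

impl FakeSegmentReader {
    fn is_deleted(&self, doc: DocId) -> bool {
        self.deleted.contains(&doc)
    }

    // Same shape as the new method: a range filtered by the delete check.
    fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a {
        (0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
    }
}

fn main() {
    let reader = FakeSegmentReader {
        max_doc: 5,
        deleted: [1, 3].into_iter().collect(),
    };
    // Prints 0, 2, 4: the deleted ids are skipped.
    for doc in reader.doc_ids_alive() {
        println!("{doc}");
    }
}
```

Callers are unaffected: they still write `for doc in reader.doc_ids_alive() { ... }`; only the concrete iterator type changes from a named struct to an opaque `impl Iterator`.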
```diff
@@ -324,52 +324,6 @@ impl fmt::Debug for SegmentReader {
     }
 }
 
-/// Implements the iterator trait to allow easy iteration
-/// over non-deleted ("alive") DocIds in a SegmentReader
-pub struct SegmentReaderAliveDocsIterator<'a> {
-    reader: &'a SegmentReader,
-    max_doc: DocId,
-    current: DocId,
-}
-
-impl<'a> SegmentReaderAliveDocsIterator<'a> {
-    pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
-        SegmentReaderAliveDocsIterator {
-            reader,
-            max_doc: reader.max_doc(),
-            current: 0,
-        }
-    }
-}
-
-impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
-    type Item = DocId;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        // TODO: Use TinySet (like in BitSetDocSet) to speed this process up
-        if self.current >= self.max_doc {
-            return None;
-        }
-
-        // find the next alive doc id
-        while self.reader.is_deleted(self.current) {
-            self.current += 1;
-
-            if self.current >= self.max_doc {
-                return None;
-            }
-        }
-
-        // capture the current alive DocId
-        let result = Some(self.current);
-
-        // move down the chain
-        self.current += 1;
-
-        result
-    }
-}
-
 #[cfg(test)]
 mod test {
     use crate::core::Index;
```
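The deleted struct was a hand-written cursor over `0..max_doc` that skipped deleted ids, which is exactly what the filtered range gives for free. A hedged standalone sketch of the equivalence (toy names, and a `&[bool]` delete mask instead of tantivy's bitset):

```rust
type DocId = u32;

// Essence of the removed SegmentReaderAliveDocsIterator: advance a cursor,
// skipping doc ids flagged as deleted, until max_doc is reached.
struct ManualAliveIter<'a> {
    deleted: &'a [bool], // deleted[doc] == true means doc is deleted
    current: DocId,
}

impl<'a> Iterator for ManualAliveIter<'a> {
    type Item = DocId;

    fn next(&mut self) -> Option<DocId> {
        while (self.current as usize) < self.deleted.len() {
            let doc = self.current;
            self.current += 1;
            if !self.deleted[doc as usize] {
                return Some(doc);
            }
        }
        None
    }
}

fn main() {
    let deleted = [false, true, false, true, false];

    let manual: Vec<DocId> =
        ManualAliveIter { deleted: &deleted, current: 0 }.collect();

    // The one-liner the commit switches to.
    let filtered: Vec<DocId> = (0u32..deleted.len() as u32)
        .filter(|&doc| !deleted[doc as usize])
        .collect();

    assert_eq!(manual, filtered); // both yield [0, 2, 4]
}
```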
```diff
@@ -589,45 +589,48 @@ impl IndexMerger {
             // of all of the segments containing the given term.
             //
             // These segments are non-empty and advance has already been called.
-            if segment_postings.is_empty() {
-                continue;
-            }
-            // If not, the `term` will be entirely removed.
-
-            // We know that there is at least one document containing
-            // the term, so we add it.
-            let to_term_ord = field_serializer.new_term(term_bytes)?;
-
-            if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
-                for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
-                    term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
-                }
-            }
-
-            // We can now serialize this postings, by pushing each document to the
-            // postings serializer.
-            for (segment_ord, mut segment_postings) in segment_postings {
-                let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
-
-                let mut doc = segment_postings.doc();
-                while doc != TERMINATED {
-                    // deleted doc are skipped as they do not have a `remapped_doc_id`.
-                    if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
-                        // we make sure to only write the term iff
-                        // there is at least one document.
-                        let term_freq = segment_postings.term_freq();
-                        segment_postings.positions(&mut positions_buffer);
-
-                        let delta_positions = delta_computer.compute_delta(&positions_buffer);
-                        field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions)?;
-                    }
-
-                    doc = segment_postings.advance();
-                }
-            }
-
-            // closing the term.
-            field_serializer.close_term()?;
+            if !segment_postings.is_empty() {
+                // If not, the `term` will be entirely removed.
+
+                // We know that there is at least one document containing
+                // the term, so we add it.
+                let to_term_ord = field_serializer.new_term(term_bytes)?;
+
+                if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
+                    for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
+                        term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
+                    }
+                }
+
+                // We can now serialize this postings, by pushing each document to the
+                // postings serializer.
+                for (segment_ord, mut segment_postings) in segment_postings {
+                    let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
+
+                    let mut doc = segment_postings.doc();
+                    while doc != TERMINATED {
+                        // deleted doc are skipped as they do not have a `remapped_doc_id`.
+                        if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
+                            // we make sure to only write the term iff
+                            // there is at least one document.
+                            let term_freq = segment_postings.term_freq();
+                            segment_postings.positions(&mut positions_buffer);
+
+                            let delta_positions = delta_computer.compute_delta(&positions_buffer);
+                            field_serializer.write_doc(
+                                remapped_doc_id,
+                                term_freq,
+                                delta_positions,
+                            )?;
+                        }
+
+                        doc = segment_postings.advance();
+                    }
+                }
+
+                // closing the term.
+                field_serializer.close_term()?;
+            }
         }
         field_serializer.close()?;
         Ok(term_ord_mapping_opt)
```
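The merge loop above hinges on `merged_doc_id_map`: per source segment, a table from old doc id to `Some(new_doc_id)` for alive documents or `None` for deleted ones, which is why deleted postings are silently dropped. A small illustrative sketch of how such a map could be built; `build_doc_id_map` is a hypothetical helper, not tantivy's actual code:

```rust
type DocId = u32;

/// Hypothetical helper: alive docs get consecutive new ids starting at
/// `first_new_id`; deleted docs map to None and their postings vanish.
fn build_doc_id_map(deleted: &[bool], first_new_id: DocId) -> Vec<Option<DocId>> {
    let mut next = first_new_id;
    deleted
        .iter()
        .map(|&is_deleted| {
            if is_deleted {
                None
            } else {
                let new_id = next;
                next += 1;
                Some(new_id)
            }
        })
        .collect()
}

fn main() {
    // A 5-doc segment with docs 1 and 3 deleted, placed after 10 docs
    // from earlier segments in the merged index.
    let map = build_doc_id_map(&[false, true, false, true, false], 10);
    assert_eq!(map, vec![Some(10), None, Some(11), None, Some(12)]);
}
```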