Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-08 10:02:55 +00:00

Compare commits: nodeffeatf ... removedali (1 commit)

| Author | SHA1 | Date |
|---|---|---|
| | 8861919d5f | |
```diff
@@ -24,10 +24,8 @@ use crate::IndexWriter;
 use std::borrow::BorrowMut;
 use std::collections::HashSet;
 use std::fmt;
-
 #[cfg(feature = "mmap")]
-use std::path::Path;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 use std::sync::Arc;
 
 fn load_metas(
```
```diff
@@ -295,8 +295,8 @@ impl SegmentReader {
     }
 
     /// Returns an iterator that will iterate over the alive document ids
-    pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator<'_> {
-        SegmentReaderAliveDocsIterator::new(&self)
+    pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a {
+        (0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
     }
 
     /// Summarize total space usage of this segment.
```
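The new method body is just a filtered range. Below is a minimal, self-contained sketch of that shape; `FakeSegmentReader` is a hypothetical stand-in (the real `SegmentReader` answers `is_deleted` from its delete bitset, and `DocId` is tantivy's `u32` alias):

```rust
use std::collections::HashSet;

type DocId = u32;

/// Hypothetical stand-in for tantivy's SegmentReader: only the two
/// pieces of state that doc_ids_alive needs.
struct FakeSegmentReader {
    max_doc: DocId,
    deleted: HashSet<DocId>,
}

impl FakeSegmentReader {
    fn is_deleted(&self, doc: DocId) -> bool {
        self.deleted.contains(&doc)
    }

    // Same shape as the new method: a range filtered by the delete check.
    fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a {
        (0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
    }
}

fn main() {
    let reader = FakeSegmentReader {
        max_doc: 5,
        deleted: [1, 3].into_iter().collect(),
    };
    // Prints 0, 2, 4: the deleted ids are skipped.
    for doc in reader.doc_ids_alive() {
        println!("{doc}");
    }
}
```

Callers are unaffected: they still write `for doc in reader.doc_ids_alive() { ... }`; only the concrete iterator type changes from a named struct to an opaque `impl Iterator`.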
```diff
@@ -324,52 +324,6 @@ impl fmt::Debug for SegmentReader {
     }
 }
 
-/// Implements the iterator trait to allow easy iteration
-/// over non-deleted ("alive") DocIds in a SegmentReader
-pub struct SegmentReaderAliveDocsIterator<'a> {
-    reader: &'a SegmentReader,
-    max_doc: DocId,
-    current: DocId,
-}
-
-impl<'a> SegmentReaderAliveDocsIterator<'a> {
-    pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
-        SegmentReaderAliveDocsIterator {
-            reader,
-            max_doc: reader.max_doc(),
-            current: 0,
-        }
-    }
-}
-
-impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
-    type Item = DocId;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        // TODO: Use TinySet (like in BitSetDocSet) to speed this process up
-        if self.current >= self.max_doc {
-            return None;
-        }
-
-        // find the next alive doc id
-        while self.reader.is_deleted(self.current) {
-            self.current += 1;
-
-            if self.current >= self.max_doc {
-                return None;
-            }
-        }
-
-        // capture the current alive DocId
-        let result = Some(self.current);
-
-        // move down the chain
-        self.current += 1;
-
-        result
-    }
-}
-
 #[cfg(test)]
 mod test {
     use crate::core::Index;
```
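The deleted struct was a hand-written cursor over `0..max_doc` that skipped deleted ids, which is exactly what the filtered range gives for free. A hedged standalone sketch of the equivalence (toy names, and a `&[bool]` delete mask instead of tantivy's bitset):

```rust
type DocId = u32;

// Essence of the removed SegmentReaderAliveDocsIterator: advance a cursor,
// skipping doc ids flagged as deleted, until max_doc is reached.
struct ManualAliveIter<'a> {
    deleted: &'a [bool], // deleted[doc] == true means doc is deleted
    current: DocId,
}

impl<'a> Iterator for ManualAliveIter<'a> {
    type Item = DocId;

    fn next(&mut self) -> Option<DocId> {
        while (self.current as usize) < self.deleted.len() {
            let doc = self.current;
            self.current += 1;
            if !self.deleted[doc as usize] {
                return Some(doc);
            }
        }
        None
    }
}

fn main() {
    let deleted = [false, true, false, true, false];

    let manual: Vec<DocId> =
        ManualAliveIter { deleted: &deleted, current: 0 }.collect();

    // The one-liner the commit switches to.
    let filtered: Vec<DocId> = (0u32..deleted.len() as u32)
        .filter(|&doc| !deleted[doc as usize])
        .collect();

    assert_eq!(manual, filtered); // both yield [0, 2, 4]
}
```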
```diff
@@ -589,45 +589,48 @@ impl IndexMerger {
             // of all of the segments containing the given term.
             //
             // These segments are non-empty and advance has already been called.
-            if segment_postings.is_empty() {
-                continue;
-            }
-            // If not, the `term` will be entirely removed.
-
-            // We know that there is at least one document containing
-            // the term, so we add it.
-            let to_term_ord = field_serializer.new_term(term_bytes)?;
-
-            if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
-                for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
-                    term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
-                }
-            }
-
-            // We can now serialize this postings, by pushing each document to the
-            // postings serializer.
-            for (segment_ord, mut segment_postings) in segment_postings {
-                let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
-
-                let mut doc = segment_postings.doc();
-                while doc != TERMINATED {
-                    // deleted doc are skipped as they do not have a `remapped_doc_id`.
-                    if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
-                        // we make sure to only write the term iff
-                        // there is at least one document.
-                        let term_freq = segment_postings.term_freq();
-                        segment_postings.positions(&mut positions_buffer);
-
-                        let delta_positions = delta_computer.compute_delta(&positions_buffer);
-                        field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions)?;
-                    }
-
-                    doc = segment_postings.advance();
-                }
-            }
-
-            // closing the term.
-            field_serializer.close_term()?;
+            if !segment_postings.is_empty() {
+                // If not, the `term` will be entirely removed.
+
+                // We know that there is at least one document containing
+                // the term, so we add it.
+                let to_term_ord = field_serializer.new_term(term_bytes)?;
+
+                if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
+                    for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
+                        term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
+                    }
+                }
+
+                // We can now serialize this postings, by pushing each document to the
+                // postings serializer.
+                for (segment_ord, mut segment_postings) in segment_postings {
+                    let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
+
+                    let mut doc = segment_postings.doc();
+                    while doc != TERMINATED {
+                        // deleted doc are skipped as they do not have a `remapped_doc_id`.
+                        if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
+                            // we make sure to only write the term iff
+                            // there is at least one document.
+                            let term_freq = segment_postings.term_freq();
+                            segment_postings.positions(&mut positions_buffer);
+
+                            let delta_positions = delta_computer.compute_delta(&positions_buffer);
+                            field_serializer.write_doc(
+                                remapped_doc_id,
+                                term_freq,
+                                delta_positions,
+                            )?;
+                        }
+
+                        doc = segment_postings.advance();
+                    }
+                }
+
+                // closing the term.
+                field_serializer.close_term()?;
+            }
         }
         field_serializer.close()?;
         Ok(term_ord_mapping_opt)
```
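The merge loop above hinges on `merged_doc_id_map`: per source segment, a table from old doc id to `Some(new_doc_id)` for alive documents or `None` for deleted ones, which is why deleted postings are silently dropped. A small illustrative sketch of how such a map could be built; `build_doc_id_map` is a hypothetical helper, not tantivy's actual code:

```rust
type DocId = u32;

/// Hypothetical helper: alive docs get consecutive new ids starting at
/// `first_new_id`; deleted docs map to None and their postings vanish.
fn build_doc_id_map(deleted: &[bool], first_new_id: DocId) -> Vec<Option<DocId>> {
    let mut next = first_new_id;
    deleted
        .iter()
        .map(|&is_deleted| {
            if is_deleted {
                None
            } else {
                let new_id = next;
                next += 1;
                Some(new_id)
            }
        })
        .collect()
}

fn main() {
    // A 5-doc segment with docs 1 and 3 deleted, placed after 10 docs
    // from earlier segments in the merged index.
    let map = build_doc_id_map(&[false, true, false, true, false], 10);
    assert_eq!(map, vec![Some(10), None, Some(11), None, Some(12)]);
}
```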