Mirror of https://github.com/quickwit-oss/tantivy.git

Compare commits: removedali...nodeffeatf

2 Commits: e4759b1d82, 4026d183bc

The diff below touches `SegmentReader` (alive-doc iteration) and `IndexMerger` (posting serialization).
```diff
@@ -24,8 +24,10 @@ use crate::IndexWriter;
 use std::borrow::BorrowMut;
 use std::collections::HashSet;
 use std::fmt;
+
 #[cfg(feature = "mmap")]
-use std::path::{Path, PathBuf};
+use std::path::Path;
+use std::path::PathBuf;
 use std::sync::Arc;
 
 fn load_metas(
```
```diff
@@ -295,8 +295,8 @@ impl SegmentReader {
     }
 
     /// Returns an iterator that will iterate over the alive document ids
-    pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a {
-        (0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
+    pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator<'_> {
+        SegmentReaderAliveDocsIterator::new(&self)
     }
 
     /// Summarize total space usage of this segment.
```
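Note the public API change here: the opaque `impl Iterator<Item = DocId>` return type is swapped for the named `SegmentReaderAliveDocsIterator` (added below), which lets callers store the iterator in struct fields and name it in signatures; the trade-off is that the concrete type becomes part of the public API.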
```diff
@@ -324,6 +324,52 @@ impl fmt::Debug for SegmentReader {
     }
 }
 
+/// Implements the iterator trait to allow easy iteration
+/// over non-deleted ("alive") DocIds in a SegmentReader
+pub struct SegmentReaderAliveDocsIterator<'a> {
+    reader: &'a SegmentReader,
+    max_doc: DocId,
+    current: DocId,
+}
+
+impl<'a> SegmentReaderAliveDocsIterator<'a> {
+    pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
+        SegmentReaderAliveDocsIterator {
+            reader,
+            max_doc: reader.max_doc(),
+            current: 0,
+        }
+    }
+}
+
+impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
+    type Item = DocId;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // TODO: Use TinySet (like in BitSetDocSet) to speed this process up
+        if self.current >= self.max_doc {
+            return None;
+        }
+
+        // find the next alive doc id
+        while self.reader.is_deleted(self.current) {
+            self.current += 1;
+
+            if self.current >= self.max_doc {
+                return None;
+            }
+        }
+
+        // capture the current alive DocId
+        let result = Some(self.current);
+
+        // move down the chain
+        self.current += 1;
+
+        result
+    }
+}
+
 #[cfg(test)]
 mod test {
     use crate::core::Index;
```
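For context, here is a minimal usage sketch of the new iterator: it indexes two documents, deletes one, and walks the surviving DocIds. This is not part of the diff; it assumes the tantivy API of this era (`Index::create_in_ram`, `IndexWriter::delete_term`, `Searcher::segment_readers`).

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema);
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(title => "keep me"));
    writer.add_document(doc!(title => "delete me"));
    writer.commit()?;

    // Delete one document and commit, so the segment carries a delete bitset.
    writer.delete_term(Term::from_field_text(title, "delete"));
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    for segment_reader in searcher.segment_readers() {
        // The new iterator yields only non-deleted DocIds.
        for doc_id in segment_reader.doc_ids_alive() {
            println!("alive doc: {}", doc_id);
        }
    }
    Ok(())
}
```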
```diff
@@ -589,48 +589,45 @@ impl IndexMerger {
             // of all of the segments containing the given term.
             //
             // These segments are non-empty and advance has already been called.
-            if !segment_postings.is_empty() {
-                // If not, the `term` will be entirely removed.
-                // We know that there is at least one document containing
-                // the term, so we add it.
-                let to_term_ord = field_serializer.new_term(term_bytes)?;
-
-                if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
-                    for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
-                        term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
-                    }
-                }
-
-                // We can now serialize this postings, by pushing each document to the
-                // postings serializer.
-                for (segment_ord, mut segment_postings) in segment_postings {
-                    let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
-
-                    let mut doc = segment_postings.doc();
-                    while doc != TERMINATED {
-                        // deleted doc are skipped as they do not have a `remapped_doc_id`.
-                        if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
-                            // we make sure to only write the term iff
-                            // there is at least one document.
-                            let term_freq = segment_postings.term_freq();
-                            segment_postings.positions(&mut positions_buffer);
-
-                            let delta_positions = delta_computer.compute_delta(&positions_buffer);
-                            field_serializer.write_doc(
-                                remapped_doc_id,
-                                term_freq,
-                                delta_positions,
-                            )?;
-                        }
-
-                        doc = segment_postings.advance();
-                    }
-                }
-
-                // closing the term.
-                field_serializer.close_term()?;
+            if segment_postings.is_empty() {
+                continue;
             }
+            // If not, the `term` will be entirely removed.
+
+            // We know that there is at least one document containing
+            // the term, so we add it.
+            let to_term_ord = field_serializer.new_term(term_bytes)?;
+
+            if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
+                for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
+                    term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
+                }
+            }
+
+            // We can now serialize this postings, by pushing each document to the
+            // postings serializer.
+            for (segment_ord, mut segment_postings) in segment_postings {
+                let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
+
+                let mut doc = segment_postings.doc();
+                while doc != TERMINATED {
+                    // deleted doc are skipped as they do not have a `remapped_doc_id`.
+                    if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
+                        // we make sure to only write the term iff
+                        // there is at least one document.
+                        let term_freq = segment_postings.term_freq();
+                        segment_postings.positions(&mut positions_buffer);
+
+                        let delta_positions = delta_computer.compute_delta(&positions_buffer);
+                        field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions)?;
+                    }
+
+                    doc = segment_postings.advance();
+                }
+            }
+
+            // closing the term.
+            field_serializer.close_term()?;
         }
         field_serializer.close()?;
         Ok(term_ord_mapping_opt)
```
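This merger hunk is a pure guard-clause refactor: `if !segment_postings.is_empty() { ... }` becomes an early `continue`, de-indenting the body one level, and the `write_doc` call collapses onto a single line. To clarify what `merged_doc_id_map[segment_ord]` holds, below is a hypothetical sketch of building such an old-to-new mapping; the helper name and construction are assumptions for illustration, not part of the diff.

```rust
// Hypothetical helper: build a per-segment map from old DocId to the id it
// will carry in the merged segment. Deleted docs map to None, which is why
// the merge loop above can skip them with `if let Some(remapped_doc_id)`.
fn build_doc_id_map(
    max_doc: u32,
    is_deleted: impl Fn(u32) -> bool,
    first_new_id: u32, // in a real merge, new ids would continue across segments
) -> Vec<Option<u32>> {
    let mut next_id = first_new_id;
    (0..max_doc)
        .map(|old_doc| {
            if is_deleted(old_doc) {
                None
            } else {
                let id = next_id;
                next_id += 1;
                Some(id)
            }
        })
        .collect()
}

fn main() {
    // Segment with 5 docs where docs 1 and 3 are deleted:
    // alive docs 0, 2, 4 are packed into consecutive new ids 0, 1, 2.
    let map = build_doc_id_map(5, |doc| doc == 1 || doc == 3, 0);
    assert_eq!(map, vec![Some(0), None, Some(1), None, Some(2)]);
}
```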