mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-03 00:50:41 +00:00
add dynamic fastfield case
add dynamic fastfield for single fast field unsorted; fix scary documentation bug; add num_len instead of len
This commit is contained in:
@@ -1,4 +1,3 @@
|
||||
use crate::common::HasLen;
|
||||
use crate::core::InvertedIndexReader;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentComponent;
|
||||
@@ -63,11 +62,9 @@ impl SegmentReader {
|
||||
self.max_doc
|
||||
}
|
||||
|
||||
/// Returns the number of documents.
|
||||
/// Returns the number of alive documents.
|
||||
/// Deleted documents are not counted.
|
||||
///
|
||||
/// Today, `tantivy` does not handle deletes so max doc and
|
||||
/// num_docs are the same.
|
||||
pub fn num_docs(&self) -> DocId {
|
||||
self.num_docs
|
||||
}
|
||||
@@ -81,7 +78,7 @@ impl SegmentReader {
|
||||
/// deleted in the segment.
|
||||
pub fn num_deleted_docs(&self) -> DocId {
|
||||
self.delete_bitset()
|
||||
.map(|delete_set| delete_set.len() as DocId)
|
||||
.map(|delete_set| delete_set.num_deleted() as DocId)
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
@@ -329,6 +326,32 @@ mod test {
|
||||
use crate::schema::{Schema, Term, STORED, TEXT};
|
||||
use crate::DocId;
|
||||
|
||||
// Checks that `num_docs()` reports only alive documents while `max_doc()` still includes deleted ones.
#[test]
|
||||
fn test_num_alive() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("name", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let name = schema.get_field("name").unwrap();
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(name => "tantivy"));
|
||||
|
||||
index_writer.add_document(doc!(name => "horse"));
|
||||
|
||||
index_writer.add_document(doc!(name => "jockey"));
|
||||
|
||||
index_writer.add_document(doc!(name => "cap"));
|
||||
// four documents have been added; the assertions below read them back from segment 0
|
||||
index_writer.delete_term(Term::from_field_text(name, "horse"));
|
||||
index_writer.delete_term(Term::from_field_text(name, "cap"));
|
||||
|
||||
// two of the four documents ("horse" and "cap") are now targeted by deletes; commit the batch
|
||||
index_writer.commit()?;
|
||||
}
|
||||
let searcher = index.reader()?.searcher();
|
||||
assert_eq!(2, searcher.segment_reader(0).num_docs());
|
||||
assert_eq!(4, searcher.segment_reader(0).max_doc());
|
||||
Ok(())
|
||||
}
|
||||
#[test]
|
||||
fn test_alive_docs_iterator() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -91,6 +91,10 @@ impl DeleteBitSet {
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
|
||||
/// Returns the number of deleted documents recorded in this bitset (the `num_deleted` field).
|
||||
pub fn num_deleted(&self) -> usize {
|
||||
self.num_deleted
|
||||
}
|
||||
/// Summarize total space usage of this bitset.
|
||||
pub fn space_usage(&self) -> ByteCount {
|
||||
self.data.len()
|
||||
|
||||
@@ -344,6 +344,11 @@ impl IndexMerger {
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
if let Some(doc_id_mapping) = doc_id_mapping {
|
||||
let stats = FastFieldStats {
|
||||
min_value,
|
||||
max_value,
|
||||
num_vals: doc_id_mapping.len() as u64,
|
||||
};
|
||||
#[derive(Clone)]
|
||||
struct SortedDocidFieldAccessProvider<'a> {
|
||||
doc_id_mapping: &'a Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
|
||||
@@ -355,11 +360,6 @@ impl IndexMerger {
|
||||
self.fast_field_readers[reader_with_ordinal.ordinal as usize].get(doc_id)
|
||||
}
|
||||
}
|
||||
let stats = FastFieldStats {
|
||||
min_value,
|
||||
max_value,
|
||||
num_vals: doc_id_mapping.len() as u64,
|
||||
};
|
||||
let fastfield_accessor = SortedDocidFieldAccessProvider {
|
||||
doc_id_mapping,
|
||||
fast_field_readers: &fast_field_readers,
|
||||
@@ -378,31 +378,86 @@ impl IndexMerger {
|
||||
|
||||
Ok(())
|
||||
} else {
|
||||
let u64_readers = self.readers.iter()
|
||||
.filter(|reader|reader.max_doc() != reader.delete_bitset().map(|bit_set|bit_set.len() as u32).unwrap_or(0))
|
||||
let num_vals = self
|
||||
.readers
|
||||
.iter()
|
||||
.map(|reader| reader.num_docs() as u64)
|
||||
.sum();
|
||||
|
||||
let stats = FastFieldStats {
|
||||
min_value,
|
||||
max_value,
|
||||
num_vals,
|
||||
};
|
||||
#[derive(Clone)]
|
||||
struct DocidFieldAccessProvider<'a> {
|
||||
segment_and_field_readers: Vec<(&'a SegmentReader, DynamicFastFieldReader<u64>)>,
|
||||
}
|
||||
impl<'a> FastFieldDataAccess for DocidFieldAccessProvider<'a> {
|
||||
fn get_val(&self, doc: u64) -> u64 {
|
||||
// Find the reader which will contain the doc_id.
|
||||
let mut num_docs_so_far = 0;
|
||||
let reader_ordinal = self
|
||||
.segment_and_field_readers
|
||||
.iter()
|
||||
.position(|(segment_reader, _)| {
|
||||
num_docs_so_far += segment_reader.num_docs() as u64;
|
||||
|
||||
num_docs_so_far > doc
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let (segment_reader, reader) =
|
||||
&self.segment_and_field_readers[reader_ordinal as usize];
|
||||
let pos_in_reader = doc - (num_docs_so_far - segment_reader.num_docs() as u64);
|
||||
|
||||
let docid = segment_reader
|
||||
.doc_ids_alive()
|
||||
.nth(pos_in_reader as usize)
|
||||
.expect(&format!(
|
||||
"unexpected error, could not find doc id in alive list docid {}, number of docids in segment {} ",
|
||||
pos_in_reader,
|
||||
segment_reader.doc_ids_alive().count()
|
||||
));
|
||||
reader.get(docid)
|
||||
}
|
||||
}
|
||||
let segment_and_field_readers = self.readers.iter()
|
||||
.map(|reader|{
|
||||
let u64_reader: DynamicFastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(field)
|
||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||
(reader.max_doc(), u64_reader, reader.delete_bitset())
|
||||
(reader, u64_reader)
|
||||
}).collect::<Vec<_>>();
|
||||
|
||||
let mut fast_single_field_serializer =
|
||||
fast_field_serializer.new_u64_fast_field(field, min_value, max_value)?;
|
||||
for (max_doc, u64_reader, delete_bitset_opt) in u64_readers {
|
||||
for doc_id in 0u32..max_doc {
|
||||
let is_deleted = delete_bitset_opt
|
||||
.map(|delete_bitset| delete_bitset.is_deleted(doc_id))
|
||||
.unwrap_or(false);
|
||||
if !is_deleted {
|
||||
let val = u64_reader.get(doc_id);
|
||||
fast_single_field_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
let iter1 = segment_and_field_readers
|
||||
.iter()
|
||||
.map(|(reader, u64_reader)| {
|
||||
reader
|
||||
.doc_ids_alive()
|
||||
.map(move |doc_id| u64_reader.get(doc_id))
|
||||
})
|
||||
.flatten();
|
||||
let iter2 = segment_and_field_readers
|
||||
.iter()
|
||||
.map(|(reader, u64_reader)| {
|
||||
reader
|
||||
.doc_ids_alive()
|
||||
.map(move |doc_id| u64_reader.get(doc_id))
|
||||
})
|
||||
.flatten();
|
||||
|
||||
fast_single_field_serializer.close_field()?;
|
||||
let fastfield_accessor = DocidFieldAccessProvider {
|
||||
segment_and_field_readers: segment_and_field_readers.clone(),
|
||||
};
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
||||
field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
iter1,
|
||||
iter2,
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -747,16 +802,21 @@ impl IndexMerger {
|
||||
min_value,
|
||||
};
|
||||
if let Some(doc_id_mapping) = doc_id_mapping {
|
||||
struct SortedDocidFieldAccessProvider<'a> {
|
||||
struct SortedDocidMultiValueAccessProvider<'a> {
|
||||
doc_id_mapping: &'a Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
|
||||
fast_field_readers: &'a Vec<MultiValuedFastFieldReader<u64>>,
|
||||
offsets: Vec<u64>,
|
||||
}
|
||||
impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> {
|
||||
impl<'a> FastFieldDataAccess for SortedDocidMultiValueAccessProvider<'a> {
|
||||
fn get_val(&self, pos: u64) -> u64 {
|
||||
// use the offsets index to find the doc_id which will contain the position.
|
||||
// the offsets are strictly increasing so we can do a simple search on them.
|
||||
let new_docid = self.offsets.iter().position(|&x| x > pos).unwrap() - 1;
|
||||
let new_docid = self
|
||||
.offsets
|
||||
.iter()
|
||||
.position(|&offset| offset > pos)
|
||||
.unwrap()
|
||||
- 1;
|
||||
|
||||
// now we need to find the position of `pos` in the multivalued bucket
|
||||
let num_pos_covered_until_now = self.offsets[new_docid];
|
||||
@@ -773,7 +833,7 @@ impl IndexMerger {
|
||||
vals[pos_in_values as usize]
|
||||
}
|
||||
}
|
||||
let fastfield_accessor = SortedDocidFieldAccessProvider {
|
||||
let fastfield_accessor = SortedDocidMultiValueAccessProvider {
|
||||
doc_id_mapping,
|
||||
fast_field_readers: &fast_field_readers,
|
||||
offsets,
|
||||
|
||||
Reference in New Issue
Block a user