add dynamic fastfield case

add dynamic fastfield for single fast field unsorted
fix scary documentation bug
add num_deleted instead of len
This commit is contained in:
Pascal Seitz
2021-06-24 12:22:27 +02:00
parent 6ba302c481
commit 8526434b63
3 changed files with 118 additions and 31 deletions

View File

@@ -1,4 +1,3 @@
use crate::common::HasLen;
use crate::core::InvertedIndexReader;
use crate::core::Segment;
use crate::core::SegmentComponent;
@@ -63,11 +62,9 @@ impl SegmentReader {
self.max_doc
}
/// Returns the number of alive documents.
/// Deleted documents are not counted, so this can be smaller than `max_doc`.
pub fn num_docs(&self) -> DocId {
self.num_docs
}
@@ -81,7 +78,7 @@ impl SegmentReader {
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
    // Ask the delete bitset for its deleted-document count; when the segment
    // has no delete bitset the count is 0.
    self.delete_bitset()
        .map(|delete_set| delete_set.num_deleted() as DocId)
        .unwrap_or(0u32)
}
@@ -329,6 +326,32 @@ mod test {
use crate::schema::{Schema, Term, STORED, TEXT};
use crate::DocId;
#[test]
fn test_num_alive() -> crate::Result<()> {
    // Schema with a single stored text field.
    let mut builder = Schema::builder();
    let name = builder.add_text_field("name", TEXT | STORED);
    let index = Index::create_in_ram(builder.build());
    {
        let mut writer = index.writer_for_tests()?;
        // Four documents are added; the assertions below expect them to end up
        // in the first (and only inspected) segment.
        for &text in &["tantivy", "horse", "jockey", "cap"] {
            writer.add_document(doc!(name => text));
        }
        // Two of the four documents are deleted before committing.
        for &text in &["horse", "cap"] {
            writer.delete_term(Term::from_field_text(name, text));
        }
        writer.commit()?;
    }
    let searcher = index.reader()?.searcher();
    let segment_reader = searcher.segment_reader(0);
    // `num_docs` counts only alive documents, while `max_doc` still covers the
    // deleted ones.
    assert_eq!(2, segment_reader.num_docs());
    assert_eq!(4, segment_reader.max_doc());
    Ok(())
}
#[test]
fn test_alive_docs_iterator() -> crate::Result<()> {
let mut schema_builder = Schema::builder();

View File

@@ -91,6 +91,10 @@ impl DeleteBitSet {
b & (1u8 << shift) != 0
}
/// Returns the number of deleted documents, as recorded in the
/// `num_deleted` field of this bitset.
pub fn num_deleted(&self) -> usize {
self.num_deleted
}
/// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount {
self.data.len()

View File

@@ -344,6 +344,11 @@ impl IndexMerger {
})
.collect::<Vec<_>>();
if let Some(doc_id_mapping) = doc_id_mapping {
let stats = FastFieldStats {
min_value,
max_value,
num_vals: doc_id_mapping.len() as u64,
};
#[derive(Clone)]
struct SortedDocidFieldAccessProvider<'a> {
doc_id_mapping: &'a Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
@@ -355,11 +360,6 @@ impl IndexMerger {
self.fast_field_readers[reader_with_ordinal.ordinal as usize].get(doc_id)
}
}
let stats = FastFieldStats {
min_value,
max_value,
num_vals: doc_id_mapping.len() as u64,
};
let fastfield_accessor = SortedDocidFieldAccessProvider {
doc_id_mapping,
fast_field_readers: &fast_field_readers,
@@ -378,31 +378,86 @@ impl IndexMerger {
Ok(())
} else {
let u64_readers = self.readers.iter()
.filter(|reader|reader.max_doc() != reader.delete_bitset().map(|bit_set|bit_set.len() as u32).unwrap_or(0))
let num_vals = self
.readers
.iter()
.map(|reader| reader.num_docs() as u64)
.sum();
let stats = FastFieldStats {
min_value,
max_value,
num_vals,
};
/// Value accessor over several segments: each entry pairs a segment reader
/// with a `u64` fast field reader for that segment.
#[derive(Clone)]
struct DocidFieldAccessProvider<'a> {
// One `(segment reader, fast field reader)` pair per entry; `get_val`
// walks these in order to locate the segment holding a given position.
segment_and_field_readers: Vec<(&'a SegmentReader, DynamicFastFieldReader<u64>)>,
}
impl<'a> FastFieldDataAccess for DocidFieldAccessProvider<'a> {
    /// Returns the fast field value of the `doc`-th alive document, where alive
    /// documents are counted across all segments in the order of
    /// `segment_and_field_readers`.
    ///
    /// # Panics
    ///
    /// Panics if `doc` is not covered by the cumulated `num_docs()` of the
    /// segments, or if the owning segment yields fewer alive doc ids than
    /// its `num_docs()` announces.
    fn get_val(&self, doc: u64) -> u64 {
        // Find the reader which will contain the doc_id: accumulate the alive
        // document counts until the running total exceeds `doc`.
        let mut num_docs_so_far = 0u64;
        let reader_ordinal = self
            .segment_and_field_readers
            .iter()
            .position(|(segment_reader, _)| {
                num_docs_so_far += u64::from(segment_reader.num_docs());
                num_docs_so_far > doc
            })
            .expect("doc position exceeds the total number of alive documents");
        let (segment_reader, reader) = &self.segment_and_field_readers[reader_ordinal];

        // `num_docs_so_far` already includes the owning segment; subtract its
        // count again to get the position relative to that segment's start.
        let docs_before_segment = num_docs_so_far - u64::from(segment_reader.num_docs());
        let pos_in_reader = doc - docs_before_segment;

        // NOTE(review): `nth` presumably walks the alive-doc iterator from the
        // start on every call — confirm whether this is acceptable for large segments.
        let docid = segment_reader
            .doc_ids_alive()
            .nth(pos_in_reader as usize)
            // Build the message (and its extra counting pass) only on failure.
            .unwrap_or_else(|| {
                panic!(
                    "unexpected error, could not find doc id in alive list docid {}, number of docids in segment {} ",
                    pos_in_reader,
                    segment_reader.doc_ids_alive().count()
                )
            });
        reader.get(docid)
    }
}
let segment_and_field_readers = self.readers.iter()
.map(|reader|{
let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
(reader.max_doc(), u64_reader, reader.delete_bitset())
(reader, u64_reader)
}).collect::<Vec<_>>();
let mut fast_single_field_serializer =
fast_field_serializer.new_u64_fast_field(field, min_value, max_value)?;
for (max_doc, u64_reader, delete_bitset_opt) in u64_readers {
for doc_id in 0u32..max_doc {
let is_deleted = delete_bitset_opt
.map(|delete_bitset| delete_bitset.is_deleted(doc_id))
.unwrap_or(false);
if !is_deleted {
let val = u64_reader.get(doc_id);
fast_single_field_serializer.add_val(val)?;
}
}
}
let iter1 = segment_and_field_readers
.iter()
.map(|(reader, u64_reader)| {
reader
.doc_ids_alive()
.map(move |doc_id| u64_reader.get(doc_id))
})
.flatten();
let iter2 = segment_and_field_readers
.iter()
.map(|(reader, u64_reader)| {
reader
.doc_ids_alive()
.map(move |doc_id| u64_reader.get(doc_id))
})
.flatten();
fast_single_field_serializer.close_field()?;
let fastfield_accessor = DocidFieldAccessProvider {
segment_and_field_readers: segment_and_field_readers.clone(),
};
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
fastfield_accessor,
iter1,
iter2,
)?;
Ok(())
}
}
@@ -747,16 +802,21 @@ impl IndexMerger {
min_value,
};
if let Some(doc_id_mapping) = doc_id_mapping {
struct SortedDocidFieldAccessProvider<'a> {
struct SortedDocidMultiValueAccessProvider<'a> {
doc_id_mapping: &'a Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
fast_field_readers: &'a Vec<MultiValuedFastFieldReader<u64>>,
offsets: Vec<u64>,
}
impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> {
impl<'a> FastFieldDataAccess for SortedDocidMultiValueAccessProvider<'a> {
fn get_val(&self, pos: u64) -> u64 {
// use the offsets index to find the doc_id which will contain the position.
// the offsets are strictly increasing so we can do a simple search on it.
let new_docid = self.offsets.iter().position(|&x| x > pos).unwrap() - 1;
let new_docid = self
.offsets
.iter()
.position(|&offset| offset > pos)
.unwrap()
- 1;
// now we need to find the position of `pos` in the multivalued bucket
let num_pos_covered_until_now = self.offsets[new_docid];
@@ -773,7 +833,7 @@ impl IndexMerger {
vals[pos_in_values as usize]
}
}
let fastfield_accessor = SortedDocidFieldAccessProvider {
let fastfield_accessor = SortedDocidMultiValueAccessProvider {
doc_id_mapping,
fast_field_readers: &fast_field_readers,
offsets,