mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-03 00:50:41 +00:00
Jannickj prove bug with facets (#508)
* prove bug with facets * Closing #505 Introduce a term id in the TermHashMap
This commit is contained in:
@@ -8,6 +8,7 @@ pub use self::writer::MultiValueIntFastFieldWriter;
|
||||
mod tests {
|
||||
|
||||
use schema::Cardinality;
|
||||
use schema::Facet;
|
||||
use schema::IntOptions;
|
||||
use schema::Schema;
|
||||
use Index;
|
||||
@@ -85,4 +86,17 @@ mod tests {
|
||||
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_many_facets() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_facet_field("facetfield");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
for i in 0..100_000 {
|
||||
index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ use DocId;
|
||||
/// term ids when the segment is getting serialized.
|
||||
pub struct MultiValueIntFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u64>,
|
||||
vals: Vec<UnorderedTermId>,
|
||||
doc_index: Vec<u64>,
|
||||
is_facet: bool,
|
||||
}
|
||||
|
||||
@@ -1008,9 +1008,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(initial_table_size(100_000), 12);
|
||||
assert_eq!(initial_table_size(1_000_000), 15);
|
||||
assert_eq!(initial_table_size(10_000_000), 18);
|
||||
assert_eq!(initial_table_size(100_000), 11);
|
||||
assert_eq!(initial_table_size(1_000_000), 14);
|
||||
assert_eq!(initial_table_size(10_000_000), 17);
|
||||
assert_eq!(initial_table_size(1_000_000_000), 19);
|
||||
}
|
||||
|
||||
|
||||
@@ -31,7 +31,6 @@ pub(crate) use self::stacker::compute_table_size;
|
||||
pub use common::HasLen;
|
||||
|
||||
pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32;
|
||||
|
||||
pub(crate) type UnorderedTermId = u64;
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))]
|
||||
|
||||
@@ -51,6 +51,30 @@ pub struct MultiFieldPostingsWriter {
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter>>,
|
||||
}
|
||||
|
||||
|
||||
fn make_field_partition(term_offsets: & Vec<(&[u8], Addr, UnorderedTermId)>) -> Vec<(Field, usize, usize)> {
|
||||
let term_offsets_it = term_offsets
|
||||
.iter()
|
||||
.map(|(key, _, _)| Term::wrap(key).field())
|
||||
.enumerate();
|
||||
let mut prev_field = Field(u32::max_value());
|
||||
let mut fields = vec![];
|
||||
let mut offsets = vec![];
|
||||
for (offset, field) in term_offsets_it {
|
||||
if field != prev_field {
|
||||
prev_field = field;
|
||||
fields.push(field);
|
||||
offsets.push(offset);
|
||||
}
|
||||
}
|
||||
offsets .push(term_offsets.len());
|
||||
let mut field_offsets = vec![];
|
||||
for i in 0..fields.len() {
|
||||
field_offsets.push((fields[i], offsets[i] , offsets[i+1]));
|
||||
}
|
||||
field_offsets
|
||||
}
|
||||
|
||||
impl MultiFieldPostingsWriter {
|
||||
/// Create a new `MultiFieldPostingsWriter` given
|
||||
/// a schema and a heap.
|
||||
@@ -99,33 +123,15 @@ impl MultiFieldPostingsWriter {
|
||||
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self
|
||||
.term_index
|
||||
.iter()
|
||||
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
|
||||
.collect();
|
||||
term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
|
||||
|
||||
let mut offsets: Vec<(Field, usize)> = vec![];
|
||||
let term_offsets_it = term_offsets
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|(key, _, _)| Term::wrap(key).field())
|
||||
.enumerate();
|
||||
|
||||
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
|
||||
HashMap::new();
|
||||
|
||||
let mut prev_field = Field(u32::max_value());
|
||||
for (offset, field) in term_offsets_it {
|
||||
if field != prev_field {
|
||||
offsets.push((field, offset));
|
||||
prev_field = field;
|
||||
}
|
||||
}
|
||||
offsets.push((Field(0), term_offsets.len()));
|
||||
|
||||
for i in 0..(offsets.len() - 1) {
|
||||
let (field, start) = offsets[i];
|
||||
let (_, stop) = offsets[i + 1];
|
||||
let field_offsets = make_field_partition(&term_offsets);
|
||||
|
||||
for (field, start, stop) in field_offsets {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
|
||||
match *field_entry.field_type() {
|
||||
|
||||
@@ -40,6 +40,7 @@ const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct Addr(u32);
|
||||
|
||||
|
||||
impl Addr {
|
||||
/// Creates a null pointer.
|
||||
pub fn null_pointer() -> Addr {
|
||||
|
||||
@@ -4,4 +4,4 @@ mod term_hashmap;
|
||||
|
||||
pub use self::expull::ExpUnrolledLinkedList;
|
||||
pub use self::memory_arena::{Addr, MemoryArena};
|
||||
pub use self::term_hashmap::{compute_table_size, TermHashMap};
|
||||
pub use self::term_hashmap::{compute_table_size, TermHashMap};
|
||||
@@ -8,8 +8,7 @@ use postings::stacker::memory_arena::store;
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
use std::slice;
|
||||
|
||||
pub type BucketId = usize;
|
||||
use postings::UnorderedTermId;
|
||||
|
||||
/// Returns the actual memory size in bytes
|
||||
/// required to create a table of size $2^num_bits$.
|
||||
@@ -28,6 +27,7 @@ pub fn compute_table_size(num_bits: usize) -> usize {
|
||||
struct KeyValue {
|
||||
key_value_addr: Addr,
|
||||
hash: u32,
|
||||
unordered_term_id: UnorderedTermId,
|
||||
}
|
||||
|
||||
impl Default for KeyValue {
|
||||
@@ -35,6 +35,7 @@ impl Default for KeyValue {
|
||||
KeyValue {
|
||||
key_value_addr: Addr::null_pointer(),
|
||||
hash: 0u32,
|
||||
unordered_term_id: UnorderedTermId::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -59,6 +60,7 @@ pub struct TermHashMap {
|
||||
pub heap: MemoryArena,
|
||||
mask: usize,
|
||||
occupied: Vec<usize>,
|
||||
len: usize
|
||||
}
|
||||
|
||||
struct QuadraticProbing {
|
||||
@@ -85,13 +87,13 @@ pub struct Iter<'a> {
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Iter<'a> {
|
||||
type Item = (&'a [u8], Addr, BucketId);
|
||||
type Item = (&'a [u8], Addr, UnorderedTermId);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.inner.next().cloned().map(move |bucket: usize| {
|
||||
let kv = self.hashmap.table[bucket];
|
||||
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
|
||||
(key, offset, bucket as BucketId)
|
||||
(key, offset, kv.unordered_term_id)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -106,6 +108,7 @@ impl TermHashMap {
|
||||
heap,
|
||||
mask: table_size - 1,
|
||||
occupied: Vec::with_capacity(table_size / 2),
|
||||
len: 0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -139,12 +142,16 @@ impl TermHashMap {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
|
||||
fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) -> UnorderedTermId {
|
||||
self.occupied.push(bucket);
|
||||
let unordered_term_id = self.len as UnorderedTermId;
|
||||
self.len += 1;
|
||||
self.table[bucket] = KeyValue {
|
||||
key_value_addr,
|
||||
hash,
|
||||
unordered_term_id
|
||||
};
|
||||
unordered_term_id
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Iter {
|
||||
@@ -184,7 +191,7 @@ impl TermHashMap {
|
||||
/// will be in charge of returning a default value.
|
||||
/// If the key already as an associated value, then it will be passed
|
||||
/// `Some(previous_value)`.
|
||||
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
|
||||
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> UnorderedTermId
|
||||
where
|
||||
S: AsRef<[u8]>,
|
||||
V: Copy + 'static,
|
||||
@@ -200,6 +207,7 @@ impl TermHashMap {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
// The key does not exists yet.
|
||||
let val = updater(None);
|
||||
let num_bytes =
|
||||
std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
|
||||
@@ -211,16 +219,16 @@ impl TermHashMap {
|
||||
data[2..stop].copy_from_slice(key_bytes);
|
||||
store(&mut data[stop..], val);
|
||||
}
|
||||
self.set_bucket(hash, key_addr, bucket);
|
||||
return bucket as BucketId;
|
||||
return self.set_bucket(hash, key_addr, bucket);
|
||||
} else if kv.hash == hash {
|
||||
if let Some(val_addr) =
|
||||
self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
|
||||
{
|
||||
|
||||
let v = self.heap.read(val_addr);
|
||||
let new_v = updater(Some(v));
|
||||
self.heap.write_at(val_addr, new_v);
|
||||
return bucket as BucketId;
|
||||
return kv.unordered_term_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user