From 27c9fa6028b255c0b3aaa17c6901141dc19e9ffd Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 25 Feb 2019 22:33:17 +0900 Subject: [PATCH] Jannickj prove bug with facets (#508) * prove bug with facets * Closing #505 Introduce a term id in the TermHashMap --- src/fastfield/multivalued/mod.rs | 14 +++++++++ src/fastfield/multivalued/writer.rs | 2 +- src/indexer/index_writer.rs | 6 ++-- src/postings/mod.rs | 1 - src/postings/postings_writer.rs | 46 ++++++++++++++++------------ src/postings/stacker/memory_arena.rs | 1 + src/postings/stacker/mod.rs | 2 +- src/postings/stacker/term_hashmap.rs | 26 ++++++++++------ 8 files changed, 63 insertions(+), 35 deletions(-) diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index 733f211bb..c1cd2dd7f 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -8,6 +8,7 @@ pub use self::writer::MultiValueIntFastFieldWriter; mod tests { use schema::Cardinality; + use schema::Facet; use schema::IntOptions; use schema::Schema; use Index; @@ -85,4 +86,17 @@ mod tests { assert_eq!(&vals, &[-5i64, -20i64, 1i64]); } } + #[test] + #[ignore] + fn test_many_facets() { + let mut schema_builder = Schema::builder(); + let field = schema_builder.add_facet_field("facetfield"); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + for i in 0..100_000 { + index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str()))); + } + assert!(index_writer.commit().is_ok()); + } } diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index e5fd45203..a4186ffd7 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -32,7 +32,7 @@ use DocId; /// term ids when the segment is getting serialized. pub struct MultiValueIntFastFieldWriter { field: Field, - vals: Vec, + vals: Vec, doc_index: Vec, is_facet: bool, } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 8a43b0c3a..2272fb394 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1008,9 +1008,9 @@ mod tests { #[test] fn test_hashmap_size() { - assert_eq!(initial_table_size(100_000), 12); - assert_eq!(initial_table_size(1_000_000), 15); - assert_eq!(initial_table_size(10_000_000), 18); + assert_eq!(initial_table_size(100_000), 11); + assert_eq!(initial_table_size(1_000_000), 14); + assert_eq!(initial_table_size(10_000_000), 17); assert_eq!(initial_table_size(1_000_000_000), 19); } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index c94e887a9..eddb10799 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -31,7 +31,6 @@ pub(crate) use self::stacker::compute_table_size; pub use common::HasLen; pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32; - pub(crate) type UnorderedTermId = u64; #[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))] diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index c408c15df..d70d148a5 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -51,6 +51,30 @@ pub struct MultiFieldPostingsWriter { per_field_postings_writers: Vec>, } + +fn make_field_partition(term_offsets: & Vec<(&[u8], Addr, UnorderedTermId)>) -> Vec<(Field, usize, usize)> { + let term_offsets_it = term_offsets + .iter() + .map(|(key, _, _)| Term::wrap(key).field()) + .enumerate(); + let mut prev_field = Field(u32::max_value()); + let mut fields = vec![]; + let mut offsets = vec![]; + for (offset, field) in term_offsets_it { + if field != prev_field { + prev_field = field; + fields.push(field); + offsets.push(offset); + } + } + offsets .push(term_offsets.len()); + let mut field_offsets = vec![]; + for i in 0..fields.len() { + field_offsets.push((fields[i], offsets[i] , offsets[i+1])); + } + field_offsets +} + impl MultiFieldPostingsWriter { /// Create a new `MultiFieldPostingsWriter` given /// a schema and a heap. @@ -99,33 +123,15 @@ impl MultiFieldPostingsWriter { let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self .term_index .iter() - .map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId)) .collect(); term_offsets.sort_unstable_by_key(|&(k, _, _)| k); - let mut offsets: Vec<(Field, usize)> = vec![]; - let term_offsets_it = term_offsets - .iter() - .cloned() - .map(|(key, _, _)| Term::wrap(key).field()) - .enumerate(); - let mut unordered_term_mappings: HashMap> = HashMap::new(); - let mut prev_field = Field(u32::max_value()); - for (offset, field) in term_offsets_it { - if field != prev_field { - offsets.push((field, offset)); - prev_field = field; - } - } - offsets.push((Field(0), term_offsets.len())); - - for i in 0..(offsets.len() - 1) { - let (field, start) = offsets[i]; - let (_, stop) = offsets[i + 1]; + let field_offsets = make_field_partition(&term_offsets); + for (field, start, stop) in field_offsets { let field_entry = self.schema.get_field_entry(field); match *field_entry.field_type() { diff --git a/src/postings/stacker/memory_arena.rs b/src/postings/stacker/memory_arena.rs index 816492d28..973c1dbd0 100644 --- a/src/postings/stacker/memory_arena.rs +++ b/src/postings/stacker/memory_arena.rs @@ -40,6 +40,7 @@ const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large #[derive(Copy, Clone, Debug)] pub struct Addr(u32); + impl Addr { /// Creates a null pointer. pub fn null_pointer() -> Addr { diff --git a/src/postings/stacker/mod.rs b/src/postings/stacker/mod.rs index be80510d7..0955f4e32 100644 --- a/src/postings/stacker/mod.rs +++ b/src/postings/stacker/mod.rs @@ -4,4 +4,4 @@ mod term_hashmap; pub use self::expull::ExpUnrolledLinkedList; pub use self::memory_arena::{Addr, MemoryArena}; -pub use self::term_hashmap::{compute_table_size, TermHashMap}; +pub use self::term_hashmap::{compute_table_size, TermHashMap}; \ No newline at end of file diff --git a/src/postings/stacker/term_hashmap.rs b/src/postings/stacker/term_hashmap.rs index 072f8d77c..b2c7a26c9 100644 --- a/src/postings/stacker/term_hashmap.rs +++ b/src/postings/stacker/term_hashmap.rs @@ -8,8 +8,7 @@ use postings::stacker::memory_arena::store; use std::iter; use std::mem; use std::slice; - -pub type BucketId = usize; +use postings::UnorderedTermId; /// Returns the actual memory size in bytes /// required to create a table of size $2^num_bits$. @@ -28,6 +27,7 @@ pub fn compute_table_size(num_bits: usize) -> usize { struct KeyValue { key_value_addr: Addr, hash: u32, + unordered_term_id: UnorderedTermId, } impl Default for KeyValue { @@ -35,6 +35,7 @@ impl Default for KeyValue { KeyValue { key_value_addr: Addr::null_pointer(), hash: 0u32, + unordered_term_id: UnorderedTermId::default(), } } } @@ -59,6 +60,7 @@ pub struct TermHashMap { pub heap: MemoryArena, mask: usize, occupied: Vec, + len: usize } struct QuadraticProbing { @@ -85,13 +87,13 @@ pub struct Iter<'a> { } impl<'a> Iterator for Iter<'a> { - type Item = (&'a [u8], Addr, BucketId); + type Item = (&'a [u8], Addr, UnorderedTermId); fn next(&mut self) -> Option { self.inner.next().cloned().map(move |bucket: usize| { let kv = self.hashmap.table[bucket]; let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr); - (key, offset, bucket as BucketId) + (key, offset, kv.unordered_term_id) }) } } @@ -106,6 +108,7 @@ impl TermHashMap { heap, mask: table_size - 1, occupied: Vec::with_capacity(table_size / 2), + len: 0 } } @@ -139,12 +142,16 @@ impl TermHashMap { } } - pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) { + fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) -> UnorderedTermId { self.occupied.push(bucket); + let unordered_term_id = self.len as UnorderedTermId; + self.len += 1; self.table[bucket] = KeyValue { key_value_addr, hash, + unordered_term_id }; + unordered_term_id } pub fn iter(&self) -> Iter { @@ -184,7 +191,7 @@ impl TermHashMap { /// will be in charge of returning a default value. /// If the key already as an associated value, then it will be passed /// `Some(previous_value)`. - pub fn mutate_or_create(&mut self, key: S, mut updater: TMutator) -> BucketId + pub fn mutate_or_create(&mut self, key: S, mut updater: TMutator) -> UnorderedTermId where S: AsRef<[u8]>, V: Copy + 'static, @@ -200,6 +207,7 @@ impl TermHashMap { let bucket = probe.next_probe(); let kv: KeyValue = self.table[bucket]; if kv.is_empty() { + // The key does not exists yet. let val = updater(None); let num_bytes = std::mem::size_of::() + key_bytes.len() + std::mem::size_of::(); @@ -211,16 +219,16 @@ impl TermHashMap { data[2..stop].copy_from_slice(key_bytes); store(&mut data[stop..], val); } - self.set_bucket(hash, key_addr, bucket); - return bucket as BucketId; + return self.set_bucket(hash, key_addr, bucket); } else if kv.hash == hash { if let Some(val_addr) = self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr) { + let v = self.heap.read(val_addr); let new_v = updater(Some(v)); self.heap.write_at(val_addr, new_v); - return bucket as BucketId; + return kv.unordered_term_id; } } }