From c62ddb61b7f65712ce32138458cf068a8e2456cc Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 6 Sep 2022 12:47:49 +0800 Subject: [PATCH] rename, add position to docid function --- src/fastfield/bytes/reader.rs | 5 +++ src/fastfield/fast_value.rs | 1 - src/fastfield/mod.rs | 8 +++- src/fastfield/multivalued/reader.rs | 65 ++++++++++++++++++----------- 4 files changed, 52 insertions(+), 27 deletions(-) diff --git a/src/fastfield/bytes/reader.rs b/src/fastfield/bytes/reader.rs index 6726202d6..2d37ce664 100644 --- a/src/fastfield/bytes/reader.rs +++ b/src/fastfield/bytes/reader.rs @@ -52,6 +52,11 @@ impl BytesFastFieldReader { } impl MultiValueLength for BytesFastFieldReader { + fn get_range(&self, doc_id: DocId) -> std::ops::Range { + let (start, stop) = self.range(doc_id); + start as u64..stop as u64 + } + fn get_len(&self, doc_id: DocId) -> u64 { self.num_bytes(doc_id) as u64 } diff --git a/src/fastfield/fast_value.rs b/src/fastfield/fast_value.rs index 64896d3a1..bc2b2e6b3 100644 --- a/src/fastfield/fast_value.rs +++ b/src/fastfield/fast_value.rs @@ -16,7 +16,6 @@ pub trait FastValueU128: Clone + Copy + Send + Sync + PartialOrd + 'static { /// Converts a value from u128 /// /// Internally all fast field values are encoded as u128. - /// **Note: To be used for converting encoded Term, Posting values.** fn from_u128(val: u128) -> Self; /// Converts a value to u128. diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 8656a0e38..1ee6f72a7 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -20,6 +20,8 @@ //! //! Read access performance is comparable to that of an array lookup. 
+use std::collections::btree_map::Range; + pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet}; pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::error::{FastFieldNotAvailableError, Result}; @@ -75,6 +77,8 @@ fn value_to_u64(value: &Value) -> u64 { /// Trait for `BytesFastFieldReader` and `MultiValuedFastFieldReader` to return the length of data /// for a doc_id pub trait MultiValueLength { + /// returns the positions of values associated to a doc_id + fn get_range(&self, doc_id: DocId) -> std::ops::Range; /// returns the num of values associated to a doc_id fn get_len(&self, doc_id: DocId) -> u64; /// returns the sum of num values for all doc_ids @@ -510,8 +514,8 @@ mod tests { // multi value let ip_addr_fast_field = fast_fields.ip_addrs(ips_field).unwrap(); - assert_eq!(ip_addr_fast_field.get_val(0), None); - assert_eq!(ip_addr_fast_field.get_val(1), Some(ip2)); + assert_eq!(ip_addr_fast_field.get_first_val(0), None); + assert_eq!(ip_addr_fast_field.get_first_val(1), Some(ip2)); let mut out = vec![]; ip_addr_fast_field.get_vals(0, &mut out); diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index 7c19a2193..80e1be822 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -89,10 +89,12 @@ impl MultiValuedFastFieldReader { } impl MultiValueLength for MultiValuedFastFieldReader { + fn get_range(&self, doc_id: DocId) -> std::ops::Range { + self.range(doc_id) + } fn get_len(&self, doc_id: DocId) -> u64 { self.num_vals(doc_id) as u64 } - fn get_total_len(&self) -> u64 { self.total_num_vals() as u64 } @@ -133,7 +135,7 @@ impl MultiValuedU128FastFieldReader { /// Returns the array of values associated to the given `doc`. 
#[inline] - pub fn get_val(&self, doc: DocId) -> Option { + pub fn get_first_val(&self, doc: DocId) -> Option { let range = self.range(doc); if range.is_empty() { return None; @@ -157,30 +159,10 @@ impl MultiValuedU128FastFieldReader { } /// Returns all docids which are in the provided value range - pub fn get_between_vals(&self, range: RangeInclusive) -> Vec { + pub fn get_between_vals(&self, range: RangeInclusive) -> Vec { let positions = self.vals_reader.get_between_vals(range); - // Now we need to convert the positions to docids - let mut docs = vec![]; - let mut cursor = 0usize; - let mut last_doc = None; - for pos in positions { - loop { - let range = self.range(cursor as u32); - if range.contains(&(pos as u64)) { - // avoid duplicates - if Some(cursor) == last_doc { - break; - } - docs.push(cursor); - last_doc = Some(cursor); - break; - } - cursor += 1; - } - } - - docs + positions_to_docids(&positions, self) } /// Iterates over all elements in the fast field @@ -220,7 +202,42 @@ impl MultiValuedU128FastFieldReader { } } +/// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds. +/// +/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the index. +/// +/// Correctness: `positions` must be sorted in ascending order. +/// +/// TODO: Instead of a linear scan we can employ a binary search to match a docid to its value +/// position. 
+fn positions_to_docids(positions: &[usize], multival_idx: &T) -> Vec { + let mut docs = vec![]; + let mut cur_doc = 0u32; + let mut last_doc = None; + + for pos in positions { + loop { + let range = multival_idx.get_range(cur_doc); + if range.contains(&(*pos as u64)) { + // avoid duplicates + if Some(cur_doc) == last_doc { + break; + } + docs.push(cur_doc); + last_doc = Some(cur_doc); + break; + } + cur_doc += 1; + } + } + + docs +} + impl MultiValueLength for MultiValuedU128FastFieldReader { + fn get_range(&self, doc_id: DocId) -> std::ops::Range { + self.range(doc_id) + } fn get_len(&self, doc_id: DocId) -> u64 { self.num_vals(doc_id) as u64 }