Files
tantivy/sstable/src/dictionary.rs
Paul Masurel 04beab3b29 Performance improvement for nested cardinality aggregation
When a string cardinality aggregation is nested, it ends up being applied to different buckets.
Dictionary encoding relies on a different dictionary for each segment.

As a result, during segment collection, we only collect term ordinals in a HashSet, and decode them in the
term dictionary at the end of collection.

Before this PR, this decoding phase was done once for each bucket, causing the same work to be done over and over. This PR introduces a coupon cache. The HLL sketch relies on a hash of the string values.

We populate the cache before bucket collection, and get our values from it.

This PR also renames "caching" to "buffering" in aggregation (it was never caching), and does several cleanups.
2026-04-10 14:51:00 +02:00

1151 lines
43 KiB
Rust

#![allow(clippy::needless_borrows_for_generic_args)]
use std::cmp::Ordering;
use std::io;
use std::marker::PhantomData;
use std::ops::{Bound, RangeBounds};
use std::sync::Arc;
use common::bounds::{TransformBound, transform_bound_inner_res};
use common::file_slice::FileSlice;
use common::{BinarySerializable, ByteCount, OwnedBytes};
use futures_util::{StreamExt, TryStreamExt, stream};
use itertools::Itertools;
use tantivy_fst::Automaton;
use tantivy_fst::automaton::AlwaysMatch;
use crate::sstable_index_v3::SSTableIndexV3Empty;
use crate::streamer::{Streamer, StreamerBuilder};
use crate::{
BlockAddr, DeltaReader, Reader, SSTable, SSTableIndex, SSTableIndexV3, TermOrdinal, VoidSSTable,
};
/// An SSTable is a sorted map that associates sorted `&[u8]` keys
/// to any kind of typed values.
///
/// The SSTable is organized in blocks.
/// In each block, keys and values are encoded separately.
///
/// The keys are encoded using incremental encoding.
/// The values on the other hand, are encoded according to a value-specific
/// codec defined in the TSSTable generic argument.
///
/// Finally, an index is joined to the Dictionary to make it possible,
/// given a key to identify which block contains this key.
///
/// The codec was designed in such a way that the sstable
/// reader is not aware of block, and yet can read any sequence of blocks,
/// as long as the slice of bytes it is given starts and stops at
/// block boundary.
///
/// (See also README.md)
#[derive(Debug, Clone)]
pub struct Dictionary<TSSTable: SSTable = VoidSSTable> {
/// Slice of the file holding the encoded sstable blocks, i.e. everything
/// located before the block index.
pub sstable_slice: FileSlice,
/// Index used to locate the block that may contain a given key or ordinal.
pub sstable_index: SSTableIndex,
/// Size of the whole dictionary file (blocks, index and footer).
num_bytes: ByteCount,
/// Number of terms, as read from the footer.
num_terms: u64,
/// Binds the dictionary to its value codec without storing a `TSSTable`.
phantom_data: PhantomData<TSSTable>,
}
impl Dictionary<VoidSSTable> {
    /// Builds an in-memory, value-less dictionary holding `terms`.
    ///
    /// The terms are sorted before being written, so callers may pass them in
    /// any order. Any failure while writing or reopening the dictionary panics,
    /// which is acceptable for a test helper.
    pub fn build_for_tests(terms: &[&str]) -> Dictionary {
        let mut sorted_terms: Vec<&str> = terms.to_vec();
        sorted_terms.sort();
        let mut sstable_bytes: Vec<u8> = Vec::new();
        let mut writer = Self::builder(&mut sstable_bytes).unwrap();
        sorted_terms
            .into_iter()
            .for_each(|term| writer.insert(term, &()).unwrap());
        writer.finish().unwrap();
        Self::from_bytes(OwnedBytes::new(sstable_bytes)).unwrap()
    }
}
/// Outcome of looking up a term in the dictionary by key.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TermOrdHit {
    /// Exact term ord hit
    Exact(TermOrdinal),
    /// Next best term ordinal
    Next(TermOrdinal),
}
impl TermOrdHit {
    /// Returns the ordinal of an exact hit, and `None` for a `Next` hit.
    fn into_exact(self) -> Option<TermOrdinal> {
        if let Self::Exact(ordinal) = self {
            Some(ordinal)
        } else {
            None
        }
    }
    /// Applies `f` to the wrapped ordinal while keeping the kind of hit.
    fn map<F: FnOnce(TermOrdinal) -> TermOrdinal>(self, f: F) -> Self {
        match self {
            Self::Exact(ordinal) => Self::Exact(f(ordinal)),
            Self::Next(ordinal) => Self::Next(f(ordinal)),
        }
    }
}
impl<TSSTable: SSTable> Dictionary<TSSTable> {
/// Creates a writer that serializes an sstable of this dictionary's type
/// into `wrt`.
///
/// The current implementation always returns `Ok`.
pub fn builder<W: io::Write>(wrt: W) -> io::Result<crate::Writer<W, TSSTable::ValueWriter>> {
    let sstable_writer = TSSTable::writer(wrt);
    Ok(sstable_writer)
}
pub(crate) fn sstable_reader_block(
&self,
block_addr: BlockAddr,
) -> io::Result<Reader<TSSTable::ValueReader>> {
let data = self.sstable_slice.read_bytes_slice(block_addr.byte_range)?;
Ok(TSSTable::reader(data))
}
/// Asynchronously builds a delta reader over the blocks that may hold keys of
/// `key_range`.
///
/// When `automaton` matches everything from its start state, one contiguous
/// slice (see `file_slice_for_range`) is fetched and `limit` is honored.
/// Otherwise only the blocks selected for the automaton are fetched, with at
/// most 5 reads in flight, and `limit` is not used. In that second case,
/// blocks separated by at most `merge_holes_under_bytes` bytes are fetched
/// in a single read.
pub(crate) async fn sstable_delta_reader_for_key_range_async(
    &self,
    key_range: impl RangeBounds<[u8]>,
    limit: Option<u64>,
    automaton: &impl Automaton,
    merge_holes_under_bytes: usize,
) -> io::Result<DeltaReader<TSSTable::ValueReader>> {
    if automaton.will_always_match(&automaton.start()) {
        let contiguous_slice = self.file_slice_for_range(key_range, limit);
        let bytes = contiguous_slice.read_bytes_async().await?;
        return Ok(TSSTable::delta_reader(bytes));
    }
    let block_addrs = self.get_block_iterator_for_range_and_automaton(
        key_range,
        automaton,
        merge_holes_under_bytes,
    );
    let block_bytes = stream::iter(block_addrs)
        .map(|block_addr| {
            self.sstable_slice
                .read_bytes_slice_async(block_addr.byte_range)
        })
        .buffered(5)
        .try_collect::<Vec<_>>()
        .await?;
    Ok(DeltaReader::from_multiple_blocks(block_bytes))
}
/// Synchronous counterpart of `sstable_delta_reader_for_key_range_async`.
///
/// When `automaton` matches everything from its start state, one contiguous
/// slice is read and `limit` is honored. Otherwise each block selected for
/// the automaton is read on its own, stopping at the first I/O error.
pub(crate) fn sstable_delta_reader_for_key_range(
    &self,
    key_range: impl RangeBounds<[u8]>,
    limit: Option<u64>,
    automaton: &impl Automaton,
) -> io::Result<DeltaReader<TSSTable::ValueReader>> {
    if automaton.will_always_match(&automaton.start()) {
        let bytes = self.file_slice_for_range(key_range, limit).read_bytes()?;
        return Ok(TSSTable::delta_reader(bytes));
    }
    // if operations are sync, we assume latency is almost null, and there is no point in
    // merging across holes
    let mut block_bytes = Vec::new();
    for block_addr in self.get_block_iterator_for_range_and_automaton(key_range, automaton, 0) {
        block_bytes.push(self.sstable_slice.read_bytes_slice(block_addr.byte_range)?);
    }
    Ok(DeltaReader::from_multiple_blocks(block_bytes))
}
pub(crate) fn sstable_delta_reader_block(
&self,
block_addr: BlockAddr,
) -> io::Result<DeltaReader<TSSTable::ValueReader>> {
let data = self.sstable_slice.read_bytes_slice(block_addr.byte_range)?;
Ok(TSSTable::delta_reader(data))
}
pub(crate) async fn sstable_delta_reader_block_async(
&self,
block_addr: BlockAddr,
) -> io::Result<DeltaReader<TSSTable::ValueReader>> {
let data = self
.sstable_slice
.read_bytes_slice_async(block_addr.byte_range)
.await?;
Ok(TSSTable::delta_reader(data))
}
/// This function returns a file slice covering a set of sstable blocks
/// that include the key range passed in arguments. Optionally returns
/// only the blocks needed for up to `limit` matching terms.
///
/// It works by identifying
/// - `first_block`: the block containing the start boundary key
/// - `last_block`: the block containing the end boundary key.
///
/// And then returning the range that spans over all blocks between,
/// and including, `first_block` and `last_block`, aka:
/// `[first_block.start_offset .. last_block.end_offset)`
///
/// When `limit` is set, `last_block` is additionally capped to the block
/// holding ordinal `second_block.first_ordinal + limit`, where `second_block`
/// is the block following `first_block` (or block 0 if the start is unbounded).
/// The sum saturates, so a very large `limit` behaves like a limit reaching
/// the end of the dictionary instead of overflowing.
///
/// Technically this function does not provide the tightest fit, as
/// for simplification, it treats the start bound of the `key_range`
/// as if it was inclusive, even if it is exclusive.
/// On the rare edge case where a user asks for `(start_key, end_key]`
/// and `start_key` happens to be the last key of a block, we return a
/// slice whose first block was not necessary.
///
/// Returns an empty slice when the index cannot locate a block for the
/// start key.
pub fn file_slice_for_range(
    &self,
    key_range: impl RangeBounds<[u8]>,
    limit: Option<u64>,
) -> FileSlice {
    let first_block_id = match key_range.start_bound() {
        Bound::Included(key) | Bound::Excluded(key) => {
            let Some(first_block_id) = self.sstable_index.locate_with_key(key) else {
                // No block may contain the start key: nothing to read.
                return FileSlice::empty();
            };
            Some(first_block_id)
        }
        Bound::Unbounded => None,
    };
    let last_block_id = match key_range.end_bound() {
        Bound::Included(key) | Bound::Excluded(key) => self.sstable_index.locate_with_key(key),
        Bound::Unbounded => None,
    };
    let start_bound = if let Some(first_block_id) = first_block_id {
        let Some(block_addr) = self.sstable_index.get_block(first_block_id) else {
            return FileSlice::empty();
        };
        Bound::Included(block_addr.byte_range.start)
    } else {
        Bound::Unbounded
    };
    let last_block_id = if let Some(limit) = limit {
        // The first block may contribute as little as one matching term, so the
        // limit is counted from the start of the following block.
        let second_block_id = first_block_id.map(|id| id + 1).unwrap_or(0);
        if let Some(block_addr) = self.sstable_index.get_block(second_block_id) {
            // `limit` is caller-provided: saturate rather than overflow.
            let ordinal_limit = block_addr.first_ordinal.saturating_add(limit);
            let last_block_limit = self.sstable_index.locate_with_ord(ordinal_limit);
            if let Some(last_block_id) = last_block_id {
                Some(last_block_id.min(last_block_limit))
            } else {
                Some(last_block_limit)
            }
        } else {
            // There is no second block: the limit cannot shrink the slice.
            last_block_id
        }
    } else {
        last_block_id
    };
    let end_bound = last_block_id
        .and_then(|block_id| self.sstable_index.get_block(block_id))
        .map(|block_addr| Bound::Excluded(block_addr.byte_range.end))
        .unwrap_or(Bound::Unbounded);
    self.sstable_slice.slice((start_bound, end_bound))
}
/// Iterates over the addresses of the blocks that are both selected for
/// `automaton` by the index and located between the blocks of the two bounds
/// of `key_range` (both bounds are treated as inclusive at block level).
///
/// Consecutive selected blocks whose gap is at most `merge_holes_under_bytes`
/// bytes are merged into a single `BlockAddr` spanning both, which keeps the
/// first block's `first_ordinal`.
fn get_block_iterator_for_range_and_automaton<'a>(
    &'a self,
    key_range: impl RangeBounds<[u8]>,
    automaton: &'a impl Automaton,
    merge_holes_under_bytes: usize,
) -> impl Iterator<Item = BlockAddr> + 'a {
    // A key for which the index finds no block maps to `u64::MAX`.
    let block_id_of =
        |key: &[u8]| self.sstable_index.locate_with_key(key).unwrap_or(u64::MAX);
    let first_block_id = match key_range.start_bound() {
        Bound::Unbounded => 0,
        Bound::Included(key) | Bound::Excluded(key) => block_id_of(key),
    };
    let last_block_id = match key_range.end_bound() {
        Bound::Unbounded => u64::MAX,
        Bound::Included(key) | Bound::Excluded(key) => block_id_of(key),
    };
    let candidate_block_ids = first_block_id..=last_block_id;
    self.sstable_index
        .get_block_for_automaton(automaton)
        .filter(move |(block_id, _)| candidate_block_ids.contains(block_id))
        .map(|(_, block_addr)| block_addr)
        .coalesce(move |first, second| {
            let hole_is_too_wide =
                first.byte_range.end + merge_holes_under_bytes < second.byte_range.start;
            if hole_is_too_wide {
                return Err((first, second));
            }
            Ok(BlockAddr {
                first_ordinal: first.first_ordinal,
                byte_range: first.byte_range.start..second.byte_range.end,
            })
        })
}
/// Opens a `TermDictionary`.
///
/// The file ends with a 20-byte footer: the offset of the block index (u64),
/// the number of terms (u64) and the format version (u32). Versions 2 and 3
/// are supported. A version 3 index ends with an extra u64 store offset; when
/// that offset is zero, no index was written and a pseudo-index describing a
/// single block is used.
///
/// # Errors
///
/// Returns an error if reading fails, if the index cannot be loaded
/// (`InvalidData`, "SSTable corruption"), or if the version is unsupported.
pub fn open(term_dictionary_file: FileSlice) -> io::Result<Self> {
    // index offset (u64) + number of terms (u64) + version (u32)
    const FOOTER_LEN: usize = 8 + 8 + 4;
    // trailing store offset (u64) of a v3 index
    const V3_STORE_OFFSET_LEN: usize = 8;

    fn corruption<E>(_load_error: E) -> io::Error {
        io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption")
    }

    let num_bytes = term_dictionary_file.num_bytes();
    let (main_slice, footer_slice) = term_dictionary_file.split_from_end(FOOTER_LEN);
    let mut footer: OwnedBytes = footer_slice.read_bytes()?;
    let index_offset = u64::deserialize(&mut footer)?;
    let num_terms = u64::deserialize(&mut footer)?;
    let version = u32::deserialize(&mut footer)?;
    let (sstable_slice, index_slice) = main_slice.split(index_offset as usize);
    let index_bytes = index_slice.read_bytes()?;
    let sstable_index = match version {
        2 => {
            let index_v2 = crate::sstable_index_v2::SSTableIndex::load(index_bytes)
                .map_err(corruption)?;
            SSTableIndex::V2(index_v2)
        }
        3 => {
            let (index_bytes, mut store_offset_bytes) = index_bytes.rsplit(V3_STORE_OFFSET_LEN);
            let store_offset = u64::deserialize(&mut store_offset_bytes)?;
            if store_offset == 0 {
                // if store_offset is zero, there is no index, so we build a pseudo-index
                // assuming a single block of sstable covering everything.
                SSTableIndex::V3Empty(SSTableIndexV3Empty::load(index_offset as usize))
            } else {
                let index_v3 =
                    SSTableIndexV3::load(index_bytes, store_offset).map_err(corruption)?;
                SSTableIndex::V3(index_v3)
            }
        }
        _ => {
            return Err(io::Error::other(format!(
                "Unsupported sstable version, expected one of [2, 3], found {version}"
            )));
        }
    };
    Ok(Dictionary {
        sstable_slice,
        sstable_index,
        num_bytes,
        num_terms,
        phantom_data: PhantomData,
    })
}
/// Creates a term dictionary from the supplied bytes.
pub fn from_bytes(owned_bytes: OwnedBytes) -> io::Result<Self> {
Dictionary::open(FileSlice::new(Arc::new(owned_bytes)))
}
/// Creates an empty term dictionary which contains no terms.
///
/// The dictionary is produced by serializing an sstable with no entry into a
/// `Vec<u8>` and opening the result.
pub fn empty() -> Self {
    let sstable_writer = Self::builder(Vec::<u8>::new())
        .expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail");
    let term_dictionary_data: Vec<u8> = sstable_writer
        .finish()
        .expect("Writing in a Vec<u8> should never fail");
    Self::open(FileSlice::from(term_dictionary_data)).unwrap()
}
/// Returns the number of terms in the dictionary.
/// Term ordinals range from 0 to `num_terms() - 1`.
///
/// The count is the one read from the footer when the dictionary was opened.
pub fn num_terms(&self) -> usize {
// Stored as a u64; the cast truncates on targets where `usize` is narrower.
self.num_terms as usize
}
/// Returns the total number of bytes used by the dictionary on disk.
///
/// This is the size of the whole file slice given to `open`, so it covers
/// the blocks, the index and the footer.
pub fn num_bytes(&self) -> ByteCount {
self.num_bytes
}
/// Advances `sstable_delta_reader` until `key` is found or passed.
///
/// Returns `Ok(Some(n))` if `key` was found, `n` being the number of entries
/// read before the matching one, and `Ok(None)` if it was not found.
/// After a successful lookup, `DeltaReader::value` gives access to the
/// associated value.
fn decode_up_to_key<K: AsRef<[u8]>>(
    &self,
    key: K,
    sstable_delta_reader: &mut DeltaReader<TSSTable::ValueReader>,
) -> io::Result<Option<TermOrdinal>> {
    let hit = self.decode_up_to_or_next(key, sstable_delta_reader)?;
    Ok(hit.into_exact())
}
/// Decodes `sstable_delta_reader` until `key` is found or passed.
///
/// Entries are numbered from 0, starting at the reader's current position.
/// Returns `TermOrdHit::Exact(n)` if entry `n` is equal to `key`. Otherwise
/// returns `TermOrdHit::Next(n)`, where `n` is the number of the first entry
/// found to be greater than `key`, or the number of entries read if the
/// reader was exhausted first.
///
/// Full keys are never rebuilt. The scan only tracks `ok_bytes`, the number
/// of leading bytes of `key` known to match the current entry, and compares
/// it with the length of the prefix each entry shares with its predecessor.
/// NOTE(review): this relies on entries being stored in sorted order.
fn decode_up_to_or_next<K: AsRef<[u8]>>(
&self,
key: K,
sstable_delta_reader: &mut DeltaReader<TSSTable::ValueReader>,
) -> io::Result<TermOrdHit> {
let mut term_ord = 0;
let key_bytes = key.as_ref();
// number of leading bytes of `key` matched so far
let mut ok_bytes = 0;
while sstable_delta_reader.advance()? {
let prefix_len = sstable_delta_reader.common_prefix_len();
let suffix = sstable_delta_reader.suffix();
match prefix_len.cmp(&ok_bytes) {
Ordering::Less => return Ok(TermOrdHit::Next(term_ord)), /* popped bytes already matched => too far */
Ordering::Equal => (),
Ordering::Greater => {
// the ok prefix is less than current entry prefix => continue to next elem
term_ord += 1;
continue;
}
}
// we have ok_bytes byte of common prefix, check if this key adds more
for (key_byte, suffix_byte) in key_bytes[ok_bytes..].iter().zip(suffix) {
match suffix_byte.cmp(key_byte) {
Ordering::Less => break, // byte too small
Ordering::Equal => ok_bytes += 1, // new matching byte
Ordering::Greater => return Ok(TermOrdHit::Next(term_ord)), // too far
}
}
// every byte of the key matched: the entry is the key or an extension of it
if ok_bytes == key_bytes.len() {
if prefix_len + suffix.len() == ok_bytes {
return Ok(TermOrdHit::Exact(term_ord));
} else {
// current key is a prefix of current element, not a match
return Ok(TermOrdHit::Next(term_ord));
}
}
term_ord += 1;
}
// reader exhausted without reaching or passing the key
Ok(TermOrdHit::Next(term_ord))
}
/// Returns the ordinal associated with a given term.
///
/// Only the block that may contain `key` is read. Returns `Ok(None)` if the
/// index finds no such block or if the block does not contain `key`.
pub fn term_ord<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermOrdinal>> {
    let key_bytes = key.as_ref();
    let block_addr = match self.sstable_index.get_block_with_key(key_bytes) {
        Some(block_addr) => block_addr,
        None => return Ok(None),
    };
    let block_first_ordinal = block_addr.first_ordinal;
    let mut delta_reader = self.sstable_delta_reader_block(block_addr)?;
    let ord_in_block = self.decode_up_to_key(key_bytes, &mut delta_reader)?;
    Ok(ord_in_block.map(|ord| ord + block_first_ordinal))
}
/// Returns the ordinal associated with a given term or its closest next term_id
/// The closest next term_id may not exist.
///
/// When the index finds no block that may contain `key`,
/// `TermOrdHit::Next(u64::MAX)` is returned.
pub fn term_ord_or_next<K: AsRef<[u8]>>(&self, key: K) -> io::Result<TermOrdHit> {
    let key_bytes = key.as_ref();
    let block_addr = match self.sstable_index.get_block_with_key(key_bytes) {
        Some(block_addr) => block_addr,
        // TODO: Would be more consistent to return last_term id + 1
        None => return Ok(TermOrdHit::Next(u64::MAX)),
    };
    let block_first_ordinal = block_addr.first_ordinal;
    let mut delta_reader = self.sstable_delta_reader_block(block_addr)?;
    let hit_in_block = self.decode_up_to_or_next(key_bytes, &mut delta_reader)?;
    Ok(hit_in_block.map(|ord| ord + block_first_ordinal))
}
/// Converts a pair of term bounds into a pair of term ordinal bounds.
///
/// A bound whose term exists in the dictionary keeps its kind. A bound whose
/// term is missing is replaced by a bound on the next term ordinal:
/// `Included` for a lower bound, `Excluded` for an upper bound.
/// e.g. [bbb, ddd]
/// lower_bound: Bound::Included(aaa) => Included(0) // "Next" term id
/// lower_bound: Bound::Excluded(aaa) => Included(0) // "Next" term id + Change the Bounds
/// lower_bound: Bound::Included(ccc) => Included(1) // "Next" term id
/// lower_bound: Bound::Excluded(ccc) => Included(1) // "Next" term id + Change the Bounds
/// lower_bound: Bound::Included(zzz) => Included(2) // "Next" term id
/// lower_bound: Bound::Excluded(zzz) => Included(2) // "Next" term id + Change the Bounds
/// For zzz we should have some post processing to return an empty query
///
/// upper_bound: Bound::Included(aaa) => Excluded(0) // "Next" term id + Change the bounds
/// upper_bound: Bound::Excluded(aaa) => Excluded(0) // "Next" term id
/// upper_bound: Bound::Included(ccc) => Excluded(1) // Next term id + Change the bounds
/// upper_bound: Bound::Excluded(ccc) => Excluded(1) // Next term id
/// upper_bound: Bound::Included(zzz) => Excluded(2) // Next term id + Change the bounds
/// upper_bound: Bound::Excluded(zzz) => Excluded(2) // Next term id
pub fn term_bounds_to_ord<K: AsRef<[u8]>>(
    &self,
    lower_bound: Bound<K>,
    upper_bound: Bound<K>,
) -> io::Result<(Bound<TermOrdinal>, Bound<TermOrdinal>)> {
    let lower_ord_bound = transform_bound_inner_res(&lower_bound, |start_bound_bytes| {
        Ok(match self.term_ord_or_next(start_bound_bytes)? {
            TermOrdHit::Exact(ord) => TransformBound::Existing(ord),
            // Missing term: the next term becomes an included lower bound.
            TermOrdHit::Next(ord) => TransformBound::NewBound(Bound::Included(ord)),
        })
    })?;
    let upper_ord_bound = transform_bound_inner_res(&upper_bound, |end_bound_bytes| {
        Ok(match self.term_ord_or_next(end_bound_bytes)? {
            TermOrdHit::Exact(ord) => TransformBound::Existing(ord),
            // Missing term: the next term becomes an excluded upper bound.
            TermOrdHit::Next(ord) => TransformBound::NewBound(Bound::Excluded(ord)),
        })
    })?;
    Ok((lower_ord_bound, upper_ord_bound))
}
/// Returns the term associated with a given term ordinal.
///
/// Term ordinals are defined as the position of the term in
/// the sorted list of terms.
///
/// Returns true if and only if the term has been found.
///
/// Regardless of whether the term is found or not,
/// the buffer may be modified.
pub fn ord_to_term(&self, ord: TermOrdinal, bytes: &mut Vec<u8>) -> io::Result<bool> {
    // Only the block in which the ordinal would be is decoded.
    let block_addr = self.sstable_index.get_block_with_ord(ord);
    let ordinals_to_decode = block_addr.first_ordinal..=ord;
    let mut delta_reader = self.sstable_delta_reader_block(block_addr)?;
    for _ in ordinals_to_decode {
        let has_entry = delta_reader.advance()?;
        if !has_entry {
            return Ok(false);
        }
        // Rebuild the key: keep the prefix shared with the previous key,
        // then append the new suffix.
        let shared_prefix_len = delta_reader.common_prefix_len();
        bytes.truncate(shared_prefix_len);
        bytes.extend_from_slice(delta_reader.suffix());
    }
    Ok(true)
}
/// Returns the terms for a _sorted_ list of term ordinals.
///
/// `cb` is called once per ordinal, in order, with the bytes of the term.
/// Repeated ordinals trigger repeated calls with the same term.
///
/// Returns true if and only if all terms have been found. When false is
/// returned, `cb` may already have been called for the preceding ordinals.
///
/// # Panics
///
/// Panics if `ords` is not sorted.
pub fn sorted_ords_to_term_cb(
&self,
ords: &[TermOrdinal],
mut cb: impl FnMut(&[u8]),
) -> io::Result<bool> {
assert!(ords.is_sorted());
let mut ords = ords.iter().copied();
let Some(mut ord) = ords.next() else {
return Ok(true);
};
// Open the block for the first ordinal.
let mut bytes = Vec::new();
let mut current_block_addr = self.sstable_index.get_block_with_ord(ord);
let mut current_sstable_delta_reader =
self.sstable_delta_reader_block(current_block_addr.clone())?;
// ordinal of the next entry the reader will decode
let mut current_block_ordinal = current_block_addr.first_ordinal;
loop {
// move to the ord inside the current block
while current_block_ordinal <= ord {
if !current_sstable_delta_reader.advance()? {
return Ok(false);
}
bytes.truncate(current_sstable_delta_reader.common_prefix_len());
bytes.extend_from_slice(current_sstable_delta_reader.suffix());
current_block_ordinal += 1;
}
cb(&bytes);
// fetch the next ordinal
let next_ord = loop {
let Some(next_ord) = ords.next() else {
return Ok(true);
};
if next_ord == ord {
// This is the same ordinal, let's just call the callback directly.
cb(&bytes);
} else {
// we checked it was sorted beforehands
debug_assert!(next_ord > ord);
break next_ord;
}
};
// TODO optimization: it is silly to do a binary search to get the block every single
// time.
//
// Check if block changed for new term_ord
let new_block_addr = self.sstable_index.get_block_with_ord(next_ord);
if new_block_addr != current_block_addr {
// Restart decoding from the beginning of the new block.
current_block_addr = new_block_addr;
current_block_ordinal = current_block_addr.first_ordinal;
current_sstable_delta_reader =
self.sstable_delta_reader_block(current_block_addr.clone())?;
bytes.clear();
}
ord = next_ord;
}
}
/// Returns the value associated with the term of ordinal `term_ord`.
///
/// Returns `Ok(None)` if the block that should hold `term_ord` runs out of
/// entries before reaching it.
pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> io::Result<Option<TSSTable::Value>> {
// find block in which the term would be
let block_addr = self.sstable_index.get_block_with_ord(term_ord);
let first_ordinal = block_addr.first_ordinal;
// then search inside that block only
let mut sstable_reader = self.sstable_reader_block(block_addr)?;
// advance once per ordinal, from the block's first one up to `term_ord` included
for _ in first_ordinal..=term_ord {
if !sstable_reader.advance()? {
return Ok(None);
}
}
Ok(Some(sstable_reader.value().clone()))
}
/// Looks up the value corresponding to the key.
///
/// Only the block that may contain `key` is read. Returns `Ok(None)` if the
/// index finds no such block or if the block does not contain `key`.
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TSSTable::Value>> {
    let Some(block_addr) = self.sstable_index.get_block_with_key(key.as_ref()) else {
        return Ok(None);
    };
    let delta_reader = self.sstable_delta_reader_block(block_addr)?;
    self.do_get(key, delta_reader)
}
/// Looks up the value corresponding to the key, reading the block
/// asynchronously.
///
/// Returns `Ok(None)` if the index finds no block that may contain `key` or
/// if the block does not contain `key`.
pub async fn get_async<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TSSTable::Value>> {
    let Some(block_addr) = self.sstable_index.get_block_with_key(key.as_ref()) else {
        return Ok(None);
    };
    let delta_reader = self.sstable_delta_reader_block_async(block_addr).await?;
    self.do_get(key, delta_reader)
}
/// Scans `reader` for `key` and returns a copy of its value, or `None` if
/// the key is not found.
fn do_get<K: AsRef<[u8]>>(
    &self,
    key: K,
    mut reader: DeltaReader<TSSTable::ValueReader>,
) -> io::Result<Option<TSSTable::Value>> {
    let found_ord = self.decode_up_to_key(key, &mut reader)?;
    // On a hit, the reader is positioned on the matching entry.
    Ok(found_ord.map(|_ord| reader.value().clone()))
}
/// Returns a range builder, to stream all of the terms
/// within an interval.
///
/// The builder is created with an `AlwaysMatch` automaton, so no term is
/// filtered out besides the bounds set on the builder.
pub fn range(&self) -> StreamerBuilder<'_, TSSTable> {
StreamerBuilder::new(self, AlwaysMatch)
}
/// Returns a range builder restricted to the keys starting with `prefix`.
///
/// The range is `[prefix, successor)`, where `successor` is `prefix` with its
/// trailing `0xFF` bytes removed and its last remaining byte incremented.
/// When there is no such successor (`prefix` is empty or made only of `0xFF`
/// bytes), the range has no upper bound.
pub fn prefix_range<K: AsRef<[u8]>>(&self, prefix: K) -> StreamerBuilder<'_, TSSTable> {
    let prefix_bytes = prefix.as_ref();
    let builder = self.range().ge(prefix_bytes);
    match prefix_bytes.iter().rposition(|&byte| byte != u8::MAX) {
        Some(last_incrementable) => {
            let mut successor = prefix_bytes[..=last_incrementable].to_vec();
            successor[last_incrementable] += 1;
            builder.lt(successor)
        }
        None => builder,
    }
}
/// A stream of all the sorted terms.
///
/// Equivalent to an unbounded `range()` turned into a stream.
pub fn stream(&self) -> io::Result<Streamer<'_, TSSTable>> {
self.range().into_stream()
}
/// Returns a search builder, to stream all of the terms
/// within the Automaton
///
/// Bounds can still be set on the returned builder.
pub fn search<'a, A: Automaton + 'a>(
&'a self,
automaton: A,
) -> StreamerBuilder<'a, TSSTable, A>
where
A::State: Clone,
{
StreamerBuilder::<TSSTable, A>::new(self, automaton)
}
/// Asynchronously reads the whole sstable slice and discards the bytes.
///
/// NOTE(review): presumably meant to populate a cache sitting below the
/// `FileSlice`; nothing in this file keeps the data.
#[doc(hidden)]
pub async fn warm_up_dictionary(&self) -> io::Result<()> {
    let _ = self.sstable_slice.read_bytes_async().await?;
    Ok(())
}
}
#[cfg(test)]
mod tests {
use std::ops::{Bound, Range};
use std::sync::{Arc, Mutex};
use common::OwnedBytes;
use super::Dictionary;
use crate::dictionary::TermOrdHit;
use crate::{MonotonicU64SSTable, TermOrdinal};
/// In-memory file handle that only serves reads located inside a
/// configurable byte range, so tests can check which bytes are accessed.
#[derive(Debug)]
struct PermissionedHandle {
// full content of the fake file
bytes: OwnedBytes,
// byte range reads are currently allowed to touch
allowed_range: Mutex<Range<usize>>,
}
impl PermissionedHandle {
    /// Wraps `bytes`, initially allowing reads over the whole content.
    fn new(bytes: Vec<u8>) -> Self {
        let bytes = OwnedBytes::new(bytes);
        let everything = 0..bytes.len();
        PermissionedHandle {
            bytes,
            allowed_range: Mutex::new(everything),
        }
    }
    /// Replaces the range that subsequent reads are allowed to touch.
    fn restrict(&self, range: Range<usize>) {
        let mut allowed_range = self.allowed_range.lock().unwrap();
        *allowed_range = range;
    }
}
impl common::HasLen for PermissionedHandle {
// Length of the whole content, regardless of the allowed range.
fn len(&self) -> usize {
self.bytes.len()
}
}
impl common::file_slice::FileHandle for PermissionedHandle {
    /// Returns the bytes of `range`, or an error if `range` is not fully
    /// contained in the currently allowed range.
    ///
    /// Empty requests read nothing and are always accepted. The previous
    /// check computed `range.end - 1`, which underflows for an empty request
    /// starting at offset 0.
    fn read_bytes(&self, range: Range<usize>) -> std::io::Result<OwnedBytes> {
        let allowed_range = self.allowed_range.lock().unwrap();
        let is_permitted = range.is_empty()
            || (allowed_range.start <= range.start && range.end <= allowed_range.end);
        if !is_permitted {
            return Err(std::io::Error::other(format!(
                "invalid range, allowed {allowed_range:?}, requested {range:?}"
            )));
        }
        Ok(self.bytes.slice(range))
    }
}
// Builds a multi-block dictionary mapping the keys "00000".."3FFFE"
// (5 uppercase hex digits) to their numeric value, and returns it together
// with the handle used to restrict which bytes may be read.
fn make_test_sstable() -> (Dictionary<MonotonicU64SSTable>, Arc<PermissionedHandle>) {
let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
// this makes 0x3ffff keys (just under 256k), enough to fill multiple blocks.
for elem in 0..0x3ffff {
let key = format!("{elem:05X}").into_bytes();
builder.insert(&key, &elem).unwrap();
}
let table = builder.finish().unwrap();
let table = Arc::new(PermissionedHandle::new(table));
let slice = common::file_slice::FileSlice::new(table.clone());
let dictionary = Dictionary::<MonotonicU64SSTable>::open(slice).unwrap();
// if the last block is id 0, tests are meaningless
assert_ne!(dictionary.sstable_index.locate_with_ord(u64::MAX), 0);
assert_eq!(dictionary.num_terms(), 0x3ffff);
(dictionary, table)
}
// `term_ord_or_next` on a tiny, single-block dictionary {"bbb", "ddd"}:
// exact hits, keys falling between terms, and keys past the last term.
#[test]
fn test_term_to_ord_or_next() {
let dict = {
let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
builder.insert(b"bbb", &1).unwrap();
builder.insert(b"ddd", &2).unwrap();
let table = builder.finish().unwrap();
let table = Arc::new(PermissionedHandle::new(table));
let slice = common::file_slice::FileSlice::new(table.clone());
Dictionary::<MonotonicU64SSTable>::open(slice).unwrap()
};
assert_eq!(dict.term_ord_or_next(b"aaa").unwrap(), TermOrdHit::Next(0));
assert_eq!(dict.term_ord_or_next(b"bbb").unwrap(), TermOrdHit::Exact(0));
assert_eq!(dict.term_ord_or_next(b"bb").unwrap(), TermOrdHit::Next(0));
assert_eq!(dict.term_ord_or_next(b"bbbb").unwrap(), TermOrdHit::Next(1));
assert_eq!(dict.term_ord_or_next(b"dd").unwrap(), TermOrdHit::Next(1));
assert_eq!(dict.term_ord_or_next(b"ddd").unwrap(), TermOrdHit::Exact(1));
assert_eq!(dict.term_ord_or_next(b"dddd").unwrap(), TermOrdHit::Next(2));
// This is not u64::MAX because for very small sstables (only one block),
// we don't store an index, and the pseudo-index always reply that the
// answer lies in block number 0
assert_eq!(
dict.term_ord_or_next(b"zzzzzzz").unwrap(),
TermOrdHit::Next(2)
);
}
// `term_ord_or_next` on a dictionary large enough to span several blocks:
// keys located after the last term are reported as `Next(u64::MAX)`.
#[test]
fn test_term_to_ord_or_next_2() {
let dict = {
let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
let mut term_ord = 0;
builder.insert(b"bbb", &term_ord).unwrap();
// Fill blocks in between
for elem in 0..50_000 {
term_ord += 1;
let key = format!("ccccc{elem:05X}").into_bytes();
builder.insert(&key, &term_ord).unwrap();
}
term_ord += 1;
builder.insert(b"eee", &term_ord).unwrap();
let table = builder.finish().unwrap();
let table = Arc::new(PermissionedHandle::new(table));
let slice = common::file_slice::FileSlice::new(table.clone());
Dictionary::<MonotonicU64SSTable>::open(slice).unwrap()
};
assert_eq!(dict.term_ord(b"bbb").unwrap(), Some(0));
assert_eq!(dict.term_ord_or_next(b"bbb").unwrap(), TermOrdHit::Exact(0));
assert_eq!(dict.term_ord_or_next(b"aaa").unwrap(), TermOrdHit::Next(0));
assert_eq!(dict.term_ord_or_next(b"bb").unwrap(), TermOrdHit::Next(0));
assert_eq!(dict.term_ord_or_next(b"bbbb").unwrap(), TermOrdHit::Next(1));
assert_eq!(
dict.term_ord_or_next(b"ee").unwrap(),
TermOrdHit::Next(50001)
);
assert_eq!(
dict.term_ord_or_next(b"eee").unwrap(),
TermOrdHit::Exact(50001)
);
// past the last term: no block can contain these keys
assert_eq!(
dict.term_ord_or_next(b"eeee").unwrap(),
TermOrdHit::Next(u64::MAX)
);
assert_eq!(
dict.term_ord_or_next(b"zzzzzzz").unwrap(),
TermOrdHit::Next(u64::MAX)
);
}
// `term_bounds_to_ord` on {"bbb", "ddd"}: bounds on existing terms keep
// their kind, bounds on missing terms are moved to the next term ordinal.
#[test]
fn test_term_bounds_to_ord() {
let dict = {
let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
builder.insert(b"bbb", &1).unwrap();
builder.insert(b"ddd", &2).unwrap();
let table = builder.finish().unwrap();
let table = Arc::new(PermissionedHandle::new(table));
let slice = common::file_slice::FileSlice::new(table.clone());
Dictionary::<MonotonicU64SSTable>::open(slice).unwrap()
};
// Test cases for lower_bound
let test_lower_bound = |bound, expected| {
assert_eq!(
dict.term_bounds_to_ord::<&[u8]>(bound, Bound::Included(b"ignored"))
.unwrap()
.0,
expected
);
};
test_lower_bound(Bound::Included(b"aaa".as_slice()), Bound::Included(0));
test_lower_bound(Bound::Excluded(b"aaa".as_slice()), Bound::Included(0));
test_lower_bound(Bound::Included(b"bbb".as_slice()), Bound::Included(0));
test_lower_bound(Bound::Excluded(b"bbb".as_slice()), Bound::Excluded(0));
test_lower_bound(Bound::Included(b"ccc".as_slice()), Bound::Included(1));
test_lower_bound(Bound::Excluded(b"ccc".as_slice()), Bound::Included(1));
test_lower_bound(Bound::Included(b"zzz".as_slice()), Bound::Included(2));
test_lower_bound(Bound::Excluded(b"zzz".as_slice()), Bound::Included(2));
// Test cases for upper_bound
let test_upper_bound = |bound, expected| {
assert_eq!(
dict.term_bounds_to_ord::<&[u8]>(Bound::Included(b"ignored"), bound,)
.unwrap()
.1,
expected
);
};
test_upper_bound(Bound::Included(b"ccc".as_slice()), Bound::Excluded(1));
test_upper_bound(Bound::Excluded(b"ccc".as_slice()), Bound::Excluded(1));
test_upper_bound(Bound::Included(b"zzz".as_slice()), Bound::Excluded(2));
test_upper_bound(Bound::Excluded(b"zzz".as_slice()), Bound::Excluded(2));
test_upper_bound(Bound::Included(b"ddd".as_slice()), Bound::Included(1));
test_upper_bound(Bound::Excluded(b"ddd".as_slice()), Bound::Excluded(1));
}
// Round-trips between ordinals, terms and values, while restricting the
// readable bytes to the single block each lookup is expected to touch.
#[test]
fn test_ord_term_conversion() {
let (dic, slice) = make_test_sstable();
let block = dic.sstable_index.get_block_with_ord(100_000);
slice.restrict(block.byte_range);
let mut res = Vec::new();
// middle of a block
assert!(dic.ord_to_term(100_000, &mut res).unwrap());
assert_eq!(res, format!("{:05X}", 100_000).into_bytes());
assert_eq!(dic.term_info_from_ord(100_000).unwrap().unwrap(), 100_000);
assert_eq!(dic.get(&res).unwrap().unwrap(), 100_000);
assert_eq!(dic.term_ord(&res).unwrap().unwrap(), 100_000);
// start of a block
assert!(dic.ord_to_term(block.first_ordinal, &mut res).unwrap());
assert_eq!(res, format!("{:05X}", block.first_ordinal).into_bytes());
assert_eq!(
dic.term_info_from_ord(block.first_ordinal)
.unwrap()
.unwrap(),
block.first_ordinal
);
assert_eq!(dic.get(&res).unwrap().unwrap(), block.first_ordinal);
assert_eq!(dic.term_ord(&res).unwrap().unwrap(), block.first_ordinal);
// end of a block
let ordinal = block.first_ordinal - 1;
let new_range = dic.sstable_index.get_block_with_ord(ordinal).byte_range;
slice.restrict(new_range);
assert!(dic.ord_to_term(ordinal, &mut res).unwrap());
assert_eq!(res, format!("{ordinal:05X}").into_bytes());
assert_eq!(dic.term_info_from_ord(ordinal).unwrap().unwrap(), ordinal);
assert_eq!(dic.get(&res).unwrap().unwrap(), ordinal);
assert_eq!(dic.term_ord(&res).unwrap().unwrap(), ordinal);
// before first block
// 1st block must be loaded for key-related operations
let block = dic.sstable_index.get_block_with_ord(0);
slice.restrict(block.byte_range);
assert!(dic.get(b"$$$").unwrap().is_none());
assert!(dic.term_ord(b"$$$").unwrap().is_none());
// after last block
// last block must be loaded for ord related operations
let ordinal = 0x40000 + 10;
let new_range = dic.sstable_index.get_block_with_ord(ordinal).byte_range;
slice.restrict(new_range);
assert!(!dic.ord_to_term(ordinal, &mut res).unwrap());
assert!(dic.term_info_from_ord(ordinal).unwrap().is_none());
// last block isn't required to be loaded for key related operations
slice.restrict(0..0);
assert!(dic.get(b"~~~").unwrap().is_none());
assert!(dic.term_ord(b"~~~").unwrap().is_none());
// lift the restriction for the remaining checks
slice.restrict(0..slice.bytes.len());
// between 1000F and 10010, test case where matched prefix > prefix kept
assert!(dic.term_ord(b"1000G").unwrap().is_none());
// shorter than 10000, tests prefix case
assert!(dic.term_ord(b"1000").unwrap().is_none());
}
// `sorted_ords_to_term_cb` must report every requested ordinal as found and
// hand the matching terms to the callback, in order, duplicates included.
#[test]
fn test_sorted_ords_to_term() {
    let (dic, _slice) = make_test_sstable();
    // Resolves `ords`, checks that all of them were found, returns the terms.
    let resolve = |ords: &[TermOrdinal]| -> Vec<Vec<u8>> {
        let mut terms = Vec::new();
        let all_found = dic
            .sorted_ords_to_term_cb(ords, |term| terms.push(term.to_vec()))
            .unwrap();
        assert!(all_found);
        terms
    };
    // In the test sstable, the term of an ordinal is its 5-digit hex form.
    let expected_terms = |ords: &[TermOrdinal]| -> Vec<Vec<u8>> {
        ords.iter()
            .map(|ord| format!("{ord:05X}").into_bytes())
            .collect()
    };
    let cases: [&[TermOrdinal]; 6] = [
        // Single term
        &[100_000],
        // Single term
        &[100_001],
        // both terms
        &[100_000, 100_001],
        // Test cross block
        &[98653, 98654, 98655],
        // redundant
        &[1, 1, 2],
        // redundant cross block
        &[98653, 98653, 98654, 98654, 98655, 98655],
    ];
    for ords in cases {
        assert_eq!(resolve(ords), expected_terms(ords));
    }
}
// Range streaming: a bounded range only reads the blocks of its bounds,
// a limit reduces the bytes read, and an unbounded stream yields every term.
#[test]
fn test_range() {
let (dic, slice) = make_test_sstable();
let start = dic
.sstable_index
.get_block_with_key(b"10000")
.unwrap()
.byte_range;
let end = dic
.sstable_index
.get_block_with_key(b"18000")
.unwrap()
.byte_range;
// only the blocks from the one of "10000" to the one of "18000" are readable
slice.restrict(start.start..end.end);
let mut stream = dic.range().ge(b"10000").lt(b"18000").into_stream().unwrap();
for i in 0x10000..0x18000 {
assert!(stream.advance());
assert_eq!(stream.term_ord(), i);
assert_eq!(stream.value(), &i);
assert_eq!(stream.key(), format!("{i:05X}").into_bytes());
}
assert!(!stream.advance());
// verify limiting the number of results reduce the size read
// (the last byte of the end block is now forbidden)
slice.restrict(start.start..(end.end - 1));
let mut stream = dic
.range()
.ge(b"10000")
.lt(b"18000")
.limit(0xfff)
.into_stream()
.unwrap();
for i in 0x10000..0x10fff {
assert!(stream.advance());
assert_eq!(stream.term_ord(), i);
assert_eq!(stream.value(), &i);
assert_eq!(stream.key(), format!("{i:05X}").into_bytes());
}
// there might be more successful elements after, though how many is undefined
slice.restrict(0..slice.bytes.len());
let mut stream = dic.stream().unwrap();
for i in 0..0x3ffff {
assert!(stream.advance());
assert_eq!(stream.term_ord(), i);
assert_eq!(stream.value(), &i);
assert_eq!(stream.key(), format!("{i:05X}").into_bytes());
}
assert!(!stream.advance());
}
// `prefix_range` must stream exactly the terms starting with the prefix:
// a one-character prefix, the empty prefix (all terms) and a longer prefix.
#[test]
fn test_prefix() {
let (dic, _slice) = make_test_sstable();
{
let mut stream = dic.prefix_range("1").into_stream().unwrap();
for i in 0x10000..0x20000 {
assert!(stream.advance());
assert_eq!(stream.term_ord(), i);
assert_eq!(stream.value(), &i);
assert_eq!(stream.key(), format!("{i:05X}").into_bytes());
}
assert!(!stream.advance());
}
{
// the empty prefix matches every term
let mut stream = dic.prefix_range("").into_stream().unwrap();
for i in 0..0x3ffff {
assert!(stream.advance(), "failed at {i:05X}");
assert_eq!(stream.term_ord(), i);
assert_eq!(stream.value(), &i);
assert_eq!(stream.key(), format!("{i:05X}").into_bytes());
}
assert!(!stream.advance());
}
{
let mut stream = dic.prefix_range("0FF").into_stream().unwrap();
for i in 0x0ff00..=0x0ffff {
assert!(stream.advance(), "failed at {i:05X}");
assert_eq!(stream.term_ord(), i);
assert_eq!(stream.value(), &i);
assert_eq!(stream.key(), format!("{i:05X}").into_bytes());
}
assert!(!stream.advance());
}
}
// Prefix ending with a 0xFF byte: the upper bound cannot be obtained by
// incrementing the last byte, so the preceding byte has to be incremented.
#[test]
fn test_prefix_edge() {
let dict = {
let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
builder.insert(&[0, 254], &0).unwrap();
builder.insert(&[0, 255], &1).unwrap();
builder.insert(&[0, 255, 12], &2).unwrap();
builder.insert(&[1], &2).unwrap();
builder.insert(&[1, 0], &2).unwrap();
let table = builder.finish().unwrap();
let table = Arc::new(PermissionedHandle::new(table));
let slice = common::file_slice::FileSlice::new(table.clone());
Dictionary::<MonotonicU64SSTable>::open(slice).unwrap()
};
let mut stream = dict.prefix_range(&[0, 255]).into_stream().unwrap();
// only the two keys starting with [0, 255] are expected
assert!(stream.advance());
assert_eq!(stream.key(), &[0, 255]);
assert!(stream.advance());
assert_eq!(stream.key(), &[0, 255, 12]);
assert!(!stream.advance());
}
}