attempt to optimize sorted_ords_to_term_cb

This commit is contained in:
trinity Pointard
2025-07-01 16:47:45 +02:00
parent 080fa4d1f4
commit dedd1aa83a
3 changed files with 77 additions and 31 deletions

View File

@@ -487,7 +487,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
/// the buffer may be modified. /// the buffer may be modified.
pub fn ord_to_term(&self, ord: TermOrdinal, bytes: &mut Vec<u8>) -> io::Result<bool> { pub fn ord_to_term(&self, ord: TermOrdinal, bytes: &mut Vec<u8>) -> io::Result<bool> {
// find block in which the term would be // find block in which the term would be
let block_addr = self.sstable_index.get_block_with_ord(ord); let block_addr = self.sstable_index.get_block_with_ord::<false>(ord).0;
let first_ordinal = block_addr.first_ordinal; let first_ordinal = block_addr.first_ordinal;
// then search inside that block only // then search inside that block only
@@ -511,16 +511,17 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
mut cb: F, mut cb: F,
) -> io::Result<bool> { ) -> io::Result<bool> {
let mut bytes = Vec::new(); let mut bytes = Vec::new();
let mut current_block_addr = self.sstable_index.get_block_with_ord(0); let (mut current_block_addr, mut next_block_ord) =
self.sstable_index.get_block_with_ord::<true>(0);
let mut current_sstable_delta_reader = let mut current_sstable_delta_reader =
self.sstable_delta_reader_block(current_block_addr.clone())?; self.sstable_delta_reader_block(current_block_addr.clone())?;
let mut current_ordinal = 0; let mut current_ordinal = 0;
for ord in ord { for ord in ord {
assert!(ord >= current_ordinal); assert!(ord >= current_ordinal);
// check if block changed for new term_ord // check if block changed for new term_ord
let new_block_addr = self.sstable_index.get_block_with_ord(ord); if ord >= next_block_ord {
if new_block_addr != current_block_addr { (current_block_addr, next_block_ord) =
current_block_addr = new_block_addr; self.sstable_index.get_block_with_ord::<true>(ord);
current_ordinal = current_block_addr.first_ordinal; current_ordinal = current_block_addr.first_ordinal;
current_sstable_delta_reader = current_sstable_delta_reader =
self.sstable_delta_reader_block(current_block_addr.clone())?; self.sstable_delta_reader_block(current_block_addr.clone())?;
@@ -544,7 +545,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
/// Returns the value stored for the `term_ord`-th term, or `None` when no term has that ordinal. /// Returns the value stored for the `term_ord`-th term, or `None` when no term has that ordinal.
pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> io::Result<Option<TSSTable::Value>> { pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> io::Result<Option<TSSTable::Value>> {
// find block in which the term would be // find block in which the term would be
let block_addr = self.sstable_index.get_block_with_ord(term_ord); let block_addr = self.sstable_index.get_block_with_ord::<false>(term_ord).0;
let first_ordinal = block_addr.first_ordinal; let first_ordinal = block_addr.first_ordinal;
// then search inside that block only // then search inside that block only
@@ -846,7 +847,7 @@ mod tests {
fn test_ord_term_conversion() { fn test_ord_term_conversion() {
let (dic, slice) = make_test_sstable(); let (dic, slice) = make_test_sstable();
let block = dic.sstable_index.get_block_with_ord(100_000); let block = dic.sstable_index.get_block_with_ord::<false>(100_000).0;
slice.restrict(block.byte_range); slice.restrict(block.byte_range);
let mut res = Vec::new(); let mut res = Vec::new();
@@ -872,7 +873,11 @@ mod tests {
// end of a block // end of a block
let ordinal = block.first_ordinal - 1; let ordinal = block.first_ordinal - 1;
let new_range = dic.sstable_index.get_block_with_ord(ordinal).byte_range; let new_range = dic
.sstable_index
.get_block_with_ord::<false>(ordinal)
.0
.byte_range;
slice.restrict(new_range); slice.restrict(new_range);
assert!(dic.ord_to_term(ordinal, &mut res).unwrap()); assert!(dic.ord_to_term(ordinal, &mut res).unwrap());
assert_eq!(res, format!("{ordinal:05X}").into_bytes()); assert_eq!(res, format!("{ordinal:05X}").into_bytes());
@@ -882,7 +887,7 @@ mod tests {
// before first block // before first block
// 1st block must be loaded for key-related operations // 1st block must be loaded for key-related operations
let block = dic.sstable_index.get_block_with_ord(0); let block = dic.sstable_index.get_block_with_ord::<false>(0).0;
slice.restrict(block.byte_range); slice.restrict(block.byte_range);
assert!(dic.get(b"$$$").unwrap().is_none()); assert!(dic.get(b"$$$").unwrap().is_none());
@@ -891,7 +896,11 @@ mod tests {
// after last block // after last block
// last block must be loaded for ord related operations // last block must be loaded for ord related operations
let ordinal = 0x40000 + 10; let ordinal = 0x40000 + 10;
let new_range = dic.sstable_index.get_block_with_ord(ordinal).byte_range; let new_range = dic
.sstable_index
.get_block_with_ord::<false>(ordinal)
.0
.byte_range;
slice.restrict(new_range); slice.restrict(new_range);
assert!(!dic.ord_to_term(ordinal, &mut res).unwrap()); assert!(!dic.ord_to_term(ordinal, &mut res).unwrap());
assert!(dic.term_info_from_ord(ordinal).unwrap().is_none()); assert!(dic.term_info_from_ord(ordinal).unwrap().is_none());

View File

@@ -72,9 +72,15 @@ impl SSTableIndex {
} }
/// Get the [`BlockAddr`] of the block containing the `ord`-th term. /// Get the [`BlockAddr`] of the block containing the `ord`-th term.
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr { pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> (BlockAddr, u64) {
// locate_with_ord always returns an index within range // locate_with_ord always returns an index within range
self.get_block(self.locate_with_ord(ord)).unwrap() let block_pos = self.locate_with_ord(ord);
(
self.get_block(block_pos).unwrap(),
self.get_block(block_pos + 1)
.map(|b| b.first_ordinal)
.unwrap_or(u64::MAX),
)
} }
pub(crate) fn get_block_for_automaton<'a>( pub(crate) fn get_block_for_automaton<'a>(

View File

@@ -58,10 +58,13 @@ impl SSTableIndex {
} }
/// Get the [`BlockAddr`] of the block containing the `ord`-th term. /// Get the [`BlockAddr`] of the block containing the `ord`-th term.
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr { pub(crate) fn get_block_with_ord<const FETCH_NEXT_ORD: bool>(
&self,
ord: TermOrdinal,
) -> (BlockAddr, u64) {
match self { match self {
SSTableIndex::V2(v2_index) => v2_index.get_block_with_ord(ord), SSTableIndex::V2(v2_index) => v2_index.get_block_with_ord(ord),
SSTableIndex::V3(v3_index) => v3_index.get_block_with_ord(ord), SSTableIndex::V3(v3_index) => v3_index.get_block_with_ord::<FETCH_NEXT_ORD>(ord),
SSTableIndex::V3Empty(v3_empty) => v3_empty.get_block_with_ord(ord), SSTableIndex::V3Empty(v3_empty) => v3_empty.get_block_with_ord(ord),
} }
} }
@@ -152,12 +155,18 @@ impl SSTableIndexV3 {
} }
pub(crate) fn locate_with_ord(&self, ord: TermOrdinal) -> u64 { pub(crate) fn locate_with_ord(&self, ord: TermOrdinal) -> u64 {
self.block_addr_store.binary_search_ord(ord).0 self.block_addr_store.binary_search_ord::<false>(ord).0
} }
/// Get the [`BlockAddr`] of the block containing the `ord`-th term. /// Get the [`BlockAddr`] of the block containing the `ord`-th term.
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr { pub(crate) fn get_block_with_ord<const FETCH_NEXT_ORD: bool>(
self.block_addr_store.binary_search_ord(ord).1 &self,
ord: TermOrdinal,
) -> (BlockAddr, u64) {
let (_block_id, block_addr, next_ord) = self
.block_addr_store
.binary_search_ord::<FETCH_NEXT_ORD>(ord);
(block_addr, next_ord)
} }
pub(crate) fn get_block_for_automaton<'a>( pub(crate) fn get_block_for_automaton<'a>(
@@ -253,8 +262,8 @@ impl SSTableIndexV3Empty {
} }
/// Get the [`BlockAddr`] of the block containing the `ord`-th term. /// Get the [`BlockAddr`] of the block containing the `ord`-th term.
pub(crate) fn get_block_with_ord(&self, _ord: TermOrdinal) -> BlockAddr { pub(crate) fn get_block_with_ord(&self, _ord: TermOrdinal) -> (BlockAddr, u64) {
self.block_addr.clone() (self.block_addr.clone(), u64::MAX)
} }
} }
#[derive(Clone, Eq, PartialEq, Debug)] #[derive(Clone, Eq, PartialEq, Debug)]
@@ -461,7 +470,11 @@ impl BlockAddrBlockMetadata {
}) })
} }
fn bisect_for_ord(&self, data: &[u8], target_ord: TermOrdinal) -> (u64, BlockAddr) { fn bisect_for_ord<const FETCH_NEXT_ORD: bool>(
&self,
data: &[u8],
target_ord: TermOrdinal,
) -> (u64, BlockAddr, u64) {
let inner_target_ord = target_ord - self.ref_block_addr.first_ordinal; let inner_target_ord = target_ord - self.ref_block_addr.first_ordinal;
let num_bits = self.num_bits() as usize; let num_bits = self.num_bits() as usize;
let range_start_nbits = self.range_start_nbits as usize; let range_start_nbits = self.range_start_nbits as usize;
@@ -481,11 +494,17 @@ impl BlockAddrBlockMetadata {
Err(inner_offset) => inner_offset, Err(inner_offset) => inner_offset,
}; };
// we can unwrap because inner_offset <= self.block_len // we can unwrap because inner_offset <= self.block_len
( let block = self
inner_offset, .deserialize_block_addr(data, inner_offset as usize)
self.deserialize_block_addr(data, inner_offset as usize) .unwrap();
.unwrap(), let next_ord = if FETCH_NEXT_ORD {
) self.deserialize_block_addr(data, inner_offset as usize + 1)
.map(|b| b.first_ordinal)
.unwrap_or(u64::MAX)
} else {
0
};
(inner_offset, block, next_ord)
} }
} }
@@ -591,7 +610,10 @@ impl BlockAddrStore {
) )
} }
fn binary_search_ord(&self, ord: TermOrdinal) -> (u64, BlockAddr) { fn binary_search_ord<const FETCH_NEXT_ORD: bool>(
&self,
ord: TermOrdinal,
) -> (u64, BlockAddr, u64) {
let max_block = let max_block =
(self.block_meta_bytes.len() / BlockAddrBlockMetadata::SIZE_IN_BYTES) as u64; (self.block_meta_bytes.len() / BlockAddrBlockMetadata::SIZE_IN_BYTES) as u64;
let get_first_ordinal = |block_id| { let get_first_ordinal = |block_id| {
@@ -606,20 +628,29 @@ impl BlockAddrStore {
Ok(store_block_id) => { Ok(store_block_id) => {
let block_id = store_block_id * STORE_BLOCK_LEN as u64; let block_id = store_block_id * STORE_BLOCK_LEN as u64;
// we can unwrap because store_block_id < max_block // we can unwrap because store_block_id < max_block
return (block_id, self.get(block_id).unwrap()); let next_ord = if FETCH_NEXT_ORD {
self.get(block_id + 1)
.map(|b| b.first_ordinal)
.unwrap_or(u64::MAX)
} else {
0
};
return (block_id, self.get(block_id).unwrap(), next_ord);
} }
Err(store_block_id) => store_block_id - 1, Err(store_block_id) => store_block_id - 1,
}; };
// we can unwrap because store_block_id < max_block // we can unwrap because store_block_id < max_block
let block_addr_block_data = self.get_block_meta(store_block_id as usize).unwrap(); let block_addr_block_data = self.get_block_meta(store_block_id as usize).unwrap();
let (inner_offset, block_addr) = block_addr_block_data.bisect_for_ord( let (inner_offset, block_addr, next_block_ord) = block_addr_block_data
.bisect_for_ord::<FETCH_NEXT_ORD>(
&self.addr_bytes[block_addr_block_data.offset as usize..], &self.addr_bytes[block_addr_block_data.offset as usize..],
ord, ord,
); );
( (
store_block_id * STORE_BLOCK_LEN as u64 + inner_offset, store_block_id * STORE_BLOCK_LEN as u64 + inner_offset,
block_addr, block_addr,
next_block_ord,
) )
} }
} }