mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
attempt to optimize sorted_ords_to_term_cb
This commit is contained in:
@@ -487,7 +487,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
/// the buffer may be modified.
|
||||
pub fn ord_to_term(&self, ord: TermOrdinal, bytes: &mut Vec<u8>) -> io::Result<bool> {
|
||||
// find block in which the term would be
|
||||
let block_addr = self.sstable_index.get_block_with_ord(ord);
|
||||
let block_addr = self.sstable_index.get_block_with_ord::<false>(ord).0;
|
||||
let first_ordinal = block_addr.first_ordinal;
|
||||
|
||||
// then search inside that block only
|
||||
@@ -511,16 +511,17 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
mut cb: F,
|
||||
) -> io::Result<bool> {
|
||||
let mut bytes = Vec::new();
|
||||
let mut current_block_addr = self.sstable_index.get_block_with_ord(0);
|
||||
let (mut current_block_addr, mut next_block_ord) =
|
||||
self.sstable_index.get_block_with_ord::<true>(0);
|
||||
let mut current_sstable_delta_reader =
|
||||
self.sstable_delta_reader_block(current_block_addr.clone())?;
|
||||
let mut current_ordinal = 0;
|
||||
for ord in ord {
|
||||
assert!(ord >= current_ordinal);
|
||||
// check if block changed for new term_ord
|
||||
let new_block_addr = self.sstable_index.get_block_with_ord(ord);
|
||||
if new_block_addr != current_block_addr {
|
||||
current_block_addr = new_block_addr;
|
||||
if ord >= next_block_ord {
|
||||
(current_block_addr, next_block_ord) =
|
||||
self.sstable_index.get_block_with_ord::<true>(ord);
|
||||
current_ordinal = current_block_addr.first_ordinal;
|
||||
current_sstable_delta_reader =
|
||||
self.sstable_delta_reader_block(current_block_addr.clone())?;
|
||||
@@ -544,7 +545,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
/// Returns the number of terms in the dictionary.
|
||||
pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> io::Result<Option<TSSTable::Value>> {
|
||||
// find block in which the term would be
|
||||
let block_addr = self.sstable_index.get_block_with_ord(term_ord);
|
||||
let block_addr = self.sstable_index.get_block_with_ord::<false>(term_ord).0;
|
||||
let first_ordinal = block_addr.first_ordinal;
|
||||
|
||||
// then search inside that block only
|
||||
@@ -846,7 +847,7 @@ mod tests {
|
||||
fn test_ord_term_conversion() {
|
||||
let (dic, slice) = make_test_sstable();
|
||||
|
||||
let block = dic.sstable_index.get_block_with_ord(100_000);
|
||||
let block = dic.sstable_index.get_block_with_ord::<false>(100_000).0;
|
||||
slice.restrict(block.byte_range);
|
||||
|
||||
let mut res = Vec::new();
|
||||
@@ -872,7 +873,11 @@ mod tests {
|
||||
|
||||
// end of a block
|
||||
let ordinal = block.first_ordinal - 1;
|
||||
let new_range = dic.sstable_index.get_block_with_ord(ordinal).byte_range;
|
||||
let new_range = dic
|
||||
.sstable_index
|
||||
.get_block_with_ord::<false>(ordinal)
|
||||
.0
|
||||
.byte_range;
|
||||
slice.restrict(new_range);
|
||||
assert!(dic.ord_to_term(ordinal, &mut res).unwrap());
|
||||
assert_eq!(res, format!("{ordinal:05X}").into_bytes());
|
||||
@@ -882,7 +887,7 @@ mod tests {
|
||||
|
||||
// before first block
|
||||
// 1st block must be loaded for key-related operations
|
||||
let block = dic.sstable_index.get_block_with_ord(0);
|
||||
let block = dic.sstable_index.get_block_with_ord::<false>(0).0;
|
||||
slice.restrict(block.byte_range);
|
||||
|
||||
assert!(dic.get(b"$$$").unwrap().is_none());
|
||||
@@ -891,7 +896,11 @@ mod tests {
|
||||
// after last block
|
||||
// last block must be loaded for ord related operations
|
||||
let ordinal = 0x40000 + 10;
|
||||
let new_range = dic.sstable_index.get_block_with_ord(ordinal).byte_range;
|
||||
let new_range = dic
|
||||
.sstable_index
|
||||
.get_block_with_ord::<false>(ordinal)
|
||||
.0
|
||||
.byte_range;
|
||||
slice.restrict(new_range);
|
||||
assert!(!dic.ord_to_term(ordinal, &mut res).unwrap());
|
||||
assert!(dic.term_info_from_ord(ordinal).unwrap().is_none());
|
||||
|
||||
@@ -72,9 +72,15 @@ impl SSTableIndex {
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
|
||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> (BlockAddr, u64) {
|
||||
// locate_with_ord always returns an index within range
|
||||
self.get_block(self.locate_with_ord(ord)).unwrap()
|
||||
let block_pos = self.locate_with_ord(ord);
|
||||
(
|
||||
self.get_block(block_pos).unwrap(),
|
||||
self.get_block(block_pos + 1)
|
||||
.map(|b| b.first_ordinal)
|
||||
.unwrap_or(u64::MAX),
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn get_block_for_automaton<'a>(
|
||||
|
||||
@@ -58,10 +58,13 @@ impl SSTableIndex {
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
|
||||
pub(crate) fn get_block_with_ord<const FETCH_NEXT_ORD: bool>(
|
||||
&self,
|
||||
ord: TermOrdinal,
|
||||
) -> (BlockAddr, u64) {
|
||||
match self {
|
||||
SSTableIndex::V2(v2_index) => v2_index.get_block_with_ord(ord),
|
||||
SSTableIndex::V3(v3_index) => v3_index.get_block_with_ord(ord),
|
||||
SSTableIndex::V3(v3_index) => v3_index.get_block_with_ord::<FETCH_NEXT_ORD>(ord),
|
||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.get_block_with_ord(ord),
|
||||
}
|
||||
}
|
||||
@@ -152,12 +155,18 @@ impl SSTableIndexV3 {
|
||||
}
|
||||
|
||||
pub(crate) fn locate_with_ord(&self, ord: TermOrdinal) -> u64 {
|
||||
self.block_addr_store.binary_search_ord(ord).0
|
||||
self.block_addr_store.binary_search_ord::<false>(ord).0
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
|
||||
self.block_addr_store.binary_search_ord(ord).1
|
||||
pub(crate) fn get_block_with_ord<const FETCH_NEXT_ORD: bool>(
|
||||
&self,
|
||||
ord: TermOrdinal,
|
||||
) -> (BlockAddr, u64) {
|
||||
let (_block_id, block_addr, next_ord) = self
|
||||
.block_addr_store
|
||||
.binary_search_ord::<FETCH_NEXT_ORD>(ord);
|
||||
(block_addr, next_ord)
|
||||
}
|
||||
|
||||
pub(crate) fn get_block_for_automaton<'a>(
|
||||
@@ -253,8 +262,8 @@ impl SSTableIndexV3Empty {
|
||||
}
|
||||
|
||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||
pub(crate) fn get_block_with_ord(&self, _ord: TermOrdinal) -> BlockAddr {
|
||||
self.block_addr.clone()
|
||||
pub(crate) fn get_block_with_ord(&self, _ord: TermOrdinal) -> (BlockAddr, u64) {
|
||||
(self.block_addr.clone(), u64::MAX)
|
||||
}
|
||||
}
|
||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||
@@ -461,7 +470,11 @@ impl BlockAddrBlockMetadata {
|
||||
})
|
||||
}
|
||||
|
||||
fn bisect_for_ord(&self, data: &[u8], target_ord: TermOrdinal) -> (u64, BlockAddr) {
|
||||
fn bisect_for_ord<const FETCH_NEXT_ORD: bool>(
|
||||
&self,
|
||||
data: &[u8],
|
||||
target_ord: TermOrdinal,
|
||||
) -> (u64, BlockAddr, u64) {
|
||||
let inner_target_ord = target_ord - self.ref_block_addr.first_ordinal;
|
||||
let num_bits = self.num_bits() as usize;
|
||||
let range_start_nbits = self.range_start_nbits as usize;
|
||||
@@ -481,11 +494,17 @@ impl BlockAddrBlockMetadata {
|
||||
Err(inner_offset) => inner_offset,
|
||||
};
|
||||
// we can unwrap because inner_offset <= self.block_len
|
||||
(
|
||||
inner_offset,
|
||||
self.deserialize_block_addr(data, inner_offset as usize)
|
||||
.unwrap(),
|
||||
)
|
||||
let block = self
|
||||
.deserialize_block_addr(data, inner_offset as usize)
|
||||
.unwrap();
|
||||
let next_ord = if FETCH_NEXT_ORD {
|
||||
self.deserialize_block_addr(data, inner_offset as usize + 1)
|
||||
.map(|b| b.first_ordinal)
|
||||
.unwrap_or(u64::MAX)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
(inner_offset, block, next_ord)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -591,7 +610,10 @@ impl BlockAddrStore {
|
||||
)
|
||||
}
|
||||
|
||||
fn binary_search_ord(&self, ord: TermOrdinal) -> (u64, BlockAddr) {
|
||||
fn binary_search_ord<const FETCH_NEXT_ORD: bool>(
|
||||
&self,
|
||||
ord: TermOrdinal,
|
||||
) -> (u64, BlockAddr, u64) {
|
||||
let max_block =
|
||||
(self.block_meta_bytes.len() / BlockAddrBlockMetadata::SIZE_IN_BYTES) as u64;
|
||||
let get_first_ordinal = |block_id| {
|
||||
@@ -606,20 +628,29 @@ impl BlockAddrStore {
|
||||
Ok(store_block_id) => {
|
||||
let block_id = store_block_id * STORE_BLOCK_LEN as u64;
|
||||
// we can unwrap because store_block_id < max_block
|
||||
return (block_id, self.get(block_id).unwrap());
|
||||
let next_ord = if FETCH_NEXT_ORD {
|
||||
self.get(block_id + 1)
|
||||
.map(|b| b.first_ordinal)
|
||||
.unwrap_or(u64::MAX)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
return (block_id, self.get(block_id).unwrap(), next_ord);
|
||||
}
|
||||
Err(store_block_id) => store_block_id - 1,
|
||||
};
|
||||
|
||||
// we can unwrap because store_block_id < max_block
|
||||
let block_addr_block_data = self.get_block_meta(store_block_id as usize).unwrap();
|
||||
let (inner_offset, block_addr) = block_addr_block_data.bisect_for_ord(
|
||||
&self.addr_bytes[block_addr_block_data.offset as usize..],
|
||||
ord,
|
||||
);
|
||||
let (inner_offset, block_addr, next_block_ord) = block_addr_block_data
|
||||
.bisect_for_ord::<FETCH_NEXT_ORD>(
|
||||
&self.addr_bytes[block_addr_block_data.offset as usize..],
|
||||
ord,
|
||||
);
|
||||
(
|
||||
store_block_id * STORE_BLOCK_LEN as u64 + inner_offset,
|
||||
block_addr,
|
||||
next_block_ord,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user