mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
attempt to optimize sorted_ords_to_term_cb
This commit is contained in:
@@ -487,7 +487,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
|||||||
/// the buffer may be modified.
|
/// the buffer may be modified.
|
||||||
pub fn ord_to_term(&self, ord: TermOrdinal, bytes: &mut Vec<u8>) -> io::Result<bool> {
|
pub fn ord_to_term(&self, ord: TermOrdinal, bytes: &mut Vec<u8>) -> io::Result<bool> {
|
||||||
// find block in which the term would be
|
// find block in which the term would be
|
||||||
let block_addr = self.sstable_index.get_block_with_ord(ord);
|
let block_addr = self.sstable_index.get_block_with_ord::<false>(ord).0;
|
||||||
let first_ordinal = block_addr.first_ordinal;
|
let first_ordinal = block_addr.first_ordinal;
|
||||||
|
|
||||||
// then search inside that block only
|
// then search inside that block only
|
||||||
@@ -511,16 +511,17 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
|||||||
mut cb: F,
|
mut cb: F,
|
||||||
) -> io::Result<bool> {
|
) -> io::Result<bool> {
|
||||||
let mut bytes = Vec::new();
|
let mut bytes = Vec::new();
|
||||||
let mut current_block_addr = self.sstable_index.get_block_with_ord(0);
|
let (mut current_block_addr, mut next_block_ord) =
|
||||||
|
self.sstable_index.get_block_with_ord::<true>(0);
|
||||||
let mut current_sstable_delta_reader =
|
let mut current_sstable_delta_reader =
|
||||||
self.sstable_delta_reader_block(current_block_addr.clone())?;
|
self.sstable_delta_reader_block(current_block_addr.clone())?;
|
||||||
let mut current_ordinal = 0;
|
let mut current_ordinal = 0;
|
||||||
for ord in ord {
|
for ord in ord {
|
||||||
assert!(ord >= current_ordinal);
|
assert!(ord >= current_ordinal);
|
||||||
// check if block changed for new term_ord
|
// check if block changed for new term_ord
|
||||||
let new_block_addr = self.sstable_index.get_block_with_ord(ord);
|
if ord >= next_block_ord {
|
||||||
if new_block_addr != current_block_addr {
|
(current_block_addr, next_block_ord) =
|
||||||
current_block_addr = new_block_addr;
|
self.sstable_index.get_block_with_ord::<true>(ord);
|
||||||
current_ordinal = current_block_addr.first_ordinal;
|
current_ordinal = current_block_addr.first_ordinal;
|
||||||
current_sstable_delta_reader =
|
current_sstable_delta_reader =
|
||||||
self.sstable_delta_reader_block(current_block_addr.clone())?;
|
self.sstable_delta_reader_block(current_block_addr.clone())?;
|
||||||
@@ -544,7 +545,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
|||||||
/// Returns the value (term info) stored for the term with ordinal `term_ord`, if there is one.
|
/// Returns the value (term info) stored for the term with ordinal `term_ord`, if there is one.
|
||||||
pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> io::Result<Option<TSSTable::Value>> {
|
pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> io::Result<Option<TSSTable::Value>> {
|
||||||
// find block in which the term would be
|
// find block in which the term would be
|
||||||
let block_addr = self.sstable_index.get_block_with_ord(term_ord);
|
let block_addr = self.sstable_index.get_block_with_ord::<false>(term_ord).0;
|
||||||
let first_ordinal = block_addr.first_ordinal;
|
let first_ordinal = block_addr.first_ordinal;
|
||||||
|
|
||||||
// then search inside that block only
|
// then search inside that block only
|
||||||
@@ -846,7 +847,7 @@ mod tests {
|
|||||||
fn test_ord_term_conversion() {
|
fn test_ord_term_conversion() {
|
||||||
let (dic, slice) = make_test_sstable();
|
let (dic, slice) = make_test_sstable();
|
||||||
|
|
||||||
let block = dic.sstable_index.get_block_with_ord(100_000);
|
let block = dic.sstable_index.get_block_with_ord::<false>(100_000).0;
|
||||||
slice.restrict(block.byte_range);
|
slice.restrict(block.byte_range);
|
||||||
|
|
||||||
let mut res = Vec::new();
|
let mut res = Vec::new();
|
||||||
@@ -872,7 +873,11 @@ mod tests {
|
|||||||
|
|
||||||
// end of a block
|
// end of a block
|
||||||
let ordinal = block.first_ordinal - 1;
|
let ordinal = block.first_ordinal - 1;
|
||||||
let new_range = dic.sstable_index.get_block_with_ord(ordinal).byte_range;
|
let new_range = dic
|
||||||
|
.sstable_index
|
||||||
|
.get_block_with_ord::<false>(ordinal)
|
||||||
|
.0
|
||||||
|
.byte_range;
|
||||||
slice.restrict(new_range);
|
slice.restrict(new_range);
|
||||||
assert!(dic.ord_to_term(ordinal, &mut res).unwrap());
|
assert!(dic.ord_to_term(ordinal, &mut res).unwrap());
|
||||||
assert_eq!(res, format!("{ordinal:05X}").into_bytes());
|
assert_eq!(res, format!("{ordinal:05X}").into_bytes());
|
||||||
@@ -882,7 +887,7 @@ mod tests {
|
|||||||
|
|
||||||
// before first block
|
// before first block
|
||||||
// 1st block must be loaded for key-related operations
|
// 1st block must be loaded for key-related operations
|
||||||
let block = dic.sstable_index.get_block_with_ord(0);
|
let block = dic.sstable_index.get_block_with_ord::<false>(0).0;
|
||||||
slice.restrict(block.byte_range);
|
slice.restrict(block.byte_range);
|
||||||
|
|
||||||
assert!(dic.get(b"$$$").unwrap().is_none());
|
assert!(dic.get(b"$$$").unwrap().is_none());
|
||||||
@@ -891,7 +896,11 @@ mod tests {
|
|||||||
// after last block
|
// after last block
|
||||||
// last block must be loaded for ord related operations
|
// last block must be loaded for ord related operations
|
||||||
let ordinal = 0x40000 + 10;
|
let ordinal = 0x40000 + 10;
|
||||||
let new_range = dic.sstable_index.get_block_with_ord(ordinal).byte_range;
|
let new_range = dic
|
||||||
|
.sstable_index
|
||||||
|
.get_block_with_ord::<false>(ordinal)
|
||||||
|
.0
|
||||||
|
.byte_range;
|
||||||
slice.restrict(new_range);
|
slice.restrict(new_range);
|
||||||
assert!(!dic.ord_to_term(ordinal, &mut res).unwrap());
|
assert!(!dic.ord_to_term(ordinal, &mut res).unwrap());
|
||||||
assert!(dic.term_info_from_ord(ordinal).unwrap().is_none());
|
assert!(dic.term_info_from_ord(ordinal).unwrap().is_none());
|
||||||
|
|||||||
@@ -72,9 +72,15 @@ impl SSTableIndex {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
|
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> (BlockAddr, u64) {
|
||||||
// locate_with_ord always returns an index within range
|
// locate_with_ord always returns an index within range
|
||||||
self.get_block(self.locate_with_ord(ord)).unwrap()
|
let block_pos = self.locate_with_ord(ord);
|
||||||
|
(
|
||||||
|
self.get_block(block_pos).unwrap(),
|
||||||
|
self.get_block(block_pos + 1)
|
||||||
|
.map(|b| b.first_ordinal)
|
||||||
|
.unwrap_or(u64::MAX),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn get_block_for_automaton<'a>(
|
pub(crate) fn get_block_for_automaton<'a>(
|
||||||
|
|||||||
@@ -58,10 +58,13 @@ impl SSTableIndex {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
|
pub(crate) fn get_block_with_ord<const FETCH_NEXT_ORD: bool>(
|
||||||
|
&self,
|
||||||
|
ord: TermOrdinal,
|
||||||
|
) -> (BlockAddr, u64) {
|
||||||
match self {
|
match self {
|
||||||
SSTableIndex::V2(v2_index) => v2_index.get_block_with_ord(ord),
|
SSTableIndex::V2(v2_index) => v2_index.get_block_with_ord(ord),
|
||||||
SSTableIndex::V3(v3_index) => v3_index.get_block_with_ord(ord),
|
SSTableIndex::V3(v3_index) => v3_index.get_block_with_ord::<FETCH_NEXT_ORD>(ord),
|
||||||
SSTableIndex::V3Empty(v3_empty) => v3_empty.get_block_with_ord(ord),
|
SSTableIndex::V3Empty(v3_empty) => v3_empty.get_block_with_ord(ord),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -152,12 +155,18 @@ impl SSTableIndexV3 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn locate_with_ord(&self, ord: TermOrdinal) -> u64 {
|
pub(crate) fn locate_with_ord(&self, ord: TermOrdinal) -> u64 {
|
||||||
self.block_addr_store.binary_search_ord(ord).0
|
self.block_addr_store.binary_search_ord::<false>(ord).0
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||||
pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
|
pub(crate) fn get_block_with_ord<const FETCH_NEXT_ORD: bool>(
|
||||||
self.block_addr_store.binary_search_ord(ord).1
|
&self,
|
||||||
|
ord: TermOrdinal,
|
||||||
|
) -> (BlockAddr, u64) {
|
||||||
|
let (_block_id, block_addr, next_ord) = self
|
||||||
|
.block_addr_store
|
||||||
|
.binary_search_ord::<FETCH_NEXT_ORD>(ord);
|
||||||
|
(block_addr, next_ord)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn get_block_for_automaton<'a>(
|
pub(crate) fn get_block_for_automaton<'a>(
|
||||||
@@ -253,8 +262,8 @@ impl SSTableIndexV3Empty {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
/// Get the [`BlockAddr`] of the block containing the `ord`-th term.
|
||||||
pub(crate) fn get_block_with_ord(&self, _ord: TermOrdinal) -> BlockAddr {
|
pub(crate) fn get_block_with_ord(&self, _ord: TermOrdinal) -> (BlockAddr, u64) {
|
||||||
self.block_addr.clone()
|
(self.block_addr.clone(), u64::MAX)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||||
@@ -461,7 +470,11 @@ impl BlockAddrBlockMetadata {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn bisect_for_ord(&self, data: &[u8], target_ord: TermOrdinal) -> (u64, BlockAddr) {
|
fn bisect_for_ord<const FETCH_NEXT_ORD: bool>(
|
||||||
|
&self,
|
||||||
|
data: &[u8],
|
||||||
|
target_ord: TermOrdinal,
|
||||||
|
) -> (u64, BlockAddr, u64) {
|
||||||
let inner_target_ord = target_ord - self.ref_block_addr.first_ordinal;
|
let inner_target_ord = target_ord - self.ref_block_addr.first_ordinal;
|
||||||
let num_bits = self.num_bits() as usize;
|
let num_bits = self.num_bits() as usize;
|
||||||
let range_start_nbits = self.range_start_nbits as usize;
|
let range_start_nbits = self.range_start_nbits as usize;
|
||||||
@@ -481,11 +494,17 @@ impl BlockAddrBlockMetadata {
|
|||||||
Err(inner_offset) => inner_offset,
|
Err(inner_offset) => inner_offset,
|
||||||
};
|
};
|
||||||
// we can unwrap because inner_offset <= self.block_len
|
// we can unwrap because inner_offset <= self.block_len
|
||||||
(
|
let block = self
|
||||||
inner_offset,
|
.deserialize_block_addr(data, inner_offset as usize)
|
||||||
self.deserialize_block_addr(data, inner_offset as usize)
|
.unwrap();
|
||||||
.unwrap(),
|
let next_ord = if FETCH_NEXT_ORD {
|
||||||
)
|
self.deserialize_block_addr(data, inner_offset as usize + 1)
|
||||||
|
.map(|b| b.first_ordinal)
|
||||||
|
.unwrap_or(u64::MAX)
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
|
(inner_offset, block, next_ord)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -591,7 +610,10 @@ impl BlockAddrStore {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn binary_search_ord(&self, ord: TermOrdinal) -> (u64, BlockAddr) {
|
fn binary_search_ord<const FETCH_NEXT_ORD: bool>(
|
||||||
|
&self,
|
||||||
|
ord: TermOrdinal,
|
||||||
|
) -> (u64, BlockAddr, u64) {
|
||||||
let max_block =
|
let max_block =
|
||||||
(self.block_meta_bytes.len() / BlockAddrBlockMetadata::SIZE_IN_BYTES) as u64;
|
(self.block_meta_bytes.len() / BlockAddrBlockMetadata::SIZE_IN_BYTES) as u64;
|
||||||
let get_first_ordinal = |block_id| {
|
let get_first_ordinal = |block_id| {
|
||||||
@@ -606,20 +628,29 @@ impl BlockAddrStore {
|
|||||||
Ok(store_block_id) => {
|
Ok(store_block_id) => {
|
||||||
let block_id = store_block_id * STORE_BLOCK_LEN as u64;
|
let block_id = store_block_id * STORE_BLOCK_LEN as u64;
|
||||||
// we can unwrap because store_block_id < max_block
|
// we can unwrap because store_block_id < max_block
|
||||||
return (block_id, self.get(block_id).unwrap());
|
let next_ord = if FETCH_NEXT_ORD {
|
||||||
|
self.get(block_id + 1)
|
||||||
|
.map(|b| b.first_ordinal)
|
||||||
|
.unwrap_or(u64::MAX)
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
|
return (block_id, self.get(block_id).unwrap(), next_ord);
|
||||||
}
|
}
|
||||||
Err(store_block_id) => store_block_id - 1,
|
Err(store_block_id) => store_block_id - 1,
|
||||||
};
|
};
|
||||||
|
|
||||||
// we can unwrap because store_block_id < max_block
|
// we can unwrap because store_block_id < max_block
|
||||||
let block_addr_block_data = self.get_block_meta(store_block_id as usize).unwrap();
|
let block_addr_block_data = self.get_block_meta(store_block_id as usize).unwrap();
|
||||||
let (inner_offset, block_addr) = block_addr_block_data.bisect_for_ord(
|
let (inner_offset, block_addr, next_block_ord) = block_addr_block_data
|
||||||
&self.addr_bytes[block_addr_block_data.offset as usize..],
|
.bisect_for_ord::<FETCH_NEXT_ORD>(
|
||||||
ord,
|
&self.addr_bytes[block_addr_block_data.offset as usize..],
|
||||||
);
|
ord,
|
||||||
|
);
|
||||||
(
|
(
|
||||||
store_block_id * STORE_BLOCK_LEN as u64 + inner_offset,
|
store_block_id * STORE_BLOCK_LEN as u64 + inner_offset,
|
||||||
block_addr,
|
block_addr,
|
||||||
|
next_block_ord,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user