From ebf4d84553a63e1ab7710f7f635c04d6452460d6 Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Fri, 20 Dec 2024 12:20:35 +0100 Subject: [PATCH] add comment about cpu-intensive operation in async context --- src/index/inverted_index_reader.rs | 5 +++++ sstable/src/block_match_automaton.rs | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/index/inverted_index_reader.rs b/src/index/inverted_index_reader.rs index d3f8f4640..7a6655ca9 100644 --- a/src/index/inverted_index_reader.rs +++ b/src/index/inverted_index_reader.rs @@ -351,11 +351,16 @@ impl InvertedIndexReader { let mut _term_info = self .get_term_range_async(.., automaton.clone(), None, merge_holes_under) .await?; + // we build a 2nd iterator, this one with no holes, so we don't go through blocks we can't // match, and just download them to reduce our query count. This makes the assumption // there is a caching layer below, which might not always be true, but is in Quickwit. let term_info = self.get_term_range_async(.., automaton, None, 0).await?; + // TODO this operation is often cheap for "friendly" automatons, but can be very costly for + // "unfriendly" ones such as ".*a{50}" (very few terms if any match this pattern, but we + // can't know early). In this case, we decompress and iterate over the entire sstable, while + // still being in an async context. Ideally we should spawn this on a threadpool. 
let range_to_load = term_info .map(|term_info| term_info.postings_range) .coalesce(|range1, range2| { diff --git a/sstable/src/block_match_automaton.rs b/sstable/src/block_match_automaton.rs index e3be4b56f..bb516ea2e 100644 --- a/sstable/src/block_match_automaton.rs +++ b/sstable/src/block_match_automaton.rs @@ -132,7 +132,7 @@ fn match_range_start>( automaton: &A, mut state: S, ) -> bool { - // case [abcdgj, abcpqr], `abcd` is already consumed, we need to handle: + // case ]abcdgj, abcpqr], `abcd` is already consumed, we need to handle: // - [h-\xff].* // - g[k-\xff].* // - gj.+ == gf[\0-\xff].*