add comment about cpu-intensive operation in async context

trinity-1686a
2024-12-20 12:20:35 +01:00
parent 42efc7f7c8
commit ebf4d84553
2 changed files with 6 additions and 1 deletion


@@ -351,11 +351,16 @@ impl InvertedIndexReader {
let mut _term_info = self
.get_term_range_async(.., automaton.clone(), None, merge_holes_under)
.await?;
// we build a 2nd iterator, this one with no holes, so we don't go through blocks we can't
// match, and just download them to reduce our query count. This makes the assumption
// there is a caching layer below, which might not always be true, but is in Quickwit.
let term_info = self.get_term_range_async(.., automaton, None, 0).await?;
+ // TODO this operation is often cheap for "friendly" automatons, but can be very costly for
+ // "unfriendly" ones such as ".*a{50}" (very few terms if any match this pattern, but we
+ // can't know early). In this case, we decompress and iterate over the entire sstable, while
+ // still being in async context. Ideally we should spawn this on a threadpool.
let range_to_load = term_info
.map(|term_info| term_info.postings_range)
.coalesce(|range1, range2| {

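The TODO added above describes CPU-bound work (decompressing and walking an entire sstable for an "unfriendly" automaton) running directly on the async executor. Below is a minimal sketch of the threadpool hand-off the comment suggests, assuming a Tokio runtime; `collect_postings_ranges` and `dictionary_bytes` are hypothetical stand-ins for the synchronous scan and its input, not tantivy or Quickwit APIs.

use std::ops::Range;

/// Hypothetical async wrapper: run the CPU-bound sstable scan on Tokio's
/// blocking threadpool so it does not stall other tasks on the async runtime.
async fn warm_ranges_off_executor(
    dictionary_bytes: Vec<u8>, // owned bytes moved into the blocking task (assumption)
) -> std::io::Result<Vec<Range<usize>>> {
    let ranges = tokio::task::spawn_blocking(move || {
        // the expensive, purely synchronous part: decompress blocks, iterate
        // matching terms, and collect the postings ranges to download
        collect_postings_ranges(&dictionary_bytes)
    })
    .await
    .expect("blocking task panicked")?;
    Ok(ranges)
}

/// Hypothetical stand-in for "decompress and iterate over the entire sstable".
fn collect_postings_ranges(_bytes: &[u8]) -> std::io::Result<Vec<Range<usize>>> {
    Ok(Vec::new())
}

spawn_blocking is only one option; a dedicated CPU threadpool would serve the same purpose. The point of the TODO is that the decompression loop should not occupy an async worker thread.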

@@ -132,7 +132,7 @@ fn match_range_start<S, A: Automaton<State = S>>(
automaton: &A,
mut state: S,
) -> bool {
- // case [abcdgj, abcpqr], `abcd` is already consumed, we need to handle:
+ // case ]abcdgj, abcpqr], `abcd` is already consumed, we need to handle:
// - [h-\xff].*
// - g[k-\xff].*
// - gj.+ == gj[\0-\xff].*
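For illustration, here is a sketch of the case analysis spelled out in the comment above: after the shared prefix is consumed, decide whether an automaton can still match some key strictly greater than the remaining start suffix ("gj" in the example). It uses the `fst` crate's `Automaton` trait (which `tantivy_fst` mirrors) and is an illustrative re-derivation, not the actual `match_range_start` implementation.

use fst::Automaton;

/// Can `automaton`, already in `state` after the shared prefix, match any key
/// strictly greater than `start_suffix`? For "gj" this mirrors the three cases
/// above: a larger first byte ([h-\xff].*), a larger second byte (g[k-\xff].*),
/// or the full suffix plus at least one extra byte (gj.+ == gj[\0-\xff].*).
fn can_match_after_start<A: Automaton>(
    automaton: &A,
    mut state: A::State,
    start_suffix: &[u8],
) -> bool {
    for &byte in start_suffix {
        // keys equal to the suffix so far, followed by a strictly larger byte
        for larger in (byte as u16 + 1)..=0xff {
            let next = automaton.accept(&state, larger as u8);
            if automaton.can_match(&next) {
                return true;
            }
        }
        // otherwise follow the exact byte and examine the next position
        state = automaton.accept(&state, byte);
        if !automaton.can_match(&state) {
            return false;
        }
    }
    // the whole suffix matched exactly; the start bound is exclusive, so we
    // still need at least one extra byte of any value
    (0u16..=0xff).any(|b| automaton.can_match(&automaton.accept(&state, b as u8)))
}

Enumerating all 256 byte values per position is just the most literal way to express the [h-\xff]-style classes; a real implementation can be cleverer, but the case split is the same.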