From ebf4d84553a63e1ab7710f7f635c04d6452460d6 Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Fri, 20 Dec 2024 12:20:35 +0100 Subject: [PATCH] add comment about cpu-intensive operation in async context --- src/index/inverted_index_reader.rs | 5 +++++ sstable/src/block_match_automaton.rs | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/index/inverted_index_reader.rs b/src/index/inverted_index_reader.rs index d3f8f4640..7a6655ca9 100644 --- a/src/index/inverted_index_reader.rs +++ b/src/index/inverted_index_reader.rs @@ -351,11 +351,16 @@ impl InvertedIndexReader { let mut _term_info = self .get_term_range_async(.., automaton.clone(), None, merge_holes_under) .await?; + // we build a 2nd iterator, this one with no holes, so we don't go through blocks we can't // match, and just download them to reduce our query count. This makes the assumption // there is a caching layer below, which might not always be true, but is in Quickwit. let term_info = self.get_term_range_async(.., automaton, None, 0).await?; + // TODO this operation is often cheap for "friendly" automatons, but can be very costly for + // "unfriendly" ones such as ".*a{50}" (very few terms if any match this pattern, but we + // can't know early). In this case, we decompress and iterate over the entire sstable, while + // still being in an async context. Ideally we should spawn this on a threadpool. 
let range_to_load = term_info .map(|term_info| term_info.postings_range) .coalesce(|range1, range2| { diff --git a/sstable/src/block_match_automaton.rs b/sstable/src/block_match_automaton.rs index e3be4b56f..bb516ea2e 100644 --- a/sstable/src/block_match_automaton.rs +++ b/sstable/src/block_match_automaton.rs @@ -132,7 +132,7 @@ fn match_range_start>( automaton: &A, mut state: S, ) -> bool { - // case [abcdgj, abcpqr], `abcd` is already consumed, we need to handle: + // case ]abcdgj, abcpqr], `abcd` is already consumed, we need to handle: // - [h-\xff].* // - g[k-\xff].* // - gj.+ == gf[\0-\xff].*