feat: MatchesConstTerm displays probes (#6518)

* feat: `MatchesConstTerm` displays probes

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix fmt

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2026-01-05 21:02:58 +00:00 · 2025-07-17 15:01:55 +08:00
parent 50148b25b5
commit eb99e439c7
2 changed files with 48 additions and 19 deletions
--- a/src/mito2/src/sst/index/fulltext_index/applier.rs
+++ b/src/mito2/src/sst/index/fulltext_index/applier.rs
@@ -22,7 +22,8 @@ use common_telemetry::warn;
 use index::bloom_filter::applier::{BloomFilterApplier, InListPredicate};
 use index::bloom_filter::reader::BloomFilterReaderImpl;
 use index::fulltext_index::search::{FulltextIndexSearcher, RowId, TantivyFulltextIndexSearcher};
-use index::fulltext_index::Config;
+use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer};
+use index::fulltext_index::{Analyzer, Config};
 use object_store::ObjectStore;
 use puffin::puffin_manager::cache::PuffinMetadataCacheRef;
 use puffin::puffin_manager::{GuardWithMetadata, PuffinManager, PuffinReader};
@@ -393,21 +394,7 @@ impl FulltextIndexApplier {
                // lowercased terms are not indexed
                continue;
            }
-
-            let ts = term
-                .term
-                .split(|c: char| !c.is_alphanumeric())
-                .filter(|&t| !t.is_empty())
-                .map(|t| {
-                    if !config.case_sensitive {
-                        t.to_lowercase()
-                    } else {
-                        t.to_string()
-                    }
-                    .into_bytes()
-                });
-
-            probes.extend(ts);
+            probes.extend(Self::term_to_probes(&term.term, config));
        }

        probes
@@ -417,6 +404,22 @@ impl FulltextIndexApplier {
            })
            .collect::<Vec<_>>()
    }
+
+    fn term_to_probes<'a>(term: &'a str, config: &'a Config) -> impl Iterator<Item = Vec<u8>> + 'a {
+        let tokens = match config.analyzer {
+            Analyzer::English => EnglishTokenizer {}.tokenize(term),
+            Analyzer::Chinese => ChineseTokenizer {}.tokenize(term),
+        };
+
+        tokens.into_iter().map(|t| {
+            if !config.case_sensitive {
+                t.to_lowercase()
+            } else {
+                t.to_string()
+            }
+            .into_bytes()
+        })
+    }
 }

 /// The source of the index.
--- a/src/query/src/optimizer/constant_term.rs
+++ b/src/query/src/optimizer/constant_term.rs
@@ -42,11 +42,19 @@ pub struct PreCompiledMatchesTermExpr {
    term: String,
    /// The pre-compiled term finder
    finder: MatchesTermFinder,
+
+    /// Not used, but shows roughly how the index tokenizes the term.
+    /// Not precise because the column options are unknown, but good enough for debugging in most cases.
+    probes: Vec<String>,
 }

 impl fmt::Display for PreCompiledMatchesTermExpr {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "MatchesConstTerm({}, \"{}\")", self.text, self.term)
+        write!(
+            f,
+            "MatchesConstTerm({}, term: \"{}\", probes: {:?})",
+            self.text, self.term, self.probes
+        )
    }
 }

@@ -118,6 +126,7 @@ impl PhysicalExpr for PreCompiledMatchesTermExpr {
            text: children[0].clone(),
            term: self.term.clone(),
            finder: self.finder.clone(),
+            probes: self.probes.clone(),
        }))
    }
 }
@@ -167,10 +176,19 @@ impl PhysicalOptimizerRule for MatchesConstantTermOptimizer {
                            if let Some(lit) = args[1].as_any().downcast_ref::<Literal>() {
                                if let ScalarValue::Utf8(Some(term)) = lit.value() {
                                    let finder = MatchesTermFinder::new(term);
+
+                                    // For debugging purposes. Not really precise, but enough for most cases.
+                                    let probes = term
+                                        .split(|c: char| !c.is_alphanumeric())
+                                        .filter(|s| !s.is_empty())
+                                        .map(|s| s.to_string())
+                                        .collect();
+
                                    let expr = PreCompiledMatchesTermExpr {
                                        text: args[0].clone(),
                                        term: term.to_string(),
                                        finder,
+                                        probes,
                                    };

                                    return Ok(Transformed::yes(Arc::new(expr)));
@@ -390,7 +408,7 @@ mod tests {
    async fn test_matches_term_optimization_from_sql() {
        let sql = "WITH base AS (
        SELECT text, timestamp FROM test 
-        WHERE MATCHES_TERM(text, 'hello') 
+        WHERE MATCHES_TERM(text, 'hello world') 
        AND timestamp > '2025-01-01 00:00:00'
    ),
    subquery1 AS (
@@ -448,7 +466,15 @@ mod tests {
            .unwrap();

        let plan_str = get_plan_string(&physical_plan).join("\n");
-        assert!(plan_str.contains("MatchesConstTerm"));
+        assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"foo\", probes: [\"foo\"]"));
+        assert!(plan_str.contains(
+            "MatchesConstTerm(text@0, term: \"hello world\", probes: [\"hello\", \"world\"]"
+        ));
+        assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"world\", probes: [\"world\"]"));
+        assert!(plan_str
+            .contains("MatchesConstTerm(text@0, term: \"greeting\", probes: [\"greeting\"]"));
+        assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"there\", probes: [\"there\"]"));
+        assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"42\", probes: [\"42\"]"));
        assert!(!plan_str.contains("matches_term"))
    }
 }