From 772bc21b6564971e9c416b73d5f474edbe24acbb Mon Sep 17 00:00:00 2001
From: Zhenchi <zhongzc_arch@outlook.com>
Date: Thu, 17 Jul 2025 15:01:55 +0800
Subject: [PATCH] feat: `MatchesConstTerm` displays probes (#6518)

* feat: `MatchesConstTerm` displays probes

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix fmt

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
---
 .../src/sst/index/fulltext_index/applier.rs   | 35 ++++++++++---------
 src/query/src/optimizer/constant_term.rs      | 32 +++++++++++++++--
 2 files changed, 48 insertions(+), 19 deletions(-)
diff --git a/src/mito2/src/sst/index/fulltext_index/applier.rs b/src/mito2/src/sst/index/fulltext_index/applier.rs
index ac14579de6..b46af9e8da 100644
--- a/src/mito2/src/sst/index/fulltext_index/applier.rs
+++ b/src/mito2/src/sst/index/fulltext_index/applier.rs
@@ -22,7 +22,8 @@ use common_telemetry::warn;
 use index::bloom_filter::applier::{BloomFilterApplier, InListPredicate};
 use index::bloom_filter::reader::BloomFilterReaderImpl;
 use index::fulltext_index::search::{FulltextIndexSearcher, RowId, TantivyFulltextIndexSearcher};
-use index::fulltext_index::Config;
+use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer};
+use index::fulltext_index::{Analyzer, Config};
 use object_store::ObjectStore;
 use puffin::puffin_manager::cache::PuffinMetadataCacheRef;
 use puffin::puffin_manager::{GuardWithMetadata, PuffinManager, PuffinReader};
@@ -393,21 +394,7 @@ impl FulltextIndexApplier {
                 // lowercased terms are not indexed
                 continue;
             }
-
-            let ts = term
-                .term
-                .split(|c: char| !c.is_alphanumeric())
-                .filter(|&t| !t.is_empty())
-                .map(|t| {
-                    if !config.case_sensitive {
-                        t.to_lowercase()
-                    } else {
-                        t.to_string()
-                    }
-                    .into_bytes()
-                });
-
-            probes.extend(ts);
+            probes.extend(Self::term_to_probes(&term.term, config));
         }
 
         probes
@@ -417,6 +404,22 @@ impl FulltextIndexApplier {
             })
             .collect::<Vec<_>>()
     }
+
+    fn term_to_probes<'a>(term: &'a str, config: &'a Config) -> impl Iterator<Item = Vec<u8>> + 'a {
+        let tokens = match config.analyzer {
+            Analyzer::English => EnglishTokenizer {}.tokenize(term),
+            Analyzer::Chinese => ChineseTokenizer {}.tokenize(term),
+        }; // tokens produced from `term` by the tokenizer matching `config.analyzer`
+        // Lowercase each token unless `config.case_sensitive`, then emit its bytes.
+        tokens.into_iter().map(|t| {
+            if !config.case_sensitive {
+                t.to_lowercase()
+            } else {
+                t.to_string()
+            }
+            .into_bytes()
+        })
+    }
 }
 
 /// The source of the index.
diff --git a/src/query/src/optimizer/constant_term.rs b/src/query/src/optimizer/constant_term.rs
index 60e5b76d9d..87c1831350 100644
--- a/src/query/src/optimizer/constant_term.rs
+++ b/src/query/src/optimizer/constant_term.rs
@@ -42,11 +42,19 @@ pub struct PreCompiledMatchesTermExpr {
     term: String,
     /// The pre-compiled term finder
     finder: MatchesTermFinder,
+
+    /// Not used, but shows roughly how the index tokenizes the term.
+    /// Not precise because column options are unknown, but enough for debugging in most cases.
+    probes: Vec<String>,
 }
 
 impl fmt::Display for PreCompiledMatchesTermExpr {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "MatchesConstTerm({}, \"{}\")", self.text, self.term)
+        write!(
+            f,
+            "MatchesConstTerm({}, term: \"{}\", probes: {:?})",
+            self.text, self.term, self.probes
+        )
     }
 }
 
@@ -118,6 +126,7 @@ impl PhysicalExpr for PreCompiledMatchesTermExpr {
             text: children[0].clone(),
             term: self.term.clone(),
             finder: self.finder.clone(),
+            probes: self.probes.clone(),
         }))
     }
 }
@@ -167,10 +176,19 @@ impl PhysicalOptimizerRule for MatchesConstantTermOptimizer {
                             if let Some(lit) = args[1].as_any().downcast_ref::<Literal>() {
                                 if let ScalarValue::Utf8(Some(term)) = lit.value() {
                                     let finder = MatchesTermFinder::new(term);
+
+                                    // For debugging purposes. Not really precise, but enough for most cases.
+                                    let probes = term
+                                        .split(|c: char| !c.is_alphanumeric())
+                                        .filter(|s| !s.is_empty())
+                                        .map(|s| s.to_string())
+                                        .collect();
+
                                     let expr = PreCompiledMatchesTermExpr {
                                         text: args[0].clone(),
                                         term: term.to_string(),
                                         finder,
+                                        probes,
                                     };
 
                                     return Ok(Transformed::yes(Arc::new(expr)));
@@ -390,7 +408,7 @@ mod tests {
     async fn test_matches_term_optimization_from_sql() {
         let sql = "WITH base AS (
         SELECT text, timestamp FROM test 
-        WHERE MATCHES_TERM(text, 'hello') 
+        WHERE MATCHES_TERM(text, 'hello world') 
         AND timestamp > '2025-01-01 00:00:00'
     ),
     subquery1 AS (
@@ -448,7 +466,15 @@ mod tests {
             .unwrap();
 
         let plan_str = get_plan_string(&physical_plan).join("\n");
-        assert!(plan_str.contains("MatchesConstTerm"));
+        assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"foo\", probes: [\"foo\"]"));
+        assert!(plan_str.contains(
+            "MatchesConstTerm(text@0, term: \"hello world\", probes: [\"hello\", \"world\"]"
+        ));
+        assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"world\", probes: [\"world\"]"));
+        assert!(plan_str
+            .contains("MatchesConstTerm(text@0, term: \"greeting\", probes: [\"greeting\"]"));
+        assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"there\", probes: [\"there\"]"));
+        assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"42\", probes: [\"42\"]"));
         assert!(!plan_str.contains("matches_term"))
     }
 }