From 772bc21b6564971e9c416b73d5f474edbe24acbb Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Thu, 17 Jul 2025 15:01:55 +0800 Subject: [PATCH] feat: `MatchesConstTerm` displays probes (#6518) * feat: `MatchesConstTerm` displays probes Signed-off-by: Zhenchi * fix fmt Signed-off-by: Zhenchi --------- Signed-off-by: Zhenchi Signed-off-by: evenyag --- .../src/sst/index/fulltext_index/applier.rs | 35 ++++++++++--------- src/query/src/optimizer/constant_term.rs | 32 +++++++++++++++-- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/src/mito2/src/sst/index/fulltext_index/applier.rs b/src/mito2/src/sst/index/fulltext_index/applier.rs index ac14579de6..b46af9e8da 100644 --- a/src/mito2/src/sst/index/fulltext_index/applier.rs +++ b/src/mito2/src/sst/index/fulltext_index/applier.rs @@ -22,7 +22,8 @@ use common_telemetry::warn; use index::bloom_filter::applier::{BloomFilterApplier, InListPredicate}; use index::bloom_filter::reader::BloomFilterReaderImpl; use index::fulltext_index::search::{FulltextIndexSearcher, RowId, TantivyFulltextIndexSearcher}; -use index::fulltext_index::Config; +use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer}; +use index::fulltext_index::{Analyzer, Config}; use object_store::ObjectStore; use puffin::puffin_manager::cache::PuffinMetadataCacheRef; use puffin::puffin_manager::{GuardWithMetadata, PuffinManager, PuffinReader}; @@ -393,21 +394,7 @@ impl FulltextIndexApplier { // lowercased terms are not indexed continue; } - - let ts = term - .term - .split(|c: char| !c.is_alphanumeric()) - .filter(|&t| !t.is_empty()) - .map(|t| { - if !config.case_sensitive { - t.to_lowercase() - } else { - t.to_string() - } - .into_bytes() - }); - - probes.extend(ts); + probes.extend(Self::term_to_probes(&term.term, config)); } probes @@ -417,6 +404,22 @@ impl FulltextIndexApplier { }) .collect::>() } + + fn term_to_probes<'a>(term: &'a str, config: &'a Config) -> impl Iterator<Item = Vec<u8>> + 'a { + let tokens = match config.analyzer { 
+ Analyzer::English => EnglishTokenizer {}.tokenize(term), + Analyzer::Chinese => ChineseTokenizer {}.tokenize(term), + }; + + tokens.into_iter().map(|t| { + if !config.case_sensitive { + t.to_lowercase() + } else { + t.to_string() + } + .into_bytes() + }) + } } /// The source of the index. diff --git a/src/query/src/optimizer/constant_term.rs b/src/query/src/optimizer/constant_term.rs index 60e5b76d9d..87c1831350 100644 --- a/src/query/src/optimizer/constant_term.rs +++ b/src/query/src/optimizer/constant_term.rs @@ -42,11 +42,19 @@ pub struct PreCompiledMatchesTermExpr { term: String, /// The pre-compiled term finder finder: MatchesTermFinder, + + /// Not used, but shows roughly how the index tokenizes the term. + /// Not precise because the column options are unknown, but in most cases it's enough for debugging purposes. + probes: Vec<String>, } impl fmt::Display for PreCompiledMatchesTermExpr { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "MatchesConstTerm({}, \"{}\")", self.text, self.term) + write!( + f, + "MatchesConstTerm({}, term: \"{}\", probes: {:?})", + self.text, self.term, self.probes + ) } } @@ -118,6 +126,7 @@ impl PhysicalExpr for PreCompiledMatchesTermExpr { text: children[0].clone(), term: self.term.clone(), finder: self.finder.clone(), + probes: self.probes.clone(), })) } } @@ -167,10 +176,19 @@ impl PhysicalOptimizerRule for MatchesConstantTermOptimizer { if let Some(lit) = args[1].as_any().downcast_ref::<Literal>() { if let ScalarValue::Utf8(Some(term)) = lit.value() { let finder = MatchesTermFinder::new(term); + + // For debugging purposes. Not really precise, but enough for most cases. 
+ let probes = term + .split(|c: char| !c.is_alphanumeric()) + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) + .collect(); + let expr = PreCompiledMatchesTermExpr { text: args[0].clone(), term: term.to_string(), finder, + probes, }; return Ok(Transformed::yes(Arc::new(expr))); @@ -390,7 +408,7 @@ mod tests { async fn test_matches_term_optimization_from_sql() { let sql = "WITH base AS ( SELECT text, timestamp FROM test - WHERE MATCHES_TERM(text, 'hello') + WHERE MATCHES_TERM(text, 'hello world') AND timestamp > '2025-01-01 00:00:00' ), subquery1 AS ( @@ -448,7 +466,15 @@ mod tests { .unwrap(); let plan_str = get_plan_string(&physical_plan).join("\n"); - assert!(plan_str.contains("MatchesConstTerm")); + assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"foo\", probes: [\"foo\"]")); + assert!(plan_str.contains( + "MatchesConstTerm(text@0, term: \"hello world\", probes: [\"hello\", \"world\"]" + )); + assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"world\", probes: [\"world\"]")); + assert!(plan_str + .contains("MatchesConstTerm(text@0, term: \"greeting\", probes: [\"greeting\"]")); + assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"there\", probes: [\"there\"]")); + assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"42\", probes: [\"42\"]")); assert!(!plan_str.contains("matches_term")) } }