diff --git a/Cargo.lock b/Cargo.lock index 5182582e82..2035b5090c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2897,7 +2897,7 @@ checksum = "e8566979429cf69b49a5c740c60791108e86440e8be149bbea4fe54d2c32d6e2" [[package]] name = "datafusion" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "arrow", "arrow-array", @@ -2948,7 +2948,7 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "arrow", "async-trait", @@ -2968,7 +2968,7 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "arrow", "arrow-schema", @@ -2991,7 +2991,7 @@ dependencies = [ [[package]] name = "datafusion-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "ahash 0.8.11", "arrow", @@ -3016,7 +3016,7 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "log", "tokio", @@ -3025,12 +3025,12 @@ dependencies = [ [[package]] name = "datafusion-doc" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" [[package]] name = "datafusion-execution" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "arrow", "dashmap", @@ -3048,7 +3048,7 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "arrow", "chrono", @@ -3068,7 +3068,7 @@ dependencies = [ [[package]] name = "datafusion-expr-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "arrow", "datafusion-common", @@ -3079,7 +3079,7 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "arrow", "arrow-buffer", @@ -3108,7 +3108,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "ahash 0.8.11", "arrow", @@ -3129,7 +3129,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "ahash 0.8.11", "arrow", @@ -3141,7 +3141,7 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "arrow", "arrow-array", @@ -3163,7 +3163,7 @@ dependencies = [ [[package]] name = "datafusion-functions-table" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "arrow", "async-trait", @@ -3178,7 +3178,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "datafusion-common", "datafusion-doc", @@ -3194,7 +3194,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -3203,7 +3203,7 @@ dependencies = [ [[package]] name = "datafusion-macros" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "datafusion-expr", "quote", @@ -3213,7 +3213,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "arrow", "chrono", @@ -3231,7 +3231,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "ahash 0.8.11", "arrow", @@ -3254,7 +3254,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "ahash 0.8.11", "arrow", @@ -3267,7 +3267,7 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "arrow", "arrow-schema", @@ -3288,7 +3288,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "ahash 0.8.11", "arrow", @@ -3318,7 +3318,7 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "arrow", "arrow-array", @@ -3336,7 +3336,7 @@ dependencies = [ [[package]] name = "datafusion-substrait" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375" +source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220" dependencies = [ "async-recursion", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index f3bd54a661..05835d88f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -112,15 +112,15 @@ clap = { version = "4.4", features = ["derive"] } config = "0.13.0" crossbeam-utils = "0.8" dashmap = "6.1" -datafusion = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" } -datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" } -datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" } -datafusion-functions = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" } -datafusion-optimizer = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" } -datafusion-physical-expr = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" } -datafusion-physical-plan = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" } -datafusion-sql = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" } -datafusion-substrait = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" } +datafusion = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" } +datafusion-common = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" } +datafusion-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" } +datafusion-functions = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" } +datafusion-optimizer = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" } +datafusion-physical-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" } +datafusion-physical-plan = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" } +datafusion-sql = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" } +datafusion-substrait = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" } deadpool = "0.12" deadpool-postgres = "0.14" derive_builder = "0.20" diff --git a/src/query/src/optimizer.rs b/src/query/src/optimizer.rs index e6596e923a..52a33029e2 100644 --- a/src/query/src/optimizer.rs +++ b/src/query/src/optimizer.rs @@ -21,6 +21,7 @@ pub mod scan_hint; pub mod string_normalization; #[cfg(test)] pub(crate) mod test_util; +pub mod transcribe_atat; pub mod type_conversion; pub mod windowed_sort; diff --git a/src/query/src/optimizer/transcribe_atat.rs b/src/query/src/optimizer/transcribe_atat.rs new file mode 100644 index 0000000000..3292f19f08 --- /dev/null +++ b/src/query/src/optimizer/transcribe_atat.rs @@ -0,0 +1,230 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use common_function::scalars::matches_term::MatchesTermFunction; +use common_function::scalars::udf::create_udf; +use common_function::state::FunctionState; +use datafusion::config::ConfigOptions; +use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; +use datafusion_common::Result; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::{Expr, LogicalPlan}; +use datafusion_optimizer::analyzer::AnalyzerRule; +use session::context::QueryContext; + +use crate::plan::ExtractExpr; + +/// TranscribeAtatRule is an analyzer rule that transcribes `@@` operator +/// to `matches_term` function. +/// +/// Example: +/// ```sql +/// SELECT matches_term('cat!', 'cat') as result; +/// +/// SELECT matches_term(`log_message`, '/start') as `matches_start` FROM t; +/// ``` +/// +/// to +/// +/// ```sql +/// SELECT 'cat!' @@ 'cat' as result; +/// +/// SELECT `log_message` @@ '/start' as `matches_start` FROM t; +/// ``` +#[derive(Debug)] +pub struct TranscribeAtatRule; + +impl AnalyzerRule for TranscribeAtatRule { + fn analyze(&self, plan: LogicalPlan, _config: &ConfigOptions) -> Result { + plan.transform(Self::do_analyze).map(|x| x.data) + } + + fn name(&self) -> &str { + "TranscribeAtatRule" + } +} + +impl TranscribeAtatRule { + fn do_analyze(plan: LogicalPlan) -> Result> { + let mut rewriter = TranscribeAtatRewriter::default(); + let new_expr = plan + .expressions_consider_join() + .into_iter() + .map(|e| e.rewrite(&mut rewriter).map(|x| x.data)) + .collect::>>()?; + + if rewriter.transcribed { + let inputs = plan.inputs().into_iter().cloned().collect::>(); + plan.with_new_exprs(new_expr, inputs).map(Transformed::yes) + } else { + Ok(Transformed::no(plan)) + } + } +} + +#[derive(Default)] +struct TranscribeAtatRewriter { + transcribed: bool, +} + +impl TreeNodeRewriter for TranscribeAtatRewriter { + type Node = Expr; + + fn f_up(&mut self, expr: Expr) -> Result> { + if let Expr::BinaryExpr(binary_expr) = &expr + && matches!(binary_expr.op, datafusion_expr::Operator::AtAt) + { + self.transcribed = true; + let scalar_udf = create_udf( + Arc::new(MatchesTermFunction), + QueryContext::arc(), + Arc::new(FunctionState::default()), + ); + let exprs = vec![ + binary_expr.left.as_ref().clone(), + binary_expr.right.as_ref().clone(), + ]; + Ok(Transformed::yes(Expr::ScalarFunction( + ScalarFunction::new_udf(Arc::new(scalar_udf), exprs), + ))) + } else { + Ok(Transformed::no(expr)) + } + } +} +#[cfg(test)] +mod tests { + + use arrow_schema::SchemaRef; + use datafusion::datasource::{provider_as_source, MemTable}; + use datafusion::logical_expr::{col, lit, LogicalPlan, LogicalPlanBuilder}; + use datafusion_expr::{BinaryExpr, Operator}; + use datatypes::arrow::datatypes::{DataType, Field, Schema}; + + use super::*; + + fn optimize(plan: LogicalPlan) -> Result { + TranscribeAtatRule.analyze(plan, &ConfigOptions::default()) + } + + fn prepare_test_plan_builder() -> LogicalPlanBuilder { + let schema = Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + ]); + let table = MemTable::try_new(SchemaRef::from(schema), vec![]).unwrap(); + LogicalPlanBuilder::scan("t", provider_as_source(Arc::new(table)), None).unwrap() + } + + #[test] + fn test_multiple_atat() { + let plan = prepare_test_plan_builder() + .filter( + Expr::BinaryExpr(BinaryExpr { + left: Box::new(col("a")), + op: Operator::AtAt, + right: Box::new(lit("foo")), + }) + .and(Expr::BinaryExpr(BinaryExpr { + left: Box::new(col("b")), + op: Operator::AtAt, + right: Box::new(lit("bar")), + })), + ) + .unwrap() + .project(vec![ + Expr::BinaryExpr(BinaryExpr { + left: Box::new(col("a")), + op: Operator::AtAt, + right: Box::new(col("b")), + }), + col("b"), + ]) + .unwrap() + .build() + .unwrap(); + + let expected = r#"Projection: matches_term(t.a, t.b), t.b + Filter: matches_term(t.a, Utf8("foo")) AND matches_term(t.b, Utf8("bar")) + TableScan: t"#; + + let optimized_plan = optimize(plan).unwrap(); + let formatted = optimized_plan.to_string(); + + assert_eq!(formatted, expected); + } + + #[test] + fn test_nested_atat() { + let plan = prepare_test_plan_builder() + .filter( + Expr::BinaryExpr(BinaryExpr { + left: Box::new(col("a")), + op: Operator::AtAt, + right: Box::new(lit("foo")), + }) + .and( + Expr::BinaryExpr(BinaryExpr { + left: Box::new(col("b")), + op: Operator::AtAt, + right: Box::new(lit("bar")), + }) + .or(Expr::BinaryExpr(BinaryExpr { + left: Box::new( + // Nested case in function argument + Expr::BinaryExpr(BinaryExpr { + left: Box::new(col("a")), + op: Operator::AtAt, + right: Box::new(lit("nested")), + }), + ), + op: Operator::Eq, + right: Box::new(lit(true)), + })), + ), + ) + .unwrap() + .project(vec![ + col("a"), + // Complex nested expression with multiple @@ operators + Expr::BinaryExpr(BinaryExpr { + left: Box::new(Expr::BinaryExpr(BinaryExpr { + left: Box::new(col("a")), + op: Operator::AtAt, + right: Box::new(lit("foo")), + })), + op: Operator::And, + right: Box::new(Expr::BinaryExpr(BinaryExpr { + left: Box::new(col("b")), + op: Operator::AtAt, + right: Box::new(lit("bar")), + })), + }), + ]) + .unwrap() + .build() + .unwrap(); + + let expected = r#"Projection: t.a, matches_term(t.a, Utf8("foo")) AND matches_term(t.b, Utf8("bar")) + Filter: matches_term(t.a, Utf8("foo")) AND (matches_term(t.b, Utf8("bar")) OR matches_term(t.a, Utf8("nested")) = Boolean(true)) + TableScan: t"#; + + let optimized_plan = optimize(plan).unwrap(); + let formatted = optimized_plan.to_string(); + + assert_eq!(formatted, expected); + } +} diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs index 03f3a2a13d..d55ab471f9 100644 --- a/src/query/src/query_engine/state.rs +++ b/src/query/src/query_engine/state.rs @@ -52,6 +52,7 @@ use crate::optimizer::pass_distribution::PassDistribution; use crate::optimizer::remove_duplicate::RemoveDuplicate; use crate::optimizer::scan_hint::ScanHintRule; use crate::optimizer::string_normalization::StringNormalizationRule; +use crate::optimizer::transcribe_atat::TranscribeAtatRule; use crate::optimizer::type_conversion::TypeConversionRule; use crate::optimizer::windowed_sort::WindowedSortPhysicalRule; use crate::optimizer::ExtensionAnalyzerRule; @@ -115,6 +116,7 @@ impl QueryEngineState { // Apply the datafusion rules let mut analyzer = Analyzer::new(); + analyzer.rules.insert(0, Arc::new(TranscribeAtatRule)); analyzer.rules.insert(0, Arc::new(StringNormalizationRule)); // Use our custom rule instead to optimize the count(*) query diff --git a/tests/cases/standalone/common/expr/atat.result b/tests/cases/standalone/common/expr/atat.result new file mode 100644 index 0000000000..6beec6347a --- /dev/null +++ b/tests/cases/standalone/common/expr/atat.result @@ -0,0 +1,315 @@ +-- Derived from matches_term cases +-- Test basic term matching +-- Expect: true +SELECT 'cat!' @@ 'cat' as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Test phrase matching with spaces +-- Expect: true +SELECT 'warning:hello world!' @@ 'hello world' as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Test numbers in term +SELECT 'v1.0!' @@ 'v1.0' as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Test case sensitivity +-- Expect: true +SELECT 'Cat' @@ 'Cat' as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Expect: false +SELECT 'cat' @@ 'Cat' as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +-- Test empty string handling +-- Expect: true +SELECT '' @@ '' as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Expect: false +SELECT 'any' @@ '' as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +-- Expect: false +SELECT '' @@ 'any' as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +-- Test partial matches (should fail) +-- Expect: false +SELECT 'category' @@ 'cat' as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +-- Expect: false +SELECT 'rebooted' @@ 'boot' as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +-- Test adjacent alphanumeric characters +SELECT 'cat5' @@ 'cat' as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +SELECT 'dogcat' @@ 'dog' as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +-- Test leading non-alphanumeric +-- Expect: true +SELECT 'dog/cat' @@ '/cat' as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Expect: true +SELECT 'dog/cat' @@ 'dog/' as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Expect: true +SELECT 'dog/cat' @@ 'dog/cat' as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Test unicode characters +-- Expect: true +SELECT 'café>' @@ 'café' as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Expect: true +SELECT 'русский!' @@ 'русский' as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Test complete word matching +CREATE TABLE logs ( + `id` TIMESTAMP TIME INDEX, + `log_message` STRING +); + +Affected Rows: 0 + +INSERT INTO logs VALUES + (1, 'An error occurred!'), + (2, 'Critical error: system failure'), + (3, 'error-prone'), + (4, 'errors'), + (5, 'error123'), + (6, 'errorLogs'), + (7, 'Version v1.0 released'), + (8, 'v1.0!'), + (9, 'v1.0a'), + (10, 'v1.0beta'), + (11, 'GET /app/start'), + (12, 'Command: /start-prosess'), + (13, 'Command: /start'), + (14, 'start'), + (15, 'start/stop'), + (16, 'Alert: system failure detected'), + (17, 'system failure!'), + (18, 'system-failure'), + (19, 'system failure2023'), + (20, 'critical error: system failure'), + (21, 'critical failure detected'), + (22, 'critical issue'), + (23, 'failure imminent'), + (24, 'Warning: high temperature'), + (25, 'WARNING: system overload'), + (26, 'warned'), + (27, 'warnings'); + +Affected Rows: 27 + +-- Test complete word matching for 'error' +-- Expect: +-- 1|An error occurred!|true +-- 2|Critical error: system failure|true +-- 3|error-prone|true +-- 4|errors|false +-- 5|error123|false +-- 6|errorLogs|false +SELECT `id`, `log_message`, `log_message` @@ 'error' as `matches_error` FROM logs WHERE `id` <= 6 ORDER BY `id`; + ++-------------------------+--------------------------------+---------------+ +| id | log_message | matches_error | ++-------------------------+--------------------------------+---------------+ +| 1970-01-01T00:00:00.001 | An error occurred! | true | +| 1970-01-01T00:00:00.002 | Critical error: system failure | true | +| 1970-01-01T00:00:00.003 | error-prone | true | +| 1970-01-01T00:00:00.004 | errors | false | +| 1970-01-01T00:00:00.005 | error123 | false | +| 1970-01-01T00:00:00.006 | errorLogs | false | ++-------------------------+--------------------------------+---------------+ + +-- Test complete word matching for 'v1.0' +-- Expect: +-- 7|Version v1.0 released|true +-- 8|v1.0!|true +-- 9|v1.0a|false +-- 10|v1.0beta|false +SELECT `id`, `log_message`, `log_message` @@ 'v1.0' as `matches_version` FROM logs WHERE `id` BETWEEN 7 AND 10 ORDER BY `id`; + ++-------------------------+-----------------------+-----------------+ +| id | log_message | matches_version | ++-------------------------+-----------------------+-----------------+ +| 1970-01-01T00:00:00.007 | Version v1.0 released | true | +| 1970-01-01T00:00:00.008 | v1.0! | true | +| 1970-01-01T00:00:00.009 | v1.0a | false | +| 1970-01-01T00:00:00.010 | v1.0beta | false | ++-------------------------+-----------------------+-----------------+ + +-- Test complete word matching for '/start' +-- Expect: +-- 11|GET /app/start|true +-- 12|Command: /start-prosess|true +-- 13|Command: /start|true +-- 14|start|false +-- 15|start/stop|false +SELECT `id`, `log_message`, `log_message` @@ '/start' as `matches_start` FROM logs WHERE `id` BETWEEN 11 AND 15 ORDER BY `id`; + ++-------------------------+-------------------------+---------------+ +| id | log_message | matches_start | ++-------------------------+-------------------------+---------------+ +| 1970-01-01T00:00:00.011 | GET /app/start | true | +| 1970-01-01T00:00:00.012 | Command: /start-prosess | true | +| 1970-01-01T00:00:00.013 | Command: /start | true | +| 1970-01-01T00:00:00.014 | start | false | +| 1970-01-01T00:00:00.015 | start/stop | false | ++-------------------------+-------------------------+---------------+ + +-- Test phrase matching for 'system failure' +-- Expect: +-- 16|Alert: system failure detected|true +-- 17|system failure!|true +-- 18|system-failure|false +-- 19|system failure2023|false +SELECT `id`, `log_message`, `log_message` @@ 'system failure' as `matches_phrase` FROM logs WHERE `id` BETWEEN 16 AND 19 ORDER BY `id`; + ++-------------------------+--------------------------------+----------------+ +| id | log_message | matches_phrase | ++-------------------------+--------------------------------+----------------+ +| 1970-01-01T00:00:00.016 | Alert: system failure detected | true | +| 1970-01-01T00:00:00.017 | system failure! | true | +| 1970-01-01T00:00:00.018 | system-failure | false | +| 1970-01-01T00:00:00.019 | system failure2023 | false | ++-------------------------+--------------------------------+----------------+ + +-- Test multi-word matching using AND +-- Expect: +-- 20|critical error: system failure|true|true|true +-- 21|critical failure detected|true|true|true +-- 22|critical issue|true|false|false +-- 23|failure imminent|false|true|false +SELECT `id`, `log_message`, + `log_message` @@ 'critical' as `matches_critical`, + `log_message` @@ 'failure' as `matches_failure`, + `log_message` @@ 'critical' AND `log_message` @@ 'failure' as `matches_both` +FROM logs WHERE `id` BETWEEN 20 AND 23 ORDER BY `id`; + ++-------------------------+--------------------------------+------------------+-----------------+--------------+ +| id | log_message | matches_critical | matches_failure | matches_both | ++-------------------------+--------------------------------+------------------+-----------------+--------------+ +| 1970-01-01T00:00:00.020 | critical error: system failure | true | true | true | +| 1970-01-01T00:00:00.021 | critical failure detected | true | true | true | +| 1970-01-01T00:00:00.022 | critical issue | true | false | false | +| 1970-01-01T00:00:00.023 | failure imminent | false | true | false | ++-------------------------+--------------------------------+------------------+-----------------+--------------+ + +-- Test case-insensitive matching using lower() +-- Expect: +-- 24|Warning: high temperature|true +-- 25|WARNING: system overload|true +-- 26|warned|false +-- 27|warnings|false +SELECT `id`, `log_message`, lower(`log_message`) @@ 'warning' as `matches_warning` FROM logs WHERE `id` >= 24 ORDER BY `id`; + ++-------------------------+---------------------------+-----------------+ +| id | log_message | matches_warning | ++-------------------------+---------------------------+-----------------+ +| 1970-01-01T00:00:00.024 | Warning: high temperature | true | +| 1970-01-01T00:00:00.025 | WARNING: system overload | true | +| 1970-01-01T00:00:00.026 | warned | false | +| 1970-01-01T00:00:00.027 | warnings | false | ++-------------------------+---------------------------+-----------------+ + +DROP TABLE logs; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/expr/atat.sql b/tests/cases/standalone/common/expr/atat.sql new file mode 100644 index 0000000000..da32dcf0bf --- /dev/null +++ b/tests/cases/standalone/common/expr/atat.sql @@ -0,0 +1,144 @@ +-- Derived from matches_term cases + +-- Test basic term matching +-- Expect: true +SELECT 'cat!' @@ 'cat' as result; + +-- Test phrase matching with spaces +-- Expect: true +SELECT 'warning:hello world!' @@ 'hello world' as result; + +-- Test numbers in term +SELECT 'v1.0!' @@ 'v1.0' as result; + +-- Test case sensitivity +-- Expect: true +SELECT 'Cat' @@ 'Cat' as result; +-- Expect: false +SELECT 'cat' @@ 'Cat' as result; + +-- Test empty string handling +-- Expect: true +SELECT '' @@ '' as result; +-- Expect: false +SELECT 'any' @@ '' as result; +-- Expect: false +SELECT '' @@ 'any' as result; + +-- Test partial matches (should fail) +-- Expect: false +SELECT 'category' @@ 'cat' as result; +-- Expect: false +SELECT 'rebooted' @@ 'boot' as result; + +-- Test adjacent alphanumeric characters +SELECT 'cat5' @@ 'cat' as result; +SELECT 'dogcat' @@ 'dog' as result; + +-- Test leading non-alphanumeric +-- Expect: true +SELECT 'dog/cat' @@ '/cat' as result; +-- Expect: true +SELECT 'dog/cat' @@ 'dog/' as result; +-- Expect: true +SELECT 'dog/cat' @@ 'dog/cat' as result; + +-- Test unicode characters +-- Expect: true +SELECT 'café>' @@ 'café' as result; +-- Expect: true +SELECT 'русский!' @@ 'русский' as result; + +-- Test complete word matching +CREATE TABLE logs ( + `id` TIMESTAMP TIME INDEX, + `log_message` STRING +); + +INSERT INTO logs VALUES + (1, 'An error occurred!'), + (2, 'Critical error: system failure'), + (3, 'error-prone'), + (4, 'errors'), + (5, 'error123'), + (6, 'errorLogs'), + (7, 'Version v1.0 released'), + (8, 'v1.0!'), + (9, 'v1.0a'), + (10, 'v1.0beta'), + (11, 'GET /app/start'), + (12, 'Command: /start-prosess'), + (13, 'Command: /start'), + (14, 'start'), + (15, 'start/stop'), + (16, 'Alert: system failure detected'), + (17, 'system failure!'), + (18, 'system-failure'), + (19, 'system failure2023'), + (20, 'critical error: system failure'), + (21, 'critical failure detected'), + (22, 'critical issue'), + (23, 'failure imminent'), + (24, 'Warning: high temperature'), + (25, 'WARNING: system overload'), + (26, 'warned'), + (27, 'warnings'); + +-- Test complete word matching for 'error' +-- Expect: +-- 1|An error occurred!|true +-- 2|Critical error: system failure|true +-- 3|error-prone|true +-- 4|errors|false +-- 5|error123|false +-- 6|errorLogs|false +SELECT `id`, `log_message`, `log_message` @@ 'error' as `matches_error` FROM logs WHERE `id` <= 6 ORDER BY `id`; + + +-- Test complete word matching for 'v1.0' +-- Expect: +-- 7|Version v1.0 released|true +-- 8|v1.0!|true +-- 9|v1.0a|false +-- 10|v1.0beta|false +SELECT `id`, `log_message`, `log_message` @@ 'v1.0' as `matches_version` FROM logs WHERE `id` BETWEEN 7 AND 10 ORDER BY `id`; + +-- Test complete word matching for '/start' +-- Expect: +-- 11|GET /app/start|true +-- 12|Command: /start-prosess|true +-- 13|Command: /start|true +-- 14|start|false +-- 15|start/stop|false +SELECT `id`, `log_message`, `log_message` @@ '/start' as `matches_start` FROM logs WHERE `id` BETWEEN 11 AND 15 ORDER BY `id`; + +-- Test phrase matching for 'system failure' +-- Expect: +-- 16|Alert: system failure detected|true +-- 17|system failure!|true +-- 18|system-failure|false +-- 19|system failure2023|false +SELECT `id`, `log_message`, `log_message` @@ 'system failure' as `matches_phrase` FROM logs WHERE `id` BETWEEN 16 AND 19 ORDER BY `id`; + + +-- Test multi-word matching using AND +-- Expect: +-- 20|critical error: system failure|true|true|true +-- 21|critical failure detected|true|true|true +-- 22|critical issue|true|false|false +-- 23|failure imminent|false|true|false +SELECT `id`, `log_message`, + `log_message` @@ 'critical' as `matches_critical`, + `log_message` @@ 'failure' as `matches_failure`, + `log_message` @@ 'critical' AND `log_message` @@ 'failure' as `matches_both` +FROM logs WHERE `id` BETWEEN 20 AND 23 ORDER BY `id`; + +-- Test case-insensitive matching using lower() +-- Expect: +-- 24|Warning: high temperature|true +-- 25|WARNING: system overload|true +-- 26|warned|false +-- 27|warnings|false +SELECT `id`, `log_message`, lower(`log_message`) @@ 'warning' as `matches_warning` FROM logs WHERE `id` >= 24 ORDER BY `id`; + +DROP TABLE logs; diff --git a/tests/cases/standalone/common/tql-explain-analyze/explain.result b/tests/cases/standalone/common/tql-explain-analyze/explain.result index 8b4952ed3d..200ec5c814 100644 --- a/tests/cases/standalone/common/tql-explain-analyze/explain.result +++ b/tests/cases/standalone/common/tql-explain-analyze/explain.result @@ -80,6 +80,7 @@ TQL EXPLAIN VERBOSE (0, 10, '5s') test; |_|_TableScan: test_| | logical_plan after count_wildcard_to_time_index_rule_| SAME TEXT AS ABOVE_| | logical_plan after StringNormalizationRule_| SAME TEXT AS ABOVE_| +| logical_plan after TranscribeAtatRule_| SAME TEXT AS ABOVE_| | logical_plan after inline_table_scan_| SAME TEXT AS ABOVE_| | logical_plan after expand_wildcard_rule_| SAME TEXT AS ABOVE_| | logical_plan after resolve_grouping_function_| SAME TEXT AS ABOVE_|