diff --git a/Cargo.lock b/Cargo.lock index bc55fc6722..c525c24055 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2038,6 +2038,7 @@ dependencies = [ "h3o", "hyperloglogplus", "jsonb", + "memchr", "nalgebra 0.33.2", "num", "num-traits", diff --git a/src/common/function/Cargo.toml b/src/common/function/Cargo.toml index 60bad51780..fdfd9b4e63 100644 --- a/src/common/function/Cargo.toml +++ b/src/common/function/Cargo.toml @@ -39,6 +39,7 @@ geohash = { version = "0.13", optional = true } h3o = { version = "0.6", optional = true } hyperloglogplus = "0.4" jsonb.workspace = true +memchr = "2.7" nalgebra.workspace = true num = "0.4" num-traits = "0.2" diff --git a/src/common/function/src/function_registry.rs b/src/common/function/src/function_registry.rs index 5141391693..773131314c 100644 --- a/src/common/function/src/function_registry.rs +++ b/src/common/function/src/function_registry.rs @@ -27,6 +27,7 @@ use crate::scalars::hll_count::HllCalcFunction; use crate::scalars::ip::IpFunctions; use crate::scalars::json::JsonFunction; use crate::scalars::matches::MatchesFunction; +use crate::scalars::matches_term::MatchesTermFunction; use crate::scalars::math::MathFunction; use crate::scalars::timestamp::TimestampFunction; use crate::scalars::uddsketch_calc::UddSketchCalcFunction; @@ -116,6 +117,7 @@ pub static FUNCTION_REGISTRY: Lazy<Arc<FunctionRegistry>> = Lazy::new(|| { // Full text search function MatchesFunction::register(&function_registry); + MatchesTermFunction::register(&function_registry); // System and administration functions SystemFunction::register(&function_registry); diff --git a/src/common/function/src/scalars.rs b/src/common/function/src/scalars.rs index d655e4b175..ac5389e9fd 100644 --- a/src/common/function/src/scalars.rs +++ b/src/common/function/src/scalars.rs @@ -19,6 +19,7 @@ pub mod expression; pub mod geo; pub mod json; pub mod matches; +pub mod matches_term; pub mod math; pub mod vector; diff --git a/src/common/function/src/scalars/matches_term.rs 
b/src/common/function/src/scalars/matches_term.rs new file mode 100644 index 0000000000..c99c5ca572 --- /dev/null +++ b/src/common/function/src/scalars/matches_term.rs @@ -0,0 +1,375 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::{fmt, iter}; + +use common_query::error::{InvalidFuncArgsSnafu, Result}; +use common_query::prelude::Volatility; +use datatypes::prelude::ConcreteDataType; +use datatypes::scalars::ScalarVectorBuilder; +use datatypes::vectors::{BooleanVector, BooleanVectorBuilder, MutableVector, VectorRef}; +use memchr::memmem; +use snafu::ensure; + +use crate::function::{Function, FunctionContext}; +use crate::function_registry::FunctionRegistry; + +/// Exact term/phrase matching function for text columns. +/// +/// This function checks if a text column contains exact term/phrase matches +/// with non-alphanumeric boundaries. Designed for: +/// - Whole-word matching (e.g. "cat" in "cat!" but not in "category") +/// - Phrase matching (e.g. "hello world" in "note:hello world!") +/// +/// # Signature +/// `matches_term(text: String, term: String) -> Boolean` +/// +/// # Arguments +/// * `text` - String column to search +/// * `term` - Search term/phrase +/// +/// # Returns +/// BooleanVector where each element indicates if the corresponding text +/// contains an exact match of the term, following these rules: +/// 1. Exact substring match found (case-sensitive) +/// 2. 
Match boundaries are either: +/// - Start/end of text +/// - Any non-alphanumeric character (including spaces, hyphens, punctuation, etc.) +/// +/// # Examples +/// ``` +/// -- SQL examples -- +/// -- Match phrase with space -- +/// SELECT matches_term(column, 'hello world') FROM table; +/// -- Text: "warning:hello world!" => true +/// -- Text: "hello-world" => false (hyphen instead of space) +/// -- Text: "hello world2023" => false (ending with numbers) +/// +/// -- Match multiple words with boundaries -- +/// SELECT matches_term(column, 'critical error') FROM logs; +/// -- Match in: "ERROR:critical error!" +/// -- No match: "critical_errors" +/// +/// -- Empty string handling -- +/// SELECT matches_term(column, '') FROM table; +/// -- Text: "" => true +/// -- Text: "any" => false +/// +/// -- Case sensitivity -- +/// SELECT matches_term(column, 'Cat') FROM table; +/// -- Text: "Cat" => true +/// -- Text: "cat" => false +/// ``` +pub struct MatchesTermFunction; + +impl MatchesTermFunction { + pub fn register(registry: &FunctionRegistry) { + registry.register(Arc::new(MatchesTermFunction)); + } +} + +impl fmt::Display for MatchesTermFunction { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "MATCHES_TERM") + } +} + +impl Function for MatchesTermFunction { + fn name(&self) -> &str { + "matches_term" + } + + fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> { + Ok(ConcreteDataType::boolean_datatype()) + } + + fn signature(&self) -> common_query::prelude::Signature { + common_query::prelude::Signature::exact( + vec![ + ConcreteDataType::string_datatype(), + ConcreteDataType::string_datatype(), + ], + Volatility::Immutable, + ) + } + + fn eval(&self, _func_ctx: &FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> { + ensure!( + columns.len() == 2, + InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not correct, expect exactly 2, have: {}", + columns.len() + ), + } + ); + + let text_column = &columns[0]; + if 
text_column.is_empty() { + return Ok(Arc::new(BooleanVector::from(Vec::<bool>::with_capacity(0)))); + } + + let term_column = &columns[1]; + let compiled_finder = if term_column.is_const() { + let term = term_column.get_ref(0).as_string().unwrap(); + match term { + None => { + return Ok(Arc::new(BooleanVector::from_iter( + iter::repeat(None).take(text_column.len()), + ))); + } + Some(term) => Some(MatchesTermFinder::new(term)), + } + } else { + None + }; + + let len = text_column.len(); + let mut result = BooleanVectorBuilder::with_capacity(len); + for i in 0..len { + let text = text_column.get_ref(i).as_string().unwrap(); + let Some(text) = text else { + result.push_null(); + continue; + }; + + let contains = match &compiled_finder { + Some(finder) => finder.find(text), + None => { + let term = match term_column.get_ref(i).as_string().unwrap() { + None => { + result.push_null(); + continue; + } + Some(term) => term, + }; + MatchesTermFinder::new(term).find(text) + } + }; + result.push(Some(contains)); + } + + Ok(result.to_vector()) + } +} + +/// A compiled finder for `matches_term` function that holds the compiled term +/// and its metadata for efficient matching. +/// +/// A term is considered matched when: +/// 1. The exact sequence appears in the text +/// 2. 
It is either: +/// - At the start/end of text with adjacent non-alphanumeric character +/// - Surrounded by non-alphanumeric characters +/// +/// # Examples +/// ``` +/// let finder = MatchesTermFinder::new("cat"); +/// assert!(finder.find("cat!")); // Term at end with punctuation +/// assert!(finder.find("dog,cat")); // Term preceded by comma +/// assert!(!finder.find("category")); // Partial match rejected +/// +/// let finder = MatchesTermFinder::new("world"); +/// assert!(finder.find("hello-world")); // Hyphen boundary +/// ``` +#[derive(Clone, Debug)] +pub struct MatchesTermFinder { + finder: memmem::Finder<'static>, + term: String, + starts_with_non_alnum: bool, + ends_with_non_alnum: bool, +} + +impl MatchesTermFinder { + /// Create a new `MatchesTermFinder` for the given term. + pub fn new(term: &str) -> Self { + let starts_with_non_alnum = term.chars().next().is_some_and(|c| !c.is_alphanumeric()); + let ends_with_non_alnum = term.chars().last().is_some_and(|c| !c.is_alphanumeric()); + + Self { + finder: memmem::Finder::new(term).into_owned(), + term: term.to_string(), + starts_with_non_alnum, + ends_with_non_alnum, + } + } + + /// Find the term in the text. + pub fn find(&self, text: &str) -> bool { + if self.term.is_empty() { + return text.is_empty(); + } + + if text.len() < self.term.len() { + return false; + } + + let mut pos = 0; + while let Some(found_pos) = self.finder.find(text[pos..].as_bytes()) { + let actual_pos = pos + found_pos; + + let prev_ok = self.starts_with_non_alnum + || text[..actual_pos] + .chars() + .last() + .map(|c| !c.is_alphanumeric()) + .unwrap_or(true); + + if prev_ok { + let next_pos = actual_pos + self.finder.needle().len(); + let next_ok = self.ends_with_non_alnum + || text[next_pos..] 
+ .chars() + .next() + .map(|c| !c.is_alphanumeric()) + .unwrap_or(true); + + if next_ok { + return true; + } + } + + if let Some(next_char) = text[actual_pos..].chars().next() { + pos = actual_pos + next_char.len_utf8(); + } else { + break; + } + } + + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_term_example() { + let finder = MatchesTermFinder::new("hello world"); + assert!(finder.find("warning:hello world!")); + assert!(!finder.find("hello-world")); + assert!(!finder.find("hello world2023")); + + let finder = MatchesTermFinder::new("critical error"); + assert!(finder.find("ERROR:critical error!")); + assert!(!finder.find("critical_errors")); + + let finder = MatchesTermFinder::new(""); + assert!(finder.find("")); + assert!(!finder.find("any")); + + let finder = MatchesTermFinder::new("Cat"); + assert!(finder.find("Cat")); + assert!(!finder.find("cat")); + } + + #[test] + fn matches_term_with_punctuation() { + assert!(MatchesTermFinder::new("cat").find("cat!")); + assert!(MatchesTermFinder::new("dog").find("!dog")); + } + + #[test] + fn matches_phrase_with_boundaries() { + assert!(MatchesTermFinder::new("hello-world").find("hello-world")); + assert!(MatchesTermFinder::new("'foo bar'").find("test: 'foo bar'")); + } + + #[test] + fn matches_at_text_boundaries() { + assert!(MatchesTermFinder::new("start").find("start...")); + assert!(MatchesTermFinder::new("end").find("...end")); + } + + // Negative cases + #[test] + fn rejects_partial_matches() { + assert!(!MatchesTermFinder::new("cat").find("category")); + assert!(!MatchesTermFinder::new("boot").find("rebooted")); + } + + #[test] + fn rejects_missing_term() { + assert!(!MatchesTermFinder::new("foo").find("hello world")); + } + + // Edge cases + #[test] + fn handles_empty_inputs() { + assert!(!MatchesTermFinder::new("test").find("")); + assert!(!MatchesTermFinder::new("").find("text")); + } + + #[test] + fn different_unicode_boundaries() { + 
assert!(MatchesTermFinder::new("café").find("café>")); + assert!(!MatchesTermFinder::new("café").find("口café>")); + assert!(!MatchesTermFinder::new("café").find("café口")); + assert!(!MatchesTermFinder::new("café").find("cafémore")); + assert!(MatchesTermFinder::new("русский").find("русский!")); + assert!(MatchesTermFinder::new("русский").find("русский!")); + } + + #[test] + fn case_sensitive_matching() { + assert!(!MatchesTermFinder::new("cat").find("Cat")); + assert!(MatchesTermFinder::new("CaT").find("CaT")); + } + + #[test] + fn numbers_in_term() { + assert!(MatchesTermFinder::new("v1.0").find("v1.0!")); + assert!(!MatchesTermFinder::new("v1.0").find("v1.0a")); + } + + #[test] + fn adjacent_alphanumeric_fails() { + assert!(!MatchesTermFinder::new("cat").find("cat5")); + assert!(!MatchesTermFinder::new("dog").find("dogcat")); + } + + #[test] + fn empty_term_text() { + assert!(!MatchesTermFinder::new("").find("text")); + assert!(MatchesTermFinder::new("").find("")); + assert!(!MatchesTermFinder::new("text").find("")); + } + + #[test] + fn leading_non_alphanumeric() { + assert!(MatchesTermFinder::new("/cat").find("dog/cat")); + assert!(MatchesTermFinder::new("dog/").find("dog/cat")); + assert!(MatchesTermFinder::new("dog/cat").find("dog/cat")); + } + + #[test] + fn continues_searching_after_boundary_mismatch() { + assert!(!MatchesTermFinder::new("log").find("bloglog!")); + assert!(MatchesTermFinder::new("log").find("bloglog log")); + assert!(MatchesTermFinder::new("log").find("alogblog_log!")); + + assert!(MatchesTermFinder::new("error").find("errorlog_error_case")); + assert!(MatchesTermFinder::new("test").find("atestbtestc_test_end")); + assert!(MatchesTermFinder::new("data").find("database_data_store")); + assert!(!MatchesTermFinder::new("data").find("database_datastore")); + assert!(MatchesTermFinder::new("log.txt").find("catalog.txt_log.txt!")); + assert!(!MatchesTermFinder::new("log.txt").find("catalog.txtlog.txt!")); + 
assert!(MatchesTermFinder::new("data-set").find("bigdata-set_data-set!")); + + assert!(MatchesTermFinder::new("中文").find("这是中文测试,中文!")); + assert!(MatchesTermFinder::new("error").find("错误errorerror日志_error!")); + } +} diff --git a/tests/cases/standalone/common/function/matches_term.result b/tests/cases/standalone/common/function/matches_term.result new file mode 100644 index 0000000000..38b9f30723 --- /dev/null +++ b/tests/cases/standalone/common/function/matches_term.result @@ -0,0 +1,314 @@ +-- Test basic term matching +-- Expect: true +SELECT matches_term('cat!', 'cat') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Test phrase matching with spaces +-- Expect: true +SELECT matches_term('warning:hello world!', 'hello world') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Test numbers in term +SELECT matches_term('v1.0!', 'v1.0') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Test case sensitivity +-- Expect: true +SELECT matches_term('Cat', 'Cat') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Expect: false +SELECT matches_term('cat', 'Cat') as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +-- Test empty string handling +-- Expect: true +SELECT matches_term('', '') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Expect: false +SELECT matches_term('any', '') as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +-- Expect: false +SELECT matches_term('', 'any') as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +-- Test partial matches (should fail) +-- Expect: false +SELECT matches_term('category', 'cat') as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +-- Expect: false +SELECT matches_term('rebooted', 'boot') as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +-- Test adjacent alphanumeric characters 
+SELECT matches_term('cat5', 'cat') as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +SELECT matches_term('dogcat', 'dog') as result; + ++--------+ +| result | ++--------+ +| false | ++--------+ + +-- Test leading non-alphanumeric +-- Expect: true +SELECT matches_term('dog/cat', '/cat') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Expect: true +SELECT matches_term('dog/cat', 'dog/') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Expect: true +SELECT matches_term('dog/cat', 'dog/cat') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Test unicode characters +-- Expect: true +SELECT matches_term('café>', 'café') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Expect: true +SELECT matches_term('русский!', 'русский') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +-- Test complete word matching +CREATE TABLE logs ( + `id` TIMESTAMP TIME INDEX, + `log_message` STRING +); + +Affected Rows: 0 + +INSERT INTO logs VALUES + (1, 'An error occurred!'), + (2, 'Critical error: system failure'), + (3, 'error-prone'), + (4, 'errors'), + (5, 'error123'), + (6, 'errorLogs'), + (7, 'Version v1.0 released'), + (8, 'v1.0!'), + (9, 'v1.0a'), + (10, 'v1.0beta'), + (11, 'GET /app/start'), + (12, 'Command: /start-prosess'), + (13, 'Command: /start'), + (14, 'start'), + (15, 'start/stop'), + (16, 'Alert: system failure detected'), + (17, 'system failure!'), + (18, 'system-failure'), + (19, 'system failure2023'), + (20, 'critical error: system failure'), + (21, 'critical failure detected'), + (22, 'critical issue'), + (23, 'failure imminent'), + (24, 'Warning: high temperature'), + (25, 'WARNING: system overload'), + (26, 'warned'), + (27, 'warnings'); + +Affected Rows: 27 + +-- Test complete word matching for 'error' +-- Expect: +-- 1|An error occurred!|true +-- 2|Critical error: system failure|true +-- 3|error-prone|true 
+-- 4|errors|false +-- 5|error123|false +-- 6|errorLogs|false +SELECT `id`, `log_message`, matches_term(`log_message`, 'error') as `matches_error` FROM logs WHERE `id` <= 6 ORDER BY `id`; + ++-------------------------+--------------------------------+---------------+ +| id | log_message | matches_error | ++-------------------------+--------------------------------+---------------+ +| 1970-01-01T00:00:00.001 | An error occurred! | true | +| 1970-01-01T00:00:00.002 | Critical error: system failure | true | +| 1970-01-01T00:00:00.003 | error-prone | true | +| 1970-01-01T00:00:00.004 | errors | false | +| 1970-01-01T00:00:00.005 | error123 | false | +| 1970-01-01T00:00:00.006 | errorLogs | false | ++-------------------------+--------------------------------+---------------+ + +-- Test complete word matching for 'v1.0' +-- Expect: +-- 7|Version v1.0 released|true +-- 8|v1.0!|true +-- 9|v1.0a|false +-- 10|v1.0beta|false +SELECT `id`, `log_message`, matches_term(`log_message`, 'v1.0') as `matches_version` FROM logs WHERE `id` BETWEEN 7 AND 10 ORDER BY `id`; + ++-------------------------+-----------------------+-----------------+ +| id | log_message | matches_version | ++-------------------------+-----------------------+-----------------+ +| 1970-01-01T00:00:00.007 | Version v1.0 released | true | +| 1970-01-01T00:00:00.008 | v1.0! 
| true | +| 1970-01-01T00:00:00.009 | v1.0a | false | +| 1970-01-01T00:00:00.010 | v1.0beta | false | ++-------------------------+-----------------------+-----------------+ + +-- Test complete word matching for '/start' +-- Expect: +-- 11|GET /app/start|true +-- 12|Command: /start-prosess|true +-- 13|Command: /start|true +-- 14|start|false +-- 15|start/stop|false +SELECT `id`, `log_message`, matches_term(`log_message`, '/start') as `matches_start` FROM logs WHERE `id` BETWEEN 11 AND 15 ORDER BY `id`; + ++-------------------------+-------------------------+---------------+ +| id | log_message | matches_start | ++-------------------------+-------------------------+---------------+ +| 1970-01-01T00:00:00.011 | GET /app/start | true | +| 1970-01-01T00:00:00.012 | Command: /start-prosess | true | +| 1970-01-01T00:00:00.013 | Command: /start | true | +| 1970-01-01T00:00:00.014 | start | false | +| 1970-01-01T00:00:00.015 | start/stop | false | ++-------------------------+-------------------------+---------------+ + +-- Test phrase matching for 'system failure' +-- Expect: +-- 16|Alert: system failure detected|true +-- 17|system failure!|true +-- 18|system-failure|false +-- 19|system failure2023|false +SELECT `id`, `log_message`, matches_term(`log_message`, 'system failure') as `matches_phrase` FROM logs WHERE `id` BETWEEN 16 AND 19 ORDER BY `id`; + ++-------------------------+--------------------------------+----------------+ +| id | log_message | matches_phrase | ++-------------------------+--------------------------------+----------------+ +| 1970-01-01T00:00:00.016 | Alert: system failure detected | true | +| 1970-01-01T00:00:00.017 | system failure! 
| true | +| 1970-01-01T00:00:00.018 | system-failure | false | +| 1970-01-01T00:00:00.019 | system failure2023 | false | ++-------------------------+--------------------------------+----------------+ + +-- Test multi-word matching using AND +-- Expect: +-- 20|critical error: system failure|true|true|true +-- 21|critical failure detected|true|true|true +-- 22|critical issue|true|false|false +-- 23|failure imminent|false|true|false +SELECT `id`, `log_message`, + matches_term(`log_message`, 'critical') as `matches_critical`, + matches_term(`log_message`, 'failure') as `matches_failure`, + matches_term(`log_message`, 'critical') AND matches_term(`log_message`, 'failure') as `matches_both` +FROM logs WHERE `id` BETWEEN 20 AND 23 ORDER BY `id`; + ++-------------------------+--------------------------------+------------------+-----------------+--------------+ +| id | log_message | matches_critical | matches_failure | matches_both | ++-------------------------+--------------------------------+------------------+-----------------+--------------+ +| 1970-01-01T00:00:00.020 | critical error: system failure | true | true | true | +| 1970-01-01T00:00:00.021 | critical failure detected | true | true | true | +| 1970-01-01T00:00:00.022 | critical issue | true | false | false | +| 1970-01-01T00:00:00.023 | failure imminent | false | true | false | ++-------------------------+--------------------------------+------------------+-----------------+--------------+ + +-- Test case-insensitive matching using lower() +-- Expect: +-- 24|Warning: high temperature|true +-- 25|WARNING: system overload|true +-- 26|warned|false +-- 27|warnings|false +SELECT `id`, `log_message`, matches_term(lower(`log_message`), 'warning') as `matches_warning` FROM logs WHERE `id` >= 24 ORDER BY `id`; + ++-------------------------+---------------------------+-----------------+ +| id | log_message | matches_warning | ++-------------------------+---------------------------+-----------------+ +| 
1970-01-01T00:00:00.024 | Warning: high temperature | true | +| 1970-01-01T00:00:00.025 | WARNING: system overload | true | +| 1970-01-01T00:00:00.026 | warned | false | +| 1970-01-01T00:00:00.027 | warnings | false | ++-------------------------+---------------------------+-----------------+ + +DROP TABLE logs; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/matches_term.sql b/tests/cases/standalone/common/function/matches_term.sql new file mode 100644 index 0000000000..e91b9fdf5f --- /dev/null +++ b/tests/cases/standalone/common/function/matches_term.sql @@ -0,0 +1,142 @@ +-- Test basic term matching +-- Expect: true +SELECT matches_term('cat!', 'cat') as result; + +-- Test phrase matching with spaces +-- Expect: true +SELECT matches_term('warning:hello world!', 'hello world') as result; + +-- Test numbers in term +SELECT matches_term('v1.0!', 'v1.0') as result; + +-- Test case sensitivity +-- Expect: true +SELECT matches_term('Cat', 'Cat') as result; +-- Expect: false +SELECT matches_term('cat', 'Cat') as result; + +-- Test empty string handling +-- Expect: true +SELECT matches_term('', '') as result; +-- Expect: false +SELECT matches_term('any', '') as result; +-- Expect: false +SELECT matches_term('', 'any') as result; + +-- Test partial matches (should fail) +-- Expect: false +SELECT matches_term('category', 'cat') as result; +-- Expect: false +SELECT matches_term('rebooted', 'boot') as result; + +-- Test adjacent alphanumeric characters +SELECT matches_term('cat5', 'cat') as result; +SELECT matches_term('dogcat', 'dog') as result; + +-- Test leading non-alphanumeric +-- Expect: true +SELECT matches_term('dog/cat', '/cat') as result; +-- Expect: true +SELECT matches_term('dog/cat', 'dog/') as result; +-- Expect: true +SELECT matches_term('dog/cat', 'dog/cat') as result; + +-- Test unicode characters +-- Expect: true +SELECT matches_term('café>', 'café') as result; +-- Expect: true +SELECT matches_term('русский!', 'русский') as result; 
+ +-- Test complete word matching +CREATE TABLE logs ( + `id` TIMESTAMP TIME INDEX, + `log_message` STRING +); + +INSERT INTO logs VALUES + (1, 'An error occurred!'), + (2, 'Critical error: system failure'), + (3, 'error-prone'), + (4, 'errors'), + (5, 'error123'), + (6, 'errorLogs'), + (7, 'Version v1.0 released'), + (8, 'v1.0!'), + (9, 'v1.0a'), + (10, 'v1.0beta'), + (11, 'GET /app/start'), + (12, 'Command: /start-prosess'), + (13, 'Command: /start'), + (14, 'start'), + (15, 'start/stop'), + (16, 'Alert: system failure detected'), + (17, 'system failure!'), + (18, 'system-failure'), + (19, 'system failure2023'), + (20, 'critical error: system failure'), + (21, 'critical failure detected'), + (22, 'critical issue'), + (23, 'failure imminent'), + (24, 'Warning: high temperature'), + (25, 'WARNING: system overload'), + (26, 'warned'), + (27, 'warnings'); + +-- Test complete word matching for 'error' +-- Expect: +-- 1|An error occurred!|true +-- 2|Critical error: system failure|true +-- 3|error-prone|true +-- 4|errors|false +-- 5|error123|false +-- 6|errorLogs|false +SELECT `id`, `log_message`, matches_term(`log_message`, 'error') as `matches_error` FROM logs WHERE `id` <= 6 ORDER BY `id`; + + +-- Test complete word matching for 'v1.0' +-- Expect: +-- 7|Version v1.0 released|true +-- 8|v1.0!|true +-- 9|v1.0a|false +-- 10|v1.0beta|false +SELECT `id`, `log_message`, matches_term(`log_message`, 'v1.0') as `matches_version` FROM logs WHERE `id` BETWEEN 7 AND 10 ORDER BY `id`; + +-- Test complete word matching for '/start' +-- Expect: +-- 11|GET /app/start|true +-- 12|Command: /start-prosess|true +-- 13|Command: /start|true +-- 14|start|false +-- 15|start/stop|false +SELECT `id`, `log_message`, matches_term(`log_message`, '/start') as `matches_start` FROM logs WHERE `id` BETWEEN 11 AND 15 ORDER BY `id`; + +-- Test phrase matching for 'system failure' +-- Expect: +-- 16|Alert: system failure detected|true +-- 17|system failure!|true +-- 18|system-failure|false +-- 
19|system failure2023|false +SELECT `id`, `log_message`, matches_term(`log_message`, 'system failure') as `matches_phrase` FROM logs WHERE `id` BETWEEN 16 AND 19 ORDER BY `id`; + + +-- Test multi-word matching using AND +-- Expect: +-- 20|critical error: system failure|true|true|true +-- 21|critical failure detected|true|true|true +-- 22|critical issue|true|false|false +-- 23|failure imminent|false|true|false +SELECT `id`, `log_message`, + matches_term(`log_message`, 'critical') as `matches_critical`, + matches_term(`log_message`, 'failure') as `matches_failure`, + matches_term(`log_message`, 'critical') AND matches_term(`log_message`, 'failure') as `matches_both` +FROM logs WHERE `id` BETWEEN 20 AND 23 ORDER BY `id`; + +-- Test case-insensitive matching using lower() +-- Expect: +-- 24|Warning: high temperature|true +-- 25|WARNING: system overload|true +-- 26|warned|false +-- 27|warnings|false +SELECT `id`, `log_message`, matches_term(lower(`log_message`), 'warning') as `matches_warning` FROM logs WHERE `id` >= 24 ORDER BY `id`; + +DROP TABLE logs;