From 3750819f937d13408012ae36f621311c780fb7eb Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Mon, 13 Apr 2026 21:04:11 +0800 Subject: [PATCH] fix: match term zh (#7952) * fix: match term zh Signed-off-by: discord9 * chore: per gemini Signed-off-by: discord9 * chore: revert accident change Signed-off-by: discord9 * feat: unicode script han Signed-off-by: discord9 --------- Signed-off-by: discord9 --- Cargo.lock | 1 + Cargo.toml | 1 + src/common/function/Cargo.toml | 1 + .../function/src/scalars/matches_term.rs | 170 ++++++++++++++---- src/index/src/fulltext_index/tokenizer.rs | 20 +++ .../common/function/matches_term.result | 65 +++++++ .../common/function/matches_term.sql | 10 ++ 7 files changed, 238 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f6339d83a..68d1dac297 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2348,6 +2348,7 @@ dependencies = [ "geohash", "h3o", "hyperloglogplus", + "icu_properties", "jsonb", "jsonpath-rust 0.7.5", "memchr", diff --git a/Cargo.toml b/Cargo.toml index 34e10d9173..66c35acee8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -161,6 +161,7 @@ humantime = "2.1" humantime-serde = "1.1" hyper = "1.1" hyper-util = "0.1" +icu_properties = "2.0.1" itertools = "0.14" jsonb = { version = "0.4.4", default-features = false } lazy_static = "1.4" diff --git a/src/common/function/Cargo.toml b/src/common/function/Cargo.toml index d164b9285d..43ddf9ae0c 100644 --- a/src/common/function/Cargo.toml +++ b/src/common/function/Cargo.toml @@ -47,6 +47,7 @@ geo-types = { version = "0.7", optional = true } geohash = { version = "0.13", optional = true } h3o = { version = "0.6", optional = true } hyperloglogplus = "0.4" +icu_properties.workspace = true jsonb.workspace = true jsonpath-rust = "0.7.5" memchr = "2.7" diff --git a/src/common/function/src/scalars/matches_term.rs b/src/common/function/src/scalars/matches_term.rs index 8dfb25cbc0..ec1b34d408 100644 --- 
a/src/common/function/src/scalars/matches_term.rs +++ b/src/common/function/src/scalars/matches_term.rs @@ -20,6 +20,8 @@ use datafusion_common::arrow::compute; use datafusion_common::arrow::datatypes::DataType; use datafusion_common::{DataFusionError, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; +use icu_properties::props::Script; +use icu_properties::{CodePointMapData, CodePointMapDataBorrowed}; use memchr::memmem; use crate::function::Function; @@ -27,10 +29,11 @@ use crate::function_registry::FunctionRegistry; /// Exact term/phrase matching function for text columns. /// -/// This function checks if a text column contains exact term/phrase matches -/// with non-alphanumeric boundaries. Designed for: -/// - Whole-word matching (e.g. "cat" in "cat!" but not in "category") +/// This function uses script-aware matching rules: +/// - ASCII-only terms keep whole-word style boundary matching (e.g. "cat" in "cat!" but not in "category") /// - Phrase matching (e.g. "hello world" in "note:hello world!") +/// - Terms containing Han characters match as contiguous substrings +/// - Mixed-script identifiers and numeric terms remain searchable in Chinese text /// /// # Signature /// `matches_term(text: String, term: String) -> Boolean` @@ -43,9 +46,8 @@ use crate::function_registry::FunctionRegistry; /// BooleanVector where each element indicates if the corresponding text /// contains an exact match of the term, following these rules: /// 1. Exact substring match found (case-sensitive) -/// 2. Match boundaries are either: -/// - Start/end of text -/// - Any non-alphanumeric character (including spaces, hyphens, punctuation, etc.) +/// 2. For ASCII-only terms, adjacent ASCII word characters block the match +/// 3.
For Han-containing terms, contiguous substring match is sufficient /// /// # Examples /// ``` @@ -60,6 +62,9 @@ use crate::function_registry::FunctionRegistry; /// SELECT matches_term(column, 'critical error') FROM logs; /// -- Match in: "ERROR:critical error!" /// -- No match: "critical_errors" +/// -- Chinese substring examples -- +/// SELECT matches_term(column, '手机') FROM table; +/// -- Text: "登录手机号18888888888的动态key" => true /// /// -- Empty string handling -- /// SELECT matches_term(column, '') FROM table; @@ -204,9 +209,8 @@ impl Function for MatchesTermFunction { /// /// A term is considered matched when: /// 1. The exact sequence appears in the text -/// 2. It is either: -/// - At the start/end of text with adjacent non-alphanumeric character -/// - Surrounded by non-alphanumeric characters +/// 2. ASCII-only terms are not adjacent to ASCII word characters +/// 3. Han-containing terms match as contiguous substrings /// /// # Examples /// ``` @@ -215,28 +219,105 @@ impl Function for MatchesTermFunction { /// assert!(finder.find("dog,cat")); // Term preceded by comma /// assert!(!finder.find("category")); // Partial match rejected /// -/// let finder = MatchesTermFinder::new("world"); -/// assert!(finder.find("hello-world")); // Hyphen boundary +/// let finder = MatchesTermFinder::new("手机"); +/// assert!(finder.find("登录手机号18888888888的动态key")); /// ``` #[derive(Clone, Debug)] pub struct MatchesTermFinder { finder: memmem::Finder<'static>, term: String, - starts_with_non_alnum: bool, - ends_with_non_alnum: bool, + term_kind: TermKind, + starts_with_other: bool, + ends_with_other: bool, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum CharClass { + AsciiWord, + Han, + UnicodeWord, + Other, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum TermKind { + AsciiLike, + UnicodeWord, + HanContaining, +} + +fn classify_char(c: char) -> CharClass { + if c.is_ascii_alphanumeric() { + CharClass::AsciiWord + } else if is_han(c) { + CharClass::Han + } else if 
c.is_alphanumeric() { + CharClass::UnicodeWord + } else { + CharClass::Other + } +} + +static HAN_SCRIPT_DATA: CodePointMapDataBorrowed<'static, Script> = + CodePointMapData::