fix: match term zh (#7952)

* fix: match term zh

  Signed-off-by: discord9 <discord9@163.com>

* chore: per gemini

  Signed-off-by: discord9 <discord9@163.com>

* chore: revert accident change

  Signed-off-by: discord9 <discord9@163.com>

* feat: unicode script han

  Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2026-05-25 09:20:40 +00:00 · 2026-04-13 21:04:11 +08:00
parent a24c58e25c
commit 3750819f93
7 changed files with 238 additions and 30 deletions
--- a/src/common/function/Cargo.toml
+++ b/src/common/function/Cargo.toml
@@ -47,6 +47,7 @@ geo-types = { version = "0.7", optional = true }
 geohash = { version = "0.13", optional = true }
 h3o = { version = "0.6", optional = true }
 hyperloglogplus = "0.4"
+icu_properties.workspace = true
 jsonb.workspace = true
 jsonpath-rust = "0.7.5"
 memchr = "2.7"
--- a/src/common/function/src/scalars/matches_term.rs
+++ b/src/common/function/src/scalars/matches_term.rs
@@ -20,6 +20,8 @@ use datafusion_common::arrow::compute;
 use datafusion_common::arrow::datatypes::DataType;
 use datafusion_common::{DataFusionError, ScalarValue};
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
+use icu_properties::props::Script;
+use icu_properties::{CodePointMapData, CodePointMapDataBorrowed};
 use memchr::memmem;

 use crate::function::Function;
@@ -27,10 +29,11 @@ use crate::function_registry::FunctionRegistry;

 /// Exact term/phrase matching function for text columns.
 ///
-/// This function checks if a text column contains exact term/phrase matches
-/// with non-alphanumeric boundaries. Designed for:
-/// - Whole-word matching (e.g. "cat" in "cat!" but not in "category")
+/// This function uses script-aware matching rules:
+/// - ASCII-only terms keep whole-word boundary matching (e.g. "cat" matches in "cat!" but not in "category")
 /// - Phrase matching (e.g. "hello world" in "note:hello world!")
+/// - Terms containing Han characters match as contiguous substrings
+/// - Mixed-script identifiers and numeric terms remain searchable in Chinese text
 ///
 /// # Signature
 /// `matches_term(text: String, term: String) -> Boolean`
@@ -43,9 +46,8 @@ use crate::function_registry::FunctionRegistry;
 /// BooleanVector where each element indicates if the corresponding text
 /// contains an exact match of the term, following these rules:
 /// 1. Exact substring match found (case-sensitive)
-/// 2. Match boundaries are either:
-///    - Start/end of text
-///    - Any non-alphanumeric character (including spaces, hyphens, punctuation, etc.)
+/// 2. For ASCII-only terms, an adjacent ASCII alphanumeric character blocks the match (`_` still counts as a boundary)
+/// 3. For Han-containing terms, contiguous substring match is sufficient
 ///
 /// # Examples
 /// ```
@@ -60,6 +62,9 @@ use crate::function_registry::FunctionRegistry;
 /// SELECT matches_term(column, 'critical error') FROM logs;
 /// -- Match in: "ERROR:critical error!"
 /// -- No match: "critical_errors"
+/// -- Chinese substring examples --
+/// SELECT matches_term(column, '手机') FROM table;
+/// -- Text: "登录手机号18888888888的动态key" => true
 ///
 /// -- Empty string handling --
 /// SELECT matches_term(column, '') FROM table;
@@ -204,9 +209,8 @@ impl Function for MatchesTermFunction {
 ///
 /// A term is considered matched when:
 /// 1. The exact sequence appears in the text
-/// 2. It is either:
-///    - At the start/end of text with adjacent non-alphanumeric character
-///    - Surrounded by non-alphanumeric characters
+/// 2. For ASCII-only terms, no ASCII alphanumeric character is adjacent to the match
+/// 3. Han-containing terms match as contiguous substrings
 ///
 /// # Examples
 /// ```
@@ -215,28 +219,105 @@ impl Function for MatchesTermFunction {
 /// assert!(finder.find("dog,cat"));   // Term preceded by comma
 /// assert!(!finder.find("category")); // Partial match rejected
 ///
-/// let finder = MatchesTermFinder::new("world");
-/// assert!(finder.find("hello-world")); // Hyphen boundary
+/// let finder = MatchesTermFinder::new("手机");
+/// assert!(finder.find("登录手机号18888888888的动态key"));
 /// ```
 #[derive(Clone, Debug)]
 pub struct MatchesTermFinder {
    finder: memmem::Finder<'static>,
    term: String,
-    starts_with_non_alnum: bool,
-    ends_with_non_alnum: bool,
+    term_kind: TermKind,
+    starts_with_other: bool,
+    ends_with_other: bool,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum CharClass {
+    AsciiWord,
+    Han,
+    UnicodeWord,
+    Other,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum TermKind {
+    AsciiLike,
+    UnicodeWord,
+    HanContaining,
+}
+
+fn classify_char(c: char) -> CharClass {
+    if c.is_ascii_alphanumeric() {
+        CharClass::AsciiWord
+    } else if is_han(c) {
+        CharClass::Han
+    } else if c.is_alphanumeric() {
+        CharClass::UnicodeWord
+    } else {
+        CharClass::Other
+    }
+}
+
+static HAN_SCRIPT_DATA: CodePointMapDataBorrowed<'static, Script> =
+    CodePointMapData::<Script>::new();
+
+fn is_han(c: char) -> bool {
+    HAN_SCRIPT_DATA.get(c) == Script::Han
+}
+
+fn classify_term(term: &str) -> TermKind {
+    let mut has_han = false;
+    let mut has_unicode_word = false;
+    for c in term.chars() {
+        match classify_char(c) {
+            CharClass::AsciiWord => {}
+            CharClass::Han => has_han = true,
+            CharClass::UnicodeWord => has_unicode_word = true,
+            CharClass::Other => {}
+        }
+    }
+
+    if has_han {
+        TermKind::HanContaining
+    } else if has_unicode_word {
+        TermKind::UnicodeWord
+    } else {
+        TermKind::AsciiLike
+    }
+}
+
+fn boundary_ok(term_kind: TermKind, neighbor: Option<char>, term_has_other_boundary: bool) -> bool {
+    if term_has_other_boundary {
+        return true;
+    }
+
+    match term_kind {
+        TermKind::AsciiLike => !matches!(neighbor.map(classify_char), Some(CharClass::AsciiWord)),
+        TermKind::UnicodeWord => !matches!(
+            neighbor.map(classify_char),
+            Some(CharClass::AsciiWord | CharClass::UnicodeWord | CharClass::Han)
+        ),
+        TermKind::HanContaining => true,
+    }
 }

 impl MatchesTermFinder {
    /// Create a new `MatchesTermFinder` for the given term.
    pub fn new(term: &str) -> Self {
-        let starts_with_non_alnum = term.chars().next().is_some_and(|c| !c.is_alphanumeric());
-        let ends_with_non_alnum = term.chars().last().is_some_and(|c| !c.is_alphanumeric());
-
+        let starts_with_other = term
+            .chars()
+            .next()
+            .is_some_and(|c| classify_char(c) == CharClass::Other);
+        let ends_with_other = term
+            .chars()
+            .last()
+            .is_some_and(|c| classify_char(c) == CharClass::Other);
        Self {
            finder: memmem::Finder::new(term).into_owned(),
            term: term.to_string(),
-            starts_with_non_alnum,
-            ends_with_non_alnum,
+            term_kind: classify_term(term),
+            starts_with_other,
+            ends_with_other,
        }
    }

@@ -254,21 +335,17 @@ impl MatchesTermFinder {
        while let Some(found_pos) = self.finder.find(&text.as_bytes()[pos..]) {
            let actual_pos = pos + found_pos;

-            let prev_ok = self.starts_with_non_alnum
-                || text[..actual_pos]
-                    .chars()
-                    .last()
-                    .map(|c| !c.is_alphanumeric())
-                    .unwrap_or(true);
+            let prev = text[..actual_pos].chars().last();
+            let prev_ok = self.starts_with_other || boundary_ok(self.term_kind, prev, false);

            if prev_ok {
+                if self.term_kind == TermKind::HanContaining {
+                    return true;
+                }
+
                let next_pos = actual_pos + self.finder.needle().len();
-                let next_ok = self.ends_with_non_alnum
-                    || text[next_pos..]
-                        .chars()
-                        .next()
-                        .map(|c| !c.is_alphanumeric())
-                        .unwrap_or(true);
+                let next = text[next_pos..].chars().next();
+                let next_ok = self.ends_with_other || boundary_ok(self.term_kind, next, false);

                if next_ok {
                    return true;
@@ -369,6 +446,25 @@ mod tests {
        assert!(!MatchesTermFinder::new("v1.0").find("v1.0a"));
    }

+    #[test]
+    fn mixed_script_terms_match_inside_chinese_context() {
+        let text = "登录手机号18888888888的动态key";
+        assert!(MatchesTermFinder::new("手机号").find(text));
+        assert!(MatchesTermFinder::new("18888888888").find(text));
+        assert!(MatchesTermFinder::new("手机").find(text));
+        assert!(MatchesTermFinder::new("机号").find(text));
+        assert!(MatchesTermFinder::new("机号1888").find(text));
+        assert!(MatchesTermFinder::new("农业").find("中国农业银行"));
+        assert!(MatchesTermFinder::new("error").find("错误error日志"));
+    }
+
+    #[test]
+    fn underscore_still_counts_as_boundary_for_ascii_terms() {
+        assert!(MatchesTermFinder::new("world").find("hello_world"));
+        assert!(MatchesTermFinder::new("id").find("trace_id=abc"));
+        assert!(!MatchesTermFinder::new("error").find("criticalerrors"));
+    }
+
    #[test]
    fn adjacent_alphanumeric_fails() {
        assert!(!MatchesTermFinder::new("cat").find("cat5"));
@@ -406,4 +502,18 @@ mod tests {
        assert!(MatchesTermFinder::new("中文").find("这是中文测试，中文！"));
        assert!(MatchesTermFinder::new("error").find("错误errorerror日志_error!"));
    }
+
+    #[test]
+    fn han_terms_match_as_contiguous_substrings() {
+        assert!(MatchesTermFinder::new("行账号").find("中国农业银行账号"));
+        assert!(MatchesTermFinder::new("登录").find("登录手机号18888888888的动态key"));
+    }
+
+    #[test]
+    fn han_detection_uses_script_not_all_cjk() {
+        assert!(is_han('汉'));
+        assert!(is_han('\u{30000}'));
+        assert!(!is_han('あ'));
+        assert!(!is_han('한'));
+    }
 }