fix: match term zh (#7952)

* fix: match term zh

Signed-off-by: discord9 <discord9@163.com>

* chore: per gemini

Signed-off-by: discord9 <discord9@163.com>

* chore: revert accident change

Signed-off-by: discord9 <discord9@163.com>

* feat: unicode script han

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
This commit is contained in:
discord9
2026-04-13 21:04:11 +08:00
committed by GitHub
parent a24c58e25c
commit 3750819f93
7 changed files with 238 additions and 30 deletions

View File

@@ -47,6 +47,7 @@ geo-types = { version = "0.7", optional = true }
geohash = { version = "0.13", optional = true }
h3o = { version = "0.6", optional = true }
hyperloglogplus = "0.4"
icu_properties.workspace = true
jsonb.workspace = true
jsonpath-rust = "0.7.5"
memchr = "2.7"

View File

@@ -20,6 +20,8 @@ use datafusion_common::arrow::compute;
use datafusion_common::arrow::datatypes::DataType;
use datafusion_common::{DataFusionError, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
use icu_properties::props::Script;
use icu_properties::{CodePointMapData, CodePointMapDataBorrowed};
use memchr::memmem;
use crate::function::Function;
@@ -27,10 +29,11 @@ use crate::function_registry::FunctionRegistry;
/// Exact term/phrase matching function for text columns.
///
/// This function checks if a text column contains exact term/phrase matches
/// with non-alphanumeric boundaries. Designed for:
/// - Whole-word matching (e.g. "cat" in "cat!" but not in "category")
/// This function uses script-aware matching rules:
/// - ASCII-only terms keep whole-word style boundary matching, like Whole-word matching (e.g. "cat" in "cat!" but not in "category")
/// - Phrase matching (e.g. "hello world" in "note:hello world!")
/// - Terms containing Han characters match as contiguous substrings
/// - Mixed-script identifiers and numeric terms remain searchable in Chinese text
///
/// # Signature
/// `matches_term(text: String, term: String) -> Boolean`
@@ -43,9 +46,8 @@ use crate::function_registry::FunctionRegistry;
/// BooleanVector where each element indicates if the corresponding text
/// contains an exact match of the term, following these rules:
/// 1. Exact substring match found (case-sensitive)
/// 2. Match boundaries are either:
/// - Start/end of text
/// - Any non-alphanumeric character (including spaces, hyphens, punctuation, etc.)
/// 2. For ASCII-only terms, adjacent ASCII word characters block the match
/// 3. For Han-containing terms, contiguous substring match is sufficient
///
/// # Examples
/// ```
@@ -60,6 +62,9 @@ use crate::function_registry::FunctionRegistry;
/// SELECT matches_term(column, 'critical error') FROM logs;
/// -- Match in: "ERROR:critical error!"
/// -- No match: "critical_errors"
/// -- Chinese substring examples --
/// SELECT matches_term(column, '手机') FROM table;
/// -- Text: "登录手机号18888888888的动态key" => true
///
/// -- Empty string handling --
/// SELECT matches_term(column, '') FROM table;
@@ -204,9 +209,8 @@ impl Function for MatchesTermFunction {
///
/// A term is considered matched when:
/// 1. The exact sequence appears in the text
/// 2. It is either:
/// - At the start/end of text with adjacent non-alphanumeric character
/// - Surrounded by non-alphanumeric characters
/// 2. ASCII-only terms are not adjacent to ASCII word characters
/// 3. Han-containing terms match as contiguous substrings
///
/// # Examples
/// ```
@@ -215,28 +219,105 @@ impl Function for MatchesTermFunction {
/// assert!(finder.find("dog,cat")); // Term preceded by comma
/// assert!(!finder.find("category")); // Partial match rejected
///
/// let finder = MatchesTermFinder::new("world");
/// assert!(finder.find("hello-world")); // Hyphen boundary
/// let finder = MatchesTermFinder::new("手机");
/// assert!(finder.find("登录手机号18888888888的动态key"));
/// ```
#[derive(Clone, Debug)]
pub struct MatchesTermFinder {
finder: memmem::Finder<'static>,
term: String,
starts_with_non_alnum: bool,
ends_with_non_alnum: bool,
term_kind: TermKind,
starts_with_other: bool,
ends_with_other: bool,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum CharClass {
AsciiWord,
Han,
UnicodeWord,
Other,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum TermKind {
AsciiLike,
UnicodeWord,
HanContaining,
}
fn classify_char(c: char) -> CharClass {
if c.is_ascii_alphanumeric() {
CharClass::AsciiWord
} else if is_han(c) {
CharClass::Han
} else if c.is_alphanumeric() {
CharClass::UnicodeWord
} else {
CharClass::Other
}
}
static HAN_SCRIPT_DATA: CodePointMapDataBorrowed<'static, Script> =
CodePointMapData::<Script>::new();
fn is_han(c: char) -> bool {
HAN_SCRIPT_DATA.get(c) == Script::Han
}
fn classify_term(term: &str) -> TermKind {
let mut has_han = false;
let mut has_unicode_word = false;
for c in term.chars() {
match classify_char(c) {
CharClass::AsciiWord => {}
CharClass::Han => has_han = true,
CharClass::UnicodeWord => has_unicode_word = true,
CharClass::Other => {}
}
}
if has_han {
TermKind::HanContaining
} else if has_unicode_word {
TermKind::UnicodeWord
} else {
TermKind::AsciiLike
}
}
fn boundary_ok(term_kind: TermKind, neighbor: Option<char>, term_has_other_boundary: bool) -> bool {
if term_has_other_boundary {
return true;
}
match term_kind {
TermKind::AsciiLike => !matches!(neighbor.map(classify_char), Some(CharClass::AsciiWord)),
TermKind::UnicodeWord => !matches!(
neighbor.map(classify_char),
Some(CharClass::AsciiWord | CharClass::UnicodeWord | CharClass::Han)
),
TermKind::HanContaining => true,
}
}
impl MatchesTermFinder {
/// Create a new `MatchesTermFinder` for the given term.
pub fn new(term: &str) -> Self {
let starts_with_non_alnum = term.chars().next().is_some_and(|c| !c.is_alphanumeric());
let ends_with_non_alnum = term.chars().last().is_some_and(|c| !c.is_alphanumeric());
let starts_with_other = term
.chars()
.next()
.is_some_and(|c| classify_char(c) == CharClass::Other);
let ends_with_other = term
.chars()
.last()
.is_some_and(|c| classify_char(c) == CharClass::Other);
Self {
finder: memmem::Finder::new(term).into_owned(),
term: term.to_string(),
starts_with_non_alnum,
ends_with_non_alnum,
term_kind: classify_term(term),
starts_with_other,
ends_with_other,
}
}
@@ -254,21 +335,17 @@ impl MatchesTermFinder {
while let Some(found_pos) = self.finder.find(&text.as_bytes()[pos..]) {
let actual_pos = pos + found_pos;
let prev_ok = self.starts_with_non_alnum
|| text[..actual_pos]
.chars()
.last()
.map(|c| !c.is_alphanumeric())
.unwrap_or(true);
let prev = text[..actual_pos].chars().last();
let prev_ok = self.starts_with_other || boundary_ok(self.term_kind, prev, false);
if prev_ok {
if self.term_kind == TermKind::HanContaining {
return true;
}
let next_pos = actual_pos + self.finder.needle().len();
let next_ok = self.ends_with_non_alnum
|| text[next_pos..]
.chars()
.next()
.map(|c| !c.is_alphanumeric())
.unwrap_or(true);
let next = text[next_pos..].chars().next();
let next_ok = self.ends_with_other || boundary_ok(self.term_kind, next, false);
if next_ok {
return true;
@@ -369,6 +446,25 @@ mod tests {
assert!(!MatchesTermFinder::new("v1.0").find("v1.0a"));
}
#[test]
fn mixed_script_terms_match_inside_chinese_context() {
let text = "登录手机号18888888888的动态key";
assert!(MatchesTermFinder::new("手机号").find(text));
assert!(MatchesTermFinder::new("18888888888").find(text));
assert!(MatchesTermFinder::new("手机").find(text));
assert!(MatchesTermFinder::new("机号").find(text));
assert!(MatchesTermFinder::new("机号1888").find(text));
assert!(MatchesTermFinder::new("农业").find("中国农业银行"));
assert!(MatchesTermFinder::new("error").find("错误error日志"));
}
#[test]
fn underscore_still_counts_as_boundary_for_ascii_terms() {
assert!(MatchesTermFinder::new("world").find("hello_world"));
assert!(MatchesTermFinder::new("id").find("trace_id=abc"));
assert!(!MatchesTermFinder::new("error").find("criticalerrors"));
}
#[test]
fn adjacent_alphanumeric_fails() {
assert!(!MatchesTermFinder::new("cat").find("cat5"));
@@ -406,4 +502,18 @@ mod tests {
assert!(MatchesTermFinder::new("中文").find("这是中文测试,中文!"));
assert!(MatchesTermFinder::new("error").find("错误errorerror日志_error!"));
}
#[test]
fn han_terms_match_as_contiguous_substrings() {
assert!(MatchesTermFinder::new("行账号").find("中国农业银行账号"));
assert!(MatchesTermFinder::new("登录").find("登录手机号18888888888的动态key"));
}
#[test]
fn han_detection_uses_script_not_all_cjk() {
assert!(is_han('汉'));
assert!(is_han('\u{30000}'));
assert!(!is_han('あ'));
assert!(!is_han('한'));
}
}