fix: match term zh

Signed-off-by: discord9 <discord9@163.com>
This commit is contained in:
discord9
2026-04-10 15:45:48 +08:00
parent 76cad696c6
commit 637e7eda5c
5 changed files with 355 additions and 83 deletions

View File

@@ -27,10 +27,11 @@ use crate::function_registry::FunctionRegistry;
/// Exact term/phrase matching function for text columns.
///
/// This function checks if a text column contains exact term/phrase matches
/// with non-alphanumeric boundaries. Designed for:
/// - Whole-word matching (e.g. "cat" in "cat!" but not in "category")
/// This function uses script-aware matching rules:
/// - ASCII-only terms keep whole-word boundary matching (e.g. "cat" in "cat!" but not in "category")
/// - Phrase matching (e.g. "hello world" in "note:hello world!")
/// - Terms containing Han characters match as contiguous substrings
/// - Mixed-script identifiers and numeric terms remain searchable in Chinese text
///
/// # Signature
/// `matches_term(text: String, term: String) -> Boolean`
@@ -43,9 +44,8 @@ use crate::function_registry::FunctionRegistry;
/// BooleanVector where each element indicates if the corresponding text
/// contains an exact match of the term, following these rules:
/// 1. Exact substring match found (case-sensitive)
/// 2. Match boundaries are either:
/// - Start/end of text
/// - Any non-alphanumeric character (including spaces, hyphens, punctuation, etc.)
/// 2. For ASCII-only terms, adjacent ASCII word characters block the match
/// 3. For Han-containing terms, contiguous substring match is sufficient
///
/// # Examples
/// ```
@@ -60,6 +60,9 @@ use crate::function_registry::FunctionRegistry;
/// SELECT matches_term(column, 'critical error') FROM logs;
/// -- Match in: "ERROR:critical error!"
/// -- No match: "critical_errors"
/// -- Chinese substring examples --
/// SELECT matches_term(column, '手机') FROM table;
/// -- Text: "登录手机号18888888888的动态key" => true
///
/// -- Empty string handling --
/// SELECT matches_term(column, '') FROM table;
@@ -204,9 +207,8 @@ impl Function for MatchesTermFunction {
///
/// A term is considered matched when:
/// 1. The exact sequence appears in the text
/// 2. It is either:
/// - At the start/end of text with adjacent non-alphanumeric character
/// - Surrounded by non-alphanumeric characters
/// 2. ASCII-only terms are not adjacent to ASCII word characters
/// 3. Han-containing terms match as contiguous substrings
///
/// # Examples
/// ```
@@ -215,28 +217,113 @@ impl Function for MatchesTermFunction {
/// assert!(finder.find("dog,cat")); // Term preceded by comma
/// assert!(!finder.find("category")); // Partial match rejected
///
/// let finder = MatchesTermFinder::new("world");
/// assert!(finder.find("hello-world")); // Hyphen boundary
/// let finder = MatchesTermFinder::new("手机");
/// assert!(finder.find("登录手机号18888888888的动态key"));
/// ```
#[derive(Clone, Debug)]
pub struct MatchesTermFinder {
finder: memmem::Finder<'static>,
term: String,
starts_with_non_alnum: bool,
ends_with_non_alnum: bool,
term_kind: TermKind,
starts_with_other: bool,
ends_with_other: bool,
}
/// Coarse classification of a single character, as produced by `classify_char`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum CharClass {
/// ASCII letter or digit (`char::is_ascii_alphanumeric`).
AsciiWord,
/// Character accepted by `is_han`.
Han,
/// Any other character for which `char::is_alphanumeric` is true.
UnicodeWord,
/// Everything else (whitespace, punctuation, underscore, symbols, ...).
Other,
}
/// Classification of a whole search term, as produced by `classify_term`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum TermKind {
/// The term has no Han and no non-ASCII alphanumeric characters
/// (it consists only of ASCII word characters and/or `Other` characters).
AsciiLike,
/// The term has at least one non-ASCII, non-Han alphanumeric character and no Han character.
UnicodeWord,
/// The term has at least one Han character.
HanContaining,
}
/// Maps a character to its [`CharClass`].
///
/// The checks are ordered: ASCII alphanumerics first, then Han ideographs,
/// then any remaining Unicode alphanumeric; everything else is `Other`.
fn classify_char(c: char) -> CharClass {
    match c {
        _ if c.is_ascii_alphanumeric() => CharClass::AsciiWord,
        _ if is_han(c) => CharClass::Han,
        _ if c.is_alphanumeric() => CharClass::UnicodeWord,
        _ => CharClass::Other,
    }
}
/// Returns `true` when `c` falls inside one of the Unicode CJK ideograph blocks.
///
/// The check is a pure code-point range test. Besides the base block and
/// Extensions A–G it also covers Extension H, Extension I and the CJK
/// Compatibility Ideographs Supplement, so that ideographs from those blocks
/// are treated as Han rather than as generic Unicode word characters.
fn is_han(c: char) -> bool {
    matches!(
        c as u32,
        0x3400..=0x4DBF // CJK Unified Ideographs Extension A
            | 0x4E00..=0x9FFF // CJK Unified Ideographs
            | 0xF900..=0xFAFF // CJK Compatibility Ideographs
            | 0x20000..=0x2A6DF // Extension B
            | 0x2A700..=0x2B73F // Extension C
            | 0x2B740..=0x2B81F // Extension D
            | 0x2B820..=0x2CEAF // Extension E
            | 0x2CEB0..=0x2EBEF // Extension F
            | 0x2EBF0..=0x2EE5F // Extension I
            | 0x2F800..=0x2FA1F // CJK Compatibility Ideographs Supplement
            | 0x30000..=0x3134F // Extension G
            | 0x31350..=0x323AF // Extension H
    )
}
fn classify_term(term: &str) -> TermKind {
let mut has_han = false;
let mut has_unicode_word = false;
for c in term.chars() {
match classify_char(c) {
CharClass::AsciiWord => {}
CharClass::Han => has_han = true,
CharClass::UnicodeWord => has_unicode_word = true,
CharClass::Other => {}
}
}
if has_han {
TermKind::HanContaining
} else if has_unicode_word {
TermKind::UnicodeWord
} else {
TermKind::AsciiLike
}
}
/// Decides whether the character adjacent to a candidate match permits the match.
///
/// * `term_kind` - classification of the search term.
/// * `neighbor` - the character directly before or after the match, `None` at a text edge.
/// * `term_has_other_boundary` - when `true`, the boundary is accepted unconditionally.
///
/// A missing neighbor is always accepted. ASCII-like terms are blocked only by an
/// ASCII word character; Unicode-word terms are blocked by any ASCII word,
/// Unicode word or Han character; Han-containing terms are never blocked.
fn boundary_ok(term_kind: TermKind, neighbor: Option<char>, term_has_other_boundary: bool) -> bool {
    if term_has_other_boundary {
        return true;
    }

    let neighbor_class = neighbor.map(classify_char);
    match (term_kind, neighbor_class) {
        (TermKind::HanContaining, _) | (_, None) => true,
        (TermKind::AsciiLike, Some(class)) => class != CharClass::AsciiWord,
        (TermKind::UnicodeWord, Some(class)) => !matches!(
            class,
            CharClass::AsciiWord | CharClass::UnicodeWord | CharClass::Han
        ),
    }
}
impl MatchesTermFinder {
/// Create a new `MatchesTermFinder` for the given term.
pub fn new(term: &str) -> Self {
let starts_with_non_alnum = term.chars().next().is_some_and(|c| !c.is_alphanumeric());
let ends_with_non_alnum = term.chars().last().is_some_and(|c| !c.is_alphanumeric());
let starts_with_other = term
.chars()
.next()
.is_some_and(|c| classify_char(c) == CharClass::Other);
let ends_with_other = term
.chars()
.last()
.is_some_and(|c| classify_char(c) == CharClass::Other);
Self {
finder: memmem::Finder::new(term).into_owned(),
term: term.to_string(),
starts_with_non_alnum,
ends_with_non_alnum,
term_kind: classify_term(term),
starts_with_other,
ends_with_other,
}
}
@@ -254,23 +341,20 @@ impl MatchesTermFinder {
while let Some(found_pos) = self.finder.find(&text.as_bytes()[pos..]) {
let actual_pos = pos + found_pos;
let prev_ok = self.starts_with_non_alnum
|| text[..actual_pos]
.chars()
.last()
.map(|c| !c.is_alphanumeric())
.unwrap_or(true);
let prev = text[..actual_pos].chars().last();
let prev_ok = self.starts_with_other || boundary_ok(self.term_kind, prev, false);
if prev_ok {
let next_pos = actual_pos + self.finder.needle().len();
let next_ok = self.ends_with_non_alnum
|| text[next_pos..]
.chars()
.next()
.map(|c| !c.is_alphanumeric())
.unwrap_or(true);
let next = text[next_pos..].chars().next();
let next_ok = self.ends_with_other || boundary_ok(self.term_kind, next, false);
if next_ok {
let match_ok = match self.term_kind {
TermKind::HanContaining => true,
_ => prev_ok && next_ok,
};
if match_ok {
return true;
}
}
@@ -369,6 +453,25 @@ mod tests {
assert!(!MatchesTermFinder::new("v1.0").find("v1.0a"));
}
#[test]
fn mixed_script_terms_match_inside_chinese_context() {
    // Small helper so each case reads as "term is found in text".
    let found = |term: &str, text: &str| MatchesTermFinder::new(term).find(text);

    // Han, numeric and mixed Han/numeric terms inside one Chinese sentence.
    let sample = "登录手机号18888888888的动态key";
    assert!(found("手机号", sample));
    assert!(found("18888888888", sample));
    assert!(found("手机", sample));
    assert!(found("机号", sample));
    assert!(found("机号1888", sample));

    // A Han term inside a longer Han run, and an ASCII term surrounded by Han.
    assert!(found("农业", "中国农业银行"));
    assert!(found("error", "错误error日志"));
}
#[test]
fn underscore_still_counts_as_boundary_for_ascii_terms() {
    let found = |term: &str, text: &str| MatchesTermFinder::new(term).find(text);

    // An underscore next to the term does not block the match.
    assert!(found("world", "hello_world"));
    assert!(found("id", "trace_id=abc"));

    // ASCII letters on both sides still block it.
    assert!(!found("error", "criticalerrors"));
}
#[test]
fn adjacent_alphanumeric_fails() {
assert!(!MatchesTermFinder::new("cat").find("cat5"));
@@ -406,4 +509,10 @@ mod tests {
assert!(MatchesTermFinder::new("中文").find("这是中文测试,中文!"));
assert!(MatchesTermFinder::new("error").find("错误errorerror日志_error!"));
}
#[test]
fn han_terms_match_as_contiguous_substrings() {
    let found = |term: &str, text: &str| MatchesTermFinder::new(term).find(text);

    // Han terms match anywhere inside a Han run, including at the very start of the text.
    assert!(found("行账号", "中国农业银行账号"));
    assert!(found("登录", "登录手机号18888888888的动态key"));
}
}

View File

@@ -167,6 +167,26 @@ mod tests {
assert_eq!(tokens, vec!["", "喜欢", "苹果"]);
}
#[test]
fn test_chinese_tokenizer_issue_7943_sample() {
    // Sample text from issue 7943: Han words, a long digit run and ASCII identifiers.
    let text = "登录手机号18888888888的动态key829889AC8";
    let expected = vec![
        "登录",
        "手机号",
        "18888888888",
        "",
        "动态",
        "key",
        "",
        "829889AC8",
    ];

    let actual = ChineseTokenizer.tokenize(text);

    assert_eq!(actual, expected);
}
#[test]
fn test_valid_ascii_token_lookup_table() {
// Test all ASCII values in a single loop

View File

@@ -23,6 +23,8 @@ use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_function::function::FunctionContext;
use common_query::prelude::greptime_value;
use common_time::Timestamp;
use common_time::timestamp::TimeUnit;
use datafusion::common::DFSchemaRef;
use datafusion::datasource::DefaultTableSource;
use datafusion::functions_aggregate::average::avg_udaf;
@@ -91,9 +93,9 @@ use crate::promql::error::{
InvalidRegularExpressionSnafu, InvalidTimeRangeSnafu, MultiFieldsNotSupportedSnafu,
MultipleMetricMatchersSnafu, MultipleVectorSnafu, NoMetricMatcherSnafu, PromqlPlanNodeSnafu,
Result, SameLabelSetSnafu, TableNameNotFoundSnafu, TimeIndexNotFoundSnafu,
UnexpectedPlanExprSnafu, UnexpectedTokenSnafu, UnknownTableSnafu, UnsupportedExprSnafu,
UnsupportedMatcherOpSnafu, UnsupportedVectorMatchSnafu, ValueNotFoundSnafu,
ZeroRangeSelectorSnafu,
TimestampOutOfRangeSnafu, UnexpectedPlanExprSnafu, UnexpectedTokenSnafu, UnknownTableSnafu,
UnsupportedExprSnafu, UnsupportedMatcherOpSnafu, UnsupportedVectorMatchSnafu,
ValueNotFoundSnafu, ZeroRangeSelectorSnafu,
};
use crate::query_engine::QueryEngineState;
@@ -1221,26 +1223,34 @@ impl PromPlanner {
label_matchers: Matchers,
is_range_selector: bool,
) -> Result<LogicalPlan> {
// make table scan plan
let table_ref = self.table_ref()?;
let mut table_scan = self.create_table_scan_plan(table_ref.clone()).await?;
let table_schema = table_scan.schema();
// make filter exprs
let offset_duration = match offset {
Some(Offset::Pos(duration)) => duration.as_millis() as Millisecond,
Some(Offset::Neg(duration)) => -(duration.as_millis() as Millisecond),
None => 0,
};
// make table scan plan
let table_ref = self.table_ref()?;
let (mut table_scan, time_filter_pushed_down) = self
.create_table_scan_plan(table_ref.clone(), offset_duration)
.await?;
let table_schema = table_scan.schema();
// make filter exprs
let mut scan_filters = Self::matchers_to_expr(label_matchers.clone(), table_schema)?;
if let Some(time_index_filter) = self.build_time_index_filter(offset_duration)? {
if !time_filter_pushed_down
&& let Some(time_index_filter) =
self.build_time_index_filter(offset_duration, TimeUnit::Millisecond)?
{
scan_filters.push(time_index_filter);
}
table_scan = LogicalPlanBuilder::from(table_scan)
.filter(conjunction(scan_filters).unwrap()) // Safety: `scan_filters` is not empty.
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)?;
if let Some(scan_filter) = conjunction(scan_filters) {
table_scan = LogicalPlanBuilder::from(table_scan)
.filter(scan_filter)
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)?;
}
// make a projection plan if there is any `__field__` matcher
if let Some(field_matchers) = &self.ctx.field_column_matcher {
@@ -1590,7 +1600,11 @@ impl PromPlanner {
Ok(table_ref)
}
fn build_time_index_filter(&self, offset_duration: i64) -> Result<Option<DfExpr>> {
fn build_time_index_filter(
&self,
offset_duration: i64,
time_index_unit: TimeUnit,
) -> Result<Option<DfExpr>> {
let start = self.ctx.start;
let end = self.ctx.end;
if end < start {
@@ -1614,56 +1628,92 @@ impl PromPlanner {
// Scan a continuous time range
if (end - start) / interval > MAX_SCATTER_POINTS || interval <= INTERVAL_1H {
let lower_bound = self.build_scan_time_filter_literal(
start - offset_duration - selector_window + lower_exclusive_adjustment,
time_index_unit,
)?;
let upper_bound = self.build_scan_time_filter_literal(
self.ctx
.end
.checked_sub(offset_duration)
.and_then(|ts| ts.checked_add(1))
.with_context(|| TimestampOutOfRangeSnafu {
timestamp: self.ctx.end - offset_duration,
unit: TimeUnit::Millisecond,
})?,
time_index_unit,
)?;
let single_time_range = time_index_expr
.clone()
.gt_eq(DfExpr::Literal(
ScalarValue::TimestampMillisecond(
Some(
self.ctx.start - offset_duration - selector_window
+ lower_exclusive_adjustment,
),
None,
),
None,
))
.and(time_index_expr.lt_eq(DfExpr::Literal(
ScalarValue::TimestampMillisecond(Some(self.ctx.end - offset_duration), None),
None,
)));
.gt_eq(lower_bound)
.and(time_index_expr.lt(upper_bound));
return Ok(Some(single_time_range));
}
// Otherwise scan scatter ranges separately
let mut filters = Vec::with_capacity(num_points as usize + 1);
for timestamp in (start..=end).step_by(interval as usize) {
let lower_bound = self.build_scan_time_filter_literal(
timestamp - offset_duration - selector_window + lower_exclusive_adjustment,
time_index_unit,
)?;
let upper_bound = self.build_scan_time_filter_literal(
timestamp
.checked_sub(offset_duration)
.and_then(|ts| ts.checked_add(1))
.with_context(|| TimestampOutOfRangeSnafu {
timestamp: timestamp - offset_duration,
unit: TimeUnit::Millisecond,
})?,
time_index_unit,
)?;
filters.push(
time_index_expr
.clone()
.gt_eq(DfExpr::Literal(
ScalarValue::TimestampMillisecond(
Some(
timestamp - offset_duration - selector_window
+ lower_exclusive_adjustment,
),
None,
),
None,
))
.and(time_index_expr.clone().lt_eq(DfExpr::Literal(
ScalarValue::TimestampMillisecond(Some(timestamp - offset_duration), None),
None,
))),
.gt_eq(lower_bound)
.and(time_index_expr.clone().lt(upper_bound)),
)
}
Ok(filters.into_iter().reduce(DfExpr::or))
}
/// Builds the literal expression used as a bound in a time index scan filter.
///
/// `timestamp_ms` is interpreted as a millisecond timestamp and converted to
/// `time_index_unit` via `convert_to_ceil`, then wrapped as a timestamp
/// scalar literal of that unit.
///
/// # Errors
/// Returns a `TimestampOutOfRange` error when the conversion yields no value.
fn build_scan_time_filter_literal(
    &self,
    timestamp_ms: i64,
    time_index_unit: TimeUnit,
) -> Result<DfExpr> {
    let millis = Timestamp::new(timestamp_ms, TimeUnit::Millisecond);
    let converted =
        millis
            .convert_to_ceil(time_index_unit)
            .with_context(|| TimestampOutOfRangeSnafu {
                timestamp: timestamp_ms,
                unit: time_index_unit,
            })?;
    let scalar = Self::timestamp_to_scalar_value(converted);
    Ok(DfExpr::Literal(scalar, None))
}
/// Converts a `Timestamp` into the `ScalarValue` timestamp variant that
/// matches its unit, carrying the raw value and no timezone.
fn timestamp_to_scalar_value(timestamp: Timestamp) -> ScalarValue {
    let raw = Some(timestamp.value());
    let timezone = None;
    match timestamp.unit() {
        TimeUnit::Second => ScalarValue::TimestampSecond(raw, timezone),
        TimeUnit::Millisecond => ScalarValue::TimestampMillisecond(raw, timezone),
        TimeUnit::Microsecond => ScalarValue::TimestampMicrosecond(raw, timezone),
        TimeUnit::Nanosecond => ScalarValue::TimestampNanosecond(raw, timezone),
    }
}
/// Create a table scan plan and a filter plan with given filter.
///
/// # Panic
/// If the filter is empty
async fn create_table_scan_plan(&mut self, table_ref: TableReference) -> Result<LogicalPlan> {
async fn create_table_scan_plan(
&mut self,
table_ref: TableReference,
offset_duration: i64,
) -> Result<(LogicalPlan, bool)> {
let provider = self
.table_provider
.resolve_table(table_ref.clone())
@@ -1756,14 +1806,19 @@ impl PromPlanner {
self.ctx.tag_columns.clone()
};
let is_time_index_ms = scan_table
let time_index_unit = scan_table
.schema()
.timestamp_column()
.with_context(|| TimeIndexNotFoundSnafu {
table: maybe_phy_table_ref.to_quoted_string(),
})?
.data_type
== ConcreteDataType::timestamp_millisecond_datatype();
.as_timestamp()
.with_context(|| TimeIndexNotFoundSnafu {
table: maybe_phy_table_ref.to_quoted_string(),
})?
.unit();
let is_time_index_ms = time_index_unit == TimeUnit::Millisecond;
let scan_projection = if table_id_filter.is_some() {
let mut required_columns = HashSet::new();
@@ -1816,6 +1871,17 @@ impl PromPlanner {
.context(DataFusionPlanningSnafu)?;
}
if !is_time_index_ms
&& let Some(time_index_filter) =
self.build_time_index_filter(offset_duration, time_index_unit)?
{
scan_plan = LogicalPlanBuilder::from(scan_plan)
.filter(time_index_filter)
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)?;
}
if !is_time_index_ms {
// cast to ms if time_index not in Millisecond precision
let expr: Vec<_> = self
@@ -1882,7 +1948,7 @@ impl PromPlanner {
let result = LogicalPlanBuilder::from(scan_plan)
.build()
.context(DataFusionPlanningSnafu)?;
Ok(result)
Ok((result, !is_time_index_ms))
}
fn collect_row_key_tag_columns_from_plan(
@@ -6085,9 +6151,10 @@ mod test {
"PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
\n PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
\n Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
\n Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-999, None) AND metrics.timestamp <= TimestampMillisecond(100000000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
\n Filter: metrics.tag = Utf8(\"1\") [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
\n Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(ms)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
\n TableScan: metrics [tag:Utf8, timestamp:Timestamp(ns), field:Float64;N]"
\n Filter: metrics.timestamp >= TimestampNanosecond(-999000000, None) AND metrics.timestamp < TimestampNanosecond(100000001000000, None) [tag:Utf8, timestamp:Timestamp(ns), field:Float64;N]\
\n TableScan: metrics [tag:Utf8, timestamp:Timestamp(ns), field:Float64;N]"
);
let plan = PromPlanner::stmt_to_plan(
DfTableSourceProvider::new(
@@ -6118,9 +6185,10 @@ mod test {
\n PromSeriesNormalize: offset=[0], time index=[timestamp], filter NaN: [true] [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
\n PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
\n Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
\n Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-4999, None) AND metrics.timestamp <= TimestampMillisecond(100000000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
\n Filter: metrics.tag = Utf8(\"1\") [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
\n Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(ms)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(ms)]\
\n TableScan: metrics [tag:Utf8, timestamp:Timestamp(ns), field:Float64;N]"
\n Filter: metrics.timestamp >= TimestampNanosecond(-4999000000, None) AND metrics.timestamp < TimestampNanosecond(100000001000000, None) [tag:Utf8, timestamp:Timestamp(ns), field:Float64;N]\
\n TableScan: metrics [tag:Utf8, timestamp:Timestamp(ns), field:Float64;N]"
);
}

View File

@@ -157,6 +157,71 @@ SELECT matches_term('русский!', 'русский') as result;
| true |
+--------+
-- Phase 1 mixed Chinese and numeric behavior
SELECT matches_term('登录手机号18888888888的动态key', '手机号') as result;
+--------+
| result |
+--------+
| true |
+--------+
SELECT matches_term('登录手机号18888888888的动态key', '18888888888') as result;
+--------+
| result |
+--------+
| true |
+--------+
SELECT matches_term('登录手机号18888888888的动态key', '手机') as result;
+--------+
| result |
+--------+
| true |
+--------+
SELECT matches_term('登录手机号18888888888的动态key', '机号') as result;
+--------+
| result |
+--------+
| true |
+--------+
SELECT matches_term('登录手机号18888888888的动态key', '机号1888') as result;
+--------+
| result |
+--------+
| true |
+--------+
SELECT matches_term('中国农业银行', '农业') as result;
+--------+
| result |
+--------+
| true |
+--------+
SELECT matches_term('中国农业银行账号', '行账号') as result;
+--------+
| result |
+--------+
| true |
+--------+
SELECT matches_term('错误error日志', 'error') as result;
+--------+
| result |
+--------+
| true |
+--------+
-- Test complete word matching
CREATE TABLE logs (
`id` TIMESTAMP TIME INDEX,

View File

@@ -47,6 +47,16 @@ SELECT matches_term('café>', 'café') as result;
-- Expect: true
SELECT matches_term('русский!', 'русский') as result;
-- Phase 1 mixed Chinese and numeric behavior
SELECT matches_term('登录手机号18888888888的动态key', '手机号') as result;
SELECT matches_term('登录手机号18888888888的动态key', '18888888888') as result;
SELECT matches_term('登录手机号18888888888的动态key', '手机') as result;
SELECT matches_term('登录手机号18888888888的动态key', '机号') as result;
SELECT matches_term('登录手机号18888888888的动态key', '机号1888') as result;
SELECT matches_term('中国农业银行', '农业') as result;
SELECT matches_term('中国农业银行账号', '行账号') as result;
SELECT matches_term('错误error日志', 'error') as result;
-- Test complete word matching
CREATE TABLE logs (
`id` TIMESTAMP TIME INDEX,