diff --git a/src/index/src/fulltext_index/tokenizer.rs b/src/index/src/fulltext_index/tokenizer.rs index b6694c555f..919c497317 100644 --- a/src/index/src/fulltext_index/tokenizer.rs +++ b/src/index/src/fulltext_index/tokenizer.rs @@ -95,15 +95,33 @@ impl Tokenizer for ChineseTokenizer { } else { // Search-mode tokenization emits finer-grained searchable terms, while HMM helps // merge some unknown fragments and avoid excessive token fragmentation. - JIEBA + let mut tokens = JIEBA .cut_for_search(text, true) .into_iter() - .filter(|s| s.chars().any(|c| c.is_alphanumeric() || c == '_')) - .collect() + .filter(|s| is_indexable_token(s)) + .collect::>(); + + let english = EnglishTokenizer {}; + tokens.extend( + english + .tokenize(text) + .into_iter() + .filter(|token| is_ascii_underscore_token(token)), + ); + + tokens } } } +fn is_indexable_token(token: &str) -> bool { + token.chars().any(|c| c.is_alphanumeric() || c == '_') +} + +fn is_ascii_underscore_token(token: &str) -> bool { + token.is_ascii() && token.chars().any(|c| c == '_') +} + /// `Analyzer` analyzes a text into a list of tokens. /// /// It uses a `Tokenizer` to tokenize the text and optionally lowercases the tokens. @@ -146,11 +164,26 @@ mod tests { #[test] fn test_english_tokenizer() { let tokenizer = EnglishTokenizer; - let text = "Hello, world!!! This is a----++ test012_345+67890"; + let text = "Hello, world!!! This is a----++ test012_345+67890 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_"; let tokens = tokenizer.tokenize(text); assert_eq!( tokens, - vec!["Hello", "world", "This", "is", "a", "test012_345", "67890"] + vec![ + "Hello", + "world", + "This", + "is", + "a", + "test012_345", + "67890", + "ship_ship", + "ship__ship", + "_", + "__", + "__IDENTIFIER__", + "_ship", + "ship_" + ] ); } @@ -178,11 +211,49 @@ mod tests { #[test] fn test_chinese_tokenizer_issue_7943_sample() { let tokenizer = ChineseTokenizer; - let text = "登录手机号18888888888的动态key:829889AC8"; + let text = "[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_ EOF"; let tokens = tokenizer.tokenize(text); + assert_eq!( tokens, - [ + vec![ + "2026", + "04", + "09", + "13", + "56", + "11.031", + "2026-04", + "09", + "13", + "56", + "11.031", + "trace", + "_", + "id", + "340a6a44b0bd8e37bb7697ss7da61ff0", + "span", + "_", + "id", + "085ff5ttf1e0a23b", + "trace", + "_", + "flags", + "01", + "http", + "nio-8081", + "exec-16", + "INFO", + "c", + "h", + "p", + "xx", + "web", + "service", + "impl", + "CCCXForwardKKKServiceImpl", + "pushout", + "188", "登录", "手机", "手机号", @@ -190,7 +261,71 @@ mod tests { "的", "动态", "key", - "829889AC8" + "829889AC8", + "ship", + "_", + "ship", + "ship", + "__", + "ship", + "_", + "__", + "__", + "IDENTIFIER", + "__", + "_", + "ship", + "ship", + "_", + "EOF", + "trace_id", + "span_id", + "trace_flags", + "ship_ship", + "ship__ship", + "_", + "__", + "__IDENTIFIER__", + "_ship", + "ship_" + ] + ); + } + + #[test] + fn test_chinese_tokenizer_keeps_ascii_underscore_compounds() { + let tokenizer = ChineseTokenizer; + let text = "trace_id=abc 登录手机号 dynamic_key=xyz"; + + let tokens = tokenizer.tokenize(text); + + assert!(tokens.contains(&"trace_id")); + assert!(tokens.contains(&"dynamic_key")); + assert!(tokens.contains(&"登录")); + assert!(tokens.contains(&"手机号")); + } + + #[test] + fn test_chinese_tokenizer_skips_non_ascii_underscore_tokens() { + let tokenizer = ChineseTokenizer; + let text = "登录_id trace_id 手机号_trace"; + + let tokens = tokenizer.tokenize(text); + + assert_eq!( + tokens, + [ + "登录", + "_", + "id", + "trace", + "_", + "id", + "手机", + "手机号", + "_", + "trace", + "trace_id" ] ); } diff --git a/tests/cases/standalone/common/function/matches_term.result b/tests/cases/standalone/common/function/matches_term.result index ef1e94fa8b..37ecf5a55f 100644 --- a/tests/cases/standalone/common/function/matches_term.result +++ b/tests/cases/standalone/common/function/matches_term.result @@ -222,6 +222,46 @@ SELECT matches_term('错误error日志', 'error') as result; | true | +--------+ +SELECT matches_term('trace_id=abc', 'trace_id') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +SELECT matches_term('ship__ship', 'ship__ship') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +SELECT matches_term('__IDENTIFIER__', '__IDENTIFIER__') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +SELECT matches_term('_ship', '_ship') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + +SELECT matches_term('ship_', 'ship_') as result; + ++--------+ +| result | ++--------+ +| true | ++--------+ + -- Test complete word matching CREATE TABLE logs ( `id` TIMESTAMP TIME INDEX, @@ -377,3 +417,94 @@ DROP TABLE logs; Affected Rows: 0 +CREATE TABLE zh_logs ( + `id` TIMESTAMP TIME INDEX, + `log_message` STRING FULLTEXT INDEX WITH(analyzer = 'Chinese', backend = 'bloom', case_sensitive = 'true', false_positive_rate = '0.01', granularity = '10240') +); + +Affected Rows: 0 + +INSERT INTO zh_logs VALUES + (1, '[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_'), + (2, '哈基米曼波'); + +Affected Rows: 2 + +ADMIN flush_table('zh_logs'); + ++------------------------------+ +| ADMIN flush_table('zh_logs') | ++------------------------------+ +| 0 | ++------------------------------+ + +SELECT * FROM zh_logs where `log_message` @@ 'trace_id'; + ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| id | log_message | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +SELECT * FROM zh_logs where `log_message` @@ 'ship_ship'; + ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| id | log_message | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +SELECT * FROM zh_logs where `log_message` @@ 'ship__ship'; + ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| id | log_message | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +SELECT * FROM zh_logs where `log_message` @@ '__IDENTIFIER__'; + ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| id | log_message | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +SELECT * FROM zh_logs where `log_message` @@ '_ship'; + ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| id | log_message | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +SELECT * FROM zh_logs where `log_message` @@ 'ship_'; + ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| id | log_message | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +SELECT * FROM zh_logs where `log_message` @@ '登录_id'; + +++ +++ + +SELECT * FROM zh_logs where `log_message` @@ '手机号_trace'; + +++ +++ + +SELECT * FROM zh_logs where `log_message` @@ '手机'; + ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| id | log_message | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ | ++-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +DROP TABLE zh_logs; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/matches_term.sql b/tests/cases/standalone/common/function/matches_term.sql index e01b1451c0..bbffd9cbe9 100644 --- a/tests/cases/standalone/common/function/matches_term.sql +++ b/tests/cases/standalone/common/function/matches_term.sql @@ -56,6 +56,11 @@ SELECT matches_term('登录手机号18888888888的动态key', '机号1888') as r SELECT matches_term('中国农业银行', '农业') as result; SELECT matches_term('中国农业银行账号', '行账号') as result; SELECT matches_term('错误error日志', 'error') as result; +SELECT matches_term('trace_id=abc', 'trace_id') as result; +SELECT matches_term('ship__ship', 'ship__ship') as result; +SELECT matches_term('__IDENTIFIER__', '__IDENTIFIER__') as result; +SELECT matches_term('_ship', '_ship') as result; +SELECT matches_term('ship_', 'ship_') as result; -- Test complete word matching CREATE TABLE logs ( @@ -150,3 +155,34 @@ FROM logs WHERE `id` BETWEEN 20 AND 23 ORDER BY `id`; SELECT `id`, `log_message`, matches_term(lower(`log_message`), 'warning') as `matches_warning` FROM logs WHERE `id` >= 24 ORDER BY `id`; DROP TABLE logs; + +CREATE TABLE zh_logs ( + `id` TIMESTAMP TIME INDEX, + `log_message` STRING FULLTEXT INDEX WITH(analyzer = 'Chinese', backend = 'bloom', case_sensitive = 'true', false_positive_rate = '0.01', granularity = '10240') +); + +INSERT INTO zh_logs VALUES + (1, '[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_'), + (2, '哈基米曼波'); + +ADMIN flush_table('zh_logs'); + +SELECT * FROM zh_logs where `log_message` @@ 'trace_id'; + +SELECT * FROM zh_logs where `log_message` @@ 'ship_ship'; + +SELECT * FROM zh_logs where `log_message` @@ 'ship__ship'; + +SELECT * FROM zh_logs where `log_message` @@ '__IDENTIFIER__'; + +SELECT * FROM zh_logs where `log_message` @@ '_ship'; + +SELECT * FROM zh_logs where `log_message` @@ 'ship_'; + +SELECT * FROM zh_logs where `log_message` @@ '登录_id'; + +SELECT * FROM zh_logs where `log_message` @@ '手机号_trace'; + +SELECT * FROM zh_logs where `log_message` @@ '手机'; + +DROP TABLE zh_logs;