fix: zh same underscore behavior (#8002)

* fix: zh same underscore behavior

Signed-off-by: discord9 <discord9@163.com>

* fix: only add tokens containing `_` from the en analyzer

Signed-off-by: discord9 <discord9@163.com>

* test: neg sqlness case

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
This commit is contained in:
discord9
2026-04-22 11:21:36 +08:00
committed by GitHub
parent b0c093508a
commit 73325acfe4
3 changed files with 310 additions and 8 deletions

View File

@@ -95,15 +95,33 @@ impl Tokenizer for ChineseTokenizer {
} else {
// Search-mode tokenization emits finer-grained searchable terms, while HMM helps
// merge some unknown fragments and avoid excessive token fragmentation.
JIEBA
let mut tokens = JIEBA
.cut_for_search(text, true)
.into_iter()
.filter(|s| s.chars().any(|c| c.is_alphanumeric() || c == '_'))
.collect()
.filter(|s| is_indexable_token(s))
.collect::<Vec<_>>();
let english = EnglishTokenizer {};
tokens.extend(
english
.tokenize(text)
.into_iter()
.filter(|token| is_ascii_underscore_token(token)),
);
tokens
}
}
}
/// Returns `true` when `token` contains at least one character worth indexing.
///
/// A character qualifies if it is Unicode-alphanumeric or an underscore, so
/// tokens made up solely of punctuation or whitespace (and the empty string)
/// are rejected.
fn is_indexable_token(token: &str) -> bool {
    token.contains(|ch: char| ch == '_' || ch.is_alphanumeric())
}
/// Returns `true` when `token` consists only of ASCII characters and contains
/// at least one underscore.
///
/// Tokens that mix an underscore with non-ASCII text (e.g. `登录_id`) and
/// tokens without any underscore both yield `false`.
fn is_ascii_underscore_token(token: &str) -> bool {
    token.contains('_') && token.is_ascii()
}
/// `Analyzer` analyzes a text into a list of tokens.
///
/// It uses a `Tokenizer` to tokenize the text and optionally lowercases the tokens.
@@ -146,11 +164,26 @@ mod tests {
#[test]
fn test_english_tokenizer() {
let tokenizer = EnglishTokenizer;
let text = "Hello, world!!! This is a----++ test012_345+67890";
let text = "Hello, world!!! This is a----++ test012_345+67890 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_";
let tokens = tokenizer.tokenize(text);
assert_eq!(
tokens,
vec!["Hello", "world", "This", "is", "a", "test012_345", "67890"]
vec![
"Hello",
"world",
"This",
"is",
"a",
"test012_345",
"67890",
"ship_ship",
"ship__ship",
"_",
"__",
"__IDENTIFIER__",
"_ship",
"ship_"
]
);
}
@@ -178,11 +211,49 @@ mod tests {
#[test]
fn test_chinese_tokenizer_issue_7943_sample() {
let tokenizer = ChineseTokenizer;
let text = "登录手机号18888888888的动态key829889AC8";
let text = "[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key829889AC8 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_ EOF";
let tokens = tokenizer.tokenize(text);
assert_eq!(
tokens,
[
vec![
"2026",
"04",
"09",
"13",
"56",
"11.031",
"2026-04",
"09",
"13",
"56",
"11.031",
"trace",
"_",
"id",
"340a6a44b0bd8e37bb7697ss7da61ff0",
"span",
"_",
"id",
"085ff5ttf1e0a23b",
"trace",
"_",
"flags",
"01",
"http",
"nio-8081",
"exec-16",
"INFO",
"c",
"h",
"p",
"xx",
"web",
"service",
"impl",
"CCCXForwardKKKServiceImpl",
"pushout",
"188",
"登录",
"手机",
"手机号",
@@ -190,7 +261,71 @@ mod tests {
"",
"动态",
"key",
"829889AC8"
"829889AC8",
"ship",
"_",
"ship",
"ship",
"__",
"ship",
"_",
"__",
"__",
"IDENTIFIER",
"__",
"_",
"ship",
"ship",
"_",
"EOF",
"trace_id",
"span_id",
"trace_flags",
"ship_ship",
"ship__ship",
"_",
"__",
"__IDENTIFIER__",
"_ship",
"ship_"
]
);
}
#[test]
fn test_chinese_tokenizer_keeps_ascii_underscore_compounds() {
    // ASCII identifiers joined by `_` must be present as whole tokens,
    // alongside the segmented Chinese words from the same input.
    let produced = ChineseTokenizer.tokenize("trace_id=abc 登录手机号 dynamic_key=xyz");
    for wanted in ["trace_id", "dynamic_key", "登录", "手机号"] {
        assert!(produced.contains(&wanted));
    }
}
#[test]
fn test_chinese_tokenizer_skips_non_ascii_underscore_tokens() {
    let input = "登录_id trace_id 手机号_trace";
    // The only whole underscore compound in the expected output is the
    // pure-ASCII `trace_id`, listed last; `登录_id` and `手机号_trace`
    // appear only as their split pieces.
    let expected = [
        "登录",
        "_",
        "id",
        "trace",
        "_",
        "id",
        "手机",
        "手机号",
        "_",
        "trace",
        "trace_id",
    ];
    assert_eq!(ChineseTokenizer.tokenize(input), expected);
}

View File

@@ -222,6 +222,46 @@ SELECT matches_term('错误error日志', 'error') as result;
| true |
+--------+
SELECT matches_term('trace_id=abc', 'trace_id') as result;
+--------+
| result |
+--------+
| true |
+--------+
SELECT matches_term('ship__ship', 'ship__ship') as result;
+--------+
| result |
+--------+
| true |
+--------+
SELECT matches_term('__IDENTIFIER__', '__IDENTIFIER__') as result;
+--------+
| result |
+--------+
| true |
+--------+
SELECT matches_term('_ship', '_ship') as result;
+--------+
| result |
+--------+
| true |
+--------+
SELECT matches_term('ship_', 'ship_') as result;
+--------+
| result |
+--------+
| true |
+--------+
-- Test complete word matching
CREATE TABLE logs (
`id` TIMESTAMP TIME INDEX,
@@ -377,3 +417,94 @@ DROP TABLE logs;
Affected Rows: 0
CREATE TABLE zh_logs (
`id` TIMESTAMP TIME INDEX,
`log_message` STRING FULLTEXT INDEX WITH(analyzer = 'Chinese', backend = 'bloom', case_sensitive = 'true', false_positive_rate = '0.01', granularity = '10240')
);
Affected Rows: 0
INSERT INTO zh_logs VALUES
(1, '[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_'),
(2, '哈基米曼波');
Affected Rows: 2
ADMIN flush_table('zh_logs');
+------------------------------+
| ADMIN flush_table('zh_logs') |
+------------------------------+
| 0 |
+------------------------------+
SELECT * FROM zh_logs where `log_message` @@ 'trace_id';
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| id | log_message |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
SELECT * FROM zh_logs where `log_message` @@ 'ship_ship';
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| id | log_message |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
SELECT * FROM zh_logs where `log_message` @@ 'ship__ship';
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| id | log_message |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
SELECT * FROM zh_logs where `log_message` @@ '__IDENTIFIER__';
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| id | log_message |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
SELECT * FROM zh_logs where `log_message` @@ '_ship';
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| id | log_message |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
SELECT * FROM zh_logs where `log_message` @@ 'ship_';
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| id | log_message |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
SELECT * FROM zh_logs where `log_message` @@ '登录_id';
++
++
SELECT * FROM zh_logs where `log_message` @@ '手机号_trace';
++
++
SELECT * FROM zh_logs where `log_message` @@ '手机';
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| id | log_message |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
DROP TABLE zh_logs;
Affected Rows: 0

View File

@@ -56,6 +56,11 @@ SELECT matches_term('登录手机号18888888888的动态key', '机号1888') as r
SELECT matches_term('中国农业银行', '农业') as result;
SELECT matches_term('中国农业银行账号', '行账号') as result;
SELECT matches_term('错误error日志', 'error') as result;
SELECT matches_term('trace_id=abc', 'trace_id') as result;
SELECT matches_term('ship__ship', 'ship__ship') as result;
SELECT matches_term('__IDENTIFIER__', '__IDENTIFIER__') as result;
SELECT matches_term('_ship', '_ship') as result;
SELECT matches_term('ship_', 'ship_') as result;
-- Test complete word matching
CREATE TABLE logs (
@@ -150,3 +155,34 @@ FROM logs WHERE `id` BETWEEN 20 AND 23 ORDER BY `id`;
SELECT `id`, `log_message`, matches_term(lower(`log_message`), 'warning') as `matches_warning` FROM logs WHERE `id` >= 24 ORDER BY `id`;
DROP TABLE logs;
CREATE TABLE zh_logs (
`id` TIMESTAMP TIME INDEX,
`log_message` STRING FULLTEXT INDEX WITH(analyzer = 'Chinese', backend = 'bloom', case_sensitive = 'true', false_positive_rate = '0.01', granularity = '10240')
);
INSERT INTO zh_logs VALUES
(1, '[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_'),
(2, '哈基米曼波');
ADMIN flush_table('zh_logs');
SELECT * FROM zh_logs where `log_message` @@ 'trace_id';
SELECT * FROM zh_logs where `log_message` @@ 'ship_ship';
SELECT * FROM zh_logs where `log_message` @@ 'ship__ship';
SELECT * FROM zh_logs where `log_message` @@ '__IDENTIFIER__';
SELECT * FROM zh_logs where `log_message` @@ '_ship';
SELECT * FROM zh_logs where `log_message` @@ 'ship_';
SELECT * FROM zh_logs where `log_message` @@ '登录_id';
SELECT * FROM zh_logs where `log_message` @@ '手机号_trace';
SELECT * FROM zh_logs where `log_message` @@ '手机';
DROP TABLE zh_logs;