mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-17 21:40:37 +00:00
fix: zh same underscore behavior (#8002)
* fix: zh same underscore behavior Signed-off-by: discord9 <discord9@163.com> * fix: only add token with _ from en analyzer Signed-off-by: discord9 <discord9@163.com> * test: neg sqlness case Signed-off-by: discord9 <discord9@163.com> --------- Signed-off-by: discord9 <discord9@163.com>
This commit is contained in:
@@ -95,15 +95,33 @@ impl Tokenizer for ChineseTokenizer {
|
||||
} else {
|
||||
// Search-mode tokenization emits finer-grained searchable terms, while HMM helps
|
||||
// merge some unknown fragments and avoid excessive token fragmentation.
|
||||
JIEBA
|
||||
let mut tokens = JIEBA
|
||||
.cut_for_search(text, true)
|
||||
.into_iter()
|
||||
.filter(|s| s.chars().any(|c| c.is_alphanumeric() || c == '_'))
|
||||
.collect()
|
||||
.filter(|s| is_indexable_token(s))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let english = EnglishTokenizer {};
|
||||
tokens.extend(
|
||||
english
|
||||
.tokenize(text)
|
||||
.into_iter()
|
||||
.filter(|token| is_ascii_underscore_token(token)),
|
||||
);
|
||||
|
||||
tokens
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` when `token` contains at least one character that is either
/// alphanumeric (as defined by [`char::is_alphanumeric`], so non-ASCII letters
/// and digits count) or an underscore.
///
/// Returns `false` for the empty string and for tokens made up solely of other
/// characters such as whitespace or punctuation.
fn is_indexable_token(token: &str) -> bool {
    for ch in token.chars() {
        if ch == '_' || ch.is_alphanumeric() {
            return true;
        }
    }
    false
}
|
||||
|
||||
/// Returns `true` when `token` consists solely of ASCII characters and contains
/// at least one underscore.
///
/// Tokens containing any non-ASCII character return `false` even if they include
/// an underscore, and so do ASCII tokens without one (including the empty string).
fn is_ascii_underscore_token(token: &str) -> bool {
    // `str::contains` with a `char` pattern is the direct way to ask "is this
    // character present?"; no need to hand-roll it over `chars()`.
    token.is_ascii() && token.contains('_')
}
|
||||
|
||||
/// `Analyzer` analyzes a text into a list of tokens.
|
||||
///
|
||||
/// It uses a `Tokenizer` to tokenize the text and optionally lowercases the tokens.
|
||||
@@ -146,11 +164,26 @@ mod tests {
|
||||
#[test]
|
||||
fn test_english_tokenizer() {
|
||||
let tokenizer = EnglishTokenizer;
|
||||
let text = "Hello, world!!! This is a----++ test012_345+67890";
|
||||
let text = "Hello, world!!! This is a----++ test012_345+67890 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_";
|
||||
let tokens = tokenizer.tokenize(text);
|
||||
assert_eq!(
|
||||
tokens,
|
||||
vec!["Hello", "world", "This", "is", "a", "test012_345", "67890"]
|
||||
vec![
|
||||
"Hello",
|
||||
"world",
|
||||
"This",
|
||||
"is",
|
||||
"a",
|
||||
"test012_345",
|
||||
"67890",
|
||||
"ship_ship",
|
||||
"ship__ship",
|
||||
"_",
|
||||
"__",
|
||||
"__IDENTIFIER__",
|
||||
"_ship",
|
||||
"ship_"
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
@@ -178,11 +211,49 @@ mod tests {
|
||||
#[test]
|
||||
fn test_chinese_tokenizer_issue_7943_sample() {
|
||||
let tokenizer = ChineseTokenizer;
|
||||
let text = "登录手机号18888888888的动态key:829889AC8";
|
||||
let text = "[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship _ __ __IDENTIFIER__ _ship ship_ EOF";
|
||||
let tokens = tokenizer.tokenize(text);
|
||||
|
||||
assert_eq!(
|
||||
tokens,
|
||||
[
|
||||
vec![
|
||||
"2026",
|
||||
"04",
|
||||
"09",
|
||||
"13",
|
||||
"56",
|
||||
"11.031",
|
||||
"2026-04",
|
||||
"09",
|
||||
"13",
|
||||
"56",
|
||||
"11.031",
|
||||
"trace",
|
||||
"_",
|
||||
"id",
|
||||
"340a6a44b0bd8e37bb7697ss7da61ff0",
|
||||
"span",
|
||||
"_",
|
||||
"id",
|
||||
"085ff5ttf1e0a23b",
|
||||
"trace",
|
||||
"_",
|
||||
"flags",
|
||||
"01",
|
||||
"http",
|
||||
"nio-8081",
|
||||
"exec-16",
|
||||
"INFO",
|
||||
"c",
|
||||
"h",
|
||||
"p",
|
||||
"xx",
|
||||
"web",
|
||||
"service",
|
||||
"impl",
|
||||
"CCCXForwardKKKServiceImpl",
|
||||
"pushout",
|
||||
"188",
|
||||
"登录",
|
||||
"手机",
|
||||
"手机号",
|
||||
@@ -190,7 +261,71 @@ mod tests {
|
||||
"的",
|
||||
"动态",
|
||||
"key",
|
||||
"829889AC8"
|
||||
"829889AC8",
|
||||
"ship",
|
||||
"_",
|
||||
"ship",
|
||||
"ship",
|
||||
"__",
|
||||
"ship",
|
||||
"_",
|
||||
"__",
|
||||
"__",
|
||||
"IDENTIFIER",
|
||||
"__",
|
||||
"_",
|
||||
"ship",
|
||||
"ship",
|
||||
"_",
|
||||
"EOF",
|
||||
"trace_id",
|
||||
"span_id",
|
||||
"trace_flags",
|
||||
"ship_ship",
|
||||
"ship__ship",
|
||||
"_",
|
||||
"__",
|
||||
"__IDENTIFIER__",
|
||||
"_ship",
|
||||
"ship_"
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn test_chinese_tokenizer_keeps_ascii_underscore_compounds() {
    // ASCII `key=value` pairs surrounding a Chinese phrase.
    let input = "trace_id=abc 登录手机号 dynamic_key=xyz";
    let produced = ChineseTokenizer.tokenize(input);

    // The underscore-joined ASCII identifiers must be present as whole tokens,
    // and the Chinese words must still be emitted alongside them.
    for expected in ["trace_id", "dynamic_key", "登录", "手机号"] {
        assert!(produced.contains(&expected));
    }
}
|
||||
|
||||
#[test]
fn test_chinese_tokenizer_skips_non_ascii_underscore_tokens() {
    // Only the pure-ASCII `trace_id` is expected as an extra compound token at the
    // end of the list; `登录_id` and `手机号_trace` contain non-ASCII characters and
    // are expected to show up only as their individual pieces.
    let expected = [
        "登录",
        "_",
        "id",
        "trace",
        "_",
        "id",
        "手机",
        "手机号",
        "_",
        "trace",
        "trace_id",
    ];

    let input = "登录_id trace_id 手机号_trace";
    let produced = ChineseTokenizer.tokenize(input);

    assert_eq!(produced, expected);
}
|
||||
|
||||
@@ -222,6 +222,46 @@ SELECT matches_term('错误error日志', 'error') as result;
|
||||
| true |
|
||||
+--------+
|
||||
|
||||
SELECT matches_term('trace_id=abc', 'trace_id') as result;
|
||||
|
||||
+--------+
|
||||
| result |
|
||||
+--------+
|
||||
| true |
|
||||
+--------+
|
||||
|
||||
SELECT matches_term('ship__ship', 'ship__ship') as result;
|
||||
|
||||
+--------+
|
||||
| result |
|
||||
+--------+
|
||||
| true |
|
||||
+--------+
|
||||
|
||||
SELECT matches_term('__IDENTIFIER__', '__IDENTIFIER__') as result;
|
||||
|
||||
+--------+
|
||||
| result |
|
||||
+--------+
|
||||
| true |
|
||||
+--------+
|
||||
|
||||
SELECT matches_term('_ship', '_ship') as result;
|
||||
|
||||
+--------+
|
||||
| result |
|
||||
+--------+
|
||||
| true |
|
||||
+--------+
|
||||
|
||||
SELECT matches_term('ship_', 'ship_') as result;
|
||||
|
||||
+--------+
|
||||
| result |
|
||||
+--------+
|
||||
| true |
|
||||
+--------+
|
||||
|
||||
-- Test complete word matching
|
||||
CREATE TABLE logs (
|
||||
`id` TIMESTAMP TIME INDEX,
|
||||
@@ -377,3 +417,94 @@ DROP TABLE logs;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
CREATE TABLE zh_logs (
|
||||
`id` TIMESTAMP TIME INDEX,
|
||||
`log_message` STRING FULLTEXT INDEX WITH(analyzer = 'Chinese', backend = 'bloom', case_sensitive = 'true', false_positive_rate = '0.01', granularity = '10240')
|
||||
);
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
INSERT INTO zh_logs VALUES
|
||||
(1, '[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_'),
|
||||
(2, '哈基米曼波');
|
||||
|
||||
Affected Rows: 2
|
||||
|
||||
ADMIN flush_table('zh_logs');
|
||||
|
||||
+------------------------------+
|
||||
| ADMIN flush_table('zh_logs') |
|
||||
+------------------------------+
|
||||
| 0 |
|
||||
+------------------------------+
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ 'trace_id';
|
||||
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| id | log_message |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ 'ship_ship';
|
||||
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| id | log_message |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ 'ship__ship';
|
||||
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| id | log_message |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ '__IDENTIFIER__';
|
||||
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| id | log_message |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ '_ship';
|
||||
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| id | log_message |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ 'ship_';
|
||||
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| id | log_message |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ '登录_id';
|
||||
|
||||
++
|
||||
++
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ '手机号_trace';
|
||||
|
||||
++
|
||||
++
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ '手机';
|
||||
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| id | log_message |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| 1970-01-01T00:00:00.001 | [2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_ |
|
||||
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
DROP TABLE zh_logs;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
|
||||
@@ -56,6 +56,11 @@ SELECT matches_term('登录手机号18888888888的动态key', '机号1888') as r
|
||||
SELECT matches_term('中国农业银行', '农业') as result;
|
||||
SELECT matches_term('中国农业银行账号', '行账号') as result;
|
||||
SELECT matches_term('错误error日志', 'error') as result;
|
||||
SELECT matches_term('trace_id=abc', 'trace_id') as result;
|
||||
SELECT matches_term('ship__ship', 'ship__ship') as result;
|
||||
SELECT matches_term('__IDENTIFIER__', '__IDENTIFIER__') as result;
|
||||
SELECT matches_term('_ship', '_ship') as result;
|
||||
SELECT matches_term('ship_', 'ship_') as result;
|
||||
|
||||
-- Test complete word matching
|
||||
CREATE TABLE logs (
|
||||
@@ -150,3 +155,34 @@ FROM logs WHERE `id` BETWEEN 20 AND 23 ORDER BY `id`;
|
||||
SELECT `id`, `log_message`, matches_term(lower(`log_message`), 'warning') as `matches_warning` FROM logs WHERE `id` >= 24 ORDER BY `id`;
|
||||
|
||||
DROP TABLE logs;
|
||||
|
||||
CREATE TABLE zh_logs (
|
||||
`id` TIMESTAMP TIME INDEX,
|
||||
`log_message` STRING FULLTEXT INDEX WITH(analyzer = 'Chinese', backend = 'bloom', case_sensitive = 'true', false_positive_rate = '0.01', granularity = '10240')
|
||||
);
|
||||
|
||||
INSERT INTO zh_logs VALUES
|
||||
(1, '[2026/04/09/ 13:56:11.031]2026-04-09 13:56:11.031 - [ trace_id=340a6a44b0bd8e37bb7697ss7da61ff0 span_id=085ff5ttf1e0a23b trace_flags=01] - [http-nio-8081-exec-16] INFO c.h.p.xx.web.service.impl.CCCXForwardKKKServiceImpl.pushout(188) - 登录手机号18888888888的动态key:829889AC8 ship_ship ship__ship __IDENTIFIER__ _ship ship_'),
|
||||
(2, '哈基米曼波');
|
||||
|
||||
ADMIN flush_table('zh_logs');
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ 'trace_id';
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ 'ship_ship';
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ 'ship__ship';
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ '__IDENTIFIER__';
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ '_ship';
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ 'ship_';
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ '登录_id';
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ '手机号_trace';
|
||||
|
||||
SELECT * FROM zh_logs where `log_message` @@ '手机';
|
||||
|
||||
DROP TABLE zh_logs;
|
||||
|
||||
Reference in New Issue
Block a user