perf: better jieba cut (#7984)

* perf: better jieba cut

Signed-off-by: discord9 <discord9@163.com>

* fix: also filter pun mark

Signed-off-by: discord9 <discord9@163.com>

* chore

Signed-off-by: discord9 <discord9@163.com>

* docs: explain why

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
This commit is contained in:
discord9
2026-04-17 16:33:29 +08:00
committed by GitHub
parent 8c40374b1a
commit a8540ad39d

View File

@@ -82,7 +82,9 @@ impl Tokenizer for EnglishTokenizer {
/// `ChineseTokenizer` tokenizes a Chinese text.
///
/// It uses Jieba search-mode tokenization to improve recall for Chinese fulltext search.
/// Enabling HMM also helps merge some unknown fragments into larger tokens, which can reduce
/// token cardinality versus a fully fragmented output.
#[derive(Debug, Default)]
pub struct ChineseTokenizer;
@@ -91,7 +93,13 @@ impl Tokenizer for ChineseTokenizer {
if text.is_ascii() {
EnglishTokenizer {}.tokenize(text)
} else {
JIEBA.cut(text, false)
// Search-mode tokenization emits finer-grained searchable terms, while HMM helps
// merge some unknown fragments and avoid excessive token fragmentation.
JIEBA
.cut_for_search(text, true)
.into_iter()
.filter(|s| s.chars().any(|c| c.is_alphanumeric() || c == '_'))
.collect()
}
}
}
@@ -174,19 +182,222 @@ mod tests {
let tokens = tokenizer.tokenize(text);
assert_eq!(
tokens,
vec![
[
"登录",
"手机",
"手机号",
"18888888888",
"",
"动态",
"key",
"",
"829889AC8"
]
);
}
#[test]
fn test_chinese_tokenizer_aggressive_tokenization_probe() {
// Probes how `ChineseTokenizer::tokenize` relates to the four raw Jieba
// cutting modes (`cut` / `cut_for_search`, each with HMM off and on) on the
// same mixed text, so any change to the wrapped mode shows up as a diff here.
//
// NOTE(review): the empty-string entries in the expected arrays below look
// like CJK punctuation tokens that were stripped by the page rendering —
// confirm against the original source before relying on these exact values.
let tokenizer = ChineseTokenizer;
let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。";
let default_tokens = tokenizer.tokenize(text);
let cut_hmm_false = JIEBA.cut(text, false);
let cut_hmm_true = JIEBA.cut(text, true);
let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false);
let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true);
// Default tokenizer output: per the tokenize impl in this commit, this is
// `cut_for_search(text, true)` with non-alphanumeric-only tokens filtered.
assert_eq!(
default_tokens,
[
"哈基米",
"",
"南北",
"绿豆",
"",
"",
"自立",
"曼波",
"登录",
"手机",
"手机号",
"中国",
"农业",
"银行",
"中国农业银行",
"",
"电视",
"电视台",
"中国",
"中央",
"广播",
"电视",
"电视台",
"不缩",
"压不缩",
"",
"不活",
]
);
// Raw `cut` without HMM, kept for comparison against the default output.
assert_eq!(
cut_hmm_false,
[
"",
"",
"",
"",
"南北",
"绿豆",
"",
"",
"",
"自立",
"",
"",
"",
"登录",
"手机号",
"",
"中国农业银行",
"",
"",
"电视台",
"",
"中国",
"中央",
"广播",
"电视台",
"",
"",
"",
"",
"",
"",
"",
"",
""
]
);
// Raw `cut` with HMM enabled (the pre-commit tokenizer used `cut`).
assert_eq!(
cut_hmm_true,
[
"哈基米",
"",
"南北",
"绿豆",
"",
"",
"",
"自立",
"曼波",
"",
"登录",
"手机号",
"",
"中国农业银行",
"",
"",
"电视台",
"",
"中国",
"中央",
"广播",
"电视台",
"",
"压不缩",
"",
"",
"不活",
""
]
);
// `cut_for_search` without HMM: emits sub-words (e.g. 中国/农业/银行) in
// addition to longer dictionary matches.
assert_eq!(
cut_for_search_hmm_false,
[
"",
"",
"",
"",
"南北",
"绿豆",
"",
"",
"",
"自立",
"",
"",
"",
"登录",
"手机",
"手机号",
"",
"中国",
"农业",
"银行",
"中国农业银行",
"",
"",
"电视",
"电视台",
"",
"中国",
"中央",
"广播",
"电视",
"电视台",
"",
"",
"",
"",
"",
"",
"",
"",
""
]
);
// `cut_for_search` with HMM: the mode the new tokenizer wraps; the default
// output above should equal this list minus the filtered-out tokens.
assert_eq!(
cut_for_search_hmm_true,
[
"哈基米",
"",
"南北",
"绿豆",
"",
"",
"",
"自立",
"曼波",
"",
"登录",
"手机",
"手机号",
"",
"中国",
"农业",
"银行",
"中国农业银行",
"",
"",
"电视",
"电视台",
"",
"中国",
"中央",
"广播",
"电视",
"电视台",
"",
"不缩",
"压不缩",
"",
"",
"不活",
""
]
);
}
#[test]
fn test_valid_ascii_token_lookup_table() {
// Test all ASCII values in a single loop