fix: faster jieba (#8158)

* fix: faster jieba

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: also update tantivy and the api

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: better bench follow the copilot review

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: apply comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
This commit is contained in:
yihong
2026-05-25 16:51:40 +08:00
committed by GitHub
parent a25152664b
commit eb264d9adf
5 changed files with 342 additions and 82 deletions

View File

@@ -26,7 +26,7 @@ fst.workspace = true
futures.workspace = true
greptime-proto.workspace = true
itertools.workspace = true
jieba-rs = "0.8"
jieba-rs = "0.10"
lazy_static.workspace = true
mockall.workspace = true
nalgebra.workspace = true
@@ -40,8 +40,8 @@ serde.workspace = true
serde_json.workspace = true
snafu.workspace = true
store-api.workspace = true
tantivy = { version = "0.24", features = ["zstd-compression"] }
tantivy-jieba = "0.16"
tantivy = { version = "0.26", features = ["zstd-compression"] }
tantivy-jieba = "0.20"
tokio.workspace = true
tokio-util.workspace = true
usearch = { version = "2.21", default-features = false, features = ["fp16lib"], optional = true }

View File

@@ -12,8 +12,79 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use index::fulltext_index::tokenizer::{EnglishTokenizer, Tokenizer};
use std::collections::HashMap;
use std::hint::black_box;
use std::path::PathBuf;
use std::time::Duration;
use async_trait::async_trait;
use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
use futures::AsyncRead;
use index::fulltext_index::create::{FulltextIndexCreator, TantivyFulltextIndexCreator};
use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer};
use index::fulltext_index::{Analyzer, Config};
use puffin::puffin_manager::{PuffinWriter, PutOptions};
/// Sample inputs for the Chinese tokenizer benchmarks, as `(name, text)` pairs.
///
/// The name is used as the Criterion benchmark parameter; the `mixed_log` entry is
/// additionally looked up by name for the repeated-tokenization benchmark.
const CHINESE_TOKENIZER_TEXTS: &[(&str, &str)] = &[
("short", "登录手机号。中国农业银行。"),
(
"mixed_log",
"2025-08-01 21:09:28 用户登录失败 trace_id=abc_123 dynamic_key=mobile_login 中国农业银行接口返回超时。",
),
(
"product_search",
"哈基米哦南北绿豆,噢马自立曼波。装电视台,中国中央广播电视台。压不缩,笑不活。",
),
(
"long_news",
"中国农业银行发布公告称,手机银行登录服务完成升级。多个地区用户反馈查询速度提升,后台监控显示核心链路延迟下降,异常请求自动重试次数减少。系统继续保留 trace_id、request_id 和 dynamic_key 等字段用于排查问题。",
),
];
/// Documents fed to the Tantivy full-text index benchmark.
///
/// The benchmark cycles through this list (index modulo length) until it has pushed
/// the requested number of documents.
const CHINESE_INDEX_DOCS: &[&str] = &[
"登录手机号,中国农业银行手机银行接口返回成功。",
"用户登录失败trace_id=abc_123dynamic_key=mobile_login。",
"中国中央广播电视台发布新的节目预告。",
"装电视台的时候遇到压不缩的问题。",
"哈基米哦南北绿豆,噢马自立曼波。",
"后台监控显示核心链路延迟下降。",
"系统保留 request_id 用于排查问题。",
"中文全文索引需要兼顾召回率和 token 数量。",
];
/// A [`PuffinWriter`] stub used by the full-text index benchmark.
///
/// It accepts directory writes and `finish` without doing any work, so the benchmark
/// can drive an index creator to completion without a real puffin file.
struct NoopPuffinWriter;
#[async_trait]
impl PuffinWriter for NoopPuffinWriter {
/// Never expected to be called: the benchmark only writes directory blobs.
///
/// # Panics
///
/// Always panics via `unreachable!`.
async fn put_blob<R>(
&mut self,
_key: &str,
_raw_data: R,
_options: PutOptions,
_properties: HashMap<String, String>,
) -> puffin::error::Result<u64>
where
R: AsyncRead + Send,
{
unreachable!("tantivy fulltext benchmark only writes directory blobs")
}
/// Ignores every argument (the directory is not read) and returns `Ok(0)`.
// NOTE(review): the returned `u64` is presumably a written-bytes count — confirm
// against the `PuffinWriter` trait documentation.
async fn put_dir(
&mut self,
_key: &str,
_dir: PathBuf,
_options: PutOptions,
_properties: HashMap<String, String>,
) -> puffin::error::Result<u64> {
Ok(0)
}
/// No-op: the flag is discarded because nothing is ever written.
fn set_footer_lz4_compressed(&mut self, _lz4_compressed: bool) {}
/// Consumes the writer and returns `Ok(0)` without performing any I/O.
async fn finish(self) -> puffin::error::Result<u64> {
Ok(0)
}
}
fn bench_english_tokenizer(c: &mut Criterion) {
let tokenizer = EnglishTokenizer;
@@ -86,5 +157,104 @@ fn bench_english_tokenizer(c: &mut Criterion) {
repeat_group.finish();
}
criterion_group!(benches, bench_english_tokenizer);
/// Benchmarks [`ChineseTokenizer`] in two Criterion groups.
///
/// * `chinese_tokenizer` tokenizes each entry of `CHINESE_TOKENIZER_TEXTS` once per
///   iteration and reports throughput in bytes of input text.
/// * `chinese_tokenizer_repeated` tokenizes the `mixed_log` sample 10, 100 and 1000
///   times per iteration.
///
/// # Panics
///
/// Panics if `CHINESE_TOKENIZER_TEXTS` has no entry named `mixed_log`.
fn bench_chinese_tokenizer(c: &mut Criterion) {
    let tokenizer = ChineseTokenizer;

    // One benchmark per labelled sample, with throughput sized to that sample.
    let mut single_pass = c.benchmark_group("chinese_tokenizer");
    for (label, sample) in CHINESE_TOKENIZER_TEXTS {
        let input_bytes = sample.len() as u64;
        single_pass.throughput(Throughput::Bytes(input_bytes));
        single_pass.bench_with_input(
            BenchmarkId::new("tokenize", label),
            sample,
            |bencher, sample| bencher.iter(|| black_box(tokenizer.tokenize(black_box(sample)))),
        );
    }
    single_pass.finish();

    // Repeated tokenization of a single sample at increasing repetition counts.
    let mut repeated = c.benchmark_group("chinese_tokenizer_repeated");
    let mixed_log = CHINESE_TOKENIZER_TEXTS
        .iter()
        .find_map(|&(label, sample)| (label == "mixed_log").then_some(sample))
        .expect("mixed_log sample must exist");
    for rounds in [10, 100, 1000] {
        repeated.bench_with_input(
            BenchmarkId::new("repeated_tokenize", rounds),
            &rounds,
            |bencher, &rounds| {
                bencher.iter(|| {
                    (0..rounds).for_each(|_| {
                        black_box(tokenizer.tokenize(black_box(mixed_log)));
                    })
                })
            },
        );
    }
    repeated.finish();
}
/// Benchmarks building and committing a Tantivy full-text index over Chinese text.
///
/// For each document count (32 and 256), every iteration receives a fresh temporary
/// directory, pushes that many documents drawn cyclically from `CHINESE_INDEX_DOCS`
/// into a [`TantivyFulltextIndexCreator`] configured with [`Analyzer::Chinese`]
/// (case-insensitive), and finishes the creator against a [`NoopPuffinWriter`].
/// Throughput is reported in documents (elements) per iteration.
///
/// # Panics
///
/// Panics if the Tokio runtime or the temporary directory cannot be created, or if
/// creating, filling or finishing the index fails.
fn bench_tantivy_chinese_fulltext_index(c: &mut Criterion) {
    // The creator API is async, so each iteration is driven through `block_on`.
    let runtime = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .expect("failed to create Tokio runtime");
    let config = Config {
        analyzer: Analyzer::Chinese,
        case_sensitive: false,
    };

    let mut group = c.benchmark_group("tantivy_chinese_fulltext_index");
    group.sample_size(10);
    group.measurement_time(Duration::from_secs(10));

    for doc_count in [32usize, 256usize] {
        group.throughput(Throughput::Elements(doc_count as u64));
        group.bench_with_input(
            BenchmarkId::new("build_commit", doc_count),
            &doc_count,
            |bencher, &doc_count| {
                bencher.iter_batched(
                    tempfile::tempdir,
                    |temp_dir| {
                        let temp_dir = temp_dir.expect("failed to create temp dir");
                        runtime.block_on(async {
                            // 64 << 20 == 64 MiB.
                            // NOTE(review): presumably the creator's memory budget — confirm.
                            let mut creator = TantivyFulltextIndexCreator::new(
                                temp_dir.path(),
                                config,
                                64 << 20,
                            )
                            .await
                            .expect("failed to create tantivy fulltext index");

                            // Wrap around the fixed corpus until `doc_count` docs are pushed.
                            let docs = CHINESE_INDEX_DOCS.iter().copied().cycle().take(doc_count);
                            for doc in docs {
                                creator
                                    .push_text(black_box(doc))
                                    .await
                                    .expect("failed to push text");
                            }

                            let mut sink = NoopPuffinWriter;
                            creator
                                .finish(
                                    &mut sink,
                                    "tantivy_chinese_fulltext_index",
                                    PutOptions::default(),
                                )
                                .await
                                .expect("failed to commit tantivy fulltext index");
                        });
                        // Hand the temp dir back so Criterion drops it outside the timed routine.
                        temp_dir
                    },
                    BatchSize::SmallInput,
                )
            },
        );
    }
    group.finish();
}
// Collect the English tokenizer, Chinese tokenizer and Tantivy index benchmarks into
// the `benches` group, then hand that group to `criterion_main!`.
criterion_group!(
benches,
bench_english_tokenizer,
bench_chinese_tokenizer,
bench_tantivy_chinese_fulltext_index
);
criterion_main!(benches);

View File

@@ -52,7 +52,7 @@ impl Config {
fn build_tantivy_tokenizer(&self) -> TokenizerManager {
let mut builder = match self.analyzer {
Analyzer::English => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer {}).dynamic(),
Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer::new()).dynamic(),
};
if !self.case_sensitive {

View File

@@ -98,7 +98,8 @@ impl Tokenizer for ChineseTokenizer {
let mut tokens = JIEBA
.cut_for_search(text, true)
.into_iter()
.filter(|s| is_indexable_token(s))
.map(|token| token.word)
.filter(|token| is_indexable_token(token))
.collect::<Vec<_>>();
let english = EnglishTokenizer {};
@@ -336,10 +337,26 @@ mod tests {
let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。";
let default_tokens = tokenizer.tokenize(text);
let cut_hmm_false = JIEBA.cut(text, false);
let cut_hmm_true = JIEBA.cut(text, true);
let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false);
let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true);
let cut_hmm_false = JIEBA
.cut(text, false)
.into_iter()
.map(|token| token.word)
.collect::<Vec<_>>();
let cut_hmm_true = JIEBA
.cut(text, true)
.into_iter()
.map(|token| token.word)
.collect::<Vec<_>>();
let cut_for_search_hmm_false = JIEBA
.cut_for_search(text, false)
.into_iter()
.map(|token| token.word)
.collect::<Vec<_>>();
let cut_for_search_hmm_true = JIEBA
.cut_for_search(text, true)
.into_iter()
.map(|token| token.word)
.collect::<Vec<_>>();
assert_eq!(
default_tokens,