mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-28 10:50:39 +00:00
fix: faster jieba (#8158)
* fix: faster jieba
  Signed-off-by: yihong0618 <zouzou0208@gmail.com>
* fix: also update tantivy and the api
  Signed-off-by: yihong0618 <zouzou0208@gmail.com>
* fix: better bench follow the copilot review
  Signed-off-by: yihong0618 <zouzou0208@gmail.com>
* fix: apply comments
  Signed-off-by: yihong0618 <zouzou0208@gmail.com>
---------
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
This commit is contained in:
@@ -26,7 +26,7 @@ fst.workspace = true
|
||||
futures.workspace = true
|
||||
greptime-proto.workspace = true
|
||||
itertools.workspace = true
|
||||
jieba-rs = "0.8"
|
||||
jieba-rs = "0.10"
|
||||
lazy_static.workspace = true
|
||||
mockall.workspace = true
|
||||
nalgebra.workspace = true
|
||||
@@ -40,8 +40,8 @@ serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
snafu.workspace = true
|
||||
store-api.workspace = true
|
||||
tantivy = { version = "0.24", features = ["zstd-compression"] }
|
||||
tantivy-jieba = "0.16"
|
||||
tantivy = { version = "0.26", features = ["zstd-compression"] }
|
||||
tantivy-jieba = "0.20"
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
usearch = { version = "2.21", default-features = false, features = ["fp16lib"], optional = true }
|
||||
|
||||
@@ -12,8 +12,79 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
|
||||
use index::fulltext_index::tokenizer::{EnglishTokenizer, Tokenizer};
|
||||
use std::collections::HashMap;
|
||||
use std::hint::black_box;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
|
||||
use futures::AsyncRead;
|
||||
use index::fulltext_index::create::{FulltextIndexCreator, TantivyFulltextIndexCreator};
|
||||
use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer};
|
||||
use index::fulltext_index::{Analyzer, Config};
|
||||
use puffin::puffin_manager::{PuffinWriter, PutOptions};
|
||||
|
||||
/// Labelled sample inputs for the Chinese tokenizer benchmarks.
///
/// Each entry is a `(label, text)` pair: the label is used as the benchmark
/// parameter name and the text is the string handed to the tokenizer. The
/// `"mixed_log"` entry is additionally looked up by label for the repeated
/// tokenization benchmark, so that label must stay present.
const CHINESE_TOKENIZER_TEXTS: &[(&str, &str)] = &[
|
||||
("short", "登录手机号。中国农业银行。"),
|
||||
(
|
||||
"mixed_log",
|
||||
"2025-08-01 21:09:28 用户登录失败 trace_id=abc_123 dynamic_key=mobile_login 中国农业银行接口返回超时。",
|
||||
),
|
||||
(
|
||||
"product_search",
|
||||
"哈基米哦南北绿豆,噢马自立曼波。装电视台,中国中央广播电视台。压不缩,笑不活。",
|
||||
),
|
||||
(
|
||||
"long_news",
|
||||
"中国农业银行发布公告称,手机银行登录服务完成升级。多个地区用户反馈查询速度提升,后台监控显示核心链路延迟下降,异常请求自动重试次数减少。系统继续保留 trace_id、request_id 和 dynamic_key 等字段用于排查问题。",
|
||||
),
|
||||
];
|
||||
|
||||
/// Documents pushed into the tantivy full-text index benchmark.
///
/// The benchmark cycles through this list (index modulo its length) until the
/// requested document count is reached, so the list must not be empty.
const CHINESE_INDEX_DOCS: &[&str] = &[
|
||||
"登录手机号,中国农业银行手机银行接口返回成功。",
|
||||
"用户登录失败,trace_id=abc_123,dynamic_key=mobile_login。",
|
||||
"中国中央广播电视台发布新的节目预告。",
|
||||
"装电视台的时候遇到压不缩的问题。",
|
||||
"哈基米哦南北绿豆,噢马自立曼波。",
|
||||
"后台监控显示核心链路延迟下降。",
|
||||
"系统保留 request_id 用于排查问题。",
|
||||
"中文全文索引需要兼顾召回率和 token 数量。",
|
||||
];
|
||||
|
||||
struct NoopPuffinWriter;
|
||||
|
||||
#[async_trait]
|
||||
impl PuffinWriter for NoopPuffinWriter {
|
||||
async fn put_blob<R>(
|
||||
&mut self,
|
||||
_key: &str,
|
||||
_raw_data: R,
|
||||
_options: PutOptions,
|
||||
_properties: HashMap<String, String>,
|
||||
) -> puffin::error::Result<u64>
|
||||
where
|
||||
R: AsyncRead + Send,
|
||||
{
|
||||
unreachable!("tantivy fulltext benchmark only writes directory blobs")
|
||||
}
|
||||
|
||||
async fn put_dir(
|
||||
&mut self,
|
||||
_key: &str,
|
||||
_dir: PathBuf,
|
||||
_options: PutOptions,
|
||||
_properties: HashMap<String, String>,
|
||||
) -> puffin::error::Result<u64> {
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
fn set_footer_lz4_compressed(&mut self, _lz4_compressed: bool) {}
|
||||
|
||||
async fn finish(self) -> puffin::error::Result<u64> {
|
||||
Ok(0)
|
||||
}
|
||||
}
|
||||
|
||||
fn bench_english_tokenizer(c: &mut Criterion) {
|
||||
let tokenizer = EnglishTokenizer;
|
||||
@@ -86,5 +157,104 @@ fn bench_english_tokenizer(c: &mut Criterion) {
|
||||
repeat_group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_english_tokenizer);
|
||||
fn bench_chinese_tokenizer(c: &mut Criterion) {
|
||||
let tokenizer = ChineseTokenizer;
|
||||
let mut group = c.benchmark_group("chinese_tokenizer");
|
||||
|
||||
for (name, text) in CHINESE_TOKENIZER_TEXTS {
|
||||
group.throughput(Throughput::Bytes(text.len() as u64));
|
||||
group.bench_with_input(BenchmarkId::new("tokenize", name), text, |b, text| {
|
||||
b.iter(|| black_box(tokenizer.tokenize(black_box(text))))
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
|
||||
let mut repeat_group = c.benchmark_group("chinese_tokenizer_repeated");
|
||||
let sample_text = CHINESE_TOKENIZER_TEXTS
|
||||
.iter()
|
||||
.find(|(name, _)| *name == "mixed_log")
|
||||
.map(|(_, text)| *text)
|
||||
.expect("mixed_log sample must exist");
|
||||
|
||||
for repeat_count in [10, 100, 1000] {
|
||||
repeat_group.bench_with_input(
|
||||
BenchmarkId::new("repeated_tokenize", repeat_count),
|
||||
&repeat_count,
|
||||
|b, &repeat_count| {
|
||||
b.iter(|| {
|
||||
for _ in 0..repeat_count {
|
||||
black_box(tokenizer.tokenize(black_box(sample_text)));
|
||||
}
|
||||
})
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
repeat_group.finish();
|
||||
}
|
||||
|
||||
fn bench_tantivy_chinese_fulltext_index(c: &mut Criterion) {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create Tokio runtime");
|
||||
let config = Config {
|
||||
analyzer: Analyzer::Chinese,
|
||||
case_sensitive: false,
|
||||
};
|
||||
let mut group = c.benchmark_group("tantivy_chinese_fulltext_index");
|
||||
group.sample_size(10);
|
||||
group.measurement_time(Duration::from_secs(10));
|
||||
|
||||
for doc_count in [32usize, 256usize] {
|
||||
group.throughput(Throughput::Elements(doc_count as u64));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("build_commit", doc_count),
|
||||
&doc_count,
|
||||
|b, &doc_count| {
|
||||
b.iter_batched(
|
||||
tempfile::tempdir,
|
||||
|dir| {
|
||||
let dir = dir.expect("failed to create temp dir");
|
||||
runtime.block_on(async {
|
||||
let mut creator =
|
||||
TantivyFulltextIndexCreator::new(dir.path(), config, 64 << 20)
|
||||
.await
|
||||
.expect("failed to create tantivy fulltext index");
|
||||
for idx in 0..doc_count {
|
||||
let text = CHINESE_INDEX_DOCS[idx % CHINESE_INDEX_DOCS.len()];
|
||||
creator
|
||||
.push_text(black_box(text))
|
||||
.await
|
||||
.expect("failed to push text");
|
||||
}
|
||||
let mut puffin_writer = NoopPuffinWriter;
|
||||
creator
|
||||
.finish(
|
||||
&mut puffin_writer,
|
||||
"tantivy_chinese_fulltext_index",
|
||||
PutOptions::default(),
|
||||
)
|
||||
.await
|
||||
.expect("failed to commit tantivy fulltext index");
|
||||
});
|
||||
// Return the temp dir so Criterion drops it after timing the routine.
|
||||
dir
|
||||
},
|
||||
BatchSize::SmallInput,
|
||||
)
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// Registers the English tokenizer, Chinese tokenizer and tantivy Chinese
// full-text index benchmarks under the `benches` group, then passes that
// group to `criterion_main!`.
criterion_group!(
|
||||
benches,
|
||||
bench_english_tokenizer,
|
||||
bench_chinese_tokenizer,
|
||||
bench_tantivy_chinese_fulltext_index
|
||||
);
|
||||
criterion_main!(benches);
|
||||
|
||||
@@ -52,7 +52,7 @@ impl Config {
|
||||
fn build_tantivy_tokenizer(&self) -> TokenizerManager {
|
||||
let mut builder = match self.analyzer {
|
||||
Analyzer::English => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
|
||||
Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer {}).dynamic(),
|
||||
Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer::new()).dynamic(),
|
||||
};
|
||||
|
||||
if !self.case_sensitive {
|
||||
|
||||
@@ -98,7 +98,8 @@ impl Tokenizer for ChineseTokenizer {
|
||||
let mut tokens = JIEBA
|
||||
.cut_for_search(text, true)
|
||||
.into_iter()
|
||||
.filter(|s| is_indexable_token(s))
|
||||
.map(|token| token.word)
|
||||
.filter(|token| is_indexable_token(token))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let english = EnglishTokenizer {};
|
||||
@@ -336,10 +337,26 @@ mod tests {
|
||||
let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。";
|
||||
|
||||
let default_tokens = tokenizer.tokenize(text);
|
||||
let cut_hmm_false = JIEBA.cut(text, false);
|
||||
let cut_hmm_true = JIEBA.cut(text, true);
|
||||
let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false);
|
||||
let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true);
|
||||
let cut_hmm_false = JIEBA
|
||||
.cut(text, false)
|
||||
.into_iter()
|
||||
.map(|token| token.word)
|
||||
.collect::<Vec<_>>();
|
||||
let cut_hmm_true = JIEBA
|
||||
.cut(text, true)
|
||||
.into_iter()
|
||||
.map(|token| token.word)
|
||||
.collect::<Vec<_>>();
|
||||
let cut_for_search_hmm_false = JIEBA
|
||||
.cut_for_search(text, false)
|
||||
.into_iter()
|
||||
.map(|token| token.word)
|
||||
.collect::<Vec<_>>();
|
||||
let cut_for_search_hmm_true = JIEBA
|
||||
.cut_for_search(text, true)
|
||||
.into_iter()
|
||||
.map(|token| token.word)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
assert_eq!(
|
||||
default_tokens,
|
||||
|
||||
Reference in New Issue
Block a user