fix: faster jieba (#8158)

* fix: faster jieba

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: also update tantivy and the api

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: better bench follow the copilot review

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: apply comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
This commit is contained in:
yihong
2026-05-25 16:51:40 +08:00
committed by GitHub
parent a25152664b
commit eb264d9adf
5 changed files with 342 additions and 82 deletions

213
Cargo.lock generated
View File

@@ -1321,9 +1321,9 @@ dependencies = [
[[package]]
name = "bitpacking"
version = "0.9.2"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c1d3e2bfd8d06048a179f7b17afc3188effa10385e7b00dc65af6aae732ea92"
checksum = "96a7139abd3d9cebf8cd6f920a389cf3dc9576172e32f4563f188cae3c3eb019"
dependencies = [
"crunchy",
]
@@ -1832,7 +1832,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7d8d1efd5109b9c1cd3b7966bd071cdfb53bb6eb0b22a473a68c2f70a11a1eb"
dependencies = [
"parse-zoneinfo",
"phf_codegen",
"phf_codegen 0.12.1",
"phf_shared 0.12.1",
"uncased",
]
@@ -4380,6 +4380,12 @@ dependencies = [
"tracing",
]
[[package]]
name = "datasketches"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c286de4e81ea2590afc24d754e0f83810c566f50a1388fa75ebd57928c0d9745"
[[package]]
name = "datatypes"
version = "1.1.0"
@@ -5486,12 +5492,12 @@ dependencies = [
[[package]]
name = "fs4"
version = "0.8.4"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8"
checksum = "8640e34b88f7652208ce9e88b1a37a2ae95227d84abec377ccd3c5cfeb141ed4"
dependencies = [
"rustix 0.38.44",
"windows-sys 0.52.0",
"rustix 1.0.7",
"windows-sys 0.59.0",
]
[[package]]
@@ -6564,27 +6570,37 @@ checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed"
[[package]]
name = "include-flate"
version = "0.3.0"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e"
checksum = "23e233413926ef735f7d87024466cfda5a4b87467730846bd82ea7d504121347"
dependencies = [
"include-flate-codegen",
"lazy_static",
"libflate",
"include-flate-compress",
]
[[package]]
name = "include-flate-codegen"
version = "0.2.0"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7"
checksum = "5e7148f24ef8922cc0e5574ebb908729ccdd3a110c440a45165733fedadd9969"
dependencies = [
"libflate",
"include-flate-compress",
"proc-macro-error2",
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "include-flate-compress"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74783a9ed407e844e99d5e7a57bd650acbfa124cf6e97ffd790ba59d8ab8e7ff"
dependencies = [
"libflate",
"zstd",
]
[[package]]
name = "include_dir"
version = "0.7.4"
@@ -6918,25 +6934,25 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "jieba-macros"
version = "0.8.0"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6105f38f083bb1a79ad523bd32fa0d8ffcb6abd2fc4da9da203c32bca5b6ace3"
checksum = "661344b2412fb00aee1841d2405c9a31f7c91cf6e578a8e953647c43dd1a8b0a"
dependencies = [
"phf_codegen",
"phf_codegen 0.13.1",
]
[[package]]
name = "jieba-rs"
version = "0.8.0"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47982a320106da83b0c5d6aec0fb83e109f0132b69670b063adaa6fa5b4f3f4a"
checksum = "d7ef90d6209fcff084a01b488c4199d882e3764b15ff0e7a6b5d7efaa46e1e4f"
dependencies = [
"cedarwood",
"fxhash",
"include-flate",
"jieba-macros",
"phf 0.12.1",
"phf 0.13.1",
"regex",
"rustc-hash 2.1.1",
]
[[package]]
@@ -7483,25 +7499,25 @@ checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"
[[package]]
name = "libflate"
version = "2.1.0"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e"
checksum = "cd96e993e5f3368b0cb8497dae6c860c22af8ff18388c61c6c0b86c58d86b5df"
dependencies = [
"adler32",
"core2",
"crc32fast",
"dary_heap",
"libflate_lz77",
"no_std_io2",
]
[[package]]
name = "libflate_lz77"
version = "2.1.0"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d"
checksum = "ff7a10e427698aef6eef269482776debfef63384d30f13aad39a1a95e0e098fd"
dependencies = [
"core2",
"hashbrown 0.14.5",
"hashbrown 0.16.1",
"no_std_io2",
"rle-decode-fast",
]
@@ -7816,6 +7832,15 @@ dependencies = [
"hashbrown 0.15.4",
]
[[package]]
name = "lru"
version = "0.16.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39"
dependencies = [
"hashbrown 0.16.1",
]
[[package]]
name = "lru-slab"
version = "0.1.2"
@@ -8434,7 +8459,7 @@ dependencies = [
"flate2",
"io-enum",
"libc",
"lru",
"lru 0.12.5",
"mysql_common 0.34.1",
"named_pipe",
"pem",
@@ -8497,7 +8522,7 @@ dependencies = [
"futures-sink",
"futures-util",
"keyed_priority_queue",
"lru",
"lru 0.12.5",
"mysql_common 0.34.1",
"pem",
"percent-encoding",
@@ -8695,6 +8720,15 @@ dependencies = [
"libc",
]
[[package]]
name = "no_std_io2"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "418abd1b6d34fbf6cae440dc874771b0525a604428704c76e48b29a5e67b8003"
dependencies = [
"memchr",
]
[[package]]
name = "nohash"
version = "0.2.0"
@@ -9635,6 +9669,15 @@ dependencies = [
"serde",
]
[[package]]
name = "ordered-float"
version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7d950ca161dc355eaf28f82b11345ed76c6e1f6eb1f4f4479e0323b9e2fbd0e"
dependencies = [
"num-traits",
]
[[package]]
name = "ordered-multimap"
version = "0.4.3"
@@ -10122,6 +10165,15 @@ checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7"
dependencies = [
"phf_macros",
"phf_shared 0.12.1",
]
[[package]]
name = "phf"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
dependencies = [
"phf_shared 0.13.1",
"serde",
]
@@ -10131,10 +10183,20 @@ version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "efbdcb6f01d193b17f0b9c3360fa7e0e620991b193ff08702f78b3ce365d7e61"
dependencies = [
"phf_generator",
"phf_generator 0.12.1",
"phf_shared 0.12.1",
]
[[package]]
name = "phf_codegen"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
dependencies = [
"phf_generator 0.13.1",
"phf_shared 0.13.1",
]
[[package]]
name = "phf_generator"
version = "0.12.1"
@@ -10145,13 +10207,23 @@ dependencies = [
"phf_shared 0.12.1",
]
[[package]]
name = "phf_generator"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
dependencies = [
"fastrand",
"phf_shared 0.13.1",
]
[[package]]
name = "phf_macros"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d713258393a82f091ead52047ca779d37e5766226d009de21696c4e667044368"
dependencies = [
"phf_generator",
"phf_generator 0.12.1",
"phf_shared 0.12.1",
"proc-macro2",
"quote",
@@ -10178,6 +10250,15 @@ dependencies = [
"uncased",
]
[[package]]
name = "phf_shared"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project"
version = "1.1.10"
@@ -11415,16 +11496,6 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba"
[[package]]
name = "rand_distr"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
dependencies = [
"num-traits",
"rand 0.8.5",
]
[[package]]
name = "rand_xorshift"
version = "0.4.0"
@@ -12961,9 +13032,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
[[package]]
name = "sketches-ddsketch"
version = "0.3.0"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a"
checksum = "05e40b6cf54d988dc1a2223531b969c9a9e30906ad90ef64890c27b4bfbb46ea"
dependencies = [
"serde",
]
@@ -13864,9 +13935,9 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
[[package]]
name = "tantivy"
version = "0.24.2"
version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64a966cb0e76e311f09cf18507c9af192f15d34886ee43d7ba7c7e3803660c43"
checksum = "edde6a10743fff00a4e1a8c9ef020bf5f3cbad301b7d2d39f2b07f123c4eac07"
dependencies = [
"aho-corasick",
"arc-swap",
@@ -13877,17 +13948,17 @@ dependencies = [
"census",
"crc32fast",
"crossbeam-channel",
"datasketches",
"downcast-rs",
"fastdivide",
"fnv",
"fs4",
"htmlescape",
"hyperloglogplus",
"itertools 0.14.0",
"levenshtein_automata",
"log",
"lru",
"lz4_flex 0.11.6",
"lru 0.16.4",
"lz4_flex 0.13.1",
"measure_time",
"memmap2",
"once_cell",
@@ -13910,6 +13981,7 @@ dependencies = [
"tempfile",
"thiserror 2.0.17",
"time",
"typetag",
"uuid",
"winapi",
"zstd",
@@ -13917,18 +13989,18 @@ dependencies = [
[[package]]
name = "tantivy-bitpacker"
version = "0.8.0"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1adc286a39e089ae9938935cd488d7d34f14502544a36607effd2239ff0e2494"
checksum = "4fed3d674429bcd2de5d0a6d1aa5495fed8afd9c5ecce993019caf7615f53fa4"
dependencies = [
"bitpacking",
]
[[package]]
name = "tantivy-columnar"
version = "0.5.0"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6300428e0c104c4f7db6f95b466a6f5c1b9aece094ec57cdd365337908dc7344"
checksum = "c57166f5bcfd478f370ab8445afb4678dce44801fa5ce5c451aaf8595583c5dc"
dependencies = [
"downcast-rs",
"fastdivide",
@@ -13942,9 +14014,9 @@ dependencies = [
[[package]]
name = "tantivy-common"
version = "0.9.0"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91b6ea6090ce03dc72c27d0619e77185d26cc3b20775966c346c6d4f7e99d7f"
checksum = "bbf10915aa75da3c3b0d58b58853d2e889efbaf32d4982a4c3715dde6bba23e5"
dependencies = [
"async-trait",
"byteorder",
@@ -13966,9 +14038,9 @@ dependencies = [
[[package]]
name = "tantivy-jieba"
version = "0.16.0"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b08147cc130e323ecc522117927b198bec617fe1df562a0b6449905858d0363"
checksum = "3392170e86f1c387170aba7d171a466ffdc98a8b55b006e19ac64b123a7b690a"
dependencies = [
"jieba-rs",
"lazy_static",
@@ -13977,20 +14049,22 @@ dependencies = [
[[package]]
name = "tantivy-query-grammar"
version = "0.24.0"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e810cdeeebca57fc3f7bfec5f85fdbea9031b2ac9b990eb5ff49b371d52bbe6a"
checksum = "dfadb8526b6da90704feb293b0701a6aae62ea14983143344be2dc5ce30f1d82"
dependencies = [
"fnv",
"nom 7.1.3",
"ordered-float 5.3.0",
"serde",
"serde_json",
]
[[package]]
name = "tantivy-sstable"
version = "0.5.0"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "709f22c08a4c90e1b36711c1c6cad5ae21b20b093e535b69b18783dd2cb99416"
checksum = "8a2cfc3ac5164cbadc28965ffb145a8f47582a60ae5897859ad8d4316596c606"
dependencies = [
"futures-util",
"itertools 0.14.0",
@@ -14002,20 +14076,19 @@ dependencies = [
[[package]]
name = "tantivy-stacker"
version = "0.5.0"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bcdebb267671311d1e8891fd9d1301803fdb8ad21ba22e0a30d0cab49ba59c1"
checksum = "6cbb051742da9d53ca9e8fff43a9b10e319338b24e2c0e15d0372df19ffeb951"
dependencies = [
"murmurhash32",
"rand_distr",
"tantivy-common",
]
[[package]]
name = "tantivy-tokenizer-api"
version = "0.5.0"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfa942fcee81e213e09715bbce8734ae2180070b97b33839a795ba1de201547d"
checksum = "eac258c2c6390673f2685813afeeafcb8c4e0ee7de8dd3fc46838dcc37263f98"
dependencies = [
"serde",
]
@@ -15018,9 +15091,9 @@ checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"
[[package]]
name = "typetag"
version = "0.2.20"
version = "0.2.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73f22b40dd7bfe8c14230cf9702081366421890435b2d625fa92b4acc4c3de6f"
checksum = "c5a897b12c6c1151ad0b138b8db50252dc301f93bc3b027db05eec82aeed298c"
dependencies = [
"erased-serde",
"inventory",
@@ -15031,9 +15104,9 @@ dependencies = [
[[package]]
name = "typetag-impl"
version = "0.2.20"
version = "0.2.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35f5380909ffc31b4de4f4bdf96b877175a016aa2ca98cee39fcfd8c4d53d952"
checksum = "cf808357c6ed7e13ba0f3277ec8d8f21b2d501274895104263985330c726c1c5"
dependencies = [
"proc-macro2",
"quote",

View File

@@ -26,7 +26,7 @@ fst.workspace = true
futures.workspace = true
greptime-proto.workspace = true
itertools.workspace = true
jieba-rs = "0.8"
jieba-rs = "0.10"
lazy_static.workspace = true
mockall.workspace = true
nalgebra.workspace = true
@@ -40,8 +40,8 @@ serde.workspace = true
serde_json.workspace = true
snafu.workspace = true
store-api.workspace = true
tantivy = { version = "0.24", features = ["zstd-compression"] }
tantivy-jieba = "0.16"
tantivy = { version = "0.26", features = ["zstd-compression"] }
tantivy-jieba = "0.20"
tokio.workspace = true
tokio-util.workspace = true
usearch = { version = "2.21", default-features = false, features = ["fp16lib"], optional = true }

View File

@@ -12,8 +12,79 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use index::fulltext_index::tokenizer::{EnglishTokenizer, Tokenizer};
use std::collections::HashMap;
use std::hint::black_box;
use std::path::PathBuf;
use std::time::Duration;
use async_trait::async_trait;
use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
use futures::AsyncRead;
use index::fulltext_index::create::{FulltextIndexCreator, TantivyFulltextIndexCreator};
use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer};
use index::fulltext_index::{Analyzer, Config};
use puffin::puffin_manager::{PuffinWriter, PutOptions};
/// Sample inputs for `bench_chinese_tokenizer`, stored as `(name, text)` pairs.
///
/// The name is used as the benchmark parameter and the text is what gets
/// tokenized. The `"mixed_log"` entry is additionally looked up by name for the
/// repeated-tokenize benchmark, so it must stay in this table.
const CHINESE_TOKENIZER_TEXTS: &[(&str, &str)] = &[
("short", "登录手机号。中国农业银行。"),
(
"mixed_log",
"2025-08-01 21:09:28 用户登录失败 trace_id=abc_123 dynamic_key=mobile_login 中国农业银行接口返回超时。",
),
(
"product_search",
"哈基米哦南北绿豆,噢马自立曼波。装电视台,中国中央广播电视台。压不缩,笑不活。",
),
(
"long_news",
"中国农业银行发布公告称,手机银行登录服务完成升级。多个地区用户反馈查询速度提升,后台监控显示核心链路延迟下降,异常请求自动重试次数减少。系统继续保留 trace_id、request_id 和 dynamic_key 等字段用于排查问题。",
),
];
/// Documents pushed into the index by `bench_tantivy_chinese_fulltext_index`.
///
/// The benchmark selects entries with `idx % CHINESE_INDEX_DOCS.len()`, so the
/// list is reused cyclically when more documents are requested than it holds.
/// It must not be empty, otherwise that modulo would panic.
const CHINESE_INDEX_DOCS: &[&str] = &[
"登录手机号,中国农业银行手机银行接口返回成功。",
"用户登录失败trace_id=abc_123dynamic_key=mobile_login。",
"中国中央广播电视台发布新的节目预告。",
"装电视台的时候遇到压不缩的问题。",
"哈基米哦南北绿豆,噢马自立曼波。",
"后台监控显示核心链路延迟下降。",
"系统保留 request_id 用于排查问题。",
"中文全文索引需要兼顾召回率和 token 数量。",
];
/// A `PuffinWriter` that stores nothing, used as the write target when the
/// fulltext index benchmark finishes an index.
///
/// Every method that is expected to be called ignores its arguments and
/// returns `Ok(0)` (or does nothing).
struct NoopPuffinWriter;
#[async_trait]
impl PuffinWriter for NoopPuffinWriter {
/// Not expected to be called by the benchmark; panics via `unreachable!`
/// if it ever is.
async fn put_blob<R>(
&mut self,
_key: &str,
_raw_data: R,
_options: PutOptions,
_properties: HashMap<String, String>,
) -> puffin::error::Result<u64>
where
R: AsyncRead + Send,
{
unreachable!("tantivy fulltext benchmark only writes directory blobs")
}
/// Accepts a directory blob, discards it, and returns `Ok(0)`.
// NOTE(review): the `u64` is presumably the number of bytes written — confirm
// against the `PuffinWriter` trait documentation.
async fn put_dir(
&mut self,
_key: &str,
_dir: PathBuf,
_options: PutOptions,
_properties: HashMap<String, String>,
) -> puffin::error::Result<u64> {
Ok(0)
}
/// Ignores the requested footer compression setting.
fn set_footer_lz4_compressed(&mut self, _lz4_compressed: bool) {}
/// Consumes the writer without writing anything and returns `Ok(0)`.
async fn finish(self) -> puffin::error::Result<u64> {
Ok(0)
}
}
fn bench_english_tokenizer(c: &mut Criterion) {
let tokenizer = EnglishTokenizer;
@@ -86,5 +157,104 @@ fn bench_english_tokenizer(c: &mut Criterion) {
repeat_group.finish();
}
criterion_group!(benches, bench_english_tokenizer);
/// Benchmarks `ChineseTokenizer::tokenize` in two groups.
///
/// * `chinese_tokenizer`: one `tokenize/<name>` benchmark per entry of
///   `CHINESE_TOKENIZER_TEXTS`, with throughput reported in input bytes.
/// * `chinese_tokenizer_repeated`: tokenizes the `"mixed_log"` sample 10, 100
///   and 1000 times per iteration.
///
/// Panics if `CHINESE_TOKENIZER_TEXTS` has no entry named `"mixed_log"`.
fn bench_chinese_tokenizer(c: &mut Criterion) {
    let tokenizer = ChineseTokenizer;

    // Single-call benchmarks, one per named sample.
    let mut per_sample = c.benchmark_group("chinese_tokenizer");
    for &(label, sample) in CHINESE_TOKENIZER_TEXTS {
        per_sample.throughput(Throughput::Bytes(sample.len() as u64));
        per_sample.bench_with_input(
            BenchmarkId::new("tokenize", label),
            &sample,
            |bencher, sample| bencher.iter(|| black_box(tokenizer.tokenize(black_box(*sample)))),
        );
    }
    per_sample.finish();

    // The repeated benchmarks all reuse the same sample, selected by name.
    let mixed_log = CHINESE_TOKENIZER_TEXTS
        .iter()
        .find_map(|&(label, sample)| (label == "mixed_log").then_some(sample))
        .expect("mixed_log sample must exist");

    let mut repeated = c.benchmark_group("chinese_tokenizer_repeated");
    for rounds in [10, 100, 1000] {
        repeated.bench_with_input(
            BenchmarkId::new("repeated_tokenize", rounds),
            &rounds,
            |bencher, &rounds| {
                bencher.iter(|| {
                    (0..rounds).for_each(|_| {
                        black_box(tokenizer.tokenize(black_box(mixed_log)));
                    })
                })
            },
        );
    }
    repeated.finish();
}
/// Benchmarks building and finishing a Tantivy fulltext index configured with
/// the Chinese analyzer (case-insensitive).
///
/// For each document count (32 and 256) an iteration creates a
/// `TantivyFulltextIndexCreator` in a fresh temp dir, pushes that many texts
/// taken cyclically from `CHINESE_INDEX_DOCS`, and finishes the index into a
/// `NoopPuffinWriter`. Throughput is reported in documents per iteration.
///
/// Panics if the Tokio runtime, the temp dir, or any index operation fails.
fn bench_tantivy_chinese_fulltext_index(c: &mut Criterion) {
    let config = Config {
        analyzer: Analyzer::Chinese,
        case_sensitive: false,
    };
    // The creator API is async; drive it with a single-threaded runtime.
    let runtime = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .expect("failed to create Tokio runtime");

    let mut group = c.benchmark_group("tantivy_chinese_fulltext_index");
    group
        .sample_size(10)
        .measurement_time(Duration::from_secs(10));

    for total_docs in [32usize, 256usize] {
        group.throughput(Throughput::Elements(total_docs as u64));
        group.bench_with_input(
            BenchmarkId::new("build_commit", total_docs),
            &total_docs,
            |bencher, &total_docs| {
                bencher.iter_batched(
                    tempfile::tempdir,
                    |workdir| {
                        let workdir = workdir.expect("failed to create temp dir");
                        let build_and_commit = async {
                            // NOTE(review): `64 << 20` is presumably a memory limit in
                            // bytes (64 MiB) — confirm against the creator's signature.
                            let mut index_creator = TantivyFulltextIndexCreator::new(
                                workdir.path(),
                                config,
                                64 << 20,
                            )
                            .await
                            .expect("failed to create tantivy fulltext index");

                            // Reuse the fixed document list cyclically up to `total_docs`.
                            let docs = CHINESE_INDEX_DOCS.iter().copied().cycle().take(total_docs);
                            for doc in docs {
                                index_creator
                                    .push_text(black_box(doc))
                                    .await
                                    .expect("failed to push text");
                            }

                            let mut sink = NoopPuffinWriter;
                            index_creator
                                .finish(
                                    &mut sink,
                                    "tantivy_chinese_fulltext_index",
                                    PutOptions::default(),
                                )
                                .await
                                .expect("failed to commit tantivy fulltext index");
                        };
                        runtime.block_on(build_and_commit);
                        // Hand the temp dir back so Criterion drops it after timing the routine.
                        workdir
                    },
                    BatchSize::SmallInput,
                )
            },
        );
    }
    group.finish();
}
// Register the English tokenizer, Chinese tokenizer, and Tantivy Chinese
// fulltext index benchmarks under the `benches` group, then pass that group to
// `criterion_main!`.
criterion_group!(
benches,
bench_english_tokenizer,
bench_chinese_tokenizer,
bench_tantivy_chinese_fulltext_index
);
criterion_main!(benches);

View File

@@ -52,7 +52,7 @@ impl Config {
fn build_tantivy_tokenizer(&self) -> TokenizerManager {
let mut builder = match self.analyzer {
Analyzer::English => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer {}).dynamic(),
Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer::new()).dynamic(),
};
if !self.case_sensitive {

View File

@@ -98,7 +98,8 @@ impl Tokenizer for ChineseTokenizer {
let mut tokens = JIEBA
.cut_for_search(text, true)
.into_iter()
.filter(|s| is_indexable_token(s))
.map(|token| token.word)
.filter(|token| is_indexable_token(token))
.collect::<Vec<_>>();
let english = EnglishTokenizer {};
@@ -336,10 +337,26 @@ mod tests {
let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。";
let default_tokens = tokenizer.tokenize(text);
let cut_hmm_false = JIEBA.cut(text, false);
let cut_hmm_true = JIEBA.cut(text, true);
let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false);
let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true);
let cut_hmm_false = JIEBA
.cut(text, false)
.into_iter()
.map(|token| token.word)
.collect::<Vec<_>>();
let cut_hmm_true = JIEBA
.cut(text, true)
.into_iter()
.map(|token| token.word)
.collect::<Vec<_>>();
let cut_for_search_hmm_false = JIEBA
.cut_for_search(text, false)
.into_iter()
.map(|token| token.word)
.collect::<Vec<_>>();
let cut_for_search_hmm_true = JIEBA
.cut_for_search(text, true)
.into_iter()
.map(|token| token.word)
.collect::<Vec<_>>();
assert_eq!(
default_tokens,