mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-26 01:40:36 +00:00
fix: faster jieba (#8158)
* fix: faster jieba

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: also update tantivy and the api

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: better bench follow the copilot review

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: apply comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
This commit is contained in:
213
Cargo.lock
generated
213
Cargo.lock
generated
@@ -1321,9 +1321,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "bitpacking"
|
||||
version = "0.9.2"
|
||||
version = "0.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c1d3e2bfd8d06048a179f7b17afc3188effa10385e7b00dc65af6aae732ea92"
|
||||
checksum = "96a7139abd3d9cebf8cd6f920a389cf3dc9576172e32f4563f188cae3c3eb019"
|
||||
dependencies = [
|
||||
"crunchy",
|
||||
]
|
||||
@@ -1832,7 +1832,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c7d8d1efd5109b9c1cd3b7966bd071cdfb53bb6eb0b22a473a68c2f70a11a1eb"
|
||||
dependencies = [
|
||||
"parse-zoneinfo",
|
||||
"phf_codegen",
|
||||
"phf_codegen 0.12.1",
|
||||
"phf_shared 0.12.1",
|
||||
"uncased",
|
||||
]
|
||||
@@ -4380,6 +4380,12 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "datasketches"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c286de4e81ea2590afc24d754e0f83810c566f50a1388fa75ebd57928c0d9745"
|
||||
|
||||
[[package]]
|
||||
name = "datatypes"
|
||||
version = "1.1.0"
|
||||
@@ -5486,12 +5492,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "fs4"
|
||||
version = "0.8.4"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8"
|
||||
checksum = "8640e34b88f7652208ce9e88b1a37a2ae95227d84abec377ccd3c5cfeb141ed4"
|
||||
dependencies = [
|
||||
"rustix 0.38.44",
|
||||
"windows-sys 0.52.0",
|
||||
"rustix 1.0.7",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6564,27 +6570,37 @@ checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed"
|
||||
|
||||
[[package]]
|
||||
name = "include-flate"
|
||||
version = "0.3.0"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e"
|
||||
checksum = "23e233413926ef735f7d87024466cfda5a4b87467730846bd82ea7d504121347"
|
||||
dependencies = [
|
||||
"include-flate-codegen",
|
||||
"lazy_static",
|
||||
"libflate",
|
||||
"include-flate-compress",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "include-flate-codegen"
|
||||
version = "0.2.0"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7"
|
||||
checksum = "5e7148f24ef8922cc0e5574ebb908729ccdd3a110c440a45165733fedadd9969"
|
||||
dependencies = [
|
||||
"libflate",
|
||||
"include-flate-compress",
|
||||
"proc-macro-error2",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "include-flate-compress"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "74783a9ed407e844e99d5e7a57bd650acbfa124cf6e97ffd790ba59d8ab8e7ff"
|
||||
dependencies = [
|
||||
"libflate",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "include_dir"
|
||||
version = "0.7.4"
|
||||
@@ -6918,25 +6934,25 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
||||
|
||||
[[package]]
|
||||
name = "jieba-macros"
|
||||
version = "0.8.0"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6105f38f083bb1a79ad523bd32fa0d8ffcb6abd2fc4da9da203c32bca5b6ace3"
|
||||
checksum = "661344b2412fb00aee1841d2405c9a31f7c91cf6e578a8e953647c43dd1a8b0a"
|
||||
dependencies = [
|
||||
"phf_codegen",
|
||||
"phf_codegen 0.13.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jieba-rs"
|
||||
version = "0.8.0"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "47982a320106da83b0c5d6aec0fb83e109f0132b69670b063adaa6fa5b4f3f4a"
|
||||
checksum = "d7ef90d6209fcff084a01b488c4199d882e3764b15ff0e7a6b5d7efaa46e1e4f"
|
||||
dependencies = [
|
||||
"cedarwood",
|
||||
"fxhash",
|
||||
"include-flate",
|
||||
"jieba-macros",
|
||||
"phf 0.12.1",
|
||||
"phf 0.13.1",
|
||||
"regex",
|
||||
"rustc-hash 2.1.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7483,25 +7499,25 @@ checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"
|
||||
|
||||
[[package]]
|
||||
name = "libflate"
|
||||
version = "2.1.0"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e"
|
||||
checksum = "cd96e993e5f3368b0cb8497dae6c860c22af8ff18388c61c6c0b86c58d86b5df"
|
||||
dependencies = [
|
||||
"adler32",
|
||||
"core2",
|
||||
"crc32fast",
|
||||
"dary_heap",
|
||||
"libflate_lz77",
|
||||
"no_std_io2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libflate_lz77"
|
||||
version = "2.1.0"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d"
|
||||
checksum = "ff7a10e427698aef6eef269482776debfef63384d30f13aad39a1a95e0e098fd"
|
||||
dependencies = [
|
||||
"core2",
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.16.1",
|
||||
"no_std_io2",
|
||||
"rle-decode-fast",
|
||||
]
|
||||
|
||||
@@ -7816,6 +7832,15 @@ dependencies = [
|
||||
"hashbrown 0.15.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru"
|
||||
version = "0.16.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39"
|
||||
dependencies = [
|
||||
"hashbrown 0.16.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru-slab"
|
||||
version = "0.1.2"
|
||||
@@ -8434,7 +8459,7 @@ dependencies = [
|
||||
"flate2",
|
||||
"io-enum",
|
||||
"libc",
|
||||
"lru",
|
||||
"lru 0.12.5",
|
||||
"mysql_common 0.34.1",
|
||||
"named_pipe",
|
||||
"pem",
|
||||
@@ -8497,7 +8522,7 @@ dependencies = [
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"keyed_priority_queue",
|
||||
"lru",
|
||||
"lru 0.12.5",
|
||||
"mysql_common 0.34.1",
|
||||
"pem",
|
||||
"percent-encoding",
|
||||
@@ -8695,6 +8720,15 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "no_std_io2"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "418abd1b6d34fbf6cae440dc874771b0525a604428704c76e48b29a5e67b8003"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nohash"
|
||||
version = "0.2.0"
|
||||
@@ -9635,6 +9669,15 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ordered-float"
|
||||
version = "5.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7d950ca161dc355eaf28f82b11345ed76c6e1f6eb1f4f4479e0323b9e2fbd0e"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ordered-multimap"
|
||||
version = "0.4.3"
|
||||
@@ -10122,6 +10165,15 @@ checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7"
|
||||
dependencies = [
|
||||
"phf_macros",
|
||||
"phf_shared 0.12.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
|
||||
dependencies = [
|
||||
"phf_shared 0.13.1",
|
||||
"serde",
|
||||
]
|
||||
|
||||
@@ -10131,10 +10183,20 @@ version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "efbdcb6f01d193b17f0b9c3360fa7e0e620991b193ff08702f78b3ce365d7e61"
|
||||
dependencies = [
|
||||
"phf_generator",
|
||||
"phf_generator 0.12.1",
|
||||
"phf_shared 0.12.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
|
||||
dependencies = [
|
||||
"phf_generator 0.13.1",
|
||||
"phf_shared 0.13.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.12.1"
|
||||
@@ -10145,13 +10207,23 @@ dependencies = [
|
||||
"phf_shared 0.12.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"phf_shared 0.13.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_macros"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d713258393a82f091ead52047ca779d37e5766226d009de21696c4e667044368"
|
||||
dependencies = [
|
||||
"phf_generator",
|
||||
"phf_generator 0.12.1",
|
||||
"phf_shared 0.12.1",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -10178,6 +10250,15 @@ dependencies = [
|
||||
"uncased",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project"
|
||||
version = "1.1.10"
|
||||
@@ -11415,16 +11496,6 @@ version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba"
|
||||
|
||||
[[package]]
|
||||
name = "rand_distr"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_xorshift"
|
||||
version = "0.4.0"
|
||||
@@ -12961,9 +13032,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
|
||||
|
||||
[[package]]
|
||||
name = "sketches-ddsketch"
|
||||
version = "0.3.0"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a"
|
||||
checksum = "05e40b6cf54d988dc1a2223531b969c9a9e30906ad90ef64890c27b4bfbb46ea"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
@@ -13864,9 +13935,9 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
|
||||
|
||||
[[package]]
|
||||
name = "tantivy"
|
||||
version = "0.24.2"
|
||||
version = "0.26.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "64a966cb0e76e311f09cf18507c9af192f15d34886ee43d7ba7c7e3803660c43"
|
||||
checksum = "edde6a10743fff00a4e1a8c9ef020bf5f3cbad301b7d2d39f2b07f123c4eac07"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"arc-swap",
|
||||
@@ -13877,17 +13948,17 @@ dependencies = [
|
||||
"census",
|
||||
"crc32fast",
|
||||
"crossbeam-channel",
|
||||
"datasketches",
|
||||
"downcast-rs",
|
||||
"fastdivide",
|
||||
"fnv",
|
||||
"fs4",
|
||||
"htmlescape",
|
||||
"hyperloglogplus",
|
||||
"itertools 0.14.0",
|
||||
"levenshtein_automata",
|
||||
"log",
|
||||
"lru",
|
||||
"lz4_flex 0.11.6",
|
||||
"lru 0.16.4",
|
||||
"lz4_flex 0.13.1",
|
||||
"measure_time",
|
||||
"memmap2",
|
||||
"once_cell",
|
||||
@@ -13910,6 +13981,7 @@ dependencies = [
|
||||
"tempfile",
|
||||
"thiserror 2.0.17",
|
||||
"time",
|
||||
"typetag",
|
||||
"uuid",
|
||||
"winapi",
|
||||
"zstd",
|
||||
@@ -13917,18 +13989,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-bitpacker"
|
||||
version = "0.8.0"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1adc286a39e089ae9938935cd488d7d34f14502544a36607effd2239ff0e2494"
|
||||
checksum = "4fed3d674429bcd2de5d0a6d1aa5495fed8afd9c5ecce993019caf7615f53fa4"
|
||||
dependencies = [
|
||||
"bitpacking",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-columnar"
|
||||
version = "0.5.0"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6300428e0c104c4f7db6f95b466a6f5c1b9aece094ec57cdd365337908dc7344"
|
||||
checksum = "c57166f5bcfd478f370ab8445afb4678dce44801fa5ce5c451aaf8595583c5dc"
|
||||
dependencies = [
|
||||
"downcast-rs",
|
||||
"fastdivide",
|
||||
@@ -13942,9 +14014,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-common"
|
||||
version = "0.9.0"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e91b6ea6090ce03dc72c27d0619e77185d26cc3b20775966c346c6d4f7e99d7f"
|
||||
checksum = "bbf10915aa75da3c3b0d58b58853d2e889efbaf32d4982a4c3715dde6bba23e5"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -13966,9 +14038,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-jieba"
|
||||
version = "0.16.0"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b08147cc130e323ecc522117927b198bec617fe1df562a0b6449905858d0363"
|
||||
checksum = "3392170e86f1c387170aba7d171a466ffdc98a8b55b006e19ac64b123a7b690a"
|
||||
dependencies = [
|
||||
"jieba-rs",
|
||||
"lazy_static",
|
||||
@@ -13977,20 +14049,22 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.24.0"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e810cdeeebca57fc3f7bfec5f85fdbea9031b2ac9b990eb5ff49b371d52bbe6a"
|
||||
checksum = "dfadb8526b6da90704feb293b0701a6aae62ea14983143344be2dc5ce30f1d82"
|
||||
dependencies = [
|
||||
"fnv",
|
||||
"nom 7.1.3",
|
||||
"ordered-float 5.3.0",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-sstable"
|
||||
version = "0.5.0"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "709f22c08a4c90e1b36711c1c6cad5ae21b20b093e535b69b18783dd2cb99416"
|
||||
checksum = "8a2cfc3ac5164cbadc28965ffb145a8f47582a60ae5897859ad8d4316596c606"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"itertools 0.14.0",
|
||||
@@ -14002,20 +14076,19 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-stacker"
|
||||
version = "0.5.0"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2bcdebb267671311d1e8891fd9d1301803fdb8ad21ba22e0a30d0cab49ba59c1"
|
||||
checksum = "6cbb051742da9d53ca9e8fff43a9b10e319338b24e2c0e15d0372df19ffeb951"
|
||||
dependencies = [
|
||||
"murmurhash32",
|
||||
"rand_distr",
|
||||
"tantivy-common",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-tokenizer-api"
|
||||
version = "0.5.0"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dfa942fcee81e213e09715bbce8734ae2180070b97b33839a795ba1de201547d"
|
||||
checksum = "eac258c2c6390673f2685813afeeafcb8c4e0ee7de8dd3fc46838dcc37263f98"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
@@ -15018,9 +15091,9 @@ checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"
|
||||
|
||||
[[package]]
|
||||
name = "typetag"
|
||||
version = "0.2.20"
|
||||
version = "0.2.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73f22b40dd7bfe8c14230cf9702081366421890435b2d625fa92b4acc4c3de6f"
|
||||
checksum = "c5a897b12c6c1151ad0b138b8db50252dc301f93bc3b027db05eec82aeed298c"
|
||||
dependencies = [
|
||||
"erased-serde",
|
||||
"inventory",
|
||||
@@ -15031,9 +15104,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "typetag-impl"
|
||||
version = "0.2.20"
|
||||
version = "0.2.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "35f5380909ffc31b4de4f4bdf96b877175a016aa2ca98cee39fcfd8c4d53d952"
|
||||
checksum = "cf808357c6ed7e13ba0f3277ec8d8f21b2d501274895104263985330c726c1c5"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
||||
@@ -26,7 +26,7 @@ fst.workspace = true
|
||||
futures.workspace = true
|
||||
greptime-proto.workspace = true
|
||||
itertools.workspace = true
|
||||
jieba-rs = "0.8"
|
||||
jieba-rs = "0.10"
|
||||
lazy_static.workspace = true
|
||||
mockall.workspace = true
|
||||
nalgebra.workspace = true
|
||||
@@ -40,8 +40,8 @@ serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
snafu.workspace = true
|
||||
store-api.workspace = true
|
||||
tantivy = { version = "0.24", features = ["zstd-compression"] }
|
||||
tantivy-jieba = "0.16"
|
||||
tantivy = { version = "0.26", features = ["zstd-compression"] }
|
||||
tantivy-jieba = "0.20"
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
usearch = { version = "2.21", default-features = false, features = ["fp16lib"], optional = true }
|
||||
|
||||
@@ -12,8 +12,79 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
|
||||
use index::fulltext_index::tokenizer::{EnglishTokenizer, Tokenizer};
|
||||
use std::collections::HashMap;
|
||||
use std::hint::black_box;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
|
||||
use futures::AsyncRead;
|
||||
use index::fulltext_index::create::{FulltextIndexCreator, TantivyFulltextIndexCreator};
|
||||
use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer};
|
||||
use index::fulltext_index::{Analyzer, Config};
|
||||
use puffin::puffin_manager::{PuffinWriter, PutOptions};
|
||||
|
||||
/// Named sample inputs for the Chinese tokenizer benchmark, as `(name, text)` pairs.
///
/// The name is used as the Criterion benchmark parameter; the text is the string
/// that gets tokenized. The `"mixed_log"` entry is also looked up by name for the
/// repeated-tokenize benchmark, so it must stay present.
const CHINESE_TOKENIZER_TEXTS: &[(&str, &str)] = &[
|
||||
// Two short, pure-Chinese sentences.
("short", "登录手机号。中国农业银行。"),
|
||||
(
|
||||
// Log-style line mixing a timestamp, ASCII key=value fields and Chinese text.
"mixed_log",
|
||||
"2025-08-01 21:09:28 用户登录失败 trace_id=abc_123 dynamic_key=mobile_login 中国农业银行接口返回超时。",
|
||||
),
|
||||
(
|
||||
"product_search",
|
||||
"哈基米哦南北绿豆,噢马自立曼波。装电视台,中国中央广播电视台。压不缩,笑不活。",
|
||||
),
|
||||
(
|
||||
// Longest sample: a multi-sentence paragraph with a few embedded ASCII identifiers.
"long_news",
|
||||
"中国农业银行发布公告称,手机银行登录服务完成升级。多个地区用户反馈查询速度提升,后台监控显示核心链路延迟下降,异常请求自动重试次数减少。系统继续保留 trace_id、request_id 和 dynamic_key 等字段用于排查问题。",
|
||||
),
|
||||
];
|
||||
|
||||
/// Documents fed to the tantivy fulltext index benchmark.
///
/// The index benchmark selects entries with `idx % CHINESE_INDEX_DOCS.len()`, so
/// the list is cycled when more documents are requested than it contains.
const CHINESE_INDEX_DOCS: &[&str] = &[
|
||||
"登录手机号,中国农业银行手机银行接口返回成功。",
|
||||
"用户登录失败,trace_id=abc_123,dynamic_key=mobile_login。",
|
||||
"中国中央广播电视台发布新的节目预告。",
|
||||
"装电视台的时候遇到压不缩的问题。",
|
||||
"哈基米哦南北绿豆,噢马自立曼波。",
|
||||
"后台监控显示核心链路延迟下降。",
|
||||
"系统保留 request_id 用于排查问题。",
|
||||
"中文全文索引需要兼顾召回率和 token 数量。",
|
||||
];
|
||||
|
||||
/// A `PuffinWriter` that discards everything it is given.
///
/// It lets the index benchmark call the creator's `finish` without writing a
/// puffin file: `put_dir` and `finish` report `Ok(0)`, and `put_blob` panics
/// because the benchmark is not expected to call it.
struct NoopPuffinWriter;
|
||||
|
||||
#[async_trait]
|
||||
impl PuffinWriter for NoopPuffinWriter {
|
||||
/// Never expected to be called by this benchmark; panics via `unreachable!` if it is.
async fn put_blob<R>(
|
||||
&mut self,
|
||||
_key: &str,
|
||||
_raw_data: R,
|
||||
_options: PutOptions,
|
||||
_properties: HashMap<String, String>,
|
||||
) -> puffin::error::Result<u64>
|
||||
where
|
||||
R: AsyncRead + Send,
|
||||
{
|
||||
unreachable!("tantivy fulltext benchmark only writes directory blobs")
|
||||
}
|
||||
|
||||
/// Ignores all arguments and returns `Ok(0)` without touching the directory.
async fn put_dir(
|
||||
&mut self,
|
||||
_key: &str,
|
||||
_dir: PathBuf,
|
||||
_options: PutOptions,
|
||||
_properties: HashMap<String, String>,
|
||||
) -> puffin::error::Result<u64> {
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
/// Does nothing; the flag is ignored because no footer is ever written here.
fn set_footer_lz4_compressed(&mut self, _lz4_compressed: bool) {}
|
||||
|
||||
/// Consumes the writer and returns `Ok(0)`.
async fn finish(self) -> puffin::error::Result<u64> {
|
||||
Ok(0)
|
||||
}
|
||||
}
|
||||
|
||||
fn bench_english_tokenizer(c: &mut Criterion) {
|
||||
let tokenizer = EnglishTokenizer;
|
||||
@@ -86,5 +157,104 @@ fn bench_english_tokenizer(c: &mut Criterion) {
|
||||
repeat_group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_english_tokenizer);
|
||||
/// Benchmarks `ChineseTokenizer::tokenize` in two Criterion groups.
///
/// * `chinese_tokenizer`: one `tokenize/<name>` benchmark per entry of
///   `CHINESE_TOKENIZER_TEXTS`, with throughput reported in bytes of input text.
/// * `chinese_tokenizer_repeated`: tokenizes the `"mixed_log"` sample 10, 100 and
///   1000 times per measured iteration.
///
/// # Panics
///
/// Panics if `CHINESE_TOKENIZER_TEXTS` has no entry named `"mixed_log"`.
fn bench_chinese_tokenizer(c: &mut Criterion) {
|
||||
let tokenizer = ChineseTokenizer;
|
||||
let mut group = c.benchmark_group("chinese_tokenizer");
|
||||
|
||||
for (name, text) in CHINESE_TOKENIZER_TEXTS {
|
||||
// `str::len` is the UTF-8 byte length, so throughput is bytes/s, not characters/s.
group.throughput(Throughput::Bytes(text.len() as u64));
|
||||
group.bench_with_input(BenchmarkId::new("tokenize", name), text, |b, text| {
|
||||
// `black_box` on both input and output keeps the optimizer from eliding the call.
b.iter(|| black_box(tokenizer.tokenize(black_box(text))))
|
||||
});
|
||||
}
|
||||
|
||||
group.finish();
|
||||
|
||||
let mut repeat_group = c.benchmark_group("chinese_tokenizer_repeated");
|
||||
// Look the sample up by name rather than by position in the table.
let sample_text = CHINESE_TOKENIZER_TEXTS
|
||||
.iter()
|
||||
.find(|(name, _)| *name == "mixed_log")
|
||||
.map(|(_, text)| *text)
|
||||
.expect("mixed_log sample must exist");
|
||||
|
||||
for repeat_count in [10, 100, 1000] {
|
||||
repeat_group.bench_with_input(
|
||||
BenchmarkId::new("repeated_tokenize", repeat_count),
|
||||
&repeat_count,
|
||||
|b, &repeat_count| {
|
||||
b.iter(|| {
|
||||
// One measured iteration covers `repeat_count` back-to-back tokenize calls.
for _ in 0..repeat_count {
|
||||
black_box(tokenizer.tokenize(black_box(sample_text)));
|
||||
}
|
||||
})
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
repeat_group.finish();
|
||||
}
|
||||
|
||||
/// Benchmarks building and committing a tantivy fulltext index with the Chinese analyzer.
///
/// For each document count (32 and 256) a `build_commit/<doc_count>` benchmark
/// creates a `TantivyFulltextIndexCreator` in a fresh temporary directory, pushes
/// `doc_count` texts cycled from `CHINESE_INDEX_DOCS`, and calls `finish` with a
/// `NoopPuffinWriter`. Throughput is reported in elements, one per pushed document.
///
/// # Panics
///
/// Panics if the Tokio runtime or the temporary directory cannot be created, or
/// if creating the index, pushing a text, or finishing the index returns an error.
fn bench_tantivy_chinese_fulltext_index(c: &mut Criterion) {
|
||||
// A single-threaded runtime is built once and reused to drive the async creator API.
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create Tokio runtime");
|
||||
let config = Config {
|
||||
analyzer: Analyzer::Chinese,
|
||||
case_sensitive: false,
|
||||
};
|
||||
let mut group = c.benchmark_group("tantivy_chinese_fulltext_index");
|
||||
// Ten samples over a ten-second measurement window per benchmark.
group.sample_size(10);
|
||||
group.measurement_time(Duration::from_secs(10));
|
||||
|
||||
for doc_count in [32usize, 256usize] {
|
||||
group.throughput(Throughput::Elements(doc_count as u64));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("build_commit", doc_count),
|
||||
&doc_count,
|
||||
|b, &doc_count| {
|
||||
b.iter_batched(
|
||||
// Setup: a fresh temp dir per routine call, created outside the timed section.
tempfile::tempdir,
|
||||
|dir| {
|
||||
let dir = dir.expect("failed to create temp dir");
|
||||
runtime.block_on(async {
|
||||
// NOTE(review): the third argument is `64 << 20` (64 MiB if it is a byte
// count); presumably a memory limit — confirm against the creator's signature.
let mut creator =
|
||||
TantivyFulltextIndexCreator::new(dir.path(), config, 64 << 20)
|
||||
.await
|
||||
.expect("failed to create tantivy fulltext index");
|
||||
for idx in 0..doc_count {
|
||||
// Wrap around the fixed document list to reach `doc_count` documents.
let text = CHINESE_INDEX_DOCS[idx % CHINESE_INDEX_DOCS.len()];
|
||||
creator
|
||||
.push_text(black_box(text))
|
||||
.await
|
||||
.expect("failed to push text");
|
||||
}
|
||||
let mut puffin_writer = NoopPuffinWriter;
|
||||
creator
|
||||
.finish(
|
||||
&mut puffin_writer,
|
||||
"tantivy_chinese_fulltext_index",
|
||||
PutOptions::default(),
|
||||
)
|
||||
.await
|
||||
.expect("failed to commit tantivy fulltext index");
|
||||
});
|
||||
// Return the temp dir so Criterion drops it after timing the routine.
|
||||
dir
|
||||
},
|
||||
BatchSize::SmallInput,
|
||||
)
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// Register the three benchmark functions under the `benches` group.
criterion_group!(
|
||||
benches,
|
||||
bench_english_tokenizer,
|
||||
bench_chinese_tokenizer,
|
||||
bench_tantivy_chinese_fulltext_index
|
||||
);
|
||||
// Generate the benchmark binary's entry point, which runs the `benches` group.
criterion_main!(benches);
|
||||
|
||||
@@ -52,7 +52,7 @@ impl Config {
|
||||
fn build_tantivy_tokenizer(&self) -> TokenizerManager {
|
||||
let mut builder = match self.analyzer {
|
||||
Analyzer::English => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
|
||||
Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer {}).dynamic(),
|
||||
Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer::new()).dynamic(),
|
||||
};
|
||||
|
||||
if !self.case_sensitive {
|
||||
|
||||
@@ -98,7 +98,8 @@ impl Tokenizer for ChineseTokenizer {
|
||||
let mut tokens = JIEBA
|
||||
.cut_for_search(text, true)
|
||||
.into_iter()
|
||||
.filter(|s| is_indexable_token(s))
|
||||
.map(|token| token.word)
|
||||
.filter(|token| is_indexable_token(token))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let english = EnglishTokenizer {};
|
||||
@@ -336,10 +337,26 @@ mod tests {
|
||||
let text = "哈基米哦南北绿豆,噢马自立曼波。登录手机号。中国农业银行。装电视台,中国中央广播电视台。压不缩,笑不活。";
|
||||
|
||||
let default_tokens = tokenizer.tokenize(text);
|
||||
let cut_hmm_false = JIEBA.cut(text, false);
|
||||
let cut_hmm_true = JIEBA.cut(text, true);
|
||||
let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false);
|
||||
let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true);
|
||||
let cut_hmm_false = JIEBA
|
||||
.cut(text, false)
|
||||
.into_iter()
|
||||
.map(|token| token.word)
|
||||
.collect::<Vec<_>>();
|
||||
let cut_hmm_true = JIEBA
|
||||
.cut(text, true)
|
||||
.into_iter()
|
||||
.map(|token| token.word)
|
||||
.collect::<Vec<_>>();
|
||||
let cut_for_search_hmm_false = JIEBA
|
||||
.cut_for_search(text, false)
|
||||
.into_iter()
|
||||
.map(|token| token.word)
|
||||
.collect::<Vec<_>>();
|
||||
let cut_for_search_hmm_true = JIEBA
|
||||
.cut_for_search(text, true)
|
||||
.into_iter()
|
||||
.map(|token| token.word)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
assert_eq!(
|
||||
default_tokens,
|
||||
|
||||
Reference in New Issue
Block a user