mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-02 08:30:41 +00:00
bench: compare UnicodeSegmenterTokenizer vs alyze UAX#29 tokenizer on Wikipedia
Adds a new criterion benchmark (`tokenizer_compare`) that measures throughput (MiB/s) of two UAX#29 tokenizer implementations on 64 MiB of English Wikipedia, matching alyze's own benchmark methodology. Implementations compared: - UnicodeSegmenterTokenizer: unicode_segmentation::unicode_word_indices() wrapped in tantivy's Tokenizer trait, with LowerCaser + RemoveLongFilter(255) - alyze: hand-rolled DFA with ASCII fast-path, via its Analyzer API Results on this machine: unicode_seg/tokenize_only ~88 MiB/s unicode_seg/full_pipeline ~74 MiB/s alyze/tokenize_only ~359 MiB/s alyze/full_pipeline ~225 MiB/s Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
10
Cargo.toml
10
Cargo.toml
@@ -91,6 +91,12 @@ postcard = { version = "1.0.4", features = [
|
||||
"use-std",
|
||||
], default-features = false }
|
||||
|
||||
alyze = "0.1.3"
|
||||
unicode-segmentation = "1"
|
||||
parquet = "57"
|
||||
ureq = "3"
|
||||
tempfile = "3"
|
||||
|
||||
[target.'cfg(not(windows))'.dev-dependencies]
|
||||
criterion = { version = "0.5", default-features = false }
|
||||
|
||||
@@ -201,3 +207,7 @@ harness = false
|
||||
[[bench]]
|
||||
name = "regex_all_terms"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "tokenizer_compare"
|
||||
harness = false
|
||||
|
||||
259
benches/tokenizer_compare.rs
Normal file
259
benches/tokenizer_compare.rs
Normal file
@@ -0,0 +1,259 @@
|
||||
//! Compares UnicodeSegmenterTokenizer (unicode-segmentation UAX#29) vs alyze (hand-rolled UAX#29 DFA).
|
||||
//!
|
||||
//! Both implement UAX#29 word breaking; the difference is implementation strategy:
|
||||
//! - UnicodeSegmenterTokenizer: `unicode_segmentation::unicode_word_indices()` + tantivy filter chain
|
||||
//! - alyze: custom DFA with ASCII fast-path + ICU for non-ASCII + ReusableBuffer
|
||||
//!
|
||||
//! Corpus: 64 MiB of English Wikipedia (same methodology as alyze's own benchmark).
|
||||
//! First run downloads parquet shards from HuggingFace and caches them under benches/.cache/.
|
||||
//!
|
||||
//! Run with: cargo bench --bench tokenizer_compare
|
||||
|
||||
use std::{
|
||||
fs::File,
|
||||
io::Write as _,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use alyze::{
|
||||
analyze::{AnalysisOptions, Analyzer, ReusableBuffer, TokenizerOptions},
|
||||
uax29,
|
||||
};
|
||||
use criterion::{Criterion, Throughput, criterion_group, criterion_main};
|
||||
use parquet::{
|
||||
file::reader::{FileReader, SerializedFileReader},
|
||||
record::{RowAccessor, reader::RowIter},
|
||||
schema::types::Type,
|
||||
};
|
||||
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, TextAnalyzer, Token, TokenStream, Tokenizer};
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
const TARGET_BYTES: u64 = 64 << 20; // 64 MiB — matches alyze's benchmark
|
||||
const MAX_TOKEN_LEN: usize = 255; // matches UnicodeSegmenterTokenizer's DEFAULT_REMOVE_TOKEN_LENGTH
|
||||
|
||||
// ── UnicodeSegmenterTokenizer ──────────────────────────────────────────────────────────
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
struct UnicodeSegmenterTokenizer;
|
||||
|
||||
struct UnicodeSegmenterTokenStream<'a> {
|
||||
iter: unicode_segmentation::UnicodeWordIndices<'a>,
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl Tokenizer for UnicodeSegmenterTokenizer {
|
||||
type TokenStream<'a> = UnicodeSegmenterTokenStream<'a>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> UnicodeSegmenterTokenStream<'a> {
|
||||
UnicodeSegmenterTokenStream {
|
||||
iter: text.unicode_word_indices(),
|
||||
token: Token::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for UnicodeSegmenterTokenStream<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
if let Some((offset, word)) = self.iter.next() {
|
||||
self.token.offset_from = offset;
|
||||
self.token.offset_to = offset + word.len();
|
||||
self.token.position = self.token.position.wrapping_add(1);
|
||||
self.token.text.clear();
|
||||
self.token.text.push_str(word);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
}
|
||||
}
|
||||
|
||||
// ── Corpus loading (mirrors alyze's wikipedia benchmark) ─────────────────────
|
||||
|
||||
fn cache_dir() -> PathBuf {
|
||||
let dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(".cache/wikipedia");
|
||||
std::fs::create_dir_all(&dir).expect("failed to create cache directory");
|
||||
dir
|
||||
}
|
||||
|
||||
fn parquet_files_and_urls() -> Vec<(String, String)> {
|
||||
(0..41)
|
||||
.map(|i| {
|
||||
let file = format!("train-{i:05}-of-00041.parquet");
|
||||
let url = format!(
|
||||
"https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/{file}?download=true"
|
||||
);
|
||||
(file, url)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn download_and_cache(file_name: &str, url: &str, dir: &Path) -> File {
|
||||
let path = dir.join(file_name);
|
||||
if !path.exists() {
|
||||
println!("downloading '{file_name}' from {url}");
|
||||
let resp = ureq::get(url).call().expect("HTTP request failed");
|
||||
let mut tmp = tempfile::Builder::new()
|
||||
.tempfile_in(dir)
|
||||
.expect("failed to create tempfile");
|
||||
std::io::copy(&mut resp.into_body().into_reader(), &mut tmp)
|
||||
.expect("failed to write response body");
|
||||
tmp.as_file_mut().flush().expect("flush failed");
|
||||
tmp.persist(&path).expect("rename to cache failed");
|
||||
}
|
||||
File::open(&path).expect("failed to open cached parquet file")
|
||||
}
|
||||
|
||||
fn iter_text_rows(reader: Box<dyn FileReader>) -> impl Iterator<Item = String> {
|
||||
let fields = reader.metadata().file_metadata().schema().get_fields().to_vec();
|
||||
let text_fields: Vec<_> = fields.into_iter().filter(|f| f.name() == "text").collect();
|
||||
let proj = Type::group_type_builder("schema")
|
||||
.with_fields(text_fields)
|
||||
.build()
|
||||
.unwrap();
|
||||
RowIter::from_file_into(reader)
|
||||
.project(Some(proj))
|
||||
.unwrap()
|
||||
.map(|r| r.unwrap().get_string(0).cloned().unwrap())
|
||||
}
|
||||
|
||||
fn load_corpus() -> Vec<String> {
|
||||
let dir = cache_dir();
|
||||
let mut texts: Vec<String> = Vec::new();
|
||||
let mut total: u64 = 0;
|
||||
|
||||
'outer: for (file_name, url) in parquet_files_and_urls() {
|
||||
let file = download_and_cache(&file_name, &url, &dir);
|
||||
let reader = SerializedFileReader::new(file).expect("parquet reader failed");
|
||||
for text in iter_text_rows(Box::new(reader)) {
|
||||
total += text.len() as u64;
|
||||
texts.push(text);
|
||||
if total >= TARGET_BYTES {
|
||||
break 'outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert!(total >= TARGET_BYTES, "not enough Wikipedia data in parquet shards");
|
||||
texts
|
||||
}
|
||||
|
||||
// ── Benchmarks ────────────────────────────────────────────────────────────────
|
||||
|
||||
fn bench_unicode_seg(c: &mut Criterion, texts: &[String]) {
|
||||
let bytes: u64 = texts.iter().map(|t| t.len() as u64).sum();
|
||||
let mut analyzer = TextAnalyzer::builder(UnicodeSegmenterTokenizer)
|
||||
.filter(LowerCaser)
|
||||
.filter(RemoveLongFilter::limit(MAX_TOKEN_LEN))
|
||||
.build();
|
||||
|
||||
let mut group = c.benchmark_group("unicode_seg");
|
||||
group.throughput(Throughput::Bytes(bytes));
|
||||
group.sample_size(16);
|
||||
|
||||
// Raw unicode_word_indices() with no filters — measures pure tokenization cost.
|
||||
group.bench_function("tokenize_only", |b| {
|
||||
b.iter(|| {
|
||||
let mut count = 0u64;
|
||||
for text in texts {
|
||||
for _ in text.unicode_word_indices() {
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
std::hint::black_box(count)
|
||||
})
|
||||
});
|
||||
|
||||
// Full UnicodeSegmenterTokenizer pipeline: tokenize + lowercase + remove_long(255).
|
||||
group.bench_function("full_pipeline", |b| {
|
||||
b.iter(|| {
|
||||
let mut count = 0u64;
|
||||
for text in texts {
|
||||
let mut stream = analyzer.token_stream(text);
|
||||
while stream.advance() {
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
std::hint::black_box(count)
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_alyze(c: &mut Criterion, texts: &[String]) {
|
||||
let bytes: u64 = texts.iter().map(|t| t.len() as u64).sum();
|
||||
|
||||
let base = AnalysisOptions {
|
||||
tokenizer: TokenizerOptions::UAX29Word(uax29::word::Options::default()),
|
||||
maximum_token_length: None,
|
||||
case_sensitive: true,
|
||||
stopword_removal: None,
|
||||
stemming: None,
|
||||
ascii_folding: false,
|
||||
};
|
||||
let full = Analyzer::new(AnalysisOptions {
|
||||
case_sensitive: false,
|
||||
maximum_token_length: Some(MAX_TOKEN_LEN),
|
||||
..base
|
||||
});
|
||||
let mut buffer = ReusableBuffer::new();
|
||||
|
||||
let mut group = c.benchmark_group("alyze");
|
||||
group.throughput(Throughput::Bytes(bytes));
|
||||
group.sample_size(16);
|
||||
|
||||
// Raw UAX#29 DFA with is_word_like() filter — equivalent to unicode_word_indices().
|
||||
group.bench_function("tokenize_only", |b| {
|
||||
b.iter(|| {
|
||||
let mut count = 0u64;
|
||||
for text in texts {
|
||||
uax29::word::tokenize(text, uax29::word::Options::default(), |_, props| {
|
||||
if props.is_word_like() {
|
||||
count += 1;
|
||||
}
|
||||
true
|
||||
});
|
||||
}
|
||||
std::hint::black_box(count)
|
||||
})
|
||||
});
|
||||
|
||||
// alyze pipeline matching UnicodeSegmenterTokenizer: lowercase + remove_long(255).
|
||||
group.bench_function("full_pipeline", |b| {
|
||||
b.iter(|| {
|
||||
let mut count = 0u64;
|
||||
for text in texts {
|
||||
full.analyze(text, &mut buffer, |_| {
|
||||
count += 1;
|
||||
true
|
||||
});
|
||||
}
|
||||
std::hint::black_box(count)
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn tokenizer_compare(c: &mut Criterion) {
|
||||
let texts = load_corpus();
|
||||
let bytes: u64 = texts.iter().map(|t| t.len() as u64).sum();
|
||||
eprintln!(
|
||||
"corpus: {} articles, {:.1} MiB",
|
||||
texts.len(),
|
||||
bytes as f64 / (1u64 << 20) as f64
|
||||
);
|
||||
bench_unicode_seg(c, &texts);
|
||||
bench_alyze(c, &texts);
|
||||
}
|
||||
|
||||
criterion_group!(benches, tokenizer_compare);
|
||||
criterion_main!(benches);
|
||||
Reference in New Issue
Block a user