From b3da16fa7bf53eae82221436df5b4a74db0529b0 Mon Sep 17 00:00:00 2001
From: Francois Massot
Date: Tue, 2 Jun 2026 05:13:06 +0200
Subject: [PATCH] bench: compare UnicodeSegmenterTokenizer vs alyze UAX#29 tokenizer on Wikipedia

Adds a new criterion benchmark (`tokenizer_compare`) that measures
throughput (MiB/s) of two UAX#29 tokenizer implementations on 64 MiB
of English Wikipedia, matching alyze's own benchmark methodology.

Implementations compared:
- UnicodeSegmenterTokenizer: unicode_segmentation::unicode_word_indices()
  wrapped in tantivy's Tokenizer trait, with LowerCaser + RemoveLongFilter(255)
- alyze: hand-rolled DFA with ASCII fast-path, via its Analyzer API

Results on this machine:
  unicode_seg/tokenize_only  ~88 MiB/s
  unicode_seg/full_pipeline  ~74 MiB/s
  alyze/tokenize_only       ~359 MiB/s
  alyze/full_pipeline       ~225 MiB/s

Co-Authored-By: Claude Sonnet 4.6
---
 Cargo.toml                   |  10 ++
 benches/tokenizer_compare.rs | 270 +++++++++++++++++++++++++++++++++++
 2 files changed, 280 insertions(+)
 create mode 100644 benches/tokenizer_compare.rs

diff --git a/Cargo.toml b/Cargo.toml
index a58aa1214..fd59c2a3c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -91,6 +91,12 @@ postcard = { version = "1.0.4", features = [
     "use-std",
 ], default-features = false }
 
+alyze = "0.1.3"
+unicode-segmentation = "1"
+parquet = "57"
+ureq = "3"
+tempfile = "3"
+
 [target.'cfg(not(windows))'.dev-dependencies]
 criterion = { version = "0.5", default-features = false }
 
@@ -201,3 +207,7 @@ harness = false
 [[bench]]
 name = "regex_all_terms"
 harness = false
+
+[[bench]]
+name = "tokenizer_compare"
+harness = false
diff --git a/benches/tokenizer_compare.rs b/benches/tokenizer_compare.rs
new file mode 100644
index 000000000..0217607c2
--- /dev/null
+++ b/benches/tokenizer_compare.rs
@@ -0,0 +1,270 @@
+//! Compares UnicodeSegmenterTokenizer (unicode-segmentation UAX#29) vs alyze (hand-rolled UAX#29 DFA).
+//!
+//! Both implement UAX#29 word breaking; the difference is implementation strategy:
+//! - UnicodeSegmenterTokenizer: `unicode_segmentation::unicode_word_indices()` + tantivy filter chain
+//! - alyze: custom DFA with ASCII fast-path + ICU for non-ASCII + ReusableBuffer
+//!
+//! Corpus: 64 MiB of English Wikipedia (same methodology as alyze's own benchmark).
+//! First run downloads parquet shards from HuggingFace and caches them under .cache/wikipedia/ in the crate root.
+//!
+//! Run with: cargo bench --bench tokenizer_compare
+
+use std::{
+    fs::File,
+    io::Write as _,
+    path::{Path, PathBuf},
+};
+
+use alyze::{
+    analyze::{AnalysisOptions, Analyzer, ReusableBuffer, TokenizerOptions},
+    uax29,
+};
+use criterion::{Criterion, Throughput, criterion_group, criterion_main};
+use parquet::{
+    file::reader::{FileReader, SerializedFileReader},
+    record::{RowAccessor, reader::RowIter},
+    schema::types::Type,
+};
+use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, TextAnalyzer, Token, TokenStream, Tokenizer};
+use unicode_segmentation::UnicodeSegmentation;
+
+const TARGET_BYTES: u64 = 64 << 20; // 64 MiB — matches alyze's benchmark
+const MAX_TOKEN_LEN: usize = 255; // matches UnicodeSegmenterTokenizer's DEFAULT_REMOVE_TOKEN_LENGTH
+
+// ── UnicodeSegmenterTokenizer ──────────────────────────────────────────────────────────
+
+/// Stateless tantivy `Tokenizer` that splits text with `unicode_word_indices()`.
+#[derive(Clone, Default)]
+struct UnicodeSegmenterTokenizer;
+
+/// Token stream that copies each word from `unicode_word_indices()` into one reused `Token`.
+struct UnicodeSegmenterTokenStream<'a> {
+    iter: unicode_segmentation::UnicodeWordIndices<'a>,
+    token: Token,
+}
+
+impl Tokenizer for UnicodeSegmenterTokenizer {
+    type TokenStream<'a> = UnicodeSegmenterTokenStream<'a>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> UnicodeSegmenterTokenStream<'a> {
+        UnicodeSegmenterTokenStream {
+            iter: text.unicode_word_indices(),
+            token: Token::default(),
+        }
+    }
+}
+
+impl<'a> TokenStream for UnicodeSegmenterTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        if let Some((offset, word)) = self.iter.next() {
+            self.token.offset_from = offset;
+            self.token.offset_to = offset + word.len();
+            self.token.position = self.token.position.wrapping_add(1);
+            self.token.text.clear();
+            self.token.text.push_str(word);
+            true
+        } else {
+            false
+        }
+    }
+
+    fn token(&self) -> &Token {
+        &self.token
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        &mut self.token
+    }
+}
+
+// ── Corpus loading (mirrors alyze's wikipedia benchmark) ─────────────────────
+
+/// Returns `<CARGO_MANIFEST_DIR>/.cache/wikipedia`, creating it if needed; panics if that fails.
+fn cache_dir() -> PathBuf {
+    let dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(".cache/wikipedia");
+    std::fs::create_dir_all(&dir).expect("failed to create cache directory");
+    dir
+}
+
+/// Lists the 41 `(file name, download URL)` pairs of the `20231101.en` Wikipedia parquet shards.
+fn parquet_files_and_urls() -> Vec<(String, String)> {
+    (0..41)
+        .map(|i| {
+            let file = format!("train-{i:05}-of-00041.parquet");
+            let url = format!(
+                "https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/{file}?download=true"
+            );
+            (file, url)
+        })
+        .collect()
+}
+
+/// Opens `dir/file_name`, downloading it from `url` first when it is not cached yet.
+/// The body goes to a temp file in `dir` and is renamed into place only after the copy succeeds.
+fn download_and_cache(file_name: &str, url: &str, dir: &Path) -> File {
+    let path = dir.join(file_name);
+    if !path.exists() {
+        println!("downloading '{file_name}' from {url}");
+        let resp = ureq::get(url).call().expect("HTTP request failed");
+        let mut tmp = tempfile::Builder::new()
+            .tempfile_in(dir)
+            .expect("failed to create tempfile");
+        std::io::copy(&mut resp.into_body().into_reader(), &mut tmp)
+            .expect("failed to write response body");
+        tmp.as_file_mut().flush().expect("flush failed");
+        tmp.persist(&path).expect("rename to cache failed");
+    }
+    File::open(&path).expect("failed to open cached parquet file")
+}
+
+/// Iterates over the `text` column of every row in `reader`; panics on any schema or read error.
+fn iter_text_rows(reader: Box<dyn FileReader>) -> impl Iterator<Item = String> {
+    let fields = reader.metadata().file_metadata().schema().get_fields().to_vec();
+    let text_fields: Vec<_> = fields.into_iter().filter(|f| f.name() == "text").collect();
+    let proj = Type::group_type_builder("schema")
+        .with_fields(text_fields)
+        .build()
+        .unwrap();
+    RowIter::from_file_into(reader)
+        .project(Some(proj))
+        .unwrap()
+        .map(|r| r.unwrap().get_string(0).cloned().unwrap())
+}
+
+/// Collects article texts shard by shard until at least `TARGET_BYTES` bytes have been gathered.
+fn load_corpus() -> Vec<String> {
+    let dir = cache_dir();
+    let mut texts: Vec<String> = Vec::new();
+    let mut total: u64 = 0;
+
+    'outer: for (file_name, url) in parquet_files_and_urls() {
+        let file = download_and_cache(&file_name, &url, &dir);
+        let reader = SerializedFileReader::new(file).expect("parquet reader failed");
+        for text in iter_text_rows(Box::new(reader)) {
+            total += text.len() as u64;
+            texts.push(text);
+            if total >= TARGET_BYTES {
+                break 'outer;
+            }
+        }
+    }
+
+    assert!(total >= TARGET_BYTES, "not enough Wikipedia data in parquet shards");
+    texts
+}
+
+// ── Benchmarks ────────────────────────────────────────────────────────────────
+
+/// Benchmarks raw `unicode_word_indices()` and the full tantivy analyzer pipeline over `texts`.
+fn bench_unicode_seg(c: &mut Criterion, texts: &[String]) {
+    let bytes: u64 = texts.iter().map(|t| t.len() as u64).sum();
+    let mut analyzer = TextAnalyzer::builder(UnicodeSegmenterTokenizer)
+        .filter(LowerCaser)
+        .filter(RemoveLongFilter::limit(MAX_TOKEN_LEN))
+        .build();
+
+    let mut group = c.benchmark_group("unicode_seg");
+    group.throughput(Throughput::Bytes(bytes));
+    group.sample_size(16);
+
+    // Raw unicode_word_indices() with no filters — measures pure tokenization cost.
+    group.bench_function("tokenize_only", |b| {
+        b.iter(|| {
+            let mut count = 0u64;
+            for text in texts {
+                for _ in text.unicode_word_indices() {
+                    count += 1;
+                }
+            }
+            std::hint::black_box(count)
+        })
+    });
+
+    // Full UnicodeSegmenterTokenizer pipeline: tokenize + lowercase + remove_long(255).
+    group.bench_function("full_pipeline", |b| {
+        b.iter(|| {
+            let mut count = 0u64;
+            for text in texts {
+                let mut stream = analyzer.token_stream(text);
+                while stream.advance() {
+                    count += 1;
+                }
+            }
+            std::hint::black_box(count)
+        })
+    });
+
+    group.finish();
+}
+
+/// Benchmarks alyze's raw UAX#29 word tokenizer and its full `Analyzer` pipeline over `texts`.
+fn bench_alyze(c: &mut Criterion, texts: &[String]) {
+    let bytes: u64 = texts.iter().map(|t| t.len() as u64).sum();
+
+    let base = AnalysisOptions {
+        tokenizer: TokenizerOptions::UAX29Word(uax29::word::Options::default()),
+        maximum_token_length: None,
+        case_sensitive: true,
+        stopword_removal: None,
+        stemming: None,
+        ascii_folding: false,
+    };
+    let full = Analyzer::new(AnalysisOptions {
+        case_sensitive: false,
+        maximum_token_length: Some(MAX_TOKEN_LEN),
+        ..base
+    });
+    let mut buffer = ReusableBuffer::new();
+
+    let mut group = c.benchmark_group("alyze");
+    group.throughput(Throughput::Bytes(bytes));
+    group.sample_size(16);
+
+    // Raw UAX#29 DFA with is_word_like() filter — equivalent to unicode_word_indices().
+    group.bench_function("tokenize_only", |b| {
+        b.iter(|| {
+            let mut count = 0u64;
+            for text in texts {
+                uax29::word::tokenize(text, uax29::word::Options::default(), |_, props| {
+                    if props.is_word_like() {
+                        count += 1;
+                    }
+                    true
+                });
+            }
+            std::hint::black_box(count)
+        })
+    });
+
+    // alyze pipeline matching UnicodeSegmenterTokenizer: lowercase + remove_long(255).
+    group.bench_function("full_pipeline", |b| {
+        b.iter(|| {
+            let mut count = 0u64;
+            for text in texts {
+                full.analyze(text, &mut buffer, |_| {
+                    count += 1;
+                    true
+                });
+            }
+            std::hint::black_box(count)
+        })
+    });
+
+    group.finish();
+}
+
+/// Criterion entry point: loads the corpus once, reports its size, then runs both benchmark groups.
+fn tokenizer_compare(c: &mut Criterion) {
+    let texts = load_corpus();
+    let bytes: u64 = texts.iter().map(|t| t.len() as u64).sum();
+    eprintln!(
+        "corpus: {} articles, {:.1} MiB",
+        texts.len(),
+        bytes as f64 / (1u64 << 20) as f64
+    );
+    bench_unicode_seg(c, &texts);
+    bench_alyze(c, &texts);
+}
+
+criterion_group!(benches, tokenizer_compare);
+criterion_main!(benches);