bench: compare UnicodeSegmenterTokenizer vs alyze UAX#29 tokenizer on Wikipedia

Adds a new criterion benchmark (`tokenizer_compare`) that measures throughput
(MiB/s) of two UAX#29 tokenizer implementations on 64 MiB of English Wikipedia,
matching alyze's own benchmark methodology.

Implementations compared:
- UnicodeSegmenterTokenizer: unicode_segmentation::unicode_word_indices() wrapped
  in tantivy's Tokenizer trait, with LowerCaser + RemoveLongFilter(255)
- alyze: hand-rolled DFA with ASCII fast-path, via its Analyzer API

Results on this machine:
  unicode_seg/tokenize_only  ~88 MiB/s
  unicode_seg/full_pipeline  ~74 MiB/s
  alyze/tokenize_only       ~359 MiB/s
  alyze/full_pipeline       ~225 MiB/s

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Francois Massot
2026-06-02 05:13:06 +02:00
parent 2e16243f9a
commit b3da16fa7b
2 changed files with 269 additions and 0 deletions

View File

@@ -91,6 +91,12 @@ postcard = { version = "1.0.4", features = [
"use-std",
], default-features = false }
alyze = "0.1.3"
unicode-segmentation = "1"
parquet = "57"
ureq = "3"
tempfile = "3"
[target.'cfg(not(windows))'.dev-dependencies]
criterion = { version = "0.5", default-features = false }
@@ -201,3 +207,7 @@ harness = false
[[bench]]
name = "regex_all_terms"
harness = false
[[bench]]
name = "tokenizer_compare"
harness = false

View File

@@ -0,0 +1,259 @@
//! Compares UnicodeSegmenterTokenizer (unicode-segmentation UAX#29) vs alyze (hand-rolled UAX#29 DFA).
//!
//! Both implement UAX#29 word breaking; the difference is implementation strategy:
//! - UnicodeSegmenterTokenizer: `unicode_segmentation::unicode_word_indices()` + tantivy filter chain
//! - alyze: custom DFA with ASCII fast-path + ICU for non-ASCII + ReusableBuffer
//!
//! Corpus: 64 MiB of English Wikipedia (same methodology as alyze's own benchmark).
//! First run downloads parquet shards from HuggingFace and caches them under benches/.cache/.
//!
//! Run with: cargo bench --bench tokenizer_compare
use std::{
fs::File,
io::Write as _,
path::{Path, PathBuf},
};
use alyze::{
analyze::{AnalysisOptions, Analyzer, ReusableBuffer, TokenizerOptions},
uax29,
};
use criterion::{Criterion, Throughput, criterion_group, criterion_main};
use parquet::{
file::reader::{FileReader, SerializedFileReader},
record::{RowAccessor, reader::RowIter},
schema::types::Type,
};
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, TextAnalyzer, Token, TokenStream, Tokenizer};
use unicode_segmentation::UnicodeSegmentation;
const TARGET_BYTES: u64 = 64 << 20; // 64 MiB — matches alyze's benchmark
const MAX_TOKEN_LEN: usize = 255; // matches UnicodeSegmenterTokenizer's DEFAULT_REMOVE_TOKEN_LENGTH
// ── UnicodeSegmenterTokenizer ──────────────────────────────────────────────────────────
#[derive(Clone, Default)]
struct UnicodeSegmenterTokenizer;
struct UnicodeSegmenterTokenStream<'a> {
iter: unicode_segmentation::UnicodeWordIndices<'a>,
token: Token,
}
impl Tokenizer for UnicodeSegmenterTokenizer {
type TokenStream<'a> = UnicodeSegmenterTokenStream<'a>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> UnicodeSegmenterTokenStream<'a> {
UnicodeSegmenterTokenStream {
iter: text.unicode_word_indices(),
token: Token::default(),
}
}
}
impl<'a> TokenStream for UnicodeSegmenterTokenStream<'a> {
fn advance(&mut self) -> bool {
if let Some((offset, word)) = self.iter.next() {
self.token.offset_from = offset;
self.token.offset_to = offset + word.len();
self.token.position = self.token.position.wrapping_add(1);
self.token.text.clear();
self.token.text.push_str(word);
true
} else {
false
}
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
}
}
// ── Corpus loading (mirrors alyze's wikipedia benchmark) ─────────────────────
fn cache_dir() -> PathBuf {
let dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(".cache/wikipedia");
std::fs::create_dir_all(&dir).expect("failed to create cache directory");
dir
}
fn parquet_files_and_urls() -> Vec<(String, String)> {
(0..41)
.map(|i| {
let file = format!("train-{i:05}-of-00041.parquet");
let url = format!(
"https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/{file}?download=true"
);
(file, url)
})
.collect()
}
fn download_and_cache(file_name: &str, url: &str, dir: &Path) -> File {
let path = dir.join(file_name);
if !path.exists() {
println!("downloading '{file_name}' from {url}");
let resp = ureq::get(url).call().expect("HTTP request failed");
let mut tmp = tempfile::Builder::new()
.tempfile_in(dir)
.expect("failed to create tempfile");
std::io::copy(&mut resp.into_body().into_reader(), &mut tmp)
.expect("failed to write response body");
tmp.as_file_mut().flush().expect("flush failed");
tmp.persist(&path).expect("rename to cache failed");
}
File::open(&path).expect("failed to open cached parquet file")
}
fn iter_text_rows(reader: Box<dyn FileReader>) -> impl Iterator<Item = String> {
let fields = reader.metadata().file_metadata().schema().get_fields().to_vec();
let text_fields: Vec<_> = fields.into_iter().filter(|f| f.name() == "text").collect();
let proj = Type::group_type_builder("schema")
.with_fields(text_fields)
.build()
.unwrap();
RowIter::from_file_into(reader)
.project(Some(proj))
.unwrap()
.map(|r| r.unwrap().get_string(0).cloned().unwrap())
}
fn load_corpus() -> Vec<String> {
let dir = cache_dir();
let mut texts: Vec<String> = Vec::new();
let mut total: u64 = 0;
'outer: for (file_name, url) in parquet_files_and_urls() {
let file = download_and_cache(&file_name, &url, &dir);
let reader = SerializedFileReader::new(file).expect("parquet reader failed");
for text in iter_text_rows(Box::new(reader)) {
total += text.len() as u64;
texts.push(text);
if total >= TARGET_BYTES {
break 'outer;
}
}
}
assert!(total >= TARGET_BYTES, "not enough Wikipedia data in parquet shards");
texts
}
// ── Benchmarks ────────────────────────────────────────────────────────────────
fn bench_unicode_seg(c: &mut Criterion, texts: &[String]) {
let bytes: u64 = texts.iter().map(|t| t.len() as u64).sum();
let mut analyzer = TextAnalyzer::builder(UnicodeSegmenterTokenizer)
.filter(LowerCaser)
.filter(RemoveLongFilter::limit(MAX_TOKEN_LEN))
.build();
let mut group = c.benchmark_group("unicode_seg");
group.throughput(Throughput::Bytes(bytes));
group.sample_size(16);
// Raw unicode_word_indices() with no filters — measures pure tokenization cost.
group.bench_function("tokenize_only", |b| {
b.iter(|| {
let mut count = 0u64;
for text in texts {
for _ in text.unicode_word_indices() {
count += 1;
}
}
std::hint::black_box(count)
})
});
// Full UnicodeSegmenterTokenizer pipeline: tokenize + lowercase + remove_long(255).
group.bench_function("full_pipeline", |b| {
b.iter(|| {
let mut count = 0u64;
for text in texts {
let mut stream = analyzer.token_stream(text);
while stream.advance() {
count += 1;
}
}
std::hint::black_box(count)
})
});
group.finish();
}
fn bench_alyze(c: &mut Criterion, texts: &[String]) {
let bytes: u64 = texts.iter().map(|t| t.len() as u64).sum();
let base = AnalysisOptions {
tokenizer: TokenizerOptions::UAX29Word(uax29::word::Options::default()),
maximum_token_length: None,
case_sensitive: true,
stopword_removal: None,
stemming: None,
ascii_folding: false,
};
let full = Analyzer::new(AnalysisOptions {
case_sensitive: false,
maximum_token_length: Some(MAX_TOKEN_LEN),
..base
});
let mut buffer = ReusableBuffer::new();
let mut group = c.benchmark_group("alyze");
group.throughput(Throughput::Bytes(bytes));
group.sample_size(16);
// Raw UAX#29 DFA with is_word_like() filter — equivalent to unicode_word_indices().
group.bench_function("tokenize_only", |b| {
b.iter(|| {
let mut count = 0u64;
for text in texts {
uax29::word::tokenize(text, uax29::word::Options::default(), |_, props| {
if props.is_word_like() {
count += 1;
}
true
});
}
std::hint::black_box(count)
})
});
// alyze pipeline matching UnicodeSegmenterTokenizer: lowercase + remove_long(255).
group.bench_function("full_pipeline", |b| {
b.iter(|| {
let mut count = 0u64;
for text in texts {
full.analyze(text, &mut buffer, |_| {
count += 1;
true
});
}
std::hint::black_box(count)
})
});
group.finish();
}
fn tokenizer_compare(c: &mut Criterion) {
let texts = load_corpus();
let bytes: u64 = texts.iter().map(|t| t.len() as u64).sum();
eprintln!(
"corpus: {} articles, {:.1} MiB",
texts.len(),
bytes as f64 / (1u64 << 20) as f64
);
bench_unicode_seg(c, &texts);
bench_alyze(c, &texts);
}
criterion_group!(benches, tokenizer_compare);
criterion_main!(benches);