From b3da16fa7bf53eae82221436df5b4a74db0529b0 Mon Sep 17 00:00:00 2001
From: Francois Massot <francois.massot@datadoghq.com>
Date: Tue, 2 Jun 2026 05:13:06 +0200
Subject: [PATCH] bench: compare UnicodeSegmenterTokenizer vs alyze UAX#29
 tokenizer on Wikipedia

Adds a new criterion benchmark (`tokenizer_compare`) that measures throughput
(MiB/s) of two UAX#29 tokenizer implementations on 64 MiB of English Wikipedia,
matching alyze's own benchmark methodology.

Implementations compared:
- UnicodeSegmenterTokenizer: unicode_segmentation::unicode_word_indices() wrapped
  in tantivy's Tokenizer trait, with LowerCaser + RemoveLongFilter(255)
- alyze: hand-rolled DFA with ASCII fast-path, via its Analyzer API

Results on the author's machine:
  unicode_seg/tokenize_only  ~88 MiB/s
  unicode_seg/full_pipeline  ~74 MiB/s
  alyze/tokenize_only       ~359 MiB/s
  alyze/full_pipeline       ~225 MiB/s

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Cargo.toml                   |  10 ++
 benches/tokenizer_compare.rs | 259 +++++++++++++++++++++++++++++++++++
 2 files changed, 269 insertions(+)
 create mode 100644 benches/tokenizer_compare.rs

diff --git a/Cargo.toml b/Cargo.toml
index a58aa1214..fd59c2a3c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -91,6 +91,12 @@ postcard = { version = "1.0.4", features = [
     "use-std",
 ], default-features = false }
 
+alyze = "0.1.3"
+unicode-segmentation = "1"
+parquet = "57"
+ureq = "3"
+tempfile = "3"
+
 [target.'cfg(not(windows))'.dev-dependencies]
 criterion = { version = "0.5", default-features = false }
 
@@ -201,3 +207,7 @@ harness = false
 [[bench]]
 name = "regex_all_terms"
 harness = false
+
+[[bench]]
+name = "tokenizer_compare"
+harness = false
diff --git a/benches/tokenizer_compare.rs b/benches/tokenizer_compare.rs
new file mode 100644
index 000000000..0217607c2
--- /dev/null
+++ b/benches/tokenizer_compare.rs
@@ -0,0 +1,259 @@
+//! Compares UnicodeSegmenterTokenizer (unicode-segmentation UAX#29) vs alyze (hand-rolled UAX#29 DFA).
+//!
+//! Both implement UAX#29 word breaking; the difference is implementation strategy:
+//! - UnicodeSegmenterTokenizer: `unicode_segmentation::unicode_word_indices()` + tantivy filter chain
+//! - alyze: custom DFA with ASCII fast-path + ICU for non-ASCII + ReusableBuffer
+//!
+//! Corpus: 64 MiB of English Wikipedia (same methodology as alyze's own benchmark).
+//! First run downloads parquet shards from HuggingFace; cached in <crate root>/.cache/wikipedia/.
+//!
+//! Run with: cargo bench --bench tokenizer_compare
+
+use std::{
+    fs::File,
+    io::Write as _,
+    path::{Path, PathBuf},
+};
+
+use alyze::{
+    analyze::{AnalysisOptions, Analyzer, ReusableBuffer, TokenizerOptions},
+    uax29,
+};
+use criterion::{Criterion, Throughput, criterion_group, criterion_main};
+use parquet::{
+    file::reader::{FileReader, SerializedFileReader},
+    record::{RowAccessor, reader::RowIter},
+    schema::types::Type,
+};
+use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, TextAnalyzer, Token, TokenStream, Tokenizer};
+use unicode_segmentation::UnicodeSegmentation;
+
+const TARGET_BYTES: u64 = 64 << 20; // 64 MiB of article text to load; matches alyze's benchmark
+const MAX_TOKEN_LEN: usize = 255; // matches UnicodeSegmenterTokenizer's DEFAULT_REMOVE_TOKEN_LENGTH
+
+// ── UnicodeSegmenterTokenizer ──────────────────────────────────────────────────────────
+
+#[derive(Clone, Default)]
+struct UnicodeSegmenterTokenizer; // stateless; all per-text state lives in the token stream
+
+struct UnicodeSegmenterTokenStream<'a> {
+    iter: unicode_segmentation::UnicodeWordIndices<'a>, // yields (offset, word) over the input
+    token: Token, // single token, overwritten in place on every `advance`
+}
+
+impl Tokenizer for UnicodeSegmenterTokenizer {
+    type TokenStream<'a> = UnicodeSegmenterTokenStream<'a>;
+
+    /// Starts a new stream over `text`, beginning from a default `Token`.
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> UnicodeSegmenterTokenStream<'a> {
+        let iter = text.unicode_word_indices();
+        let token = Token::default();
+        UnicodeSegmenterTokenStream { iter, token }
+    }
+}
+
+impl<'a> TokenStream for UnicodeSegmenterTokenStream<'a> {
+    /// Loads the next word into the reused token; returns `false` once the words run out.
+    fn advance(&mut self) -> bool {
+        let Some((start, word)) = self.iter.next() else { return false };
+        let token = &mut self.token;
+        token.offset_from = start;
+        token.offset_to = start + word.len();
+        token.position = token.position.wrapping_add(1);
+        token.text.clear();
+        token.text.push_str(word);
+        true
+    }
+
+    /// Borrows the token most recently filled in by `advance`.
+    fn token(&self) -> &Token {
+        &self.token
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        &mut self.token
+    }
+}
+
+// ── Corpus loading (mirrors alyze's wikipedia benchmark) ─────────────────────
+
+fn cache_dir() -> PathBuf {
+    let cache_root = Path::new(env!("CARGO_MANIFEST_DIR")).join(".cache/wikipedia");
+    std::fs::create_dir_all(&cache_root).expect("failed to create cache directory");
+    cache_root // `<crate root>/.cache/wikipedia`, created on demand
+}
+
+fn parquet_files_and_urls() -> Vec<(String, String)> {
+    let mut shards = Vec::with_capacity(41); // one (file name, URL) pair per shard, in order
+    for i in 0..41 {
+        let file = format!("train-{i:05}-of-00041.parquet");
+        let url = format!(
+            "https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/{file}?download=true"
+        );
+        shards.push((file, url));
+    }
+    shards
+}
+
+fn download_and_cache(file_name: &str, url: &str, dir: &Path) -> File {
+    let cached = dir.join(file_name);
+    if !cached.exists() {
+        println!("downloading '{file_name}' from {url}");
+        let body = ureq::get(url).call().expect("HTTP request failed").into_body();
+        // Write to a temp file inside `dir`, then persist it under the final name.
+        let mut tmp = tempfile::Builder::new()
+            .tempfile_in(dir)
+            .expect("failed to create tempfile");
+        std::io::copy(&mut body.into_reader(), &mut tmp).expect("failed to write response body");
+        tmp.as_file_mut().flush().expect("flush failed");
+        tmp.persist(&cached).expect("rename to cache failed");
+    }
+    File::open(&cached).expect("failed to open cached parquet file")
+}
+
+/// Iterates over the `text` column of every row in `reader`, panicking on any parquet error.
+fn iter_text_rows(reader: Box<dyn FileReader>) -> impl Iterator<Item = String> {
+    let fields = reader.metadata().file_metadata().schema().get_fields().to_vec();
+    let text_fields: Vec<_> = fields.into_iter().filter(|f| f.name() == "text").collect();
+    // Project onto the top-level `text` field only, so index 0 below refers to it.
+    let proj = Type::group_type_builder("schema").with_fields(text_fields).build();
+    RowIter::from_file_into(reader)
+        .project(Some(proj.expect("failed to build `text` projection schema")))
+        .expect("failed to project parquet rows onto `text`")
+        .map(|row| row.expect("failed to read parquet row"))
+        .map(|row| row.get_string(0).cloned().expect("failed to read `text` as a string"))
+}
+
+fn load_corpus() -> Vec<String> {
+    let dir = cache_dir();
+    // Lazy chain: a shard is downloaded and opened only after the previous one is exhausted.
+    let articles = parquet_files_and_urls().into_iter().flat_map(|(file_name, url)| {
+        let file = download_and_cache(&file_name, &url, &dir);
+        let reader = SerializedFileReader::new(file).expect("parquet reader failed");
+        iter_text_rows(Box::new(reader))
+    });
+    let mut texts: Vec<String> = Vec::new();
+    let mut total: u64 = 0;
+    for text in articles {
+        total += text.len() as u64;
+        texts.push(text);
+        if total >= TARGET_BYTES {
+            break; // stop pulling rows (and shards) once the target size is reached
+        }
+    }
+    assert!(total >= TARGET_BYTES, "not enough Wikipedia data in parquet shards");
+    texts
+}
+
+// ── Benchmarks ────────────────────────────────────────────────────────────────
+
+/// Benchmarks unicode-segmentation over `texts`: raw word iteration and the tantivy pipeline.
+fn bench_unicode_seg(c: &mut Criterion, texts: &[String]) {
+    let bytes: u64 = texts.iter().map(|t| t.len() as u64).sum(); // total corpus size in bytes
+    let mut analyzer = TextAnalyzer::builder(UnicodeSegmenterTokenizer)
+        .filter(LowerCaser)
+        .filter(RemoveLongFilter::limit(MAX_TOKEN_LEN))
+        .build();
+
+    let mut group = c.benchmark_group("unicode_seg");
+    group.throughput(Throughput::Bytes(bytes)); // report throughput relative to corpus bytes
+    group.sample_size(16);
+    // Raw unicode_word_indices(): no Tokenizer wrapper, no filters; pure segmentation cost.
+    group.bench_function("tokenize_only", |b| {
+        b.iter(|| {
+            let mut count = 0u64;
+            for text in texts {
+                for _ in text.unicode_word_indices() {
+                    count += 1;
+                }
+            }
+            std::hint::black_box(count)
+        })
+    });
+
+    // Full pipeline through tantivy's TextAnalyzer: tokenize + LowerCaser + RemoveLongFilter(255).
+    group.bench_function("full_pipeline", |b| {
+        b.iter(|| {
+            let mut count = 0u64;
+            for text in texts {
+                let mut stream = analyzer.token_stream(text); // fresh stream per article
+                while stream.advance() {
+                    count += 1;
+                }
+            }
+            std::hint::black_box(count)
+        })
+    });
+
+    group.finish();
+}
+
+/// Benchmarks alyze over `texts`: the raw UAX#29 word tokenizer and the configured `Analyzer`.
+fn bench_alyze(c: &mut Criterion, texts: &[String]) {
+    let bytes: u64 = texts.iter().map(|t| t.len() as u64).sum(); // total corpus size in bytes
+    // Template: case-sensitive, no length limit, stopwords, stemming or folding; used via `..base`.
+    let base = AnalysisOptions {
+        tokenizer: TokenizerOptions::UAX29Word(uax29::word::Options::default()),
+        maximum_token_length: None,
+        case_sensitive: true,
+        stopword_removal: None,
+        stemming: None,
+        ascii_folding: false,
+    };
+    let full = Analyzer::new(AnalysisOptions {
+        case_sensitive: false,
+        maximum_token_length: Some(MAX_TOKEN_LEN),
+        ..base
+    });
+    let mut buffer = ReusableBuffer::new(); // shared by every `analyze` call below
+
+    let mut group = c.benchmark_group("alyze");
+    group.throughput(Throughput::Bytes(bytes)); // report throughput relative to corpus bytes
+    group.sample_size(16);
+    // Raw UAX#29 tokenizer, counting only is_word_like() tokens to mirror unicode_word_indices().
+    group.bench_function("tokenize_only", |b| {
+        b.iter(|| {
+            let mut count = 0u64;
+            for text in texts {
+                uax29::word::tokenize(text, uax29::word::Options::default(), |_, props| {
+                    if props.is_word_like() {
+                        count += 1;
+                    }
+                    true // NOTE(review): presumably means "keep tokenizing"; confirm in alyze docs
+                });
+            }
+            std::hint::black_box(count)
+        })
+    });
+
+    // alyze Analyzer set up to mirror the tantivy pipeline: case-insensitive + max length 255.
+    group.bench_function("full_pipeline", |b| {
+        b.iter(|| {
+            let mut count = 0u64;
+            for text in texts {
+                full.analyze(text, &mut buffer, |_| {
+                    count += 1;
+                    true // NOTE(review): presumably means "keep analyzing"; confirm in alyze docs
+                });
+            }
+            std::hint::black_box(count)
+        })
+    });
+
+    group.finish();
+}
+
+fn tokenizer_compare(c: &mut Criterion) {
+    // Load the corpus once so both benchmark groups run over identical input.
+    let corpus = load_corpus();
+    let article_count = corpus.len();
+    let total_bytes = corpus.iter().fold(0u64, |acc, article| acc + article.len() as u64);
+    let mib = total_bytes as f64 / (1u64 << 20) as f64;
+    eprintln!("corpus: {} articles, {:.1} MiB", article_count, mib);
+
+    bench_unicode_seg(c, &corpus);
+    bench_alyze(c, &corpus);
+}
+
+criterion_group!(benches, tokenizer_compare); // registers `tokenizer_compare` as group `benches`
+criterion_main!(benches); // supplies the entry point, since this bench sets `harness = false`