mirror of https://github.com/GreptimeTeam/greptimedb.git (synced 2026-01-04 12:22:55 +00:00)
feat: count underscore in English tokenizer and improve performance (#6660)
* feat: count underscore in English tokenizer and improve performance
* update lock file
* update test results
* assert lookup table
* handle utf8 alphanumeric
* finalize

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
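In short, the English tokenizer now treats '_' as a token character, so identifiers such as test012_345 are no longer split at the underscore, and ASCII-only input additionally goes through a lookup-table fast path. A minimal standalone sketch of the behavioral change (plain std; the two closures mirror the old and new split predicates in the diff below):

fn main() {
    let text = "This is a----++ test012_345+67890";

    // Old predicate: split on anything that is not alphanumeric.
    let old: Vec<&str> = text
        .split(|c: char| !c.is_alphanumeric())
        .filter(|s| !s.is_empty())
        .collect();

    // New predicate: '_' is also kept inside tokens.
    let new: Vec<&str> = text
        .split(|c: char| !c.is_alphanumeric() && c != '_')
        .filter(|s| !s.is_empty())
        .collect();

    assert_eq!(old, ["This", "is", "a", "test012", "345", "67890"]);
    assert_eq!(new, ["This", "is", "a", "test012_345", "67890"]);
}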
Cargo.lock (generated): 1 line changed
@@ -6119,6 +6119,7 @@ dependencies = [
  "common-runtime",
  "common-telemetry",
  "common-test-util",
+ "criterion 0.4.0",
  "fastbloom",
  "fst",
  "futures",
@@ -42,7 +42,12 @@ uuid.workspace = true
 
 [dev-dependencies]
 common-test-util.workspace = true
+criterion = "0.4"
 rand.workspace = true
 tempfile.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
+
+[[bench]]
+name = "tokenizer_bench"
+harness = false
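Note on the new [[bench]] target above: with harness = false, criterion supplies its own main, and the benchmark defined in the file below is run through cargo bench (for example: cargo bench --bench tokenizer_bench from this crate's directory; the exact package selection depends on the workspace layout).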
src/index/benches/tokenizer_bench.rs (new file, 66 lines)

@@ -0,0 +1,66 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use index::fulltext_index::tokenizer::{EnglishTokenizer, Tokenizer};

fn bench_english_tokenizer(c: &mut Criterion) {
    let tokenizer = EnglishTokenizer;

    let texts = vec![
        ("short", "Hello, world! This is a test."),
        ("medium", "The quick brown fox jumps over the lazy dog. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."),
        ("long", "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt."),
        ("mixed_chars", "Hello123 world!!! This-is_a.test@example.com with various: punctuation; and [brackets] {curly} (parentheses) & symbols* + numbers456."),
        ("numbers_heavy", "test123 456test test789 abc123def 999888777 hello42world 123 456 789 mix1ng l3tt3rs 4nd numb3rs"),
        ("punctuation_heavy", "Hello!!! World??? This...is...a...test... With lots of!!! punctuation??? marks!!! And... ellipses???"),
        ("postgres log", "2025-08-01 21:09:28.928 UTC [27] LOG: checkpoint complete: wrote 0 buffers (0.0%); 0 WAL file(s) added, 0 removed, 0 recycled; write=0.001 s, sync=0.001 s, total=0.003 s; sync files=0, longest=0.000 s, average=0.000 s; distance=0 kB, estimate=5 kB; lsn=0/1992868, redo lsn=0/1992868"),
        ("many_short_words", "a b c d e f g h i j k l m n o p q r s t u v w x y z"),
        ("with_unicode", "这是,一个测试。🈶一些 Unicøde 字符比如 café and naïve words."),
    ];

    let mut group = c.benchmark_group("english_tokenizer");

    for (size, text) in texts {
        group.bench_with_input(BenchmarkId::new("tokenize", size), &text, |b, text| {
            b.iter(|| tokenizer.tokenize(text))
        });
    }

    group.finish();

    // Benchmark with repeated tokenization to simulate real-world usage
    let mut repeat_group = c.benchmark_group("english_tokenizer_repeated");

    let sample_text = "The quick brown fox jumps over the lazy dog. This sentence contains most letters of the alphabet.";

    for repeat_count in [10, 100, 1000] {
        repeat_group.bench_with_input(
            BenchmarkId::new("repeated_tokenize", repeat_count),
            &repeat_count,
            |b, &repeat_count| {
                b.iter(|| {
                    for _ in 0..repeat_count {
                        tokenizer.tokenize(sample_text);
                    }
                })
            },
        );
    }

    repeat_group.finish();
}

criterion_group!(benches, bench_english_tokenizer);
criterion_main!(benches);
@@ -19,6 +19,29 @@ lazy_static::lazy_static! {
     static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
 }
 
+/// A-Z, a-z, 0-9, and '_' are true
+const VALID_ASCII_TOKEN: [bool; 256] = [
+    false, false, false, false, false, false, false, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, true, true, true, true, true,
+    true, true, true, true, true, false, false, false, false, false, false, false, true, true,
+    true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true,
+    true, true, true, true, true, true, true, true, false, false, false, false, true, false, true,
+    true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true,
+    true, true, true, true, true, true, true, true, true, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, false, false, false, false,
+    false, false, false, false, false, false, false, false, false, false,
+];
+
 /// `Tokenizer` tokenizes a text into a list of tokens.
 pub trait Tokenizer: Send {
     fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
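The 256-entry lookup table above trades per-character classification for a single indexed load on the ASCII fast path, and the patch verifies its contents with a dedicated unit test further down. Purely as an illustrative sketch (build_valid_ascii_token and VALID_ASCII_TOKEN_ALT are hypothetical names, not part of this change), an equivalent table could also be derived at compile time from the predicate it encodes:

// Hypothetical sketch: derive the same table from the predicate it encodes.
const fn build_valid_ascii_token() -> [bool; 256] {
    let mut table = [false; 256];
    let mut i = 0;
    while i < 256 {
        let b = i as u8;
        // A-Z, a-z, 0-9, and '_' are token characters.
        table[i] = b.is_ascii_alphanumeric() || b == b'_';
        i += 1;
    }
    table
}

const VALID_ASCII_TOKEN_ALT: [bool; 256] = build_valid_ascii_token();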
@@ -32,9 +55,28 @@ pub struct EnglishTokenizer;
 
 impl Tokenizer for EnglishTokenizer {
     fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
-        text.split(|c: char| !c.is_alphanumeric())
-            .filter(|s| !s.is_empty())
-            .collect()
+        if text.is_ascii() {
+            let mut tokens = Vec::new();
+            let mut start = 0;
+            for (i, &byte) in text.as_bytes().iter().enumerate() {
+                if !VALID_ASCII_TOKEN[byte as usize] {
+                    if start < i {
+                        tokens.push(&text[start..i]);
+                    }
+                    start = i + 1;
+                }
+            }
+
+            if start < text.len() {
+                tokens.push(&text[start..]);
+            }
+
+            tokens
+        } else {
+            text.split(|c: char| !c.is_alphanumeric() && c != '_')
+                .filter(|s| !s.is_empty())
+                .collect()
+        }
     }
 }
 
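For reference, a rough usage sketch of the two branches above, assuming EnglishTokenizer and Tokenizer are imported from the crate's fulltext_index::tokenizer module as in the benchmark file; the expected outputs match the updated tests below:

use index::fulltext_index::tokenizer::{EnglishTokenizer, Tokenizer};

fn main() {
    let tokenizer = EnglishTokenizer;

    // ASCII input takes the lookup-table fast path; '_' stays inside tokens.
    let ascii = tokenizer.tokenize("Hello, world!!! This is a----++ test012_345+67890");
    assert_eq!(
        ascii,
        vec!["Hello", "world", "This", "is", "a", "test012_345", "67890"]
    );

    // Non-ASCII input falls back to the char-based split.
    let utf8 = tokenizer.tokenize("💸unfold the 纸巾😣and gently 清洁表😭面");
    assert_eq!(
        utf8,
        vec!["unfold", "the", "纸巾", "and", "gently", "清洁表", "面"]
    );
}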
@@ -96,9 +138,25 @@ mod tests {
     #[test]
     fn test_english_tokenizer() {
         let tokenizer = EnglishTokenizer;
-        let text = "Hello, world! This is a test0.";
+        let text = "Hello, world!!! This is a----++ test012_345+67890";
         let tokens = tokenizer.tokenize(text);
-        assert_eq!(tokens, vec!["Hello", "world", "This", "is", "a", "test0"]);
+        assert_eq!(
+            tokens,
+            vec!["Hello", "world", "This", "is", "a", "test012_345", "67890"]
+        );
+    }
+
+    #[test]
+    fn test_english_tokenizer_with_utf8() {
+        let tokenizer = EnglishTokenizer;
+        let text = "💸unfold the 纸巾😣and gently 清洁表😭面";
+        let tokens = tokenizer.tokenize(text);
+        assert_eq!(
+            tokens,
+            // Don't care what happens to non-ASCII characters.
+            // It's kind of a misconfiguration to use EnglishTokenizer on non-ASCII text.
+            vec!["unfold", "the", "纸巾", "and", "gently", "清洁表", "面"]
+        );
     }
 
     #[test]
@@ -109,6 +167,29 @@ mod tests {
         assert_eq!(tokens, vec!["我", "喜欢", "苹果"]);
     }
 
+    #[test]
+    fn test_valid_ascii_token_lookup_table() {
+        // Test all ASCII values in a single loop
+        for c in 0u8..=255u8 {
+            let is_valid = VALID_ASCII_TOKEN[c as usize];
+            let should_be_valid = (c as char).is_ascii_alphanumeric() || c == b'_';
+
+            assert_eq!(
+                is_valid,
+                should_be_valid,
+                "Character '{}' (byte {}) validity mismatch: expected {}, got {}",
+                if c.is_ascii() && !c.is_ascii_control() {
+                    c as char
+                } else {
+                    '?'
+                },
+                c,
+                should_be_valid,
+                is_valid
+            );
+        }
+    }
+
     #[test]
     fn test_analyzer() {
         let tokenizer = EnglishTokenizer;
@@ -179,7 +179,7 @@ impl PhysicalOptimizerRule for MatchesConstantTermOptimizer {
 
                     // For debugging purpose. Not really precise but enough for most cases.
                     let probes = term
-                        .split(|c: char| !c.is_alphanumeric())
+                        .split(|c: char| !c.is_alphanumeric() && c != '_')
                         .filter(|s| !s.is_empty())
                        .map(|s| s.to_string())
                        .collect();
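The same predicate change applies to these debug probes, so a term containing an underscore now yields an intact probe. A small standalone sketch of the effect (plain std, mirroring only the split shown above):

fn main() {
    let term = "hello wo_rld";
    let probes: Vec<String> = term
        .split(|c: char| !c.is_alphanumeric() && c != '_')
        .filter(|s| !s.is_empty())
        .map(|s| s.to_string())
        .collect();
    // With the old predicate this would have been ["hello", "wo", "rld"].
    assert_eq!(probes, ["hello", "wo_rld"]);
}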
@@ -408,7 +408,7 @@ mod tests {
     async fn test_matches_term_optimization_from_sql() {
         let sql = "WITH base AS (
             SELECT text, timestamp FROM test
-            WHERE MATCHES_TERM(text, 'hello world')
+            WHERE MATCHES_TERM(text, 'hello wo_rld')
             AND timestamp > '2025-01-01 00:00:00'
         ),
         subquery1 AS (
@@ -468,7 +468,7 @@ mod tests {
         let plan_str = get_plan_string(&physical_plan).join("\n");
         assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"foo\", probes: [\"foo\"]"));
         assert!(plan_str.contains(
-            "MatchesConstTerm(text@0, term: \"hello world\", probes: [\"hello\", \"world\"]"
+            "MatchesConstTerm(text@0, term: \"hello wo_rld\", probes: [\"hello\", \"wo_rld\"]"
         ));
         assert!(plan_str.contains("MatchesConstTerm(text@0, term: \"world\", probes: [\"world\"]"));
         assert!(plan_str