perf: improve bloom filter reader's byte reading logic (#6658)

* perf: improve bloom filter reader's byte reading logic

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* revert toml change

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clarify comment

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* benchmark

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update lock file

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* pub util fn

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* note endian

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
Ruihang Xia
2025-08-12 04:37:25 -07:00
committed by GitHub
parent e80e4a9ed7
commit e495c614f7
4 changed files with 150 additions and 9 deletions

1
Cargo.lock generated
View File

@@ -6133,6 +6133,7 @@ dependencies = [
"prost 0.13.5",
"puffin",
"rand 0.9.0",
"rand_chacha 0.9.0",
"regex",
"regex-automata 0.4.8",
"roaring",

View File

@@ -44,6 +44,7 @@ uuid.workspace = true
common-test-util.workspace = true
criterion = "0.4"
rand.workspace = true
rand_chacha = "0.9"
tempfile.workspace = true
tokio.workspace = true
tokio-util.workspace = true
@@ -51,3 +52,7 @@ tokio-util.workspace = true
[[bench]]
name = "tokenizer_bench"
harness = false
[[bench]]
name = "bytes_to_u64_vec"
harness = false

View File

@@ -0,0 +1,99 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::hint::black_box;
use bytes::Bytes;
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use index::bloom_filter::reader::bytes_to_u64_vec;
use rand::{Rng, SeedableRng};
use rand_chacha::ChaCha8Rng;
/// Generate test data that is guaranteed to start on an 8-byte boundary.
///
/// Returns `size / 8 * 8` bytes of deterministic (seeded) random data whose
/// first byte is 8-byte aligned, exercising the fast path of
/// `bytes_to_u64_vec`.
fn generate_aligned_data(size: usize) -> Bytes {
    let mut rng = ChaCha8Rng::seed_from_u64(42);
    let u64_count = size / 8; // number of whole u64 values

    // Over-allocate by one word so an 8-byte aligned start always exists
    // inside the buffer, then write the random words at that offset.
    //
    // NOTE: this deliberately avoids the previous `Vec<u64>` ->
    // `Vec::from_raw_parts::<u8>` transmute: reconstructing a `Vec<u8>`
    // (align 1) over an allocation made for `u64` (align 8) deallocates
    // with a mismatched layout, which is undefined behavior.
    let mut buf = vec![0u8; u64_count * 8 + 8];
    let pad = buf.as_ptr().align_offset(std::mem::align_of::<u64>());
    for i in 0..u64_count {
        let value: u64 = rng.random();
        let start = pad + i * 8;
        // Native-endian bytes, matching the layout the old transmute produced.
        buf[start..start + 8].copy_from_slice(&value.to_ne_bytes());
    }
    // The slice starts at `pad`, so its data pointer is 8-byte aligned.
    Bytes::from(buf).slice(pad..pad + u64_count * 8)
}
/// Generate test data that is guaranteed to be unaligned.
///
/// Returns `size / 8 * 8 - 1` bytes of deterministic (seeded) random data
/// whose first byte is NOT 8-byte aligned, exercising the slow path of
/// `bytes_to_u64_vec`.
fn generate_unaligned_data(size: usize) -> Bytes {
    let mut rng = ChaCha8Rng::seed_from_u64(42);
    let u64_count = size / 8; // number of whole u64 values

    // Build an aligned buffer first (see `generate_aligned_data`); skipping
    // its first byte then guarantees a misaligned start.
    //
    // NOTE: this deliberately avoids the previous `Vec<u64>` ->
    // `Vec::from_raw_parts::<u8>` transmute: reconstructing a `Vec<u8>`
    // (align 1) over an allocation made for `u64` (align 8) deallocates
    // with a mismatched layout, which is undefined behavior.
    let mut buf = vec![0u8; u64_count * 8 + 8];
    let pad = buf.as_ptr().align_offset(std::mem::align_of::<u64>());
    for i in 0..u64_count {
        let value: u64 = rng.random();
        let start = pad + i * 8;
        buf[start..start + 8].copy_from_slice(&value.to_ne_bytes());
    }
    // Start one byte past the aligned offset: guaranteed unaligned.
    Bytes::from(buf).slice(pad + 1..pad + u64_count * 8)
}
/// Benchmark `bytes_to_u64_vec` over a range of sizes, comparing the
/// aligned fast path against the unaligned slow path.
fn benchmark_convert(c: &mut Criterion) {
    let mut group = c.benchmark_group("bytes_to_u64_vec");
    // 1KB to 1MB
    for &size in &[1024usize, 16384, 262144, 1048576] {
        // Same size, two layouts: one hitting the cast fast path,
        // one forcing the copy slow path.
        let cases = [
            ("aligned", generate_aligned_data(size)),
            ("unaligned", generate_unaligned_data(size)),
        ];
        for (label, input) in cases {
            group.throughput(Throughput::Bytes(input.len() as u64));
            group.bench_with_input(BenchmarkId::new(label, size), &input, |b, bytes| {
                b.iter(|| {
                    // black_box on both ends keeps the optimizer from
                    // const-folding the conversion away.
                    black_box(bytes_to_u64_vec(black_box(bytes)));
                });
            });
        }
    }
    group.finish();
}
criterion_group!(benches, benchmark_convert);
criterion_main!(benches);

View File

@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::ops::Range;
use std::ops::{Range, Rem};
use async_trait::async_trait;
use bytemuck::try_cast_slice;
use bytes::Bytes;
use common_base::range_read::RangeReader;
use fastbloom::BloomFilter;
@@ -33,6 +34,47 @@ const BLOOM_META_LEN_SIZE: u64 = 4;
/// Default prefetch size of bloom filter meta.
pub const DEFAULT_PREFETCH_SIZE: u64 = 8192; // 8KiB
/// Safely converts bytes to Vec<u64> using bytemuck for optimal performance.
/// Faster than chunking and converting each piece individually.
///
/// The input bytes are a sequence of little-endian u64s. Any trailing bytes
/// that do not form a complete u64 are ignored.
pub fn bytes_to_u64_vec(bytes: &Bytes) -> Vec<u64> {
    // Drop trailing remainder bytes; this keeps the same behavior as
    // `chunks_exact`.
    let aligned_length = bytes.len() - bytes.len().rem(std::mem::size_of::<u64>());
    let byte_slice = &bytes[..aligned_length];

    // Try fast path first: direct cast if the buffer is 8-byte aligned.
    let u64_vec = if let Ok(u64_slice) = try_cast_slice::<u8, u64>(byte_slice) {
        u64_slice.to_vec()
    } else {
        // Slow path: allocate an aligned Vec<u64> and memcpy the bytes in.
        let u64_count = byte_slice.len() / std::mem::size_of::<u64>();
        let mut u64_vec = Vec::<u64>::with_capacity(u64_count);
        // SAFETY:
        // - the destination has capacity for `u64_count * 8 == byte_slice.len()`
        //   bytes, and the source and destination cannot overlap because the
        //   Vec was freshly allocated;
        // - copying through raw pointers avoids materializing a `&mut [u8]`
        //   over uninitialized memory (which `slice::from_raw_parts_mut` on
        //   spare capacity would do, and which is undefined behavior);
        // - after the copy, exactly `u64_count` elements are fully
        //   initialized, so `set_len` is sound.
        unsafe {
            std::ptr::copy_nonoverlapping(
                byte_slice.as_ptr(),
                u64_vec.as_mut_ptr() as *mut u8,
                byte_slice.len(),
            );
            u64_vec.set_len(u64_count);
        }
        u64_vec
    };

    // The values above are the input bytes reinterpreted in native endianness;
    // `u64::from_le` converts them to the intended little-endian values.
    // It is a no-op on little-endian targets, so the fast path stays a plain copy.
    #[cfg(target_endian = "little")]
    {
        u64_vec
    }
    #[cfg(target_endian = "big")]
    {
        u64_vec.into_iter().map(u64::from_le).collect()
    }
}
/// `BloomFilterReader` reads the bloom filter from the file.
#[async_trait]
pub trait BloomFilterReader: Sync {
@@ -56,10 +98,7 @@ pub trait BloomFilterReader: Sync {
/// Reads a bloom filter with the given location.
async fn bloom_filter(&self, loc: &BloomFilterLoc) -> Result<BloomFilter> {
let bytes = self.range_read(loc.offset, loc.size as _).await?;
let vec = bytes
.chunks_exact(std::mem::size_of::<u64>())
.map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap()))
.collect();
let vec = bytes_to_u64_vec(&bytes);
let bm = BloomFilter::from_vec(vec)
.seed(&SEED)
.expected_items(loc.element_count as _);
@@ -75,10 +114,7 @@ pub trait BloomFilterReader: Sync {
let mut result = Vec::with_capacity(bss.len());
for (bs, loc) in bss.into_iter().zip(locs.iter()) {
let vec = bs
.chunks_exact(std::mem::size_of::<u64>())
.map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap()))
.collect();
let vec = bytes_to_u64_vec(&bs);
let bm = BloomFilter::from_vec(vec)
.seed(&SEED)
.expected_items(loc.element_count as _);