From 080fa4d1f4172b76b2e9c68df9c757693a8086d7 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 1 Jul 2025 15:40:02 +0200 Subject: [PATCH] add docs/example and Vec values to sstable (#2660) --- sstable/src/lib.rs | 47 ++++++++++++++++++++++ sstable/src/value/mod.rs | 10 +++-- sstable/src/value/vec_u32.rs | 73 +++++++++++++++++++++++++++++++++++ sstable/tests/sstable_test.rs | 49 +++++++++++++++++++++++ 4 files changed, 175 insertions(+), 4 deletions(-) create mode 100644 sstable/src/value/vec_u32.rs create mode 100644 sstable/tests/sstable_test.rs diff --git a/sstable/src/lib.rs b/sstable/src/lib.rs index ea3965899..82461c363 100644 --- a/sstable/src/lib.rs +++ b/sstable/src/lib.rs @@ -1,3 +1,40 @@ +//! `tantivy_sstable` is a crate that provides a sorted string table data structure. +//! +//! It is used in `tantivy` to store the term dictionary. +//! +//! A `sstable` is a map of sorted `&[u8]` keys to values. +//! The keys are encoded using incremental encoding. +//! +//! Values and keys are compressed using zstd with the default feature flag `zstd-compression`. +//! +//! # Example +//! +//! Here is an example of how to create and search an `sstable`: +//! +//! ```rust +//! use common::OwnedBytes; +//! use tantivy_sstable::{Dictionary, MonotonicU64SSTable}; +//! +//! // Create a new sstable in memory. +//! let mut builder = Dictionary::::builder(Vec::new()).unwrap(); +//! builder.insert(b"apple", &1).unwrap(); +//! builder.insert(b"banana", &2).unwrap(); +//! builder.insert(b"orange", &3).unwrap(); +//! let sstable_bytes = builder.finish().unwrap(); +//! +//! // Open the sstable. +//! let sstable = +//! Dictionary::::from_bytes(OwnedBytes::new(sstable_bytes)).unwrap(); +//! +//! // Search for a key. +//! let value = sstable.get(b"banana").unwrap(); +//! assert_eq!(value, Some(2)); +//! +//! // Search for a non-existent key. +//! let value = sstable.get(b"grape").unwrap(); +//! assert_eq!(value, None); +//! ``` + use std::io::{self, Write}; use std::ops::Range; @@ -19,6 +56,7 @@ pub use streamer::{Streamer, StreamerBuilder}; mod block_reader; use common::{BinarySerializable, OwnedBytes}; +use value::{VecU32ValueReader, VecU32ValueWriter}; pub use self::block_reader::BlockReader; pub use self::delta::{DeltaReader, DeltaWriter}; @@ -130,6 +168,15 @@ impl SSTable for RangeSSTable { type ValueWriter = RangeValueWriter; } +/// SSTable associating keys to Vec. +pub struct VecU32ValueSSTable; + +impl SSTable for VecU32ValueSSTable { + type Value = Vec; + type ValueReader = VecU32ValueReader; + type ValueWriter = VecU32ValueWriter; +} + /// SSTable reader. pub struct Reader { key: Vec, diff --git a/sstable/src/value/mod.rs b/sstable/src/value/mod.rs index be225e55d..bc90c4d7f 100644 --- a/sstable/src/value/mod.rs +++ b/sstable/src/value/mod.rs @@ -1,10 +1,16 @@ pub(crate) mod index; mod range; mod u64_monotonic; +mod vec_u32; mod void; use std::io; +pub use range::{RangeValueReader, RangeValueWriter}; +pub use u64_monotonic::{U64MonotonicValueReader, U64MonotonicValueWriter}; +pub use vec_u32::{VecU32ValueReader, VecU32ValueWriter}; +pub use void::{VoidValueReader, VoidValueWriter}; + /// `ValueReader` is a trait describing the contract of something /// reading blocks of value, and offering random access within this values. pub trait ValueReader: Default { @@ -40,10 +46,6 @@ pub trait ValueWriter: Default { fn clear(&mut self); } -pub use range::{RangeValueReader, RangeValueWriter}; -pub use u64_monotonic::{U64MonotonicValueReader, U64MonotonicValueWriter}; -pub use void::{VoidValueReader, VoidValueWriter}; - fn deserialize_vint_u64(data: &mut &[u8]) -> u64 { let (num_bytes, val) = super::vint::deserialize_read(data); *data = &data[num_bytes..]; diff --git a/sstable/src/value/vec_u32.rs b/sstable/src/value/vec_u32.rs new file mode 100644 index 000000000..3ea278d0e --- /dev/null +++ b/sstable/src/value/vec_u32.rs @@ -0,0 +1,73 @@ +use std::io; + +use super::{ValueReader, ValueWriter}; + +#[derive(Default)] +pub struct VecU32ValueReader { + vals: Vec>, +} + +impl ValueReader for VecU32ValueReader { + type Value = Vec; + + #[inline(always)] + fn value(&self, idx: usize) -> &Self::Value { + &self.vals[idx] + } + + fn load(&mut self, mut data: &[u8]) -> io::Result { + let original_num_bytes = data.len(); + self.vals.clear(); + + // The first 4 bytes are the number of blocks + let num_blocks = u32::from_le_bytes(data[..4].try_into().unwrap()) as usize; + data = &data[4..]; + + for _ in 0..num_blocks { + // Each block starts with a 4-byte length + let segment_len = u32::from_le_bytes(data[..4].try_into().unwrap()) as usize; + data = &data[4..]; + + // Read the segment IDs for this block + let mut segment_ids = Vec::with_capacity(segment_len); + for _ in 0..segment_len { + let segment_id = u32::from_le_bytes(data[..4].try_into().unwrap()); + segment_ids.push(segment_id); + data = &data[4..]; + } + self.vals.push(segment_ids); + } + + // Return the number of bytes consumed + Ok(original_num_bytes - data.len()) + } +} + +#[derive(Default)] +pub struct VecU32ValueWriter { + vals: Vec>, +} + +impl ValueWriter for VecU32ValueWriter { + type Value = Vec; + + fn write(&mut self, val: &Self::Value) { + self.vals.push(val.to_vec()); + } + + fn serialize_block(&self, output: &mut Vec) { + let num_blocks = self.vals.len() as u32; + output.extend_from_slice(&num_blocks.to_le_bytes()); + for vals in &self.vals { + let len = vals.len() as u32; + output.extend_from_slice(&len.to_le_bytes()); + for &segment_id in vals.iter() { + output.extend_from_slice(&segment_id.to_le_bytes()); + } + } + } + + fn clear(&mut self) { + self.vals.clear(); + } +} diff --git a/sstable/tests/sstable_test.rs b/sstable/tests/sstable_test.rs new file mode 100644 index 000000000..c08547404 --- /dev/null +++ b/sstable/tests/sstable_test.rs @@ -0,0 +1,49 @@ +use common::OwnedBytes; +use tantivy_sstable::{Dictionary, MonotonicU64SSTable, VecU32ValueSSTable}; + +#[test] +fn test_create_and_search_sstable() { + // Create a new sstable in memory. + let mut builder = Dictionary::::builder(Vec::new()).unwrap(); + builder.insert(b"apple", &1).unwrap(); + builder.insert(b"banana", &2).unwrap(); + builder.insert(b"orange", &3).unwrap(); + let sstable_bytes = builder.finish().unwrap(); + + // Open the sstable. + let sstable = + Dictionary::::from_bytes(OwnedBytes::new(sstable_bytes)).unwrap(); + + // Search for a key. + let value = sstable.get(b"banana").unwrap(); + assert_eq!(value, Some(2)); + + // Search for a non-existent key. + let value = sstable.get(b"blub").unwrap(); + assert_eq!(value, None); +} + +#[test] +fn test_custom_value_sstable() { + // Create a new sstable with custom values. + let mut builder = Dictionary::::builder(Vec::new()).unwrap(); + builder.set_block_len(4096); // Ensure both values are in the same block + builder.insert(b"first", &vec![1, 2, 3]).unwrap(); + builder.insert(b"second", &vec![4, 5]).unwrap(); + let sstable_bytes = builder.finish().unwrap(); + + // Open the sstable. + let sstable = + Dictionary::::from_bytes(OwnedBytes::new(sstable_bytes)).unwrap(); + + let mut stream = sstable.stream().unwrap(); + assert!(stream.advance()); + assert_eq!(stream.key(), b"first"); + assert_eq!(stream.value(), &vec![1, 2, 3]); + + assert!(stream.advance()); + assert_eq!(stream.key(), b"second"); + assert_eq!(stream.value(), &vec![4, 5]); + + assert!(!stream.advance()); +}