add docs/example and Vec<u32> values to sstable (#2660)

This commit is contained in:
PSeitz
2025-07-01 15:40:02 +02:00
committed by GitHub
parent 988c2b35e7
commit 080fa4d1f4
4 changed files with 175 additions and 4 deletions

View File

@@ -1,3 +1,40 @@
//! `tantivy_sstable` is a crate that provides a sorted string table data structure.
//!
//! It is used in `tantivy` to store the term dictionary.
//!
//! An `sstable` is a map of sorted `&[u8]` keys to values.
//! The keys are encoded using incremental encoding.
//!
//! Values and keys are compressed using zstd with the default feature flag `zstd-compression`.
//!
//! # Example
//!
//! Here is an example of how to create and search an `sstable`:
//!
//! ```rust
//! use common::OwnedBytes;
//! use tantivy_sstable::{Dictionary, MonotonicU64SSTable};
//!
//! // Create a new sstable in memory.
//! let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
//! builder.insert(b"apple", &1).unwrap();
//! builder.insert(b"banana", &2).unwrap();
//! builder.insert(b"orange", &3).unwrap();
//! let sstable_bytes = builder.finish().unwrap();
//!
//! // Open the sstable.
//! let sstable =
//! Dictionary::<MonotonicU64SSTable>::from_bytes(OwnedBytes::new(sstable_bytes)).unwrap();
//!
//! // Search for a key.
//! let value = sstable.get(b"banana").unwrap();
//! assert_eq!(value, Some(2));
//!
//! // Search for a non-existent key.
//! let value = sstable.get(b"grape").unwrap();
//! assert_eq!(value, None);
//! ```
use std::io::{self, Write};
use std::ops::Range;
@@ -19,6 +56,7 @@ pub use streamer::{Streamer, StreamerBuilder};
mod block_reader;
use common::{BinarySerializable, OwnedBytes};
use value::{VecU32ValueReader, VecU32ValueWriter};
pub use self::block_reader::BlockReader;
pub use self::delta::{DeltaReader, DeltaWriter};
@@ -130,6 +168,15 @@ impl SSTable for RangeSSTable {
type ValueWriter = RangeValueWriter;
}
/// SSTable associating keys to `Vec<u32>` values.
///
/// Values are decoded with [`VecU32ValueReader`] and encoded with [`VecU32ValueWriter`].
pub struct VecU32ValueSSTable;
// Wires the `Vec<u32>` value type and its reader/writer pair into the `SSTable` trait.
impl SSTable for VecU32ValueSSTable {
type Value = Vec<u32>;
type ValueReader = VecU32ValueReader;
type ValueWriter = VecU32ValueWriter;
}
/// SSTable reader.
pub struct Reader<TValueReader> {
key: Vec<u8>,

View File

@@ -1,10 +1,16 @@
pub(crate) mod index;
mod range;
mod u64_monotonic;
mod vec_u32;
mod void;
use std::io;
pub use range::{RangeValueReader, RangeValueWriter};
pub use u64_monotonic::{U64MonotonicValueReader, U64MonotonicValueWriter};
pub use vec_u32::{VecU32ValueReader, VecU32ValueWriter};
pub use void::{VoidValueReader, VoidValueWriter};
/// `ValueReader` is a trait describing the contract of something
/// reading blocks of values, and offering random access within these values.
pub trait ValueReader: Default {
@@ -40,10 +46,6 @@ pub trait ValueWriter: Default {
fn clear(&mut self);
}
pub use range::{RangeValueReader, RangeValueWriter};
pub use u64_monotonic::{U64MonotonicValueReader, U64MonotonicValueWriter};
pub use void::{VoidValueReader, VoidValueWriter};
fn deserialize_vint_u64(data: &mut &[u8]) -> u64 {
let (num_bytes, val) = super::vint::deserialize_read(data);
*data = &data[num_bytes..];

View File

@@ -0,0 +1,73 @@
use std::io;
use super::{ValueReader, ValueWriter};
/// Reader for blocks of `Vec<u32>` values.
///
/// A block is decoded eagerly by `load`; afterwards every value of that block can be
/// accessed by index through `value`.
///
/// Block layout (all integers are little-endian `u32`):
///
/// ```text
/// num_values | (num_elements | element * num_elements) * num_values
/// ```
#[derive(Default)]
pub struct VecU32ValueReader {
    /// Values of the most recently loaded block, in serialization order.
    vals: Vec<Vec<u32>>,
}

/// Builds the error reported when a block ends before all announced data could be read.
fn truncated_block_error() -> io::Error {
    io::Error::new(
        io::ErrorKind::UnexpectedEof,
        "truncated Vec<u32> value block",
    )
}

/// Reads a little-endian `u32` from the front of `data` and advances `data` past it.
///
/// Returns an `UnexpectedEof` error if fewer than 4 bytes are left.
fn read_u32_le(data: &mut &[u8]) -> io::Result<u32> {
    let bytes: &[u8] = *data;
    if bytes.len() < 4 {
        return Err(truncated_block_error());
    }
    *data = &bytes[4..];
    Ok(u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]))
}

impl ValueReader for VecU32ValueReader {
    type Value = Vec<u32>;

    /// Returns the `idx`-th value of the loaded block.
    ///
    /// # Panics
    ///
    /// Panics if `idx` is not smaller than the number of values in the loaded block.
    #[inline(always)]
    fn value(&self, idx: usize) -> &Self::Value {
        &self.vals[idx]
    }

    /// Decodes one block from the front of `data`, replacing any previously loaded values.
    ///
    /// Returns the number of bytes consumed; bytes following the block are left untouched.
    ///
    /// # Errors
    ///
    /// Returns an `UnexpectedEof` error if `data` ends before the announced number of values
    /// or elements has been read. In that case the reader may hold the values decoded so far.
    fn load(&mut self, mut data: &[u8]) -> io::Result<usize> {
        let original_num_bytes = data.len();
        self.vals.clear();

        // The block starts with the number of values it contains.
        let num_vals = read_u32_le(&mut data)? as usize;
        // Every value takes at least its 4-byte length prefix, so the remaining input bounds
        // how many values can really follow. This keeps a corrupt count from triggering a
        // huge reservation.
        self.vals.reserve(num_vals.min(data.len() / 4));

        for _ in 0..num_vals {
            // Each value starts with its number of elements.
            let num_elems = read_u32_le(&mut data)? as usize;
            // Validate the announced payload size against the remaining input *before*
            // allocating, so corrupt lengths yield an error instead of a panic or an
            // oversized allocation.
            let payload_len = num_elems
                .checked_mul(4)
                .filter(|&len| len <= data.len())
                .ok_or_else(truncated_block_error)?;
            let (payload, rest) = data.split_at(payload_len);
            let val: Vec<u32> = payload
                .chunks_exact(4)
                .map(|word| u32::from_le_bytes([word[0], word[1], word[2], word[3]]))
                .collect();
            self.vals.push(val);
            data = rest;
        }

        // Number of bytes consumed by this block.
        Ok(original_num_bytes - data.len())
    }
}
/// Writer buffering `Vec<u32>` values until the current block is serialized.
///
/// Serialized layout (all integers are little-endian `u32`): the number of buffered values,
/// then for each value its element count followed by its elements.
#[derive(Default)]
pub struct VecU32ValueWriter {
    /// Values written since the last `clear`, in insertion order.
    vals: Vec<Vec<u32>>,
}

impl ValueWriter for VecU32ValueWriter {
    type Value = Vec<u32>;

    /// Buffers a copy of `val` for the current block.
    fn write(&mut self, val: &Self::Value) {
        self.vals.push(val.clone());
    }

    /// Appends the encoded form of all buffered values to `output`.
    ///
    /// Counts are stored as `u32`, obtained with an `as` cast from the `usize` lengths.
    fn serialize_block(&self, output: &mut Vec<u8>) {
        let mut put_u32 = |word: u32| output.extend_from_slice(&word.to_le_bytes());

        put_u32(self.vals.len() as u32);
        for val in &self.vals {
            put_u32(val.len() as u32);
            val.iter().copied().for_each(&mut put_u32);
        }
    }

    /// Drops all buffered values.
    fn clear(&mut self) {
        self.vals.clear();
    }
}

View File

@@ -0,0 +1,49 @@
use common::OwnedBytes;
use tantivy_sstable::{Dictionary, MonotonicU64SSTable, VecU32ValueSSTable};
/// Round-trips a small `MonotonicU64SSTable` dictionary through memory and looks keys up.
#[test]
fn test_create_and_search_sstable() {
    // Build the dictionary into an in-memory buffer.
    let mut writer = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
    writer.insert(b"apple", &1).unwrap();
    writer.insert(b"banana", &2).unwrap();
    writer.insert(b"orange", &3).unwrap();
    let serialized = writer.finish().unwrap();

    // Re-open the dictionary from the serialized bytes.
    let dict =
        Dictionary::<MonotonicU64SSTable>::from_bytes(OwnedBytes::new(serialized)).unwrap();

    // A key that was inserted maps to its value.
    assert_eq!(dict.get(b"banana").unwrap(), Some(2));

    // A key that was never inserted yields `None`.
    assert_eq!(dict.get(b"blub").unwrap(), None);
}
/// Round-trips `Vec<u32>` values through a `VecU32ValueSSTable` dictionary and streams them
/// back in key order.
#[test]
fn test_custom_value_sstable() {
    let entries: [(&[u8], Vec<u32>); 2] = [(b"first", vec![1, 2, 3]), (b"second", vec![4, 5])];

    // Build the dictionary into an in-memory buffer.
    let mut writer = Dictionary::<VecU32ValueSSTable>::builder(Vec::new()).unwrap();
    writer.set_block_len(4096); // Ensure both values are in the same block
    for (key, val) in &entries {
        writer.insert(*key, val).unwrap();
    }
    let serialized = writer.finish().unwrap();

    // Re-open the dictionary from the serialized bytes.
    let dict =
        Dictionary::<VecU32ValueSSTable>::from_bytes(OwnedBytes::new(serialized)).unwrap();

    // The stream must yield exactly the inserted entries, in order, and then end.
    let mut cursor = dict.stream().unwrap();
    for (key, val) in &entries {
        assert!(cursor.advance());
        assert_eq!(cursor.key(), *key);
        assert_eq!(cursor.value(), val);
    }
    assert!(!cursor.advance());
}