mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 08:12:54 +00:00
137 lines
5.0 KiB
Rust
137 lines
5.0 KiB
Rust
//! XOR delta compression for f64 polygon coordinates.
|
|
//!
|
|
//! Lossless compression for floating-point lat/lon coordinates using XOR delta encoding on IEEE
|
|
//! 754 bit patterns with variable-length integer encoding. Designed for per-polygon random access
|
|
//! in the document store, where each polygon compresses independently without requiring sequential
|
|
//! decompression.
|
|
//!
|
|
//! Spatially local coordinates share most high-order bits. A municipal boundary spanning 1km has
|
|
//! consecutive vertices typically within 100-500 meters, meaning their f64 bit patterns share
|
|
//! 30-40 bits. XOR reveals these common bits as zeros, which varint encoding then compresses
|
|
//! efficiently.
|
|
//!
|
|
//! The format stores the first coordinate as raw 8 bytes, then XOR deltas between consecutive
|
|
//! coordinates encoded as variable-length integers. When compression produces larger output than
|
|
//! the raw input (random data, compression-hostile patterns), the function automatically falls
|
|
//! back to storing coordinates as uncompressed 8-byte values.
|
|
//!
|
|
//! Unlike delta.rs which uses arithmetic deltas for i32 spatial coordinates in the block kd-tree,
|
|
//! this module operates on f64 bit patterns directly to preserve exact floating-point values for
|
|
//! returning to users.
|
|
use std::io::Read;
|
|
|
|
use common::VInt;
|
|
|
|
/// Compresses f64 coordinates using XOR delta encoding with automatic raw fallback.
|
|
///
|
|
/// Stores the first coordinate as raw bits, then computes XOR between consecutive coordinate bit
|
|
/// patterns and encodes as variable-length integers. If the compressed output would be larger than
|
|
/// raw storage (8 bytes per coordinate), automatically falls back to raw encoding.
|
|
///
|
|
/// Returns a byte vector that can be decompressed with `decompress_f64()` to recover exact
|
|
/// original values.
|
|
pub fn compress_f64(values: &[f64]) -> Vec<u8> {
|
|
if values.is_empty() {
|
|
return Vec::new();
|
|
}
|
|
let mut output: Vec<u8> = Vec::new();
|
|
let mut previous: u64 = f64_to_le(values[0]);
|
|
output.extend_from_slice(&previous.to_le_bytes());
|
|
for &value in &values[1..] {
|
|
let bits = value.to_bits();
|
|
let xor = bits ^ previous;
|
|
VInt(xor).serialize_into_vec(&mut output);
|
|
previous = bits
|
|
}
|
|
if output.len() >= values.len() * 8 {
|
|
let mut output = Vec::with_capacity(values.len() * 8);
|
|
for &value in values {
|
|
output.extend_from_slice(&f64_to_le(value).to_le_bytes());
|
|
}
|
|
return output;
|
|
}
|
|
output
|
|
}
|
|
|
|
fn f64_to_le(value: f64) -> u64 {
|
|
u64::from_le_bytes(value.to_le_bytes())
|
|
}
|
|
|
|
fn f64_from_le(value: u64) -> f64 {
|
|
f64::from_le_bytes(value.to_le_bytes())
|
|
}
|
|
|
|
/// Decompresses f64 coordinates from XOR delta or raw encoding.
|
|
///
|
|
/// Detects compression format by byte length - if `bytes.len() == count * 8`, data is raw and
|
|
/// copied directly. Otherwise, reads first coordinate from 8 bytes, then XOR deltas as varints,
|
|
/// reconstructing the original sequence.
|
|
///
|
|
/// Returns exact f64 values that were passed to `compress_f64()`.
|
|
pub fn decompress_f64(mut bytes: &[u8], count: usize) -> Vec<f64> {
|
|
let mut values = Vec::with_capacity(count);
|
|
if bytes.len() == count * 8 {
|
|
for i in 0..count {
|
|
let bits = u64::from_le_bytes(bytes[i * 8..(i + 1) * 8].try_into().unwrap());
|
|
values.push(f64_from_le(bits));
|
|
}
|
|
return values;
|
|
}
|
|
let mut cursor: &mut &[u8] = &mut bytes;
|
|
|
|
// Read first value (raw 8 bytes)
|
|
let mut first_bytes = [0u8; 8];
|
|
cursor.read_exact(&mut first_bytes).unwrap();
|
|
let mut previous = u64::from_le_bytes(first_bytes);
|
|
values.push(f64::from_bits(previous));
|
|
|
|
// Read remaining values as VInt XORs
|
|
while values.len() < count {
|
|
let xor = VInt::deserialize_u64(&mut cursor).unwrap();
|
|
let bits = previous ^ xor;
|
|
values.push(f64::from_bits(bits));
|
|
previous = bits;
|
|
}
|
|
|
|
values
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod test {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_compress_spatial_locality() {
|
|
// Small town polygon - longitude only.
|
|
let longitudes = vec![
|
|
40.7580, 40.7581, 40.7582, 40.7583, 40.7584, 40.7585, 40.7586, 40.7587,
|
|
];
|
|
let bytes = compress_f64(&longitudes);
|
|
// Should compress well - XOR deltas will be small
|
|
assert_eq!(bytes.len(), 46);
|
|
// Should decompress to exact original values
|
|
let decompressed = decompress_f64(&bytes, longitudes.len());
|
|
assert_eq!(longitudes, decompressed);
|
|
}
|
|
#[test]
|
|
fn test_fallback_to_raw() {
|
|
// Random, widely scattered values - poor compression
|
|
let values = vec![
|
|
12345.6789,
|
|
-98765.4321,
|
|
0.00001,
|
|
999999.999,
|
|
-0.0,
|
|
std::f64::consts::PI,
|
|
std::f64::consts::E,
|
|
42.0,
|
|
];
|
|
let bytes = compress_f64(&values);
|
|
// Should fall back to raw storage
|
|
assert_eq!(bytes.len(), values.len() * 8);
|
|
// Should still decompress correctly
|
|
let decompressed = decompress_f64(&bytes, values.len());
|
|
assert_eq!(values, decompressed);
|
|
}
|
|
}
|