diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs index fa5498c64..0e75f3d89 100644 --- a/src/spatial/mod.rs +++ b/src/spatial/mod.rs @@ -4,3 +4,4 @@ pub mod bkd; pub mod delta; pub mod radix_select; pub mod triangle; +pub mod xor; diff --git a/src/spatial/xor.rs b/src/spatial/xor.rs new file mode 100644 index 000000000..af069a43a --- /dev/null +++ b/src/spatial/xor.rs @@ -0,0 +1,128 @@ +//! XOR delta compression for f64 polygon coordinates. +//! +//! Lossless compression for floating-point lat/lon coordinates using XOR delta encoding on IEEE +//! 754 bit patterns with variable-length integer encoding. Designed for per-polygon random access +//! in the document store, where each polygon compresses independently without requiring sequential +//! decompression. +//! +//! Spatially local coordinates share most high-order bits. A municipal boundary spanning 1km has +//! consecutive vertices typically within 100-500 meters, meaning their f64 bit patterns share +//! 30-40 bits. XOR reveals these common bits as zeros, which varint encoding then compresses +//! efficiently. +//! +//! The format stores the first coordinate as raw 8 bytes, then XOR deltas between consecutive +//! coordinates encoded as variable-length integers. When compression produces larger output than +//! the raw input (random data, compression-hostile patterns), the function automatically falls +//! back to storing coordinates as uncompressed 8-byte values. +//! +//! Unlike delta.rs which uses arithmetic deltas for i32 spatial coordinates in the block kd-tree, +//! this module operates on f64 bit patterns directly to preserve exact floating-point values for +//! returning to users. +use std::io::{Cursor, Read}; + +use common::VInt; + +/// Compresses f64 coordinates using XOR delta encoding with automatic raw fallback. +/// +/// Stores the first coordinate as raw bits, then computes XOR between consecutive coordinate bit +/// patterns and encodes as variable-length integers. If the compressed output would be larger than +/// raw storage (8 bytes per coordinate), automatically falls back to raw encoding. +/// +/// Returns a byte vector that can be decompressed with `decompress_f64()` to recover exact +/// original values. +pub fn compress_f64(values: &[f64]) -> Vec { + if values.is_empty() { + return Vec::new(); + } + let mut output = Vec::new(); + let mut previous = values[0].to_bits(); + output.extend_from_slice(&previous.to_le_bytes()); + for &value in &values[1..] { + let bits = value.to_bits(); + let xor = bits ^ previous; + VInt(xor).serialize_into_vec(&mut output); + previous = bits + } + if output.len() >= values.len() * 8 { + let mut output = Vec::with_capacity(values.len() * 8); + for &value in values { + output.extend_from_slice(&value.to_bits().to_le_bytes()); + } + return output; + } + output +} + +/// Decompresses f64 coordinates from XOR delta or raw encoding. +/// +/// Detects compression format by byte length - if `bytes.len() == count * 8`, data is raw and +/// copied directly. Otherwise, reads first coordinate from 8 bytes, then XOR deltas as varints, +/// reconstructing the original sequence. +/// +/// Returns exact f64 values that were passed to `compress_f64()`. +pub fn decompress_f64(bytes: &[u8], count: usize) -> Vec { + let mut values = Vec::with_capacity(count); + if bytes.len() == count * 8 { + for i in 0..count { + let bits = u64::from_le_bytes(bytes[i * 8..(i + 1) * 8].try_into().unwrap()); + values.push(f64::from_bits(bits)); + } + return values; + } + let mut cursor = Cursor::new(bytes); + + // Read first value (raw 8 bytes) + let mut first_bytes = [0u8; 8]; + cursor.read_exact(&mut first_bytes).unwrap(); + let mut previous = u64::from_le_bytes(first_bytes); + values.push(f64::from_bits(previous)); + + // Read remaining values as VInt XORs + while values.len() < count { + let xor = VInt::deserialize_u64(&mut cursor).unwrap(); + let bits = previous ^ xor; + values.push(f64::from_bits(bits)); + previous = bits; + } + + values +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress_spatial_locality() { + // Small town polygon - longitude only. + let longitudes = vec![ + 40.7580, 40.7581, 40.7582, 40.7583, 40.7584, 40.7585, 40.7586, 40.7587, + ]; + let bytes = compress_f64(&longitudes); + // Should compress well - XOR deltas will be small + assert_eq!(bytes.len(), 46); + // Should decompress to exact original values + let decompressed = decompress_f64(&bytes, longitudes.len()); + assert_eq!(longitudes, decompressed); + } + #[test] + fn test_fallback_to_raw() { + // Random, widely scattered values - poor compression + let values = vec![ + 12345.6789, + -98765.4321, + 0.00001, + 999999.999, + -0.0, + std::f64::consts::PI, + std::f64::consts::E, + 42.0, + ]; + let bytes = compress_f64(&values); + // Should fall back to raw storage + assert_eq!(bytes.len(), values.len() * 8); + // Should still decompress correctly + let decompressed = decompress_f64(&bytes, values.len()); + assert_eq!(values, decompressed); + } +}