diff --git a/Cargo.toml b/Cargo.toml index 2cdb25b1c..cb0c2b8bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -93,6 +93,7 @@ time = { version = "0.3.10", features = ["serde-well-known", "macros"] } postcard = { version = "1.0.4", features = [ "use-std", ], default-features = false } +geojson = "0.24.2" [target.'cfg(not(windows))'.dev-dependencies] criterion = { version = "0.5", default-features = false } diff --git a/src/spatial/bkd.rs b/src/spatial/bkd.rs new file mode 100644 index 000000000..99476e1c2 --- /dev/null +++ b/src/spatial/bkd.rs @@ -0,0 +1,760 @@ +//! Block kd-tree spatial indexing for triangulated polygons. +//! +//! Implements an immutable bulk-loaded spatial index using recursive median partitioning on +//! bounding box dimensions. Each leaf stores up to 512 triangles with delta-compressed coordinates +//! and doc IDs. The tree provides three query types (intersects, within, contains) that use exact +//! integer arithmetic for geometric predicates and accumulate results in bit sets for efficient +//! deduplication across leaves. +//! +//! The serialized format stores compressed leaf pages followed by the tree structure (leaf and +//! branch nodes), enabling zero-copy access through memory-mapped segments without upfront +//! decompression. +use std::io; +use std::io::{Seek, Write}; + +use common::BitSet; + +use crate::spatial::delta::{compress, decompress, Compressible}; +use crate::spatial::triangle::Triangle; + +#[derive(Clone, Copy)] +struct SpreadSurvey { + min: i32, + max: i32, +} + +impl SpreadSurvey { + fn survey(&mut self, value: i32) { + self.min = self.min.min(value); + self.max = self.max.max(value); + } + fn spread(&self) -> i32 { + self.max - self.min + } +} + +impl Default for SpreadSurvey { + fn default() -> Self { + SpreadSurvey { + min: i32::MAX, + max: i32::MIN, + } + } +} + +#[derive(Clone, Copy)] +struct BoundingBoxSurvey { + bbox: [i32; 4], +} + +impl BoundingBoxSurvey { + fn survey(&mut self, triangle: &Triangle) { + self.bbox[0] = triangle.words[0].min(self.bbox[0]); + self.bbox[1] = triangle.words[1].min(self.bbox[1]); + self.bbox[2] = triangle.words[2].max(self.bbox[2]); + self.bbox[3] = triangle.words[3].max(self.bbox[3]); + } + fn bbox(&self) -> [i32; 4] { + self.bbox.clone() + } +} + +impl Default for BoundingBoxSurvey { + fn default() -> Self { + BoundingBoxSurvey { + bbox: [i32::MAX, i32::MAX, i32::MIN, i32::MIN], + } + } +} + +enum BuildNode { + Branch { + bbox: [i32; 4], + left: Box, + right: Box, + }, + Leaf { + bbox: [i32; 4], + pos: u64, + len: u16, + }, +} + +struct CompressibleTriangleI32<'a> { + triangles: &'a [Triangle], + dimension: usize, +} + +impl<'a> CompressibleTriangleI32<'a> { + fn new(triangles: &'a [Triangle], dimension: usize) -> Self { + CompressibleTriangleI32 { + triangles: triangles, + dimension: dimension, + } + } +} + +impl<'a> Compressible for CompressibleTriangleI32<'a> { + type Value = i32; + fn len(&self) -> usize { + self.triangles.len() + } + fn get(&self, i: usize) -> i32 { + self.triangles[i].words[self.dimension] + } +} + +struct CompressibleTriangleDocID<'a> { + triangles: &'a [Triangle], +} + +impl<'a> CompressibleTriangleDocID<'a> { + fn new(triangles: &'a [Triangle]) -> Self { + CompressibleTriangleDocID { + triangles: triangles, + } + } +} + +impl<'a> Compressible for CompressibleTriangleDocID<'a> { + type Value = u32; + fn len(&self) -> usize { + self.triangles.len() + } + fn get(&self, i: usize) -> u32 { + self.triangles[i].doc_id + } +} + +// Leaf pages are first the count of triangles, followed by delta encoded doc_ids, followed by +// the delta encoded words in order. We will then have the length of the page. We build a tree +// after the pages with leaf nodes and branch nodes. Leaf nodes will contain the bounding box +// of the leaf followed position and length of the page. The leaf node is a level of direction +// to store the position and length of the page in a format that is easy to read directly from +// the mapping. + +// We do not compress the tree nodes. We read them directly from the mapping. + +// +fn write_leaf_pages( + triangles: &mut [Triangle], + write: &mut W, +) -> io::Result { + if triangles.len() <= 512 { + let pos = write.stream_position()?; + let mut spreads = [SpreadSurvey::default(); 4]; + let mut bounding_box = BoundingBoxSurvey::default(); + for triangle in triangles.iter() { + for i in 0..4 { + spreads[i].survey(triangle.words[i]); + } + bounding_box.survey(triangle); + } + let mut max_spread = spreads[0].spread(); + let mut dimension = 0; + for i in 1..4 { + let current_spread = spreads[i].spread(); + if current_spread > max_spread { + dimension = i; + max_spread = current_spread; + } + } + write.write_all(&(triangles.len() as u16).to_le_bytes())?; + triangles.sort_by_key(|t| t.words[dimension]); + compress(&CompressibleTriangleDocID::new(triangles), write)?; + let compressible = [ + CompressibleTriangleI32::new(triangles, 0), + CompressibleTriangleI32::new(triangles, 1), + CompressibleTriangleI32::new(triangles, 2), + CompressibleTriangleI32::new(triangles, 3), + CompressibleTriangleI32::new(triangles, 4), + CompressibleTriangleI32::new(triangles, 5), + CompressibleTriangleI32::new(triangles, 6), + ]; + for i in 0..7 { + compress(&compressible[i], write)?; + } + let len = write.stream_position()? - pos; + Ok(BuildNode::Leaf { + bbox: bounding_box.bbox(), + pos: pos, + len: len as u16, + }) + } else { + let mut spreads = [SpreadSurvey::default(); 4]; + let mut bounding_box = BoundingBoxSurvey::default(); + for triangle in triangles.iter() { + for i in 0..4 { + spreads[i].survey(triangle.words[i]); + } + bounding_box.survey(triangle); + } + let mut max_spread = spreads[0].spread(); + let mut dimension = 0; + for i in 0..4 { + let current_spread = spreads[i].spread(); + if current_spread > max_spread { + dimension = i; + max_spread = current_spread; + } + } + let mid = triangles.len() / 2; + triangles.select_nth_unstable_by_key(mid, |t| t.words[dimension]); + let partition = triangles[mid].words[dimension]; + let mut split_point = mid + 1; + while split_point < triangles.len() && triangles[split_point].words[dimension] == partition + { + split_point += 1; + } + if split_point == triangles.len() { + split_point = mid; // Force split at midpoint index + } + let (left, right) = triangles.split_at_mut(split_point); + let left_node = write_leaf_pages(left, write)?; + let right_node = write_leaf_pages(right, write)?; + Ok(BuildNode::Branch { + bbox: bounding_box.bbox(), + left: Box::new(left_node), + right: Box::new(right_node), + }) + } +} + +fn write_leaf_nodes(node: &BuildNode, write: &mut W) -> io::Result<()> { + match node { + BuildNode::Branch { + bbox: _, + left, + right, + } => { + write_leaf_nodes(right, write)?; + write_leaf_nodes(left, write)?; + } + BuildNode::Leaf { bbox, pos, len } => { + for &dimension in bbox.iter() { + write.write_all(&dimension.to_le_bytes())?; + } + write.write_all(&pos.to_le_bytes())?; + write.write_all(&len.to_le_bytes())?; + write.write_all(&[0u8; 6])?; + } + } + Ok(()) +} + +fn write_branch_nodes( + node: &BuildNode, + branch_offset: &mut i32, + leaf_offset: &mut i32, + write: &mut W, +) -> io::Result { + match node { + BuildNode::Leaf { .. } => { + let pos = *leaf_offset; + *leaf_offset -= 1; + Ok(pos * size_of::() as i32) + } + BuildNode::Branch { bbox, left, right } => { + let left = write_branch_nodes(left, branch_offset, leaf_offset, write)?; + let right = write_branch_nodes(right, branch_offset, leaf_offset, write)?; + for &val in bbox { + write.write_all(&val.to_le_bytes())?; + } + write.write_all(&left.to_le_bytes())?; + write.write_all(&right.to_le_bytes())?; + write.write_all(&[0u8; 8])?; + let pos = *branch_offset; + *branch_offset += 1; + Ok(pos * size_of::() as i32) + } + } +} + +/// Builds and serializes a block kd-tree for spatial indexing of triangles. +/// +/// Takes a collection of triangles and constructs a complete block kd-tree, writing both the +/// compressed leaf pages and tree structure to the output. The tree uses recursive median +/// partitioning on the dimension with maximum spread, storing up to 512 triangles per leaf. +/// +/// The output format consists of: +/// - Version header (u16) +/// - Compressed leaf pages (delta-encoded doc_ids and triangle coordinates) +/// - 32-byte aligned tree structure (leaf nodes, then branch nodes) +/// - Footer with triangle count, root offset, and branch position +/// +/// The `triangles` slice will be reordered during tree construction as partitioning sorts by the +/// selected dimension at each level. +pub fn write_block_kd_tree( + triangles: &mut [Triangle], + write: &mut W, +) -> io::Result<()> { + write.write_all(&1u16.to_le_bytes())?; + let tree = write_leaf_pages(triangles, write)?; + let current = write.stream_position()?; + let aligned = (current + 31) & !31; + let padding = aligned - current; + write.write_all(&vec![0u8; padding as usize])?; + write_leaf_nodes(&tree, write)?; + let branch_position = write.stream_position()?; + let mut branch_offset: i32 = 0; + let mut leaf_offset: i32 = -1; + let root = write_branch_nodes(&tree, &mut branch_offset, &mut leaf_offset, write)?; + write.write_all(&triangles.len().to_le_bytes())?; + write.write_all(&root.to_le_bytes())?; + write.write_all(&branch_position.to_le_bytes())?; + Ok(()) +} + +fn decompress_leaf(data: &[u8]) -> io::Result> { + let count = u16::from_le_bytes([data[0], data[1]]) as usize; + let mut offset = 2; + let mut triangles = Vec::with_capacity(count); + offset += decompress::(&data[offset..], count, |_, doc_id| { + triangles.push(Triangle::skeleton(doc_id)) + })?; + for i in 0..7 { + offset += decompress::(&data[offset..], count, |j, word| { + triangles[j].words[i] = word + })?; + } + Ok(triangles) +} + +#[repr(C)] +struct BranchNode { + bbox: [i32; 4], + left: i32, + right: i32, + pad: [u8; 8], +} + +#[repr(C)] +struct LeafNode { + bbox: [i32; 4], + pos: u64, + len: u16, + pad: [u8; 6], +} + +/// A read-only view into a serialized block kd-tree segment. +/// +/// Provides access to the tree structure and compressed leaf data through memory-mapped or +/// buffered byte slices. The segment contains compressed leaf pages followed by the tree structure +/// (leaf nodes and branch nodes), with a footer containing metadata for locating the root and +/// interpreting offsets. +pub struct Segment<'a> { + data: &'a [u8], + branch_position: u64, + /// Offset to the root of the tree, used as the starting point for traversal. + pub root_offset: i32, +} + +impl<'a> Segment<'a> { + /// Creates a new segment from serialized block kd-tree data. + /// + /// Reads the footer metadata from the last 12 bytes to locate the tree structure and root + /// node. + pub fn new(data: &'a [u8]) -> Self { + Segment { + data: data, + branch_position: u64::from_le_bytes(data[data.len() - 8..].try_into().unwrap()), + root_offset: i32::from_le_bytes( + data[data.len() - 12..data.len() - 8].try_into().unwrap(), + ), + } + } + fn bounding_box(&self, offset: i32) -> &[i32; 4] { + unsafe { + let byte_offset = (self.branch_position as i64 + offset as i64) as usize; + let ptr = self.data.as_ptr().add(byte_offset); + &*ptr.cast::<[i32; 4]>() + } + } + fn branch_node(&self, offset: i32) -> &BranchNode { + unsafe { + let byte_offset = (self.branch_position as i64 + offset as i64) as usize; + let ptr = self.data.as_ptr().add(byte_offset); + &*ptr.cast::() + } + } + fn leaf_node(&self, offset: i32) -> &LeafNode { + unsafe { + let byte_offset = (self.branch_position as i64 + offset as i64) as usize; + let ptr = self.data.as_ptr().add(byte_offset); + &*ptr.cast::() + } + } + fn leaf_page(&self, leaf_node: &LeafNode) -> &[u8] { + &self.data[(leaf_node.pos as usize)..(leaf_node.pos as usize + leaf_node.len as usize)] + } +} + +fn collect_all_docs(segment: &Segment, offset: i32, result: &mut BitSet) -> io::Result<()> { + if offset < 0 { + let leaf_node = segment.leaf_node(offset); + let data = segment.leaf_page(leaf_node); + let count = u16::from_le_bytes([data[0], data[1]]) as usize; + decompress::(&data[2..], count, |_, doc_id| result.insert(doc_id))?; + } else { + let branch_node = segment.branch_node(offset); + collect_all_docs(segment, branch_node.left, result)?; + collect_all_docs(segment, branch_node.right, result)?; + } + Ok(()) +} + +fn bbox_within(bbox: &[i32; 4], query: &[i32; 4]) -> bool { + bbox[0] >= query[0] && // min_y >= query_min_y + bbox[1] >= query[1] && // min_x >= query_min_x + bbox[2] <= query[2] && // max_y <= query_max_y + bbox[3] <= query[3] // max_x <= query_max_x +} + +fn bbox_intersects(bbox: &[i32; 4], query: &[i32; 4]) -> bool { + !(bbox[2] < query[0] || bbox[0] > query[2] || bbox[3] < query[1] || bbox[1] > query[3]) +} + +/// Finds documents with triangles that intersect the query bounding box. +/// +/// Traverses the tree starting at `offset` (typically `segment.root_offset`), pruning subtrees +/// whose bounding boxes don't intersect the query. When a node's bbox is entirely within the +/// query, all its documents are bulk-collected. Otherwise, individual triangles are tested using +/// exact geometric predicates. +/// +/// The query is `[min_y, min_x, max_y, max_x]` in integer coordinates. Documents are inserted into +/// the `result` BitSet, which automatically deduplicates when the same document appears in +/// multiple leaves. +pub fn search_intersects( + segment: &Segment, + offset: i32, + query: &[i32; 4], + result: &mut BitSet, +) -> io::Result<()> { + let bbox = segment.bounding_box(offset); + // bbox doesn't intersect query → skip entire subtree + if !bbox_intersects(bbox, query) { + } + // bbox entirely within query → all triangles intersect + else if bbox_within(bbox, query) { + collect_all_docs(segment, offset, result)?; + } else if offset < 0 { + // bbox crosses query → test each triangle + let leaf_node = segment.leaf_node(offset); + let triangles = decompress_leaf(segment.leaf_page(leaf_node))?; + for triangle in &triangles { + if triangle_intersects(triangle, query) { + result.insert(triangle.doc_id); // BitSet deduplicates + } + } + } else { + let branch_node = segment.branch_node(offset); + // bbox crosses query → must check children + search_intersects(segment, branch_node.left, query, result)?; + search_intersects(segment, branch_node.right, query, result)?; + } + Ok(()) +} + +fn line_intersects_line( + x1: i32, + y1: i32, + x2: i32, + y2: i32, + x3: i32, + y3: i32, + x4: i32, + y4: i32, +) -> bool { + // Cast to i128 to prevent overflow in coordinate arithmetic + let x1 = x1 as i128; + let y1 = y1 as i128; + let x2 = x2 as i128; + let y2 = y2 as i128; + let x3 = x3 as i128; + let y3 = y3 as i128; + let x4 = x4 as i128; + let y4 = y4 as i128; + + // Proper segment-segment intersection test + let d = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4); + if d == 0 { + // parallel + return false; + } + + let t = (x1 - x3) * (y3 - y4) - (y1 - y3) * (x3 - x4); + let u = -((x1 - x2) * (y1 - y3) - (y1 - y2) * (x1 - x3)); + + if d > 0 { + t >= 0 && t <= d && u >= 0 && u <= d + } else { + t <= 0 && t >= d && u <= 0 && u >= d + } +} + +fn edge_intersects_bbox(x1: i32, y1: i32, x2: i32, y2: i32, bbox: &[i32; 4]) -> bool { + // Test against all 4 rectangle edges, bottom, right, top, left. + line_intersects_line(x1, y1, x2, y2, bbox[1], bbox[0], bbox[3], bbox[0]) + || line_intersects_line(x1, y1, x2, y2, bbox[3], bbox[0], bbox[3], bbox[2]) + || line_intersects_line(x1, y1, x2, y2, bbox[3], bbox[2], bbox[1], bbox[2]) + || line_intersects_line(x1, y1, x2, y2, bbox[1], bbox[2], bbox[1], bbox[0]) +} + +fn edge_crosses_bbox(x1: i32, y1: i32, x2: i32, y2: i32, bbox: &[i32; 4]) -> bool { + // Edge has endpoint outside while other is inside (crosses boundary) + let p1_inside = y1 >= bbox[0] && x1 >= bbox[1] && y1 <= bbox[2] && x1 <= bbox[3]; + let p2_inside = y2 >= bbox[0] && x2 >= bbox[1] && y2 <= bbox[2] && x2 <= bbox[3]; + p1_inside != p2_inside +} + +fn triangle_within(triangle: &Triangle, query: &[i32; 4]) -> bool { + let tri_bbox = &triangle.words[0..4]; + + // Triangle bbox entirely within query → WITHIN + if tri_bbox[0] >= query[0] + && tri_bbox[1] >= query[1] + && tri_bbox[2] <= query[2] + && tri_bbox[3] <= query[3] + { + return true; + } + + // Triangle bbox entirely outside → NOT WITHIN + if tri_bbox[2] < query[0] + || tri_bbox[3] < query[1] + || tri_bbox[0] > query[2] + || tri_bbox[1] > query[3] + { + return false; + } + + // Decode vertices. + let ([ay, ax, by, bx, cy, cx], [ab, bc, ca]) = triangle.decode(); + + // Check each edge - if boundary edge crosses query bbox, NOT WITHIN + if ab && edge_crosses_bbox(ax, ay, bx, by, query) { + return false; + } + if bc && edge_crosses_bbox(bx, by, cx, cy, query) { + return false; + } + if ca && edge_crosses_bbox(cx, cy, ax, ay, query) { + return false; + } + + // No boundary edges cross out + true +} + +fn point_in_triangle( + px: i32, + py: i32, + ax: i32, + ay: i32, + bx: i32, + by: i32, + cx: i32, + cy: i32, +) -> bool { + let v0x = (cx - ax) as i128; + let v0y = (cy - ay) as i128; + let v1x = (bx - ax) as i128; + let v1y = (by - ay) as i128; + let v2x = (px - ax) as i128; + let v2y = (py - ay) as i128; + + let dot00 = v0x * v0x + v0y * v0y; + let dot01 = v0x * v1x + v0y * v1y; + let dot02 = v0x * v2x + v0y * v2y; + let dot11 = v1x * v1x + v1y * v1y; + let dot12 = v1x * v2x + v1y * v2y; + + let denom = dot00 * dot11 - dot01 * dot01; + if denom == 0 { + return false; + } + + let u = dot11 * dot02 - dot01 * dot12; + let v = dot00 * dot12 - dot01 * dot02; + + u >= 0 && v >= 0 && u + v <= denom +} + +fn triangle_intersects(triangle: &Triangle, query: &[i32; 4]) -> bool { + let tri_bbox = &triangle.words[0..4]; + + // Quick reject: bboxes don't overlap + if tri_bbox[2] < query[0] + || tri_bbox[3] < query[1] + || tri_bbox[0] > query[2] + || tri_bbox[1] > query[3] + { + return false; + } + + let ([ay, ax, by, bx, cy, cx], _) = triangle.decode(); + + // Any triangle vertex inside rectangle? + if (ax >= query[1] && ax <= query[3] && ay >= query[0] && ay <= query[2]) + || (bx >= query[1] && bx <= query[3] && by >= query[0] && by <= query[2]) + || (cx >= query[1] && cx <= query[3] && cy >= query[0] && cy <= query[2]) + { + return true; + } + + // Any rectangle corner inside triangle? + let corners = [ + (query[1], query[0]), // min_x, min_y + (query[3], query[0]), // max_x, min_y + (query[3], query[2]), // max_x, max_y + (query[1], query[2]), // min_x, max_y + ]; + for (x, y) in corners { + if point_in_triangle(x, y, ax, ay, bx, by, cx, cy) { + return true; + } + } + + // Any triangle edge intersect rectangle edges? + edge_intersects_bbox(ax, ay, bx, by, query) + || edge_intersects_bbox(bx, by, cx, cy, query) + || edge_intersects_bbox(cx, cy, ax, ay, query) +} + +/// Finds documents where all triangles are within the query bounding box. +/// +/// Traverses the tree starting at `offset` (typically `segment.root_offset`), testing each +/// triangle to determine if it lies entirely within the query bounds. Uses two `BitSet` instances +/// to track state: `result` accumulates candidate documents, while `excluded` marks documents that +/// have at least one triangle extending outside the query. +/// +/// The query is `[min_y, min_x, max_y, max_x]` in integer coordinates. The final result is +/// documents in `result` that are NOT in `excluded` - the caller must compute this difference. +pub fn search_within( + segment: &Segment, + offset: i32, + query: &[i32; 4], // [min_y, min_x, max_y, max_x] + result: &mut BitSet, + excluded: &mut BitSet, +) -> io::Result<()> { + let bbox = segment.bounding_box(offset); + if !bbox_intersects(bbox, query) { + } else if offset < 0 { + let leaf_node = segment.leaf_node(offset); + // bbox crosses query → test each triangle + let triangles = decompress_leaf(segment.leaf_page(leaf_node))?; + for triangle in &triangles { + if triangle_intersects(triangle, query) { + if excluded.contains(triangle.doc_id) { + continue; // Already excluded + } + if triangle_within(triangle, query) { + result.insert(triangle.doc_id); + } else { + excluded.insert(triangle.doc_id); + } + } + } + } else { + let branch_node = segment.branch_node(offset); + search_within(segment, branch_node.left, query, result, excluded)?; + search_within(segment, branch_node.right, query, result, excluded)?; + } + Ok(()) +} + +enum ContainsRelation { + CANDIDATE, // Query might be contained + NOTWITHIN, // Query definitely not contained + DISJOINT, // Triangle doesn't overlap query +} + +fn triangle_contains_relation(triangle: &Triangle, query: &[i32; 4]) -> ContainsRelation { + let tri_bbox = &triangle.words[0..4]; + + if query[2] < tri_bbox[0] + || query[3] < tri_bbox[1] + || query[0] > tri_bbox[2] + || query[1] > tri_bbox[3] + { + return ContainsRelation::DISJOINT; + } + + let ([ay, ax, by, bx, cy, cx], [ab, bc, ca]) = triangle.decode(); + + let corners = [ + (query[1], query[0]), + (query[3], query[0]), + (query[3], query[2]), + (query[1], query[2]), + ]; + + let mut any_corner_inside = false; + for &(qx, qy) in &corners { + if point_in_triangle(qx, qy, ax, ay, bx, by, cx, cy) { + any_corner_inside = true; + break; + } + } + + let ab_intersects = edge_intersects_bbox(ax, ay, bx, by, query); + let bc_intersects = edge_intersects_bbox(bx, by, cx, cy, query); + let ca_intersects = edge_intersects_bbox(cx, cy, ax, ay, query); + + if (ab && edge_crosses_bbox(ax, ay, bx, by, query)) + || (bc && edge_crosses_bbox(bx, by, cx, cy, query)) + || (ca && edge_crosses_bbox(cx, cy, ax, ay, query)) + { + return ContainsRelation::NOTWITHIN; + } + + if any_corner_inside || ab_intersects || bc_intersects || ca_intersects { + return ContainsRelation::CANDIDATE; + } + + ContainsRelation::DISJOINT +} + +/// Finds documents whose polygons contain the query bounding box. +/// +/// Traverses the tree starting at `offset` (typically `segment.root_offset`), testing each +/// triangle using three-state logic: `CANDIDATE` (query might be contained), `NOTWITHIN` (boundary +/// edge crosses query), or `DISJOINT` (no overlap). Only boundary edges are tested for crossing - +/// internal tessellation edges are ignored. +/// +/// The query is `[min_y, min_x, max_y, max_x]` in integer coordinates. Uses two `BitSet` +/// instances: `result` accumulates candidates, `excluded` marks documents with disqualifying +/// boundary crossings. The final result is documents in `result` that are NOT in `excluded`. +pub fn search_contains( + segment: &Segment, + offset: i32, + query: &[i32; 4], + result: &mut BitSet, + excluded: &mut BitSet, +) -> io::Result<()> { + let bbox = segment.bounding_box(offset); + if !bbox_intersects(bbox, query) { + } else if offset < 0 { + let leaf_node = segment.leaf_node(offset); + // bbox crosses query → test each triangle + let triangles = decompress_leaf(segment.leaf_page(leaf_node))?; + for triangle in &triangles { + if triangle_intersects(triangle, query) { + let doc_id = triangle.doc_id; + if excluded.contains(doc_id) { + continue; + } + match triangle_contains_relation(triangle, query) { + ContainsRelation::CANDIDATE => result.insert(doc_id), + ContainsRelation::NOTWITHIN => excluded.insert(doc_id), + ContainsRelation::DISJOINT => {} + } + } + } + } else { + let branch_node = segment.branch_node(offset); + search_contains(segment, branch_node.left, query, result, excluded)?; + search_contains(segment, branch_node.right, query, result, excluded)?; + } + Ok(()) +} diff --git a/src/spatial/mod.rs b/src/spatial/mod.rs index 1c677015e..fa5498c64 100644 --- a/src/spatial/mod.rs +++ b/src/spatial/mod.rs @@ -1,6 +1,6 @@ //! Spatial module (implements a block kd-tree) +pub mod bkd; pub mod delta; pub mod radix_select; -pub mod surveyor; pub mod triangle; diff --git a/src/spatial/surveyor.rs b/src/spatial/surveyor.rs deleted file mode 100644 index 390d3913c..000000000 --- a/src/spatial/surveyor.rs +++ /dev/null @@ -1,135 +0,0 @@ -//! Dimension analysis for block kd-tree construction. -//! -//! Surveys triangle collections to identify the optimal partition dimension by tracking min/max -//! spreads and common byte prefixes across the four bounding box dimensions. The dimension with -//! maximum spread is selected for partitioning, and its prefix is used to optimize radix selection -//! by skipping shared leading bytes. -use crate::spatial::triangle::Triangle; - -#[derive(Clone, Copy)] -struct Spread { - min: i32, - max: i32, -} - -impl Spread { - fn survey(&mut self, value: i32) { - if value < self.min { - self.min = value; - } - if value > self.max { - self.max = value; - } - } - fn spread(&self) -> i32 { - self.max - self.min - } -} - -impl Default for Spread { - fn default() -> Self { - Spread { - min: i32::MAX, - max: i32::MIN, - } - } -} - -#[derive(Debug)] -struct Prefix { - words: Vec, -} - -impl Prefix { - fn prime(value: i32) -> Self { - Prefix { - words: value.to_be_bytes().to_vec(), - } - } - fn survey(&mut self, value: i32) { - let bytes = value.to_be_bytes(); - while !bytes.starts_with(&self.words) { - self.words.pop(); - } - } -} - -/// Tracks dimension statistics for block kd-tree partitioning decisions. -/// -/// Accumulates min/max spreads and common byte prefixes across the four bounding box dimensions -/// `(min_y, min_x, max_y, max_x)` of a triangle collection. Used during tree construction to -/// select the optimal partition dimension (maximum spread) and identify shared prefixes for radix -/// selection optimization. -pub struct Surveyor { - spreads: [Spread; 4], - prefixes: [Prefix; 4], -} - -impl Surveyor { - /// Creates a new surveyor using an initial triangle to establish byte prefixes. - /// - /// Prefixes require an initial value to begin comparison, so the first triangle's bounding box - /// dimensions seed each prefix. Spreads initialize to defaults and will be updated as - /// triangles are surveyed. - pub fn new(triangle: &Triangle) -> Self { - let mut prefixes = Vec::new(); - for &value in triangle.bbox() { - prefixes.push(Prefix::prime(value)); - } - Surveyor { - spreads: [Spread::default(); 4], - prefixes: prefixes.try_into().unwrap(), - } - } - - /// Updates dimension statistics with values from another triangle. - /// - /// Expands the min/max spread for each bounding box dimension and shrinks prefixes to only the - /// bytes shared across all triangles surveyed so far. - pub fn survey(&mut self, triangle: &Triangle) { - for (i, &value) in triangle.bbox().iter().enumerate() { - self.spreads[i].survey(value); - self.prefixes[i].survey(value); - } - } - - /// Returns the dimension with maximum spread and its common byte prefix. - /// - /// Selects the optimal bounding box dimension for partitioning by finding which has the - /// largest range (max - min). Returns both the dimension index and its prefix for use in radix - /// selection. - pub fn dimension(&self) -> (usize, &Vec) { - let mut dimension = 0; - let mut max_spread = self.spreads[0].spread(); - for (i, spread) in self.spreads.iter().enumerate().skip(1) { - let current_spread = spread.spread(); - if current_spread > max_spread { - dimension = i; - max_spread = current_spread; - } - } - (dimension, &self.prefixes[dimension].words) - } -} - -#[cfg(test)] -mod test { - use super::*; - use crate::spatial::triangle::Triangle; - - #[test] - fn survey_triangles() { - let triangles = [ - Triangle::new(1, [0, 0, 8, 1, 10, 3], [false, false, false]), - Triangle::new(1, [0, 0, 0, 11, 9, 0], [false, false, false]), - Triangle::new(1, [0, 0, 5, 4, 5, 0], [false, false, false]), - ]; - let mut surveyor = Surveyor::new(&triangles[0]); - for triangle in &triangles { - surveyor.survey(triangle); - } - let (dimension, prefix) = surveyor.dimension(); - assert_eq!(dimension, 3); - assert_eq!(prefix.len(), 3); - } -}