mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
Implement block kd-tree.
Implement an immutable bulk-loaded spatial index using recursive median partitioning on bounding box dimensions. Each leaf stores up to 512 triangles with delta-compressed coordinates and doc IDs. The tree provides three query types (intersects, within, contains) that use exact integer arithmetic for geometric predicates and accumulate results in bit sets for efficient deduplication across leaves. The serialized format stores compressed leaf pages followed by the tree structure (leaf and branch nodes), enabling zero-copy access through memory-mapped segments without upfront decompression.
This commit is contained in:
committed by
Paul Masurel
parent
f38140f72f
commit
2dc46b235e
@@ -93,6 +93,7 @@ time = { version = "0.3.10", features = ["serde-well-known", "macros"] }
|
||||
postcard = { version = "1.0.4", features = [
|
||||
"use-std",
|
||||
], default-features = false }
|
||||
geojson = "0.24.2"
|
||||
|
||||
[target.'cfg(not(windows))'.dev-dependencies]
|
||||
criterion = { version = "0.5", default-features = false }
|
||||
|
||||
760
src/spatial/bkd.rs
Normal file
760
src/spatial/bkd.rs
Normal file
@@ -0,0 +1,760 @@
|
||||
//! Block kd-tree spatial indexing for triangulated polygons.
|
||||
//!
|
||||
//! Implements an immutable bulk-loaded spatial index using recursive median partitioning on
|
||||
//! bounding box dimensions. Each leaf stores up to 512 triangles with delta-compressed coordinates
|
||||
//! and doc IDs. The tree provides three query types (intersects, within, contains) that use exact
|
||||
//! integer arithmetic for geometric predicates and accumulate results in bit sets for efficient
|
||||
//! deduplication across leaves.
|
||||
//!
|
||||
//! The serialized format stores compressed leaf pages followed by the tree structure (leaf and
|
||||
//! branch nodes), enabling zero-copy access through memory-mapped segments without upfront
|
||||
//! decompression.
|
||||
use std::io;
|
||||
use std::io::{Seek, Write};
|
||||
|
||||
use common::BitSet;
|
||||
|
||||
use crate::spatial::delta::{compress, decompress, Compressible};
|
||||
use crate::spatial::triangle::Triangle;
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct SpreadSurvey {
|
||||
min: i32,
|
||||
max: i32,
|
||||
}
|
||||
|
||||
impl SpreadSurvey {
|
||||
fn survey(&mut self, value: i32) {
|
||||
self.min = self.min.min(value);
|
||||
self.max = self.max.max(value);
|
||||
}
|
||||
fn spread(&self) -> i32 {
|
||||
self.max - self.min
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SpreadSurvey {
|
||||
fn default() -> Self {
|
||||
SpreadSurvey {
|
||||
min: i32::MAX,
|
||||
max: i32::MIN,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct BoundingBoxSurvey {
|
||||
bbox: [i32; 4],
|
||||
}
|
||||
|
||||
impl BoundingBoxSurvey {
|
||||
fn survey(&mut self, triangle: &Triangle) {
|
||||
self.bbox[0] = triangle.words[0].min(self.bbox[0]);
|
||||
self.bbox[1] = triangle.words[1].min(self.bbox[1]);
|
||||
self.bbox[2] = triangle.words[2].max(self.bbox[2]);
|
||||
self.bbox[3] = triangle.words[3].max(self.bbox[3]);
|
||||
}
|
||||
fn bbox(&self) -> [i32; 4] {
|
||||
self.bbox.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BoundingBoxSurvey {
|
||||
fn default() -> Self {
|
||||
BoundingBoxSurvey {
|
||||
bbox: [i32::MAX, i32::MAX, i32::MIN, i32::MIN],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum BuildNode {
|
||||
Branch {
|
||||
bbox: [i32; 4],
|
||||
left: Box<BuildNode>,
|
||||
right: Box<BuildNode>,
|
||||
},
|
||||
Leaf {
|
||||
bbox: [i32; 4],
|
||||
pos: u64,
|
||||
len: u16,
|
||||
},
|
||||
}
|
||||
|
||||
struct CompressibleTriangleI32<'a> {
|
||||
triangles: &'a [Triangle],
|
||||
dimension: usize,
|
||||
}
|
||||
|
||||
impl<'a> CompressibleTriangleI32<'a> {
|
||||
fn new(triangles: &'a [Triangle], dimension: usize) -> Self {
|
||||
CompressibleTriangleI32 {
|
||||
triangles: triangles,
|
||||
dimension: dimension,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Compressible for CompressibleTriangleI32<'a> {
|
||||
type Value = i32;
|
||||
fn len(&self) -> usize {
|
||||
self.triangles.len()
|
||||
}
|
||||
fn get(&self, i: usize) -> i32 {
|
||||
self.triangles[i].words[self.dimension]
|
||||
}
|
||||
}
|
||||
|
||||
struct CompressibleTriangleDocID<'a> {
|
||||
triangles: &'a [Triangle],
|
||||
}
|
||||
|
||||
impl<'a> CompressibleTriangleDocID<'a> {
|
||||
fn new(triangles: &'a [Triangle]) -> Self {
|
||||
CompressibleTriangleDocID {
|
||||
triangles: triangles,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Compressible for CompressibleTriangleDocID<'a> {
|
||||
type Value = u32;
|
||||
fn len(&self) -> usize {
|
||||
self.triangles.len()
|
||||
}
|
||||
fn get(&self, i: usize) -> u32 {
|
||||
self.triangles[i].doc_id
|
||||
}
|
||||
}
|
||||
|
||||
// Leaf pages are first the count of triangles, followed by delta encoded doc_ids, followed by
|
||||
// the delta encoded words in order. We will then have the length of the page. We build a tree
|
||||
// after the pages with leaf nodes and branch nodes. Leaf nodes will contain the bounding box
|
||||
// of the leaf followed position and length of the page. The leaf node is a level of direction
|
||||
// to store the position and length of the page in a format that is easy to read directly from
|
||||
// the mapping.
|
||||
|
||||
// We do not compress the tree nodes. We read them directly from the mapping.
|
||||
|
||||
//
|
||||
fn write_leaf_pages<W: Write + Seek>(
|
||||
triangles: &mut [Triangle],
|
||||
write: &mut W,
|
||||
) -> io::Result<BuildNode> {
|
||||
if triangles.len() <= 512 {
|
||||
let pos = write.stream_position()?;
|
||||
let mut spreads = [SpreadSurvey::default(); 4];
|
||||
let mut bounding_box = BoundingBoxSurvey::default();
|
||||
for triangle in triangles.iter() {
|
||||
for i in 0..4 {
|
||||
spreads[i].survey(triangle.words[i]);
|
||||
}
|
||||
bounding_box.survey(triangle);
|
||||
}
|
||||
let mut max_spread = spreads[0].spread();
|
||||
let mut dimension = 0;
|
||||
for i in 1..4 {
|
||||
let current_spread = spreads[i].spread();
|
||||
if current_spread > max_spread {
|
||||
dimension = i;
|
||||
max_spread = current_spread;
|
||||
}
|
||||
}
|
||||
write.write_all(&(triangles.len() as u16).to_le_bytes())?;
|
||||
triangles.sort_by_key(|t| t.words[dimension]);
|
||||
compress(&CompressibleTriangleDocID::new(triangles), write)?;
|
||||
let compressible = [
|
||||
CompressibleTriangleI32::new(triangles, 0),
|
||||
CompressibleTriangleI32::new(triangles, 1),
|
||||
CompressibleTriangleI32::new(triangles, 2),
|
||||
CompressibleTriangleI32::new(triangles, 3),
|
||||
CompressibleTriangleI32::new(triangles, 4),
|
||||
CompressibleTriangleI32::new(triangles, 5),
|
||||
CompressibleTriangleI32::new(triangles, 6),
|
||||
];
|
||||
for i in 0..7 {
|
||||
compress(&compressible[i], write)?;
|
||||
}
|
||||
let len = write.stream_position()? - pos;
|
||||
Ok(BuildNode::Leaf {
|
||||
bbox: bounding_box.bbox(),
|
||||
pos: pos,
|
||||
len: len as u16,
|
||||
})
|
||||
} else {
|
||||
let mut spreads = [SpreadSurvey::default(); 4];
|
||||
let mut bounding_box = BoundingBoxSurvey::default();
|
||||
for triangle in triangles.iter() {
|
||||
for i in 0..4 {
|
||||
spreads[i].survey(triangle.words[i]);
|
||||
}
|
||||
bounding_box.survey(triangle);
|
||||
}
|
||||
let mut max_spread = spreads[0].spread();
|
||||
let mut dimension = 0;
|
||||
for i in 0..4 {
|
||||
let current_spread = spreads[i].spread();
|
||||
if current_spread > max_spread {
|
||||
dimension = i;
|
||||
max_spread = current_spread;
|
||||
}
|
||||
}
|
||||
let mid = triangles.len() / 2;
|
||||
triangles.select_nth_unstable_by_key(mid, |t| t.words[dimension]);
|
||||
let partition = triangles[mid].words[dimension];
|
||||
let mut split_point = mid + 1;
|
||||
while split_point < triangles.len() && triangles[split_point].words[dimension] == partition
|
||||
{
|
||||
split_point += 1;
|
||||
}
|
||||
if split_point == triangles.len() {
|
||||
split_point = mid; // Force split at midpoint index
|
||||
}
|
||||
let (left, right) = triangles.split_at_mut(split_point);
|
||||
let left_node = write_leaf_pages(left, write)?;
|
||||
let right_node = write_leaf_pages(right, write)?;
|
||||
Ok(BuildNode::Branch {
|
||||
bbox: bounding_box.bbox(),
|
||||
left: Box::new(left_node),
|
||||
right: Box::new(right_node),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn write_leaf_nodes<W: Write + Seek>(node: &BuildNode, write: &mut W) -> io::Result<()> {
|
||||
match node {
|
||||
BuildNode::Branch {
|
||||
bbox: _,
|
||||
left,
|
||||
right,
|
||||
} => {
|
||||
write_leaf_nodes(right, write)?;
|
||||
write_leaf_nodes(left, write)?;
|
||||
}
|
||||
BuildNode::Leaf { bbox, pos, len } => {
|
||||
for &dimension in bbox.iter() {
|
||||
write.write_all(&dimension.to_le_bytes())?;
|
||||
}
|
||||
write.write_all(&pos.to_le_bytes())?;
|
||||
write.write_all(&len.to_le_bytes())?;
|
||||
write.write_all(&[0u8; 6])?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_branch_nodes<W: Write + Seek>(
|
||||
node: &BuildNode,
|
||||
branch_offset: &mut i32,
|
||||
leaf_offset: &mut i32,
|
||||
write: &mut W,
|
||||
) -> io::Result<i32> {
|
||||
match node {
|
||||
BuildNode::Leaf { .. } => {
|
||||
let pos = *leaf_offset;
|
||||
*leaf_offset -= 1;
|
||||
Ok(pos * size_of::<LeafNode>() as i32)
|
||||
}
|
||||
BuildNode::Branch { bbox, left, right } => {
|
||||
let left = write_branch_nodes(left, branch_offset, leaf_offset, write)?;
|
||||
let right = write_branch_nodes(right, branch_offset, leaf_offset, write)?;
|
||||
for &val in bbox {
|
||||
write.write_all(&val.to_le_bytes())?;
|
||||
}
|
||||
write.write_all(&left.to_le_bytes())?;
|
||||
write.write_all(&right.to_le_bytes())?;
|
||||
write.write_all(&[0u8; 8])?;
|
||||
let pos = *branch_offset;
|
||||
*branch_offset += 1;
|
||||
Ok(pos * size_of::<BranchNode>() as i32)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds and serializes a block kd-tree for spatial indexing of triangles.
|
||||
///
|
||||
/// Takes a collection of triangles and constructs a complete block kd-tree, writing both the
|
||||
/// compressed leaf pages and tree structure to the output. The tree uses recursive median
|
||||
/// partitioning on the dimension with maximum spread, storing up to 512 triangles per leaf.
|
||||
///
|
||||
/// The output format consists of:
|
||||
/// - Version header (u16)
|
||||
/// - Compressed leaf pages (delta-encoded doc_ids and triangle coordinates)
|
||||
/// - 32-byte aligned tree structure (leaf nodes, then branch nodes)
|
||||
/// - Footer with triangle count, root offset, and branch position
|
||||
///
|
||||
/// The `triangles` slice will be reordered during tree construction as partitioning sorts by the
|
||||
/// selected dimension at each level.
|
||||
pub fn write_block_kd_tree<W: Write + Seek>(
|
||||
triangles: &mut [Triangle],
|
||||
write: &mut W,
|
||||
) -> io::Result<()> {
|
||||
write.write_all(&1u16.to_le_bytes())?;
|
||||
let tree = write_leaf_pages(triangles, write)?;
|
||||
let current = write.stream_position()?;
|
||||
let aligned = (current + 31) & !31;
|
||||
let padding = aligned - current;
|
||||
write.write_all(&vec![0u8; padding as usize])?;
|
||||
write_leaf_nodes(&tree, write)?;
|
||||
let branch_position = write.stream_position()?;
|
||||
let mut branch_offset: i32 = 0;
|
||||
let mut leaf_offset: i32 = -1;
|
||||
let root = write_branch_nodes(&tree, &mut branch_offset, &mut leaf_offset, write)?;
|
||||
write.write_all(&triangles.len().to_le_bytes())?;
|
||||
write.write_all(&root.to_le_bytes())?;
|
||||
write.write_all(&branch_position.to_le_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn decompress_leaf(data: &[u8]) -> io::Result<Vec<Triangle>> {
|
||||
let count = u16::from_le_bytes([data[0], data[1]]) as usize;
|
||||
let mut offset = 2;
|
||||
let mut triangles = Vec::with_capacity(count);
|
||||
offset += decompress::<u32, _>(&data[offset..], count, |_, doc_id| {
|
||||
triangles.push(Triangle::skeleton(doc_id))
|
||||
})?;
|
||||
for i in 0..7 {
|
||||
offset += decompress::<i32, _>(&data[offset..], count, |j, word| {
|
||||
triangles[j].words[i] = word
|
||||
})?;
|
||||
}
|
||||
Ok(triangles)
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
struct BranchNode {
|
||||
bbox: [i32; 4],
|
||||
left: i32,
|
||||
right: i32,
|
||||
pad: [u8; 8],
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
struct LeafNode {
|
||||
bbox: [i32; 4],
|
||||
pos: u64,
|
||||
len: u16,
|
||||
pad: [u8; 6],
|
||||
}
|
||||
|
||||
/// A read-only view into a serialized block kd-tree segment.
|
||||
///
|
||||
/// Provides access to the tree structure and compressed leaf data through memory-mapped or
|
||||
/// buffered byte slices. The segment contains compressed leaf pages followed by the tree structure
|
||||
/// (leaf nodes and branch nodes), with a footer containing metadata for locating the root and
|
||||
/// interpreting offsets.
|
||||
pub struct Segment<'a> {
|
||||
data: &'a [u8],
|
||||
branch_position: u64,
|
||||
/// Offset to the root of the tree, used as the starting point for traversal.
|
||||
pub root_offset: i32,
|
||||
}
|
||||
|
||||
impl<'a> Segment<'a> {
|
||||
/// Creates a new segment from serialized block kd-tree data.
|
||||
///
|
||||
/// Reads the footer metadata from the last 12 bytes to locate the tree structure and root
|
||||
/// node.
|
||||
pub fn new(data: &'a [u8]) -> Self {
|
||||
Segment {
|
||||
data: data,
|
||||
branch_position: u64::from_le_bytes(data[data.len() - 8..].try_into().unwrap()),
|
||||
root_offset: i32::from_le_bytes(
|
||||
data[data.len() - 12..data.len() - 8].try_into().unwrap(),
|
||||
),
|
||||
}
|
||||
}
|
||||
fn bounding_box(&self, offset: i32) -> &[i32; 4] {
|
||||
unsafe {
|
||||
let byte_offset = (self.branch_position as i64 + offset as i64) as usize;
|
||||
let ptr = self.data.as_ptr().add(byte_offset);
|
||||
&*ptr.cast::<[i32; 4]>()
|
||||
}
|
||||
}
|
||||
fn branch_node(&self, offset: i32) -> &BranchNode {
|
||||
unsafe {
|
||||
let byte_offset = (self.branch_position as i64 + offset as i64) as usize;
|
||||
let ptr = self.data.as_ptr().add(byte_offset);
|
||||
&*ptr.cast::<BranchNode>()
|
||||
}
|
||||
}
|
||||
fn leaf_node(&self, offset: i32) -> &LeafNode {
|
||||
unsafe {
|
||||
let byte_offset = (self.branch_position as i64 + offset as i64) as usize;
|
||||
let ptr = self.data.as_ptr().add(byte_offset);
|
||||
&*ptr.cast::<LeafNode>()
|
||||
}
|
||||
}
|
||||
fn leaf_page(&self, leaf_node: &LeafNode) -> &[u8] {
|
||||
&self.data[(leaf_node.pos as usize)..(leaf_node.pos as usize + leaf_node.len as usize)]
|
||||
}
|
||||
}
|
||||
|
||||
fn collect_all_docs(segment: &Segment, offset: i32, result: &mut BitSet) -> io::Result<()> {
|
||||
if offset < 0 {
|
||||
let leaf_node = segment.leaf_node(offset);
|
||||
let data = segment.leaf_page(leaf_node);
|
||||
let count = u16::from_le_bytes([data[0], data[1]]) as usize;
|
||||
decompress::<u32, _>(&data[2..], count, |_, doc_id| result.insert(doc_id))?;
|
||||
} else {
|
||||
let branch_node = segment.branch_node(offset);
|
||||
collect_all_docs(segment, branch_node.left, result)?;
|
||||
collect_all_docs(segment, branch_node.right, result)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn bbox_within(bbox: &[i32; 4], query: &[i32; 4]) -> bool {
|
||||
bbox[0] >= query[0] && // min_y >= query_min_y
|
||||
bbox[1] >= query[1] && // min_x >= query_min_x
|
||||
bbox[2] <= query[2] && // max_y <= query_max_y
|
||||
bbox[3] <= query[3] // max_x <= query_max_x
|
||||
}
|
||||
|
||||
fn bbox_intersects(bbox: &[i32; 4], query: &[i32; 4]) -> bool {
|
||||
!(bbox[2] < query[0] || bbox[0] > query[2] || bbox[3] < query[1] || bbox[1] > query[3])
|
||||
}
|
||||
|
||||
/// Finds documents with triangles that intersect the query bounding box.
|
||||
///
|
||||
/// Traverses the tree starting at `offset` (typically `segment.root_offset`), pruning subtrees
|
||||
/// whose bounding boxes don't intersect the query. When a node's bbox is entirely within the
|
||||
/// query, all its documents are bulk-collected. Otherwise, individual triangles are tested using
|
||||
/// exact geometric predicates.
|
||||
///
|
||||
/// The query is `[min_y, min_x, max_y, max_x]` in integer coordinates. Documents are inserted into
|
||||
/// the `result` BitSet, which automatically deduplicates when the same document appears in
|
||||
/// multiple leaves.
|
||||
pub fn search_intersects(
|
||||
segment: &Segment,
|
||||
offset: i32,
|
||||
query: &[i32; 4],
|
||||
result: &mut BitSet,
|
||||
) -> io::Result<()> {
|
||||
let bbox = segment.bounding_box(offset);
|
||||
// bbox doesn't intersect query → skip entire subtree
|
||||
if !bbox_intersects(bbox, query) {
|
||||
}
|
||||
// bbox entirely within query → all triangles intersect
|
||||
else if bbox_within(bbox, query) {
|
||||
collect_all_docs(segment, offset, result)?;
|
||||
} else if offset < 0 {
|
||||
// bbox crosses query → test each triangle
|
||||
let leaf_node = segment.leaf_node(offset);
|
||||
let triangles = decompress_leaf(segment.leaf_page(leaf_node))?;
|
||||
for triangle in &triangles {
|
||||
if triangle_intersects(triangle, query) {
|
||||
result.insert(triangle.doc_id); // BitSet deduplicates
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let branch_node = segment.branch_node(offset);
|
||||
// bbox crosses query → must check children
|
||||
search_intersects(segment, branch_node.left, query, result)?;
|
||||
search_intersects(segment, branch_node.right, query, result)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn line_intersects_line(
|
||||
x1: i32,
|
||||
y1: i32,
|
||||
x2: i32,
|
||||
y2: i32,
|
||||
x3: i32,
|
||||
y3: i32,
|
||||
x4: i32,
|
||||
y4: i32,
|
||||
) -> bool {
|
||||
// Cast to i128 to prevent overflow in coordinate arithmetic
|
||||
let x1 = x1 as i128;
|
||||
let y1 = y1 as i128;
|
||||
let x2 = x2 as i128;
|
||||
let y2 = y2 as i128;
|
||||
let x3 = x3 as i128;
|
||||
let y3 = y3 as i128;
|
||||
let x4 = x4 as i128;
|
||||
let y4 = y4 as i128;
|
||||
|
||||
// Proper segment-segment intersection test
|
||||
let d = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4);
|
||||
if d == 0 {
|
||||
// parallel
|
||||
return false;
|
||||
}
|
||||
|
||||
let t = (x1 - x3) * (y3 - y4) - (y1 - y3) * (x3 - x4);
|
||||
let u = -((x1 - x2) * (y1 - y3) - (y1 - y2) * (x1 - x3));
|
||||
|
||||
if d > 0 {
|
||||
t >= 0 && t <= d && u >= 0 && u <= d
|
||||
} else {
|
||||
t <= 0 && t >= d && u <= 0 && u >= d
|
||||
}
|
||||
}
|
||||
|
||||
fn edge_intersects_bbox(x1: i32, y1: i32, x2: i32, y2: i32, bbox: &[i32; 4]) -> bool {
|
||||
// Test against all 4 rectangle edges, bottom, right, top, left.
|
||||
line_intersects_line(x1, y1, x2, y2, bbox[1], bbox[0], bbox[3], bbox[0])
|
||||
|| line_intersects_line(x1, y1, x2, y2, bbox[3], bbox[0], bbox[3], bbox[2])
|
||||
|| line_intersects_line(x1, y1, x2, y2, bbox[3], bbox[2], bbox[1], bbox[2])
|
||||
|| line_intersects_line(x1, y1, x2, y2, bbox[1], bbox[2], bbox[1], bbox[0])
|
||||
}
|
||||
|
||||
fn edge_crosses_bbox(x1: i32, y1: i32, x2: i32, y2: i32, bbox: &[i32; 4]) -> bool {
|
||||
// Edge has endpoint outside while other is inside (crosses boundary)
|
||||
let p1_inside = y1 >= bbox[0] && x1 >= bbox[1] && y1 <= bbox[2] && x1 <= bbox[3];
|
||||
let p2_inside = y2 >= bbox[0] && x2 >= bbox[1] && y2 <= bbox[2] && x2 <= bbox[3];
|
||||
p1_inside != p2_inside
|
||||
}
|
||||
|
||||
fn triangle_within(triangle: &Triangle, query: &[i32; 4]) -> bool {
|
||||
let tri_bbox = &triangle.words[0..4];
|
||||
|
||||
// Triangle bbox entirely within query → WITHIN
|
||||
if tri_bbox[0] >= query[0]
|
||||
&& tri_bbox[1] >= query[1]
|
||||
&& tri_bbox[2] <= query[2]
|
||||
&& tri_bbox[3] <= query[3]
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
// Triangle bbox entirely outside → NOT WITHIN
|
||||
if tri_bbox[2] < query[0]
|
||||
|| tri_bbox[3] < query[1]
|
||||
|| tri_bbox[0] > query[2]
|
||||
|| tri_bbox[1] > query[3]
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Decode vertices.
|
||||
let ([ay, ax, by, bx, cy, cx], [ab, bc, ca]) = triangle.decode();
|
||||
|
||||
// Check each edge - if boundary edge crosses query bbox, NOT WITHIN
|
||||
if ab && edge_crosses_bbox(ax, ay, bx, by, query) {
|
||||
return false;
|
||||
}
|
||||
if bc && edge_crosses_bbox(bx, by, cx, cy, query) {
|
||||
return false;
|
||||
}
|
||||
if ca && edge_crosses_bbox(cx, cy, ax, ay, query) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// No boundary edges cross out
|
||||
true
|
||||
}
|
||||
|
||||
fn point_in_triangle(
|
||||
px: i32,
|
||||
py: i32,
|
||||
ax: i32,
|
||||
ay: i32,
|
||||
bx: i32,
|
||||
by: i32,
|
||||
cx: i32,
|
||||
cy: i32,
|
||||
) -> bool {
|
||||
let v0x = (cx - ax) as i128;
|
||||
let v0y = (cy - ay) as i128;
|
||||
let v1x = (bx - ax) as i128;
|
||||
let v1y = (by - ay) as i128;
|
||||
let v2x = (px - ax) as i128;
|
||||
let v2y = (py - ay) as i128;
|
||||
|
||||
let dot00 = v0x * v0x + v0y * v0y;
|
||||
let dot01 = v0x * v1x + v0y * v1y;
|
||||
let dot02 = v0x * v2x + v0y * v2y;
|
||||
let dot11 = v1x * v1x + v1y * v1y;
|
||||
let dot12 = v1x * v2x + v1y * v2y;
|
||||
|
||||
let denom = dot00 * dot11 - dot01 * dot01;
|
||||
if denom == 0 {
|
||||
return false;
|
||||
}
|
||||
|
||||
let u = dot11 * dot02 - dot01 * dot12;
|
||||
let v = dot00 * dot12 - dot01 * dot02;
|
||||
|
||||
u >= 0 && v >= 0 && u + v <= denom
|
||||
}
|
||||
|
||||
fn triangle_intersects(triangle: &Triangle, query: &[i32; 4]) -> bool {
|
||||
let tri_bbox = &triangle.words[0..4];
|
||||
|
||||
// Quick reject: bboxes don't overlap
|
||||
if tri_bbox[2] < query[0]
|
||||
|| tri_bbox[3] < query[1]
|
||||
|| tri_bbox[0] > query[2]
|
||||
|| tri_bbox[1] > query[3]
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
let ([ay, ax, by, bx, cy, cx], _) = triangle.decode();
|
||||
|
||||
// Any triangle vertex inside rectangle?
|
||||
if (ax >= query[1] && ax <= query[3] && ay >= query[0] && ay <= query[2])
|
||||
|| (bx >= query[1] && bx <= query[3] && by >= query[0] && by <= query[2])
|
||||
|| (cx >= query[1] && cx <= query[3] && cy >= query[0] && cy <= query[2])
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
// Any rectangle corner inside triangle?
|
||||
let corners = [
|
||||
(query[1], query[0]), // min_x, min_y
|
||||
(query[3], query[0]), // max_x, min_y
|
||||
(query[3], query[2]), // max_x, max_y
|
||||
(query[1], query[2]), // min_x, max_y
|
||||
];
|
||||
for (x, y) in corners {
|
||||
if point_in_triangle(x, y, ax, ay, bx, by, cx, cy) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Any triangle edge intersect rectangle edges?
|
||||
edge_intersects_bbox(ax, ay, bx, by, query)
|
||||
|| edge_intersects_bbox(bx, by, cx, cy, query)
|
||||
|| edge_intersects_bbox(cx, cy, ax, ay, query)
|
||||
}
|
||||
|
||||
/// Finds documents where all triangles are within the query bounding box.
|
||||
///
|
||||
/// Traverses the tree starting at `offset` (typically `segment.root_offset`), testing each
|
||||
/// triangle to determine if it lies entirely within the query bounds. Uses two `BitSet` instances
|
||||
/// to track state: `result` accumulates candidate documents, while `excluded` marks documents that
|
||||
/// have at least one triangle extending outside the query.
|
||||
///
|
||||
/// The query is `[min_y, min_x, max_y, max_x]` in integer coordinates. The final result is
|
||||
/// documents in `result` that are NOT in `excluded` - the caller must compute this difference.
|
||||
pub fn search_within(
|
||||
segment: &Segment,
|
||||
offset: i32,
|
||||
query: &[i32; 4], // [min_y, min_x, max_y, max_x]
|
||||
result: &mut BitSet,
|
||||
excluded: &mut BitSet,
|
||||
) -> io::Result<()> {
|
||||
let bbox = segment.bounding_box(offset);
|
||||
if !bbox_intersects(bbox, query) {
|
||||
} else if offset < 0 {
|
||||
let leaf_node = segment.leaf_node(offset);
|
||||
// bbox crosses query → test each triangle
|
||||
let triangles = decompress_leaf(segment.leaf_page(leaf_node))?;
|
||||
for triangle in &triangles {
|
||||
if triangle_intersects(triangle, query) {
|
||||
if excluded.contains(triangle.doc_id) {
|
||||
continue; // Already excluded
|
||||
}
|
||||
if triangle_within(triangle, query) {
|
||||
result.insert(triangle.doc_id);
|
||||
} else {
|
||||
excluded.insert(triangle.doc_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let branch_node = segment.branch_node(offset);
|
||||
search_within(segment, branch_node.left, query, result, excluded)?;
|
||||
search_within(segment, branch_node.right, query, result, excluded)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
enum ContainsRelation {
|
||||
CANDIDATE, // Query might be contained
|
||||
NOTWITHIN, // Query definitely not contained
|
||||
DISJOINT, // Triangle doesn't overlap query
|
||||
}
|
||||
|
||||
fn triangle_contains_relation(triangle: &Triangle, query: &[i32; 4]) -> ContainsRelation {
|
||||
let tri_bbox = &triangle.words[0..4];
|
||||
|
||||
if query[2] < tri_bbox[0]
|
||||
|| query[3] < tri_bbox[1]
|
||||
|| query[0] > tri_bbox[2]
|
||||
|| query[1] > tri_bbox[3]
|
||||
{
|
||||
return ContainsRelation::DISJOINT;
|
||||
}
|
||||
|
||||
let ([ay, ax, by, bx, cy, cx], [ab, bc, ca]) = triangle.decode();
|
||||
|
||||
let corners = [
|
||||
(query[1], query[0]),
|
||||
(query[3], query[0]),
|
||||
(query[3], query[2]),
|
||||
(query[1], query[2]),
|
||||
];
|
||||
|
||||
let mut any_corner_inside = false;
|
||||
for &(qx, qy) in &corners {
|
||||
if point_in_triangle(qx, qy, ax, ay, bx, by, cx, cy) {
|
||||
any_corner_inside = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let ab_intersects = edge_intersects_bbox(ax, ay, bx, by, query);
|
||||
let bc_intersects = edge_intersects_bbox(bx, by, cx, cy, query);
|
||||
let ca_intersects = edge_intersects_bbox(cx, cy, ax, ay, query);
|
||||
|
||||
if (ab && edge_crosses_bbox(ax, ay, bx, by, query))
|
||||
|| (bc && edge_crosses_bbox(bx, by, cx, cy, query))
|
||||
|| (ca && edge_crosses_bbox(cx, cy, ax, ay, query))
|
||||
{
|
||||
return ContainsRelation::NOTWITHIN;
|
||||
}
|
||||
|
||||
if any_corner_inside || ab_intersects || bc_intersects || ca_intersects {
|
||||
return ContainsRelation::CANDIDATE;
|
||||
}
|
||||
|
||||
ContainsRelation::DISJOINT
|
||||
}
|
||||
|
||||
/// Finds documents whose polygons contain the query bounding box.
|
||||
///
|
||||
/// Traverses the tree starting at `offset` (typically `segment.root_offset`), testing each
|
||||
/// triangle using three-state logic: `CANDIDATE` (query might be contained), `NOTWITHIN` (boundary
|
||||
/// edge crosses query), or `DISJOINT` (no overlap). Only boundary edges are tested for crossing -
|
||||
/// internal tessellation edges are ignored.
|
||||
///
|
||||
/// The query is `[min_y, min_x, max_y, max_x]` in integer coordinates. Uses two `BitSet`
|
||||
/// instances: `result` accumulates candidates, `excluded` marks documents with disqualifying
|
||||
/// boundary crossings. The final result is documents in `result` that are NOT in `excluded`.
|
||||
pub fn search_contains(
|
||||
segment: &Segment,
|
||||
offset: i32,
|
||||
query: &[i32; 4],
|
||||
result: &mut BitSet,
|
||||
excluded: &mut BitSet,
|
||||
) -> io::Result<()> {
|
||||
let bbox = segment.bounding_box(offset);
|
||||
if !bbox_intersects(bbox, query) {
|
||||
} else if offset < 0 {
|
||||
let leaf_node = segment.leaf_node(offset);
|
||||
// bbox crosses query → test each triangle
|
||||
let triangles = decompress_leaf(segment.leaf_page(leaf_node))?;
|
||||
for triangle in &triangles {
|
||||
if triangle_intersects(triangle, query) {
|
||||
let doc_id = triangle.doc_id;
|
||||
if excluded.contains(doc_id) {
|
||||
continue;
|
||||
}
|
||||
match triangle_contains_relation(triangle, query) {
|
||||
ContainsRelation::CANDIDATE => result.insert(doc_id),
|
||||
ContainsRelation::NOTWITHIN => excluded.insert(doc_id),
|
||||
ContainsRelation::DISJOINT => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let branch_node = segment.branch_node(offset);
|
||||
search_contains(segment, branch_node.left, query, result, excluded)?;
|
||||
search_contains(segment, branch_node.right, query, result, excluded)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
//! Spatial module (implements a block kd-tree)
|
||||
|
||||
pub mod bkd;
|
||||
pub mod delta;
|
||||
pub mod radix_select;
|
||||
pub mod surveyor;
|
||||
pub mod triangle;
|
||||
|
||||
@@ -1,135 +0,0 @@
|
||||
//! Dimension analysis for block kd-tree construction.
|
||||
//!
|
||||
//! Surveys triangle collections to identify the optimal partition dimension by tracking min/max
|
||||
//! spreads and common byte prefixes across the four bounding box dimensions. The dimension with
|
||||
//! maximum spread is selected for partitioning, and its prefix is used to optimize radix selection
|
||||
//! by skipping shared leading bytes.
|
||||
use crate::spatial::triangle::Triangle;
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct Spread {
|
||||
min: i32,
|
||||
max: i32,
|
||||
}
|
||||
|
||||
impl Spread {
|
||||
fn survey(&mut self, value: i32) {
|
||||
if value < self.min {
|
||||
self.min = value;
|
||||
}
|
||||
if value > self.max {
|
||||
self.max = value;
|
||||
}
|
||||
}
|
||||
fn spread(&self) -> i32 {
|
||||
self.max - self.min
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Spread {
|
||||
fn default() -> Self {
|
||||
Spread {
|
||||
min: i32::MAX,
|
||||
max: i32::MIN,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Prefix {
|
||||
words: Vec<u8>,
|
||||
}
|
||||
|
||||
impl Prefix {
|
||||
fn prime(value: i32) -> Self {
|
||||
Prefix {
|
||||
words: value.to_be_bytes().to_vec(),
|
||||
}
|
||||
}
|
||||
fn survey(&mut self, value: i32) {
|
||||
let bytes = value.to_be_bytes();
|
||||
while !bytes.starts_with(&self.words) {
|
||||
self.words.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tracks dimension statistics for block kd-tree partitioning decisions.
|
||||
///
|
||||
/// Accumulates min/max spreads and common byte prefixes across the four bounding box dimensions
|
||||
/// `(min_y, min_x, max_y, max_x)` of a triangle collection. Used during tree construction to
|
||||
/// select the optimal partition dimension (maximum spread) and identify shared prefixes for radix
|
||||
/// selection optimization.
|
||||
pub struct Surveyor {
|
||||
spreads: [Spread; 4],
|
||||
prefixes: [Prefix; 4],
|
||||
}
|
||||
|
||||
impl Surveyor {
|
||||
/// Creates a new surveyor using an initial triangle to establish byte prefixes.
|
||||
///
|
||||
/// Prefixes require an initial value to begin comparison, so the first triangle's bounding box
|
||||
/// dimensions seed each prefix. Spreads initialize to defaults and will be updated as
|
||||
/// triangles are surveyed.
|
||||
pub fn new(triangle: &Triangle) -> Self {
|
||||
let mut prefixes = Vec::new();
|
||||
for &value in triangle.bbox() {
|
||||
prefixes.push(Prefix::prime(value));
|
||||
}
|
||||
Surveyor {
|
||||
spreads: [Spread::default(); 4],
|
||||
prefixes: prefixes.try_into().unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Updates dimension statistics with values from another triangle.
|
||||
///
|
||||
/// Expands the min/max spread for each bounding box dimension and shrinks prefixes to only the
|
||||
/// bytes shared across all triangles surveyed so far.
|
||||
pub fn survey(&mut self, triangle: &Triangle) {
|
||||
for (i, &value) in triangle.bbox().iter().enumerate() {
|
||||
self.spreads[i].survey(value);
|
||||
self.prefixes[i].survey(value);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the dimension with maximum spread and its common byte prefix.
|
||||
///
|
||||
/// Selects the optimal bounding box dimension for partitioning by finding which has the
|
||||
/// largest range (max - min). Returns both the dimension index and its prefix for use in radix
|
||||
/// selection.
|
||||
pub fn dimension(&self) -> (usize, &Vec<u8>) {
|
||||
let mut dimension = 0;
|
||||
let mut max_spread = self.spreads[0].spread();
|
||||
for (i, spread) in self.spreads.iter().enumerate().skip(1) {
|
||||
let current_spread = spread.spread();
|
||||
if current_spread > max_spread {
|
||||
dimension = i;
|
||||
max_spread = current_spread;
|
||||
}
|
||||
}
|
||||
(dimension, &self.prefixes[dimension].words)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use crate::spatial::triangle::Triangle;
|
||||
|
||||
#[test]
|
||||
fn survey_triangles() {
|
||||
let triangles = [
|
||||
Triangle::new(1, [0, 0, 8, 1, 10, 3], [false, false, false]),
|
||||
Triangle::new(1, [0, 0, 0, 11, 9, 0], [false, false, false]),
|
||||
Triangle::new(1, [0, 0, 5, 4, 5, 0], [false, false, false]),
|
||||
];
|
||||
let mut surveyor = Surveyor::new(&triangles[0]);
|
||||
for triangle in &triangles {
|
||||
surveyor.survey(triangle);
|
||||
}
|
||||
let (dimension, prefix) = surveyor.dimension();
|
||||
assert_eq!(dimension, 3);
|
||||
assert_eq!(prefix.len(), 3);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user