From d8bc0e7c99fc35e044349b03944a63ac6d2eb0d6 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 3 Dec 2025 12:41:17 +0100 Subject: [PATCH] added doc --- src/indexer/merger.rs | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index a5de67643..152a31cf7 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1,5 +1,5 @@ use std::collections::HashMap; -use std::io::Write; +use std::io::{BufWriter, Write}; use std::sync::Arc; use columnar::{ @@ -530,6 +530,18 @@ impl IndexMerger { serializer: &mut SegmentSerializer, doc_id_mapping: &SegmentDocIdMapping, ) -> crate::Result<()> { + /// Unfortunately, there is no special trick to merge segments. + /// We need to rebuild a BKD-tree based off the list of triangles. + /// + /// Because the data can be large, we do this by writing the sequence of triangles to disk, + /// and mmapping it as a mutable slice, and calling the same code as what is done for the + /// segment serialization. + /// + /// The OS is in charge of deciding how to handle its page cache. + /// This is the same as what would have happened with swapping, + /// except that by explicitly mapping the file, the OS is more likely to + /// swap, the memory will not be accounted as anonymous memory, + /// swap space is reserved, etc. use crate::spatial::bkd::Segment; let mut segment_mappings: Vec>> = Vec::new(); for reader in &self.readers { @@ -549,10 +561,10 @@ impl IndexMerger { } for (segment_ord, reader) in self.readers.iter().enumerate() { for (field, temp_file) in &mut temp_files { + let mut buf_temp_file = BufWriter::new(temp_file); let spatial_readers = reader.spatial_fields(); - let spatial_reader = match spatial_readers.get_field(*field)? { - Some(reader) => reader, - None => continue, + let Some(spatial_reader) = spatial_readers.get_field(*field)? 
else { + continue; }; let segment = Segment::new(spatial_reader.get_bytes()); for triangle_result in LeafPageIterator::new(&segment) { @@ -561,20 +573,21 @@ impl IndexMerger { if let Some(new_doc_id) = segment_mappings[segment_ord][triangle.doc_id as usize] { + // This is really just a temporary file, not meant to be portable, so we + // use native endianness here. for &word in &triangle.words { - temp_file.write_all(&word.to_le_bytes())?; + buf_temp_file.write_all(&word.to_ne_bytes())?; } - temp_file.write_all(&new_doc_id.to_le_bytes())?; + buf_temp_file.write_all(&new_doc_id.to_ne_bytes())?; } } } + buf_temp_file.flush()?; + // No need to fsync here. This file is not here for persistency. } } if let Some(mut spatial_serializer) = serializer.extract_spatial_serializer() { - for (field, mut temp_file) in temp_files { - // Flush and sync triangles. - temp_file.flush()?; - temp_file.as_file_mut().sync_all()?; + for (field, temp_file) in temp_files { // Memory map the triangle file. use memmap2::MmapOptions; let mmap = unsafe { MmapOptions::new().map_mut(temp_file.as_file())? };