added doc

This commit is contained in:
Paul Masurel
2025-12-03 12:41:17 +01:00
parent 79622f1f0b
commit d8bc0e7c99

View File

@@ -1,5 +1,5 @@
use std::collections::HashMap;
use std::io::Write;
use std::io::{BufWriter, Write};
use std::sync::Arc;
use columnar::{
@@ -530,6 +530,18 @@ impl IndexMerger {
serializer: &mut SegmentSerializer,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
/// Unfortunately, there is no special trick to merge segments.
/// We need to rebuild a BKD-tree based off the list of triangles.
///
/// Because the data can be large, we do this by writing the sequence of triangles to disk,
/// mmapping it as a mutable slice, and calling the same code that is used for
/// segment serialization.
///
/// The OS is in charge of deciding how to handle its page cache.
/// This is the same as what would have happened with swapping,
/// except that, by explicitly mapping the file, the OS is more likely to
/// swap, the memory will not be accounted as anonymous memory,
/// swap space is reserved, etc.
use crate::spatial::bkd::Segment;
let mut segment_mappings: Vec<Vec<Option<DocId>>> = Vec::new();
for reader in &self.readers {
@@ -549,10 +561,10 @@ impl IndexMerger {
}
for (segment_ord, reader) in self.readers.iter().enumerate() {
for (field, temp_file) in &mut temp_files {
let mut buf_temp_file = BufWriter::new(temp_file);
let spatial_readers = reader.spatial_fields();
let spatial_reader = match spatial_readers.get_field(*field)? {
Some(reader) => reader,
None => continue,
let Some(spatial_reader) = spatial_readers.get_field(*field)? else {
continue;
};
let segment = Segment::new(spatial_reader.get_bytes());
for triangle_result in LeafPageIterator::new(&segment) {
@@ -561,20 +573,21 @@ impl IndexMerger {
if let Some(new_doc_id) =
segment_mappings[segment_ord][triangle.doc_id as usize]
{
// This is really just a temporary file, not meant to be portable, so we
// use native endianness here.
for &word in &triangle.words {
temp_file.write_all(&word.to_le_bytes())?;
buf_temp_file.write_all(&word.to_ne_bytes())?;
}
temp_file.write_all(&new_doc_id.to_le_bytes())?;
buf_temp_file.write_all(&new_doc_id.to_ne_bytes())?;
}
}
}
buf_temp_file.flush()?;
// No need to fsync here. This file is not here for persistency.
}
}
if let Some(mut spatial_serializer) = serializer.extract_spatial_serializer() {
for (field, mut temp_file) in temp_files {
// Flush and sync triangles.
temp_file.flush()?;
temp_file.as_file_mut().sync_all()?;
for (field, temp_file) in temp_files {
// Memory map the triangle file.
use memmap2::MmapOptions;
let mmap = unsafe { MmapOptions::new().map_mut(temp_file.as_file())? };