mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-25 20:50:43 +00:00
added doc
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::sync::Arc;
|
||||
|
||||
use columnar::{
|
||||
@@ -530,6 +530,18 @@ impl IndexMerger {
|
||||
serializer: &mut SegmentSerializer,
|
||||
doc_id_mapping: &SegmentDocIdMapping,
|
||||
) -> crate::Result<()> {
|
||||
/// Unfortunately, there is no special trick to merge segments.
|
||||
/// We need to rebuild a BKD-tree based off the list of triangles.
|
||||
///
|
||||
/// Because the data can be large, we do this by writing the sequence of triangles to disk,
|
||||
/// and mmapping it as mutable slice, and calling the same code as what is done for the
|
||||
/// segment serialization.
|
||||
///
|
||||
/// The OS is in charge of deciding how to handle its page cache.
|
||||
/// This is the same as what would have happened with swapping,
|
||||
/// except that by explicitly mapping the file, the OS is more likely to
|
||||
/// swap it out, the memory will not be accounted as anonymous memory,
|
||||
/// no swap space needs to be reserved, etc.
|
||||
use crate::spatial::bkd::Segment;
|
||||
let mut segment_mappings: Vec<Vec<Option<DocId>>> = Vec::new();
|
||||
for reader in &self.readers {
|
||||
@@ -549,10 +561,10 @@ impl IndexMerger {
|
||||
}
|
||||
for (segment_ord, reader) in self.readers.iter().enumerate() {
|
||||
for (field, temp_file) in &mut temp_files {
|
||||
let mut buf_temp_file = BufWriter::new(temp_file);
|
||||
let spatial_readers = reader.spatial_fields();
|
||||
let spatial_reader = match spatial_readers.get_field(*field)? {
|
||||
Some(reader) => reader,
|
||||
None => continue,
|
||||
let Some(spatial_reader) = spatial_readers.get_field(*field)? else {
|
||||
continue;
|
||||
};
|
||||
let segment = Segment::new(spatial_reader.get_bytes());
|
||||
for triangle_result in LeafPageIterator::new(&segment) {
|
||||
@@ -561,20 +573,21 @@ impl IndexMerger {
|
||||
if let Some(new_doc_id) =
|
||||
segment_mappings[segment_ord][triangle.doc_id as usize]
|
||||
{
|
||||
// This is really just a temporary file, not meant to be portable, so we
|
||||
// use native endianness here.
|
||||
for &word in &triangle.words {
|
||||
temp_file.write_all(&word.to_le_bytes())?;
|
||||
buf_temp_file.write_all(&word.to_ne_bytes())?;
|
||||
}
|
||||
temp_file.write_all(&new_doc_id.to_le_bytes())?;
|
||||
buf_temp_file.write_all(&new_doc_id.to_ne_bytes())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
buf_temp_file.flush()?;
|
||||
// No need to fsync here. This file is not here for persistency.
|
||||
}
|
||||
}
|
||||
if let Some(mut spatial_serializer) = serializer.extract_spatial_serializer() {
|
||||
for (field, mut temp_file) in temp_files {
|
||||
// Flush and sync triangles.
|
||||
temp_file.flush()?;
|
||||
temp_file.as_file_mut().sync_all()?;
|
||||
for (field, temp_file) in temp_files {
|
||||
// Memory map the triangle file.
|
||||
use memmap2::MmapOptions;
|
||||
let mmap = unsafe { MmapOptions::new().map_mut(temp_file.as_file())? };
|
||||
|
||||
Reference in New Issue
Block a user