From c6912ce89af5c6e3f176a8a84883db6f9e7d99d6 Mon Sep 17 00:00:00 2001 From: PSeitz-dd Date: Wed, 10 Dec 2025 13:33:33 +0100 Subject: [PATCH] Handle JSON fields and columnar in space_usage (#2761) return field names in space_usage instead of `Field` more detailed info for columns --- columnar/src/dynamic_column.rs | 84 +++++++++++++++++++- columnar/src/lib.rs | 2 +- src/directory/composite_file.rs | 7 +- src/fastfield/readers.rs | 18 ++--- src/fieldnorm/reader.rs | 6 +- src/index/segment_reader.rs | 10 +-- src/lib.rs | 4 +- src/space_usage/mod.rs | 134 +++++++++++++++++++++++++------- sstable/src/dictionary.rs | 10 ++- 9 files changed, 216 insertions(+), 59 deletions(-) diff --git a/columnar/src/dynamic_column.rs b/columnar/src/dynamic_column.rs index b98bbd2fb..f87779608 100644 --- a/columnar/src/dynamic_column.rs +++ b/columnar/src/dynamic_column.rs @@ -3,7 +3,8 @@ use std::sync::Arc; use std::{fmt, io}; use common::file_slice::FileSlice; -use common::{ByteCount, DateTime, HasLen, OwnedBytes}; +use common::{ByteCount, DateTime, OwnedBytes}; +use serde::{Deserialize, Serialize}; use crate::column::{BytesColumn, Column, StrColumn}; use crate::column_values::{StrictlyMonotonicFn, monotonic_map_column}; @@ -317,10 +318,89 @@ impl DynamicColumnHandle { } pub fn num_bytes(&self) -> ByteCount { - self.file_slice.len().into() + self.file_slice.num_bytes() + } + + /// Legacy helper returning the column space usage. + pub fn column_and_dictionary_num_bytes(&self) -> io::Result { + self.space_usage() + } + + /// Return the space usage of the column, optionally broken down by dictionary and column + /// values. + /// + /// For dictionary encoded columns (strings and bytes), this splits the total footprint into + /// the dictionary and the remaining column data (including index and values). + /// For all other column types, the dictionary size is `None` and the column size + /// equals the total bytes. + pub fn space_usage(&self) -> io::Result { + let total_num_bytes = self.num_bytes(); + let dynamic_column = self.open()?; + let dictionary_num_bytes = match &dynamic_column { + DynamicColumn::Bytes(bytes_column) => bytes_column.dictionary().num_bytes(), + DynamicColumn::Str(str_column) => str_column.dictionary().num_bytes(), + _ => { + return Ok(ColumnSpaceUsage::new(self.num_bytes(), None)); + } + }; + assert!(dictionary_num_bytes <= total_num_bytes); + let column_num_bytes = + ByteCount::from(total_num_bytes.get_bytes() - dictionary_num_bytes.get_bytes()); + Ok(ColumnSpaceUsage::new( + column_num_bytes, + Some(dictionary_num_bytes), + )) } pub fn column_type(&self) -> ColumnType { self.column_type } } + +/// Represents space usage of a column. +/// +/// `column_num_bytes` tracks the column payload (index, values and footer). +/// For dictionary encoded columns, `dictionary_num_bytes` captures the dictionary footprint. +/// [`ColumnSpaceUsage::total_num_bytes`] returns the sum of both parts. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ColumnSpaceUsage { + column_num_bytes: ByteCount, + dictionary_num_bytes: Option, +} + +impl ColumnSpaceUsage { + pub(crate) fn new( + column_num_bytes: ByteCount, + dictionary_num_bytes: Option, + ) -> Self { + ColumnSpaceUsage { + column_num_bytes, + dictionary_num_bytes, + } + } + + pub fn column_num_bytes(&self) -> ByteCount { + self.column_num_bytes + } + + pub fn dictionary_num_bytes(&self) -> Option { + self.dictionary_num_bytes + } + + pub fn total_num_bytes(&self) -> ByteCount { + self.column_num_bytes + self.dictionary_num_bytes.unwrap_or_default() + } + + /// Merge two space usage values by summing their components. + pub fn merge(&self, other: &ColumnSpaceUsage) -> ColumnSpaceUsage { + let dictionary_num_bytes = match (self.dictionary_num_bytes, other.dictionary_num_bytes) { + (Some(lhs), Some(rhs)) => Some(lhs + rhs), + (Some(val), None) | (None, Some(val)) => Some(val), + (None, None) => None, + }; + ColumnSpaceUsage { + column_num_bytes: self.column_num_bytes + other.column_num_bytes, + dictionary_num_bytes, + } + } +} diff --git a/columnar/src/lib.rs b/columnar/src/lib.rs index 0925d912d..537c52562 100644 --- a/columnar/src/lib.rs +++ b/columnar/src/lib.rs @@ -48,7 +48,7 @@ pub use columnar::{ use sstable::VoidSSTable; pub use value::{NumericalType, NumericalValue}; -pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle}; +pub use self::dynamic_column::{ColumnSpaceUsage, DynamicColumn, DynamicColumnHandle}; pub type RowId = u32; pub type DocId = u32; diff --git a/src/directory/composite_file.rs b/src/directory/composite_file.rs index 11d8929f1..1d38e082f 100644 --- a/src/directory/composite_file.rs +++ b/src/directory/composite_file.rs @@ -5,7 +5,7 @@ use std::ops::Range; use common::{BinarySerializable, CountingWriter, HasLen, VInt}; use crate::directory::{FileSlice, TerminatingWrite, WritePtr}; -use crate::schema::Field; +use crate::schema::{Field, Schema}; use crate::space_usage::{FieldUsage, PerFieldSpaceUsage}; #[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)] @@ -167,10 +167,11 @@ impl CompositeFile { .map(|byte_range| self.data.slice(byte_range.clone())) } - pub fn space_usage(&self) -> PerFieldSpaceUsage { + pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage { let mut fields = Vec::new(); for (&field_addr, byte_range) in &self.offsets_index { - let mut field_usage = FieldUsage::empty(field_addr.field); + let field_name = schema.get_field_name(field_addr.field).to_string(); + let mut field_usage = FieldUsage::empty(field_name); field_usage.add_field_idx(field_addr.idx, byte_range.len().into()); fields.push(field_usage); } diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 56bc87778..083f79532 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -8,7 +8,7 @@ use columnar::{ }; use common::ByteCount; -use crate::core::json_utils::encode_column_name; +use crate::core::json_utils::{encode_column_name, json_path_sep_to_dot}; use crate::directory::FileSlice; use crate::schema::{Field, FieldEntry, FieldType, Schema}; use crate::space_usage::{FieldUsage, PerFieldSpaceUsage}; @@ -39,19 +39,15 @@ impl FastFieldReaders { self.resolve_column_name_given_default_field(column_name, default_field_opt) } - pub(crate) fn space_usage(&self, schema: &Schema) -> io::Result { + pub(crate) fn space_usage(&self) -> io::Result { let mut per_field_usages: Vec = Default::default(); - for (field, field_entry) in schema.fields() { - let column_handles = self.columnar.read_columns(field_entry.name())?; - let num_bytes: ByteCount = column_handles - .iter() - .map(|column_handle| column_handle.num_bytes()) - .sum(); - let mut field_usage = FieldUsage::empty(field); - field_usage.add_field_idx(0, num_bytes); + for (mut field_name, column_handle) in self.columnar.iter_columns()? { + json_path_sep_to_dot(&mut field_name); + let space_usage = column_handle.space_usage()?; + let mut field_usage = FieldUsage::empty(field_name); + field_usage.set_column_usage(space_usage); per_field_usages.push(field_usage); } - // TODO fix space usage for JSON fields. Ok(PerFieldSpaceUsage::new(per_field_usages)) } diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs index 07070bb29..a6e7c6a7a 100644 --- a/src/fieldnorm/reader.rs +++ b/src/fieldnorm/reader.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use super::{fieldnorm_to_id, id_to_fieldnorm}; use crate::directory::{CompositeFile, FileSlice, OwnedBytes}; -use crate::schema::Field; +use crate::schema::{Field, Schema}; use crate::space_usage::PerFieldSpaceUsage; use crate::DocId; @@ -37,8 +37,8 @@ impl FieldNormReaders { } /// Return a break down of the space usage per field. - pub fn space_usage(&self) -> PerFieldSpaceUsage { - self.data.space_usage() + pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage { + self.data.space_usage(schema) } /// Returns a handle to inner file diff --git a/src/index/segment_reader.rs b/src/index/segment_reader.rs index f5589a690..cfccc65ed 100644 --- a/src/index/segment_reader.rs +++ b/src/index/segment_reader.rs @@ -455,11 +455,11 @@ impl SegmentReader { pub fn space_usage(&self) -> io::Result { Ok(SegmentSpaceUsage::new( self.num_docs(), - self.termdict_composite.space_usage(), - self.postings_composite.space_usage(), - self.positions_composite.space_usage(), - self.fast_fields_readers.space_usage(self.schema())?, - self.fieldnorm_readers.space_usage(), + self.termdict_composite.space_usage(self.schema()), + self.postings_composite.space_usage(self.schema()), + self.positions_composite.space_usage(self.schema()), + self.fast_fields_readers.space_usage()?, + self.fieldnorm_readers.space_usage(self.schema()), self.get_store_reader(0)?.space_usage(), self.alive_bitset_opt .as_ref() diff --git a/src/lib.rs b/src/lib.rs index 1027f4f46..22eab343a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -216,9 +216,7 @@ use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED}; -#[doc(hidden)] -pub use crate::core::json_utils; -pub use crate::core::{Executor, Searcher, SearcherGeneration}; +pub use crate::core::{json_utils, Executor, Searcher, SearcherGeneration}; pub use crate::directory::Directory; pub use crate::index::{ Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment, diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs index 70291153a..1dc413156 100644 --- a/src/space_usage/mod.rs +++ b/src/space_usage/mod.rs @@ -7,13 +7,14 @@ //! storage-level details into consideration. For example, if your file system block size is 4096 //! bytes, we can under-count actual resultant space usage by up to 4095 bytes per file. -use std::collections::HashMap; +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use columnar::ColumnSpaceUsage; use common::ByteCount; use serde::{Deserialize, Serialize}; use crate::index::SegmentComponent; -use crate::schema::Field; /// Enum containing any of the possible space usage results for segment components. pub enum ComponentSpaceUsage { @@ -212,17 +213,26 @@ impl StoreSpaceUsage { /// Multiple indexes are used to handle variable length things, where #[derive(Clone, Debug, Serialize, Deserialize)] pub struct PerFieldSpaceUsage { - fields: HashMap, + fields: BTreeMap, total: ByteCount, } impl PerFieldSpaceUsage { pub(crate) fn new(fields: Vec) -> PerFieldSpaceUsage { - let total = fields.iter().map(FieldUsage::total).sum(); - let field_usage_map: HashMap = fields - .into_iter() - .map(|field_usage| (field_usage.field(), field_usage)) - .collect(); + let mut total = ByteCount::default(); + let mut field_usage_map: BTreeMap = BTreeMap::new(); + for field_usage in fields { + total += field_usage.total(); + let field_name = field_usage.field_name().to_string(); + match field_usage_map.entry(field_name) { + Entry::Vacant(entry) => { + entry.insert(field_usage); + } + Entry::Occupied(mut entry) => { + entry.get_mut().merge(field_usage); + } + } + } PerFieldSpaceUsage { fields: field_usage_map, total, @@ -230,8 +240,8 @@ impl PerFieldSpaceUsage { } /// Per field space usage - pub fn fields(&self) -> impl Iterator { - self.fields.iter() + pub fn fields(&self) -> impl Iterator { + self.fields.values() } /// Bytes used by the represented file @@ -246,20 +256,23 @@ impl PerFieldSpaceUsage { /// See documentation for [`PerFieldSpaceUsage`] for slightly more information. #[derive(Clone, Debug, Serialize, Deserialize)] pub struct FieldUsage { - field: Field, + field_name: String, num_bytes: ByteCount, /// A field can be composed of more than one piece. /// These pieces are indexed by arbitrary numbers starting at zero. /// `self.num_bytes` includes all of `self.sub_num_bytes`. sub_num_bytes: Vec>, + /// Space usage of the column for fast fields, if relevant. + column_space_usage: Option, } impl FieldUsage { - pub(crate) fn empty(field: Field) -> FieldUsage { + pub(crate) fn empty(field_name: impl Into) -> FieldUsage { FieldUsage { - field, + field_name: field_name.into(), num_bytes: Default::default(), sub_num_bytes: Vec::new(), + column_space_usage: None, } } @@ -272,9 +285,14 @@ impl FieldUsage { self.num_bytes += size } + pub(crate) fn set_column_usage(&mut self, column_space_usage: ColumnSpaceUsage) { + self.num_bytes += column_space_usage.total_num_bytes(); + self.column_space_usage = Some(column_space_usage); + } + /// Field - pub fn field(&self) -> Field { - self.field + pub fn field_name(&self) -> &str { + &self.field_name } /// Space usage for each index @@ -282,16 +300,64 @@ impl FieldUsage { &self.sub_num_bytes[..] } + /// Returns the number of bytes used by the column payload, if the field is columnar. + pub fn column_num_bytes(&self) -> Option { + self.column_space_usage + .as_ref() + .map(ColumnSpaceUsage::column_num_bytes) + } + + /// Returns the number of bytes used by the dictionary for dictionary-encoded columns. + pub fn dictionary_num_bytes(&self) -> Option { + self.column_space_usage + .as_ref() + .and_then(ColumnSpaceUsage::dictionary_num_bytes) + } + + /// Returns the space usage of the column, if any. + pub fn column_space_usage(&self) -> Option<&ColumnSpaceUsage> { + self.column_space_usage.as_ref() + } + /// Total bytes used for this field in this context pub fn total(&self) -> ByteCount { self.num_bytes } + + fn merge(&mut self, other: FieldUsage) { + assert_eq!(self.field_name, other.field_name); + self.num_bytes += other.num_bytes; + if other.sub_num_bytes.len() > self.sub_num_bytes.len() { + self.sub_num_bytes.resize(other.sub_num_bytes.len(), None); + } + for (idx, num_bytes_opt) in other.sub_num_bytes.into_iter().enumerate() { + if let Some(num_bytes) = num_bytes_opt { + match self.sub_num_bytes[idx] { + Some(existing) => self.sub_num_bytes[idx] = Some(existing + num_bytes), + None => self.sub_num_bytes[idx] = Some(num_bytes), + } + } + } + self.column_space_usage = + merge_column_space_usage(self.column_space_usage.take(), other.column_space_usage); + } +} + +fn merge_column_space_usage( + left: Option, + right: Option, +) -> Option { + match (left, right) { + (Some(lhs), Some(rhs)) => Some(lhs.merge(&rhs)), + (Some(space), None) | (None, Some(space)) => Some(space), + (None, None) => None, + } } #[cfg(test)] mod test { use crate::index::Index; - use crate::schema::{Field, Schema, FAST, INDEXED, STORED, TEXT}; + use crate::schema::{Schema, FAST, INDEXED, STORED, TEXT}; use crate::space_usage::PerFieldSpaceUsage; use crate::{IndexWriter, Term}; @@ -307,17 +373,17 @@ mod test { fn expect_single_field( field_space: &PerFieldSpaceUsage, - field: &Field, + field: &str, min_size: u64, max_size: u64, ) { assert!(field_space.total() >= min_size); assert!(field_space.total() <= max_size); assert_eq!( - vec![(field, field_space.total())], + vec![(field.to_string(), field_space.total())], field_space .fields() - .map(|(x, y)| (x, y.total())) + .map(|usage| (usage.field_name().to_string(), usage.total())) .collect::>() ); } @@ -327,6 +393,7 @@ mod test { let mut schema_builder = Schema::builder(); let name = schema_builder.add_u64_field("name", FAST | INDEXED); let schema = schema_builder.build(); + let field_name = schema.get_field_name(name).to_string(); let index = Index::create_in_ram(schema); { @@ -349,11 +416,11 @@ mod test { assert_eq!(4, segment.num_docs()); - expect_single_field(segment.termdict(), &name, 1, 512); - expect_single_field(segment.postings(), &name, 1, 512); + expect_single_field(segment.termdict(), &field_name, 1, 512); + expect_single_field(segment.postings(), &field_name, 1, 512); assert_eq!(segment.positions().total(), 0); - expect_single_field(segment.fast_fields(), &name, 1, 512); - expect_single_field(segment.fieldnorms(), &name, 1, 512); + expect_single_field(segment.fast_fields(), &field_name, 1, 512); + expect_single_field(segment.fieldnorms(), &field_name, 1, 512); // TODO: understand why the following fails // assert_eq!(0, segment.store().total()); assert_eq!(segment.deletes(), 0); @@ -365,6 +432,7 @@ mod test { let mut schema_builder = Schema::builder(); let name = schema_builder.add_text_field("name", TEXT); let schema = schema_builder.build(); + let field_name = schema.get_field_name(name).to_string(); let index = Index::create_in_ram(schema); { @@ -389,11 +457,11 @@ mod test { assert_eq!(4, segment.num_docs()); - expect_single_field(segment.termdict(), &name, 1, 512); - expect_single_field(segment.postings(), &name, 1, 512); - expect_single_field(segment.positions(), &name, 1, 512); + expect_single_field(segment.termdict(), &field_name, 1, 512); + expect_single_field(segment.postings(), &field_name, 1, 512); + expect_single_field(segment.positions(), &field_name, 1, 512); assert_eq!(segment.fast_fields().total(), 0); - expect_single_field(segment.fieldnorms(), &name, 1, 512); + expect_single_field(segment.fieldnorms(), &field_name, 1, 512); // TODO: understand why the following fails // assert_eq!(0, segment.store().total()); assert_eq!(segment.deletes(), 0); @@ -429,10 +497,15 @@ mod test { assert_eq!(4, segment.num_docs()); assert_eq!(segment.termdict().total(), 0); + assert!(segment.termdict().fields().next().is_none()); assert_eq!(segment.postings().total(), 0); + assert!(segment.postings().fields().next().is_none()); assert_eq!(segment.positions().total(), 0); + assert!(segment.positions().fields().next().is_none()); assert_eq!(segment.fast_fields().total(), 0); + assert!(segment.fast_fields().fields().next().is_none()); assert_eq!(segment.fieldnorms().total(), 0); + assert!(segment.fieldnorms().fields().next().is_none()); assert!(segment.store().total() > 0); assert!(segment.store().total() < 512); assert_eq!(segment.deletes(), 0); @@ -444,6 +517,7 @@ mod test { let mut schema_builder = Schema::builder(); let name = schema_builder.add_u64_field("name", INDEXED); let schema = schema_builder.build(); + let field_name = schema.get_field_name(name).to_string(); let index = Index::create_in_ram(schema); { @@ -474,11 +548,11 @@ mod test { assert_eq!(2, segment_space_usage.num_docs()); - expect_single_field(segment_space_usage.termdict(), &name, 1, 512); - expect_single_field(segment_space_usage.postings(), &name, 1, 512); + expect_single_field(segment_space_usage.termdict(), &field_name, 1, 512); + expect_single_field(segment_space_usage.postings(), &field_name, 1, 512); assert_eq!(segment_space_usage.positions().total(), 0u64); assert_eq!(segment_space_usage.fast_fields().total(), 0u64); - expect_single_field(segment_space_usage.fieldnorms(), &name, 1, 512); + expect_single_field(segment_space_usage.fieldnorms(), &field_name, 1, 512); assert!(segment_space_usage.deletes() > 0); Ok(()) } diff --git a/sstable/src/dictionary.rs b/sstable/src/dictionary.rs index 56ddf60be..3a99fc16c 100644 --- a/sstable/src/dictionary.rs +++ b/sstable/src/dictionary.rs @@ -8,7 +8,7 @@ use std::sync::Arc; use common::bounds::{TransformBound, transform_bound_inner_res}; use common::file_slice::FileSlice; -use common::{BinarySerializable, OwnedBytes}; +use common::{BinarySerializable, ByteCount, OwnedBytes}; use futures_util::{StreamExt, TryStreamExt, stream}; use itertools::Itertools; use tantivy_fst::Automaton; @@ -43,6 +43,7 @@ use crate::{ pub struct Dictionary { pub sstable_slice: FileSlice, pub sstable_index: SSTableIndex, + num_bytes: ByteCount, num_terms: u64, phantom_data: PhantomData, } @@ -278,6 +279,7 @@ impl Dictionary { /// Opens a `TermDictionary`. pub fn open(term_dictionary_file: FileSlice) -> io::Result { + let num_bytes = term_dictionary_file.num_bytes(); let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(20); let mut footer_len_bytes: OwnedBytes = footer_len_slice.read_bytes()?; let index_offset = u64::deserialize(&mut footer_len_bytes)?; @@ -317,6 +319,7 @@ impl Dictionary { Ok(Dictionary { sstable_slice, sstable_index, + num_bytes, num_terms, phantom_data: PhantomData, }) @@ -343,6 +346,11 @@ impl Dictionary { self.num_terms as usize } + /// Returns the total number of bytes used by the dictionary on disk. + pub fn num_bytes(&self) -> ByteCount { + self.num_bytes + } + /// Decode a DeltaReader up to key, returning the number of terms traversed /// /// If the key was not found, returns Ok(None).