Mirror of https://github.com/quickwit-oss/tantivy.git
Synced 2026-01-04 00:02:55 +00:00

Compare commits: bucket_id_… ... paradedb/f…

4 Commits
| Author | SHA1 | Date |
|--------|------|------|
|        | fe293225d5 | |
|        | 794ff1ffc9 | |
|        | c6912ce89a | |
|        | 618e3bd11b | |
@@ -3,7 +3,8 @@ use std::sync::Arc;
 use std::{fmt, io};
 
 use common::file_slice::FileSlice;
-use common::{ByteCount, DateTime, HasLen, OwnedBytes};
+use common::{ByteCount, DateTime, OwnedBytes};
+use serde::{Deserialize, Serialize};
 
 use crate::column::{BytesColumn, Column, StrColumn};
 use crate::column_values::{StrictlyMonotonicFn, monotonic_map_column};
@@ -317,10 +318,89 @@ impl DynamicColumnHandle {
     }
 
     pub fn num_bytes(&self) -> ByteCount {
-        self.file_slice.len().into()
+        self.file_slice.num_bytes()
     }
 
+    /// Legacy helper returning the column space usage.
+    pub fn column_and_dictionary_num_bytes(&self) -> io::Result<ColumnSpaceUsage> {
+        self.space_usage()
+    }
+
+    /// Return the space usage of the column, optionally broken down by dictionary and column
+    /// values.
+    ///
+    /// For dictionary encoded columns (strings and bytes), this splits the total footprint into
+    /// the dictionary and the remaining column data (including index and values).
+    /// For all other column types, the dictionary size is `None` and the column size
+    /// equals the total bytes.
+    pub fn space_usage(&self) -> io::Result<ColumnSpaceUsage> {
+        let total_num_bytes = self.num_bytes();
+        let dynamic_column = self.open()?;
+        let dictionary_num_bytes = match &dynamic_column {
+            DynamicColumn::Bytes(bytes_column) => bytes_column.dictionary().num_bytes(),
+            DynamicColumn::Str(str_column) => str_column.dictionary().num_bytes(),
+            _ => {
+                return Ok(ColumnSpaceUsage::new(self.num_bytes(), None));
+            }
+        };
+        assert!(dictionary_num_bytes <= total_num_bytes);
+        let column_num_bytes =
+            ByteCount::from(total_num_bytes.get_bytes() - dictionary_num_bytes.get_bytes());
+        Ok(ColumnSpaceUsage::new(
+            column_num_bytes,
+            Some(dictionary_num_bytes),
+        ))
+    }
+
     pub fn column_type(&self) -> ColumnType {
         self.column_type
     }
 }
 
+/// Represents space usage of a column.
+///
+/// `column_num_bytes` tracks the column payload (index, values and footer).
+/// For dictionary encoded columns, `dictionary_num_bytes` captures the dictionary footprint.
+/// [`ColumnSpaceUsage::total_num_bytes`] returns the sum of both parts.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ColumnSpaceUsage {
+    column_num_bytes: ByteCount,
+    dictionary_num_bytes: Option<ByteCount>,
+}
+
+impl ColumnSpaceUsage {
+    pub(crate) fn new(
+        column_num_bytes: ByteCount,
+        dictionary_num_bytes: Option<ByteCount>,
+    ) -> Self {
+        ColumnSpaceUsage {
+            column_num_bytes,
+            dictionary_num_bytes,
+        }
+    }
+
+    pub fn column_num_bytes(&self) -> ByteCount {
+        self.column_num_bytes
+    }
+
+    pub fn dictionary_num_bytes(&self) -> Option<ByteCount> {
+        self.dictionary_num_bytes
+    }
+
+    pub fn total_num_bytes(&self) -> ByteCount {
+        self.column_num_bytes + self.dictionary_num_bytes.unwrap_or_default()
+    }
+
+    /// Merge two space usage values by summing their components.
+    pub fn merge(&self, other: &ColumnSpaceUsage) -> ColumnSpaceUsage {
+        let dictionary_num_bytes = match (self.dictionary_num_bytes, other.dictionary_num_bytes) {
+            (Some(lhs), Some(rhs)) => Some(lhs + rhs),
+            (Some(val), None) | (None, Some(val)) => Some(val),
+            (None, None) => None,
+        };
+        ColumnSpaceUsage {
+            column_num_bytes: self.column_num_bytes + other.column_num_bytes,
+            dictionary_num_bytes,
+        }
+    }
+}
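For context, the accounting above keeps one invariant: the total footprint of a dictionary encoded column is the dictionary bytes plus the remaining column payload. A minimal standalone sketch of that bookkeeping, using plain u64 counters in place of tantivy's ByteCount type:

// Standalone sketch mirroring the diff's ColumnSpaceUsage accounting:
// total is always the column payload plus the optional dictionary bytes.
#[derive(Clone, Debug, Default)]
struct ColumnSpaceUsage {
    column_num_bytes: u64,
    dictionary_num_bytes: Option<u64>,
}

impl ColumnSpaceUsage {
    fn total_num_bytes(&self) -> u64 {
        self.column_num_bytes + self.dictionary_num_bytes.unwrap_or(0)
    }

    /// Merge by summing components, keeping `None` only when both sides lack a dictionary.
    fn merge(&self, other: &ColumnSpaceUsage) -> ColumnSpaceUsage {
        let dictionary_num_bytes = match (self.dictionary_num_bytes, other.dictionary_num_bytes) {
            (Some(l), Some(r)) => Some(l + r),
            (Some(v), None) | (None, Some(v)) => Some(v),
            (None, None) => None,
        };
        ColumnSpaceUsage {
            column_num_bytes: self.column_num_bytes + other.column_num_bytes,
            dictionary_num_bytes,
        }
    }
}

fn main() {
    // A dictionary-encoded string column plus a plain numeric column.
    let str_col = ColumnSpaceUsage { column_num_bytes: 120, dictionary_num_bytes: Some(80) };
    let u64_col = ColumnSpaceUsage { column_num_bytes: 64, dictionary_num_bytes: None };
    let merged = str_col.merge(&u64_col);
    assert_eq!(merged.total_num_bytes(), 264);
    println!("{merged:?}");
}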
@@ -48,7 +48,7 @@ pub use columnar::{
 use sstable::VoidSSTable;
 pub use value::{NumericalType, NumericalValue};
 
-pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};
+pub use self::dynamic_column::{ColumnSpaceUsage, DynamicColumn, DynamicColumnHandle};
 
 pub type RowId = u32;
 pub type DocId = u32;
@@ -55,22 +55,44 @@ pub(crate) fn get_numeric_or_date_column_types() -> &'static [ColumnType] {
     ]
 }
 
-/// Get fast field reader or empty as default.
+/// Get fast field reader or return an error if the field doesn't exist.
 pub(crate) fn get_ff_reader(
     reader: &SegmentReader,
     field_name: &str,
     allowed_column_types: Option<&[ColumnType]>,
 ) -> crate::Result<(columnar::Column<u64>, ColumnType)> {
     let ff_fields = reader.fast_fields();
-    let ff_field_with_type = ff_fields
-        .u64_lenient_for_type(allowed_column_types, field_name)?
-        .unwrap_or_else(|| {
-            (
-                Column::build_empty_column(reader.num_docs()),
-                ColumnType::U64,
-            )
-        });
-    Ok(ff_field_with_type)
+    let ff_field_with_type = ff_fields.u64_lenient_for_type(allowed_column_types, field_name)?;
+
+    match ff_field_with_type {
+        Some(field) => Ok(field),
+        None => {
+            // Check if the field exists in the schema but is not a fast field
+            let schema = reader.schema();
+            if let Some((field, _path)) = schema.find_field(field_name) {
+                let field_type = schema.get_field_entry(field).field_type();
+                if !field_type.is_fast() {
+                    return Err(crate::TantivyError::SchemaError(format!(
+                        "Field '{}' is not a fast field. Aggregations require fast fields.",
+                        field_name
+                    )));
+                }
+            }
+
+            // Field doesn't exist at all or has no values in this segment
+            // Check if it exists in schema to provide a better error message
+            if schema.find_field(field_name).is_none() {
+                return Err(crate::TantivyError::FieldNotFound(field_name.to_string()));
+            }
+
+            // Field exists in schema and is a fast field, but has no values in this segment
+            // This is acceptable - return an empty column
+            Ok((
+                Column::build_empty_column(reader.num_docs()),
+                ColumnType::U64,
+            ))
+        }
+    }
 }
 
 pub(crate) fn get_dynamic_columns(
@@ -89,6 +111,7 @@ pub(crate) fn get_dynamic_columns(
 /// Get all fast field reader or empty as default.
 ///
 /// Is guaranteed to return at least one column.
+/// Returns an error if the field doesn't exist in the schema or is not a fast field.
 pub(crate) fn get_all_ff_reader_or_empty(
     reader: &SegmentReader,
     field_name: &str,
@@ -98,7 +121,25 @@ pub(crate) fn get_all_ff_reader_or_empty(
     let ff_fields = reader.fast_fields();
     let mut ff_field_with_type =
         ff_fields.u64_lenient_for_type_all(allowed_column_types, field_name)?;
+
     if ff_field_with_type.is_empty() {
+        // Check if the field exists in the schema but is not a fast field
+        let schema = reader.schema();
+        if let Some((field, _path)) = schema.find_field(field_name) {
+            let field_type = schema.get_field_entry(field).field_type();
+            if !field_type.is_fast() {
+                return Err(crate::TantivyError::SchemaError(format!(
+                    "Field '{}' is not a fast field. Aggregations require fast fields.",
+                    field_name
+                )));
+            }
+        } else {
+            // Field doesn't exist in the schema at all
+            return Err(crate::TantivyError::FieldNotFound(field_name.to_string()));
+        }
+
+        // Field exists in schema and is a fast field, but has no values in this segment
+        // This is acceptable - return an empty column
         ff_field_with_type.push((Column::build_empty_column(reader.num_docs()), fallback_type));
     }
     Ok(ff_field_with_type)
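The two readers above share the same three-way policy: a field missing from the schema yields FieldNotFound, a schema field that is not fast yields SchemaError, and a fast field with no values in the segment falls back to an empty column. A standalone sketch of that decision tree, where `FieldInfo` and `resolve` are hypothetical stand-ins for tantivy's schema lookup, not real APIs:

use std::collections::HashMap;

// Sketch of the field-resolution policy used by get_ff_reader /
// get_all_ff_reader_or_empty above.
#[derive(Debug, PartialEq)]
enum Resolution {
    EmptyColumn,           // field exists, is fast, but has no values in this segment
    SchemaError(String),   // field exists but is not a fast field
    FieldNotFound(String), // field is not in the schema at all
}

struct FieldInfo {
    is_fast: bool,
}

fn resolve(schema: &HashMap<&str, FieldInfo>, field_name: &str) -> Resolution {
    match schema.get(field_name) {
        None => Resolution::FieldNotFound(field_name.to_string()),
        Some(info) if !info.is_fast => Resolution::SchemaError(format!(
            "Field '{field_name}' is not a fast field. Aggregations require fast fields."
        )),
        // A real reader would return the column when the segment has values;
        // with no values, it still succeeds with an empty column.
        Some(_) => Resolution::EmptyColumn,
    }
}

fn main() {
    let mut schema = HashMap::new();
    schema.insert("score", FieldInfo { is_fast: true });
    schema.insert("body", FieldInfo { is_fast: false });
    assert_eq!(resolve(&schema, "score"), Resolution::EmptyColumn);
    assert!(matches!(resolve(&schema, "body"), Resolution::SchemaError(_)));
    assert!(matches!(resolve(&schema, "nope"), Resolution::FieldNotFound(_)));
    println!("policy checks passed");
}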
@@ -1057,7 +1057,7 @@ mod tests {
             "avg": {"field": "score"}
         }));
         let terms_string_with_child = agg_from_json(json!({
-            "terms": {"field": "string_id"},
+            "terms": {"field": "text"},
             "aggs": {
                 "histo": {"histogram": {"field": "score", "interval": 10.0}}
             }
@@ -1005,3 +1005,123 @@ fn test_aggregation_on_json_object_mixed_numerical_segments() {
         )
     );
 }
+
+#[test]
+fn test_aggregation_invalid_field_returns_error() {
+    // Test that aggregations return an error when given an invalid field name
+    let index = get_test_index_2_segments(false).unwrap();
+    let reader = index.reader().unwrap();
+    let searcher = reader.searcher();
+
+    // Test with a field that doesn't exist at all
+    let agg_req_str = r#"
+    {
+        "date_histogram_test": {
+            "date_histogram": {
+                "field": "not_valid_field",
+                "fixed_interval": "30d"
+            }
+        }
+    }"#;
+    let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
+    let collector = get_collector(agg);
+    let result = searcher.search(&AllQuery, &collector);
+
+    assert!(result.is_err());
+    match result {
+        Err(crate::TantivyError::FieldNotFound(field_name)) => {
+            assert_eq!(field_name, "not_valid_field");
+        }
+        _ => panic!("Expected FieldNotFound error, got: {:?}", result),
+    }
+
+    // Test with histogram aggregation on invalid field
+    let agg_req_str = r#"
+    {
+        "histogram_test": {
+            "histogram": {
+                "field": "invalid_histogram_field",
+                "interval": 10.0
+            }
+        }
+    }"#;
+    let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
+    let collector = get_collector(agg);
+    let result = searcher.search(&AllQuery, &collector);
+
+    assert!(result.is_err());
+    match result {
+        Err(crate::TantivyError::FieldNotFound(field_name)) => {
+            assert_eq!(field_name, "invalid_histogram_field");
+        }
+        _ => panic!("Expected FieldNotFound error, got: {:?}", result),
+    }
+
+    // Test with terms aggregation on invalid field
+    let agg_req_str = r#"
+    {
+        "terms_test": {
+            "terms": {
+                "field": "invalid_terms_field"
+            }
+        }
+    }"#;
+    let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
+    let collector = get_collector(agg);
+    let result = searcher.search(&AllQuery, &collector);
+
+    assert!(result.is_err());
+    match result {
+        Err(crate::TantivyError::FieldNotFound(field_name)) => {
+            assert_eq!(field_name, "invalid_terms_field");
+        }
+        _ => panic!("Expected FieldNotFound error, got: {:?}", result),
+    }
+
+    // Test with avg metric aggregation on invalid field
+    let agg_req_str = r#"
+    {
+        "avg_test": {
+            "avg": {
+                "field": "invalid_avg_field"
+            }
+        }
+    }"#;
+    let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
+    let collector = get_collector(agg);
+    let result = searcher.search(&AllQuery, &collector);
+
+    assert!(result.is_err());
+    match result {
+        Err(crate::TantivyError::FieldNotFound(field_name)) => {
+            assert_eq!(field_name, "invalid_avg_field");
+        }
+        _ => panic!("Expected FieldNotFound error, got: {:?}", result),
+    }
+
+    // Test with range aggregation on invalid field
+    let agg_req_str = r#"
+    {
+        "range_test": {
+            "range": {
+                "field": "invalid_range_field",
+                "ranges": [
+                    { "to": 10.0 },
+                    { "from": 10.0, "to": 20.0 },
+                    { "from": 20.0 }
+                ]
+            }
+        }
+    }"#;
+    let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
+    let collector = get_collector(agg);
+    let result = searcher.search(&AllQuery, &collector);
+
+    assert!(result.is_err());
+    match result {
+        Err(crate::TantivyError::FieldNotFound(field_name)) => {
+            assert_eq!(field_name, "invalid_range_field");
+        }
+        _ => panic!("Expected FieldNotFound error, got: {:?}", result),
+    }
+}
@@ -255,6 +255,7 @@ mod tests {
     fn terms_aggregation_missing_mult_seg_empty() -> crate::Result<()> {
         let mut schema_builder = Schema::builder();
         let score = schema_builder.add_f64_field("score", FAST);
+        schema_builder.add_json_field("json", FAST);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
@@ -302,6 +303,7 @@ mod tests {
     fn terms_aggregation_missing_single_seg_empty() -> crate::Result<()> {
         let mut schema_builder = Schema::builder();
         let score = schema_builder.add_f64_field("score", FAST);
+        schema_builder.add_json_field("json", FAST);
        let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
@@ -406,7 +406,7 @@ mod tests {
         let mut term = Term::from_field_json_path(field, "color", false);
         term.append_type_and_str("red");
 
-        assert_eq!(term.serialized_term(), b"\x00\x00\x00\x01jcolor\x00sred")
+        assert_eq!(term.serialized_value_bytes(), b"color\x00sred".to_vec())
     }
 
     #[test]
@@ -416,8 +416,8 @@ mod tests {
         term.append_type_and_fast_value(-4i64);
 
         assert_eq!(
-            term.serialized_term(),
-            b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
+            term.serialized_value_bytes(),
+            b"color\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc".to_vec()
         )
     }
 
@@ -428,8 +428,8 @@ mod tests {
         term.append_type_and_fast_value(4u64);
 
         assert_eq!(
-            term.serialized_term(),
-            b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
+            term.serialized_value_bytes(),
+            b"color\x00u\x00\x00\x00\x00\x00\x00\x00\x04".to_vec()
         )
     }
 
@@ -439,8 +439,8 @@ mod tests {
         let mut term = Term::from_field_json_path(field, "color", false);
         term.append_type_and_fast_value(4.0f64);
         assert_eq!(
-            term.serialized_term(),
-            b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
+            term.serialized_value_bytes(),
+            b"color\x00f\xc0\x10\x00\x00\x00\x00\x00\x00".to_vec()
        )
     }
 
@@ -450,8 +450,8 @@ mod tests {
         let mut term = Term::from_field_json_path(field, "color", false);
         term.append_type_and_fast_value(true);
         assert_eq!(
-            term.serialized_term(),
-            b"\x00\x00\x00\x01jcolor\x00o\x00\x00\x00\x00\x00\x00\x00\x01"
+            term.serialized_value_bytes(),
+            b"color\x00o\x00\x00\x00\x00\x00\x00\x00\x01".to_vec()
        )
     }
@@ -5,7 +5,7 @@ use std::ops::Range;
 use common::{BinarySerializable, CountingWriter, HasLen, VInt};
 
 use crate::directory::{FileSlice, TerminatingWrite, WritePtr};
-use crate::schema::Field;
+use crate::schema::{Field, Schema};
 use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
 
 #[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
@@ -167,10 +167,11 @@ impl CompositeFile {
             .map(|byte_range| self.data.slice(byte_range.clone()))
     }
 
-    pub fn space_usage(&self) -> PerFieldSpaceUsage {
+    pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage {
         let mut fields = Vec::new();
         for (&field_addr, byte_range) in &self.offsets_index {
-            let mut field_usage = FieldUsage::empty(field_addr.field);
+            let field_name = schema.get_field_name(field_addr.field).to_string();
+            let mut field_usage = FieldUsage::empty(field_name);
             field_usage.add_field_idx(field_addr.idx, byte_range.len().into());
             fields.push(field_usage);
         }
@@ -8,7 +8,7 @@ use columnar::{
 };
 use common::ByteCount;
 
-use crate::core::json_utils::encode_column_name;
+use crate::core::json_utils::{encode_column_name, json_path_sep_to_dot};
 use crate::directory::FileSlice;
 use crate::schema::{Field, FieldEntry, FieldType, Schema};
 use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
@@ -39,19 +39,15 @@ impl FastFieldReaders {
         self.resolve_column_name_given_default_field(column_name, default_field_opt)
     }
 
-    pub(crate) fn space_usage(&self, schema: &Schema) -> io::Result<PerFieldSpaceUsage> {
+    pub(crate) fn space_usage(&self) -> io::Result<PerFieldSpaceUsage> {
         let mut per_field_usages: Vec<FieldUsage> = Default::default();
-        for (field, field_entry) in schema.fields() {
-            let column_handles = self.columnar.read_columns(field_entry.name())?;
-            let num_bytes: ByteCount = column_handles
-                .iter()
-                .map(|column_handle| column_handle.num_bytes())
-                .sum();
-            let mut field_usage = FieldUsage::empty(field);
-            field_usage.add_field_idx(0, num_bytes);
+        for (mut field_name, column_handle) in self.columnar.iter_columns()? {
+            json_path_sep_to_dot(&mut field_name);
+            let space_usage = column_handle.space_usage()?;
+            let mut field_usage = FieldUsage::empty(field_name);
+            field_usage.set_column_usage(space_usage);
             per_field_usages.push(field_usage);
         }
-        // TODO fix space usage for JSON fields.
         Ok(PerFieldSpaceUsage::new(per_field_usages))
     }
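The new fast-field accounting iterates the columnar file directly and normalizes JSON path separators into dots before using the column name as a field key. A standalone sketch of that normalization, assuming the internal separator is the 0x01 byte used by tantivy's JSON path encoding:

// Sketch of json_path_sep_to_dot as used above: rewrite the internal JSON
// path separator into '.' so "attributes\x01color" displays as
// "attributes.color". The 0x01 separator byte is an assumption here.
const JSON_PATH_SEGMENT_SEP: u8 = 1u8;

fn json_path_sep_to_dot(path: &mut String) {
    // Both the separator and '.' are ASCII, so a byte-wise rebuild is safe.
    *path = path
        .bytes()
        .map(|b| if b == JSON_PATH_SEGMENT_SEP { b'.' } else { b })
        .map(char::from)
        .collect();
}

fn main() {
    let mut field_name = String::from("attributes\u{1}color");
    json_path_sep_to_dot(&mut field_name);
    assert_eq!(field_name, "attributes.color");
}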
@@ -2,7 +2,7 @@ use std::sync::Arc;
 
 use super::{fieldnorm_to_id, id_to_fieldnorm};
 use crate::directory::{CompositeFile, FileSlice, OwnedBytes};
-use crate::schema::Field;
+use crate::schema::{Field, Schema};
 use crate::space_usage::PerFieldSpaceUsage;
 use crate::DocId;
@@ -37,8 +37,8 @@ impl FieldNormReaders {
     }
 
     /// Return a break down of the space usage per field.
-    pub fn space_usage(&self) -> PerFieldSpaceUsage {
-        self.data.space_usage()
+    pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage {
+        self.data.space_usage(schema)
     }
 
     /// Returns a handle to inner file
@@ -455,11 +455,11 @@ impl SegmentReader {
     pub fn space_usage(&self) -> io::Result<SegmentSpaceUsage> {
         Ok(SegmentSpaceUsage::new(
             self.num_docs(),
-            self.termdict_composite.space_usage(),
-            self.postings_composite.space_usage(),
-            self.positions_composite.space_usage(),
-            self.fast_fields_readers.space_usage(self.schema())?,
-            self.fieldnorm_readers.space_usage(),
+            self.termdict_composite.space_usage(self.schema()),
+            self.postings_composite.space_usage(self.schema()),
+            self.positions_composite.space_usage(self.schema()),
+            self.fast_fields_readers.space_usage()?,
+            self.fieldnorm_readers.space_usage(self.schema()),
             self.get_store_reader(0)?.space_usage(),
             self.alive_bitset_opt
                 .as_ref()
@@ -3,21 +3,21 @@ use std::net::Ipv6Addr;
 use columnar::MonotonicallyMappableToU128;
 
 use crate::fastfield::FastValue;
-use crate::schema::{Field, Type};
+use crate::schema::Field;
 
-/// Term represents the value that the token can take.
-/// It's a serialized representation over different types.
+/// IndexingTerm is used to represent a term during indexing.
+/// It's a serialized representation over field and value.
 ///
-/// It actually wraps a `Vec<u8>`. The first 5 bytes are metadata.
-/// 4 bytes are the field id, and the last byte is the type.
+/// It actually wraps a `Vec<u8>`. The first 4 bytes are the field.
 ///
-/// The serialized value `ValueBytes` is considered everything after the 4 first bytes (term id).
+/// We serialize the field, because we index everything in a single
+/// global term dictionary during indexing.
 #[derive(Clone)]
 pub(crate) struct IndexingTerm<B = Vec<u8>>(B)
 where B: AsRef<[u8]>;
 
 /// The number of bytes used as metadata by `Term`.
-const TERM_METADATA_LENGTH: usize = 5;
+const TERM_METADATA_LENGTH: usize = 4;
 
 impl IndexingTerm {
     /// Create a new Term with a buffer with a given capacity.
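With the type byte gone, an IndexingTerm is just a 4-byte big-endian field id followed by the serialized value bytes. A standalone sketch of that layout and the field extraction it enables:

// Standalone sketch of the IndexingTerm layout after this change:
// [4-byte big-endian field id][serialized value bytes].
const TERM_METADATA_LENGTH: usize = 4;

fn new_term(field_id: u32, value_bytes: &[u8]) -> Vec<u8> {
    let mut buf = Vec::with_capacity(TERM_METADATA_LENGTH + value_bytes.len());
    buf.extend_from_slice(&field_id.to_be_bytes());
    buf.extend_from_slice(value_bytes);
    buf
}

fn field_of(term: &[u8]) -> u32 {
    // Mirrors IndexingTerm::field(): decode the big-endian prefix.
    u32::from_be_bytes(term[..TERM_METADATA_LENGTH].try_into().unwrap())
}

fn main() {
    let term = new_term(1, b"test");
    assert_eq!(term, b"\x00\x00\x00\x01test");
    assert_eq!(field_of(&term), 1);
}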
@@ -31,10 +31,9 @@ impl IndexingTerm {
     /// Use `clear_with_field_and_type` in that case.
     ///
     /// Sets field and the type.
-    pub(crate) fn set_field_and_type(&mut self, field: Field, typ: Type) {
+    pub(crate) fn set_field(&mut self, field: Field) {
         assert!(self.is_empty());
         self.0[0..4].clone_from_slice(field.field_id().to_be_bytes().as_ref());
-        self.0[4] = typ.to_code();
     }
 
     /// Is empty if there are no value bytes.
@@ -42,10 +41,10 @@ impl IndexingTerm {
         self.0.len() == TERM_METADATA_LENGTH
     }
 
-    /// Removes the value_bytes and set the field and type code.
-    pub(crate) fn clear_with_field_and_type(&mut self, typ: Type, field: Field) {
+    /// Removes the value_bytes and set the field
+    pub(crate) fn clear_with_field(&mut self, field: Field) {
         self.truncate_value_bytes(0);
-        self.set_field_and_type(field, typ);
+        self.set_field(field);
     }
 
     /// Sets a u64 value in the term.
@@ -122,6 +121,23 @@ impl IndexingTerm {
 impl<B> IndexingTerm<B>
 where B: AsRef<[u8]>
 {
+    /// Wraps serialized term bytes.
+    ///
+    /// The input buffer is expected to be the concatenation of the big endian encoded field id
+    /// followed by the serialized value bytes (type tag + payload).
+    #[inline]
+    pub fn wrap(serialized_term: B) -> IndexingTerm<B> {
+        debug_assert!(serialized_term.as_ref().len() >= TERM_METADATA_LENGTH);
+        IndexingTerm(serialized_term)
+    }
+
+    /// Returns the field this term belongs to.
+    #[inline]
+    pub fn field(&self) -> Field {
+        let field_id_bytes: [u8; 4] = self.0.as_ref()[..4].try_into().unwrap();
+        Field::from_field_id(u32::from_be_bytes(field_id_bytes))
+    }
+
     /// Returns the serialized representation of Term.
     /// This includes field_id, value type and value.
     ///
@@ -136,6 +152,7 @@ where B: AsRef<[u8]>
 #[cfg(test)]
 mod tests {
 
+    use super::IndexingTerm;
     use crate::schema::*;
 
     #[test]
@@ -143,42 +160,55 @@ mod tests {
         let mut schema_builder = Schema::builder();
         schema_builder.add_text_field("text", STRING);
         let title_field = schema_builder.add_text_field("title", STRING);
-        let term = Term::from_field_text(title_field, "test");
+        let mut term = IndexingTerm::with_capacity(0);
+        term.set_field(title_field);
+        term.set_bytes(b"test");
         assert_eq!(term.field(), title_field);
-        assert_eq!(term.typ(), Type::Str);
-        assert_eq!(term.value().as_str(), Some("test"))
+        assert_eq!(term.serialized_term(), b"\x00\x00\x00\x01test".to_vec())
     }
 
     /// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
     /// <field> + <type byte> + <value len>
     ///
     /// - <field> is a big endian encoded u32 field id
     /// - <type_byte>'s most significant bit expresses whether the term is a json term or not The
     ///   remaining 7 bits are used to encode the type of the value. If this is a JSON term, the
     ///   type is the type of the leaf of the json.
     /// - <value> is, if this is not the json term, a binary representation specific to the type.
     ///   If it is a JSON Term, then it is prepended with the path that leads to this leaf value.
-    const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
+    const FAST_VALUE_TERM_LEN: usize = 4 + 8;
 
     #[test]
     pub fn test_term_u64() {
         let mut schema_builder = Schema::builder();
         let count_field = schema_builder.add_u64_field("count", INDEXED);
-        let term = Term::from_field_u64(count_field, 983u64);
+        let mut term = IndexingTerm::with_capacity(0);
+        term.set_field(count_field);
+        term.set_u64(983u64);
         assert_eq!(term.field(), count_field);
-        assert_eq!(term.typ(), Type::U64);
         assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN);
-        assert_eq!(term.value().as_u64(), Some(983u64))
     }
 
     #[test]
     pub fn test_term_bool() {
         let mut schema_builder = Schema::builder();
         let bool_field = schema_builder.add_bool_field("bool", INDEXED);
-        let term = Term::from_field_bool(bool_field, true);
+        let term = {
+            let mut term = IndexingTerm::with_capacity(0);
+            term.set_field(bool_field);
+            term.set_bool(true);
+            term
+        };
         assert_eq!(term.field(), bool_field);
-        assert_eq!(term.typ(), Type::Bool);
         assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN);
-        assert_eq!(term.value().as_bool(), Some(true))
     }
+
+    #[test]
+    pub fn indexing_term_wrap_extracts_field() {
+        let field = Field::from_field_id(7u32);
+        let mut term = IndexingTerm::with_capacity(0);
+        term.set_field(field);
+        term.append_bytes(b"abc");
+
+        let wrapped = IndexingTerm::wrap(term.serialized_term());
+        assert_eq!(wrapped.field(), field);
+        assert_eq!(wrapped.serialized_term(), term.serialized_term());
+    }
 }
@@ -171,7 +171,7 @@ impl SegmentWriter {
         let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx);
         let postings_writer: &mut dyn PostingsWriter =
             self.per_field_postings_writers.get_for_field_mut(field);
-        term_buffer.clear_with_field_and_type(field_entry.field_type().value_type(), field);
+        term_buffer.clear_with_field(field);
 
         match field_entry.field_type() {
             FieldType::Facet(_) => {
@@ -216,9 +216,7 @@ use once_cell::sync::Lazy;
 use serde::{Deserialize, Serialize};
 
 pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
-#[doc(hidden)]
-pub use crate::core::json_utils;
-pub use crate::core::{Executor, Searcher, SearcherGeneration};
+pub use crate::core::{json_utils, Executor, Searcher, SearcherGeneration};
 pub use crate::directory::Directory;
 pub use crate::index::{
     Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
@@ -8,7 +8,7 @@ use crate::indexer::path_to_unordered_id::OrderedPathId;
 use crate::postings::postings_writer::SpecializedPostingsWriter;
 use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
 use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter};
-use crate::schema::{Field, Type, ValueBytes};
+use crate::schema::{Field, Type};
 use crate::tokenizer::TokenStream;
 use crate::DocId;
@@ -79,8 +79,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
         term_buffer.truncate(term_path_len);
         term_buffer.append_bytes(term);
 
-        let json_value = ValueBytes::wrap(term);
-        let typ = json_value.typ();
+        let typ = Type::from_code(term[0]).expect("Invalid type code in JSON term");
         if typ == Type::Str {
             SpecializedPostingsWriter::<Rec>::serialize_one_term(
                 term_buffer.as_bytes(),
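Here the JSON postings writer reads the value's type directly from the first byte of its serialized bytes instead of going through ValueBytes. A standalone sketch of that dispatch, covering only the type tags visible elsewhere in this diff ('s', 'u', 'i', 'f', 'o'); the mapping of other codes is not shown here:

// Sketch of decoding a value's type from the leading type tag, mirroring
// `Type::from_code(term[0])` above.
#[derive(Debug, PartialEq)]
enum Type { Str, U64, I64, F64, Bool }

fn type_from_code(code: u8) -> Option<Type> {
    match code {
        b's' => Some(Type::Str),
        b'u' => Some(Type::U64),
        b'i' => Some(Type::I64),
        b'f' => Some(Type::F64),
        b'o' => Some(Type::Bool),
        _ => None,
    }
}

fn main() {
    // Value bytes of a JSON string term: type tag, then the payload.
    let value_bytes = b"sred";
    let typ = type_from_code(value_bytes[0]).expect("Invalid type code in JSON term");
    assert_eq!(typ, Type::Str);
}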
@@ -107,6 +106,8 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
     }
 }
 
+/// Helper to build the JSON term bytes that land in the term dictionary.
+/// Format: `[json path utf8][JSON_END_OF_PATH][type tag][payload]`
 struct JsonTermSerializer(Vec<u8>);
 impl JsonTermSerializer {
     /// Appends a JSON path to the Term.
@@ -11,7 +11,7 @@ use crate::postings::recorder::{BufferLender, Recorder};
 use crate::postings::{
     FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter,
 };
-use crate::schema::{Field, Schema, Term, Type};
+use crate::schema::{Field, Schema, Type};
 use crate::tokenizer::{Token, TokenStream, MAX_TOKEN_LEN};
 use crate::DocId;
@@ -59,14 +59,14 @@ pub(crate) fn serialize_postings(
     let mut term_offsets: Vec<(Field, OrderedPathId, &[u8], Addr)> =
         Vec::with_capacity(ctx.term_index.len());
     term_offsets.extend(ctx.term_index.iter().map(|(key, addr)| {
-        let field = Term::wrap(key).field();
+        let field = IndexingTerm::wrap(key).field();
         if schema.get_field_entry(field).field_type().value_type() == Type::Json {
-            let byte_range_path = 5..5 + 4;
+            let byte_range_path = 4..4 + 4;
             let unordered_id = u32::from_be_bytes(key[byte_range_path.clone()].try_into().unwrap());
             let path_id = unordered_id_to_ordered_id[unordered_id as usize];
             (field, path_id, &key[byte_range_path.end..], addr)
         } else {
-            (field, 0.into(), &key[5..], addr)
+            (field, 0.into(), &key[4..], addr)
         }
     }));
     // Sort by field, path, and term
@@ -1,10 +1,11 @@
-use std::hash::{Hash, Hasher};
+use std::hash::Hash;
 use std::net::Ipv6Addr;
 use std::{fmt, str};
 
 use columnar::MonotonicallyMappableToU128;
 use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP_STR};
 use common::JsonPathWriter;
+use serde::{Deserialize, Serialize};
 
 use super::date_time_options::DATE_TIME_PRECISION_INDEXED;
 use super::{Field, Schema};
@@ -16,23 +17,54 @@ use crate::DateTime;
 /// Term represents the value that the token can take.
 /// It's a serialized representation over different types.
 ///
-/// It actually wraps a `Vec<u8>`. The first 5 bytes are metadata.
-/// 4 bytes are the field id, and the last byte is the type.
-///
-/// The serialized value `ValueBytes` is considered everything after the 4 first bytes (term id).
-#[derive(Clone)]
-pub struct Term<B = Vec<u8>>(B)
-where B: AsRef<[u8]>;
+/// A term is composed of Field and the serialized value bytes.
+/// The serialized value bytes themselves start with a one byte type tag followed by the payload.
+#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)]
+pub struct Term {
+    field: Field,
+    serialized_value_bytes: Vec<u8>,
+}
 
-/// The number of bytes used as metadata by `Term`.
-const TERM_METADATA_LENGTH: usize = 5;
+/// The number of bytes used as metadata when serializing a term.
+const TERM_TYPE_TAG_LEN: usize = 1;
 
 impl Term {
+    /// Takes a serialized term and wraps it as a Term.
+    /// First 4 bytes are the field id
+    #[deprecated(
+        note = "we want to avoid working on the serialized representation directly, replace with \
+                typed API calls (add more if needed) or use serde to serialize/deserialize"
+    )]
+    pub fn wrap(serialized: &[u8]) -> Term {
+        let field_id_bytes: [u8; 4] = serialized[0..4].try_into().unwrap();
+        let field_id = u32::from_be_bytes(field_id_bytes);
+        Term {
+            field: Field::from_field_id(field_id),
+            serialized_value_bytes: serialized[4..].to_vec(),
+        }
+    }
+
+    /// Returns the serialized representation of the term.
+    /// First 4 bytes are the field id
+    #[deprecated(
+        note = "we want to avoid working on the serialized representation directly, replace with \
+                typed API calls (add more if needed) or use serde to serialize/deserialize"
+    )]
+    pub fn serialized_term(&self) -> Vec<u8> {
+        let mut serialized = Vec::with_capacity(4 + self.serialized_value_bytes.len());
+        serialized.extend(self.field.field_id().to_be_bytes().as_ref());
+        serialized.extend_from_slice(&self.serialized_value_bytes);
+        serialized
+    }
+
     /// Create a new Term with a buffer with a given capacity.
     pub fn with_capacity(capacity: usize) -> Term {
-        let mut data = Vec::with_capacity(TERM_METADATA_LENGTH + capacity);
-        data.resize(TERM_METADATA_LENGTH, 0u8);
-        Term(data)
+        let mut data = Vec::with_capacity(TERM_TYPE_TAG_LEN + capacity);
+        data.resize(TERM_TYPE_TAG_LEN, 0u8);
+        Term {
+            field: Field::from_field_id(0u32),
+            serialized_value_bytes: data,
+        }
     }
 
     /// Creates a term from a json path.
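Term now stores the field and the value bytes separately; the deprecated wrap/serialized_term pair round-trips between that and the old flat layout. A standalone sketch of the round-trip, with the field held as a plain u32 instead of tantivy's Field wrapper:

// Standalone sketch of the new Term representation and the deprecated
// round-trip to the legacy flat encoding: [field id BE][type tag][payload].
#[derive(Clone, Debug, PartialEq)]
struct Term {
    field_id: u32,
    serialized_value_bytes: Vec<u8>, // type tag + payload
}

impl Term {
    fn wrap(serialized: &[u8]) -> Term {
        let field_id = u32::from_be_bytes(serialized[0..4].try_into().unwrap());
        Term { field_id, serialized_value_bytes: serialized[4..].to_vec() }
    }

    fn serialized_term(&self) -> Vec<u8> {
        let mut out = Vec::with_capacity(4 + self.serialized_value_bytes.len());
        out.extend_from_slice(&self.field_id.to_be_bytes());
        out.extend_from_slice(&self.serialized_value_bytes);
        out
    }
}

fn main() {
    let flat = b"\x00\x00\x00\x02sred"; // field 2, Str tag 's', payload "red"
    let term = Term::wrap(flat);
    assert_eq!(term.field_id, 2);
    assert_eq!(term.serialized_term(), flat);
}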
@@ -89,7 +121,7 @@ impl Term {
     fn with_bytes_and_field_and_payload(typ: Type, field: Field, bytes: &[u8]) -> Term {
         let mut term = Self::with_capacity(bytes.len());
         term.set_field_and_type(field, typ);
-        term.0.extend_from_slice(bytes);
+        term.serialized_value_bytes.extend_from_slice(bytes);
         term
     }
@@ -105,13 +137,13 @@ impl Term {
     /// Sets field and the type.
     pub(crate) fn set_field_and_type(&mut self, field: Field, typ: Type) {
         assert!(self.is_empty());
-        self.0[0..4].clone_from_slice(field.field_id().to_be_bytes().as_ref());
-        self.0[4] = typ.to_code();
+        self.field = field;
+        self.serialized_value_bytes[0] = typ.to_code();
     }
 
     /// Is empty if there are no value bytes.
     pub fn is_empty(&self) -> bool {
-        self.0.len() == TERM_METADATA_LENGTH
+        self.serialized_value_bytes.len() == TERM_TYPE_TAG_LEN
     }
 
     /// Builds a term given a field, and a `Ipv6Addr`-value
@@ -177,7 +209,7 @@ impl Term {
     /// Removes the value_bytes and set the type code.
     pub fn clear_with_type(&mut self, typ: Type) {
         self.truncate_value_bytes(0);
-        self.0[4] = typ.to_code();
+        self.serialized_value_bytes[0] = typ.to_code();
     }
 
     /// Append a type marker + fast value to a term.
@@ -185,9 +217,10 @@ impl Term {
     ///
     /// It will not clear existing bytes.
     pub fn append_type_and_fast_value<T: FastValue>(&mut self, val: T) {
-        self.0.push(T::to_type().to_code());
+        self.serialized_value_bytes.push(T::to_type().to_code());
         let value = val.to_u64();
-        self.0.extend(value.to_be_bytes().as_ref());
+        self.serialized_value_bytes
+            .extend(value.to_be_bytes().as_ref());
     }
 
     /// Append a string type marker + string to a term.
@@ -195,24 +228,25 @@ impl Term {
     ///
     /// It will not clear existing bytes.
     pub fn append_type_and_str(&mut self, val: &str) {
-        self.0.push(Type::Str.to_code());
-        self.0.extend(val.as_bytes().as_ref());
+        self.serialized_value_bytes.push(Type::Str.to_code());
+        self.serialized_value_bytes.extend(val.as_bytes().as_ref());
     }
 
     /// Sets the value of a `Bytes` field.
     pub fn set_bytes(&mut self, bytes: &[u8]) {
         self.truncate_value_bytes(0);
-        self.0.extend(bytes);
+        self.serialized_value_bytes.extend(bytes);
     }
 
     /// Truncates the value bytes of the term. Value and field type stays the same.
     pub fn truncate_value_bytes(&mut self, len: usize) {
-        self.0.truncate(len + TERM_METADATA_LENGTH);
+        self.serialized_value_bytes
+            .truncate(len + TERM_TYPE_TAG_LEN);
     }
 
     /// The length of the bytes.
     pub fn len_bytes(&self) -> usize {
-        self.0.len() - TERM_METADATA_LENGTH
+        self.serialized_value_bytes.len() - TERM_TYPE_TAG_LEN
     }
 
     /// Appends value bytes to the Term.
@@ -220,18 +254,9 @@ impl Term {
     /// This function returns the segment that has just been added.
     #[inline]
     pub fn append_bytes(&mut self, bytes: &[u8]) -> &mut [u8] {
-        let len_before = self.0.len();
-        self.0.extend_from_slice(bytes);
-        &mut self.0[len_before..]
-    }
-}
-
-impl<B> Term<B>
-where B: AsRef<[u8]>
-{
-    /// Wraps a object holding bytes
-    pub fn wrap(data: B) -> Term<B> {
-        Term(data)
+        let len_before = self.serialized_value_bytes.len();
+        self.serialized_value_bytes.extend_from_slice(bytes);
+        &mut self.serialized_value_bytes[len_before..]
     }
 
     /// Return the type of the term.
@@ -241,8 +266,7 @@ where B: AsRef<[u8]>
 
     /// Returns the field.
     pub fn field(&self) -> Field {
-        let field_id_bytes: [u8; 4] = (&self.0.as_ref()[..4]).try_into().unwrap();
-        Field::from_field_id(u32::from_be_bytes(field_id_bytes))
+        self.field
     }
 
     /// Returns the serialized representation of the value.
||||
/// Returns the serialized representation of the value.
|
||||
@@ -252,23 +276,13 @@ where B: AsRef<[u8]>
|
||||
/// If the term is a u64, its value is encoded according
|
||||
/// to `byteorder::BigEndian`.
|
||||
pub fn serialized_value_bytes(&self) -> &[u8] {
|
||||
&self.0.as_ref()[TERM_METADATA_LENGTH..]
|
||||
&self.serialized_value_bytes[TERM_TYPE_TAG_LEN..]
|
||||
}
|
||||
|
||||
/// Returns the value of the term.
|
||||
/// address or JSON path + value. (this does not include the field.)
|
||||
pub fn value(&self) -> ValueBytes<&[u8]> {
|
||||
ValueBytes::wrap(&self.0.as_ref()[4..])
|
||||
}
|
||||
|
||||
/// Returns the serialized representation of Term.
|
||||
/// This includes field_id, value type and value.
|
||||
///
|
||||
/// Do NOT rely on this byte representation in the index.
|
||||
/// This value is likely to change in the future.
|
||||
#[inline]
|
||||
pub fn serialized_term(&self) -> &[u8] {
|
||||
self.0.as_ref()
|
||||
ValueBytes::wrap(self.serialized_value_bytes.as_ref())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -452,10 +466,7 @@ where B: AsRef<[u8]>
         }
     }
 
-    /// Returns the serialized representation of Term.
-    ///
-    /// Do NOT rely on this byte representation in the index.
-    /// This value is likely to change in the future.
+    /// Returns the serialized representation of the value bytes including the type tag.
     pub fn as_serialized(&self) -> &[u8] {
         self.0.as_ref()
     }
@@ -508,40 +519,6 @@ where B: AsRef<[u8]>
     }
 }
 
-impl<B> Ord for Term<B>
-where B: AsRef<[u8]>
-{
-    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        self.serialized_term().cmp(other.serialized_term())
-    }
-}
-
-impl<B> PartialOrd for Term<B>
-where B: AsRef<[u8]>
-{
-    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl<B> PartialEq for Term<B>
-where B: AsRef<[u8]>
-{
-    fn eq(&self, other: &Self) -> bool {
-        self.serialized_term() == other.serialized_term()
-    }
-}
-
-impl<B> Eq for Term<B> where B: AsRef<[u8]> {}
-
-impl<B> Hash for Term<B>
-where B: AsRef<[u8]>
-{
-    fn hash<H: Hasher>(&self, state: &mut H) {
-        self.0.as_ref().hash(state)
-    }
-}
-
 fn write_opt<T: std::fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<T>) -> fmt::Result {
     if let Some(val) = val_opt {
         write!(f, "{val:?}")?;
@@ -549,13 +526,11 @@ fn write_opt<T: std::fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<T>) -> fmt::Result {
     Ok(())
 }
 
-impl<B> fmt::Debug for Term<B>
-where B: AsRef<[u8]>
-{
+impl fmt::Debug for Term {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let field_id = self.field().field_id();
+        let field_id = self.field.field_id();
         write!(f, "Term(field={field_id}, ")?;
-        let value_bytes = ValueBytes::wrap(&self.0.as_ref()[4..]);
+        let value_bytes = ValueBytes::wrap(&self.serialized_value_bytes);
         value_bytes.debug_value_bytes(f)?;
         write!(f, ")",)?;
         Ok(())
@@ -578,17 +553,6 @@ mod tests {
         assert_eq!(term.value().as_str(), Some("test"))
     }
 
-    /// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
-    /// <field> + <type byte> + <value len>
-    ///
-    /// - <field> is a big endian encoded u32 field id
-    /// - <type_byte>'s most significant bit expresses whether the term is a json term or not The
-    ///   remaining 7 bits are used to encode the type of the value. If this is a JSON term, the
-    ///   type is the type of the leaf of the json.
-    /// - <value> is, if this is not the json term, a binary representation specific to the type.
-    ///   If it is a JSON Term, then it is prepended with the path that leads to this leaf value.
-    const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
-
     #[test]
     pub fn test_term_u64() {
         let mut schema_builder = Schema::builder();
@@ -596,7 +560,7 @@ mod tests {
         let term = Term::from_field_u64(count_field, 983u64);
         assert_eq!(term.field(), count_field);
         assert_eq!(term.typ(), Type::U64);
-        assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN);
+        assert_eq!(term.serialized_value_bytes().len(), 8);
         assert_eq!(term.value().as_u64(), Some(983u64))
     }
@@ -607,7 +571,7 @@ mod tests {
         let term = Term::from_field_bool(bool_field, true);
         assert_eq!(term.field(), bool_field);
         assert_eq!(term.typ(), Type::Bool);
-        assert_eq!(term.serialized_term().len(), FAST_VALUE_TERM_LEN);
+        assert_eq!(term.serialized_value_bytes().len(), 8);
         assert_eq!(term.value().as_bool(), Some(true))
     }
 }
@@ -7,13 +7,14 @@
 //! storage-level details into consideration. For example, if your file system block size is 4096
 //! bytes, we can under-count actual resultant space usage by up to 4095 bytes per file.
 
-use std::collections::HashMap;
+use std::collections::btree_map::Entry;
+use std::collections::BTreeMap;
 
+use columnar::ColumnSpaceUsage;
 use common::ByteCount;
 use serde::{Deserialize, Serialize};
 
 use crate::index::SegmentComponent;
-use crate::schema::Field;
 
 /// Enum containing any of the possible space usage results for segment components.
 pub enum ComponentSpaceUsage {
@@ -212,17 +213,26 @@ impl StoreSpaceUsage {
 /// Multiple indexes are used to handle variable length things, where
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct PerFieldSpaceUsage {
-    fields: HashMap<Field, FieldUsage>,
+    fields: BTreeMap<String, FieldUsage>,
     total: ByteCount,
 }
 
 impl PerFieldSpaceUsage {
     pub(crate) fn new(fields: Vec<FieldUsage>) -> PerFieldSpaceUsage {
-        let total = fields.iter().map(FieldUsage::total).sum();
-        let field_usage_map: HashMap<Field, FieldUsage> = fields
-            .into_iter()
-            .map(|field_usage| (field_usage.field(), field_usage))
-            .collect();
+        let mut total = ByteCount::default();
+        let mut field_usage_map: BTreeMap<String, FieldUsage> = BTreeMap::new();
+        for field_usage in fields {
+            total += field_usage.total();
+            let field_name = field_usage.field_name().to_string();
+            match field_usage_map.entry(field_name) {
+                Entry::Vacant(entry) => {
+                    entry.insert(field_usage);
+                }
+                Entry::Occupied(mut entry) => {
+                    entry.get_mut().merge(field_usage);
+                }
+            }
+        }
         PerFieldSpaceUsage {
             fields: field_usage_map,
             total,
|
||||
}
|
||||
|
||||
/// Per field space usage
|
||||
pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> {
|
||||
self.fields.iter()
|
||||
pub fn fields(&self) -> impl Iterator<Item = &FieldUsage> {
|
||||
self.fields.values()
|
||||
}
|
||||
|
||||
/// Bytes used by the represented file
|
||||
@@ -246,20 +256,23 @@ impl PerFieldSpaceUsage {
 /// See documentation for [`PerFieldSpaceUsage`] for slightly more information.
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct FieldUsage {
-    field: Field,
+    field_name: String,
     num_bytes: ByteCount,
     /// A field can be composed of more than one piece.
     /// These pieces are indexed by arbitrary numbers starting at zero.
     /// `self.num_bytes` includes all of `self.sub_num_bytes`.
     sub_num_bytes: Vec<Option<ByteCount>>,
+    /// Space usage of the column for fast fields, if relevant.
+    column_space_usage: Option<ColumnSpaceUsage>,
 }
 
 impl FieldUsage {
-    pub(crate) fn empty(field: Field) -> FieldUsage {
+    pub(crate) fn empty(field_name: impl Into<String>) -> FieldUsage {
         FieldUsage {
-            field,
+            field_name: field_name.into(),
             num_bytes: Default::default(),
             sub_num_bytes: Vec::new(),
+            column_space_usage: None,
         }
     }
@@ -272,9 +285,14 @@ impl FieldUsage {
         self.num_bytes += size
     }
 
+    pub(crate) fn set_column_usage(&mut self, column_space_usage: ColumnSpaceUsage) {
+        self.num_bytes += column_space_usage.total_num_bytes();
+        self.column_space_usage = Some(column_space_usage);
+    }
+
     /// Field
-    pub fn field(&self) -> Field {
-        self.field
+    pub fn field_name(&self) -> &str {
+        &self.field_name
     }
 
     /// Space usage for each index
@@ -282,16 +300,64 @@ impl FieldUsage {
         &self.sub_num_bytes[..]
     }
 
+    /// Returns the number of bytes used by the column payload, if the field is columnar.
+    pub fn column_num_bytes(&self) -> Option<ByteCount> {
+        self.column_space_usage
+            .as_ref()
+            .map(ColumnSpaceUsage::column_num_bytes)
+    }
+
+    /// Returns the number of bytes used by the dictionary for dictionary-encoded columns.
+    pub fn dictionary_num_bytes(&self) -> Option<ByteCount> {
+        self.column_space_usage
+            .as_ref()
+            .and_then(ColumnSpaceUsage::dictionary_num_bytes)
+    }
+
+    /// Returns the space usage of the column, if any.
+    pub fn column_space_usage(&self) -> Option<&ColumnSpaceUsage> {
+        self.column_space_usage.as_ref()
+    }
+
     /// Total bytes used for this field in this context
     pub fn total(&self) -> ByteCount {
         self.num_bytes
     }
+
+    fn merge(&mut self, other: FieldUsage) {
+        assert_eq!(self.field_name, other.field_name);
+        self.num_bytes += other.num_bytes;
+        if other.sub_num_bytes.len() > self.sub_num_bytes.len() {
+            self.sub_num_bytes.resize(other.sub_num_bytes.len(), None);
+        }
+        for (idx, num_bytes_opt) in other.sub_num_bytes.into_iter().enumerate() {
+            if let Some(num_bytes) = num_bytes_opt {
+                match self.sub_num_bytes[idx] {
+                    Some(existing) => self.sub_num_bytes[idx] = Some(existing + num_bytes),
+                    None => self.sub_num_bytes[idx] = Some(num_bytes),
+                }
+            }
+        }
+        self.column_space_usage =
+            merge_column_space_usage(self.column_space_usage.take(), other.column_space_usage);
+    }
+}
+
+fn merge_column_space_usage(
+    left: Option<ColumnSpaceUsage>,
+    right: Option<ColumnSpaceUsage>,
+) -> Option<ColumnSpaceUsage> {
+    match (left, right) {
+        (Some(lhs), Some(rhs)) => Some(lhs.merge(&rhs)),
+        (Some(space), None) | (None, Some(space)) => Some(space),
+        (None, None) => None,
+    }
 }
 
 #[cfg(test)]
 mod test {
     use crate::index::Index;
-    use crate::schema::{Field, Schema, FAST, INDEXED, STORED, TEXT};
+    use crate::schema::{Schema, FAST, INDEXED, STORED, TEXT};
     use crate::space_usage::PerFieldSpaceUsage;
     use crate::{IndexWriter, Term};
@@ -307,17 +373,17 @@ mod test {
 
     fn expect_single_field(
         field_space: &PerFieldSpaceUsage,
-        field: &Field,
+        field: &str,
         min_size: u64,
         max_size: u64,
     ) {
         assert!(field_space.total() >= min_size);
         assert!(field_space.total() <= max_size);
         assert_eq!(
-            vec![(field, field_space.total())],
+            vec![(field.to_string(), field_space.total())],
             field_space
                 .fields()
-                .map(|(x, y)| (x, y.total()))
+                .map(|usage| (usage.field_name().to_string(), usage.total()))
                 .collect::<Vec<_>>()
         );
     }
@@ -327,6 +393,7 @@ mod test {
         let mut schema_builder = Schema::builder();
         let name = schema_builder.add_u64_field("name", FAST | INDEXED);
         let schema = schema_builder.build();
+        let field_name = schema.get_field_name(name).to_string();
         let index = Index::create_in_ram(schema);
 
         {
@@ -349,11 +416,11 @@ mod test {
 
         assert_eq!(4, segment.num_docs());
 
-        expect_single_field(segment.termdict(), &name, 1, 512);
-        expect_single_field(segment.postings(), &name, 1, 512);
+        expect_single_field(segment.termdict(), &field_name, 1, 512);
+        expect_single_field(segment.postings(), &field_name, 1, 512);
         assert_eq!(segment.positions().total(), 0);
-        expect_single_field(segment.fast_fields(), &name, 1, 512);
-        expect_single_field(segment.fieldnorms(), &name, 1, 512);
+        expect_single_field(segment.fast_fields(), &field_name, 1, 512);
+        expect_single_field(segment.fieldnorms(), &field_name, 1, 512);
         // TODO: understand why the following fails
         // assert_eq!(0, segment.store().total());
         assert_eq!(segment.deletes(), 0);
@@ -365,6 +432,7 @@ mod test {
         let mut schema_builder = Schema::builder();
         let name = schema_builder.add_text_field("name", TEXT);
         let schema = schema_builder.build();
+        let field_name = schema.get_field_name(name).to_string();
         let index = Index::create_in_ram(schema);
 
         {
@@ -389,11 +457,11 @@ mod test {
 
         assert_eq!(4, segment.num_docs());
 
-        expect_single_field(segment.termdict(), &name, 1, 512);
-        expect_single_field(segment.postings(), &name, 1, 512);
-        expect_single_field(segment.positions(), &name, 1, 512);
+        expect_single_field(segment.termdict(), &field_name, 1, 512);
+        expect_single_field(segment.postings(), &field_name, 1, 512);
+        expect_single_field(segment.positions(), &field_name, 1, 512);
         assert_eq!(segment.fast_fields().total(), 0);
-        expect_single_field(segment.fieldnorms(), &name, 1, 512);
+        expect_single_field(segment.fieldnorms(), &field_name, 1, 512);
         // TODO: understand why the following fails
         // assert_eq!(0, segment.store().total());
         assert_eq!(segment.deletes(), 0);
@@ -429,10 +497,15 @@ mod test {
         assert_eq!(4, segment.num_docs());
 
         assert_eq!(segment.termdict().total(), 0);
+        assert!(segment.termdict().fields().next().is_none());
         assert_eq!(segment.postings().total(), 0);
+        assert!(segment.postings().fields().next().is_none());
         assert_eq!(segment.positions().total(), 0);
+        assert!(segment.positions().fields().next().is_none());
         assert_eq!(segment.fast_fields().total(), 0);
+        assert!(segment.fast_fields().fields().next().is_none());
         assert_eq!(segment.fieldnorms().total(), 0);
+        assert!(segment.fieldnorms().fields().next().is_none());
         assert!(segment.store().total() > 0);
         assert!(segment.store().total() < 512);
         assert_eq!(segment.deletes(), 0);
@@ -444,6 +517,7 @@ mod test {
         let mut schema_builder = Schema::builder();
         let name = schema_builder.add_u64_field("name", INDEXED);
         let schema = schema_builder.build();
+        let field_name = schema.get_field_name(name).to_string();
         let index = Index::create_in_ram(schema);
 
         {
@@ -474,11 +548,11 @@ mod test {
 
         assert_eq!(2, segment_space_usage.num_docs());
 
-        expect_single_field(segment_space_usage.termdict(), &name, 1, 512);
-        expect_single_field(segment_space_usage.postings(), &name, 1, 512);
+        expect_single_field(segment_space_usage.termdict(), &field_name, 1, 512);
+        expect_single_field(segment_space_usage.postings(), &field_name, 1, 512);
         assert_eq!(segment_space_usage.positions().total(), 0u64);
         assert_eq!(segment_space_usage.fast_fields().total(), 0u64);
-        expect_single_field(segment_space_usage.fieldnorms(), &name, 1, 512);
+        expect_single_field(segment_space_usage.fieldnorms(), &field_name, 1, 512);
         assert!(segment_space_usage.deletes() > 0);
         Ok(())
     }
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
 use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
 /// Available stemmer languages.
-#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
+#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone, Hash)]
 #[allow(missing_docs)]
 pub enum Language {
     Arabic,
@@ -8,7 +8,7 @@ use std::sync::Arc;
 
 use common::bounds::{TransformBound, transform_bound_inner_res};
 use common::file_slice::FileSlice;
-use common::{BinarySerializable, OwnedBytes};
+use common::{BinarySerializable, ByteCount, OwnedBytes};
 use futures_util::{StreamExt, TryStreamExt, stream};
 use itertools::Itertools;
 use tantivy_fst::Automaton;
@@ -43,6 +43,7 @@ use crate::{
 pub struct Dictionary<TSSTable: SSTable = VoidSSTable> {
     pub sstable_slice: FileSlice,
     pub sstable_index: SSTableIndex,
+    num_bytes: ByteCount,
     num_terms: u64,
     phantom_data: PhantomData<TSSTable>,
 }
@@ -278,6 +279,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
 
     /// Opens a `TermDictionary`.
     pub fn open(term_dictionary_file: FileSlice) -> io::Result<Self> {
+        let num_bytes = term_dictionary_file.num_bytes();
         let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(20);
         let mut footer_len_bytes: OwnedBytes = footer_len_slice.read_bytes()?;
         let index_offset = u64::deserialize(&mut footer_len_bytes)?;
@@ -317,6 +319,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
         Ok(Dictionary {
             sstable_slice,
             sstable_index,
+            num_bytes,
            num_terms,
             phantom_data: PhantomData,
         })
@@ -343,6 +346,11 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
         self.num_terms as usize
     }
 
+    /// Returns the total number of bytes used by the dictionary on disk.
+    pub fn num_bytes(&self) -> ByteCount {
+        self.num_bytes
+    }
+
     /// Decode a DeltaReader up to key, returning the number of terms traversed
     ///
     /// If the key was not found, returns Ok(None).
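The dictionary captures its full on-disk size once at open time, before the 20-byte footer is split off, so num_bytes() later returns a stored value instead of recomputing anything. A minimal sketch of that pattern, with a Vec<u8> standing in for the file slice:

// Sketch of capturing total size at open time, as Dictionary::open does
// above: record the file length before splitting off the footer.
struct Dict {
    payload: Vec<u8>, // stands in for the sstable slice
    num_bytes: u64,   // total file size, footer included
}

fn open(file: Vec<u8>) -> Dict {
    let num_bytes = file.len() as u64; // captured before any splitting
    let footer_len = 20.min(file.len());
    let payload = file[..file.len() - footer_len].to_vec();
    Dict { payload, num_bytes }
}

fn main() {
    let dict = open(vec![0u8; 100]);
    assert_eq!(dict.num_bytes, 100);
    assert_eq!(dict.payload.len(), 80);
}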