mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-24 04:00:40 +00:00
559 lines
19 KiB
Rust
559 lines
19 KiB
Rust
//! Representations for the space usage of various parts of a Tantivy index.
|
|
//!
|
|
//! This can be used programmatically, and will also be exposed in a human readable fashion in
|
|
//! tantivy-cli.
|
|
//!
|
|
//! One important caveat for all of this functionality is that none of it currently takes
|
|
//! storage-level details into consideration. For example, if your file system block size is 4096
|
|
//! bytes, we can under-count actual resultant space usage by up to 4095 bytes per file.
|
|
|
|
use std::collections::btree_map::Entry;
|
|
use std::collections::BTreeMap;
|
|
|
|
use columnar::ColumnSpaceUsage;
|
|
use common::ByteCount;
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
use crate::index::SegmentComponent;
|
|
|
|
/// Enum containing any of the possible space usage results for segment components.
|
|
pub enum ComponentSpaceUsage {
|
|
/// Data is stored per field in a uniform way
|
|
PerField(PerFieldSpaceUsage),
|
|
/// Data is stored in separate pieces in the store
|
|
Store(StoreSpaceUsage),
|
|
/// Some sort of raw byte count
|
|
Basic(ByteCount),
|
|
}
|
|
|
|
/// Represents combined space usage of an entire searcher and its component segments.
|
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
|
pub struct SearcherSpaceUsage {
|
|
segments: Vec<SegmentSpaceUsage>,
|
|
total: ByteCount,
|
|
}
|
|
|
|
impl SearcherSpaceUsage {
|
|
pub fn new() -> SearcherSpaceUsage {
|
|
SearcherSpaceUsage {
|
|
segments: Vec::new(),
|
|
total: Default::default(),
|
|
}
|
|
}
|
|
|
|
/// Add a segment, to `self`.
|
|
/// Performs no deduplication or other intelligence.
|
|
pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
|
|
self.total += segment.total();
|
|
self.segments.push(segment);
|
|
}
|
|
|
|
/// Per segment space usage
|
|
pub fn segments(&self) -> &[SegmentSpaceUsage] {
|
|
&self.segments[..]
|
|
}
|
|
|
|
/// Returns total byte usage of this searcher, including all large subcomponents.
|
|
/// Does not account for smaller things like `meta.json`.
|
|
pub fn total(&self) -> ByteCount {
|
|
self.total
|
|
}
|
|
}
|
|
|
|
/// Represents combined space usage for all of the large components comprising a segment.
|
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
|
pub struct SegmentSpaceUsage {
|
|
num_docs: u32,
|
|
|
|
termdict: PerFieldSpaceUsage,
|
|
postings: PerFieldSpaceUsage,
|
|
positions: PerFieldSpaceUsage,
|
|
fast_fields: PerFieldSpaceUsage,
|
|
fieldnorms: PerFieldSpaceUsage,
|
|
|
|
store: StoreSpaceUsage,
|
|
|
|
deletes: ByteCount,
|
|
|
|
total: ByteCount,
|
|
}
|
|
|
|
impl SegmentSpaceUsage {
|
|
#[expect(clippy::too_many_arguments)]
|
|
pub fn new(
|
|
num_docs: u32,
|
|
termdict: PerFieldSpaceUsage,
|
|
postings: PerFieldSpaceUsage,
|
|
positions: PerFieldSpaceUsage,
|
|
fast_fields: PerFieldSpaceUsage,
|
|
fieldnorms: PerFieldSpaceUsage,
|
|
store: StoreSpaceUsage,
|
|
deletes: ByteCount,
|
|
) -> SegmentSpaceUsage {
|
|
let total = termdict.total()
|
|
+ postings.total()
|
|
+ positions.total()
|
|
+ fast_fields.total()
|
|
+ fieldnorms.total()
|
|
+ store.total()
|
|
+ deletes;
|
|
SegmentSpaceUsage {
|
|
num_docs,
|
|
termdict,
|
|
postings,
|
|
positions,
|
|
fast_fields,
|
|
fieldnorms,
|
|
store,
|
|
deletes,
|
|
total,
|
|
}
|
|
}
|
|
|
|
/// Space usage for the given component
|
|
///
|
|
/// Clones the underlying data.
|
|
/// Use the components directly if this is somehow in performance critical code.
|
|
pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
|
|
use self::ComponentSpaceUsage::*;
|
|
use crate::index::SegmentComponent::*;
|
|
match component {
|
|
Postings => PerField(self.postings().clone()),
|
|
Positions => PerField(self.positions().clone()),
|
|
FastFields => PerField(self.fast_fields().clone()),
|
|
FieldNorms => PerField(self.fieldnorms().clone()),
|
|
Terms => PerField(self.termdict().clone()),
|
|
SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()),
|
|
Delete => Basic(self.deletes()),
|
|
}
|
|
}
|
|
|
|
/// Num docs in segment
|
|
pub fn num_docs(&self) -> u32 {
|
|
self.num_docs
|
|
}
|
|
|
|
/// Space usage for term dictionary
|
|
pub fn termdict(&self) -> &PerFieldSpaceUsage {
|
|
&self.termdict
|
|
}
|
|
|
|
/// Space usage for postings list
|
|
pub fn postings(&self) -> &PerFieldSpaceUsage {
|
|
&self.postings
|
|
}
|
|
|
|
/// Space usage for positions
|
|
pub fn positions(&self) -> &PerFieldSpaceUsage {
|
|
&self.positions
|
|
}
|
|
|
|
/// Space usage for fast fields
|
|
pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
|
|
&self.fast_fields
|
|
}
|
|
|
|
/// Space usage for field norms
|
|
pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
|
|
&self.fieldnorms
|
|
}
|
|
|
|
/// Space usage for stored documents
|
|
pub fn store(&self) -> &StoreSpaceUsage {
|
|
&self.store
|
|
}
|
|
|
|
/// Space usage for document deletions
|
|
pub fn deletes(&self) -> ByteCount {
|
|
self.deletes
|
|
}
|
|
|
|
/// Total space usage in bytes for this segment.
|
|
pub fn total(&self) -> ByteCount {
|
|
self.total
|
|
}
|
|
}
|
|
|
|
/// Represents space usage for the Store for this segment.
|
|
///
|
|
/// This is composed of two parts.
|
|
/// `data` represents the compressed data itself.
|
|
/// `offsets` represents a lookup to find the start of a block
|
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
|
pub struct StoreSpaceUsage {
|
|
data: ByteCount,
|
|
offsets: ByteCount,
|
|
}
|
|
|
|
impl StoreSpaceUsage {
|
|
pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
|
|
StoreSpaceUsage { data, offsets }
|
|
}
|
|
|
|
/// Space usage for the data part of the store
|
|
pub fn data_usage(&self) -> ByteCount {
|
|
self.data
|
|
}
|
|
|
|
/// Space usage for the offsets part of the store (doc ID -> offset)
|
|
pub fn offsets_usage(&self) -> ByteCount {
|
|
self.offsets
|
|
}
|
|
|
|
/// Total space usage in bytes for this Store
|
|
pub fn total(&self) -> ByteCount {
|
|
self.data + self.offsets
|
|
}
|
|
}
|
|
|
|
/// Represents space usage for all of the (field, index) pairs that appear in a `CompositeFile`.
|
|
///
|
|
/// A field can appear with a single index (typically 0) or with multiple indexes.
|
|
/// Multiple indexes are used to handle variable length things, where
|
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
|
pub struct PerFieldSpaceUsage {
|
|
fields: BTreeMap<String, FieldUsage>,
|
|
total: ByteCount,
|
|
}
|
|
|
|
impl PerFieldSpaceUsage {
|
|
pub(crate) fn new(fields: Vec<FieldUsage>) -> PerFieldSpaceUsage {
|
|
let mut total = ByteCount::default();
|
|
let mut field_usage_map: BTreeMap<String, FieldUsage> = BTreeMap::new();
|
|
for field_usage in fields {
|
|
total += field_usage.total();
|
|
let field_name = field_usage.field_name().to_string();
|
|
match field_usage_map.entry(field_name) {
|
|
Entry::Vacant(entry) => {
|
|
entry.insert(field_usage);
|
|
}
|
|
Entry::Occupied(mut entry) => {
|
|
entry.get_mut().merge(field_usage);
|
|
}
|
|
}
|
|
}
|
|
PerFieldSpaceUsage {
|
|
fields: field_usage_map,
|
|
total,
|
|
}
|
|
}
|
|
|
|
/// Per field space usage
|
|
pub fn fields(&self) -> impl Iterator<Item = &FieldUsage> {
|
|
self.fields.values()
|
|
}
|
|
|
|
/// Bytes used by the represented file
|
|
pub fn total(&self) -> ByteCount {
|
|
self.total
|
|
}
|
|
}
|
|
|
|
/// Represents space usage of a given field, breaking it down into the (field, index) pairs that
|
|
/// comprise it.
|
|
///
|
|
/// See documentation for [`PerFieldSpaceUsage`] for slightly more information.
|
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
|
pub struct FieldUsage {
|
|
field_name: String,
|
|
num_bytes: ByteCount,
|
|
/// A field can be composed of more than one piece.
|
|
/// These pieces are indexed by arbitrary numbers starting at zero.
|
|
/// `self.num_bytes` includes all of `self.sub_num_bytes`.
|
|
sub_num_bytes: Vec<Option<ByteCount>>,
|
|
/// Space usage of the column for fast fields, if relevant.
|
|
column_space_usage: Option<ColumnSpaceUsage>,
|
|
}
|
|
|
|
impl FieldUsage {
|
|
pub(crate) fn empty(field_name: impl Into<String>) -> FieldUsage {
|
|
FieldUsage {
|
|
field_name: field_name.into(),
|
|
num_bytes: Default::default(),
|
|
sub_num_bytes: Vec::new(),
|
|
column_space_usage: None,
|
|
}
|
|
}
|
|
|
|
pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
|
|
if self.sub_num_bytes.len() < idx + 1 {
|
|
self.sub_num_bytes.resize(idx + 1, None);
|
|
}
|
|
assert!(self.sub_num_bytes[idx].is_none());
|
|
self.sub_num_bytes[idx] = Some(size);
|
|
self.num_bytes += size
|
|
}
|
|
|
|
pub(crate) fn set_column_usage(&mut self, column_space_usage: ColumnSpaceUsage) {
|
|
self.num_bytes += column_space_usage.total_num_bytes();
|
|
self.column_space_usage = Some(column_space_usage);
|
|
}
|
|
|
|
/// Field
|
|
pub fn field_name(&self) -> &str {
|
|
&self.field_name
|
|
}
|
|
|
|
/// Space usage for each index
|
|
pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
|
|
&self.sub_num_bytes[..]
|
|
}
|
|
|
|
/// Returns the number of bytes used by the column payload, if the field is columnar.
|
|
pub fn column_num_bytes(&self) -> Option<ByteCount> {
|
|
self.column_space_usage
|
|
.as_ref()
|
|
.map(ColumnSpaceUsage::column_num_bytes)
|
|
}
|
|
|
|
/// Returns the number of bytes used by the dictionary for dictionary-encoded columns.
|
|
pub fn dictionary_num_bytes(&self) -> Option<ByteCount> {
|
|
self.column_space_usage
|
|
.as_ref()
|
|
.and_then(ColumnSpaceUsage::dictionary_num_bytes)
|
|
}
|
|
|
|
/// Returns the space usage of the column, if any.
|
|
pub fn column_space_usage(&self) -> Option<&ColumnSpaceUsage> {
|
|
self.column_space_usage.as_ref()
|
|
}
|
|
|
|
/// Total bytes used for this field in this context
|
|
pub fn total(&self) -> ByteCount {
|
|
self.num_bytes
|
|
}
|
|
|
|
fn merge(&mut self, other: FieldUsage) {
|
|
assert_eq!(self.field_name, other.field_name);
|
|
self.num_bytes += other.num_bytes;
|
|
if other.sub_num_bytes.len() > self.sub_num_bytes.len() {
|
|
self.sub_num_bytes.resize(other.sub_num_bytes.len(), None);
|
|
}
|
|
for (idx, num_bytes_opt) in other.sub_num_bytes.into_iter().enumerate() {
|
|
if let Some(num_bytes) = num_bytes_opt {
|
|
match self.sub_num_bytes[idx] {
|
|
Some(existing) => self.sub_num_bytes[idx] = Some(existing + num_bytes),
|
|
None => self.sub_num_bytes[idx] = Some(num_bytes),
|
|
}
|
|
}
|
|
}
|
|
self.column_space_usage =
|
|
merge_column_space_usage(self.column_space_usage.take(), other.column_space_usage);
|
|
}
|
|
}
|
|
|
|
/// Combines two optional column space usages.
///
/// When both sides are present they are merged with `ColumnSpaceUsage::merge`; when only one
/// side is present it is returned untouched; when neither is present the result is `None`.
fn merge_column_space_usage(
    left: Option<ColumnSpaceUsage>,
    right: Option<ColumnSpaceUsage>,
) -> Option<ColumnSpaceUsage> {
    let Some(lhs) = left else {
        // Nothing on the left: the result is whatever the right side holds (possibly nothing).
        return right;
    };
    let Some(rhs) = right else {
        return Some(lhs);
    };
    Some(lhs.merge(&rhs))
}
|
|
|
|
#[cfg(test)]
mod test {
    use crate::index::Index;
    use crate::schema::{Schema, FAST, INDEXED, STORED, TEXT};
    use crate::space_usage::PerFieldSpaceUsage;
    use crate::{IndexWriter, Term};

    /// An index with an empty schema and no documents reports a total of zero bytes.
    #[test]
    fn test_empty() {
        let index = Index::create_in_ram(Schema::builder().build());
        let index_reader = index.reader().unwrap();
        let searcher = index_reader.searcher();
        let searcher_space_usage = searcher.space_usage().unwrap();
        assert_eq!(searcher_space_usage.total(), 0u64);
    }

    /// Checks that `field_space` holds exactly one field named `field`, which accounts for the
    /// whole total, and that this total lies within `[min_size, max_size]`.
    fn expect_single_field(
        field_space: &PerFieldSpaceUsage,
        field: &str,
        min_size: u64,
        max_size: u64,
    ) {
        assert!(field_space.total() >= min_size);
        assert!(field_space.total() <= max_size);
        let expected = vec![(field.to_string(), field_space.total())];
        let actual = field_space
            .fields()
            .map(|usage| (usage.field_name().to_string(), usage.total()))
            .collect::<Vec<_>>();
        assert_eq!(expected, actual);
    }

    /// A fast + indexed u64 field uses the term dictionary, postings, fast fields and
    /// field norms, but no positions.
    #[test]
    fn test_fast_indexed() -> crate::Result<()> {
        let mut builder = Schema::builder();
        let name_field = builder.add_u64_field("name", FAST | INDEXED);
        let schema = builder.build();
        let field_name = schema.get_field_name(name_field).to_string();
        let index = Index::create_in_ram(schema);

        {
            let mut index_writer = index.writer_for_tests()?;
            for value in [1u64, 2u64, 10u64, 20u64] {
                index_writer.add_document(doc!(name_field => value))?;
            }
            index_writer.commit()?;
        }

        let index_reader = index.reader()?;
        let searcher = index_reader.searcher();
        let searcher_space_usage = searcher.space_usage()?;
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        expect_single_field(segment.termdict(), &field_name, 1, 512);
        expect_single_field(segment.postings(), &field_name, 1, 512);
        assert_eq!(segment.positions().total(), 0);
        expect_single_field(segment.fast_fields(), &field_name, 1, 512);
        expect_single_field(segment.fieldnorms(), &field_name, 1, 512);
        // TODO: understand why the following fails
        // assert_eq!(0, segment.store().total());
        assert_eq!(segment.deletes(), 0);
        Ok(())
    }

    /// A text field uses the term dictionary, postings, positions and field norms, but no
    /// fast fields.
    #[test]
    fn test_text() -> crate::Result<()> {
        let mut builder = Schema::builder();
        let name_field = builder.add_text_field("name", TEXT);
        let schema = builder.build();
        let field_name = schema.get_field_name(name_field).to_string();
        let index = Index::create_in_ram(schema);

        {
            let mut index_writer = index.writer_for_tests()?;
            for text in [
                "hi",
                "this is a test",
                "some more documents with some word overlap with the other test",
                "hello hi goodbye",
            ] {
                index_writer.add_document(doc!(name_field => text))?;
            }
            index_writer.commit()?;
        }

        let index_reader = index.reader()?;
        let searcher = index_reader.searcher();
        let searcher_space_usage = searcher.space_usage()?;
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        expect_single_field(segment.termdict(), &field_name, 1, 512);
        expect_single_field(segment.postings(), &field_name, 1, 512);
        expect_single_field(segment.positions(), &field_name, 1, 512);
        assert_eq!(segment.fast_fields().total(), 0);
        expect_single_field(segment.fieldnorms(), &field_name, 1, 512);
        // TODO: understand why the following fails
        // assert_eq!(0, segment.store().total());
        assert_eq!(segment.deletes(), 0);
        Ok(())
    }

    /// A stored-only field only uses the store: every per-field component stays empty.
    #[test]
    fn test_store() -> crate::Result<()> {
        let mut builder = Schema::builder();
        let name_field = builder.add_text_field("name", STORED);
        let schema = builder.build();
        let index = Index::create_in_ram(schema);

        {
            let mut index_writer = index.writer_for_tests()?;
            for text in [
                "hi",
                "this is a test",
                "some more documents with some word overlap with the other test",
                "hello hi goodbye",
            ] {
                index_writer.add_document(doc!(name_field => text))?;
            }
            index_writer.commit()?;
        }
        let index_reader = index.reader()?;
        let searcher = index_reader.searcher();
        let searcher_space_usage = searcher.space_usage()?;
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        assert_eq!(segment.termdict().total(), 0);
        assert!(segment.termdict().fields().next().is_none());
        assert_eq!(segment.postings().total(), 0);
        assert!(segment.postings().fields().next().is_none());
        assert_eq!(segment.positions().total(), 0);
        assert!(segment.positions().fields().next().is_none());
        assert_eq!(segment.fast_fields().total(), 0);
        assert!(segment.fast_fields().fields().next().is_none());
        assert_eq!(segment.fieldnorms().total(), 0);
        assert!(segment.fieldnorms().fields().next().is_none());
        assert!(segment.store().total() > 0);
        assert!(segment.store().total() < 512);
        assert_eq!(segment.deletes(), 0);
        Ok(())
    }

    /// Deleting documents in a second commit makes the deletes component non-empty and
    /// lowers the reported number of documents.
    #[test]
    fn test_deletes() -> crate::Result<()> {
        let mut builder = Schema::builder();
        let name_field = builder.add_u64_field("name", INDEXED);
        let schema = builder.build();
        let field_name = schema.get_field_name(name_field).to_string();
        let index = Index::create_in_ram(schema);

        {
            let mut index_writer: IndexWriter = index.writer_for_tests()?;
            for value in [1u64, 2u64, 3u64, 4u64] {
                index_writer.add_document(doc!(name_field => value))?;
            }
            index_writer.commit()?;
        }

        {
            let mut index_writer2: IndexWriter = index.writer(50_000_000)?;
            for value in [2u64, 3u64] {
                index_writer2.delete_term(Term::from_field_u64(name_field, value));
            }
            // ok, now we should have a deleted doc
            index_writer2.commit()?;
        }

        let index_reader = index.reader()?;
        let searcher = index_reader.searcher();
        let searcher_space_usage = searcher.space_usage()?;
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment_space_usage = &searcher_space_usage.segments()[0];
        assert!(segment_space_usage.total() > 0);

        assert_eq!(2, segment_space_usage.num_docs());

        expect_single_field(segment_space_usage.termdict(), &field_name, 1, 512);
        expect_single_field(segment_space_usage.postings(), &field_name, 1, 512);
        assert_eq!(segment_space_usage.positions().total(), 0u64);
        assert_eq!(segment_space_usage.fast_fields().total(), 0u64);
        expect_single_field(segment_space_usage.fieldnorms(), &field_name, 1, 512);
        assert!(segment_space_usage.deletes() > 0);
        Ok(())
    }
}
|