mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
Handle JSON fields and columnar in space_usage (#2761)
Return field names in space_usage instead of `Field`, and provide more detailed info for columns.
This commit is contained in:
@@ -3,7 +3,8 @@ use std::sync::Arc;
|
|||||||
use std::{fmt, io};
|
use std::{fmt, io};
|
||||||
|
|
||||||
use common::file_slice::FileSlice;
|
use common::file_slice::FileSlice;
|
||||||
use common::{ByteCount, DateTime, HasLen, OwnedBytes};
|
use common::{ByteCount, DateTime, OwnedBytes};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::column::{BytesColumn, Column, StrColumn};
|
use crate::column::{BytesColumn, Column, StrColumn};
|
||||||
use crate::column_values::{StrictlyMonotonicFn, monotonic_map_column};
|
use crate::column_values::{StrictlyMonotonicFn, monotonic_map_column};
|
||||||
@@ -317,10 +318,89 @@ impl DynamicColumnHandle {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn num_bytes(&self) -> ByteCount {
|
pub fn num_bytes(&self) -> ByteCount {
|
||||||
self.file_slice.len().into()
|
self.file_slice.num_bytes()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Legacy helper returning the column space usage.
|
||||||
|
pub fn column_and_dictionary_num_bytes(&self) -> io::Result<ColumnSpaceUsage> {
|
||||||
|
self.space_usage()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the space usage of the column, optionally broken down by dictionary and column
|
||||||
|
/// values.
|
||||||
|
///
|
||||||
|
/// For dictionary encoded columns (strings and bytes), this splits the total footprint into
|
||||||
|
/// the dictionary and the remaining column data (including index and values).
|
||||||
|
/// For all other column types, the dictionary size is `None` and the column size
|
||||||
|
/// equals the total bytes.
|
||||||
|
pub fn space_usage(&self) -> io::Result<ColumnSpaceUsage> {
|
||||||
|
let total_num_bytes = self.num_bytes();
|
||||||
|
let dynamic_column = self.open()?;
|
||||||
|
let dictionary_num_bytes = match &dynamic_column {
|
||||||
|
DynamicColumn::Bytes(bytes_column) => bytes_column.dictionary().num_bytes(),
|
||||||
|
DynamicColumn::Str(str_column) => str_column.dictionary().num_bytes(),
|
||||||
|
_ => {
|
||||||
|
return Ok(ColumnSpaceUsage::new(self.num_bytes(), None));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
assert!(dictionary_num_bytes <= total_num_bytes);
|
||||||
|
let column_num_bytes =
|
||||||
|
ByteCount::from(total_num_bytes.get_bytes() - dictionary_num_bytes.get_bytes());
|
||||||
|
Ok(ColumnSpaceUsage::new(
|
||||||
|
column_num_bytes,
|
||||||
|
Some(dictionary_num_bytes),
|
||||||
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn column_type(&self) -> ColumnType {
|
pub fn column_type(&self) -> ColumnType {
|
||||||
self.column_type
|
self.column_type
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Represents space usage of a column.
|
||||||
|
///
|
||||||
|
/// `column_num_bytes` tracks the column payload (index, values and footer).
|
||||||
|
/// For dictionary encoded columns, `dictionary_num_bytes` captures the dictionary footprint.
|
||||||
|
/// [`ColumnSpaceUsage::total_num_bytes`] returns the sum of both parts.
|
||||||
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||||
|
pub struct ColumnSpaceUsage {
|
||||||
|
column_num_bytes: ByteCount,
|
||||||
|
dictionary_num_bytes: Option<ByteCount>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ColumnSpaceUsage {
|
||||||
|
pub(crate) fn new(
|
||||||
|
column_num_bytes: ByteCount,
|
||||||
|
dictionary_num_bytes: Option<ByteCount>,
|
||||||
|
) -> Self {
|
||||||
|
ColumnSpaceUsage {
|
||||||
|
column_num_bytes,
|
||||||
|
dictionary_num_bytes,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn column_num_bytes(&self) -> ByteCount {
|
||||||
|
self.column_num_bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn dictionary_num_bytes(&self) -> Option<ByteCount> {
|
||||||
|
self.dictionary_num_bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn total_num_bytes(&self) -> ByteCount {
|
||||||
|
self.column_num_bytes + self.dictionary_num_bytes.unwrap_or_default()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Merge two space usage values by summing their components.
|
||||||
|
pub fn merge(&self, other: &ColumnSpaceUsage) -> ColumnSpaceUsage {
|
||||||
|
let dictionary_num_bytes = match (self.dictionary_num_bytes, other.dictionary_num_bytes) {
|
||||||
|
(Some(lhs), Some(rhs)) => Some(lhs + rhs),
|
||||||
|
(Some(val), None) | (None, Some(val)) => Some(val),
|
||||||
|
(None, None) => None,
|
||||||
|
};
|
||||||
|
ColumnSpaceUsage {
|
||||||
|
column_num_bytes: self.column_num_bytes + other.column_num_bytes,
|
||||||
|
dictionary_num_bytes,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ pub use columnar::{
|
|||||||
use sstable::VoidSSTable;
|
use sstable::VoidSSTable;
|
||||||
pub use value::{NumericalType, NumericalValue};
|
pub use value::{NumericalType, NumericalValue};
|
||||||
|
|
||||||
pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};
|
pub use self::dynamic_column::{ColumnSpaceUsage, DynamicColumn, DynamicColumnHandle};
|
||||||
|
|
||||||
pub type RowId = u32;
|
pub type RowId = u32;
|
||||||
pub type DocId = u32;
|
pub type DocId = u32;
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ use std::ops::Range;
|
|||||||
use common::{BinarySerializable, CountingWriter, HasLen, VInt};
|
use common::{BinarySerializable, CountingWriter, HasLen, VInt};
|
||||||
|
|
||||||
use crate::directory::{FileSlice, TerminatingWrite, WritePtr};
|
use crate::directory::{FileSlice, TerminatingWrite, WritePtr};
|
||||||
use crate::schema::Field;
|
use crate::schema::{Field, Schema};
|
||||||
use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
|
use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
|
||||||
|
|
||||||
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
|
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
|
||||||
@@ -167,10 +167,11 @@ impl CompositeFile {
|
|||||||
.map(|byte_range| self.data.slice(byte_range.clone()))
|
.map(|byte_range| self.data.slice(byte_range.clone()))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn space_usage(&self) -> PerFieldSpaceUsage {
|
pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage {
|
||||||
let mut fields = Vec::new();
|
let mut fields = Vec::new();
|
||||||
for (&field_addr, byte_range) in &self.offsets_index {
|
for (&field_addr, byte_range) in &self.offsets_index {
|
||||||
let mut field_usage = FieldUsage::empty(field_addr.field);
|
let field_name = schema.get_field_name(field_addr.field).to_string();
|
||||||
|
let mut field_usage = FieldUsage::empty(field_name);
|
||||||
field_usage.add_field_idx(field_addr.idx, byte_range.len().into());
|
field_usage.add_field_idx(field_addr.idx, byte_range.len().into());
|
||||||
fields.push(field_usage);
|
fields.push(field_usage);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use columnar::{
|
|||||||
};
|
};
|
||||||
use common::ByteCount;
|
use common::ByteCount;
|
||||||
|
|
||||||
use crate::core::json_utils::encode_column_name;
|
use crate::core::json_utils::{encode_column_name, json_path_sep_to_dot};
|
||||||
use crate::directory::FileSlice;
|
use crate::directory::FileSlice;
|
||||||
use crate::schema::{Field, FieldEntry, FieldType, Schema};
|
use crate::schema::{Field, FieldEntry, FieldType, Schema};
|
||||||
use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
|
use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
|
||||||
@@ -39,19 +39,15 @@ impl FastFieldReaders {
|
|||||||
self.resolve_column_name_given_default_field(column_name, default_field_opt)
|
self.resolve_column_name_given_default_field(column_name, default_field_opt)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn space_usage(&self, schema: &Schema) -> io::Result<PerFieldSpaceUsage> {
|
pub(crate) fn space_usage(&self) -> io::Result<PerFieldSpaceUsage> {
|
||||||
let mut per_field_usages: Vec<FieldUsage> = Default::default();
|
let mut per_field_usages: Vec<FieldUsage> = Default::default();
|
||||||
for (field, field_entry) in schema.fields() {
|
for (mut field_name, column_handle) in self.columnar.iter_columns()? {
|
||||||
let column_handles = self.columnar.read_columns(field_entry.name())?;
|
json_path_sep_to_dot(&mut field_name);
|
||||||
let num_bytes: ByteCount = column_handles
|
let space_usage = column_handle.space_usage()?;
|
||||||
.iter()
|
let mut field_usage = FieldUsage::empty(field_name);
|
||||||
.map(|column_handle| column_handle.num_bytes())
|
field_usage.set_column_usage(space_usage);
|
||||||
.sum();
|
|
||||||
let mut field_usage = FieldUsage::empty(field);
|
|
||||||
field_usage.add_field_idx(0, num_bytes);
|
|
||||||
per_field_usages.push(field_usage);
|
per_field_usages.push(field_usage);
|
||||||
}
|
}
|
||||||
// TODO fix space usage for JSON fields.
|
|
||||||
Ok(PerFieldSpaceUsage::new(per_field_usages))
|
Ok(PerFieldSpaceUsage::new(per_field_usages))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use std::sync::Arc;
|
|||||||
|
|
||||||
use super::{fieldnorm_to_id, id_to_fieldnorm};
|
use super::{fieldnorm_to_id, id_to_fieldnorm};
|
||||||
use crate::directory::{CompositeFile, FileSlice, OwnedBytes};
|
use crate::directory::{CompositeFile, FileSlice, OwnedBytes};
|
||||||
use crate::schema::Field;
|
use crate::schema::{Field, Schema};
|
||||||
use crate::space_usage::PerFieldSpaceUsage;
|
use crate::space_usage::PerFieldSpaceUsage;
|
||||||
use crate::DocId;
|
use crate::DocId;
|
||||||
|
|
||||||
@@ -37,8 +37,8 @@ impl FieldNormReaders {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Return a break down of the space usage per field.
|
/// Return a break down of the space usage per field.
|
||||||
pub fn space_usage(&self) -> PerFieldSpaceUsage {
|
pub fn space_usage(&self, schema: &Schema) -> PerFieldSpaceUsage {
|
||||||
self.data.space_usage()
|
self.data.space_usage(schema)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a handle to inner file
|
/// Returns a handle to inner file
|
||||||
|
|||||||
@@ -455,11 +455,11 @@ impl SegmentReader {
|
|||||||
pub fn space_usage(&self) -> io::Result<SegmentSpaceUsage> {
|
pub fn space_usage(&self) -> io::Result<SegmentSpaceUsage> {
|
||||||
Ok(SegmentSpaceUsage::new(
|
Ok(SegmentSpaceUsage::new(
|
||||||
self.num_docs(),
|
self.num_docs(),
|
||||||
self.termdict_composite.space_usage(),
|
self.termdict_composite.space_usage(self.schema()),
|
||||||
self.postings_composite.space_usage(),
|
self.postings_composite.space_usage(self.schema()),
|
||||||
self.positions_composite.space_usage(),
|
self.positions_composite.space_usage(self.schema()),
|
||||||
self.fast_fields_readers.space_usage(self.schema())?,
|
self.fast_fields_readers.space_usage()?,
|
||||||
self.fieldnorm_readers.space_usage(),
|
self.fieldnorm_readers.space_usage(self.schema()),
|
||||||
self.get_store_reader(0)?.space_usage(),
|
self.get_store_reader(0)?.space_usage(),
|
||||||
self.alive_bitset_opt
|
self.alive_bitset_opt
|
||||||
.as_ref()
|
.as_ref()
|
||||||
|
|||||||
@@ -216,9 +216,7 @@ use once_cell::sync::Lazy;
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
|
pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
|
||||||
#[doc(hidden)]
|
pub use crate::core::{json_utils, Executor, Searcher, SearcherGeneration};
|
||||||
pub use crate::core::json_utils;
|
|
||||||
pub use crate::core::{Executor, Searcher, SearcherGeneration};
|
|
||||||
pub use crate::directory::Directory;
|
pub use crate::directory::Directory;
|
||||||
pub use crate::index::{
|
pub use crate::index::{
|
||||||
Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
|
Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
|
||||||
|
|||||||
@@ -7,13 +7,14 @@
|
|||||||
//! storage-level details into consideration. For example, if your file system block size is 4096
|
//! storage-level details into consideration. For example, if your file system block size is 4096
|
||||||
//! bytes, we can under-count actual resultant space usage by up to 4095 bytes per file.
|
//! bytes, we can under-count actual resultant space usage by up to 4095 bytes per file.
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::btree_map::Entry;
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
|
use columnar::ColumnSpaceUsage;
|
||||||
use common::ByteCount;
|
use common::ByteCount;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::index::SegmentComponent;
|
use crate::index::SegmentComponent;
|
||||||
use crate::schema::Field;
|
|
||||||
|
|
||||||
/// Enum containing any of the possible space usage results for segment components.
|
/// Enum containing any of the possible space usage results for segment components.
|
||||||
pub enum ComponentSpaceUsage {
|
pub enum ComponentSpaceUsage {
|
||||||
@@ -212,17 +213,26 @@ impl StoreSpaceUsage {
|
|||||||
/// Multiple indexes are used to handle variable length things, where
|
/// Multiple indexes are used to handle variable length things, where
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||||
pub struct PerFieldSpaceUsage {
|
pub struct PerFieldSpaceUsage {
|
||||||
fields: HashMap<Field, FieldUsage>,
|
fields: BTreeMap<String, FieldUsage>,
|
||||||
total: ByteCount,
|
total: ByteCount,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PerFieldSpaceUsage {
|
impl PerFieldSpaceUsage {
|
||||||
pub(crate) fn new(fields: Vec<FieldUsage>) -> PerFieldSpaceUsage {
|
pub(crate) fn new(fields: Vec<FieldUsage>) -> PerFieldSpaceUsage {
|
||||||
let total = fields.iter().map(FieldUsage::total).sum();
|
let mut total = ByteCount::default();
|
||||||
let field_usage_map: HashMap<Field, FieldUsage> = fields
|
let mut field_usage_map: BTreeMap<String, FieldUsage> = BTreeMap::new();
|
||||||
.into_iter()
|
for field_usage in fields {
|
||||||
.map(|field_usage| (field_usage.field(), field_usage))
|
total += field_usage.total();
|
||||||
.collect();
|
let field_name = field_usage.field_name().to_string();
|
||||||
|
match field_usage_map.entry(field_name) {
|
||||||
|
Entry::Vacant(entry) => {
|
||||||
|
entry.insert(field_usage);
|
||||||
|
}
|
||||||
|
Entry::Occupied(mut entry) => {
|
||||||
|
entry.get_mut().merge(field_usage);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
PerFieldSpaceUsage {
|
PerFieldSpaceUsage {
|
||||||
fields: field_usage_map,
|
fields: field_usage_map,
|
||||||
total,
|
total,
|
||||||
@@ -230,8 +240,8 @@ impl PerFieldSpaceUsage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Per field space usage
|
/// Per field space usage
|
||||||
pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> {
|
pub fn fields(&self) -> impl Iterator<Item = &FieldUsage> {
|
||||||
self.fields.iter()
|
self.fields.values()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Bytes used by the represented file
|
/// Bytes used by the represented file
|
||||||
@@ -246,20 +256,23 @@ impl PerFieldSpaceUsage {
|
|||||||
/// See documentation for [`PerFieldSpaceUsage`] for slightly more information.
|
/// See documentation for [`PerFieldSpaceUsage`] for slightly more information.
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||||
pub struct FieldUsage {
|
pub struct FieldUsage {
|
||||||
field: Field,
|
field_name: String,
|
||||||
num_bytes: ByteCount,
|
num_bytes: ByteCount,
|
||||||
/// A field can be composed of more than one piece.
|
/// A field can be composed of more than one piece.
|
||||||
/// These pieces are indexed by arbitrary numbers starting at zero.
|
/// These pieces are indexed by arbitrary numbers starting at zero.
|
||||||
/// `self.num_bytes` includes all of `self.sub_num_bytes`.
|
/// `self.num_bytes` includes all of `self.sub_num_bytes`.
|
||||||
sub_num_bytes: Vec<Option<ByteCount>>,
|
sub_num_bytes: Vec<Option<ByteCount>>,
|
||||||
|
/// Space usage of the column for fast fields, if relevant.
|
||||||
|
column_space_usage: Option<ColumnSpaceUsage>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FieldUsage {
|
impl FieldUsage {
|
||||||
pub(crate) fn empty(field: Field) -> FieldUsage {
|
pub(crate) fn empty(field_name: impl Into<String>) -> FieldUsage {
|
||||||
FieldUsage {
|
FieldUsage {
|
||||||
field,
|
field_name: field_name.into(),
|
||||||
num_bytes: Default::default(),
|
num_bytes: Default::default(),
|
||||||
sub_num_bytes: Vec::new(),
|
sub_num_bytes: Vec::new(),
|
||||||
|
column_space_usage: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -272,9 +285,14 @@ impl FieldUsage {
|
|||||||
self.num_bytes += size
|
self.num_bytes += size
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Records the column breakdown for this field and adds the column's total size
/// (column bytes plus dictionary bytes, if any) to the field's byte count.
///
/// The size is added on top of the current `num_bytes`, not substituted for it.
pub(crate) fn set_column_usage(&mut self, column_space_usage: ColumnSpaceUsage) {
    let added_num_bytes = column_space_usage.total_num_bytes();
    self.column_space_usage = Some(column_space_usage);
    self.num_bytes += added_num_bytes;
}
|
||||||
|
|
||||||
/// Field
|
/// Field
|
||||||
pub fn field(&self) -> Field {
|
pub fn field_name(&self) -> &str {
|
||||||
self.field
|
&self.field_name
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Space usage for each index
|
/// Space usage for each index
|
||||||
@@ -282,16 +300,64 @@ impl FieldUsage {
|
|||||||
&self.sub_num_bytes[..]
|
&self.sub_num_bytes[..]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the number of bytes used by the column payload, if the field is columnar.
|
||||||
|
pub fn column_num_bytes(&self) -> Option<ByteCount> {
|
||||||
|
self.column_space_usage
|
||||||
|
.as_ref()
|
||||||
|
.map(ColumnSpaceUsage::column_num_bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the number of bytes used by the dictionary for dictionary-encoded columns.
|
||||||
|
pub fn dictionary_num_bytes(&self) -> Option<ByteCount> {
|
||||||
|
self.column_space_usage
|
||||||
|
.as_ref()
|
||||||
|
.and_then(ColumnSpaceUsage::dictionary_num_bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the space usage of the column, if any.
|
||||||
|
pub fn column_space_usage(&self) -> Option<&ColumnSpaceUsage> {
|
||||||
|
self.column_space_usage.as_ref()
|
||||||
|
}
|
||||||
|
|
||||||
/// Total bytes used for this field in this context
|
/// Total bytes used for this field in this context
|
||||||
pub fn total(&self) -> ByteCount {
|
pub fn total(&self) -> ByteCount {
|
||||||
self.num_bytes
|
self.num_bytes
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn merge(&mut self, other: FieldUsage) {
|
||||||
|
assert_eq!(self.field_name, other.field_name);
|
||||||
|
self.num_bytes += other.num_bytes;
|
||||||
|
if other.sub_num_bytes.len() > self.sub_num_bytes.len() {
|
||||||
|
self.sub_num_bytes.resize(other.sub_num_bytes.len(), None);
|
||||||
|
}
|
||||||
|
for (idx, num_bytes_opt) in other.sub_num_bytes.into_iter().enumerate() {
|
||||||
|
if let Some(num_bytes) = num_bytes_opt {
|
||||||
|
match self.sub_num_bytes[idx] {
|
||||||
|
Some(existing) => self.sub_num_bytes[idx] = Some(existing + num_bytes),
|
||||||
|
None => self.sub_num_bytes[idx] = Some(num_bytes),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.column_space_usage =
|
||||||
|
merge_column_space_usage(self.column_space_usage.take(), other.column_space_usage);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn merge_column_space_usage(
|
||||||
|
left: Option<ColumnSpaceUsage>,
|
||||||
|
right: Option<ColumnSpaceUsage>,
|
||||||
|
) -> Option<ColumnSpaceUsage> {
|
||||||
|
match (left, right) {
|
||||||
|
(Some(lhs), Some(rhs)) => Some(lhs.merge(&rhs)),
|
||||||
|
(Some(space), None) | (None, Some(space)) => Some(space),
|
||||||
|
(None, None) => None,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use crate::index::Index;
|
use crate::index::Index;
|
||||||
use crate::schema::{Field, Schema, FAST, INDEXED, STORED, TEXT};
|
use crate::schema::{Schema, FAST, INDEXED, STORED, TEXT};
|
||||||
use crate::space_usage::PerFieldSpaceUsage;
|
use crate::space_usage::PerFieldSpaceUsage;
|
||||||
use crate::{IndexWriter, Term};
|
use crate::{IndexWriter, Term};
|
||||||
|
|
||||||
@@ -307,17 +373,17 @@ mod test {
|
|||||||
|
|
||||||
fn expect_single_field(
|
fn expect_single_field(
|
||||||
field_space: &PerFieldSpaceUsage,
|
field_space: &PerFieldSpaceUsage,
|
||||||
field: &Field,
|
field: &str,
|
||||||
min_size: u64,
|
min_size: u64,
|
||||||
max_size: u64,
|
max_size: u64,
|
||||||
) {
|
) {
|
||||||
assert!(field_space.total() >= min_size);
|
assert!(field_space.total() >= min_size);
|
||||||
assert!(field_space.total() <= max_size);
|
assert!(field_space.total() <= max_size);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
vec![(field, field_space.total())],
|
vec![(field.to_string(), field_space.total())],
|
||||||
field_space
|
field_space
|
||||||
.fields()
|
.fields()
|
||||||
.map(|(x, y)| (x, y.total()))
|
.map(|usage| (usage.field_name().to_string(), usage.total()))
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -327,6 +393,7 @@ mod test {
|
|||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let name = schema_builder.add_u64_field("name", FAST | INDEXED);
|
let name = schema_builder.add_u64_field("name", FAST | INDEXED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
|
let field_name = schema.get_field_name(name).to_string();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
|
|
||||||
{
|
{
|
||||||
@@ -349,11 +416,11 @@ mod test {
|
|||||||
|
|
||||||
assert_eq!(4, segment.num_docs());
|
assert_eq!(4, segment.num_docs());
|
||||||
|
|
||||||
expect_single_field(segment.termdict(), &name, 1, 512);
|
expect_single_field(segment.termdict(), &field_name, 1, 512);
|
||||||
expect_single_field(segment.postings(), &name, 1, 512);
|
expect_single_field(segment.postings(), &field_name, 1, 512);
|
||||||
assert_eq!(segment.positions().total(), 0);
|
assert_eq!(segment.positions().total(), 0);
|
||||||
expect_single_field(segment.fast_fields(), &name, 1, 512);
|
expect_single_field(segment.fast_fields(), &field_name, 1, 512);
|
||||||
expect_single_field(segment.fieldnorms(), &name, 1, 512);
|
expect_single_field(segment.fieldnorms(), &field_name, 1, 512);
|
||||||
// TODO: understand why the following fails
|
// TODO: understand why the following fails
|
||||||
// assert_eq!(0, segment.store().total());
|
// assert_eq!(0, segment.store().total());
|
||||||
assert_eq!(segment.deletes(), 0);
|
assert_eq!(segment.deletes(), 0);
|
||||||
@@ -365,6 +432,7 @@ mod test {
|
|||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let name = schema_builder.add_text_field("name", TEXT);
|
let name = schema_builder.add_text_field("name", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
|
let field_name = schema.get_field_name(name).to_string();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
|
|
||||||
{
|
{
|
||||||
@@ -389,11 +457,11 @@ mod test {
|
|||||||
|
|
||||||
assert_eq!(4, segment.num_docs());
|
assert_eq!(4, segment.num_docs());
|
||||||
|
|
||||||
expect_single_field(segment.termdict(), &name, 1, 512);
|
expect_single_field(segment.termdict(), &field_name, 1, 512);
|
||||||
expect_single_field(segment.postings(), &name, 1, 512);
|
expect_single_field(segment.postings(), &field_name, 1, 512);
|
||||||
expect_single_field(segment.positions(), &name, 1, 512);
|
expect_single_field(segment.positions(), &field_name, 1, 512);
|
||||||
assert_eq!(segment.fast_fields().total(), 0);
|
assert_eq!(segment.fast_fields().total(), 0);
|
||||||
expect_single_field(segment.fieldnorms(), &name, 1, 512);
|
expect_single_field(segment.fieldnorms(), &field_name, 1, 512);
|
||||||
// TODO: understand why the following fails
|
// TODO: understand why the following fails
|
||||||
// assert_eq!(0, segment.store().total());
|
// assert_eq!(0, segment.store().total());
|
||||||
assert_eq!(segment.deletes(), 0);
|
assert_eq!(segment.deletes(), 0);
|
||||||
@@ -429,10 +497,15 @@ mod test {
|
|||||||
assert_eq!(4, segment.num_docs());
|
assert_eq!(4, segment.num_docs());
|
||||||
|
|
||||||
assert_eq!(segment.termdict().total(), 0);
|
assert_eq!(segment.termdict().total(), 0);
|
||||||
|
assert!(segment.termdict().fields().next().is_none());
|
||||||
assert_eq!(segment.postings().total(), 0);
|
assert_eq!(segment.postings().total(), 0);
|
||||||
|
assert!(segment.postings().fields().next().is_none());
|
||||||
assert_eq!(segment.positions().total(), 0);
|
assert_eq!(segment.positions().total(), 0);
|
||||||
|
assert!(segment.positions().fields().next().is_none());
|
||||||
assert_eq!(segment.fast_fields().total(), 0);
|
assert_eq!(segment.fast_fields().total(), 0);
|
||||||
|
assert!(segment.fast_fields().fields().next().is_none());
|
||||||
assert_eq!(segment.fieldnorms().total(), 0);
|
assert_eq!(segment.fieldnorms().total(), 0);
|
||||||
|
assert!(segment.fieldnorms().fields().next().is_none());
|
||||||
assert!(segment.store().total() > 0);
|
assert!(segment.store().total() > 0);
|
||||||
assert!(segment.store().total() < 512);
|
assert!(segment.store().total() < 512);
|
||||||
assert_eq!(segment.deletes(), 0);
|
assert_eq!(segment.deletes(), 0);
|
||||||
@@ -444,6 +517,7 @@ mod test {
|
|||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let name = schema_builder.add_u64_field("name", INDEXED);
|
let name = schema_builder.add_u64_field("name", INDEXED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
|
let field_name = schema.get_field_name(name).to_string();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
|
|
||||||
{
|
{
|
||||||
@@ -474,11 +548,11 @@ mod test {
|
|||||||
|
|
||||||
assert_eq!(2, segment_space_usage.num_docs());
|
assert_eq!(2, segment_space_usage.num_docs());
|
||||||
|
|
||||||
expect_single_field(segment_space_usage.termdict(), &name, 1, 512);
|
expect_single_field(segment_space_usage.termdict(), &field_name, 1, 512);
|
||||||
expect_single_field(segment_space_usage.postings(), &name, 1, 512);
|
expect_single_field(segment_space_usage.postings(), &field_name, 1, 512);
|
||||||
assert_eq!(segment_space_usage.positions().total(), 0u64);
|
assert_eq!(segment_space_usage.positions().total(), 0u64);
|
||||||
assert_eq!(segment_space_usage.fast_fields().total(), 0u64);
|
assert_eq!(segment_space_usage.fast_fields().total(), 0u64);
|
||||||
expect_single_field(segment_space_usage.fieldnorms(), &name, 1, 512);
|
expect_single_field(segment_space_usage.fieldnorms(), &field_name, 1, 512);
|
||||||
assert!(segment_space_usage.deletes() > 0);
|
assert!(segment_space_usage.deletes() > 0);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use std::sync::Arc;
|
|||||||
|
|
||||||
use common::bounds::{TransformBound, transform_bound_inner_res};
|
use common::bounds::{TransformBound, transform_bound_inner_res};
|
||||||
use common::file_slice::FileSlice;
|
use common::file_slice::FileSlice;
|
||||||
use common::{BinarySerializable, OwnedBytes};
|
use common::{BinarySerializable, ByteCount, OwnedBytes};
|
||||||
use futures_util::{StreamExt, TryStreamExt, stream};
|
use futures_util::{StreamExt, TryStreamExt, stream};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use tantivy_fst::Automaton;
|
use tantivy_fst::Automaton;
|
||||||
@@ -43,6 +43,7 @@ use crate::{
|
|||||||
pub struct Dictionary<TSSTable: SSTable = VoidSSTable> {
|
pub struct Dictionary<TSSTable: SSTable = VoidSSTable> {
|
||||||
pub sstable_slice: FileSlice,
|
pub sstable_slice: FileSlice,
|
||||||
pub sstable_index: SSTableIndex,
|
pub sstable_index: SSTableIndex,
|
||||||
|
num_bytes: ByteCount,
|
||||||
num_terms: u64,
|
num_terms: u64,
|
||||||
phantom_data: PhantomData<TSSTable>,
|
phantom_data: PhantomData<TSSTable>,
|
||||||
}
|
}
|
||||||
@@ -278,6 +279,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
|||||||
|
|
||||||
/// Opens a `TermDictionary`.
|
/// Opens a `TermDictionary`.
|
||||||
pub fn open(term_dictionary_file: FileSlice) -> io::Result<Self> {
|
pub fn open(term_dictionary_file: FileSlice) -> io::Result<Self> {
|
||||||
|
let num_bytes = term_dictionary_file.num_bytes();
|
||||||
let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(20);
|
let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(20);
|
||||||
let mut footer_len_bytes: OwnedBytes = footer_len_slice.read_bytes()?;
|
let mut footer_len_bytes: OwnedBytes = footer_len_slice.read_bytes()?;
|
||||||
let index_offset = u64::deserialize(&mut footer_len_bytes)?;
|
let index_offset = u64::deserialize(&mut footer_len_bytes)?;
|
||||||
@@ -317,6 +319,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
|||||||
Ok(Dictionary {
|
Ok(Dictionary {
|
||||||
sstable_slice,
|
sstable_slice,
|
||||||
sstable_index,
|
sstable_index,
|
||||||
|
num_bytes,
|
||||||
num_terms,
|
num_terms,
|
||||||
phantom_data: PhantomData,
|
phantom_data: PhantomData,
|
||||||
})
|
})
|
||||||
@@ -343,6 +346,11 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
|||||||
self.num_terms as usize
|
self.num_terms as usize
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the total number of bytes used by the dictionary on disk.
|
||||||
|
pub fn num_bytes(&self) -> ByteCount {
|
||||||
|
self.num_bytes
|
||||||
|
}
|
||||||
|
|
||||||
/// Decode a DeltaReader up to key, returning the number of terms traversed
|
/// Decode a DeltaReader up to key, returning the number of terms traversed
|
||||||
///
|
///
|
||||||
/// If the key was not found, returns Ok(None).
|
/// If the key was not found, returns Ok(None).
|
||||||
|
|||||||
Reference in New Issue
Block a user