Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-28 04:52:55 +00:00)

Compare commits: 0.7.0...dds/lenien (10 commits)
| SHA1 |
|---|
| 488bceda10 |
| 0098e3d428 |
| 69d5e4b9b1 |
| e0cdd3114d |
| f32b4a2ebe |
| 6ff60b8ed8 |
| 8da28fb6cf |
| f2b8755e10 |
| fa269f1f34 |
| e23a9303ce |
.gitignore (vendored, 1 change)

@@ -1,3 +1,4 @@
tantivy.iml
*.swp
target
target/debug

@@ -60,7 +60,6 @@ maplit = "1"
[profile.release]
opt-level = 3
debug = false
lto = true
debug-assertions = false

[profile.test]

@@ -21,7 +21,7 @@

**Tantivy** is a **full text search engine library** written in rust.

It is closer to Lucene than to Elastic Search and Solr in the sense it is not
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
to build such a search engine.

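For orientation, here is a minimal sketch of how the crate can be used to build and query an index. It follows the style of the tests elsewhere in this changeset (`SchemaBuilder`, `Index::create_in_ram`, `writer_with_num_threads`, the `doc!` macro, `load_searchers`); the field name, document contents, and the exact 0.7-era imports are illustrative assumptions, not part of this diff.

```rust
// Minimal indexing/search sketch against the ~0.7-era tantivy API (assumed).
#[macro_use]
extern crate tantivy;

use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() {
    // Declare a schema with one tokenized, indexed text field.
    let mut schema_builder = SchemaBuilder::new();
    let body = schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    // Build a small index entirely in memory.
    let index = Index::create_in_ram(schema);
    {
        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        index_writer.add_document(doc!(body => "hello tantivy"));
        index_writer.add_document(doc!(body => "full text search in rust"));
        index_writer.commit().unwrap();
    }

    // Reload searchers to see the committed segment, then parse a query against it.
    index.load_searchers().unwrap();
    let searcher = index.searcher();
    let query_parser = QueryParser::for_index(&index, vec![body]);
    let query = query_parser.parse_query("text search").unwrap();
    println!("{} documents indexed, query: {:?}", searcher.num_docs(), query);
}
```
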
@@ -49,7 +49,9 @@ Tantivy is, in fact, strongly inspired by Lucene's design.

# Non-features

- Distributed search and will not be in the scope of tantivy.
- Distributed search is out of the scope of tantivy. That being said, tantivy is meant as a
  library upon which one could build a distributed search engine. Serializable/mergeable collector state, for instance,
  is within the scope of tantivy.


# Supported OS and compiler

@@ -11,7 +11,6 @@ main() {
    else
        echo "Build"
        cross build --target $TARGET
        cross build --target $TARGET --release
        if [ ! -z $DISABLE_TESTS ]; then
            return
        fi

@@ -4,6 +4,8 @@ use common::VInt;
use directory::ReadOnlySource;
use directory::WritePtr;
use schema::Field;
use space_usage::PerFieldSpaceUsage;
use space_usage::FieldUsage;
use std::collections::HashMap;
use std::io::Write;
use std::io::{self, Read};
@@ -166,6 +168,16 @@ impl CompositeFile {
            .get(&FileAddr { field, idx })
            .map(|&(from, to)| self.data.slice(from, to))
    }

    pub fn space_usage(&self) -> PerFieldSpaceUsage {
        let mut fields = HashMap::new();
        for (&field_addr, &(start, end)) in self.offsets_index.iter() {
            fields.entry(field_addr.field)
                .or_insert_with(|| FieldUsage::empty(field_addr.field))
                .add_field_idx(field_addr.idx, end - start);
        }
        PerFieldSpaceUsage::new(fields)
    }
}

#[cfg(test)]

@@ -5,6 +5,7 @@ use query::Query;
use schema::Document;
use schema::Schema;
use schema::{Field, Term};
use space_usage::SearcherSpaceUsage;
use std::fmt;
use std::sync::Arc;
use termdict::TermMerger;
@@ -99,6 +100,15 @@ impl Searcher {
            .collect::<Vec<_>>();
        FieldSearcher::new(inv_index_readers)
    }

    /// Summarize total space usage of this searcher.
    pub fn space_usage(&self) -> SearcherSpaceUsage {
        let mut space_usage = SearcherSpaceUsage::new();
        for segment_reader in self.segment_readers.iter() {
            space_usage.add_segment(segment_reader.space_usage());
        }
        space_usage
    }
}

pub struct FieldSearcher {

@@ -16,6 +16,7 @@ use schema::Document;
use schema::Field;
use schema::FieldType;
use schema::Schema;
use space_usage::SegmentSpaceUsage;
use std::collections::HashMap;
use std::fmt;
use std::sync::Arc;
@@ -381,6 +382,21 @@ impl SegmentReader {
    pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
        SegmentReaderAliveDocsIterator::new(&self)
    }

    /// Summarize total space usage of this segment.
    pub fn space_usage(&self) -> SegmentSpaceUsage {
        SegmentSpaceUsage::new(
            self.num_docs(),
            self.termdict_composite.space_usage(),
            self.postings_composite.space_usage(),
            self.positions_composite.space_usage(),
            self.positions_idx_composite.space_usage(),
            self.fast_fields_composite.space_usage(),
            self.fieldnorms_composite.space_usage(),
            self.store_reader.space_usage(),
            self.delete_bitset_opt.as_ref().map(|x| x.space_usage()).unwrap_or(0),
        )
    }
}

impl fmt::Debug for SegmentReader {

@@ -2,6 +2,7 @@ use bit_set::BitSet;
use common::HasLen;
use directory::ReadOnlySource;
use directory::WritePtr;
use space_usage::ByteCount;
use std::io;
use std::io::Write;
use DocId;
@@ -63,6 +64,11 @@ impl DeleteBitSet {
            b & (1u8 << shift) != 0
        }
    }

    /// Summarize total space usage of this bitset.
    pub fn space_usage(&self) -> ByteCount {
        self.data.len()
    }
}

impl HasLen for DeleteBitSet {

@@ -213,6 +213,7 @@ pub(crate) mod positions;
pub mod postings;
pub mod query;
pub mod schema;
pub mod space_usage;
pub mod store;
pub mod termdict;

@@ -177,9 +177,6 @@ impl QueryParser {
    ///
    /// There is currently no lenient mode for the query parser
    /// which makes it a bad choice for a public/broad user search engine.
    ///
    /// Implementing a lenient mode for this query parser is tracked
    /// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
    pub fn parse_query(&self, query: &str) -> Result<Box<Query>, QueryParserError> {
        let logical_ast = self.parse_query_to_logical_ast(query)?;
        Ok(convert_to_query(logical_ast))
@@ -193,6 +190,61 @@ impl QueryParser {
        self.compute_logical_ast(user_input_ast)
    }

    /// Parse a query leniently.
    ///
    /// Note that `parse_query_lenient` will NOT return an error
    /// if the input is not a valid query.
    ///
    /// It will instead strip the special characters from the query body and retry;
    /// if parsing still fails, an `EmptyQuery` is returned.
    pub fn parse_query_lenient(&self, query: &str) -> Box<Query> {
        if let Ok(logical_ast) = self.parse_query_to_logical_ast(query) {
            return convert_to_query(logical_ast);
        }

        // try to clean up the query
        if let Ok(logical_ast) = self.parse_lenient_query_to_logical_ast(query) {
            return convert_to_query(logical_ast);
        }

        // we have no idea what you want, so here's nothing
        Box::new(EmptyQuery)
    }

    /// Parse the user query into an AST.
    fn parse_lenient_query_to_logical_ast(
        &self,
        query: &str,
    ) -> Result<LogicalAST, QueryParserError> {
        // if we are here, we know we have a poorly formed
        // query input

        // Strip special characters: \+-&|!(){}[]^~*?:/
        let special_chars = "\\+-&|!(){}[]^~*?:/";
        let mut scrubbed_query = query
            .chars()
            .filter(|c| !special_chars.contains(*c))
            .collect::<String>();

        // AND, OR and NOT are used by tantivy as logical operators.
        // Lowercase them so they are treated as plain terms rather than operators.
        let special_words = vec!["AND", "OR", "NOT"];
        for word in special_words.iter() {
            scrubbed_query = scrubbed_query.replace(word, &word.to_lowercase());
        }

        // Escape odd quotes
        let quote_count = scrubbed_query.chars().filter(|&c| c == '\"').count();
        if quote_count % 2 == 1 {
            scrubbed_query = scrubbed_query.replace("\"", "\\\"");
        }

        let (user_input_ast, _remaining) = parse_to_ast()
            .parse(scrubbed_query.as_str())
            .map_err(|_| QueryParserError::SyntaxError)?;
        self.compute_logical_ast(user_input_ast)
    }

    fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
        self.schema
            .get_field(field_name)
@@ -544,6 +596,26 @@ mod test {
        assert!(query_parser.parse_query("toto").is_ok());
    }

    #[test]
    pub fn test_parse_query_lenient_no_panics() {
        let query_parser = make_query_parser();

        query_parser.parse_query_lenient("toto");
        query_parser.parse_query_lenient("");
        query_parser.parse_query_lenient("+(happy");
    }

    #[test]
    pub fn test_parse_query_lenient_escapes_bad_queries() {
        let query_parser = make_query_parser();

        let query = query_parser
            .parse_lenient_query_to_logical_ast("+(happy")
            .unwrap();
        let query_str = format!("{:?}", query);
        assert_eq!(query_str, "(Term([0, 0, 0, 0, 104, 97, 112, 112, 121]) Term([0, 0, 0, 1, 104, 97, 112, 112, 121]))");
    }

    #[test]
    pub fn test_parse_nonindexed_field_yields_error() {
        let query_parser = make_query_parser();

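For reference, a short sketch of how the strict and lenient entry points might compare on a malformed query such as the `"+(happy"` string used in the tests above. The schema setup and the `QueryParser::for_index` construction are assumptions for illustration; only `parse_query` and `parse_query_lenient` come from this diff.

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() {
    // Assumed setup: a one-field schema and an empty in-memory index.
    let mut schema_builder = SchemaBuilder::new();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let query_parser = QueryParser::for_index(&index, vec![title]);

    // "+(happy" has an unclosed parenthesis: the strict parser rejects it...
    assert!(query_parser.parse_query("+(happy").is_err());

    // ...while the lenient parser scrubs the special characters, re-parses,
    // and still hands back a usable query (or an EmptyQuery as a last resort).
    let query = query_parser.parse_query_lenient("+(happy");
    println!("{:?}", query);
}
```
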
@@ -80,9 +80,6 @@ impl UserInputBound {
pub enum UserInputAST {
    Clause(Vec<UserInputAST>),
    Unary(Occur, Box<UserInputAST>),
    // Not(Box<UserInputAST>),
    // Should(Box<UserInputAST>),
    // Must(Box<UserInputAST>),
    Leaf(Box<UserInputLeaf>),
}

@@ -92,7 +89,7 @@ impl UserInputAST {
    }

    fn compose(occur: Occur, asts: Vec<UserInputAST>) -> UserInputAST {
        assert!(occur != Occur::MustNot);
        assert_ne!(occur, Occur::MustNot);
        assert!(!asts.is_empty());
        if asts.len() == 1 {
            asts.into_iter().next().unwrap() //< safe
@@ -114,42 +111,6 @@ impl UserInputAST {
    }
}

/*
impl UserInputAST {

    fn compose_occur(self, occur: Occur) -> UserInputAST {
        match self {
            UserInputAST::Not(other) => {
                let new_occur = compose_occur(Occur::MustNot, occur);
                other.simplify()
            }
            _ => {
                self
            }
        }
    }

    pub fn simplify(self) -> UserInputAST {
        match self {
            UserInputAST::Clause(els) => {
                if els.len() == 1 {
                    return els.into_iter().next().unwrap();
                } else {
                    return self;
                }
            }
            UserInputAST::Not(els) => {
                if els.len() == 1 {
                    return els.into_iter().next().unwrap();
                } else {
                    return self;
                }
            }
        }
    }
}
*/

impl From<UserInputLiteral> for UserInputLeaf {
    fn from(literal: UserInputLiteral) -> UserInputLeaf {
        UserInputLeaf::Literal(literal)

src/space_usage/mod.rs (new file, 484 lines)

@@ -0,0 +1,484 @@
/*!
Representations for the space usage of various parts of a Tantivy index.

This can be used programmatically, and will also be exposed in a human readable fashion in
tantivy-cli.

One important caveat for all of this functionality is that none of it currently takes storage-level
details into consideration. For example, if your file system block size is 4096 bytes, we can
under-count actual resultant space usage by up to 4095 bytes per file.
*/
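
To make the shape of this API concrete before the definitions below, here is a sketch of how the space-usage tree might be walked from a `Searcher`. It only uses accessors introduced in this changeset (`space_usage`, `segments`, `num_docs`, `termdict`, `fields`, `store`, `total`); the helper function and its printout format are illustrative, not part of the diff.

```rust
use tantivy::Index;

// Hypothetical helper: print a per-segment and per-field space breakdown for an index.
fn print_space_usage(index: &Index) {
    index.load_searchers().unwrap();
    let usage = index.searcher().space_usage();
    println!("total: {} bytes in {} segment(s)", usage.total(), usage.segments().len());

    for segment in usage.segments() {
        println!("segment ({} docs): {} bytes", segment.num_docs(), segment.total());
        // Per-field breakdown of one component, here the term dictionary.
        for (field, field_usage) in segment.termdict().fields() {
            println!("  termdict {:?}: {} bytes", field, field_usage.total());
        }
        // The store is reported as data + offsets rather than per field.
        println!(
            "  store: {} bytes ({} data + {} offsets)",
            segment.store().total(),
            segment.store().data_usage(),
            segment.store().offsets_usage()
        );
    }
}
```
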
use schema::Field;
use std::collections::HashMap;
use SegmentComponent;

/// Indicates space usage in bytes
pub type ByteCount = usize;

/// Enum containing any of the possible space usage results for segment components.
pub enum ComponentSpaceUsage {
    /// Data is stored per field in a uniform way
    PerField(PerFieldSpaceUsage),
    /// Data is stored in separate pieces in the store
    Store(StoreSpaceUsage),
    /// Some sort of raw byte count
    Basic(ByteCount),
}

/// Represents combined space usage of an entire searcher and its component segments.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SearcherSpaceUsage {
    segments: Vec<SegmentSpaceUsage>,
    total: ByteCount,
}

impl SearcherSpaceUsage {
    pub(crate) fn new() -> SearcherSpaceUsage {
        SearcherSpaceUsage {
            segments: Vec::new(),
            total: 0,
        }
    }

    /// Add a segment to `self`.
    /// Performs no deduplication or other intelligence.
    pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
        self.total += segment.total();
        self.segments.push(segment);
    }

    /// Per segment space usage
    pub fn segments(&self) -> &[SegmentSpaceUsage] {
        &self.segments[..]
    }

    /// Returns total byte usage of this searcher, including all large subcomponents.
    /// Does not account for smaller things like `meta.json`.
    pub fn total(&self) -> ByteCount {
        self.total
    }
}

/// Represents combined space usage for all of the large components comprising a segment.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SegmentSpaceUsage {
    num_docs: u32,

    termdict: PerFieldSpaceUsage,
    postings: PerFieldSpaceUsage,
    positions: PerFieldSpaceUsage,
    positions_idx: PerFieldSpaceUsage,
    fast_fields: PerFieldSpaceUsage,
    fieldnorms: PerFieldSpaceUsage,

    store: StoreSpaceUsage,

    deletes: ByteCount,

    total: ByteCount,
}

impl SegmentSpaceUsage {
    pub(crate) fn new(
        num_docs: u32,
        termdict: PerFieldSpaceUsage,
        postings: PerFieldSpaceUsage,
        positions: PerFieldSpaceUsage,
        positions_idx: PerFieldSpaceUsage,
        fast_fields: PerFieldSpaceUsage,
        fieldnorms: PerFieldSpaceUsage,
        store: StoreSpaceUsage,
        deletes: ByteCount,
    ) -> SegmentSpaceUsage {
        let total = termdict.total()
            + postings.total()
            + positions.total()
            + positions_idx.total()
            + fast_fields.total()
            + fieldnorms.total()
            + store.total()
            + deletes;
        SegmentSpaceUsage {
            num_docs,
            termdict,
            postings,
            positions,
            positions_idx,
            fast_fields,
            fieldnorms,
            store,
            deletes,
            total,
        }
    }

    /// Space usage for the given component
    ///
    /// Clones the underlying data.
    /// Use the components directly if this is somehow in performance critical code.
    pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
        use SegmentComponent::*;
        use self::ComponentSpaceUsage::*;
        match component {
            POSTINGS => PerField(self.postings().clone()),
            POSITIONS => PerField(self.positions().clone()),
            POSITIONSSKIP => PerField(self.positions_skip_idx().clone()),
            FASTFIELDS => PerField(self.fast_fields().clone()),
            FIELDNORMS => PerField(self.fieldnorms().clone()),
            TERMS => PerField(self.termdict().clone()),
            STORE => Store(self.store().clone()),
            DELETE => Basic(self.deletes()),
        }
    }

    /// Num docs in segment
    pub fn num_docs(&self) -> u32 {
        self.num_docs
    }

    /// Space usage for term dictionary
    pub fn termdict(&self) -> &PerFieldSpaceUsage {
        &self.termdict
    }

    /// Space usage for postings list
    pub fn postings(&self) -> &PerFieldSpaceUsage {
        &self.postings
    }

    /// Space usage for positions
    pub fn positions(&self) -> &PerFieldSpaceUsage {
        &self.positions
    }

    /// Space usage for positions skip idx
    pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage {
        &self.positions_idx
    }

    /// Space usage for fast fields
    pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
        &self.fast_fields
    }

    /// Space usage for field norms
    pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
        &self.fieldnorms
    }

    /// Space usage for stored documents
    pub fn store(&self) -> &StoreSpaceUsage {
        &self.store
    }

    /// Space usage for document deletions
    pub fn deletes(&self) -> ByteCount {
        self.deletes
    }

    /// Total space usage in bytes for this segment.
    pub fn total(&self) -> ByteCount {
        self.total
    }
}

/// Represents space usage for the Store for this segment.
///
/// This is composed of two parts.
/// `data` represents the compressed data itself.
/// `offsets` represents a lookup to find the start of a block.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct StoreSpaceUsage {
    data: ByteCount,
    offsets: ByteCount,
}

impl StoreSpaceUsage {
    pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
        StoreSpaceUsage { data, offsets }
    }

    /// Space usage for the data part of the store
    pub fn data_usage(&self) -> ByteCount {
        self.data
    }

    /// Space usage for the offsets part of the store (doc ID -> offset)
    pub fn offsets_usage(&self) -> ByteCount {
        self.offsets
    }

    /// Total space usage in bytes for this Store
    pub fn total(&self) -> ByteCount {
        self.data + self.offsets
    }
}

/// Represents space usage for all of the (field, index) pairs that appear in a CompositeFile.
///
/// A field can appear with a single index (typically 0) or with multiple indexes.
/// Multiple indexes are used to handle variable length things, where each index
/// corresponds to a separate piece of data stored for the field.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PerFieldSpaceUsage {
    fields: HashMap<Field, FieldUsage>,
    total: ByteCount,
}

impl PerFieldSpaceUsage {
    pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
        let total = fields.values().map(|x| x.total()).sum();
        PerFieldSpaceUsage { fields, total }
    }

    /// Per field space usage
    pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> {
        self.fields.iter()
    }

    /// Bytes used by the represented file
    pub fn total(&self) -> ByteCount {
        self.total
    }
}

/// Represents space usage of a given field, breaking it down into the (field, index) pairs that
/// comprise it.
///
/// See documentation for PerFieldSpaceUsage for slightly more information.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FieldUsage {
    field: Field,
    num_bytes: ByteCount,
    /// A field can be composed of more than one piece.
    /// These pieces are indexed by arbitrary numbers starting at zero.
    /// `self.num_bytes` includes all of `self.sub_num_bytes`.
    sub_num_bytes: Vec<Option<ByteCount>>,
}

impl FieldUsage {
    pub(crate) fn empty(field: Field) -> FieldUsage {
        FieldUsage {
            field,
            num_bytes: 0,
            sub_num_bytes: Vec::new(),
        }
    }

    pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
        if self.sub_num_bytes.len() < idx + 1 {
            self.sub_num_bytes.resize(idx + 1, None);
        }
        assert!(self.sub_num_bytes[idx].is_none());
        self.sub_num_bytes[idx] = Some(size);
        self.num_bytes += size;
    }

    /// Field
    pub fn field(&self) -> Field {
        self.field
    }

    /// Space usage for each index
    pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
        &self.sub_num_bytes[..]
    }

    /// Total bytes used for this field in this context
    pub fn total(&self) -> ByteCount {
        self.num_bytes
    }
}

#[cfg(test)]
mod test {
    use core::Index;
    use schema::SchemaBuilder;
    use schema::{FAST, INT_INDEXED, TEXT};
    use schema::Field;
    use space_usage::ByteCount;
    use space_usage::PerFieldSpaceUsage;
    use schema::STORED;
    use Term;

    #[test]
    fn test_empty() {
        let schema = SchemaBuilder::new().build();
        let index = Index::create_in_ram(schema.clone());

        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert_eq!(0, searcher_space_usage.total());
    }

    fn expect_single_field(field_space: &PerFieldSpaceUsage, field: &Field, min_size: ByteCount, max_size: ByteCount) {
        assert!(field_space.total() >= min_size);
        assert!(field_space.total() <= max_size);
        assert_eq!(
            vec![(field, field_space.total())],
            field_space.fields().map(|(x, y)| (x, y.total())).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_fast_indexed() {
        let mut schema_builder = SchemaBuilder::new();
        let name = schema_builder.add_u64_field("name", FAST | INT_INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());

        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(name => 1u64));
            index_writer.add_document(doc!(name => 2u64));
            index_writer.add_document(doc!(name => 10u64));
            index_writer.add_document(doc!(name => 20u64));
            index_writer.commit().unwrap();
        }

        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        expect_single_field(segment.termdict(), &name, 1, 512);
        expect_single_field(segment.postings(), &name, 1, 512);
        assert_eq!(0, segment.positions().total());
        assert_eq!(0, segment.positions_skip_idx().total());
        expect_single_field(segment.fast_fields(), &name, 1, 512);
        expect_single_field(segment.fieldnorms(), &name, 1, 512);
        // TODO: understand why the following fails
        // assert_eq!(0, segment.store().total());
        assert_eq!(0, segment.deletes());
    }

    #[test]
    fn test_text() {
        let mut schema_builder = SchemaBuilder::new();
        let name = schema_builder.add_text_field("name", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());

        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(name => "hi"));
            index_writer.add_document(doc!(name => "this is a test"));
            index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test"));
            index_writer.add_document(doc!(name => "hello hi goodbye"));
            index_writer.commit().unwrap();
        }

        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        expect_single_field(segment.termdict(), &name, 1, 512);
        expect_single_field(segment.postings(), &name, 1, 512);
        expect_single_field(segment.positions(), &name, 1, 512);
        expect_single_field(segment.positions_skip_idx(), &name, 1, 512);
        assert_eq!(0, segment.fast_fields().total());
        expect_single_field(segment.fieldnorms(), &name, 1, 512);
        // TODO: understand why the following fails
        // assert_eq!(0, segment.store().total());
        assert_eq!(0, segment.deletes());
    }

    #[test]
    fn test_store() {
        let mut schema_builder = SchemaBuilder::new();
        let name = schema_builder.add_text_field("name", STORED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());

        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(name => "hi"));
            index_writer.add_document(doc!(name => "this is a test"));
            index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test"));
            index_writer.add_document(doc!(name => "hello hi goodbye"));
            index_writer.commit().unwrap();
        }

        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        assert_eq!(0, segment.termdict().total());
        assert_eq!(0, segment.postings().total());
        assert_eq!(0, segment.positions().total());
        assert_eq!(0, segment.positions_skip_idx().total());
        assert_eq!(0, segment.fast_fields().total());
        assert_eq!(0, segment.fieldnorms().total());
        assert!(segment.store().total() > 0);
        assert!(segment.store().total() < 512);
        assert_eq!(0, segment.deletes());
    }

    #[test]
    fn test_deletes() {
        let mut schema_builder = SchemaBuilder::new();
        let name = schema_builder.add_u64_field("name", INT_INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());

        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(name => 1u64));
            index_writer.add_document(doc!(name => 2u64));
            index_writer.add_document(doc!(name => 3u64));
            index_writer.add_document(doc!(name => 4u64));
            index_writer.commit().unwrap();
        }

        {
            let mut index_writer2 = index.writer(50_000_000).unwrap();
            index_writer2.delete_term(Term::from_field_u64(name, 2u64));
            index_writer2.delete_term(Term::from_field_u64(name, 3u64));

            // ok, now we should have a deleted doc
            index_writer2.commit().unwrap();
        }

        index.load_searchers().unwrap();

        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(2, segment.num_docs());

        expect_single_field(segment.termdict(), &name, 1, 512);
        expect_single_field(segment.postings(), &name, 1, 512);
        assert_eq!(0, segment.positions().total());
        assert_eq!(0, segment.positions_skip_idx().total());
        assert_eq!(0, segment.fast_fields().total());
        expect_single_field(segment.fieldnorms(), &name, 1, 512);
        // TODO: understand why the following fails
        // assert_eq!(0, segment.store().total());
        assert!(segment.deletes() > 0);
    }
}

@@ -6,6 +6,7 @@ use common::BinarySerializable;
use common::VInt;
use directory::ReadOnlySource;
use schema::Document;
use space_usage::StoreSpaceUsage;
use std::cell::RefCell;
use std::io;
use std::mem::size_of;
@@ -87,6 +88,11 @@ impl StoreReader {
        cursor = &cursor[..doc_length];
        Ok(Document::deserialize(&mut cursor)?)
    }

    /// Summarize total space usage of this store reader.
    pub fn space_usage(&self) -> StoreSpaceUsage {
        StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len())
    }
}

#[cfg_attr(