Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-28 04:52:55 +00:00)

Compare commits: 0.7.0...dds/lenien (10 commits)
| SHA1 |
|---|
| 488bceda10 |
| 0098e3d428 |
| 69d5e4b9b1 |
| e0cdd3114d |
| f32b4a2ebe |
| 6ff60b8ed8 |
| 8da28fb6cf |
| f2b8755e10 |
| fa269f1f34 |
| e23a9303ce |
.gitignore (vendored, 1 change)

@@ -1,3 +1,4 @@
tantivy.iml
*.swp
target
target/debug

@@ -60,7 +60,6 @@ maplit = "1"
[profile.release]
opt-level = 3
debug = false
lto = true
debug-assertions = false

[profile.test]

@@ -21,7 +21,7 @@

**Tantivy** is a **full text search engine library** written in rust.

It is closer to Lucene than to Elastic Search and Solr in the sense it is not
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
to build such a search engine.

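For orientation, here is a minimal sketch of how the crate can be used to build and query an index. It follows the style of the tests elsewhere in this changeset (`SchemaBuilder`, `Index::create_in_ram`, `writer_with_num_threads`, the `doc!` macro, `load_searchers`); the field name, document contents, and the exact 0.7-era imports are illustrative assumptions, not part of this diff.

```rust
// Minimal indexing/search sketch against the ~0.7-era tantivy API (assumed).
#[macro_use]
extern crate tantivy;

use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() {
    // Declare a schema with one tokenized, indexed text field.
    let mut schema_builder = SchemaBuilder::new();
    let body = schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    // Build a small index entirely in memory.
    let index = Index::create_in_ram(schema);
    {
        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        index_writer.add_document(doc!(body => "hello tantivy"));
        index_writer.add_document(doc!(body => "full text search in rust"));
        index_writer.commit().unwrap();
    }

    // Reload searchers to see the committed segment, then parse a query against it.
    index.load_searchers().unwrap();
    let searcher = index.searcher();
    let query_parser = QueryParser::for_index(&index, vec![body]);
    let query = query_parser.parse_query("text search").unwrap();
    println!("{} documents indexed, query: {:?}", searcher.num_docs(), query);
}
```
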
@@ -49,7 +49,9 @@ Tantivy is, in fact, strongly inspired by Lucene's design.

# Non-features

- Distributed search and will not be in the scope of tantivy.
- Distributed search is out of the scope of tantivy. That being said, tantivy is meant as a
  library upon which one could build a distributed search engine. Serializable/mergeable collector state, for instance,
  is within the scope of tantivy.


# Supported OS and compiler

@@ -11,7 +11,6 @@ main() {
    else
        echo "Build"
        cross build --target $TARGET
        cross build --target $TARGET --release
        if [ ! -z $DISABLE_TESTS ]; then
            return
        fi

@@ -4,6 +4,8 @@ use common::VInt;
use directory::ReadOnlySource;
use directory::WritePtr;
use schema::Field;
use space_usage::PerFieldSpaceUsage;
use space_usage::FieldUsage;
use std::collections::HashMap;
use std::io::Write;
use std::io::{self, Read};
@@ -166,6 +168,16 @@ impl CompositeFile {
            .get(&FileAddr { field, idx })
            .map(|&(from, to)| self.data.slice(from, to))
    }

    pub fn space_usage(&self) -> PerFieldSpaceUsage {
        let mut fields = HashMap::new();
        for (&field_addr, &(start, end)) in self.offsets_index.iter() {
            fields.entry(field_addr.field)
                .or_insert_with(|| FieldUsage::empty(field_addr.field))
                .add_field_idx(field_addr.idx, end - start);
        }
        PerFieldSpaceUsage::new(fields)
    }
}

#[cfg(test)]

@@ -5,6 +5,7 @@ use query::Query;
use schema::Document;
use schema::Schema;
use schema::{Field, Term};
use space_usage::SearcherSpaceUsage;
use std::fmt;
use std::sync::Arc;
use termdict::TermMerger;
@@ -99,6 +100,15 @@ impl Searcher {
            .collect::<Vec<_>>();
        FieldSearcher::new(inv_index_readers)
    }

    /// Summarize total space usage of this searcher.
    pub fn space_usage(&self) -> SearcherSpaceUsage {
        let mut space_usage = SearcherSpaceUsage::new();
        for segment_reader in self.segment_readers.iter() {
            space_usage.add_segment(segment_reader.space_usage());
        }
        space_usage
    }
}

pub struct FieldSearcher {

@@ -16,6 +16,7 @@ use schema::Document;
use schema::Field;
use schema::FieldType;
use schema::Schema;
use space_usage::SegmentSpaceUsage;
use std::collections::HashMap;
use std::fmt;
use std::sync::Arc;
@@ -381,6 +382,21 @@ impl SegmentReader {
    pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
        SegmentReaderAliveDocsIterator::new(&self)
    }

    /// Summarize total space usage of this segment.
    pub fn space_usage(&self) -> SegmentSpaceUsage {
        SegmentSpaceUsage::new(
            self.num_docs(),
            self.termdict_composite.space_usage(),
            self.postings_composite.space_usage(),
            self.positions_composite.space_usage(),
            self.positions_idx_composite.space_usage(),
            self.fast_fields_composite.space_usage(),
            self.fieldnorms_composite.space_usage(),
            self.store_reader.space_usage(),
            self.delete_bitset_opt.as_ref().map(|x| x.space_usage()).unwrap_or(0),
        )
    }
}

impl fmt::Debug for SegmentReader {

@@ -2,6 +2,7 @@ use bit_set::BitSet;
use common::HasLen;
use directory::ReadOnlySource;
use directory::WritePtr;
use space_usage::ByteCount;
use std::io;
use std::io::Write;
use DocId;
@@ -63,6 +64,11 @@ impl DeleteBitSet {
            b & (1u8 << shift) != 0
        }
    }

    /// Summarize total space usage of this bitset.
    pub fn space_usage(&self) -> ByteCount {
        self.data.len()
    }
}

impl HasLen for DeleteBitSet {

@@ -213,6 +213,7 @@ pub(crate) mod positions;
pub mod postings;
pub mod query;
pub mod schema;
pub mod space_usage;
pub mod store;
pub mod termdict;

@@ -177,9 +177,6 @@ impl QueryParser {
    ///
    /// There is currently no lenient mode for the query parser
    /// which makes it a bad choice for a public/broad user search engine.
    ///
    /// Implementing a lenient mode for this query parser is tracked
    /// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
    pub fn parse_query(&self, query: &str) -> Result<Box<Query>, QueryParserError> {
        let logical_ast = self.parse_query_to_logical_ast(query)?;
        Ok(convert_to_query(logical_ast))
@@ -193,6 +190,61 @@ impl QueryParser {
        self.compute_logical_ast(user_input_ast)
    }

    /// Parse a query leniently.
    ///
    /// Note that `parse_query_lenient` will NOT return an error
    /// if the input is not a valid query.
    ///
    /// It will instead strip the special characters from the query body and retry;
    /// if parsing still fails, an `EmptyQuery` is returned.
    pub fn parse_query_lenient(&self, query: &str) -> Box<Query> {
        if let Ok(logical_ast) = self.parse_query_to_logical_ast(query) {
            return convert_to_query(logical_ast);
        }

        // try to clean up the query
        if let Ok(logical_ast) = self.parse_lenient_query_to_logical_ast(query) {
            return convert_to_query(logical_ast);
        }

        // we have no idea what you want, so here's nothing
        Box::new(EmptyQuery)
    }

    /// Parse the user query into an AST.
    fn parse_lenient_query_to_logical_ast(
        &self,
        query: &str,
    ) -> Result<LogicalAST, QueryParserError> {
        // if we are here, we know we have a poorly formed
        // query input

        // Strip special characters: \+-&|!(){}[]^~*?:/
        let special_chars = "\\+-&|!(){}[]^~*?:/";
        let mut scrubbed_query = query
            .chars()
            .filter(|c| !special_chars.contains(*c))
            .collect::<String>();

        // AND, OR and NOT are used by tantivy as logical operators.
        // Lowercase them so they are treated as plain terms rather than operators.
        let special_words = vec!["AND", "OR", "NOT"];
        for word in special_words.iter() {
            scrubbed_query = scrubbed_query.replace(word, &word.to_lowercase());
        }

        // Escape odd quotes
        let quote_count = scrubbed_query.chars().filter(|&c| c == '\"').count();
        if quote_count % 2 == 1 {
            scrubbed_query = scrubbed_query.replace("\"", "\\\"");
        }

        let (user_input_ast, _remaining) = parse_to_ast()
            .parse(scrubbed_query.as_str())
            .map_err(|_| QueryParserError::SyntaxError)?;
        self.compute_logical_ast(user_input_ast)
    }

    fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
        self.schema
            .get_field(field_name)
@@ -544,6 +596,26 @@ mod test {
        assert!(query_parser.parse_query("toto").is_ok());
    }

    #[test]
    pub fn test_parse_query_lenient_no_panics() {
        let query_parser = make_query_parser();

        query_parser.parse_query_lenient("toto");
        query_parser.parse_query_lenient("");
        query_parser.parse_query_lenient("+(happy");
    }

    #[test]
    pub fn test_parse_query_lenient_escapes_bad_queries() {
        let query_parser = make_query_parser();

        let query = query_parser
            .parse_lenient_query_to_logical_ast("+(happy")
            .unwrap();
        let query_str = format!("{:?}", query);
        assert_eq!(query_str, "(Term([0, 0, 0, 0, 104, 97, 112, 112, 121]) Term([0, 0, 0, 1, 104, 97, 112, 112, 121]))");
    }

    #[test]
    pub fn test_parse_nonindexed_field_yields_error() {
        let query_parser = make_query_parser();

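For reference, a short sketch of how the strict and lenient entry points might compare on a malformed query such as the `"+(happy"` string used in the tests above. The schema setup and the `QueryParser::for_index` construction are assumptions for illustration; only `parse_query` and `parse_query_lenient` come from this diff.

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() {
    // Assumed setup: a one-field schema and an empty in-memory index.
    let mut schema_builder = SchemaBuilder::new();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let query_parser = QueryParser::for_index(&index, vec![title]);

    // "+(happy" has an unclosed parenthesis: the strict parser rejects it...
    assert!(query_parser.parse_query("+(happy").is_err());

    // ...while the lenient parser scrubs the special characters, re-parses,
    // and still hands back a usable query (or an EmptyQuery as a last resort).
    let query = query_parser.parse_query_lenient("+(happy");
    println!("{:?}", query);
}
```
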
@@ -80,9 +80,6 @@ impl UserInputBound {
pub enum UserInputAST {
    Clause(Vec<UserInputAST>),
    Unary(Occur, Box<UserInputAST>),
    // Not(Box<UserInputAST>),
    // Should(Box<UserInputAST>),
    // Must(Box<UserInputAST>),
    Leaf(Box<UserInputLeaf>),
}

@@ -92,7 +89,7 @@ impl UserInputAST {
    }

    fn compose(occur: Occur, asts: Vec<UserInputAST>) -> UserInputAST {
        assert!(occur != Occur::MustNot);
        assert_ne!(occur, Occur::MustNot);
        assert!(!asts.is_empty());
        if asts.len() == 1 {
            asts.into_iter().next().unwrap() //< safe
@@ -114,42 +111,6 @@ impl UserInputAST {
    }
}

/*
impl UserInputAST {

    fn compose_occur(self, occur: Occur) -> UserInputAST {
        match self {
            UserInputAST::Not(other) => {
                let new_occur = compose_occur(Occur::MustNot, occur);
                other.simplify()
            }
            _ => {
                self
            }
        }
    }

    pub fn simplify(self) -> UserInputAST {
        match self {
            UserInputAST::Clause(els) => {
                if els.len() == 1 {
                    return els.into_iter().next().unwrap();
                } else {
                    return self;
                }
            }
            UserInputAST::Not(els) => {
                if els.len() == 1 {
                    return els.into_iter().next().unwrap();
                } else {
                    return self;
                }
            }
        }
    }
}
*/

impl From<UserInputLiteral> for UserInputLeaf {
    fn from(literal: UserInputLiteral) -> UserInputLeaf {
        UserInputLeaf::Literal(literal)

src/space_usage/mod.rs (new file, 484 lines)

@@ -0,0 +1,484 @@
/*!
Representations for the space usage of various parts of a Tantivy index.

This can be used programmatically, and will also be exposed in a human readable fashion in
tantivy-cli.

One important caveat for all of this functionality is that none of it currently takes storage-level
details into consideration. For example, if your file system block size is 4096 bytes, we can
under-count actual resultant space usage by up to 4095 bytes per file.
*/
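
To make the shape of this API concrete before the definitions below, here is a sketch of how the space-usage tree might be walked from a `Searcher`. It only uses accessors introduced in this changeset (`space_usage`, `segments`, `num_docs`, `termdict`, `fields`, `store`, `total`); the helper function and its printout format are illustrative, not part of the diff.

```rust
use tantivy::Index;

// Hypothetical helper: print a per-segment and per-field space breakdown for an index.
fn print_space_usage(index: &Index) {
    index.load_searchers().unwrap();
    let usage = index.searcher().space_usage();
    println!("total: {} bytes in {} segment(s)", usage.total(), usage.segments().len());

    for segment in usage.segments() {
        println!("segment ({} docs): {} bytes", segment.num_docs(), segment.total());
        // Per-field breakdown of one component, here the term dictionary.
        for (field, field_usage) in segment.termdict().fields() {
            println!("  termdict {:?}: {} bytes", field, field_usage.total());
        }
        // The store is reported as data + offsets rather than per field.
        println!(
            "  store: {} bytes ({} data + {} offsets)",
            segment.store().total(),
            segment.store().data_usage(),
            segment.store().offsets_usage()
        );
    }
}
```
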
use schema::Field;
use std::collections::HashMap;
use SegmentComponent;

/// Indicates space usage in bytes
pub type ByteCount = usize;

/// Enum containing any of the possible space usage results for segment components.
pub enum ComponentSpaceUsage {
    /// Data is stored per field in a uniform way
    PerField(PerFieldSpaceUsage),
    /// Data is stored in separate pieces in the store
    Store(StoreSpaceUsage),
    /// Some sort of raw byte count
    Basic(ByteCount),
}

/// Represents combined space usage of an entire searcher and its component segments.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SearcherSpaceUsage {
    segments: Vec<SegmentSpaceUsage>,
    total: ByteCount,
}

impl SearcherSpaceUsage {
    pub(crate) fn new() -> SearcherSpaceUsage {
        SearcherSpaceUsage {
            segments: Vec::new(),
            total: 0,
        }
    }

    /// Add a segment to `self`.
    /// Performs no deduplication or other intelligence.
    pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
        self.total += segment.total();
        self.segments.push(segment);
    }

    /// Per segment space usage
    pub fn segments(&self) -> &[SegmentSpaceUsage] {
        &self.segments[..]
    }

    /// Returns total byte usage of this searcher, including all large subcomponents.
    /// Does not account for smaller things like `meta.json`.
    pub fn total(&self) -> ByteCount {
        self.total
    }
}

/// Represents combined space usage for all of the large components comprising a segment.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SegmentSpaceUsage {
    num_docs: u32,

    termdict: PerFieldSpaceUsage,
    postings: PerFieldSpaceUsage,
    positions: PerFieldSpaceUsage,
    positions_idx: PerFieldSpaceUsage,
    fast_fields: PerFieldSpaceUsage,
    fieldnorms: PerFieldSpaceUsage,

    store: StoreSpaceUsage,

    deletes: ByteCount,

    total: ByteCount,
}

impl SegmentSpaceUsage {
    pub(crate) fn new(
        num_docs: u32,
        termdict: PerFieldSpaceUsage,
        postings: PerFieldSpaceUsage,
        positions: PerFieldSpaceUsage,
        positions_idx: PerFieldSpaceUsage,
        fast_fields: PerFieldSpaceUsage,
        fieldnorms: PerFieldSpaceUsage,
        store: StoreSpaceUsage,
        deletes: ByteCount,
    ) -> SegmentSpaceUsage {
        let total = termdict.total()
            + postings.total()
            + positions.total()
            + positions_idx.total()
            + fast_fields.total()
            + fieldnorms.total()
            + store.total()
            + deletes;
        SegmentSpaceUsage {
            num_docs,
            termdict,
            postings,
            positions,
            positions_idx,
            fast_fields,
            fieldnorms,
            store,
            deletes,
            total,
        }
    }

    /// Space usage for the given component
    ///
    /// Clones the underlying data.
    /// Use the components directly if this is somehow in performance critical code.
    pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
        use SegmentComponent::*;
        use self::ComponentSpaceUsage::*;
        match component {
            POSTINGS => PerField(self.postings().clone()),
            POSITIONS => PerField(self.positions().clone()),
            POSITIONSSKIP => PerField(self.positions_skip_idx().clone()),
            FASTFIELDS => PerField(self.fast_fields().clone()),
            FIELDNORMS => PerField(self.fieldnorms().clone()),
            TERMS => PerField(self.termdict().clone()),
            STORE => Store(self.store().clone()),
            DELETE => Basic(self.deletes()),
        }
    }

    /// Num docs in segment
    pub fn num_docs(&self) -> u32 {
        self.num_docs
    }

    /// Space usage for term dictionary
    pub fn termdict(&self) -> &PerFieldSpaceUsage {
        &self.termdict
    }

    /// Space usage for postings list
    pub fn postings(&self) -> &PerFieldSpaceUsage {
        &self.postings
    }

    /// Space usage for positions
    pub fn positions(&self) -> &PerFieldSpaceUsage {
        &self.positions
    }

    /// Space usage for positions skip idx
    pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage {
        &self.positions_idx
    }

    /// Space usage for fast fields
    pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
        &self.fast_fields
    }

    /// Space usage for field norms
    pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
        &self.fieldnorms
    }

    /// Space usage for stored documents
    pub fn store(&self) -> &StoreSpaceUsage {
        &self.store
    }

    /// Space usage for document deletions
    pub fn deletes(&self) -> ByteCount {
        self.deletes
    }

    /// Total space usage in bytes for this segment.
    pub fn total(&self) -> ByteCount {
        self.total
    }
}

/// Represents space usage for the Store for this segment.
///
/// This is composed of two parts.
/// `data` represents the compressed data itself.
/// `offsets` represents a lookup to find the start of a block.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct StoreSpaceUsage {
    data: ByteCount,
    offsets: ByteCount,
}

impl StoreSpaceUsage {
    pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
        StoreSpaceUsage { data, offsets }
    }

    /// Space usage for the data part of the store
    pub fn data_usage(&self) -> ByteCount {
        self.data
    }

    /// Space usage for the offsets part of the store (doc ID -> offset)
    pub fn offsets_usage(&self) -> ByteCount {
        self.offsets
    }

    /// Total space usage in bytes for this Store
    pub fn total(&self) -> ByteCount {
        self.data + self.offsets
    }
}

/// Represents space usage for all of the (field, index) pairs that appear in a CompositeFile.
///
/// A field can appear with a single index (typically 0) or with multiple indexes.
/// Multiple indexes are used to handle variable length things, where each index
/// corresponds to a separate piece of data stored for the field.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PerFieldSpaceUsage {
    fields: HashMap<Field, FieldUsage>,
    total: ByteCount,
}

impl PerFieldSpaceUsage {
    pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
        let total = fields.values().map(|x| x.total()).sum();
        PerFieldSpaceUsage { fields, total }
    }

    /// Per field space usage
    pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> {
        self.fields.iter()
    }

    /// Bytes used by the represented file
    pub fn total(&self) -> ByteCount {
        self.total
    }
}

/// Represents space usage of a given field, breaking it down into the (field, index) pairs that
/// comprise it.
///
/// See documentation for PerFieldSpaceUsage for slightly more information.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FieldUsage {
    field: Field,
    num_bytes: ByteCount,
    /// A field can be composed of more than one piece.
    /// These pieces are indexed by arbitrary numbers starting at zero.
    /// `self.num_bytes` includes all of `self.sub_num_bytes`.
    sub_num_bytes: Vec<Option<ByteCount>>,
}

impl FieldUsage {
    pub(crate) fn empty(field: Field) -> FieldUsage {
        FieldUsage {
            field,
            num_bytes: 0,
            sub_num_bytes: Vec::new(),
        }
    }

    pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
        if self.sub_num_bytes.len() < idx + 1 {
            self.sub_num_bytes.resize(idx + 1, None);
        }
        assert!(self.sub_num_bytes[idx].is_none());
        self.sub_num_bytes[idx] = Some(size);
        self.num_bytes += size;
    }

    /// Field
    pub fn field(&self) -> Field {
        self.field
    }

    /// Space usage for each index
    pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
        &self.sub_num_bytes[..]
    }

    /// Total bytes used for this field in this context
    pub fn total(&self) -> ByteCount {
        self.num_bytes
    }
}

#[cfg(test)]
mod test {
    use core::Index;
    use schema::SchemaBuilder;
    use schema::{FAST, INT_INDEXED, TEXT};
    use schema::Field;
    use space_usage::ByteCount;
    use space_usage::PerFieldSpaceUsage;
    use schema::STORED;
    use Term;

    #[test]
    fn test_empty() {
        let schema = SchemaBuilder::new().build();
        let index = Index::create_in_ram(schema.clone());

        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert_eq!(0, searcher_space_usage.total());
    }

    fn expect_single_field(field_space: &PerFieldSpaceUsage, field: &Field, min_size: ByteCount, max_size: ByteCount) {
        assert!(field_space.total() >= min_size);
        assert!(field_space.total() <= max_size);
        assert_eq!(
            vec![(field, field_space.total())],
            field_space.fields().map(|(x, y)| (x, y.total())).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_fast_indexed() {
        let mut schema_builder = SchemaBuilder::new();
        let name = schema_builder.add_u64_field("name", FAST | INT_INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());

        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(name => 1u64));
            index_writer.add_document(doc!(name => 2u64));
            index_writer.add_document(doc!(name => 10u64));
            index_writer.add_document(doc!(name => 20u64));
            index_writer.commit().unwrap();
        }

        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        expect_single_field(segment.termdict(), &name, 1, 512);
        expect_single_field(segment.postings(), &name, 1, 512);
        assert_eq!(0, segment.positions().total());
        assert_eq!(0, segment.positions_skip_idx().total());
        expect_single_field(segment.fast_fields(), &name, 1, 512);
        expect_single_field(segment.fieldnorms(), &name, 1, 512);
        // TODO: understand why the following fails
        // assert_eq!(0, segment.store().total());
        assert_eq!(0, segment.deletes());
    }

    #[test]
    fn test_text() {
        let mut schema_builder = SchemaBuilder::new();
        let name = schema_builder.add_text_field("name", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());

        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(name => "hi"));
            index_writer.add_document(doc!(name => "this is a test"));
            index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test"));
            index_writer.add_document(doc!(name => "hello hi goodbye"));
            index_writer.commit().unwrap();
        }

        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        expect_single_field(segment.termdict(), &name, 1, 512);
        expect_single_field(segment.postings(), &name, 1, 512);
        expect_single_field(segment.positions(), &name, 1, 512);
        expect_single_field(segment.positions_skip_idx(), &name, 1, 512);
        assert_eq!(0, segment.fast_fields().total());
        expect_single_field(segment.fieldnorms(), &name, 1, 512);
        // TODO: understand why the following fails
        // assert_eq!(0, segment.store().total());
        assert_eq!(0, segment.deletes());
    }

    #[test]
    fn test_store() {
        let mut schema_builder = SchemaBuilder::new();
        let name = schema_builder.add_text_field("name", STORED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());

        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(name => "hi"));
            index_writer.add_document(doc!(name => "this is a test"));
            index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test"));
            index_writer.add_document(doc!(name => "hello hi goodbye"));
            index_writer.commit().unwrap();
        }

        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        assert_eq!(0, segment.termdict().total());
        assert_eq!(0, segment.postings().total());
        assert_eq!(0, segment.positions().total());
        assert_eq!(0, segment.positions_skip_idx().total());
        assert_eq!(0, segment.fast_fields().total());
        assert_eq!(0, segment.fieldnorms().total());
        assert!(segment.store().total() > 0);
        assert!(segment.store().total() < 512);
        assert_eq!(0, segment.deletes());
    }

    #[test]
    fn test_deletes() {
        let mut schema_builder = SchemaBuilder::new();
        let name = schema_builder.add_u64_field("name", INT_INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());

        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(name => 1u64));
            index_writer.add_document(doc!(name => 2u64));
            index_writer.add_document(doc!(name => 3u64));
            index_writer.add_document(doc!(name => 4u64));
            index_writer.commit().unwrap();
        }

        {
            let mut index_writer2 = index.writer(50_000_000).unwrap();
            index_writer2.delete_term(Term::from_field_u64(name, 2u64));
            index_writer2.delete_term(Term::from_field_u64(name, 3u64));

            // ok, now we should have a deleted doc
            index_writer2.commit().unwrap();
        }

        index.load_searchers().unwrap();

        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(2, segment.num_docs());

        expect_single_field(segment.termdict(), &name, 1, 512);
        expect_single_field(segment.postings(), &name, 1, 512);
        assert_eq!(0, segment.positions().total());
        assert_eq!(0, segment.positions_skip_idx().total());
        assert_eq!(0, segment.fast_fields().total());
        expect_single_field(segment.fieldnorms(), &name, 1, 512);
        // TODO: understand why the following fails
        // assert_eq!(0, segment.store().total());
        assert!(segment.deletes() > 0);
    }
}

@@ -6,6 +6,7 @@ use common::BinarySerializable;
use common::VInt;
use directory::ReadOnlySource;
use schema::Document;
use space_usage::StoreSpaceUsage;
use std::cell::RefCell;
use std::io;
use std::mem::size_of;
@@ -87,6 +88,11 @@ impl StoreReader {
        cursor = &cursor[..doc_length];
        Ok(Document::deserialize(&mut cursor)?)
    }

    /// Summarize total space usage of this store reader.
    pub fn space_usage(&self) -> StoreSpaceUsage {
        StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len())
    }
}

#[cfg_attr(