Compare commits


2 Commits

Author          SHA1         Message                    Date
Paul Masurel    507e46f814   Added static directory     2018-10-04 23:28:44 +09:00
Paul Masurel    3d3da2d66f   Compiling in WebAssembly   2018-10-04 08:45:04 +09:00
16 changed files with 78 additions and 617 deletions

View File

@@ -17,7 +17,7 @@ byteorder = "1.0"
lazy_static = "1"
regex = "1.0"
fst = {version="0.3", default-features=false}
fst-regex = { version="0.2" }
fst-regex = { version="0.2", optional=true}
lz4 = {version="1.20", optional=true}
snap = {version="0.2"}
atomicwrites = {version="0.2.2", optional=true}
@@ -68,8 +68,9 @@ overflow-checks = true
[features]
# By default, no_fail is disabled. We manually enable it when running tests.
default = ["mmap", "no_fail"]
default = ["mmap", "no_fail", "regex_query"]
mmap = ["fst/mmap", "atomicwrites"]
+regex_query = ["fst-regex"]
lz4-compression = ["lz4"]
no_fail = ["fail/no_fail"]
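The new `regex_query` feature exists so that builds which do not need regex queries (the WebAssembly target in particular) can drop the `fst-regex` dependency entirely; downstream users can also opt out with `default-features = false`. A minimal sketch of how such a cargo feature gates Rust code; the function below is illustrative, not part of the codebase:

#[cfg(feature = "regex_query")]
pub fn regex_queries_enabled() -> bool {
    true // compiled only when the crate is built with the feature
}

#[cfg(not(feature = "regex_query"))]
pub fn regex_queries_enabled() -> bool {
    false // fallback used when `fst-regex` is not linked in
}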

View File

@@ -21,7 +21,7 @@
**Tantivy** is a **full text search engine library** written in rust.
-It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
+It is closer to Lucene than to Elastic Search and Solr in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
to build such a search engine.
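The "library, not server" distinction in practice: the caller builds the schema, owns the index, and searches in-process. A condensed sketch assembled from the API calls that appear in this diff's own tests (2015-edition style, `.unwrap()` for brevity):

#[macro_use]
extern crate tantivy;

use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() {
    let mut schema_builder = SchemaBuilder::new();
    let name = schema_builder.add_text_field("name", TEXT);
    let schema = schema_builder.build();

    // The whole engine runs inside the calling process.
    let index = Index::create_in_ram(schema);
    let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    writer.add_document(doc!(name => "hello world"));
    writer.commit().unwrap();

    index.load_searchers().unwrap();
    let _searcher = index.searcher();
}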

View File

@@ -4,8 +4,6 @@ use common::VInt;
use directory::ReadOnlySource;
use directory::WritePtr;
use schema::Field;
use space_usage::PerFieldSpaceUsage;
use space_usage::FieldUsage;
use std::collections::HashMap;
use std::io::Write;
use std::io::{self, Read};
@@ -168,16 +166,6 @@ impl CompositeFile {
.get(&FileAddr { field, idx })
.map(|&(from, to)| self.data.slice(from, to))
}
pub fn space_usage(&self) -> PerFieldSpaceUsage {
let mut fields = HashMap::new();
for (&field_addr, &(start, end)) in self.offsets_index.iter() {
fields.entry(field_addr.field)
.or_insert_with(|| FieldUsage::empty(field_addr.field))
.add_field_idx(field_addr.idx, end - start);
}
PerFieldSpaceUsage::new(fields)
}
}
#[cfg(test)]

View File

@@ -5,7 +5,6 @@ use query::Query;
use schema::Document;
use schema::Schema;
use schema::{Field, Term};
use space_usage::SearcherSpaceUsage;
use std::fmt;
use std::sync::Arc;
use termdict::TermMerger;
@@ -100,15 +99,6 @@ impl Searcher {
.collect::<Vec<_>>();
FieldSearcher::new(inv_index_readers)
}
/// Summarize total space usage of this searcher.
pub fn space_usage(&self) -> SearcherSpaceUsage {
let mut space_usage = SearcherSpaceUsage::new();
for segment_reader in self.segment_readers.iter() {
space_usage.add_segment(segment_reader.space_usage());
}
space_usage
}
}
pub struct FieldSearcher {

View File

@@ -16,7 +16,6 @@ use schema::Document;
use schema::Field;
use schema::FieldType;
use schema::Schema;
use space_usage::SegmentSpaceUsage;
use std::collections::HashMap;
use std::fmt;
use std::sync::Arc;
@@ -382,21 +381,6 @@ impl SegmentReader {
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
SegmentReaderAliveDocsIterator::new(&self)
}
/// Summarize total space usage of this segment.
pub fn space_usage(&self) -> SegmentSpaceUsage {
SegmentSpaceUsage::new(
self.num_docs(),
self.termdict_composite.space_usage(),
self.postings_composite.space_usage(),
self.positions_composite.space_usage(),
self.positions_idx_composite.space_usage(),
self.fast_fields_composite.space_usage(),
self.fieldnorms_composite.space_usage(),
self.store_reader.space_usage(),
self.delete_bitset_opt.as_ref().map(|x| x.space_usage()).unwrap_or(0),
)
}
}
impl fmt::Debug for SegmentReader {

View File

@@ -12,6 +12,7 @@ mod managed_directory;
mod ram_directory;
mod read_only_source;
mod shared_vec_slice;
mod static_dictionnary;
/// Errors specific to the directory module.
pub mod error;
@@ -21,6 +22,7 @@ use std::io::{BufWriter, Seek, Write};
pub use self::directory::{Directory, DirectoryClone};
pub use self::ram_directory::RAMDirectory;
pub use self::read_only_source::ReadOnlySource;
pub use self::static_dictionnary::StaticDirectory;
#[cfg(feature = "mmap")]
pub use self::mmap_directory::MmapDirectory;

View File

@@ -5,6 +5,9 @@ use fst::raw::MmapReadOnly;
use stable_deref_trait::{CloneStableDeref, StableDeref};
use std::ops::Deref;
const EMPTY_SLICE: [u8; 0] = [];
/// Read object that represents files in tantivy.
///
/// These read objects are only in charge to deliver
@@ -17,6 +20,8 @@ pub enum ReadOnlySource {
Mmap(MmapReadOnly),
/// Wrapping a `Vec<u8>`
Anonymous(SharedVecSlice),
/// Wrapping a static slice
Static(&'static [u8])
}
unsafe impl StableDeref for ReadOnlySource {}
@@ -33,7 +38,7 @@ impl Deref for ReadOnlySource {
impl ReadOnlySource {
/// Creates an empty ReadOnlySource
pub fn empty() -> ReadOnlySource {
-ReadOnlySource::Anonymous(SharedVecSlice::empty())
+ReadOnlySource::Static(&EMPTY_SLICE)
}
/// Returns the data underlying the ReadOnlySource object.
@@ -42,6 +47,7 @@ impl ReadOnlySource {
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
ReadOnlySource::Static(data) => data,
}
}
@@ -79,6 +85,9 @@ impl ReadOnlySource {
ReadOnlySource::Anonymous(ref shared_vec) => {
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
}
ReadOnlySource::Static(data) => {
ReadOnlySource::Static(&data[from_offset..to_offset])
}
}
}
@@ -118,3 +127,9 @@ impl From<Vec<u8>> for ReadOnlySource {
ReadOnlySource::Anonymous(shared_data)
}
}
impl From<&'static [u8]> for ReadOnlySource {
fn from(data: &'static [u8]) -> ReadOnlySource {
ReadOnlySource::Static(data)
}
}
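The new `Static` variant plus this `From` impl let bytes that live in the binary itself back a `ReadOnlySource` with no mmap and no copy into a `Vec<u8>`, which is the building block for a filesystem-free (WebAssembly) directory. A small usage sketch:

use tantivy::directory::ReadOnlySource;

// Bytes with 'static lifetime, e.g. produced by include_bytes!.
static RAW: &'static [u8] = &[1, 2, 3, 4, 5];

fn demo() {
    let source = ReadOnlySource::from(RAW);
    assert_eq!(&*source, &[1, 2, 3, 4, 5]); // Deref hands back the same bytes

    // Slicing a Static source just re-borrows the static slice.
    let middle = source.slice(1, 4);
    assert_eq!(&*middle, &[2, 3, 4]);
}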

View File

@@ -2,7 +2,6 @@ use bit_set::BitSet;
use common::HasLen;
use directory::ReadOnlySource;
use directory::WritePtr;
use space_usage::ByteCount;
use std::io;
use std::io::Write;
use DocId;
@@ -64,11 +63,6 @@ impl DeleteBitSet {
b & (1u8 << shift) != 0
}
}
/// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount {
self.data.len()
}
}
impl HasLen for DeleteBitSet {

View File

@@ -136,7 +136,7 @@ extern crate crossbeam;
extern crate crossbeam_channel;
extern crate fnv;
extern crate fst;
extern crate fst_regex;
extern crate futures;
extern crate futures_cpupool;
extern crate htmlescape;
@@ -213,7 +213,6 @@ pub(crate) mod positions;
pub mod postings;
pub mod query;
pub mod schema;
pub mod space_usage;
pub mod store;
pub mod termdict;

View File

@@ -16,7 +16,10 @@ mod phrase_query;
mod query;
mod query_parser;
mod range_query;
#[cfg(feature="regex_query")]
mod regex_query;
mod reqopt_scorer;
mod scorer;
mod term_query;
@@ -47,7 +50,10 @@ pub use self::query::Query;
pub use self::query_parser::QueryParser;
pub use self::query_parser::QueryParserError;
pub use self::range_query::RangeQuery;
#[cfg(feature="regex_query")]
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::scorer::ConstScorer;
pub use self::scorer::Scorer;

View File

@@ -177,6 +177,9 @@ impl QueryParser {
///
/// There is currently no lenient mode for the query parser
/// which makes it a bad choice for a public/broad user search engine.
///
/// Implementing a lenient mode for this query parser is tracked
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
pub fn parse_query(&self, query: &str) -> Result<Box<Query>, QueryParserError> {
let logical_ast = self.parse_query_to_logical_ast(query)?;
Ok(convert_to_query(logical_ast))
@@ -190,61 +193,6 @@ impl QueryParser {
self.compute_logical_ast(user_input_ast)
}
/// Parse a query
///
/// Note that `parse_query_lenient` will NOT return an error
/// if the input is not a valid query.
///
/// It will instead strip special characters from the query body
/// and retry; if parsing still fails, an `EmptyQuery` is returned.
pub fn parse_query_lenient(&self, query: &str) -> Box<Query> {
if let Ok(logical_ast) = self.parse_query_to_logical_ast(query) {
return convert_to_query(logical_ast);
}
// try to clean up the query
if let Ok(logical_ast) = self.parse_lenient_query_to_logical_ast(query) {
return convert_to_query(logical_ast);
}
// we have no idea what you want, so here's nothing
Box::new(EmptyQuery)
}
/// Parse the user query into an AST.
fn parse_lenient_query_to_logical_ast(
&self,
query: &str,
) -> Result<LogicalAST, QueryParserError> {
// if we are here, we know we have a poorly formed
// query input
// # Escape special characters: \\+-&|!(){}[]^~*?:\/
let special_chars = "\\+-&|!(){}[]^~*?:/";
let mut scrubbed_query = query
.chars()
.filter(|c| !special_chars.contains(*c))
.collect::<String>();
// AND, OR and NOT are used by tantivy as logical operators. We need
// to escape them
let special_words = vec!["AND", "OR", "NOT"];
for word in special_words.iter() {
scrubbed_query = scrubbed_query.replace(word, &format!("{}", word));
}
// Escape odd quotes
let quote_count = scrubbed_query.chars().filter(|&c| c == '\"').count();
if quote_count % 2 == 1 {
scrubbed_query = scrubbed_query.replace("\"", "\\\"");
}
let (user_input_ast, _remaining) = parse_to_ast()
.parse(scrubbed_query.as_str())
.map_err(|_| QueryParserError::SyntaxError)?;
self.compute_logical_ast(user_input_ast)
}
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
self.schema
.get_field(field_name)
@@ -596,26 +544,6 @@ mod test {
assert!(query_parser.parse_query("toto").is_ok());
}
#[test]
pub fn test_parse_query_lenient_no_panics() {
let query_parser = make_query_parser();
query_parser.parse_query_lenient("toto");
query_parser.parse_query_lenient("");
query_parser.parse_query_lenient("+(happy");
}
#[test]
pub fn test_parse_query_lenient_escapes_bad_queries() {
let query_parser = make_query_parser();
let query = query_parser
.parse_lenient_query_to_logical_ast("+(happy")
.unwrap();
let query_str = format!("{:?}", query);
assert_eq!(query_str, "(Term([0, 0, 0, 0, 104, 97, 112, 112, 121]) Term([0, 0, 0, 1, 104, 97, 112, 112, 121]))");
}
#[test]
pub fn test_parse_nonindexed_field_yields_error() {
let query_parser = make_query_parser();
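With the lenient path gone, callers that want graceful degradation wrap `parse_query` themselves. One possible fallback, sketched with the `EmptyQuery` that the removed code used to return (assuming it is exported from `tantivy::query`):

use tantivy::query::{EmptyQuery, Query, QueryParser};

// Parse user input, degrading to a match-nothing query instead of failing.
fn parse_or_empty(parser: &QueryParser, input: &str) -> Box<Query> {
    parser
        .parse_query(input)
        .unwrap_or_else(|_err| Box::new(EmptyQuery))
}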

View File

@@ -80,6 +80,9 @@ impl UserInputBound {
pub enum UserInputAST {
Clause(Vec<UserInputAST>),
Unary(Occur, Box<UserInputAST>),
// Not(Box<UserInputAST>),
// Should(Box<UserInputAST>),
// Must(Box<UserInputAST>),
Leaf(Box<UserInputLeaf>),
}
@@ -89,7 +92,7 @@ impl UserInputAST {
}
fn compose(occur: Occur, asts: Vec<UserInputAST>) -> UserInputAST {
-assert_ne!(occur, Occur::MustNot);
+assert!(occur != Occur::MustNot);
assert!(!asts.is_empty());
if asts.len() == 1 {
asts.into_iter().next().unwrap() //< safe
@@ -111,6 +114,42 @@ impl UserInputAST {
}
}
/*
impl UserInputAST {
fn compose_occur(self, occur: Occur) -> UserInputAST {
match self {
UserInputAST::Not(other) => {
let new_occur = compose_occur(Occur::MustNot, occur);
other.simplify()
}
_ => {
self
}
}
}
pub fn simplify(self) -> UserInputAST {
match self {
UserInputAST::Clause(els) => {
if els.len() == 1 {
return els.into_iter().next().unwrap();
} else {
return self;
}
}
UserInputAST::Not(els) => {
if els.len() == 1 {
return els.into_iter().next().unwrap();
} else {
return self;
}
}
}
}
}
*/
impl From<UserInputLiteral> for UserInputLeaf {
fn from(literal: UserInputLiteral) -> UserInputLeaf {
UserInputLeaf::Literal(literal)

View File

@@ -1,5 +1,7 @@
+extern crate fst_regex;
use error::TantivyError;
-use fst_regex::Regex;
+use self::fst_regex::Regex;
use query::{AutomatonWeight, Query, Weight};
use schema::Field;
use std::clone::Clone;
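Moving the `extern crate` declaration inside the feature-gated module means the optional dependency is only linked when `regex_query` is enabled. The same pattern in isolation (module and helper names are illustrative):

#[cfg(feature = "regex_query")]
mod regex_support {
    // Linked only when this module is compiled, i.e. only with the feature on.
    extern crate fst_regex;

    use self::fst_regex::Regex;

    pub fn compile(pattern: &str) -> Result<Regex, self::fst_regex::Error> {
        Regex::new(pattern)
    }
}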

View File

@@ -1,484 +0,0 @@
/*!
Representations for the space usage of various parts of a Tantivy index.
This can be used programmatically, and will also be exposed in a human readable fashion in
tantivy-cli.
One important caveat for all of this functionality is that none of it currently takes storage-level
details into consideration. For example, if your file system block size is 4096 bytes, we can
under-count actual resultant space usage by up to 4095 bytes per file.
*/
use schema::Field;
use std::collections::HashMap;
use SegmentComponent;
/// Indicates space usage in bytes
pub type ByteCount = usize;
/// Enum containing any of the possible space usage results for segment components.
pub enum ComponentSpaceUsage {
/// Data is stored per field in a uniform way
PerField(PerFieldSpaceUsage),
/// Data is stored in separate pieces in the store
Store(StoreSpaceUsage),
/// Some sort of raw byte count
Basic(ByteCount),
}
/// Represents combined space usage of an entire searcher and its component segments.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SearcherSpaceUsage {
segments: Vec<SegmentSpaceUsage>,
total: ByteCount,
}
impl SearcherSpaceUsage {
pub(crate) fn new() -> SearcherSpaceUsage {
SearcherSpaceUsage {
segments: Vec::new(),
total: 0,
}
}
/// Add a segment to `self`.
/// Performs no deduplication or other intelligence.
pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
self.total += segment.total();
self.segments.push(segment);
}
/// Per segment space usage
pub fn segments(&self) -> &[SegmentSpaceUsage] {
&self.segments[..]
}
/// Returns total byte usage of this searcher, including all large subcomponents.
/// Does not account for smaller things like `meta.json`.
pub fn total(&self) -> ByteCount {
self.total
}
}
/// Represents combined space usage for all of the large components comprising a segment.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SegmentSpaceUsage {
num_docs: u32,
termdict: PerFieldSpaceUsage,
postings: PerFieldSpaceUsage,
positions: PerFieldSpaceUsage,
positions_idx: PerFieldSpaceUsage,
fast_fields: PerFieldSpaceUsage,
fieldnorms: PerFieldSpaceUsage,
store: StoreSpaceUsage,
deletes: ByteCount,
total: ByteCount,
}
impl SegmentSpaceUsage {
pub(crate) fn new(
num_docs: u32,
termdict: PerFieldSpaceUsage,
postings: PerFieldSpaceUsage,
positions: PerFieldSpaceUsage,
positions_idx: PerFieldSpaceUsage,
fast_fields: PerFieldSpaceUsage,
fieldnorms: PerFieldSpaceUsage,
store: StoreSpaceUsage,
deletes: ByteCount,
) -> SegmentSpaceUsage {
let total = termdict.total()
+ postings.total()
+ positions.total()
+ fast_fields.total()
+ fieldnorms.total()
+ store.total()
+ deletes;
SegmentSpaceUsage {
num_docs,
termdict,
postings,
positions,
positions_idx,
fast_fields,
fieldnorms,
store,
deletes,
total,
}
}
/// Space usage for the given component
///
/// Clones the underlying data.
/// Use the components directly if this is somehow in performance critical code.
pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
use SegmentComponent::*;
use self::ComponentSpaceUsage::*;
match component {
POSTINGS => PerField(self.postings().clone()),
POSITIONS => PerField(self.positions().clone()),
POSITIONSSKIP => PerField(self.positions_skip_idx().clone()),
FASTFIELDS => PerField(self.fast_fields().clone()),
FIELDNORMS => PerField(self.fieldnorms().clone()),
TERMS => PerField(self.termdict().clone()),
STORE => Store(self.store().clone()),
DELETE => Basic(self.deletes()),
}
}
/// Num docs in segment
pub fn num_docs(&self) -> u32 {
self.num_docs
}
/// Space usage for term dictionary
pub fn termdict(&self) -> &PerFieldSpaceUsage {
&self.termdict
}
/// Space usage for postings list
pub fn postings(&self) -> &PerFieldSpaceUsage {
&self.postings
}
/// Space usage for positions
pub fn positions(&self) -> &PerFieldSpaceUsage {
&self.positions
}
/// Space usage for positions skip idx
pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage {
&self.positions_idx
}
/// Space usage for fast fields
pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
&self.fast_fields
}
/// Space usage for field norms
pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
&self.fieldnorms
}
/// Space usage for stored documents
pub fn store(&self) -> &StoreSpaceUsage {
&self.store
}
/// Space usage for document deletions
pub fn deletes(&self) -> ByteCount {
self.deletes
}
/// Total space usage in bytes for this segment.
pub fn total(&self) -> ByteCount {
self.total
}
}
/// Represents space usage for the Store for this segment.
///
/// This is composed of two parts.
/// `data` represents the compressed data itself.
/// `offsets` represents a lookup to find the start of a block
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct StoreSpaceUsage {
data: ByteCount,
offsets: ByteCount,
}
impl StoreSpaceUsage {
pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
StoreSpaceUsage { data, offsets }
}
/// Space usage for the data part of the store
pub fn data_usage(&self) -> ByteCount {
self.data
}
/// Space usage for the offsets part of the store (doc ID -> offset)
pub fn offsets_usage(&self) -> ByteCount {
self.offsets
}
/// Total space usage in bytes for this Store
pub fn total(&self) -> ByteCount {
self.data + self.offsets
}
}
/// Represents space usage for all of the (field, index) pairs that appear in a CompositeFile.
///
/// A field can appear with a single index (typically 0) or with multiple indexes.
/// Multiple indexes are used to handle variable length things, where
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PerFieldSpaceUsage {
fields: HashMap<Field, FieldUsage>,
total: ByteCount
}
impl PerFieldSpaceUsage {
pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
let total = fields.values().map(|x| x.total()).sum();
PerFieldSpaceUsage { fields, total }
}
/// Per field space usage
pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> {
self.fields.iter()
}
/// Bytes used by the represented file
pub fn total(&self) -> ByteCount {
self.total
}
}
/// Represents space usage of a given field, breaking it down into the (field, index) pairs that
/// comprise it.
///
/// See documentation for PerFieldSpaceUsage for slightly more information.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FieldUsage {
field: Field,
num_bytes: ByteCount,
/// A field can be composed of more than one piece.
/// These pieces are indexed by arbitrary numbers starting at zero.
/// `self.num_bytes` includes all of `self.sub_num_bytes`.
sub_num_bytes: Vec<Option<ByteCount>>,
}
impl FieldUsage {
pub(crate) fn empty(field: Field) -> FieldUsage {
FieldUsage {
field,
num_bytes: 0,
sub_num_bytes: Vec::new(),
}
}
pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
if self.sub_num_bytes.len() < idx + 1{
self.sub_num_bytes.resize(idx + 1, None);
}
assert!(self.sub_num_bytes[idx].is_none());
self.sub_num_bytes[idx] = Some(size);
self.num_bytes += size
}
/// Field
pub fn field(&self) -> Field {
self.field
}
/// Space usage for each index
pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
&self.sub_num_bytes[..]
}
/// Total bytes used for this field in this context
pub fn total(&self) -> ByteCount {
self.num_bytes
}
}
#[cfg(test)]
mod test {
use core::Index;
use schema::SchemaBuilder;
use schema::{FAST, INT_INDEXED, TEXT};
use schema::Field;
use space_usage::ByteCount;
use space_usage::PerFieldSpaceUsage;
use schema::STORED;
use Term;
#[test]
fn test_empty() {
let schema = SchemaBuilder::new().build();
let index = Index::create_in_ram(schema.clone());
index.load_searchers().unwrap();
let searcher = index.searcher();
let searcher_space_usage = searcher.space_usage();
assert_eq!(0, searcher_space_usage.total());
}
fn expect_single_field(field_space: &PerFieldSpaceUsage, field: &Field, min_size: ByteCount, max_size: ByteCount) {
assert!(field_space.total() >= min_size);
assert!(field_space.total() <= max_size);
assert_eq!(
vec![(field, field_space.total())],
field_space.fields().map(|(x,y)| (x, y.total())).collect::<Vec<_>>()
);
}
#[test]
fn test_fast_indexed() {
let mut schema_builder = SchemaBuilder::new();
let name = schema_builder.add_u64_field("name", FAST | INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(name => 1u64));
index_writer.add_document(doc!(name => 2u64));
index_writer.add_document(doc!(name => 10u64));
index_writer.add_document(doc!(name => 20u64));
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let searcher_space_usage = searcher.space_usage();
assert!(searcher_space_usage.total() > 0);
assert_eq!(1, searcher_space_usage.segments().len());
let segment = &searcher_space_usage.segments()[0];
assert!(segment.total() > 0);
assert_eq!(4, segment.num_docs());
expect_single_field(segment.termdict(), &name, 1, 512);
expect_single_field(segment.postings(), &name, 1, 512);
assert_eq!(0, segment.positions().total());
assert_eq!(0, segment.positions_skip_idx().total());
expect_single_field(segment.fast_fields(), &name, 1, 512);
expect_single_field(segment.fieldnorms(), &name, 1, 512);
// TODO: understand why the following fails
// assert_eq!(0, segment.store().total());
assert_eq!(0, segment.deletes());
}
#[test]
fn test_text() {
let mut schema_builder = SchemaBuilder::new();
let name = schema_builder.add_text_field("name", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(name => "hi"));
index_writer.add_document(doc!(name => "this is a test"));
index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test"));
index_writer.add_document(doc!(name => "hello hi goodbye"));
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let searcher_space_usage = searcher.space_usage();
assert!(searcher_space_usage.total() > 0);
assert_eq!(1, searcher_space_usage.segments().len());
let segment = &searcher_space_usage.segments()[0];
assert!(segment.total() > 0);
assert_eq!(4, segment.num_docs());
expect_single_field(segment.termdict(), &name, 1, 512);
expect_single_field(segment.postings(), &name, 1, 512);
expect_single_field(segment.positions(), &name, 1, 512);
expect_single_field(segment.positions_skip_idx(), &name, 1, 512);
assert_eq!(0, segment.fast_fields().total());
expect_single_field(segment.fieldnorms(), &name, 1, 512);
// TODO: understand why the following fails
// assert_eq!(0, segment.store().total());
assert_eq!(0, segment.deletes());
}
#[test]
fn test_store() {
let mut schema_builder = SchemaBuilder::new();
let name = schema_builder.add_text_field("name", STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(name => "hi"));
index_writer.add_document(doc!(name => "this is a test"));
index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test"));
index_writer.add_document(doc!(name => "hello hi goodbye"));
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let searcher_space_usage = searcher.space_usage();
assert!(searcher_space_usage.total() > 0);
assert_eq!(1, searcher_space_usage.segments().len());
let segment = &searcher_space_usage.segments()[0];
assert!(segment.total() > 0);
assert_eq!(4, segment.num_docs());
assert_eq!(0, segment.termdict().total());
assert_eq!(0, segment.postings().total());
assert_eq!(0, segment.positions().total());
assert_eq!(0, segment.positions_skip_idx().total());
assert_eq!(0, segment.fast_fields().total());
assert_eq!(0, segment.fieldnorms().total());
assert!(segment.store().total() > 0);
assert!(segment.store().total() < 512);
assert_eq!(0, segment.deletes());
}
#[test]
fn test_deletes() {
let mut schema_builder = SchemaBuilder::new();
let name = schema_builder.add_u64_field("name", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(name => 1u64));
index_writer.add_document(doc!(name => 2u64));
index_writer.add_document(doc!(name => 3u64));
index_writer.add_document(doc!(name => 4u64));
index_writer.commit().unwrap();
}
{
let mut index_writer2 = index.writer(50_000_000).unwrap();
index_writer2.delete_term(Term::from_field_u64(name, 2u64));
index_writer2.delete_term(Term::from_field_u64(name, 3u64));
// ok, now we should have a deleted doc
index_writer2.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let searcher_space_usage = searcher.space_usage();
assert!(searcher_space_usage.total() > 0);
assert_eq!(1, searcher_space_usage.segments().len());
let segment = &searcher_space_usage.segments()[0];
assert!(segment.total() > 0);
assert_eq!(2, segment.num_docs());
expect_single_field(segment.termdict(), &name, 1, 512);
expect_single_field(segment.postings(), &name, 1, 512);
assert_eq!(0, segment.positions().total());
assert_eq!(0, segment.positions_skip_idx().total());
assert_eq!(0, segment.fast_fields().total());
expect_single_field(segment.fieldnorms(), &name, 1, 512);
// TODO: understand why the following fails
// assert_eq!(0, segment.store().total());
assert!(segment.deletes() > 0);
}
}
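For reference, the deleted module was consumed like this: start from the searcher, then drill down into per-segment and per-field byte counts. A sketch against the removed API, obtaining `searcher` as in the tests above:

fn print_space_usage(searcher: &tantivy::Searcher) {
    let usage = searcher.space_usage();
    println!("total: {} bytes across {} segment(s)", usage.total(), usage.segments().len());

    for segment in usage.segments() {
        println!("  {} docs, {} bytes", segment.num_docs(), segment.total());
        // Any of the per-field components can be walked the same way.
        for (field, field_usage) in segment.postings().fields() {
            println!("    postings for {:?}: {} bytes", field, field_usage.total());
        }
    }
}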

View File

@@ -6,7 +6,6 @@ use common::BinarySerializable;
use common::VInt;
use directory::ReadOnlySource;
use schema::Document;
use space_usage::StoreSpaceUsage;
use std::cell::RefCell;
use std::io;
use std::mem::size_of;
@@ -88,11 +87,6 @@ impl StoreReader {
cursor = &cursor[..doc_length];
Ok(Document::deserialize(&mut cursor)?)
}
/// Summarize total space usage of this store reader.
pub fn space_usage(&self) -> StoreSpaceUsage {
StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len())
}
}
#[cfg_attr(

View File

@@ -96,6 +96,9 @@ fn open_fst_index(source: ReadOnlySource) -> fst::Map {
ReadOnlySource::Mmap(mmap_readonly) => {
Fst::from_mmap(mmap_readonly).expect("FST data is corrupted")
}
ReadOnlySource::Static(data) => {
Fst::from_static_slice(data).expect("FST data is corrupted")
}
};
fst::Map::from(fst)
}
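End to end, this is the payoff of the branch: index bytes embedded in the binary flow through `ReadOnlySource::Static` into the term dictionary without any copy, since `Fst::from_static_slice` (fst 0.3) borrows the `'static` bytes directly. A sketch mirroring the match arm above:

extern crate fst;

use fst::raw::Fst;

fn open_static_fst(data: &'static [u8]) -> fst::Map {
    // Zero-copy: the Fst borrows the 'static slice for the life of the program.
    let fst = Fst::from_static_slice(data).expect("FST data is corrupted");
    fst::Map::from(fst)
}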