Merge branch 'master' of github.com:tantivy-search/tantivy

This commit is contained in:
Paul Masurel
2018-09-07 08:44:14 +09:00
14 changed files with 241 additions and 96 deletions

View File

@@ -48,6 +48,7 @@ census = "0.1"
fnv = "1.0.6"
owned-read = "0.4"
failure = "0.1"
fail = "0.2"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
@@ -60,12 +61,20 @@ opt-level = 3
debug = false
lto = true
debug-assertions = false
overflow-checks = false
[profile.test]
debug-assertions = true
overflow-checks = true
[features]
default = ["mmap"]
# By default the `no_fail` feature is on, i.e. fail points are disabled. We enable fail points manually when running tests (see `run-tests.sh`).
default = ["mmap", "no_fail"]
mmap = ["fst/mmap", "atomicwrites"]
lz4-compression = ["lz4"]
no_fail = ["fail/no_fail"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

View File

@@ -78,6 +78,10 @@ To check out and run tests, you can simply run :
cd tantivy
cargo build
## Running tests
Some tests will not run with just `cargo test` because of `fail-rs`.
To run the tests exhaustively, run `./run-tests.sh`.
# Contribute

View File

@@ -18,5 +18,5 @@ install:
build: false
test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
- REM SET RUST_BACKTRACE=1 & cargo build --examples
- REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1
- REM SET RUST_BACKTRACE=1 & cargo build --examples

View File

@@ -16,7 +16,7 @@ main() {
return
fi
echo "Test"
cross test --target $TARGET
cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1
fi
for example in $(ls examples/*.rs)
do

View File

@@ -23,7 +23,6 @@ use tantivy::Index;
fn main() -> tantivy::Result<()> {
// this example assumes you understand the content in `basic_search`
let index_path = TempDir::new("tantivy_stopwords_example_dir")?;
let mut schema_builder = SchemaBuilder::default();
// This configures your custom options for how tantivy will
@@ -31,36 +30,36 @@ fn main() -> tantivy::Result<()> {
// to note is that we are setting the tokenizer to `stoppy`
// which will be defined and registered below.
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("stoppy")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
.set_tokenizer("stoppy")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
.set_indexing_options(text_field_indexing)
.set_stored();
// Our first field is title.
schema_builder.add_text_field("title", text_options);
// Our second field is body.
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("stoppy")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
.set_tokenizer("stoppy")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
.set_indexing_options(text_field_indexing)
.set_stored();
schema_builder.add_text_field("body", text_options);
let schema = schema_builder.build();
let index = Index::create_in_dir(&index_path, schema.clone())?;
let index = Index::create_in_ram(schema.clone());
// This tokenizer lowers all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = SimpleTokenizer
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec![
"the".to_string(),
"and".to_string(),
]));
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec![
"the".to_string(),
"and".to_string(),
]));
index.tokenizers().register("stoppy", tokenizer);
@@ -76,16 +75,16 @@ fn main() -> tantivy::Result<()> {
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
title => "Frankenstein",
@@ -103,14 +102,9 @@ fn main() -> tantivy::Result<()> {
let query_parser = QueryParser::for_index(&index, vec![title, body]);
// this will have NO hits because it was filtered out
// because the query is run through the analyzer you
// actually will get an error here because the query becomes
// empty
assert!(query_parser.parse_query("the").is_err());
// this will have hits
let query = query_parser.parse_query("is")?;
// stop words are applied on the query as well.
// The following will be equivalent to `title:frankenstein`
let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
let mut top_collector = TopCollector::with_limit(10);
@@ -124,6 +118,4 @@ fn main() -> tantivy::Result<()> {
}
Ok(())
}
use tempdir::TempDir;
}

2
run-tests.sh Executable file
View File

@@ -0,0 +1,2 @@
#!/bin/bash
cargo test --no-default-features --features mmap -- --test-threads 1

View File

@@ -266,14 +266,14 @@ mod tests {
#[test]
fn test_bitset_large() {
let arr = generate_nonunique_unsorted(1_000_000, 50_000);
let arr = generate_nonunique_unsorted(100_000, 5_000);
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
let mut bitset = BitSet::with_max_value(1_000_000);
let mut bitset = BitSet::with_max_value(100_000);
for el in arr {
btreeset.insert(el);
bitset.insert(el);
}
for i in 0..1_000_000 {
for i in 0..100_000 {
assert_eq!(btreeset.contains(&i), bitset.contains(i));
}
assert_eq!(btreeset.len(), bitset.len());

View File

@@ -4,7 +4,6 @@ use core::InvertedIndexReader;
use core::Segment;
use core::SegmentComponent;
use core::SegmentId;
use core::SegmentMeta;
use error::TantivyError;
use fastfield::DeleteBitSet;
use fastfield::FacetReader;
@@ -44,7 +43,8 @@ pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
segment_id: SegmentId,
segment_meta: SegmentMeta,
max_doc: DocId,
num_docs: DocId,
termdict_composite: CompositeFile,
postings_composite: CompositeFile,
@@ -64,7 +64,7 @@ impl SegmentReader {
/// Today, `tantivy` does not handle deletes, so it happens
/// to also be the number of documents in the index.
pub fn max_doc(&self) -> DocId {
self.segment_meta.max_doc()
self.max_doc
}
/// Returns the number of documents.
@@ -73,7 +73,7 @@ impl SegmentReader {
/// Today, `tantivy` does not handle deletes so max doc and
/// num_docs are the same.
pub fn num_docs(&self) -> DocId {
self.segment_meta.num_docs()
self.num_docs
}
/// Returns the schema of the index this segment belongs to.
@@ -225,6 +225,8 @@ impl SegmentReader {
let store_source = segment.open_read(SegmentComponent::STORE)?;
let store_reader = StoreReader::from_source(store_source);
fail_point!("SegmentReader::open#middle");
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
let postings_composite = CompositeFile::open(&postings_source)?;
@@ -260,7 +262,8 @@ impl SegmentReader {
let schema = segment.schema();
Ok(SegmentReader {
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
segment_meta: segment.meta().clone(),
max_doc: segment.meta().max_doc(),
num_docs: segment.meta().num_docs(),
termdict_composite,
postings_composite,
fast_fields_composite,
@@ -432,6 +435,7 @@ mod test {
use schema::{SchemaBuilder, Term, STORED, TEXT};
use DocId;
#[test]
fn test_alive_docs_iterator() {
let mut schema_builder = SchemaBuilder::new();

View File

@@ -195,6 +195,9 @@ impl Directory for RAMDirectory {
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
fail_point!("RAMDirectory::atomic_write", |msg| {
Err(io::Error::new(io::ErrorKind::Other, msg.unwrap_or("Undefined".to_string())))
});
let path_buf = PathBuf::from(path);
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
self.fs.write(path_buf, &Vec::new())?;

View File

@@ -370,7 +370,7 @@ mod tests {
pub fn generate_permutation() -> Vec<u64> {
let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
let mut rng = XorShiftRng::from_seed(seed);
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
rng.shuffle(&mut permutation);
permutation
}

View File

@@ -301,25 +301,31 @@ fn index_documents(
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
&doc_to_opstamps,
last_docstamp,
)?;
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
if may_have_deletes {
Some(deleted_bitset)
} else {
None
}
});
let segment_entry: SegmentEntry;
if delete_cursor.get().is_some() {
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
&doc_to_opstamps,
last_docstamp,
)?;
segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
if may_have_deletes {
Some(deleted_bitset)
} else {
None
}
});
} else {
// If there are no delete operations in the queue, there is no need
// to even open the segment.
segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
}
Ok(segment_updater.add_segment(generation, segment_entry))
}
@@ -858,4 +864,33 @@ mod tests {
assert_eq!(initial_table_size(1_000_000_000), 19);
}
/// Checks that a commit which fails inside `RAMDirectory::atomic_write`
/// returns an error and leaves only the previously committed documents
/// visible to searchers.
///
/// Only compiled when the `no_fail` feature is off, because it drives the
/// `RAMDirectory::atomic_write` fail point.
#[cfg(not(feature="no_fail"))]
#[test]
fn test_write_commit_fails() {
    use fail;

    let (schema, text_field) = {
        let mut builder = schema::SchemaBuilder::default();
        let field = builder.add_text_field("text", schema::TEXT);
        (builder.build(), field)
    };
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();

    // First batch: committed while the fail point is not configured.
    for _ in 0..100 {
        index_writer.add_document(doc!(text_field => "a"));
    }
    index_writer.commit().unwrap();

    // Second batch: the fail point makes `atomic_write` return an error,
    // so the commit is expected to fail.
    fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
    for _ in 0..100 {
        index_writer.add_document(doc!(text_field => "b"));
    }
    assert!(index_writer.commit().is_err());

    index.load_searchers().unwrap();
    // Document frequency of `text` in the `text` field, as seen by a fresh searcher.
    let doc_freq_of = |text: &str| {
        let term = Term::from_field_text(text_field, text);
        let searcher = index.searcher();
        searcher.doc_freq(&term)
    };
    assert_eq!(doc_freq_of("a"), 100);
    assert_eq!(doc_freq_of("b"), 0);

    // Switch the fail point back off.
    fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
}
}

View File

@@ -173,6 +173,9 @@ extern crate tinysegmenter;
#[macro_use]
extern crate downcast;
#[macro_use]
extern crate fail;
#[cfg(test)]
mod functional_test;
@@ -946,3 +949,4 @@ mod tests {
}
}
}

View File

@@ -20,6 +20,7 @@ use std::str::FromStr;
use tokenizer::TokenizerManager;
use combine::Parser;
use query::EmptyQuery;
use query::query_parser::logical_ast::LogicalAST;
/// Possible error that may happen when parsing a query.
@@ -58,6 +59,27 @@ impl From<ParseIntError> for QueryParserError {
}
}
/// Recursively prunes empty clauses from a logical AST.
///
/// A `Clause` is rebuilt from the children that survive pruning, each one
/// keeping its original `Occur`. Any other node is returned untouched.
///
/// Returns `None` iff `logical_ast` ended up being empty, i.e. it was a
/// clause with no surviving child.
fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
    let children = match logical_ast {
        LogicalAST::Clause(children) => children,
        // Non-clause nodes are never trimmed.
        other => return Some(other),
    };

    let mut kept = Vec::new();
    for (occur, child) in children {
        if let Some(trimmed_child) = trim_ast(child) {
            kept.push((occur, trimmed_child));
        }
    }

    if kept.is_empty() {
        None
    } else {
        Some(LogicalAST::Clause(kept))
    }
}
/// Tantivy's Query parser
///
/// The language covered by the current parser is extremely simple.
@@ -369,14 +391,15 @@ impl QueryParser {
asts.push(LogicalAST::Leaf(Box::new(ast)));
}
}
let result_ast = if asts.is_empty() {
// this should never happen
return Err(QueryParserError::SyntaxError);
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
};
let result_ast: LogicalAST =
if asts.len() == 1 {
asts.into_iter().next().unwrap()
} else {
LogicalAST::Clause(
asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
Ok(result_ast)
}
UserInputLeaf::All => {
@@ -429,19 +452,17 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
}
fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
match logical_ast {
LogicalAST::Clause(clause) => {
if clause.is_empty() {
Box::new(EmptyQuery)
} else {
let occur_subqueries = clause
.into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
.collect::<Vec<_>>();
Box::new(BooleanQuery::from(occur_subqueries))
}
}
LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal),
match trim_ast(logical_ast) {
Some(LogicalAST::Clause(trimmed_clause)) => {
let occur_subqueries = trimmed_clause
.into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
.collect::<Vec<_>>();
assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming");
Box::new(BooleanQuery::from(occur_subqueries))
},
Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal),
None => Box::new(EmptyQuery)
}
}
@@ -454,12 +475,17 @@ mod test {
use schema::Field;
use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT};
use tokenizer::SimpleTokenizer;
use tokenizer::TokenizerManager;
use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager};
use Index;
fn make_query_parser() -> QueryParser {
let mut schema_builder = SchemaBuilder::default();
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("en_with_stop_words")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
let title = schema_builder.add_text_field("title", TEXT);
let text = schema_builder.add_text_field("text", TEXT);
schema_builder.add_i64_field("signed", INT_INDEXED);
@@ -468,9 +494,14 @@ mod test {
schema_builder.add_text_field("notindexed_u64", STORED);
schema_builder.add_text_field("notindexed_i64", STORED);
schema_builder.add_text_field("nottokenized", STRING);
schema_builder.add_text_field("with_stop_words", text_options);
let schema = schema_builder.build();
let default_fields = vec![title, text];
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register("en_with_stop_words", SimpleTokenizer
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec!["the".to_string()]))
);
QueryParser::new(schema, default_fields, tokenizer_manager)
}
@@ -739,6 +770,13 @@ mod test {
);
}
/// Non-empty queries that presumably yield no token (punctuation only, or
/// only a stop word of the `with_stop_words` field) must still parse.
#[test]
pub fn test_query_parser_not_empty_but_no_tokens() {
    let parser = make_query_parser();
    for &query in [" !, ", "with_stop_words:the"].iter() {
        assert!(parser.parse_query(query).is_ok());
    }
}
#[test]
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper(

View File

@@ -1,4 +1,5 @@
use super::{Token, TokenFilter, TokenStream};
use std::mem;
/// Token filter that lowercase terms.
#[derive(Clone)]
@@ -15,13 +16,22 @@ where
}
}
pub struct LowerCaserTokenStream<TailTokenStream>
where
TailTokenStream: TokenStream,
{
pub struct LowerCaserTokenStream<TailTokenStream> {
buffer: String,
tail: TailTokenStream,
}
/// Writes a lowercased version of `text` into `output`.
///
/// Any previous content of `output` is discarded, so the same buffer can be
/// reused across calls. `text` is only read.
///
/// Each character is lowercased on its own. Contrary to the std, the sigma
/// special case is therefore not handled. This has a normalization effect,
/// which is ok for search.
fn to_lowercase_unicode(text: &str, output: &mut String) {
    output.clear();
    // A single char may lowercase to several chars, hence `flat_map`.
    output.extend(text.chars().flat_map(char::to_lowercase));
}
impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
where
TailTokenStream: TokenStream,
@@ -36,7 +46,14 @@ where
fn advance(&mut self) -> bool {
if self.tail.advance() {
self.tail.token_mut().text.make_ascii_lowercase();
if self.token_mut().text.is_ascii() {
// fast track for ascii.
self.token_mut().text.make_ascii_lowercase();
} else {
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
}
true
} else {
false
@@ -49,6 +66,43 @@ where
TailTokenStream: TokenStream,
{
fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
LowerCaserTokenStream { tail }
LowerCaserTokenStream {
tail,
buffer: String::with_capacity(100)
}
}
}
#[cfg(test)]
mod tests {
    use tokenizer::LowerCaser;
    use tokenizer::SimpleTokenizer;
    use tokenizer::TokenStream;
    use tokenizer::Tokenizer;

    /// Runs `text` through `SimpleTokenizer` followed by `LowerCaser` and
    /// returns the text of every emitted token, in order.
    fn lowercase_helper(text: &str) -> Vec<String> {
        let mut stream = SimpleTokenizer.filter(LowerCaser).token_stream(text);
        let mut lowercased = Vec::new();
        while stream.advance() {
            lowercased.push(stream.token().text.clone());
        }
        lowercased
    }

    /// Non-ASCII text made of several tokens is lowercased token by token.
    #[test]
    fn test_to_lower_case() {
        let expected = vec!["русский".to_string(), "текст".to_string()];
        assert_eq!(lowercase_helper("Русский текст"), expected);
    }

    /// Both ASCII and non-ASCII single tokens are lowercased.
    #[test]
    fn test_lowercaser() {
        let ascii_expected = vec!["tree".to_string()];
        assert_eq!(lowercase_helper("Tree"), ascii_expected);

        let unicode_expected = vec!["русский".to_string()];
        assert_eq!(lowercase_helper("Русский"), unicode_expected);
    }
}