diff --git a/Cargo.toml b/Cargo.toml index ab767d3fd..098ab91c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ census = "0.1" fnv = "1.0.6" owned-read = "0.4" failure = "0.1" +fail = "0.2" [target.'cfg(windows)'.dependencies] winapi = "0.2" @@ -60,12 +61,20 @@ opt-level = 3 debug = false lto = true debug-assertions = false +overflow-checks = false + +[profile.test] +debug-assertions = true +overflow-checks = true [features] -default = ["mmap"] +# By default, fail points are disabled (the `no_fail` feature is on). We turn them on by dropping default features when running tests. +default = ["mmap", "no_fail"] mmap = ["fst/mmap", "atomicwrites"] lz4-compression = ["lz4"] +no_fail = ["fail/no_fail"] [badges] travis-ci = { repository = "tantivy-search/tantivy" } + diff --git a/README.md b/README.md index 499a12464..0ce522a7c 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,10 @@ To check out and run tests, you can simply run : cd tantivy cargo build +## Running tests + +Some tests will not run with just `cargo test` because of `fail-rs`. +To run the tests exhaustively, run `./run-tests.sh`. 
# Contribute diff --git a/appveyor.yml b/appveyor.yml index a3bd2ac04..685b04d3a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -18,5 +18,5 @@ install: build: false test_script: - - REM SET RUST_LOG=tantivy,test & cargo test --verbose - - REM SET RUST_BACKTRACE=1 & cargo build --examples \ No newline at end of file + - REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1 + - REM SET RUST_BACKTRACE=1 & cargo build --examples diff --git a/ci/script.sh b/ci/script.sh index b56345753..0939344b0 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -16,7 +16,7 @@ main() { return fi echo "Test" - cross test --target $TARGET + cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1 fi for example in $(ls examples/*.rs) do diff --git a/examples/stop_words.rs b/examples/stop_words.rs index 950a42afd..b131d876c 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -23,7 +23,6 @@ use tantivy::Index; fn main() -> tantivy::Result<()> { // this example assumes you understand the content in `basic_search` - let index_path = TempDir::new("tantivy_stopwords_example_dir")?; let mut schema_builder = SchemaBuilder::default(); // This configures your custom options for how tantivy will @@ -31,36 +30,36 @@ fn main() -> tantivy::Result<()> { // to note is that we are setting the tokenizer to `stoppy` // which will be defined and registered below. let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("stoppy") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) - .set_stored(); + .set_indexing_options(text_field_indexing) + .set_stored(); // Our first field is title. schema_builder.add_text_field("title", text_options); // Our second field is body. 
let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("stoppy") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) - .set_stored(); + .set_indexing_options(text_field_indexing) + .set_stored(); schema_builder.add_text_field("body", text_options); let schema = schema_builder.build(); - let index = Index::create_in_dir(&index_path, schema.clone())?; + let index = Index::create_in_ram(schema.clone()); // This tokenizer lowers all of the text (to help with stop word matching) // then removes all instances of `the` and `and` from the corpus let tokenizer = SimpleTokenizer - .filter(LowerCaser) - .filter(StopWordFilter::remove(vec![ - "the".to_string(), - "and".to_string(), - ])); + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec![ + "the".to_string(), + "and".to_string(), + ])); index.tokenizers().register("stoppy", tokenizer); @@ -76,16 +75,16 @@ fn main() -> tantivy::Result<()> { )); index_writer.add_document(doc!( - title => "Of Mice and Men", - body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ - bank and runs deep and green. The water is warm too, for it has slipped twinkling \ - over the yellow sands in the sunlight before reaching the narrow pool. On one \ - side of the river the golden foothill slopes curve up to the strong and rocky \ - Gabilan Mountains, but on the valley side the water is lined with trees—willows \ - fresh and green with every spring, carrying in their lower leaf junctures the \ - debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ - limbs and branches that arch over the pool" - )); + title => "Of Mice and Men", + body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ + bank and runs deep and green. 
The water is warm too, for it has slipped twinkling \ + over the yellow sands in the sunlight before reaching the narrow pool. On one \ + side of the river the golden foothill slopes curve up to the strong and rocky \ + Gabilan Mountains, but on the valley side the water is lined with trees—willows \ + fresh and green with every spring, carrying in their lower leaf junctures the \ + debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ + limbs and branches that arch over the pool" + )); index_writer.add_document(doc!( title => "Frankenstein", @@ -103,14 +102,9 @@ fn main() -> tantivy::Result<()> { let query_parser = QueryParser::for_index(&index, vec![title, body]); - // this will have NO hits because it was filtered out - // because the query is run through the analyzer you - // actually will get an error here because the query becomes - // empty - assert!(query_parser.parse_query("the").is_err()); - - // this will have hits - let query = query_parser.parse_query("is")?; + // stop words are applied on the query as well. 
+ // The following will be equivalent to `title:frankenstein` + let query = query_parser.parse_query("title:\"the Frankenstein\"")?; let mut top_collector = TopCollector::with_limit(10); @@ -124,6 +118,4 @@ fn main() -> tantivy::Result<()> { } Ok(()) -} - -use tempdir::TempDir; +} \ No newline at end of file diff --git a/run-tests.sh b/run-tests.sh new file mode 100755 index 000000000..fc2944dd5 --- /dev/null +++ b/run-tests.sh @@ -0,0 +1,2 @@ +#!/bin/bash +cargo test --no-default-features --features mmap -- --test-threads 1 diff --git a/src/common/bitset.rs b/src/common/bitset.rs index 73f03c4f5..326e7cee8 100644 --- a/src/common/bitset.rs +++ b/src/common/bitset.rs @@ -266,14 +266,14 @@ mod tests { #[test] fn test_bitset_large() { - let arr = generate_nonunique_unsorted(1_000_000, 50_000); + let arr = generate_nonunique_unsorted(100_000, 5_000); let mut btreeset: BTreeSet = BTreeSet::new(); - let mut bitset = BitSet::with_max_value(1_000_000); + let mut bitset = BitSet::with_max_value(100_000); for el in arr { btreeset.insert(el); bitset.insert(el); } - for i in 0..1_000_000 { + for i in 0..100_000 { assert_eq!(btreeset.contains(&i), bitset.contains(i)); } assert_eq!(btreeset.len(), bitset.len()); diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 37b950332..56a3a7b9e 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -4,7 +4,6 @@ use core::InvertedIndexReader; use core::Segment; use core::SegmentComponent; use core::SegmentId; -use core::SegmentMeta; use error::TantivyError; use fastfield::DeleteBitSet; use fastfield::FacetReader; @@ -44,7 +43,8 @@ pub struct SegmentReader { inv_idx_reader_cache: Arc>>>, segment_id: SegmentId, - segment_meta: SegmentMeta, + max_doc: DocId, + num_docs: DocId, termdict_composite: CompositeFile, postings_composite: CompositeFile, @@ -64,7 +64,7 @@ impl SegmentReader { /// Today, `tantivy` does not handle deletes, so it happens /// to also be the number of documents in the index. 
pub fn max_doc(&self) -> DocId { - self.segment_meta.max_doc() + self.max_doc } /// Returns the number of documents. @@ -73,7 +73,7 @@ impl SegmentReader { /// Today, `tantivy` does not handle deletes so max doc and /// num_docs are the same. pub fn num_docs(&self) -> DocId { - self.segment_meta.num_docs() + self.num_docs } /// Returns the schema of the index this segment belongs to. @@ -225,6 +225,8 @@ impl SegmentReader { let store_source = segment.open_read(SegmentComponent::STORE)?; let store_reader = StoreReader::from_source(store_source); + fail_point!("SegmentReader::open#middle"); + let postings_source = segment.open_read(SegmentComponent::POSTINGS)?; let postings_composite = CompositeFile::open(&postings_source)?; @@ -260,7 +262,8 @@ impl SegmentReader { let schema = segment.schema(); Ok(SegmentReader { inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())), - segment_meta: segment.meta().clone(), + max_doc: segment.meta().max_doc(), + num_docs: segment.meta().num_docs(), termdict_composite, postings_composite, fast_fields_composite, @@ -432,6 +435,7 @@ mod test { use schema::{SchemaBuilder, Term, STORED, TEXT}; use DocId; + #[test] fn test_alive_docs_iterator() { let mut schema_builder = SchemaBuilder::new(); diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 4e55da537..1b40970b4 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -195,6 +195,9 @@ impl Directory for RAMDirectory { } fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { + fail_point!("RAMDirectory::atomic_write", |msg| { + Err(io::Error::new(io::ErrorKind::Other, msg.unwrap_or("Undefined".to_string()))) + }); let path_buf = PathBuf::from(path); let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); self.fs.write(path_buf, &Vec::new())?; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index e3599bacf..fdb029432 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ 
-370,7 +370,7 @@ mod tests { pub fn generate_permutation() -> Vec { let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; let mut rng = XorShiftRng::from_seed(seed); - let mut permutation: Vec = (0u64..1_000_000u64).collect(); + let mut permutation: Vec = (0u64..100_000u64).collect(); rng.shuffle(&mut permutation); permutation } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 5af4ed694..3e11c4ce5 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -301,25 +301,31 @@ fn index_documents( let last_docstamp: u64 = *(doc_opstamps.last().unwrap()); - let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); - let segment_reader = SegmentReader::open(segment)?; - let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); - let may_have_deletes = compute_deleted_bitset( - &mut deleted_bitset, - &segment_reader, - &mut delete_cursor, - &doc_to_opstamps, - last_docstamp, - )?; - - let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, { - if may_have_deletes { - Some(deleted_bitset) - } else { - None - } - }); + let segment_entry: SegmentEntry; + if delete_cursor.get().is_some() { + let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); + let segment_reader = SegmentReader::open(segment)?; + let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); + let may_have_deletes = compute_deleted_bitset( + &mut deleted_bitset, + &segment_reader, + &mut delete_cursor, + &doc_to_opstamps, + last_docstamp, + )?; + segment_entry = SegmentEntry::new(segment_meta, delete_cursor, { + if may_have_deletes { + Some(deleted_bitset) + } else { + None + } + }); + } else { + // if there are no delete operations in the queue, there is no need + // to even open the segment. 
+ segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None); + } Ok(segment_updater.add_segment(generation, segment_entry)) } @@ -858,4 +864,33 @@ mod tests { assert_eq!(initial_table_size(1_000_000_000), 19); } + + #[cfg(not(feature="no_fail"))] + #[test] + fn test_write_commit_fails() { + use fail; + let mut schema_builder = schema::SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", schema::TEXT); + let index = Index::create_in_ram(schema_builder.build()); + + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + for _ in 0..100 { + index_writer.add_document(doc!(text_field => "a")); + } + index_writer.commit().unwrap(); + fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap(); + for _ in 0..100 { + index_writer.add_document(doc!(text_field => "b")); + } + assert!(index_writer.commit().is_err()); + index.load_searchers().unwrap(); + let num_docs_containing = |s: &str| { + let searcher = index.searcher(); + let term_a = Term::from_field_text(text_field, s); + searcher.doc_freq(&term_a) + }; + assert_eq!(num_docs_containing("a"), 100); + assert_eq!(num_docs_containing("b"), 0); + fail::cfg("RAMDirectory::atomic_write", "off").unwrap(); + } } diff --git a/src/lib.rs b/src/lib.rs index 985d68a84..e5a75cd64 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -173,6 +173,9 @@ extern crate tinysegmenter; #[macro_use] extern crate downcast; +#[macro_use] +extern crate fail; + #[cfg(test)] mod functional_test; @@ -946,3 +949,4 @@ mod tests { } } } + diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index f3a9f37c0..93deb48c1 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -20,6 +20,7 @@ use std::str::FromStr; use tokenizer::TokenizerManager; use combine::Parser; use query::EmptyQuery; +use query::query_parser::logical_ast::LogicalAST; /// Possible error that may happen when parsing a query. 
@@ -58,6 +59,27 @@ impl From for QueryParserError { } } + +/// Recursively removes empty clauses from the AST. +/// +/// Returns `None` iff the `logical_ast` ended up being empty. +fn trim_ast(logical_ast: LogicalAST) -> Option { + match logical_ast { + LogicalAST::Clause(children) => { + let trimmed_children = children.into_iter() + .flat_map(|(occur, child)| + trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) ) + .collect::>(); + if trimmed_children.is_empty() { + None + } else { + Some(LogicalAST::Clause(trimmed_children)) + } + }, + _ => Some(logical_ast), + } +} + /// Tantivy's Query parser /// /// The language covered by the current parser is extremely simple. @@ -369,14 +391,15 @@ impl QueryParser { asts.push(LogicalAST::Leaf(Box::new(ast))); } } - let result_ast = if asts.is_empty() { - // this should never happen - return Err(QueryParserError::SyntaxError); - } else if asts.len() == 1 { - asts[0].clone() - } else { - LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect()) - }; + let result_ast: LogicalAST = + if asts.len() == 1 { + asts.into_iter().next().unwrap() + } else { + LogicalAST::Clause( + asts.into_iter() + .map(|ast| (Occur::Should, ast)) + .collect()) + }; Ok(result_ast) } UserInputLeaf::All => { @@ -429,19 +452,17 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box { } fn convert_to_query(logical_ast: LogicalAST) -> Box { - match logical_ast { - LogicalAST::Clause(clause) => { - if clause.is_empty() { - Box::new(EmptyQuery) - } else { - let occur_subqueries = clause - .into_iter() - .map(|(occur, subquery)| (occur, convert_to_query(subquery))) - .collect::>(); - Box::new(BooleanQuery::from(occur_subqueries)) - } - } - LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal), + match trim_ast(logical_ast) { + Some(LogicalAST::Clause(trimmed_clause)) => { + let occur_subqueries = trimmed_clause + .into_iter() + .map(|(occur, subquery)| (occur, convert_to_query(subquery))) + 
.collect::>(); + assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming"); + Box::new(BooleanQuery::from(occur_subqueries)) + }, + Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal), + None => Box::new(EmptyQuery) } } @@ -454,12 +475,17 @@ mod test { use schema::Field; use schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT}; - use tokenizer::SimpleTokenizer; - use tokenizer::TokenizerManager; + use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager}; use Index; fn make_query_parser() -> QueryParser { let mut schema_builder = SchemaBuilder::default(); + let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("en_with_stop_words") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let text_options = TextOptions::default() + .set_indexing_options(text_field_indexing) + .set_stored(); let title = schema_builder.add_text_field("title", TEXT); let text = schema_builder.add_text_field("text", TEXT); schema_builder.add_i64_field("signed", INT_INDEXED); @@ -468,9 +494,14 @@ mod test { schema_builder.add_text_field("notindexed_u64", STORED); schema_builder.add_text_field("notindexed_i64", STORED); schema_builder.add_text_field("nottokenized", STRING); + schema_builder.add_text_field("with_stop_words", text_options); let schema = schema_builder.build(); let default_fields = vec![title, text]; let tokenizer_manager = TokenizerManager::default(); + tokenizer_manager.register("en_with_stop_words", SimpleTokenizer + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec!["the".to_string()])) + ); QueryParser::new(schema, default_fields, tokenizer_manager) } @@ -739,6 +770,13 @@ mod test { ); } + #[test] + pub fn test_query_parser_not_empty_but_no_tokens() { + let query_parser = make_query_parser(); + assert!(query_parser.parse_query(" !, ").is_ok()); + 
assert!(query_parser.parse_query("with_stop_words:the").is_ok()); + } + #[test] pub fn test_parse_query_to_ast_conjunction() { test_parse_query_to_logical_ast_helper( diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index ebade3978..578678a4a 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -1,4 +1,5 @@ use super::{Token, TokenFilter, TokenStream}; +use std::mem; /// Token filter that lowercase terms. #[derive(Clone)] @@ -15,13 +16,22 @@ where } } -pub struct LowerCaserTokenStream -where - TailTokenStream: TokenStream, -{ +pub struct LowerCaserTokenStream { + buffer: String, tail: TailTokenStream, } +// writes a lowercased version of text into output. +fn to_lowercase_unicode(text: &mut String, output: &mut String) { + output.clear(); + for c in text.chars() { + // Contrary to the std, we do not take care of sigma special case. + // This will have a normalization effect, which is ok for search. + output.extend(c.to_lowercase()); + } +} + + impl TokenStream for LowerCaserTokenStream where TailTokenStream: TokenStream, @@ -36,7 +46,14 @@ where fn advance(&mut self) -> bool { if self.tail.advance() { - self.tail.token_mut().text.make_ascii_lowercase(); + if self.token_mut().text.is_ascii() { + // fast track for ascii. 
+ self.token_mut().text.make_ascii_lowercase(); + } else { + to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer); + + mem::swap(&mut self.tail.token_mut().text, &mut self.buffer); + } true } else { false @@ -49,6 +66,43 @@ where TailTokenStream: TokenStream, { fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream { - LowerCaserTokenStream { tail } + LowerCaserTokenStream { + tail, + buffer: String::with_capacity(100) + } } } + +#[cfg(test)] +mod tests { + use tokenizer::Tokenizer; + use tokenizer::LowerCaser; + use tokenizer::TokenStream; + use tokenizer::SimpleTokenizer; + + #[test] + fn test_to_lower_case() { + assert_eq!(lowercase_helper("Русский текст"), + vec!["русский".to_string(), "текст".to_string()]); + } + + fn lowercase_helper(text: &str) -> Vec { + let mut tokens = vec![]; + let mut token_stream = SimpleTokenizer + .filter(LowerCaser) + .token_stream(text); + while token_stream.advance() { + let token_text = token_stream.token().text.clone(); + tokens.push(token_text); + } + tokens + } + + + #[test] + fn test_lowercaser() { + assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]); + assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]); + } + +} \ No newline at end of file