Merge branch 'master' of github.com:tantivy-search/tantivy

2026-06-02 16:40:43 +00:00 · 2018-09-07 08:44:14 +09:00
parent 1d439e96f5 934933582e
commit 23e97da9f6
14 changed files with 241 additions and 96 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,6 +48,7 @@ census = "0.1"
 fnv = "1.0.6"
 owned-read = "0.4"
 failure = "0.1"
+fail = "0.2"

 [target.'cfg(windows)'.dependencies]
 winapi = "0.2"
@@ -60,12 +61,20 @@ opt-level = 3
 debug = false
 lto = true
 debug-assertions = false
+overflow-checks = false
+
+[profile.test]
+debug-assertions = true
+overflow-checks = true

 [features]
-default = ["mmap"]
+# `no_fail` is part of the default features, so fail points are disabled by default. It is dropped when running the tests.
+default = ["mmap", "no_fail"]
 mmap = ["fst/mmap", "atomicwrites"]
 lz4-compression = ["lz4"]
+no_fail = ["fail/no_fail"]

 [badges]
 travis-ci = { repository = "tantivy-search/tantivy" }

+
--- a/README.md
+++ b/README.md
@@ -78,6 +78,10 @@ To check out and run tests, you can simply run :
    cd tantivy
    cargo build

+## Running tests
+
+Some tests are skipped by a plain `cargo test` because they rely on `fail-rs` fail points, which are disabled by default.
+To run the tests exhaustively, run `./run-tests.sh`.

 # Contribute

--- a/appveyor.yml
+++ b/appveyor.yml
@@ -18,5 +18,5 @@ install:
 build: false

 test_script:
-  - REM SET RUST_LOG=tantivy,test & cargo test --verbose
-  - REM SET RUST_BACKTRACE=1 & cargo build --examples
+  - REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1
+  - REM SET RUST_BACKTRACE=1 & cargo build --examples
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -16,7 +16,7 @@ main() {
            return
        fi
        echo "Test"
-        cross test --target $TARGET
+        cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1
    fi
    for example in $(ls examples/*.rs)
    do
--- a/examples/stop_words.rs
+++ b/examples/stop_words.rs
@@ -23,7 +23,6 @@ use tantivy::Index;

 fn main() -> tantivy::Result<()> {
  // this example assumes you understand the content in `basic_search`
-  let index_path = TempDir::new("tantivy_stopwords_example_dir")?;
  let mut schema_builder = SchemaBuilder::default();

  // This configures your custom options for how tantivy will
@@ -31,36 +30,36 @@ fn main() -> tantivy::Result<()> {
  // to note is that we are setting the tokenizer to `stoppy`
  // which will be defined and registered below.
  let text_field_indexing = TextFieldIndexing::default()
-    .set_tokenizer("stoppy")
-    .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+      .set_tokenizer("stoppy")
+      .set_index_option(IndexRecordOption::WithFreqsAndPositions);
  let text_options = TextOptions::default()
-    .set_indexing_options(text_field_indexing)
-    .set_stored();
+      .set_indexing_options(text_field_indexing)
+      .set_stored();

  // Our first field is title.
  schema_builder.add_text_field("title", text_options);

  // Our second field is body.
  let text_field_indexing = TextFieldIndexing::default()
-    .set_tokenizer("stoppy")
-    .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+      .set_tokenizer("stoppy")
+      .set_index_option(IndexRecordOption::WithFreqsAndPositions);
  let text_options = TextOptions::default()
-    .set_indexing_options(text_field_indexing)
-    .set_stored();
+      .set_indexing_options(text_field_indexing)
+      .set_stored();
  schema_builder.add_text_field("body", text_options);

  let schema = schema_builder.build();

-  let index = Index::create_in_dir(&index_path, schema.clone())?;
+  let index = Index::create_in_ram(schema.clone());

  // This tokenizer lowers all of the text (to help with stop word matching)
  // then removes all instances of `the` and `and` from the corpus
  let tokenizer = SimpleTokenizer
-    .filter(LowerCaser)
-    .filter(StopWordFilter::remove(vec![
-      "the".to_string(),
-      "and".to_string(),
-    ]));
+      .filter(LowerCaser)
+      .filter(StopWordFilter::remove(vec![
+        "the".to_string(),
+        "and".to_string(),
+      ]));

  index.tokenizers().register("stoppy", tokenizer);

@@ -76,16 +75,16 @@ fn main() -> tantivy::Result<()> {
  ));

  index_writer.add_document(doc!(
-        title => "Of Mice and Men",
-        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
-                bank and runs deep and green. The water is warm too, for it has slipped twinkling \
-                over the yellow sands in the sunlight before reaching the narrow pool. On one \
-                side of the river the golden foothill slopes curve up to the strong and rocky \
-                Gabilan Mountains, but on the valley side the water is lined with trees—willows \
-                fresh and green with every spring, carrying in their lower leaf junctures the \
-                debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
-                limbs and branches that arch over the pool"
-    ));
+      title => "Of Mice and Men",
+      body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
+              bank and runs deep and green. The water is warm too, for it has slipped twinkling \
+              over the yellow sands in the sunlight before reaching the narrow pool. On one \
+              side of the river the golden foothill slopes curve up to the strong and rocky \
+              Gabilan Mountains, but on the valley side the water is lined with trees—willows \
+              fresh and green with every spring, carrying in their lower leaf junctures the \
+              debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
+              limbs and branches that arch over the pool"
+  ));

  index_writer.add_document(doc!(
       title => "Frankenstein",
@@ -103,14 +102,9 @@ fn main() -> tantivy::Result<()> {

  let query_parser = QueryParser::for_index(&index, vec![title, body]);

-  // this will have NO hits because it was filtered out
-  // because the query is run through the analyzer you
-  // actually will get an error here because the query becomes
-  // empty
-  assert!(query_parser.parse_query("the").is_err());
-
-  // this will have hits
-  let query = query_parser.parse_query("is")?;
+  // stop words are applied on the query as well.
+  // The following will be equivalent to `title:frankenstein`
+  let query = query_parser.parse_query("title:\"the Frankenstein\"")?;

  let mut top_collector = TopCollector::with_limit(10);

@@ -124,6 +118,4 @@ fn main() -> tantivy::Result<()> {
  }

  Ok(())
-}
-
-use tempdir::TempDir;
+}
--- a/run-tests.sh
+++ b/run-tests.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+cargo test --no-default-features --features mmap -- --test-threads 1
--- a/src/common/bitset.rs
+++ b/src/common/bitset.rs
@@ -266,14 +266,14 @@ mod tests {

    #[test]
    fn test_bitset_large() {
-        let arr = generate_nonunique_unsorted(1_000_000, 50_000);
+        let arr = generate_nonunique_unsorted(100_000, 5_000);
        let mut btreeset: BTreeSet<u32> = BTreeSet::new();
-        let mut bitset = BitSet::with_max_value(1_000_000);
+        let mut bitset = BitSet::with_max_value(100_000);
        for el in arr {
            btreeset.insert(el);
            bitset.insert(el);
        }
-        for i in 0..1_000_000 {
+        for i in 0..100_000 {
            assert_eq!(btreeset.contains(&i), bitset.contains(i));
        }
        assert_eq!(btreeset.len(), bitset.len());
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -4,7 +4,6 @@ use core::InvertedIndexReader;
 use core::Segment;
 use core::SegmentComponent;
 use core::SegmentId;
-use core::SegmentMeta;
 use error::TantivyError;
 use fastfield::DeleteBitSet;
 use fastfield::FacetReader;
@@ -44,7 +43,8 @@ pub struct SegmentReader {
    inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,

    segment_id: SegmentId,
-    segment_meta: SegmentMeta,
+    max_doc: DocId,
+    num_docs: DocId,

    termdict_composite: CompositeFile,
    postings_composite: CompositeFile,
@@ -64,7 +64,7 @@ impl SegmentReader {
    /// Today, `tantivy` does not handle deletes, so it happens
    /// to also be the number of documents in the index.
    pub fn max_doc(&self) -> DocId {
-        self.segment_meta.max_doc()
+        self.max_doc
    }

    /// Returns the number of documents.
@@ -73,7 +73,7 @@ impl SegmentReader {
    /// Today, `tantivy` does not handle deletes so max doc and
    /// num_docs are the same.
    pub fn num_docs(&self) -> DocId {
-        self.segment_meta.num_docs()
+        self.num_docs
    }

    /// Returns the schema of the index this segment belongs to.
@@ -225,6 +225,8 @@ impl SegmentReader {
        let store_source = segment.open_read(SegmentComponent::STORE)?;
        let store_reader = StoreReader::from_source(store_source);

+        fail_point!("SegmentReader::open#middle");
+
        let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
        let postings_composite = CompositeFile::open(&postings_source)?;

@@ -260,7 +262,8 @@ impl SegmentReader {
        let schema = segment.schema();
        Ok(SegmentReader {
            inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
-            segment_meta: segment.meta().clone(),
+            max_doc: segment.meta().max_doc(),
+            num_docs: segment.meta().num_docs(),
            termdict_composite,
            postings_composite,
            fast_fields_composite,
@@ -432,6 +435,7 @@ mod test {
    use schema::{SchemaBuilder, Term, STORED, TEXT};
    use DocId;

+
    #[test]
    fn test_alive_docs_iterator() {
        let mut schema_builder = SchemaBuilder::new();
--- a/src/directory/ram_directory.rs
+++ b/src/directory/ram_directory.rs
@@ -195,6 +195,9 @@ impl Directory for RAMDirectory {
    }

    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
+        fail_point!("RAMDirectory::atomic_write", |msg| {
+            Err(io::Error::new(io::ErrorKind::Other, msg.unwrap_or("Undefined".to_string())))
+        });
        let path_buf = PathBuf::from(path);
        let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
        self.fs.write(path_buf, &Vec::new())?;
--- a/src/fastfield/mod.rs
+++ b/src/fastfield/mod.rs
@@ -370,7 +370,7 @@ mod tests {
    pub fn generate_permutation() -> Vec<u64> {
        let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
        let mut rng = XorShiftRng::from_seed(seed);
-        let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
+        let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
        rng.shuffle(&mut permutation);
        permutation
    }
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -301,25 +301,31 @@ fn index_documents(

    let last_docstamp: u64 = *(doc_opstamps.last().unwrap());

-    let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
-    let segment_reader = SegmentReader::open(segment)?;
-    let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
-    let may_have_deletes = compute_deleted_bitset(
-        &mut deleted_bitset,
-        &segment_reader,
-        &mut delete_cursor,
-        &doc_to_opstamps,
-        last_docstamp,
-    )?;
-
-    let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
-        if may_have_deletes {
-            Some(deleted_bitset)
-        } else {
-            None
-        }
-    });
+    let segment_entry: SegmentEntry;

+    if delete_cursor.get().is_some() {
+        let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
+        let segment_reader = SegmentReader::open(segment)?;
+        let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
+        let may_have_deletes = compute_deleted_bitset(
+            &mut deleted_bitset,
+            &segment_reader,
+            &mut delete_cursor,
+            &doc_to_opstamps,
+            last_docstamp,
+        )?;
+        segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
+            if may_have_deletes {
+                Some(deleted_bitset)
+            } else {
+                None
+            }
+        });
+    } else {
+        // if there is no delete operation in the queue, there is no need
+        // to even open the segment.
+        segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
+    }
    Ok(segment_updater.add_segment(generation, segment_entry))
 }

@@ -858,4 +864,33 @@ mod tests {
        assert_eq!(initial_table_size(1_000_000_000), 19);
    }

+
+    #[cfg(not(feature="no_fail"))]
+    #[test]
+    fn test_write_commit_fails() {
+        use fail;
+        let mut schema_builder = schema::SchemaBuilder::default();
+        let text_field = schema_builder.add_text_field("text", schema::TEXT);
+        let index = Index::create_in_ram(schema_builder.build());
+
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        for _    in 0..100 {
+            index_writer.add_document(doc!(text_field => "a"));
+        }
+        index_writer.commit().unwrap();
+        fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
+        for _ in 0..100 {
+            index_writer.add_document(doc!(text_field => "b"));
+        }
+        assert!(index_writer.commit().is_err());
+        index.load_searchers().unwrap();
+        let num_docs_containing = |s: &str| {
+            let searcher = index.searcher();
+            let term_a = Term::from_field_text(text_field, s);
+            searcher.doc_freq(&term_a)
+        };
+        assert_eq!(num_docs_containing("a"), 100);
+        assert_eq!(num_docs_containing("b"), 0);
+        fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
+    }
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -173,6 +173,9 @@ extern crate tinysegmenter;
 #[macro_use]
 extern crate downcast;

+#[macro_use]
+extern crate fail;
+
 #[cfg(test)]
 mod functional_test;

@@ -946,3 +949,4 @@ mod tests {
        }
    }
 }
+
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -20,6 +20,7 @@ use std::str::FromStr;
 use tokenizer::TokenizerManager;
 use combine::Parser;
 use query::EmptyQuery;
+use query::query_parser::logical_ast::LogicalAST;


 /// Possible error that may happen when parsing a query.
@@ -58,6 +59,27 @@ impl From<ParseIntError> for QueryParserError {
    }
 }

+
+/// Recursively removes empty clauses from the AST.
+///
+/// Returns `None` iff the `logical_ast` ended up being empty.
+fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
+    match logical_ast {
+        LogicalAST::Clause(children) => {
+            let trimmed_children = children.into_iter()
+                .flat_map(|(occur, child)|
+                    trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) )
+                .collect::<Vec<_>>();
+            if trimmed_children.is_empty() {
+                None
+            } else {
+                Some(LogicalAST::Clause(trimmed_children))
+            }
+        },
+        _ => Some(logical_ast),
+    }
+}
+
 /// Tantivy's Query parser
 ///
 /// The language covered by the current parser is extremely simple.
@@ -369,14 +391,15 @@ impl QueryParser {
                        asts.push(LogicalAST::Leaf(Box::new(ast)));
                    }
                }
-                let result_ast = if asts.is_empty() {
-                    // this should never happen
-                    return Err(QueryParserError::SyntaxError);
-                } else if asts.len() == 1 {
-                    asts[0].clone()
-                } else {
-                    LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
-                };
+                let result_ast: LogicalAST =
+                    if asts.len() == 1 {
+                        asts.into_iter().next().unwrap()
+                    } else {
+                        LogicalAST::Clause(
+                            asts.into_iter()
+                                .map(|ast| (Occur::Should, ast))
+                                .collect())
+                    };
                Ok(result_ast)
            }
            UserInputLeaf::All => {
@@ -429,19 +452,17 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
 }

 fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
-    match logical_ast {
-        LogicalAST::Clause(clause) => {
-            if clause.is_empty() {
-                Box::new(EmptyQuery)
-            } else {
-                let occur_subqueries = clause
-                    .into_iter()
-                    .map(|(occur, subquery)| (occur, convert_to_query(subquery)))
-                    .collect::<Vec<_>>();
-                Box::new(BooleanQuery::from(occur_subqueries))
-            }
-        }
-        LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal),
+    match trim_ast(logical_ast) {
+        Some(LogicalAST::Clause(trimmed_clause)) => {
+            let occur_subqueries = trimmed_clause
+                .into_iter()
+                .map(|(occur, subquery)| (occur, convert_to_query(subquery)))
+                .collect::<Vec<_>>();
+            assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming");
+            Box::new(BooleanQuery::from(occur_subqueries))
+        },
+        Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal),
+        None => Box::new(EmptyQuery)
    }
 }

@@ -454,12 +475,17 @@ mod test {
    use schema::Field;
    use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
    use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT};
-    use tokenizer::SimpleTokenizer;
-    use tokenizer::TokenizerManager;
+    use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager};
    use Index;

    fn make_query_parser() -> QueryParser {
        let mut schema_builder = SchemaBuilder::default();
+        let text_field_indexing = TextFieldIndexing::default()
+            .set_tokenizer("en_with_stop_words")
+            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+        let text_options = TextOptions::default()
+            .set_indexing_options(text_field_indexing)
+            .set_stored();
        let title = schema_builder.add_text_field("title", TEXT);
        let text = schema_builder.add_text_field("text", TEXT);
        schema_builder.add_i64_field("signed", INT_INDEXED);
@@ -468,9 +494,14 @@ mod test {
        schema_builder.add_text_field("notindexed_u64", STORED);
        schema_builder.add_text_field("notindexed_i64", STORED);
        schema_builder.add_text_field("nottokenized", STRING);
+        schema_builder.add_text_field("with_stop_words", text_options);
        let schema = schema_builder.build();
        let default_fields = vec![title, text];
        let tokenizer_manager = TokenizerManager::default();
+        tokenizer_manager.register("en_with_stop_words", SimpleTokenizer
+            .filter(LowerCaser)
+            .filter(StopWordFilter::remove(vec!["the".to_string()]))
+        );
        QueryParser::new(schema, default_fields, tokenizer_manager)
    }

@@ -739,6 +770,13 @@ mod test {
        );
    }

+    #[test]
+    pub fn test_query_parser_not_empty_but_no_tokens() {
+        let query_parser = make_query_parser();
+        assert!(query_parser.parse_query(" !, ").is_ok());
+        assert!(query_parser.parse_query("with_stop_words:the").is_ok());
+    }
+
    #[test]
    pub fn test_parse_query_to_ast_conjunction() {
        test_parse_query_to_logical_ast_helper(
--- a/src/tokenizer/lower_caser.rs
+++ b/src/tokenizer/lower_caser.rs
@@ -1,4 +1,5 @@
 use super::{Token, TokenFilter, TokenStream};
+use std::mem;

 /// Token filter that lowercase terms.
 #[derive(Clone)]
@@ -15,13 +16,22 @@ where
    }
 }

-pub struct LowerCaserTokenStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
+pub struct LowerCaserTokenStream<TailTokenStream> {
+    buffer: String,
    tail: TailTokenStream,
 }

+// writes a lowercased version of text into output.
+fn to_lowercase_unicode(text: &mut String, output: &mut String) {
+    output.clear();
+    for c in text.chars() {
+        // Contrary to the std, we do not handle the sigma special case.
+        // This has a normalization effect, which is ok for search.
+        output.extend(c.to_lowercase());
+    }
+}
+
+
 impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
 where
    TailTokenStream: TokenStream,
@@ -36,7 +46,14 @@ where

    fn advance(&mut self) -> bool {
        if self.tail.advance() {
-            self.tail.token_mut().text.make_ascii_lowercase();
+            if self.token_mut().text.is_ascii() {
+                // fast track for ascii.
+                self.token_mut().text.make_ascii_lowercase();
+            } else {
+                    to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
+
+                mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+            }
            true
        } else {
            false
@@ -49,6 +66,43 @@ where
    TailTokenStream: TokenStream,
 {
    fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
-        LowerCaserTokenStream { tail }
+        LowerCaserTokenStream {
+            tail,
+            buffer: String::with_capacity(100)
+        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use tokenizer::Tokenizer;
+    use tokenizer::LowerCaser;
+    use tokenizer::TokenStream;
+    use tokenizer::SimpleTokenizer;
+
+    #[test]
+    fn test_to_lower_case() {
+        assert_eq!(lowercase_helper("Русский текст"),
+                   vec!["русский".to_string(), "текст".to_string()]);
+    }
+
+    fn lowercase_helper(text: &str) -> Vec<String> {
+        let mut tokens = vec![];
+        let mut token_stream = SimpleTokenizer
+            .filter(LowerCaser)
+            .token_stream(text);
+        while token_stream.advance() {
+            let token_text = token_stream.token().text.clone();
+            tokens.push(token_text);
+        }
+        tokens
+     }
+
+
+    #[test]
+    fn test_lowercaser() {
+        assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
+        assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
+    }
+
+}