Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-08 18:12:55 +00:00

Merge branch 'issue/368' into issue/368b

Cargo.toml (11 lines changed)
@@ -49,6 +49,7 @@ fnv = "1.0.6"
 owned-read = "0.4"
 failure = "0.1"
 htmlescape = "0.3.1"
+fail = "0.2"

 [target.'cfg(windows)'.dependencies]
 winapi = "0.2"

@@ -61,12 +62,20 @@ opt-level = 3
 debug = false
 lto = true
 debug-assertions = false
 overflow-checks = false

+[profile.test]
+debug-assertions = true
+overflow-checks = true

 [features]
-default = ["mmap"]
+# by default no-fail is disabled. We manually enable it when running test.
+default = ["mmap", "no_fail"]
 mmap = ["fst/mmap", "atomicwrites"]
 lz4-compression = ["lz4"]
+no_fail = ["fail/no_fail"]

 [badges]
 travis-ci = { repository = "tantivy-search/tantivy" }
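The new `fail` dependency and the `no_fail` feature work together: fail points are compiled into the crate, and the `no_fail` default feature turns them back into no-ops for normal builds. A minimal sketch of the mechanism, assuming a crate with `fail = "0.2"` as a dependency; the function and fail-point names here are illustrative, and the macro usage mirrors the `fail_point!` calls added further down in this commit:

    #[macro_use]
    extern crate fail;

    use std::io;

    fn save_metas() -> io::Result<()> {
        // With the `no_fail` feature enabled (the default), this macro compiles
        // down to nothing. When fail points are compiled in, a test can arm it
        // with `fail::cfg("save_metas", "return(boom)")` to force an error here.
        fail_point!("save_metas", |msg| {
            Err(io::Error::new(
                io::ErrorKind::Other,
                msg.unwrap_or_else(|| "injected failure".to_string()),
            ))
        });
        Ok(())
    }

    fn main() {
        // Without any fail::cfg call, the fail point is a no-op and this succeeds.
        assert!(save_metas().is_ok());
    }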
@@ -4,6 +4,7 @@
 [](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 [](https://opensource.org/licenses/MIT)
 [](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
 [](https://saythanks.io/to/fulmicoton)

 

@@ -77,6 +78,10 @@ To check out and run tests, you can simply run :
     cd tantivy
     cargo build

+## Running tests
+
+Some tests will not run with just `cargo test` because of `fail-rs`.
+To run the tests exhaustively, run `./run-tests.sh`.
+
 # Contribute
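The tests that `cargo test` skips are the new fail-point tests: they are only compiled when the crate is built without the `no_fail` default feature, which is exactly what `./run-tests.sh` does. The commit-failure test added later in this commit is gated like this (body elided):

    #[cfg(not(feature = "no_fail"))]
    #[test]
    fn test_write_commit_fails() {
        // only compiled when `no_fail` is disabled, e.g. by
        // `cargo test --no-default-features --features mmap`
    }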
@@ -18,5 +18,5 @@ install:
 build: false

 test_script:
-  - REM SET RUST_LOG=tantivy,test & cargo test --verbose
+  - REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1
   - REM SET RUST_BACKTRACE=1 & cargo build --examples
@@ -16,7 +16,7 @@ main() {
         return
     fi
     echo "Test"
-    cross test --target $TARGET
+    cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1
     fi
     for example in $(ls examples/*.rs)
     do
@@ -23,7 +23,6 @@ use tantivy::Index;

 fn main() -> tantivy::Result<()> {
     // this example assumes you understand the content in `basic_search`
-    let index_path = TempDir::new("tantivy_stopwords_example_dir")?;
     let mut schema_builder = SchemaBuilder::default();

     // This configures your custom options for how tantivy will

@@ -31,36 +30,36 @@ fn main() -> tantivy::Result<()> {
     // to note is that we are setting the tokenizer to `stoppy`
     // which will be defined and registered below.
     let text_field_indexing = TextFieldIndexing::default()
         .set_tokenizer("stoppy")
         .set_index_option(IndexRecordOption::WithFreqsAndPositions);
     let text_options = TextOptions::default()
         .set_indexing_options(text_field_indexing)
         .set_stored();

     // Our first field is title.
     schema_builder.add_text_field("title", text_options);

     // Our second field is body.
     let text_field_indexing = TextFieldIndexing::default()
         .set_tokenizer("stoppy")
         .set_index_option(IndexRecordOption::WithFreqsAndPositions);
     let text_options = TextOptions::default()
         .set_indexing_options(text_field_indexing)
         .set_stored();
     schema_builder.add_text_field("body", text_options);

     let schema = schema_builder.build();

-    let index = Index::create_in_dir(&index_path, schema.clone())?;
+    let index = Index::create_in_ram(schema.clone());

     // This tokenizer lowers all of the text (to help with stop word matching)
     // then removes all instances of `the` and `and` from the corpus
     let tokenizer = SimpleTokenizer
         .filter(LowerCaser)
         .filter(StopWordFilter::remove(vec![
             "the".to_string(),
             "and".to_string(),
         ]));

     index.tokenizers().register("stoppy", tokenizer);

@@ -76,16 +75,16 @@ fn main() -> tantivy::Result<()> {
     ));

     index_writer.add_document(doc!(
         title => "Of Mice and Men",
         body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                  bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                  over the yellow sands in the sunlight before reaching the narrow pool. On one \
                  side of the river the golden foothill slopes curve up to the strong and rocky \
                  Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                  fresh and green with every spring, carrying in their lower leaf junctures the \
                  debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                  limbs and branches that arch over the pool"
     ));

     index_writer.add_document(doc!(
         title => "Frankenstein",

@@ -103,14 +102,9 @@ fn main() -> tantivy::Result<()> {

     let query_parser = QueryParser::for_index(&index, vec![title, body]);

-    // this will have NO hits because it was filtered out
-    // because the query is run through the analyzer you
-    // actually will get an error here because the query becomes
-    // empty
-    assert!(query_parser.parse_query("the").is_err());
-
-    // this will have hits
-    let query = query_parser.parse_query("is")?;
+    // stop words are applied on the query as well.
+    // The following will be equivalent to `title:frankenstein`
+    let query = query_parser.parse_query("title:\"the Frankenstein\"")?;

     let mut top_collector = TopCollector::with_limit(10);

@@ -124,6 +118,4 @@ fn main() -> tantivy::Result<()> {
     }

     Ok(())
 }
-use tempdir::TempDir;
-}
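One behavioural consequence of this commit shows up in the hunk above: a query made up exclusively of stop words used to be a parse error, and now parses into a query that simply matches nothing. A small sketch of the new behaviour, reusing the fields and the `stoppy` tokenizer from the example (so only meaningful in that context):

    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    // "the" is removed by the `stoppy` tokenizer, so no terms remain.
    // Previously this returned an error; it now returns an (empty) query.
    assert!(query_parser.parse_query("the").is_ok());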
run-tests.sh (new executable file, 2 lines)

@@ -0,0 +1,2 @@
+#!/bin/bash
+cargo test --no-default-features --features mmap -- --test-threads 1
@@ -266,14 +266,14 @@ mod tests {

     #[test]
     fn test_bitset_large() {
-        let arr = generate_nonunique_unsorted(1_000_000, 50_000);
+        let arr = generate_nonunique_unsorted(100_000, 5_000);
         let mut btreeset: BTreeSet<u32> = BTreeSet::new();
-        let mut bitset = BitSet::with_max_value(1_000_000);
+        let mut bitset = BitSet::with_max_value(100_000);
         for el in arr {
             btreeset.insert(el);
             bitset.insert(el);
         }
-        for i in 0..1_000_000 {
+        for i in 0..100_000 {
             assert_eq!(btreeset.contains(&i), bitset.contains(i));
         }
         assert_eq!(btreeset.len(), bitset.len());
@@ -7,7 +7,7 @@ use std::fmt;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
 use Result;

+use indexer::LockType;
 use super::pool::LeasedItem;
 use super::pool::Pool;
 use super::segment::create_segment;

@@ -20,11 +20,10 @@ use core::META_FILEPATH;
 #[cfg(feature = "mmap")]
 use directory::MmapDirectory;
 use directory::{Directory, RAMDirectory};
-use directory::{DirectoryClone, ManagedDirectory};
+use directory::{ManagedDirectory};
 use indexer::index_writer::open_index_writer;
 use indexer::index_writer::HEAP_SIZE_MIN;
 use indexer::segment_updater::save_new_metas;
-use indexer::DirectoryLock;
 use num_cpus;
 use std::path::Path;
 use tokenizer::TokenizerManager;
@@ -116,18 +115,28 @@ impl Index {
         &self.tokenizers
     }

-    pub fn tokenizer_for_field(&self, field: Field) -> Option<Box<BoxedTokenizer>> {
-        let field_type = self.schema.get_field_entry(field).field_type();
-        let tokenizer: &TokenizerManager = self.tokenizers();
-        match field_type {
-            FieldType::Str(text_options) => {
-                text_options.get_indexing_options()
-                    .map(|text_indexing_options| text_indexing_options.tokenizer())
-                    .and_then(|tokenizer_name| tokenizer.get(tokenizer_name))
-            },
-            _ => {
-                None
-            }
-        }
-    }
+    pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> {
+        let field_entry = self.schema.get_field_entry(field);
+        let field_type = field_entry.field_type();
+        let tokenizer_manager: &TokenizerManager = self.tokenizers();
+        let tokenizer_name_opt: Option<Box<BoxedTokenizer>> =
+            match field_type {
+                FieldType::Str(text_options) => {
+                    text_options
+                        .get_indexing_options()
+                        .map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
+                        .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
+                },
+                _ => {
+                    None
+                }
+            };
+        match tokenizer_name_opt {
+            Some(tokenizer) => {
+                Ok(tokenizer)
+            }
+            None => {
+                Err(TantivyError::SchemaError(format!("{:?} is not a text field.", field_entry.name())))
+            }
+        }
+    }
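Since the lookup is now fallible instead of returning `None`, call sites get an explicit error for non-text fields. A hypothetical caller (the field variable is illustrative) would look like:

    // The lookup now surfaces a SchemaError for non-text fields
    // instead of silently returning None.
    match index.tokenizer_for_field(body_field) {
        Ok(tokenizer) => { /* e.g. feed it to the snippet generator */ }
        Err(err) => eprintln!("cannot tokenize this field: {:?}", err),
    }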
@@ -175,7 +184,8 @@ impl Index {
         num_threads: usize,
         overall_heap_size_in_bytes: usize,
     ) -> Result<IndexWriter> {
-        let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
+        let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;
         let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
         open_index_writer(
             self,

@@ -268,6 +278,7 @@ impl Index {
     /// This needs to be called when a new segment has been
     /// published or after a merge.
     pub fn load_searchers(&self) -> Result<()> {
+        let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?;
         let searchable_segments = self.searchable_segments()?;
         let segment_readers: Vec<SegmentReader> = searchable_segments
             .iter()
@@ -33,10 +33,4 @@ lazy_static! {
     /// Removing this file is safe, but will prevent the garbage collection of all of the file that
     /// are currently in the directory
     pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json");
-
-    /// Only one process should be able to write tantivy's index at a time.
-    /// This file, when present, is in charge of preventing other processes to open an IndexWriter.
-    ///
-    /// If the process is killed and this file remains, it is safe to remove it manually.
-    pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
 }
@@ -4,7 +4,6 @@ use core::InvertedIndexReader;
 use core::Segment;
 use core::SegmentComponent;
 use core::SegmentId;
 use core::SegmentMeta;
 use error::TantivyError;
 use fastfield::DeleteBitSet;
 use fastfield::FacetReader;

@@ -44,7 +43,8 @@ pub struct SegmentReader {
     inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,

     segment_id: SegmentId,
-    segment_meta: SegmentMeta,
+    max_doc: DocId,
+    num_docs: DocId,

     termdict_composite: CompositeFile,
     postings_composite: CompositeFile,

@@ -64,7 +64,7 @@ impl SegmentReader {
     /// Today, `tantivy` does not handle deletes, so it happens
     /// to also be the number of documents in the index.
     pub fn max_doc(&self) -> DocId {
-        self.segment_meta.max_doc()
+        self.max_doc
     }

     /// Returns the number of documents.

@@ -73,7 +73,7 @@ impl SegmentReader {
     /// Today, `tantivy` does not handle deletes so max doc and
     /// num_docs are the same.
     pub fn num_docs(&self) -> DocId {
-        self.segment_meta.num_docs()
+        self.num_docs
     }

     /// Returns the schema of the index this segment belongs to.

@@ -225,6 +225,8 @@ impl SegmentReader {
         let store_source = segment.open_read(SegmentComponent::STORE)?;
         let store_reader = StoreReader::from_source(store_source);

+        fail_point!("SegmentReader::open#middle");
+
         let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
         let postings_composite = CompositeFile::open(&postings_source)?;

@@ -260,7 +262,8 @@ impl SegmentReader {
         let schema = segment.schema();
         Ok(SegmentReader {
             inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
-            segment_meta: segment.meta().clone(),
+            max_doc: segment.meta().max_doc(),
+            num_docs: segment.meta().num_docs(),
             termdict_composite,
             postings_composite,
             fast_fields_composite,

@@ -432,6 +435,7 @@ mod test {
     use schema::{SchemaBuilder, Term, STORED, TEXT};
     use DocId;

     #[test]
     fn test_alive_docs_iterator() {
         let mut schema_builder = SchemaBuilder::new();
@@ -12,6 +12,20 @@ use std::sync::RwLockWriteGuard;
 use std::sync::{Arc, RwLock};
 use Directory;
 use Result;
+use indexer::LockType;
+
+/// Returns true iff the file is "managed".
+/// Non-managed file are not subject to garbage collection.
+///
+/// Filenames that starts by a "." -typically locks-
+/// are not managed.
+fn is_managed(path: &Path) -> bool {
+    path.to_str()
+        .map(|p_str| !p_str.starts_with("."))
+        .unwrap_or(true)
+}

 /// Wrapper of directories that keeps track of files created by Tantivy.
 ///
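In practice this means lock files are invisible to garbage collection while regular index files stay managed. For instance (file names purely illustrative):

    // Dot-files such as lock files are never garbage collected.
    assert!(!is_managed(Path::new(".tantivy-meta.lock")));
    assert!(is_managed(Path::new("some_segment.idx")));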
@@ -82,25 +96,34 @@ impl ManagedDirectory {
     pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
         info!("Garbage collect");
         let mut files_to_delete = vec![];
-
-        // It is crucial to get the living files after acquiring the
-        // read lock of meta informations. That way, we
-        // avoid the following scenario.
-        //
-        // 1) we get the list of living files.
-        // 2) someone creates a new file.
-        // 3) we start garbage collection and remove this file
-        // even though it is a living file.
-        //
-        // releasing the lock as .delete() will use it too.
         {
+            // releasing the lock as .delete() will use it too.
             let meta_informations_rlock = self.meta_informations
                 .read()
                 .expect("Managed directory rlock poisoned in garbage collect.");
-            let living_files = get_living_files();
-            for managed_path in &meta_informations_rlock.managed_paths {
-                if !living_files.contains(managed_path) {
-                    files_to_delete.push(managed_path.clone());
+
+            // It is crucial to get the living files after acquiring the
+            // read lock of meta informations. That way, we
+            // avoid the following scenario.
+            //
+            // 1) we get the list of living files.
+            // 2) someone creates a new file.
+            // 3) we start garbage collection and remove this file
+            // even though it is a living file.
+
+            // The point of this second "file" lock is to enforce the following scenario
+            // 1) process B tries to load a new set of searcher.
+            //    The list of segments is loaded
+            // 2) writer change meta.json (for instance after a merge or a commit)
+            // 3) gc kicks in.
+            // 4) gc removes a file that was useful for process B, before process B opened it.
+            if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) {
+                let living_files = get_living_files();
+                for managed_path in &meta_informations_rlock.managed_paths {
+                    if !living_files.contains(managed_path) {
+                        files_to_delete.push(managed_path.clone());
+                    }
                 }
             }
         }

@@ -156,7 +179,15 @@ impl ManagedDirectory {
     /// registering the filepath and creating the file
     /// will not lead to garbage files that will
     /// never get removed.
     ///
+    /// File starting by "." are reserved to locks.
+    /// They are not managed and cannot be subjected
+    /// to garbage collection.
     fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
+        // Files starting by "." (e.g. lock files) are not managed.
+        if !is_managed(filepath) {
+            return Ok(());
+        }
         let mut meta_wlock = self.meta_informations
             .write()
             .expect("Managed file lock poisoned");
@@ -173,7 +173,6 @@ impl Directory for RAMDirectory {
         let exists = self.fs
             .write(path_buf.clone(), &Vec::new())
            .map_err(|err| IOError::with_path(path.to_owned(), err))?;

         // force the creation of the file to mimic the MMap directory.
         if exists {
             Err(OpenWriteError::FileAlreadyExists(path_buf))

@@ -196,6 +195,9 @@ impl Directory for RAMDirectory {
     }

     fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
+        fail_point!("RAMDirectory::atomic_write", |msg| {
+            Err(io::Error::new(io::ErrorKind::Other, msg.unwrap_or("Undefined".to_string())))
+        });
         let path_buf = PathBuf::from(path);
         let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
         self.fs.write(path_buf, &Vec::new())?;
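This fail point stays inert unless a test arms it. The new test quoted further down toggles it like this:

    // Make every atomic_write on the RAMDirectory fail...
    fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
    // ...and switch the failure injection off again afterwards.
    fail::cfg("RAMDirectory::atomic_write", "off").unwrap();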
src/error.rs (13 lines changed)

@@ -6,6 +6,7 @@ use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteErro
 use fastfield::FastFieldNotAvailableError;
 use query;
 use schema;
+use indexer::LockType;
 use serde_json;
 use std::path::PathBuf;
 use std::sync::PoisonError;

@@ -19,6 +20,9 @@ pub enum TantivyError {
     /// File already exists, this is a problem when we try to write into a new file.
     #[fail(display = "file already exists: '{:?}'", _0)]
     FileAlreadyExists(PathBuf),
+    /// Failed to acquire file lock
+    #[fail(display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.", _0)]
+    LockFailure(LockType),
     /// IO Error.
     #[fail(display = "an IO error occurred: '{}'", _0)]
     IOError(#[cause] IOError),

@@ -91,13 +95,14 @@ impl From<schema::DocParsingError> for TantivyError {
     }
 }

 impl From<OpenWriteError> for TantivyError {
     fn from(error: OpenWriteError) -> TantivyError {
         match error {
-            OpenWriteError::FileAlreadyExists(filepath) => {
-                TantivyError::FileAlreadyExists(filepath)
-            }
-            OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
+            OpenWriteError::FileAlreadyExists(filepath) =>
+                TantivyError::FileAlreadyExists(filepath),
+            OpenWriteError::IOError(io_error) =>
+                TantivyError::IOError(io_error),
         }.into()
     }
 }
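The new LockFailure variant is what callers observe when a second IndexWriter is opened on the same directory. A sketch of the handling, modeled on the updated index-writer test further down:

    // Opening a second writer on the same directory now surfaces
    // the new LockFailure variant.
    match index.writer(40_000_000) {
        Err(TantivyError::LockFailure(_)) => { /* expected */ }
        _ => panic!("expected a lock failure"),
    }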
@@ -370,7 +370,7 @@ mod tests {
     pub fn generate_permutation() -> Vec<u64> {
         let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut rng = XorShiftRng::from_seed(seed);
-        let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
+        let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
         rng.shuffle(&mut permutation);
         permutation
     }
@@ -1,26 +1,147 @@
-use core::LOCKFILE_FILEPATH;
 use directory::error::OpenWriteError;
 use Directory;
 use TantivyError;
 use std::path::{Path, PathBuf};
 use std::thread;
 use std::time::Duration;
 use std::io::Write;

-/// The directory lock is a mechanism used to
-/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)
-///
-/// Only one lock can exist at a time for a given directory.
-/// The lock is release automatically on `Drop`.
-pub struct DirectoryLock {
-    directory: Box<Directory>,

 #[derive(Debug, Clone, Copy)]
 pub enum LockType {
     /// Only one process should be able to write tantivy's index at a time.
     /// This lock file, when present, is in charge of preventing other processes to open an IndexWriter.
     ///
     /// If the process is killed and this file remains, it is safe to remove it manually.
     ///
     /// Failing to acquire this lock usually means a misuse of tantivy's API,
     /// (creating more than one instance of the `IndexWriter`), are a spurious
     /// lock file remaining after a crash. In the latter case, removing the file after
     /// checking no process running tantivy is running is safe.
     IndexWriterLock,
     /// The meta lock file is here to protect the segment files being opened by
     /// `.load_searchers()` from being garbage collected.
     /// It makes it possible for another process to safely consume
     /// our index in-writing. Ideally, we may have prefered `RWLock` semantics
     /// here, but it is difficult to achieve on Windows.
     ///
     /// Opening segment readers is a very fast process.
     /// Right now if the lock cannot be acquire on the first attempt, the logic
     /// is very simplistic. We retry after `100ms` until we effectively
     /// acquire the lock.
     /// This lock should not have much contention in normal usage.
     MetaLock
 }

-impl DirectoryLock {
-    pub fn lock(mut directory: Box<Directory>) -> Result<DirectoryLock, OpenWriteError> {
-        directory.open_write(&*LOCKFILE_FILEPATH)?;
-        Ok(DirectoryLock { directory })

 /// Retry the logic of acquiring locks is pretty simple.
 /// We just retry `n` times after a given `duratio`, both
 /// depending on the type of lock.
 struct RetryPolicy {
     num_retries: usize,
     wait_in_ms: u64,
 }

 impl RetryPolicy {
     fn no_retry() -> RetryPolicy {
         RetryPolicy {
             num_retries: 0,
             wait_in_ms: 0,
         }
     }

     fn wait_and_retry(&mut self,) -> bool {
         if self.num_retries == 0 {
             false
         } else {
             self.num_retries -= 1;
             let wait_duration = Duration::from_millis(self.wait_in_ms);
             thread::sleep(wait_duration);
             true
         }
     }
 }

 impl LockType {

     fn retry_policy(&self) -> RetryPolicy {
         match *self {
             LockType::IndexWriterLock =>
                 RetryPolicy::no_retry(),
             LockType::MetaLock =>
                 RetryPolicy {
                     num_retries: 100,
                     wait_in_ms: 100,
                 }
         }
     }

     fn try_acquire_lock(&self, directory: &mut Directory) -> Result<DirectoryLock, TantivyError> {
         let path = self.filename();
         let mut write = directory
             .open_write(path)
             .map_err(|e|
                 match e {
                     OpenWriteError::FileAlreadyExists(_) =>
                         TantivyError::LockFailure(*self),
                     OpenWriteError::IOError(io_error) =>
                         TantivyError::IOError(io_error),
                 })?;
         write.flush()?;
         Ok(DirectoryLock {
             directory: directory.box_clone(),
             path: path.to_owned(),
         })
     }

     /// Acquire a lock in the given directory.
     pub fn acquire_lock(&self, directory: &Directory) -> Result<DirectoryLock, TantivyError> {
         let mut box_directory = directory.box_clone();
         let mut retry_policy = self.retry_policy();
         loop {
             let lock_result = self.try_acquire_lock(&mut *box_directory);
             match lock_result {
                 Ok(result) => {
                     return Ok(result);
                 }
                 Err(TantivyError::LockFailure(ref filepath)) => {
                     if !retry_policy.wait_and_retry() {
                         return Err(TantivyError::LockFailure(filepath.to_owned()));
                     }
                 }
                 Err(_) => {
                 }
             }
         }
     }

     fn filename(&self) -> &Path {
         match *self {
             LockType::MetaLock => {
                 Path::new(".tantivy-meta.lock")
             }
             LockType::IndexWriterLock => {
                 Path::new(".tantivy-indexer.lock")
             }
         }
     }
 }

 /// The `DirectoryLock` is an object that represents a file lock.
 /// See [`LockType`](struct.LockType.html)
 ///
 /// It is transparently associated to a lock file, that gets deleted
 /// on `Drop.` The lock is release automatically on `Drop`.
 pub struct DirectoryLock {
     directory: Box<Directory>,
     path: PathBuf,
 }

 impl Drop for DirectoryLock {
     fn drop(&mut self) {
-        if let Err(e) = self.directory.delete(&*LOCKFILE_FILEPATH) {
+        if let Err(e) = self.directory.delete(&*self.path) {
             error!("Failed to remove the lock file. {:?}", e);
         }
     }
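For reference, the two lock types are used elsewhere in this commit as follows; both lines are taken from the `impl Index` hunks above, and the comments summarize the retry policies defined here:

    // Writer lock: fail fast when another IndexWriter already holds it.
    let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;

    // Meta lock: retried every 100ms while searchers are being loaded, so a
    // concurrent garbage collection cannot delete the segment files being opened.
    let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?;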
@@ -301,25 +301,31 @@ fn index_documents(

     let last_docstamp: u64 = *(doc_opstamps.last().unwrap());

-    let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
-    let segment_reader = SegmentReader::open(segment)?;
-    let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
-    let may_have_deletes = compute_deleted_bitset(
-        &mut deleted_bitset,
-        &segment_reader,
-        &mut delete_cursor,
-        &doc_to_opstamps,
-        last_docstamp,
-    )?;
-
-    let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
-        if may_have_deletes {
-            Some(deleted_bitset)
-        } else {
-            None
-        }
-    });
+    let segment_entry: SegmentEntry;
+
+    if delete_cursor.get().is_some() {
+        let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
+        let segment_reader = SegmentReader::open(segment)?;
+        let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
+        let may_have_deletes = compute_deleted_bitset(
+            &mut deleted_bitset,
+            &segment_reader,
+            &mut delete_cursor,
+            &doc_to_opstamps,
+            last_docstamp,
+        )?;
+        segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
+            if may_have_deletes {
+                Some(deleted_bitset)
+            } else {
+                None
+            }
+        });
+    } else {
+        // if there are no delete operation in the queue, no need
+        // to even open the segment.
+        segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
+    }
     Ok(segment_updater.add_segment(generation, segment_entry))
 }

@@ -657,11 +663,26 @@ mod tests {
         let index = Index::create_in_ram(schema_builder.build());
         let _index_writer = index.writer(40_000_000).unwrap();
         match index.writer(40_000_000) {
             Err(TantivyError::FileAlreadyExists(_)) => {}
+            Err(TantivyError::LockFailure(_)) => {}
             _ => panic!("Expected FileAlreadyExists error"),
         }
     }

+    #[test]
+    fn test_lockfile_already_exists_error_msg() {
+        let schema_builder = schema::SchemaBuilder::default();
+        let index = Index::create_in_ram(schema_builder.build());
+        let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        match index.writer_with_num_threads(1, 3_000_000) {
+            Err(err) => {
+                let err_msg = err.to_string();
+                assert!(err_msg.contains("Lockfile"));
+                assert!(err_msg.contains("Possible causes:"))
+            },
+            _ => panic!("Expected LockfileAlreadyExists error"),
+        }
+    }
+
     #[test]
     fn test_set_merge_policy() {
         let schema_builder = schema::SchemaBuilder::default();

@@ -843,4 +864,33 @@ mod tests {
         assert_eq!(initial_table_size(1_000_000_000), 19);
     }

+    #[cfg(not(feature="no_fail"))]
+    #[test]
+    fn test_write_commit_fails() {
+        use fail;
+        let mut schema_builder = schema::SchemaBuilder::default();
+        let text_field = schema_builder.add_text_field("text", schema::TEXT);
+        let index = Index::create_in_ram(schema_builder.build());
+
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        for _ in 0..100 {
+            index_writer.add_document(doc!(text_field => "a"));
+        }
+        index_writer.commit().unwrap();
+        fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
+        for _ in 0..100 {
+            index_writer.add_document(doc!(text_field => "b"));
+        }
+        assert!(index_writer.commit().is_err());
+        index.load_searchers().unwrap();
+        let num_docs_containing = |s: &str| {
+            let searcher = index.searcher();
+            let term_a = Term::from_field_text(text_field, s);
+            searcher.doc_freq(&term_a)
+        };
+        assert_eq!(num_docs_containing("a"), 100);
+        assert_eq!(num_docs_containing("b"), 0);
+        fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
+    }
 }
@@ -16,6 +16,8 @@ mod segment_writer;
 mod stamper;

+pub(crate) use self::directory_lock::DirectoryLock;
+pub use self::directory_lock::LockType;

 pub use self::index_writer::IndexWriter;
 pub use self::log_merge_policy::LogMergePolicy;
 pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
@@ -1,7 +1,7 @@
 use super::segment_register::SegmentRegister;
 use core::SegmentId;
 use core::SegmentMeta;
-use core::{LOCKFILE_FILEPATH, META_FILEPATH};
+use core::META_FILEPATH;
 use error::TantivyError;
 use indexer::delete_queue::DeleteCursor;
 use indexer::SegmentEntry;

@@ -78,10 +78,13 @@ impl SegmentManager {
         registers_lock.committed.len() + registers_lock.uncommitted.len()
     }

+    /// List the files that are useful to the index.
+    ///
+    /// This does not include lock files, or files that are obsolete
+    /// but have not yet been deleted by the garbage collector.
     pub fn list_files(&self) -> HashSet<PathBuf> {
         let mut files = HashSet::new();
         files.insert(META_FILEPATH.clone());
-        files.insert(LOCKFILE_FILEPATH.clone());
         for segment_meta in SegmentMeta::all() {
             files.extend(segment_meta.list_files());
         }
@@ -174,6 +174,9 @@ extern crate tinysegmenter;
 #[macro_use]
 extern crate downcast;

+#[macro_use]
+extern crate fail;
+
 #[cfg(test)]
 mod functional_test;

@@ -949,3 +952,4 @@ mod tests {
         }
     }
 }
@@ -98,7 +98,7 @@ impl MultiFieldPostingsWriter {
         .iter()
         .map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
         .collect();
-    term_offsets.sort_by_key(|&(k, _, _)| k);
+    term_offsets.sort_unstable_by_key(|&(k, _, _)| k);

     let mut offsets: Vec<(Field, usize)> = vec![];
     let term_offsets_it = term_offsets
@@ -20,6 +20,7 @@ use std::str::FromStr;
 use tokenizer::TokenizerManager;
 use combine::Parser;
+use query::EmptyQuery;
 use query::query_parser::logical_ast::LogicalAST;

 /// Possible error that may happen when parsing a query.

@@ -58,6 +59,27 @@ impl From<ParseIntError> for QueryParserError {
     }
 }

+/// Recursively remove empty clause from the AST
+///
+/// Returns `None` iff the `logical_ast` ended up being empty.
+fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
+    match logical_ast {
+        LogicalAST::Clause(children) => {
+            let trimmed_children = children.into_iter()
+                .flat_map(|(occur, child)|
+                    trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) )
+                .collect::<Vec<_>>();
+            if trimmed_children.is_empty() {
+                None
+            } else {
+                Some(LogicalAST::Clause(trimmed_children))
+            }
+        },
+        _ => Some(logical_ast),
+    }
+}
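Together with the EmptyQuery fallback in convert_to_query (changed further down), this is what lets a query whose terms were all stripped by the analyzer parse successfully instead of erroring. A quick illustration, reusing the `with_stop_words` field and `make_query_parser` helper set up in the tests below:

    // The "the" token is removed by the stop word filter, the clause trims
    // to nothing, and the parser hands back an EmptyQuery instead of an error.
    let query_parser = make_query_parser();
    assert!(query_parser.parse_query("with_stop_words:the").is_ok());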
 /// Tantivy's Query parser
 ///
 /// The language covered by the current parser is extremely simple.

@@ -369,14 +391,15 @@ impl QueryParser {
                 asts.push(LogicalAST::Leaf(Box::new(ast)));
             }
         }
-        let result_ast = if asts.is_empty() {
-            // this should never happen
-            return Err(QueryParserError::SyntaxError);
-        } else if asts.len() == 1 {
-            asts[0].clone()
-        } else {
-            LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
-        };
+        let result_ast: LogicalAST =
+            if asts.len() == 1 {
+                asts.into_iter().next().unwrap()
+            } else {
+                LogicalAST::Clause(
+                    asts.into_iter()
+                        .map(|ast| (Occur::Should, ast))
+                        .collect())
+            };
         Ok(result_ast)
     }
     UserInputLeaf::All => {

@@ -429,19 +452,17 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
 }

 fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
-    match logical_ast {
-        LogicalAST::Clause(clause) => {
-            if clause.is_empty() {
-                Box::new(EmptyQuery)
-            } else {
-                let occur_subqueries = clause
-                    .into_iter()
-                    .map(|(occur, subquery)| (occur, convert_to_query(subquery)))
-                    .collect::<Vec<_>>();
-                Box::new(BooleanQuery::from(occur_subqueries))
-            }
-        }
-        LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal),
+    match trim_ast(logical_ast) {
+        Some(LogicalAST::Clause(trimmed_clause)) => {
+            let occur_subqueries = trimmed_clause
+                .into_iter()
+                .map(|(occur, subquery)| (occur, convert_to_query(subquery)))
+                .collect::<Vec<_>>();
+            assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming");
+            Box::new(BooleanQuery::from(occur_subqueries))
+        },
+        Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal),
+        None => Box::new(EmptyQuery)
     }
 }

@@ -454,12 +475,17 @@ mod test {
     use schema::Field;
     use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
     use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT};
-    use tokenizer::SimpleTokenizer;
-    use tokenizer::TokenizerManager;
+    use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager};
     use Index;

     fn make_query_parser() -> QueryParser {
         let mut schema_builder = SchemaBuilder::default();
+        let text_field_indexing = TextFieldIndexing::default()
+            .set_tokenizer("en_with_stop_words")
+            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+        let text_options = TextOptions::default()
+            .set_indexing_options(text_field_indexing)
+            .set_stored();
         let title = schema_builder.add_text_field("title", TEXT);
         let text = schema_builder.add_text_field("text", TEXT);
         schema_builder.add_i64_field("signed", INT_INDEXED);

@@ -468,9 +494,14 @@ mod test {
         schema_builder.add_text_field("notindexed_u64", STORED);
         schema_builder.add_text_field("notindexed_i64", STORED);
         schema_builder.add_text_field("nottokenized", STRING);
+        schema_builder.add_text_field("with_stop_words", text_options);
         let schema = schema_builder.build();
         let default_fields = vec![title, text];
         let tokenizer_manager = TokenizerManager::default();
+        tokenizer_manager.register("en_with_stop_words", SimpleTokenizer
+            .filter(LowerCaser)
+            .filter(StopWordFilter::remove(vec!["the".to_string()]))
+        );
         QueryParser::new(schema, default_fields, tokenizer_manager)
     }

@@ -739,6 +770,13 @@ mod test {
         );
     }

+    #[test]
+    pub fn test_query_parser_not_empty_but_no_tokens() {
+        let query_parser = make_query_parser();
+        assert!(query_parser.parse_query(" !, ").is_ok());
+        assert!(query_parser.parse_query("with_stop_words:the").is_ok());
+    }
+
     #[test]
     pub fn test_parse_query_to_ast_conjunction() {
         test_parse_query_to_logical_ast_helper(
@@ -15,6 +15,7 @@ use query::MatchingTerms;
 use schema::Field;
 use std::collections::HashMap;
 use SegmentLocalId;
+use error::TantivyError;

 #[derive(Debug)]
 pub struct HighlightSection {

@@ -79,6 +80,14 @@ const HIGHLIGHTEN_PREFIX: &str = "<b>";
 const HIGHLIGHTEN_POSTFIX: &str = "</b>";

 impl Snippet {

+    pub fn empty() -> Snippet {
+        Snippet {
+            fragments: String::new(),
+            highlighted: Vec::new()
+        }
+    }
+
     /// Returns a hignlightned html from the `Snippet`.
     pub fn to_html(&self) -> String {
         let mut html = String::new();

@@ -210,42 +219,55 @@ fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[D

 pub fn generate_snippet(
     searcher: &Searcher,
-    field: Field,
     query: &Query,
+    field: Field,
     doc_addresses: &[DocAddress],
     max_num_chars: usize) -> Result<Vec<Snippet>> {

+    let mut doc_address_ords: Vec<usize> = (0..doc_addresses.len()).collect();
+    doc_address_ords.sort_by_key(|k| doc_addresses[*k]);

-    // TODO sort doc_addresses
     let mut snippets = vec![];
     let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?;
-    for doc_address in doc_addresses {

+    for &doc_address_ord in &doc_address_ords {
+        let doc_address = doc_addresses[doc_address_ord];
         let segment_ord: u32 = doc_address.segment_ord();
-        let doc = searcher.doc(doc_address)?;
+        let doc = searcher.doc(&doc_address)?;

         let mut text = String::new();
         for value in doc.get_all(field) {
             text.push_str(value.text());
         }

         if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&segment_ord) {
-            if let Some(tokenizer) = searcher.index().tokenizer_for_field(field) {
-                if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) {
-                    let terms: BTreeMap<String, f32> = terms
-                        .iter()
-                        .map(|(term, score)| (term.text().to_string(), *score))
-                        .collect();
-                    let fragment_candidates = search_fragments(tokenizer,
-                                                               &text,
-                                                               terms,
-                                                               max_num_chars);
-                }
+            let tokenizer = searcher.index().tokenizer_for_field(field)?;
+            if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) {
+                let terms: BTreeMap<String, f32> = terms
+                    .iter()
+                    .map(|(term, score)| (term.text().to_string(), *score))
+                    .collect();
+                let fragment_candidates = search_fragments(tokenizer,
+                                                           &text,
+                                                           terms,
+                                                           max_num_chars);
+                let snippet = select_best_fragment_combination(fragment_candidates, &text);
+                snippets.push(snippet);
+            } else {
+                snippets.push(Snippet::empty());
             }
         } else {

         }
     }
-    // search_fragments(boxed_tokenizer, &text, terms, 3);
-    panic!("e");

+    // reorder the snippets
+    for i in 0..doc_addresses.len() {
+        snippets.swap(i, doc_address_ords[i]);
+    }

     Ok(snippets)
 }

 #[cfg(test)]
@@ -1,4 +1,5 @@
 use super::{Token, TokenFilter, TokenStream};
+use std::mem;

 /// Token filter that lowercase terms.
 #[derive(Clone)]

@@ -15,13 +16,22 @@ where
     }
 }

-pub struct LowerCaserTokenStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
+pub struct LowerCaserTokenStream<TailTokenStream> {
+    buffer: String,
     tail: TailTokenStream,
 }

+// writes a lowercased version of text into output.
+fn to_lowercase_unicode(text: &mut String, output: &mut String) {
+    output.clear();
+    for c in text.chars() {
+        // Contrary to the std, we do not take care of sigma special case.
+        // This will have an normalizationo effect, which is ok for search.
+        output.extend(c.to_lowercase());
+    }
+}
+
 impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
 where
     TailTokenStream: TokenStream,

@@ -36,7 +46,14 @@ where

     fn advance(&mut self) -> bool {
         if self.tail.advance() {
-            self.tail.token_mut().text.make_ascii_lowercase();
+            if self.token_mut().text.is_ascii() {
+                // fast track for ascii.
+                self.token_mut().text.make_ascii_lowercase();
+            } else {
+                to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
+                mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+            }
             true
         } else {
             false

@@ -49,6 +66,43 @@ where
     TailTokenStream: TokenStream,
 {
     fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
-        LowerCaserTokenStream { tail }
+        LowerCaserTokenStream {
+            tail,
+            buffer: String::with_capacity(100)
+        }
     }
 }

+#[cfg(test)]
+mod tests {
+    use tokenizer::Tokenizer;
+    use tokenizer::LowerCaser;
+    use tokenizer::TokenStream;
+    use tokenizer::SimpleTokenizer;
+
+    #[test]
+    fn test_to_lower_case() {
+        assert_eq!(lowercase_helper("Русский текст"),
+                   vec!["русский".to_string(), "текст".to_string()]);
+    }
+
+    fn lowercase_helper(text: &str) -> Vec<String> {
+        let mut tokens = vec![];
+        let mut token_stream = SimpleTokenizer
+            .filter(LowerCaser)
+            .token_stream(text);
+        while token_stream.advance() {
+            let token_text = token_stream.token().text.clone();
+            tokens.push(token_text);
+        }
+        tokens
+    }
+
+    #[test]
+    fn test_lowercaser() {
+        assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
+        assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
+    }
+}