diff --git a/Cargo.toml b/Cargo.toml index 1ec1b65d5..887f7a9b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,6 +49,7 @@ fnv = "1.0.6" owned-read = "0.4" failure = "0.1" htmlescape = "0.3.1" +fail = "0.2" [target.'cfg(windows)'.dependencies] winapi = "0.2" @@ -61,12 +62,20 @@ opt-level = 3 debug = false lto = true debug-assertions = false +overflow-checks = false + +[profile.test] +debug-assertions = true +overflow-checks = true [features] -default = ["mmap"] +# by default no-fail is disabled. We manually enable it when running test. +default = ["mmap", "no_fail"] mmap = ["fst/mmap", "atomicwrites"] lz4-compression = ["lz4"] +no_fail = ["fail/no_fail"] [badges] travis-ci = { repository = "tantivy-search/tantivy" } + diff --git a/README.md b/README.md index c1824b575..0ce522a7c 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/master?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master) +[![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/fulmicoton) ![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png) @@ -77,6 +78,10 @@ To check out and run tests, you can simply run : cd tantivy cargo build +## Running tests + +Some tests will not run with just `cargo test` because of `fail-rs`. +To run the tests exhaustively, run `./run-tests.sh`. # Contribute diff --git a/appveyor.yml b/appveyor.yml index a3bd2ac04..685b04d3a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -18,5 +18,5 @@ install: build: false test_script: - - REM SET RUST_LOG=tantivy,test & cargo test --verbose - - REM SET RUST_BACKTRACE=1 & cargo build --examples \ No newline at end of file + - REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1 + - REM SET RUST_BACKTRACE=1 & cargo build --examples diff --git a/ci/script.sh b/ci/script.sh index b56345753..0939344b0 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -16,7 +16,7 @@ main() { return fi echo "Test" - cross test --target $TARGET + cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1 fi for example in $(ls examples/*.rs) do diff --git a/examples/stop_words.rs b/examples/stop_words.rs index 950a42afd..b131d876c 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -23,7 +23,6 @@ use tantivy::Index; fn main() -> tantivy::Result<()> { // this example assumes you understand the content in `basic_search` - let index_path = TempDir::new("tantivy_stopwords_example_dir")?; let mut schema_builder = SchemaBuilder::default(); // This configures your custom options for how tantivy will @@ -31,36 +30,36 @@ fn main() -> tantivy::Result<()> { // to note is that we are setting the tokenizer to `stoppy` // which will be defined and registered below. 
let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("stoppy") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) - .set_stored(); + .set_indexing_options(text_field_indexing) + .set_stored(); // Our first field is title. schema_builder.add_text_field("title", text_options); // Our second field is body. let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("stoppy") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) - .set_stored(); + .set_indexing_options(text_field_indexing) + .set_stored(); schema_builder.add_text_field("body", text_options); let schema = schema_builder.build(); - let index = Index::create_in_dir(&index_path, schema.clone())?; + let index = Index::create_in_ram(schema.clone()); // This tokenizer lowers all of the text (to help with stop word matching) // then removes all instances of `the` and `and` from the corpus let tokenizer = SimpleTokenizer - .filter(LowerCaser) - .filter(StopWordFilter::remove(vec![ - "the".to_string(), - "and".to_string(), - ])); + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec![ + "the".to_string(), + "and".to_string(), + ])); index.tokenizers().register("stoppy", tokenizer); @@ -76,16 +75,16 @@ fn main() -> tantivy::Result<()> { )); index_writer.add_document(doc!( - title => "Of Mice and Men", - body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ - bank and runs deep and green. The water is warm too, for it has slipped twinkling \ - over the yellow sands in the sunlight before reaching the narrow pool. On one \ - side of the river the golden foothill slopes curve up to the strong and rocky \ - Gabilan Mountains, but on the valley side the water is lined with trees—willows \ - fresh and green with every spring, carrying in their lower leaf junctures the \ - debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ - limbs and branches that arch over the pool" - )); + title => "Of Mice and Men", + body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ + bank and runs deep and green. The water is warm too, for it has slipped twinkling \ + over the yellow sands in the sunlight before reaching the narrow pool. On one \ + side of the river the golden foothill slopes curve up to the strong and rocky \ + Gabilan Mountains, but on the valley side the water is lined with trees—willows \ + fresh and green with every spring, carrying in their lower leaf junctures the \ + debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ + limbs and branches that arch over the pool" + )); index_writer.add_document(doc!( title => "Frankenstein", @@ -103,14 +102,9 @@ fn main() -> tantivy::Result<()> { let query_parser = QueryParser::for_index(&index, vec![title, body]); - // this will have NO hits because it was filtered out - // because the query is run through the analyzer you - // actually will get an error here because the query becomes - // empty - assert!(query_parser.parse_query("the").is_err()); - - // this will have hits - let query = query_parser.parse_query("is")?; + // stop words are applied on the query as well. 
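+    // (A query made only of stop words now parses into an empty query instead of
+    // returning an error; see the query_parser changes further down in this diff.)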
+ // The following will be equivalent to `title:frankenstein` + let query = query_parser.parse_query("title:\"the Frankenstein\"")?; let mut top_collector = TopCollector::with_limit(10); @@ -124,6 +118,4 @@ fn main() -> tantivy::Result<()> { } Ok(()) -} - -use tempdir::TempDir; +} \ No newline at end of file diff --git a/run-tests.sh b/run-tests.sh new file mode 100755 index 000000000..fc2944dd5 --- /dev/null +++ b/run-tests.sh @@ -0,0 +1,2 @@ +#!/bin/bash +cargo test --no-default-features --features mmap -- --test-threads 1 diff --git a/src/common/bitset.rs b/src/common/bitset.rs index 73f03c4f5..326e7cee8 100644 --- a/src/common/bitset.rs +++ b/src/common/bitset.rs @@ -266,14 +266,14 @@ mod tests { #[test] fn test_bitset_large() { - let arr = generate_nonunique_unsorted(1_000_000, 50_000); + let arr = generate_nonunique_unsorted(100_000, 5_000); let mut btreeset: BTreeSet = BTreeSet::new(); - let mut bitset = BitSet::with_max_value(1_000_000); + let mut bitset = BitSet::with_max_value(100_000); for el in arr { btreeset.insert(el); bitset.insert(el); } - for i in 0..1_000_000 { + for i in 0..100_000 { assert_eq!(btreeset.contains(&i), bitset.contains(i)); } assert_eq!(btreeset.len(), bitset.len()); diff --git a/src/core/index.rs b/src/core/index.rs index c6f465eef..6c236ff5f 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -7,7 +7,7 @@ use std::fmt; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use Result; - +use indexer::LockType; use super::pool::LeasedItem; use super::pool::Pool; use super::segment::create_segment; @@ -20,11 +20,10 @@ use core::META_FILEPATH; #[cfg(feature = "mmap")] use directory::MmapDirectory; use directory::{Directory, RAMDirectory}; -use directory::{DirectoryClone, ManagedDirectory}; +use directory::{ManagedDirectory}; use indexer::index_writer::open_index_writer; use indexer::index_writer::HEAP_SIZE_MIN; use indexer::segment_updater::save_new_metas; -use indexer::DirectoryLock; use num_cpus; use std::path::Path; use tokenizer::TokenizerManager; @@ -116,18 +115,28 @@ impl Index { &self.tokenizers } - pub fn tokenizer_for_field(&self, field: Field) -> Option> { - let field_type = self.schema.get_field_entry(field).field_type(); - let tokenizer: &TokenizerManager = self.tokenizers(); + pub fn tokenizer_for_field(&self, field: Field) -> Result> { + let field_entry = self.schema.get_field_entry(field); + let field_type = field_entry.field_type(); + let tokenizer_manager: &TokenizerManager = self.tokenizers(); + let tokenizer_name_opt: Option> = match field_type { - FieldType::Str(text_options) => { - text_options.get_indexing_options() - .map(|text_indexing_options| text_indexing_options.tokenizer()) - .and_then(|tokenizer_name| tokenizer.get(tokenizer_name)) - - }, - _ => { - None + FieldType::Str(text_options) => { + text_options + .get_indexing_options() + .map(|text_indexing_options| text_indexing_options.tokenizer().to_string()) + .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)) + }, + _ => { + None + } + }; + match tokenizer_name_opt { + Some(tokenizer) => { + Ok(tokenizer) + } + None => { + Err(TantivyError::SchemaError(format!("{:?} is not a text field.", field_entry.name()))) } } } @@ -175,7 +184,8 @@ impl Index { num_threads: usize, overall_heap_size_in_bytes: usize, ) -> Result { - let directory_lock = DirectoryLock::lock(self.directory().box_clone())?; + + let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?; let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / 
num_threads; open_index_writer( self, @@ -268,6 +278,7 @@ impl Index { /// This needs to be called when a new segment has been /// published or after a merge. pub fn load_searchers(&self) -> Result<()> { + let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?; let searchable_segments = self.searchable_segments()?; let segment_readers: Vec = searchable_segments .iter() diff --git a/src/core/mod.rs b/src/core/mod.rs index 6d43685f8..062b537ee 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -33,10 +33,4 @@ lazy_static! { /// Removing this file is safe, but will prevent the garbage collection of all of the file that /// are currently in the directory pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json"); - - /// Only one process should be able to write tantivy's index at a time. - /// This file, when present, is in charge of preventing other processes to open an IndexWriter. - /// - /// If the process is killed and this file remains, it is safe to remove it manually. - pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock"); } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 37b950332..56a3a7b9e 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -4,7 +4,6 @@ use core::InvertedIndexReader; use core::Segment; use core::SegmentComponent; use core::SegmentId; -use core::SegmentMeta; use error::TantivyError; use fastfield::DeleteBitSet; use fastfield::FacetReader; @@ -44,7 +43,8 @@ pub struct SegmentReader { inv_idx_reader_cache: Arc>>>, segment_id: SegmentId, - segment_meta: SegmentMeta, + max_doc: DocId, + num_docs: DocId, termdict_composite: CompositeFile, postings_composite: CompositeFile, @@ -64,7 +64,7 @@ impl SegmentReader { /// Today, `tantivy` does not handle deletes, so it happens /// to also be the number of documents in the index. pub fn max_doc(&self) -> DocId { - self.segment_meta.max_doc() + self.max_doc } /// Returns the number of documents. @@ -73,7 +73,7 @@ impl SegmentReader { /// Today, `tantivy` does not handle deletes so max doc and /// num_docs are the same. pub fn num_docs(&self) -> DocId { - self.segment_meta.num_docs() + self.num_docs } /// Returns the schema of the index this segment belongs to. @@ -225,6 +225,8 @@ impl SegmentReader { let store_source = segment.open_read(SegmentComponent::STORE)?; let store_reader = StoreReader::from_source(store_source); + fail_point!("SegmentReader::open#middle"); + let postings_source = segment.open_read(SegmentComponent::POSTINGS)?; let postings_composite = CompositeFile::open(&postings_source)?; @@ -260,7 +262,8 @@ impl SegmentReader { let schema = segment.schema(); Ok(SegmentReader { inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())), - segment_meta: segment.meta().clone(), + max_doc: segment.meta().max_doc(), + num_docs: segment.meta().num_docs(), termdict_composite, postings_composite, fast_fields_composite, @@ -432,6 +435,7 @@ mod test { use schema::{SchemaBuilder, Term, STORED, TEXT}; use DocId; + #[test] fn test_alive_docs_iterator() { let mut schema_builder = SchemaBuilder::new(); diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index cf59b9ace..e5510d113 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -12,6 +12,20 @@ use std::sync::RwLockWriteGuard; use std::sync::{Arc, RwLock}; use Directory; use Result; +use indexer::LockType; + + + +/// Returns true iff the file is "managed". 
+/// Non-managed file are not subject to garbage collection. +/// +/// Filenames that starts by a "." -typically locks- +/// are not managed. +fn is_managed(path: &Path) -> bool { + path.to_str() + .map(|p_str| !p_str.starts_with(".")) + .unwrap_or(true) +} /// Wrapper of directories that keeps track of files created by Tantivy. /// @@ -82,25 +96,34 @@ impl ManagedDirectory { pub fn garbage_collect HashSet>(&mut self, get_living_files: L) { info!("Garbage collect"); let mut files_to_delete = vec![]; + + // It is crucial to get the living files after acquiring the + // read lock of meta informations. That way, we + // avoid the following scenario. + // + // 1) we get the list of living files. + // 2) someone creates a new file. + // 3) we start garbage collection and remove this file + // even though it is a living file. + // + // releasing the lock as .delete() will use it too. { - // releasing the lock as .delete() will use it too. let meta_informations_rlock = self.meta_informations .read() .expect("Managed directory rlock poisoned in garbage collect."); - // It is crucial to get the living files after acquiring the - // read lock of meta informations. That way, we - // avoid the following scenario. - // - // 1) we get the list of living files. - // 2) someone creates a new file. - // 3) we start garbage collection and remove this file - // even though it is a living file. - let living_files = get_living_files(); - - for managed_path in &meta_informations_rlock.managed_paths { - if !living_files.contains(managed_path) { - files_to_delete.push(managed_path.clone()); + // The point of this second "file" lock is to enforce the following scenario + // 1) process B tries to load a new set of searcher. + // The list of segments is loaded + // 2) writer change meta.json (for instance after a merge or a commit) + // 3) gc kicks in. + // 4) gc removes a file that was useful for process B, before process B opened it. + if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) { + let living_files = get_living_files(); + for managed_path in &meta_informations_rlock.managed_paths { + if !living_files.contains(managed_path) { + files_to_delete.push(managed_path.clone()); + } } } } @@ -156,7 +179,15 @@ impl ManagedDirectory { /// registering the filepath and creating the file /// will not lead to garbage files that will /// never get removed. + /// + /// File starting by "." are reserved to locks. + /// They are not managed and cannot be subjected + /// to garbage collection. fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> { + // Files starting by "." (e.g. lock files) are not managed. + if !is_managed(filepath) { + return Ok(()); + } let mut meta_wlock = self.meta_informations .write() .expect("Managed file lock poisoned"); diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index d1a671cd1..1b40970b4 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -173,7 +173,6 @@ impl Directory for RAMDirectory { let exists = self.fs .write(path_buf.clone(), &Vec::new()) .map_err(|err| IOError::with_path(path.to_owned(), err))?; - // force the creation of the file to mimic the MMap directory. 
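+        // Writing an empty buffer forces the creation of the file, mimicking
+        // MmapDirectory; `exists` reports whether it was already present.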
if exists { Err(OpenWriteError::FileAlreadyExists(path_buf)) @@ -196,6 +195,9 @@ impl Directory for RAMDirectory { } fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { + fail_point!("RAMDirectory::atomic_write", |msg| { + Err(io::Error::new(io::ErrorKind::Other, msg.unwrap_or("Undefined".to_string()))) + }); let path_buf = PathBuf::from(path); let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); self.fs.write(path_buf, &Vec::new())?; diff --git a/src/error.rs b/src/error.rs index 8fa5cb1ce..ddde26789 100644 --- a/src/error.rs +++ b/src/error.rs @@ -6,6 +6,7 @@ use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteErro use fastfield::FastFieldNotAvailableError; use query; use schema; +use indexer::LockType; use serde_json; use std::path::PathBuf; use std::sync::PoisonError; @@ -19,6 +20,9 @@ pub enum TantivyError { /// File already exists, this is a problem when we try to write into a new file. #[fail(display = "file already exists: '{:?}'", _0)] FileAlreadyExists(PathBuf), + /// Failed to acquire file lock + #[fail(display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.", _0)] + LockFailure(LockType), /// IO Error. #[fail(display = "an IO error occurred: '{}'", _0)] IOError(#[cause] IOError), @@ -91,13 +95,14 @@ impl From for TantivyError { } } + impl From for TantivyError { fn from(error: OpenWriteError) -> TantivyError { match error { - OpenWriteError::FileAlreadyExists(filepath) => { - TantivyError::FileAlreadyExists(filepath) - } - OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error), + OpenWriteError::FileAlreadyExists(filepath) => + TantivyError::FileAlreadyExists(filepath), + OpenWriteError::IOError(io_error) => + TantivyError::IOError(io_error), }.into() } } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index e3599bacf..fdb029432 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -370,7 +370,7 @@ mod tests { pub fn generate_permutation() -> Vec { let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; let mut rng = XorShiftRng::from_seed(seed); - let mut permutation: Vec = (0u64..1_000_000u64).collect(); + let mut permutation: Vec = (0u64..100_000u64).collect(); rng.shuffle(&mut permutation); permutation } diff --git a/src/indexer/directory_lock.rs b/src/indexer/directory_lock.rs index b152a3c58..4dbaa9ed4 100644 --- a/src/indexer/directory_lock.rs +++ b/src/indexer/directory_lock.rs @@ -1,26 +1,147 @@ -use core::LOCKFILE_FILEPATH; use directory::error::OpenWriteError; use Directory; +use TantivyError; +use std::path::{Path, PathBuf}; +use std::thread; +use std::time::Duration; +use std::io::Write; -/// The directory lock is a mechanism used to -/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html) -/// -/// Only one lock can exist at a time for a given directory. -/// The lock is release automatically on `Drop`. -pub struct DirectoryLock { - directory: Box, +#[derive(Debug, Clone, Copy)] +pub enum LockType { + /// Only one process should be able to write tantivy's index at a time. + /// This lock file, when present, is in charge of preventing other processes to open an IndexWriter. + /// + /// If the process is killed and this file remains, it is safe to remove it manually. + /// + /// Failing to acquire this lock usually means a misuse of tantivy's API, + /// (creating more than one instance of the `IndexWriter`), are a spurious + /// lock file remaining after a crash. 
In the latter case, removing the file after + /// checking no process running tantivy is running is safe. + IndexWriterLock, + /// The meta lock file is here to protect the segment files being opened by + /// `.load_searchers()` from being garbage collected. + /// It makes it possible for another process to safely consume + /// our index in-writing. Ideally, we may have prefered `RWLock` semantics + /// here, but it is difficult to achieve on Windows. + /// + /// Opening segment readers is a very fast process. + /// Right now if the lock cannot be acquire on the first attempt, the logic + /// is very simplistic. We retry after `100ms` until we effectively + /// acquire the lock. + /// This lock should not have much contention in normal usage. + MetaLock } -impl DirectoryLock { - pub fn lock(mut directory: Box) -> Result { - directory.open_write(&*LOCKFILE_FILEPATH)?; - Ok(DirectoryLock { directory }) + +/// Retry the logic of acquiring locks is pretty simple. +/// We just retry `n` times after a given `duratio`, both +/// depending on the type of lock. +struct RetryPolicy { + num_retries: usize, + wait_in_ms: u64, +} + +impl RetryPolicy { + fn no_retry() -> RetryPolicy { + RetryPolicy { + num_retries: 0, + wait_in_ms: 0, + } } + + fn wait_and_retry(&mut self,) -> bool { + if self.num_retries == 0 { + false + } else { + self.num_retries -= 1; + let wait_duration = Duration::from_millis(self.wait_in_ms); + thread::sleep(wait_duration); + true + } + + } +} + +impl LockType { + + fn retry_policy(&self) -> RetryPolicy { + match *self { + LockType::IndexWriterLock => + RetryPolicy::no_retry(), + LockType::MetaLock => + RetryPolicy { + num_retries: 100, + wait_in_ms: 100, + } + } + } + + fn try_acquire_lock(&self, directory: &mut Directory) -> Result { + let path = self.filename(); + let mut write = directory + .open_write(path) + .map_err(|e| + match e { + OpenWriteError::FileAlreadyExists(_) => + TantivyError::LockFailure(*self), + OpenWriteError::IOError(io_error) => + TantivyError::IOError(io_error), + })?; + write.flush()?; + Ok(DirectoryLock { + directory: directory.box_clone(), + path: path.to_owned(), + }) + } + + + /// Acquire a lock in the given directory. + pub fn acquire_lock(&self, directory: &Directory) -> Result { + let mut box_directory = directory.box_clone(); + let mut retry_policy = self.retry_policy(); + loop { + let lock_result = self.try_acquire_lock(&mut *box_directory); + match lock_result { + Ok(result) => { + return Ok(result); + } + Err(TantivyError::LockFailure(ref filepath)) => { + if !retry_policy.wait_and_retry() { + return Err(TantivyError::LockFailure(filepath.to_owned())); + } + } + Err(_) => { + } + } + } + } + + fn filename(&self) -> &Path { + match *self { + LockType::MetaLock => { + Path::new(".tantivy-meta.lock") + } + LockType::IndexWriterLock => { + Path::new(".tantivy-indexer.lock") + } + } + } +} + + +/// The `DirectoryLock` is an object that represents a file lock. +/// See [`LockType`](struct.LockType.html) +/// +/// It is transparently associated to a lock file, that gets deleted +/// on `Drop.` The lock is release automatically on `Drop`. +pub struct DirectoryLock { + directory: Box, + path: PathBuf, } impl Drop for DirectoryLock { fn drop(&mut self) { - if let Err(e) = self.directory.delete(&*LOCKFILE_FILEPATH) { + if let Err(e) = self.directory.delete(&*self.path) { error!("Failed to remove the lock file. 
{:?}", e); } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 982140fbc..3e11c4ce5 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -301,25 +301,31 @@ fn index_documents( let last_docstamp: u64 = *(doc_opstamps.last().unwrap()); - let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); - let segment_reader = SegmentReader::open(segment)?; - let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); - let may_have_deletes = compute_deleted_bitset( - &mut deleted_bitset, - &segment_reader, - &mut delete_cursor, - &doc_to_opstamps, - last_docstamp, - )?; - - let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, { - if may_have_deletes { - Some(deleted_bitset) - } else { - None - } - }); + let segment_entry: SegmentEntry; + if delete_cursor.get().is_some() { + let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); + let segment_reader = SegmentReader::open(segment)?; + let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); + let may_have_deletes = compute_deleted_bitset( + &mut deleted_bitset, + &segment_reader, + &mut delete_cursor, + &doc_to_opstamps, + last_docstamp, + )?; + segment_entry = SegmentEntry::new(segment_meta, delete_cursor, { + if may_have_deletes { + Some(deleted_bitset) + } else { + None + } + }); + } else { + // if there are no delete operation in the queue, no need + // to even open the segment. + segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None); + } Ok(segment_updater.add_segment(generation, segment_entry)) } @@ -657,11 +663,26 @@ mod tests { let index = Index::create_in_ram(schema_builder.build()); let _index_writer = index.writer(40_000_000).unwrap(); match index.writer(40_000_000) { - Err(TantivyError::FileAlreadyExists(_)) => {} + Err(TantivyError::LockFailure(_)) => {} _ => panic!("Expected FileAlreadyExists error"), } } + #[test] + fn test_lockfile_already_exists_error_msg() { + let schema_builder = schema::SchemaBuilder::default(); + let index = Index::create_in_ram(schema_builder.build()); + let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + match index.writer_with_num_threads(1, 3_000_000) { + Err(err) => { + let err_msg = err.to_string(); + assert!(err_msg.contains("Lockfile")); + assert!(err_msg.contains("Possible causes:")) + }, + _ => panic!("Expected LockfileAlreadyExists error"), + } + } + #[test] fn test_set_merge_policy() { let schema_builder = schema::SchemaBuilder::default(); @@ -843,4 +864,33 @@ mod tests { assert_eq!(initial_table_size(1_000_000_000), 19); } + + #[cfg(not(feature="no_fail"))] + #[test] + fn test_write_commit_fails() { + use fail; + let mut schema_builder = schema::SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", schema::TEXT); + let index = Index::create_in_ram(schema_builder.build()); + + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + for _ in 0..100 { + index_writer.add_document(doc!(text_field => "a")); + } + index_writer.commit().unwrap(); + fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap(); + for _ in 0..100 { + index_writer.add_document(doc!(text_field => "b")); + } + assert!(index_writer.commit().is_err()); + index.load_searchers().unwrap(); + let num_docs_containing = |s: &str| { + let searcher = index.searcher(); + let term_a = Term::from_field_text(text_field, s); + searcher.doc_freq(&term_a) + }; + assert_eq!(num_docs_containing("a"), 100); + assert_eq!(num_docs_containing("b"), 0); 
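+        // The failed commit leaves the last successful commit intact: the "a"
+        // documents are still searchable and none of the "b" documents are exposed.
+        // Fail points live in a process-global registry (presumably why the test
+        // runners pin `--test-threads 1`), so switch this one off before leaving
+        // the test.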
+ fail::cfg("RAMDirectory::atomic_write", "off").unwrap(); + } } diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 783e787c8..3d29b38c0 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -16,6 +16,8 @@ mod segment_writer; mod stamper; pub(crate) use self::directory_lock::DirectoryLock; +pub use self::directory_lock::LockType; + pub use self::index_writer::IndexWriter; pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy}; diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 18175c774..0e67d3b15 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -1,7 +1,7 @@ use super::segment_register::SegmentRegister; use core::SegmentId; use core::SegmentMeta; -use core::{LOCKFILE_FILEPATH, META_FILEPATH}; +use core::META_FILEPATH; use error::TantivyError; use indexer::delete_queue::DeleteCursor; use indexer::SegmentEntry; @@ -78,10 +78,13 @@ impl SegmentManager { registers_lock.committed.len() + registers_lock.uncommitted.len() } + /// List the files that are useful to the index. + /// + /// This does not include lock files, or files that are obsolete + /// but have not yet been deleted by the garbage collector. pub fn list_files(&self) -> HashSet { let mut files = HashSet::new(); files.insert(META_FILEPATH.clone()); - files.insert(LOCKFILE_FILEPATH.clone()); for segment_meta in SegmentMeta::all() { files.extend(segment_meta.list_files()); } diff --git a/src/lib.rs b/src/lib.rs index 4f4d364a0..5806d5f69 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -174,6 +174,9 @@ extern crate tinysegmenter; #[macro_use] extern crate downcast; +#[macro_use] +extern crate fail; + #[cfg(test)] mod functional_test; @@ -949,3 +952,4 @@ mod tests { } } } + diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index fe56795e7..d90b322d9 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -98,7 +98,7 @@ impl MultiFieldPostingsWriter { .iter() .map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId)) .collect(); - term_offsets.sort_by_key(|&(k, _, _)| k); + term_offsets.sort_unstable_by_key(|&(k, _, _)| k); let mut offsets: Vec<(Field, usize)> = vec![]; let term_offsets_it = term_offsets diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index f3a9f37c0..93deb48c1 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -20,6 +20,7 @@ use std::str::FromStr; use tokenizer::TokenizerManager; use combine::Parser; use query::EmptyQuery; +use query::query_parser::logical_ast::LogicalAST; /// Possible error that may happen when parsing a query. @@ -58,6 +59,27 @@ impl From for QueryParserError { } } + +/// Recursively remove empty clause from the AST +/// +/// Returns `None` iff the `logical_ast` ended up being empty. +fn trim_ast(logical_ast: LogicalAST) -> Option { + match logical_ast { + LogicalAST::Clause(children) => { + let trimmed_children = children.into_iter() + .flat_map(|(occur, child)| + trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) ) + .collect::>(); + if trimmed_children.is_empty() { + None + } else { + Some(LogicalAST::Clause(trimmed_children)) + } + }, + _ => Some(logical_ast), + } +} + /// Tantivy's Query parser /// /// The language covered by the current parser is extremely simple. 
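The `trim_ast` helper above, combined with the `convert_to_query` change further down, means that a query whose tokens are all filtered out no longer fails with a `SyntaxError`. A minimal sketch of the observable behaviour, using only the public API already exercised elsewhere in this diff (the field name is illustrative):

    use tantivy::query::QueryParser;
    use tantivy::schema::{SchemaBuilder, TEXT};
    use tantivy::Index;

    fn main() -> tantivy::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let query_parser = QueryParser::for_index(&index, vec![title]);
        // Punctuation-only input produces no token at all; it now parses into
        // an empty query (matching no document) instead of erroring out.
        assert!(query_parser.parse_query(" !, ").is_ok());
        Ok(())
    }
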
@@ -369,14 +391,15 @@ impl QueryParser { asts.push(LogicalAST::Leaf(Box::new(ast))); } } - let result_ast = if asts.is_empty() { - // this should never happen - return Err(QueryParserError::SyntaxError); - } else if asts.len() == 1 { - asts[0].clone() - } else { - LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect()) - }; + let result_ast: LogicalAST = + if asts.len() == 1 { + asts.into_iter().next().unwrap() + } else { + LogicalAST::Clause( + asts.into_iter() + .map(|ast| (Occur::Should, ast)) + .collect()) + }; Ok(result_ast) } UserInputLeaf::All => { @@ -429,19 +452,17 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box { } fn convert_to_query(logical_ast: LogicalAST) -> Box { - match logical_ast { - LogicalAST::Clause(clause) => { - if clause.is_empty() { - Box::new(EmptyQuery) - } else { - let occur_subqueries = clause - .into_iter() - .map(|(occur, subquery)| (occur, convert_to_query(subquery))) - .collect::>(); - Box::new(BooleanQuery::from(occur_subqueries)) - } - } - LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal), + match trim_ast(logical_ast) { + Some(LogicalAST::Clause(trimmed_clause)) => { + let occur_subqueries = trimmed_clause + .into_iter() + .map(|(occur, subquery)| (occur, convert_to_query(subquery))) + .collect::>(); + assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming"); + Box::new(BooleanQuery::from(occur_subqueries)) + }, + Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal), + None => Box::new(EmptyQuery) } } @@ -454,12 +475,17 @@ mod test { use schema::Field; use schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT}; - use tokenizer::SimpleTokenizer; - use tokenizer::TokenizerManager; + use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager}; use Index; fn make_query_parser() -> QueryParser { let mut schema_builder = SchemaBuilder::default(); + let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("en_with_stop_words") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let text_options = TextOptions::default() + .set_indexing_options(text_field_indexing) + .set_stored(); let title = schema_builder.add_text_field("title", TEXT); let text = schema_builder.add_text_field("text", TEXT); schema_builder.add_i64_field("signed", INT_INDEXED); @@ -468,9 +494,14 @@ mod test { schema_builder.add_text_field("notindexed_u64", STORED); schema_builder.add_text_field("notindexed_i64", STORED); schema_builder.add_text_field("nottokenized", STRING); + schema_builder.add_text_field("with_stop_words", text_options); let schema = schema_builder.build(); let default_fields = vec![title, text]; let tokenizer_manager = TokenizerManager::default(); + tokenizer_manager.register("en_with_stop_words", SimpleTokenizer + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec!["the".to_string()])) + ); QueryParser::new(schema, default_fields, tokenizer_manager) } @@ -739,6 +770,13 @@ mod test { ); } + #[test] + pub fn test_query_parser_not_empty_but_no_tokens() { + let query_parser = make_query_parser(); + assert!(query_parser.parse_query(" !, ").is_ok()); + assert!(query_parser.parse_query("with_stop_words:the").is_ok()); + } + #[test] pub fn test_parse_query_to_ast_conjunction() { test_parse_query_to_logical_ast_helper( diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 65d50575c..ffd6613e3 100644 
--- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -15,6 +15,7 @@ use query::MatchingTerms; use schema::Field; use std::collections::HashMap; use SegmentLocalId; +use error::TantivyError; #[derive(Debug)] pub struct HighlightSection { @@ -79,6 +80,14 @@ const HIGHLIGHTEN_PREFIX: &str = ""; const HIGHLIGHTEN_POSTFIX: &str = ""; impl Snippet { + + pub fn empty() -> Snippet { + Snippet { + fragments: String::new(), + highlighted: Vec::new() + } + } + /// Returns a hignlightned html from the `Snippet`. pub fn to_html(&self) -> String { let mut html = String::new(); @@ -210,42 +219,55 @@ fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[D pub fn generate_snippet( searcher: &Searcher, - field: Field, query: &Query, + field: Field, doc_addresses: &[DocAddress], max_num_chars: usize) -> Result> { let mut doc_address_ords: Vec = (0..doc_addresses.len()).collect(); doc_address_ords.sort_by_key(|k| doc_addresses[*k]); - // TODO sort doc_addresses + let mut snippets = vec![]; let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?; - for doc_address in doc_addresses { + + for &doc_address_ord in &doc_address_ords { + let doc_address = doc_addresses[doc_address_ord]; let segment_ord: u32 = doc_address.segment_ord(); - let doc = searcher.doc(doc_address)?; + let doc = searcher.doc(&doc_address)?; let mut text = String::new(); for value in doc.get_all(field) { text.push_str(value.text()); } + if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&segment_ord) { - if let Some(tokenizer) = searcher.index().tokenizer_for_field(field) { - if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) { - let terms: BTreeMap = terms - .iter() - .map(|(term, score)| (term.text().to_string(), *score)) - .collect(); - let fragment_candidates = search_fragments(tokenizer, - &text, - terms, - max_num_chars); - } + let tokenizer = searcher.index().tokenizer_for_field(field)?; + if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) { + let terms: BTreeMap = terms + .iter() + .map(|(term, score)| (term.text().to_string(), *score)) + .collect(); + let fragment_candidates = search_fragments(tokenizer, + &text, + terms, + max_num_chars); + let snippet = select_best_fragment_combination(fragment_candidates, &text); + snippets.push(snippet); + } else { + snippets.push(Snippet::empty()); } + } else { + } } - // search_fragments(boxed_tokenizer, &text, terms, 3); - panic!("e"); + + // reorder the snippets + for i in 0..doc_addresses.len() { + snippets.swap(i, doc_address_ords[i]); + } + + Ok(snippets) } #[cfg(test)] diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index ebade3978..578678a4a 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -1,4 +1,5 @@ use super::{Token, TokenFilter, TokenStream}; +use std::mem; /// Token filter that lowercase terms. #[derive(Clone)] @@ -15,13 +16,22 @@ where } } -pub struct LowerCaserTokenStream -where - TailTokenStream: TokenStream, -{ +pub struct LowerCaserTokenStream { + buffer: String, tail: TailTokenStream, } +// writes a lowercased version of text into output. +fn to_lowercase_unicode(text: &mut String, output: &mut String) { + output.clear(); + for c in text.chars() { + // Contrary to the std, we do not take care of sigma special case. + // This will have an normalizationo effect, which is ok for search. 
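+        // `char::to_lowercase` can yield more than one char (e.g. 'İ'),
+        // hence `extend` rather than `push`.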
+ output.extend(c.to_lowercase()); + } +} + + impl TokenStream for LowerCaserTokenStream where TailTokenStream: TokenStream, @@ -36,7 +46,14 @@ where fn advance(&mut self) -> bool { if self.tail.advance() { - self.tail.token_mut().text.make_ascii_lowercase(); + if self.token_mut().text.is_ascii() { + // fast track for ascii. + self.token_mut().text.make_ascii_lowercase(); + } else { + to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer); + + mem::swap(&mut self.tail.token_mut().text, &mut self.buffer); + } true } else { false @@ -49,6 +66,43 @@ where TailTokenStream: TokenStream, { fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream { - LowerCaserTokenStream { tail } + LowerCaserTokenStream { + tail, + buffer: String::with_capacity(100) + } } } + +#[cfg(test)] +mod tests { + use tokenizer::Tokenizer; + use tokenizer::LowerCaser; + use tokenizer::TokenStream; + use tokenizer::SimpleTokenizer; + + #[test] + fn test_to_lower_case() { + assert_eq!(lowercase_helper("Русский текст"), + vec!["русский".to_string(), "текст".to_string()]); + } + + fn lowercase_helper(text: &str) -> Vec { + let mut tokens = vec![]; + let mut token_stream = SimpleTokenizer + .filter(LowerCaser) + .token_stream(text); + while token_stream.advance() { + let token_text = token_stream.token().text.clone(); + tokens.push(token_text); + } + tokens + } + + + #[test] + fn test_lowercaser() { + assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]); + assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]); + } + +} \ No newline at end of file
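
For reference, the reworked `generate_snippet` in `src/snippet/mod.rs` above now returns one `Snippet` per requested `DocAddress`, reordered to match the input slice. A rough usage sketch based only on the signature shown in this diff; the `searcher`, `query`, `body` field and `doc_addresses` variables are assumed to come from an ordinary search, and the exact public re-export path of the function is not confirmed here:

    // `query` is a `Box<Query>` coming from the query parser, `doc_addresses`
    // a `Vec<DocAddress>` collected from a search.
    let snippets = generate_snippet(
        &searcher,
        &*query,          // the query whose matching terms get highlighted
        body,             // field whose stored text is used for the fragments
        &doc_addresses,
        150,              // max_num_chars per snippet
    )?;
    for snippet in &snippets {
        // `to_html()` wraps the matching terms with the highlight markers.
        println!("{}", snippet.to_html());
    }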