Merge branch 'issue/368' into issue/368b

Paul Masurel
2018-09-09 14:24:05 +09:00
23 changed files with 519 additions and 170 deletions

View File

@@ -49,6 +49,7 @@ fnv = "1.0.6"
owned-read = "0.4"
failure = "0.1"
htmlescape = "0.3.1"
fail = "0.2"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
@@ -61,12 +62,20 @@ opt-level = 3
debug = false
lto = true
debug-assertions = false
overflow-checks = false
[profile.test]
debug-assertions = true
overflow-checks = true
[features]
default = ["mmap"]
# Fail points are disabled by default via the `no_fail` feature; the test scripts enable them by building with `--no-default-features`.
default = ["mmap", "no_fail"]
mmap = ["fst/mmap", "atomicwrites"]
lz4-compression = ["lz4"]
no_fail = ["fail/no_fail"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }
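
To make the new `fail` dependency concrete, here is a minimal sketch (not part of this commit; the fail point name `write_meta` is hypothetical) of how a `fail-rs` point behaves and how the `no_fail` default feature turns it off:

// Sketch only: illustrates the fail-rs mechanics this commit relies on.
// The fail point name "write_meta" is made up for this example.
#[macro_use]
extern crate fail;

fn write_meta() -> Result<(), String> {
    // Compiles to a no-op when the crate is built with the `no_fail` feature,
    // which is part of tantivy's default features.
    fail_point!("write_meta", |msg| Err(msg.unwrap_or_else(|| "injected failure".to_string())));
    Ok(())
}

#[cfg(all(test, not(feature = "no_fail")))]
mod fail_tests {
    use super::*;

    #[test]
    fn write_meta_can_be_made_to_fail() {
        // Activate the fail point, just like the RAMDirectory test below does.
        fail::cfg("write_meta", "return(disk_full)").unwrap();
        assert!(write_meta().is_err());
        fail::cfg("write_meta", "off").unwrap();
    }
}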

View File

@@ -4,6 +4,7 @@
[![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/master?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
[![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/fulmicoton)
![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)
@@ -77,6 +78,10 @@ To check out and run tests, you can simply run :
cd tantivy
cargo build
## Running tests
Some tests will not run with just `cargo test` because of `fail-rs`.
To run the tests exhaustively, run `./run-tests.sh`.
# Contribute

View File

@@ -18,5 +18,5 @@ install:
build: false
test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
- REM SET RUST_BACKTRACE=1 & cargo build --examples
- REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1
- REM SET RUST_BACKTRACE=1 & cargo build --examples

View File

@@ -16,7 +16,7 @@ main() {
return
fi
echo "Test"
cross test --target $TARGET
cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1
fi
for example in $(ls examples/*.rs)
do

View File

@@ -23,7 +23,6 @@ use tantivy::Index;
fn main() -> tantivy::Result<()> {
// this example assumes you understand the content in `basic_search`
let index_path = TempDir::new("tantivy_stopwords_example_dir")?;
let mut schema_builder = SchemaBuilder::default();
// This configures your custom options for how tantivy will
@@ -31,36 +30,36 @@ fn main() -> tantivy::Result<()> {
// to note is that we are setting the tokenizer to `stoppy`
// which will be defined and registered below.
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("stoppy")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
.set_tokenizer("stoppy")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
.set_indexing_options(text_field_indexing)
.set_stored();
// Our first field is title.
schema_builder.add_text_field("title", text_options);
// Our second field is body.
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("stoppy")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
.set_tokenizer("stoppy")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
.set_indexing_options(text_field_indexing)
.set_stored();
schema_builder.add_text_field("body", text_options);
let schema = schema_builder.build();
let index = Index::create_in_dir(&index_path, schema.clone())?;
let index = Index::create_in_ram(schema.clone());
// This tokenizer lowercases all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = SimpleTokenizer
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec![
"the".to_string(),
"and".to_string(),
]));
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec![
"the".to_string(),
"and".to_string(),
]));
index.tokenizers().register("stoppy", tokenizer);
@@ -76,16 +75,16 @@ fn main() -> tantivy::Result<()> {
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
title => "Frankenstein",
@@ -103,14 +102,9 @@ fn main() -> tantivy::Result<()> {
let query_parser = QueryParser::for_index(&index, vec![title, body]);
// this will have NO hits because it was filtered out
// because the query is run through the analyzer you
// actually will get an error here because the query becomes
// empty
assert!(query_parser.parse_query("the").is_err());
// this will have hits
let query = query_parser.parse_query("is")?;
// stop words are applied on the query as well.
// The following will be equivalent to `title:frankenstein`
let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
let mut top_collector = TopCollector::with_limit(10);
@@ -124,6 +118,4 @@ fn main() -> tantivy::Result<()> {
}
Ok(())
}
use tempdir::TempDir;
}

run-tests.sh Executable file
View File

@@ -0,0 +1,2 @@
#!/bin/bash
cargo test --no-default-features --features mmap -- --test-threads 1

View File

@@ -266,14 +266,14 @@ mod tests {
#[test]
fn test_bitset_large() {
let arr = generate_nonunique_unsorted(1_000_000, 50_000);
let arr = generate_nonunique_unsorted(100_000, 5_000);
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
let mut bitset = BitSet::with_max_value(1_000_000);
let mut bitset = BitSet::with_max_value(100_000);
for el in arr {
btreeset.insert(el);
bitset.insert(el);
}
for i in 0..1_000_000 {
for i in 0..100_000 {
assert_eq!(btreeset.contains(&i), bitset.contains(i));
}
assert_eq!(btreeset.len(), bitset.len());

View File

@@ -7,7 +7,7 @@ use std::fmt;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use Result;
use indexer::LockType;
use super::pool::LeasedItem;
use super::pool::Pool;
use super::segment::create_segment;
@@ -20,11 +20,10 @@ use core::META_FILEPATH;
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use directory::{Directory, RAMDirectory};
use directory::{DirectoryClone, ManagedDirectory};
use directory::{ManagedDirectory};
use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN;
use indexer::segment_updater::save_new_metas;
use indexer::DirectoryLock;
use num_cpus;
use std::path::Path;
use tokenizer::TokenizerManager;
@@ -116,18 +115,28 @@ impl Index {
&self.tokenizers
}
pub fn tokenizer_for_field(&self, field: Field) -> Option<Box<BoxedTokenizer>> {
let field_type = self.schema.get_field_entry(field).field_type();
let tokenizer: &TokenizerManager = self.tokenizers();
pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let tokenizer_manager: &TokenizerManager = self.tokenizers();
let tokenizer_name_opt: Option<Box<BoxedTokenizer>> =
match field_type {
FieldType::Str(text_options) => {
text_options.get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer())
.and_then(|tokenizer_name| tokenizer.get(tokenizer_name))
},
_ => {
None
FieldType::Str(text_options) => {
text_options
.get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
},
_ => {
None
}
};
match tokenizer_name_opt {
Some(tokenizer) => {
Ok(tokenizer)
}
None => {
Err(TantivyError::SchemaError(format!("{:?} is not a text field.", field_entry.name())))
}
}
}
@@ -175,7 +184,8 @@ impl Index {
num_threads: usize,
overall_heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
open_index_writer(
self,
@@ -268,6 +278,7 @@ impl Index {
/// This needs to be called when a new segment has been
/// published or after a merge.
pub fn load_searchers(&self) -> Result<()> {
let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?;
let searchable_segments = self.searchable_segments()?;
let segment_readers: Vec<SegmentReader> = searchable_segments
.iter()
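
For callers, the switch of `tokenizer_for_field` from `Option` to `Result` looks roughly like the following sketch (illustrative only, not part of the diff; the field names are made up):

// Sketch: a missing tokenizer now surfaces as an error instead of a silent None.
use tantivy::schema::{SchemaBuilder, INT_INDEXED, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let body = schema_builder.add_text_field("body", TEXT);
    let count = schema_builder.add_u64_field("count", INT_INDEXED);
    let index = Index::create_in_ram(schema_builder.build());

    // Text field: Ok(boxed tokenizer).
    let _tokenizer = index.tokenizer_for_field(body)?;

    // Non-text field: Err(TantivyError::SchemaError(..)) instead of None.
    assert!(index.tokenizer_for_field(count).is_err());
    Ok(())
}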

View File

@@ -33,10 +33,4 @@ lazy_static! {
/// Removing this file is safe, but will prevent the garbage collection of all of the files that
/// are currently in the directory
pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json");
/// Only one process should be able to write tantivy's index at a time.
/// This file, when present, is in charge of preventing other processes from opening an IndexWriter.
///
/// If the process is killed and this file remains, it is safe to remove it manually.
pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
}

View File

@@ -4,7 +4,6 @@ use core::InvertedIndexReader;
use core::Segment;
use core::SegmentComponent;
use core::SegmentId;
use core::SegmentMeta;
use error::TantivyError;
use fastfield::DeleteBitSet;
use fastfield::FacetReader;
@@ -44,7 +43,8 @@ pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
segment_id: SegmentId,
segment_meta: SegmentMeta,
max_doc: DocId,
num_docs: DocId,
termdict_composite: CompositeFile,
postings_composite: CompositeFile,
@@ -64,7 +64,7 @@ impl SegmentReader {
/// Today, `tantivy` does not handle deletes, so it happens
/// to also be the number of documents in the index.
pub fn max_doc(&self) -> DocId {
self.segment_meta.max_doc()
self.max_doc
}
/// Returns the number of documents.
@@ -73,7 +73,7 @@ impl SegmentReader {
/// Today, `tantivy` does not handle deletes so max doc and
/// num_docs are the same.
pub fn num_docs(&self) -> DocId {
self.segment_meta.num_docs()
self.num_docs
}
/// Returns the schema of the index this segment belongs to.
@@ -225,6 +225,8 @@ impl SegmentReader {
let store_source = segment.open_read(SegmentComponent::STORE)?;
let store_reader = StoreReader::from_source(store_source);
fail_point!("SegmentReader::open#middle");
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
let postings_composite = CompositeFile::open(&postings_source)?;
@@ -260,7 +262,8 @@ impl SegmentReader {
let schema = segment.schema();
Ok(SegmentReader {
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
segment_meta: segment.meta().clone(),
max_doc: segment.meta().max_doc(),
num_docs: segment.meta().num_docs(),
termdict_composite,
postings_composite,
fast_fields_composite,
@@ -432,6 +435,7 @@ mod test {
use schema::{SchemaBuilder, Term, STORED, TEXT};
use DocId;
#[test]
fn test_alive_docs_iterator() {
let mut schema_builder = SchemaBuilder::new();

View File

@@ -12,6 +12,20 @@ use std::sync::RwLockWriteGuard;
use std::sync::{Arc, RwLock};
use Directory;
use Result;
use indexer::LockType;
/// Returns true iff the file is "managed".
/// Non-managed files are not subject to garbage collection.
///
/// Filenames that start with a "." (typically lock files)
/// are not managed.
fn is_managed(path: &Path) -> bool {
path.to_str()
.map(|p_str| !p_str.starts_with("."))
.unwrap_or(true)
}
/// Wrapper of directories that keeps track of files created by Tantivy.
///
@@ -82,25 +96,34 @@ impl ManagedDirectory {
pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
info!("Garbage collect");
let mut files_to_delete = vec![];
// It is crucial to get the living files after acquiring the
// read lock of meta informations. That way, we
// avoid the following scenario.
//
// 1) we get the list of living files.
// 2) someone creates a new file.
// 3) we start garbage collection and remove this file
// even though it is a living file.
//
// releasing the lock as .delete() will use it too.
{
// releasing the lock as .delete() will use it too.
let meta_informations_rlock = self.meta_informations
.read()
.expect("Managed directory rlock poisoned in garbage collect.");
// It is crucial to get the living files after acquiring the
// read lock of meta informations. That way, we
// avoid the following scenario.
//
// 1) we get the list of living files.
// 2) someone creates a new file.
// 3) we start garbage collection and remove this file
// even though it is a living file.
let living_files = get_living_files();
for managed_path in &meta_informations_rlock.managed_paths {
if !living_files.contains(managed_path) {
files_to_delete.push(managed_path.clone());
// The point of this second "file" lock is to prevent the following scenario:
// 1) process B tries to load a new set of searchers.
//    The list of segments is loaded.
// 2) the writer changes meta.json (for instance after a merge or a commit)
// 3) gc kicks in.
// 4) gc removes a file that was useful for process B, before process B opened it.
if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) {
let living_files = get_living_files();
for managed_path in &meta_informations_rlock.managed_paths {
if !living_files.contains(managed_path) {
files_to_delete.push(managed_path.clone());
}
}
}
}
@@ -156,7 +179,15 @@ impl ManagedDirectory {
/// registering the filepath and creating the file
/// will not lead to garbage files that will
/// never get removed.
///
/// Files starting with "." are reserved for locks.
/// They are not managed and are never subject
/// to garbage collection.
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
// Files starting with "." (e.g. lock files) are not managed.
if !is_managed(filepath) {
return Ok(());
}
let mut meta_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned");
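
The rule implemented by `is_managed` above can be summarized with a small standalone sketch (illustrative; it copies the private helper so it runs on its own, and the file names are arbitrary):

// Sketch: only dot-files (the lock files) escape the managed-file registry,
// and therefore the garbage collector.
use std::path::Path;

fn is_managed(path: &Path) -> bool {
    path.to_str()
        .map(|p_str| !p_str.starts_with("."))
        .unwrap_or(true)
}

fn main() {
    assert!(is_managed(Path::new("meta.json")));
    assert!(is_managed(Path::new("some_segment.idx")));
    assert!(!is_managed(Path::new(".tantivy-meta.lock")));
    assert!(!is_managed(Path::new(".tantivy-indexer.lock")));
}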

View File

@@ -173,7 +173,6 @@ impl Directory for RAMDirectory {
let exists = self.fs
.write(path_buf.clone(), &Vec::new())
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
// force the creation of the file to mimic the MMap directory.
if exists {
Err(OpenWriteError::FileAlreadyExists(path_buf))
@@ -196,6 +195,9 @@ impl Directory for RAMDirectory {
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
fail_point!("RAMDirectory::atomic_write", |msg| {
Err(io::Error::new(io::ErrorKind::Other, msg.unwrap_or("Undefined".to_string())))
});
let path_buf = PathBuf::from(path);
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
self.fs.write(path_buf, &Vec::new())?;

View File

@@ -6,6 +6,7 @@ use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteErro
use fastfield::FastFieldNotAvailableError;
use query;
use schema;
use indexer::LockType;
use serde_json;
use std::path::PathBuf;
use std::sync::PoisonError;
@@ -19,6 +20,9 @@ pub enum TantivyError {
/// File already exists, this is a problem when we try to write into a new file.
#[fail(display = "file already exists: '{:?}'", _0)]
FileAlreadyExists(PathBuf),
/// Failed to acquire file lock
#[fail(display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.", _0)]
LockFailure(LockType),
/// IO Error.
#[fail(display = "an IO error occurred: '{}'", _0)]
IOError(#[cause] IOError),
@@ -91,13 +95,14 @@ impl From<schema::DocParsingError> for TantivyError {
}
}
impl From<OpenWriteError> for TantivyError {
fn from(error: OpenWriteError) -> TantivyError {
match error {
OpenWriteError::FileAlreadyExists(filepath) => {
TantivyError::FileAlreadyExists(filepath)
}
OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
OpenWriteError::FileAlreadyExists(filepath) =>
TantivyError::FileAlreadyExists(filepath),
OpenWriteError::IOError(io_error) =>
TantivyError::IOError(io_error),
}.into()
}
}

View File

@@ -370,7 +370,7 @@ mod tests {
pub fn generate_permutation() -> Vec<u64> {
let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
let mut rng = XorShiftRng::from_seed(seed);
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
rng.shuffle(&mut permutation);
permutation
}

View File

@@ -1,26 +1,147 @@
use core::LOCKFILE_FILEPATH;
use directory::error::OpenWriteError;
use Directory;
use TantivyError;
use std::path::{Path, PathBuf};
use std::thread;
use std::time::Duration;
use std::io::Write;
/// The directory lock is a mechanism used to
/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)
///
/// Only one lock can exist at a time for a given directory.
/// The lock is released automatically on `Drop`.
pub struct DirectoryLock {
directory: Box<Directory>,
#[derive(Debug, Clone, Copy)]
pub enum LockType {
/// Only one process should be able to write tantivy's index at a time.
/// This lock file, when present, is in charge of preventing other processes from opening an IndexWriter.
///
/// If the process is killed and this file remains, it is safe to remove it manually.
///
/// Failing to acquire this lock usually means a misuse of tantivy's API
/// (creating more than one instance of the `IndexWriter`), or a spurious
/// lock file remaining after a crash. In the latter case, removing the file is
/// safe after checking that no process running tantivy is still alive.
IndexWriterLock,
/// The meta lock file is here to protect the segment files being opened by
/// `.load_searchers()` from being garbage collected.
/// It makes it possible for another process to safely consume
/// our index while it is being written. Ideally, we might have preferred `RwLock`
/// semantics here, but that is difficult to achieve on Windows.
///
/// Opening segment readers is a very fast process.
/// Right now, if the lock cannot be acquired on the first attempt, the logic
/// is very simplistic: we retry every `100ms` until we eventually
/// acquire the lock.
/// This lock should not have much contention in normal usage.
MetaLock
}
impl DirectoryLock {
pub fn lock(mut directory: Box<Directory>) -> Result<DirectoryLock, OpenWriteError> {
directory.open_write(&*LOCKFILE_FILEPATH)?;
Ok(DirectoryLock { directory })
/// The retry logic for acquiring locks is pretty simple.
/// We just retry `n` times, waiting a given `duration` between attempts, both
/// depending on the type of lock.
struct RetryPolicy {
num_retries: usize,
wait_in_ms: u64,
}
impl RetryPolicy {
fn no_retry() -> RetryPolicy {
RetryPolicy {
num_retries: 0,
wait_in_ms: 0,
}
}
fn wait_and_retry(&mut self) -> bool {
if self.num_retries == 0 {
false
} else {
self.num_retries -= 1;
let wait_duration = Duration::from_millis(self.wait_in_ms);
thread::sleep(wait_duration);
true
}
}
}
impl LockType {
fn retry_policy(&self) -> RetryPolicy {
match *self {
LockType::IndexWriterLock =>
RetryPolicy::no_retry(),
LockType::MetaLock =>
RetryPolicy {
num_retries: 100,
wait_in_ms: 100,
}
}
}
fn try_acquire_lock(&self, directory: &mut Directory) -> Result<DirectoryLock, TantivyError> {
let path = self.filename();
let mut write = directory
.open_write(path)
.map_err(|e|
match e {
OpenWriteError::FileAlreadyExists(_) =>
TantivyError::LockFailure(*self),
OpenWriteError::IOError(io_error) =>
TantivyError::IOError(io_error),
})?;
write.flush()?;
Ok(DirectoryLock {
directory: directory.box_clone(),
path: path.to_owned(),
})
}
/// Acquire a lock in the given directory.
pub fn acquire_lock(&self, directory: &Directory) -> Result<DirectoryLock, TantivyError> {
let mut box_directory = directory.box_clone();
let mut retry_policy = self.retry_policy();
loop {
let lock_result = self.try_acquire_lock(&mut *box_directory);
match lock_result {
Ok(result) => {
return Ok(result);
}
Err(TantivyError::LockFailure(ref filepath)) => {
if !retry_policy.wait_and_retry() {
return Err(TantivyError::LockFailure(filepath.to_owned()));
}
}
Err(_) => {
}
}
}
}
fn filename(&self) -> &Path {
match *self {
LockType::MetaLock => {
Path::new(".tantivy-meta.lock")
}
LockType::IndexWriterLock => {
Path::new(".tantivy-indexer.lock")
}
}
}
}
/// The `DirectoryLock` is an object that represents a file lock.
/// See [`LockType`](enum.LockType.html).
///
/// It is transparently associated with a lock file that gets deleted
/// on `Drop`: the lock is released automatically when the `DirectoryLock` is dropped.
pub struct DirectoryLock {
directory: Box<Directory>,
path: PathBuf,
}
impl Drop for DirectoryLock {
fn drop(&mut self) {
if let Err(e) = self.directory.delete(&*LOCKFILE_FILEPATH) {
if let Err(e) = self.directory.delete(&*self.path) {
error!("Failed to remove the lock file. {:?}", e);
}
}
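
Putting the two lock types together, the intended crate-internal usage (as wired into `Index::writer` and `Index::load_searchers` above) looks roughly like this sketch; the helper function itself is hypothetical:

// Sketch (crate-internal, hypothetical helper): how the two lock types are meant to be used.
use indexer::LockType;
use Directory;
use Result;

fn exclusive_then_meta(directory: &Directory) -> Result<()> {
    // The writer lock uses RetryPolicy::no_retry(): a second IndexWriter fails
    // immediately with TantivyError::LockFailure(LockType::IndexWriterLock).
    let _writer_lock = LockType::IndexWriterLock.acquire_lock(directory)?;

    // The meta lock retries every 100ms (up to 100 times), so that
    // load_searchers() and garbage collection briefly exclude each other.
    let _meta_lock = LockType::MetaLock.acquire_lock(directory)?;

    // Both ".tantivy-indexer.lock" and ".tantivy-meta.lock" are deleted here,
    // when the DirectoryLock guards go out of scope.
    Ok(())
}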

View File

@@ -301,25 +301,31 @@ fn index_documents(
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
&doc_to_opstamps,
last_docstamp,
)?;
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
if may_have_deletes {
Some(deleted_bitset)
} else {
None
}
});
let segment_entry: SegmentEntry;
if delete_cursor.get().is_some() {
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
&doc_to_opstamps,
last_docstamp,
)?;
segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
if may_have_deletes {
Some(deleted_bitset)
} else {
None
}
});
} else {
// if there are no delete operations in the queue, there is no need
// to even open the segment.
segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
}
Ok(segment_updater.add_segment(generation, segment_entry))
}
@@ -657,11 +663,26 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer(40_000_000).unwrap();
match index.writer(40_000_000) {
Err(TantivyError::FileAlreadyExists(_)) => {}
Err(TantivyError::LockFailure(_)) => {}
_ => panic!("Expected FileAlreadyExists error"),
}
}
#[test]
fn test_lockfile_already_exists_error_msg() {
let schema_builder = schema::SchemaBuilder::default();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
match index.writer_with_num_threads(1, 3_000_000) {
Err(err) => {
let err_msg = err.to_string();
assert!(err_msg.contains("Lockfile"));
assert!(err_msg.contains("Possible causes:"))
},
_ => panic!("Expected LockfileAlreadyExists error"),
}
}
#[test]
fn test_set_merge_policy() {
let schema_builder = schema::SchemaBuilder::default();
@@ -843,4 +864,33 @@ mod tests {
assert_eq!(initial_table_size(1_000_000_000), 19);
}
#[cfg(not(feature="no_fail"))]
#[test]
fn test_write_commit_fails() {
use fail;
let mut schema_builder = schema::SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for _ in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
}
index_writer.commit().unwrap();
fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
for _ in 0..100 {
index_writer.add_document(doc!(text_field => "b"));
}
assert!(index_writer.commit().is_err());
index.load_searchers().unwrap();
let num_docs_containing = |s: &str| {
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, s);
searcher.doc_freq(&term_a)
};
assert_eq!(num_docs_containing("a"), 100);
assert_eq!(num_docs_containing("b"), 0);
fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
}
}

View File

@@ -16,6 +16,8 @@ mod segment_writer;
mod stamper;
pub(crate) use self::directory_lock::DirectoryLock;
pub use self::directory_lock::LockType;
pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};

View File

@@ -1,7 +1,7 @@
use super::segment_register::SegmentRegister;
use core::SegmentId;
use core::SegmentMeta;
use core::{LOCKFILE_FILEPATH, META_FILEPATH};
use core::META_FILEPATH;
use error::TantivyError;
use indexer::delete_queue::DeleteCursor;
use indexer::SegmentEntry;
@@ -78,10 +78,13 @@ impl SegmentManager {
registers_lock.committed.len() + registers_lock.uncommitted.len()
}
/// List the files that are useful to the index.
///
/// This does not include lock files, or files that are obsolete
/// but have not yet been deleted by the garbage collector.
pub fn list_files(&self) -> HashSet<PathBuf> {
let mut files = HashSet::new();
files.insert(META_FILEPATH.clone());
files.insert(LOCKFILE_FILEPATH.clone());
for segment_meta in SegmentMeta::all() {
files.extend(segment_meta.list_files());
}

View File

@@ -174,6 +174,9 @@ extern crate tinysegmenter;
#[macro_use]
extern crate downcast;
#[macro_use]
extern crate fail;
#[cfg(test)]
mod functional_test;
@@ -949,3 +952,4 @@ mod tests {
}
}
}

View File

@@ -98,7 +98,7 @@ impl MultiFieldPostingsWriter {
.iter()
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
.collect();
term_offsets.sort_by_key(|&(k, _, _)| k);
term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
let mut offsets: Vec<(Field, usize)> = vec![];
let term_offsets_it = term_offsets

View File

@@ -20,6 +20,7 @@ use std::str::FromStr;
use tokenizer::TokenizerManager;
use combine::Parser;
use query::EmptyQuery;
use query::query_parser::logical_ast::LogicalAST;
/// Possible error that may happen when parsing a query.
@@ -58,6 +59,27 @@ impl From<ParseIntError> for QueryParserError {
}
}
/// Recursively removes empty clauses from the AST.
///
/// Returns `None` iff the `logical_ast` ended up being empty.
fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
match logical_ast {
LogicalAST::Clause(children) => {
let trimmed_children = children.into_iter()
.flat_map(|(occur, child)|
trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) )
.collect::<Vec<_>>();
if trimmed_children.is_empty() {
None
} else {
Some(LogicalAST::Clause(trimmed_children))
}
},
_ => Some(logical_ast),
}
}
/// Tantivy's Query parser
///
/// The language covered by the current parser is extremely simple.
@@ -369,14 +391,15 @@ impl QueryParser {
asts.push(LogicalAST::Leaf(Box::new(ast)));
}
}
let result_ast = if asts.is_empty() {
// this should never happen
return Err(QueryParserError::SyntaxError);
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
};
let result_ast: LogicalAST =
if asts.len() == 1 {
asts.into_iter().next().unwrap()
} else {
LogicalAST::Clause(
asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
Ok(result_ast)
}
UserInputLeaf::All => {
@@ -429,19 +452,17 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
}
fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
match logical_ast {
LogicalAST::Clause(clause) => {
if clause.is_empty() {
Box::new(EmptyQuery)
} else {
let occur_subqueries = clause
.into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
.collect::<Vec<_>>();
Box::new(BooleanQuery::from(occur_subqueries))
}
}
LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal),
match trim_ast(logical_ast) {
Some(LogicalAST::Clause(trimmed_clause)) => {
let occur_subqueries = trimmed_clause
.into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
.collect::<Vec<_>>();
assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming");
Box::new(BooleanQuery::from(occur_subqueries))
},
Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal),
None => Box::new(EmptyQuery)
}
}
@@ -454,12 +475,17 @@ mod test {
use schema::Field;
use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT};
use tokenizer::SimpleTokenizer;
use tokenizer::TokenizerManager;
use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager};
use Index;
fn make_query_parser() -> QueryParser {
let mut schema_builder = SchemaBuilder::default();
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("en_with_stop_words")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
let title = schema_builder.add_text_field("title", TEXT);
let text = schema_builder.add_text_field("text", TEXT);
schema_builder.add_i64_field("signed", INT_INDEXED);
@@ -468,9 +494,14 @@ mod test {
schema_builder.add_text_field("notindexed_u64", STORED);
schema_builder.add_text_field("notindexed_i64", STORED);
schema_builder.add_text_field("nottokenized", STRING);
schema_builder.add_text_field("with_stop_words", text_options);
let schema = schema_builder.build();
let default_fields = vec![title, text];
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register("en_with_stop_words", SimpleTokenizer
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec!["the".to_string()]))
);
QueryParser::new(schema, default_fields, tokenizer_manager)
}
@@ -739,6 +770,13 @@ mod test {
);
}
#[test]
pub fn test_query_parser_not_empty_but_no_tokens() {
let query_parser = make_query_parser();
assert!(query_parser.parse_query(" !, ").is_ok());
assert!(query_parser.parse_query("with_stop_words:the").is_ok());
}
#[test]
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper(

View File

@@ -15,6 +15,7 @@ use query::MatchingTerms;
use schema::Field;
use std::collections::HashMap;
use SegmentLocalId;
use error::TantivyError;
#[derive(Debug)]
pub struct HighlightSection {
@@ -79,6 +80,14 @@ const HIGHLIGHTEN_PREFIX: &str = "<b>";
const HIGHLIGHTEN_POSTFIX: &str = "</b>";
impl Snippet {
pub fn empty() -> Snippet {
Snippet {
fragments: String::new(),
highlighted: Vec::new()
}
}
/// Returns a highlighted HTML string from the `Snippet`.
pub fn to_html(&self) -> String {
let mut html = String::new();
@@ -210,42 +219,55 @@ fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[D
pub fn generate_snippet(
searcher: &Searcher,
field: Field,
query: &Query,
field: Field,
doc_addresses: &[DocAddress],
max_num_chars: usize) -> Result<Vec<Snippet>> {
let mut doc_address_ords: Vec<usize> = (0..doc_addresses.len()).collect();
doc_address_ords.sort_by_key(|k| doc_addresses[*k]);
// TODO sort doc_addresses
let mut snippets = vec![];
let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?;
for doc_address in doc_addresses {
for &doc_address_ord in &doc_address_ords {
let doc_address = doc_addresses[doc_address_ord];
let segment_ord: u32 = doc_address.segment_ord();
let doc = searcher.doc(doc_address)?;
let doc = searcher.doc(&doc_address)?;
let mut text = String::new();
for value in doc.get_all(field) {
text.push_str(value.text());
}
if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&segment_ord) {
if let Some(tokenizer) = searcher.index().tokenizer_for_field(field) {
if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) {
let terms: BTreeMap<String, f32> = terms
.iter()
.map(|(term, score)| (term.text().to_string(), *score))
.collect();
let fragment_candidates = search_fragments(tokenizer,
&text,
terms,
max_num_chars);
}
let tokenizer = searcher.index().tokenizer_for_field(field)?;
if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) {
let terms: BTreeMap<String, f32> = terms
.iter()
.map(|(term, score)| (term.text().to_string(), *score))
.collect();
let fragment_candidates = search_fragments(tokenizer,
&text,
terms,
max_num_chars);
let snippet = select_best_fragment_combination(fragment_candidates, &text);
snippets.push(snippet);
} else {
snippets.push(Snippet::empty());
}
} else {
}
}
// search_fragments(boxed_tokenizer, &text, terms, 3);
panic!("e");
// reorder the snippets
for i in 0..doc_addresses.len() {
snippets.swap(i, doc_address_ords[i]);
}
Ok(snippets)
}
#[cfg(test)]

View File

@@ -1,4 +1,5 @@
use super::{Token, TokenFilter, TokenStream};
use std::mem;
/// Token filter that lowercase terms.
#[derive(Clone)]
@@ -15,13 +16,22 @@ where
}
}
pub struct LowerCaserTokenStream<TailTokenStream>
where
TailTokenStream: TokenStream,
{
pub struct LowerCaserTokenStream<TailTokenStream> {
buffer: String,
tail: TailTokenStream,
}
// writes a lowercased version of text into output.
fn to_lowercase_unicode(text: &mut String, output: &mut String) {
output.clear();
for c in text.chars() {
// Contrary to the std, we do not take care of the sigma special case.
// This has a normalizing effect, which is OK for search.
output.extend(c.to_lowercase());
}
}
impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
where
TailTokenStream: TokenStream,
@@ -36,7 +46,14 @@ where
fn advance(&mut self) -> bool {
if self.tail.advance() {
self.tail.token_mut().text.make_ascii_lowercase();
if self.token_mut().text.is_ascii() {
// fast track for ascii.
self.token_mut().text.make_ascii_lowercase();
} else {
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
}
true
} else {
false
@@ -49,6 +66,43 @@ where
TailTokenStream: TokenStream,
{
fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
LowerCaserTokenStream { tail }
LowerCaserTokenStream {
tail,
buffer: String::with_capacity(100)
}
}
}
#[cfg(test)]
mod tests {
use tokenizer::Tokenizer;
use tokenizer::LowerCaser;
use tokenizer::TokenStream;
use tokenizer::SimpleTokenizer;
#[test]
fn test_to_lower_case() {
assert_eq!(lowercase_helper("Русский текст"),
vec!["русский".to_string(), "текст".to_string()]);
}
fn lowercase_helper(text: &str) -> Vec<String> {
let mut tokens = vec![];
let mut token_stream = SimpleTokenizer
.filter(LowerCaser)
.token_stream(text);
while token_stream.advance() {
let token_text = token_stream.token().text.clone();
tokens.push(token_text);
}
tokens
}
#[test]
fn test_lowercaser() {
assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
}
}