Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-08 18:12:55 +00:00

Merge branch 'issue/368' into issue/368b

Cargo.toml (11 lines changed)
@@ -49,6 +49,7 @@ fnv = "1.0.6"
 owned-read = "0.4"
 failure = "0.1"
 htmlescape = "0.3.1"
+fail = "0.2"

 [target.'cfg(windows)'.dependencies]
 winapi = "0.2"

@@ -61,12 +62,20 @@ opt-level = 3
 debug = false
 lto = true
 debug-assertions = false
 overflow-checks = false

+[profile.test]
+debug-assertions = true
+overflow-checks = true

 [features]
-default = ["mmap"]
+# by default no-fail is disabled. We manually enable it when running test.
+default = ["mmap", "no_fail"]
 mmap = ["fst/mmap", "atomicwrites"]
 lz4-compression = ["lz4"]
+no_fail = ["fail/no_fail"]

 [badges]
 travis-ci = { repository = "tantivy-search/tantivy" }
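The new `fail` dependency and the `no_fail` feature work together: fail points are compiled into the crate, and the `no_fail` default feature turns them back into no-ops for normal builds. A minimal sketch of the mechanism, assuming a crate with `fail = "0.2"` as a dependency; the function and fail-point names here are illustrative, and the macro usage mirrors the `fail_point!` calls added further down in this commit:

    #[macro_use]
    extern crate fail;

    use std::io;

    fn save_metas() -> io::Result<()> {
        // With the `no_fail` feature enabled (the default), this macro compiles
        // down to nothing. When fail points are compiled in, a test can arm it
        // with `fail::cfg("save_metas", "return(boom)")` to force an error here.
        fail_point!("save_metas", |msg| {
            Err(io::Error::new(
                io::ErrorKind::Other,
                msg.unwrap_or_else(|| "injected failure".to_string()),
            ))
        });
        Ok(())
    }

    fn main() {
        // Without any fail::cfg call, the fail point is a no-op and this succeeds.
        assert!(save_metas().is_ok());
    }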
@@ -4,6 +4,7 @@
 [](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 [](https://opensource.org/licenses/MIT)
 [](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
 [](https://saythanks.io/to/fulmicoton)

 

@@ -77,6 +78,10 @@ To check out and run tests, you can simply run :
     cd tantivy
     cargo build

+## Running tests
+
+Some tests will not run with just `cargo test` because of `fail-rs`.
+To run the tests exhaustively, run `./run-tests.sh`.
+
 # Contribute
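The tests that `cargo test` skips are the new fail-point tests: they are only compiled when the crate is built without the `no_fail` default feature, which is exactly what `./run-tests.sh` does. The commit-failure test added later in this commit is gated like this (body elided):

    #[cfg(not(feature = "no_fail"))]
    #[test]
    fn test_write_commit_fails() {
        // only compiled when `no_fail` is disabled, e.g. by
        // `cargo test --no-default-features --features mmap`
    }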
@@ -18,5 +18,5 @@ install:
 build: false

 test_script:
-  - REM SET RUST_LOG=tantivy,test & cargo test --verbose
+  - REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1
   - REM SET RUST_BACKTRACE=1 & cargo build --examples
@@ -16,7 +16,7 @@ main() {
         return
     fi
     echo "Test"
-    cross test --target $TARGET
+    cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1
     fi
     for example in $(ls examples/*.rs)
     do
@@ -23,7 +23,6 @@ use tantivy::Index;

 fn main() -> tantivy::Result<()> {
     // this example assumes you understand the content in `basic_search`
-    let index_path = TempDir::new("tantivy_stopwords_example_dir")?;
     let mut schema_builder = SchemaBuilder::default();

     // This configures your custom options for how tantivy will

@@ -31,36 +30,36 @@ fn main() -> tantivy::Result<()> {
     // to note is that we are setting the tokenizer to `stoppy`
     // which will be defined and registered below.
     let text_field_indexing = TextFieldIndexing::default()
         .set_tokenizer("stoppy")
         .set_index_option(IndexRecordOption::WithFreqsAndPositions);
     let text_options = TextOptions::default()
         .set_indexing_options(text_field_indexing)
         .set_stored();

     // Our first field is title.
     schema_builder.add_text_field("title", text_options);

     // Our second field is body.
     let text_field_indexing = TextFieldIndexing::default()
         .set_tokenizer("stoppy")
         .set_index_option(IndexRecordOption::WithFreqsAndPositions);
     let text_options = TextOptions::default()
         .set_indexing_options(text_field_indexing)
         .set_stored();
     schema_builder.add_text_field("body", text_options);

     let schema = schema_builder.build();

-    let index = Index::create_in_dir(&index_path, schema.clone())?;
+    let index = Index::create_in_ram(schema.clone());

     // This tokenizer lowers all of the text (to help with stop word matching)
     // then removes all instances of `the` and `and` from the corpus
     let tokenizer = SimpleTokenizer
         .filter(LowerCaser)
         .filter(StopWordFilter::remove(vec![
             "the".to_string(),
             "and".to_string(),
         ]));

     index.tokenizers().register("stoppy", tokenizer);

@@ -76,16 +75,16 @@ fn main() -> tantivy::Result<()> {
     ));

     index_writer.add_document(doc!(
         title => "Of Mice and Men",
         body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                  bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                  over the yellow sands in the sunlight before reaching the narrow pool. On one \
                  side of the river the golden foothill slopes curve up to the strong and rocky \
                  Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                  fresh and green with every spring, carrying in their lower leaf junctures the \
                  debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                  limbs and branches that arch over the pool"
     ));

     index_writer.add_document(doc!(
         title => "Frankenstein",

@@ -103,14 +102,9 @@ fn main() -> tantivy::Result<()> {

     let query_parser = QueryParser::for_index(&index, vec![title, body]);

-    // this will have NO hits because it was filtered out
-    // because the query is run through the analyzer you
-    // actually will get an error here because the query becomes
-    // empty
-    assert!(query_parser.parse_query("the").is_err());
-
-    // this will have hits
-    let query = query_parser.parse_query("is")?;
+    // stop words are applied on the query as well.
+    // The following will be equivalent to `title:frankenstein`
+    let query = query_parser.parse_query("title:\"the Frankenstein\"")?;

     let mut top_collector = TopCollector::with_limit(10);

@@ -124,6 +118,4 @@ fn main() -> tantivy::Result<()> {
     }

     Ok(())
 }
-use tempdir::TempDir;
-}
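One behavioural consequence of this commit shows up in the hunk above: a query made up exclusively of stop words used to be a parse error, and now parses into a query that simply matches nothing. A small sketch of the new behaviour, reusing the fields and the `stoppy` tokenizer from the example (so only meaningful in that context):

    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    // "the" is removed by the `stoppy` tokenizer, so no terms remain.
    // Previously this returned an error; it now returns an (empty) query.
    assert!(query_parser.parse_query("the").is_ok());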
run-tests.sh (new executable file, 2 lines)

@@ -0,0 +1,2 @@
+#!/bin/bash
+cargo test --no-default-features --features mmap -- --test-threads 1
@@ -266,14 +266,14 @@ mod tests {

     #[test]
     fn test_bitset_large() {
-        let arr = generate_nonunique_unsorted(1_000_000, 50_000);
+        let arr = generate_nonunique_unsorted(100_000, 5_000);
         let mut btreeset: BTreeSet<u32> = BTreeSet::new();
-        let mut bitset = BitSet::with_max_value(1_000_000);
+        let mut bitset = BitSet::with_max_value(100_000);
         for el in arr {
             btreeset.insert(el);
             bitset.insert(el);
         }
-        for i in 0..1_000_000 {
+        for i in 0..100_000 {
             assert_eq!(btreeset.contains(&i), bitset.contains(i));
         }
         assert_eq!(btreeset.len(), bitset.len());
@@ -7,7 +7,7 @@ use std::fmt;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
 use Result;

+use indexer::LockType;
 use super::pool::LeasedItem;
 use super::pool::Pool;
 use super::segment::create_segment;

@@ -20,11 +20,10 @@ use core::META_FILEPATH;
 #[cfg(feature = "mmap")]
 use directory::MmapDirectory;
 use directory::{Directory, RAMDirectory};
-use directory::{DirectoryClone, ManagedDirectory};
+use directory::{ManagedDirectory};
 use indexer::index_writer::open_index_writer;
 use indexer::index_writer::HEAP_SIZE_MIN;
 use indexer::segment_updater::save_new_metas;
-use indexer::DirectoryLock;
 use num_cpus;
 use std::path::Path;
 use tokenizer::TokenizerManager;
@@ -116,18 +115,28 @@ impl Index {
         &self.tokenizers
     }

-    pub fn tokenizer_for_field(&self, field: Field) -> Option<Box<BoxedTokenizer>> {
-        let field_type = self.schema.get_field_entry(field).field_type();
-        let tokenizer: &TokenizerManager = self.tokenizers();
-        match field_type {
-            FieldType::Str(text_options) => {
-                text_options.get_indexing_options()
-                    .map(|text_indexing_options| text_indexing_options.tokenizer())
-                    .and_then(|tokenizer_name| tokenizer.get(tokenizer_name))
-            },
-            _ => {
-                None
-            }
-        }
-    }
+    pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> {
+        let field_entry = self.schema.get_field_entry(field);
+        let field_type = field_entry.field_type();
+        let tokenizer_manager: &TokenizerManager = self.tokenizers();
+        let tokenizer_name_opt: Option<Box<BoxedTokenizer>> =
+            match field_type {
+                FieldType::Str(text_options) => {
+                    text_options
+                        .get_indexing_options()
+                        .map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
+                        .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
+                },
+                _ => {
+                    None
+                }
+            };
+        match tokenizer_name_opt {
+            Some(tokenizer) => {
+                Ok(tokenizer)
+            }
+            None => {
+                Err(TantivyError::SchemaError(format!("{:?} is not a text field.", field_entry.name())))
+            }
+        }
+    }
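Since the lookup is now fallible instead of returning `None`, call sites get an explicit error for non-text fields. A hypothetical caller (the field variable is illustrative) would look like:

    // The lookup now surfaces a SchemaError for non-text fields
    // instead of silently returning None.
    match index.tokenizer_for_field(body_field) {
        Ok(tokenizer) => { /* e.g. feed it to the snippet generator */ }
        Err(err) => eprintln!("cannot tokenize this field: {:?}", err),
    }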
@@ -175,7 +184,8 @@ impl Index {
         num_threads: usize,
         overall_heap_size_in_bytes: usize,
     ) -> Result<IndexWriter> {
-        let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
+        let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;
         let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
         open_index_writer(
             self,

@@ -268,6 +278,7 @@ impl Index {
     /// This needs to be called when a new segment has been
     /// published or after a merge.
     pub fn load_searchers(&self) -> Result<()> {
+        let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?;
         let searchable_segments = self.searchable_segments()?;
         let segment_readers: Vec<SegmentReader> = searchable_segments
             .iter()
@@ -33,10 +33,4 @@ lazy_static! {
     /// Removing this file is safe, but will prevent the garbage collection of all of the file that
     /// are currently in the directory
     pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json");
-
-    /// Only one process should be able to write tantivy's index at a time.
-    /// This file, when present, is in charge of preventing other processes to open an IndexWriter.
-    ///
-    /// If the process is killed and this file remains, it is safe to remove it manually.
-    pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
 }
@@ -4,7 +4,6 @@ use core::InvertedIndexReader;
 use core::Segment;
 use core::SegmentComponent;
 use core::SegmentId;
 use core::SegmentMeta;
 use error::TantivyError;
 use fastfield::DeleteBitSet;
 use fastfield::FacetReader;

@@ -44,7 +43,8 @@ pub struct SegmentReader {
     inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,

     segment_id: SegmentId,
-    segment_meta: SegmentMeta,
+    max_doc: DocId,
+    num_docs: DocId,

     termdict_composite: CompositeFile,
     postings_composite: CompositeFile,

@@ -64,7 +64,7 @@ impl SegmentReader {
     /// Today, `tantivy` does not handle deletes, so it happens
     /// to also be the number of documents in the index.
     pub fn max_doc(&self) -> DocId {
-        self.segment_meta.max_doc()
+        self.max_doc
     }

     /// Returns the number of documents.

@@ -73,7 +73,7 @@ impl SegmentReader {
     /// Today, `tantivy` does not handle deletes so max doc and
     /// num_docs are the same.
     pub fn num_docs(&self) -> DocId {
-        self.segment_meta.num_docs()
+        self.num_docs
     }

     /// Returns the schema of the index this segment belongs to.

@@ -225,6 +225,8 @@ impl SegmentReader {
         let store_source = segment.open_read(SegmentComponent::STORE)?;
         let store_reader = StoreReader::from_source(store_source);

+        fail_point!("SegmentReader::open#middle");
+
         let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
         let postings_composite = CompositeFile::open(&postings_source)?;

@@ -260,7 +262,8 @@ impl SegmentReader {
         let schema = segment.schema();
         Ok(SegmentReader {
             inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
-            segment_meta: segment.meta().clone(),
+            max_doc: segment.meta().max_doc(),
+            num_docs: segment.meta().num_docs(),
             termdict_composite,
             postings_composite,
             fast_fields_composite,

@@ -432,6 +435,7 @@ mod test {
     use schema::{SchemaBuilder, Term, STORED, TEXT};
     use DocId;

     #[test]
     fn test_alive_docs_iterator() {
         let mut schema_builder = SchemaBuilder::new();
@@ -12,6 +12,20 @@ use std::sync::RwLockWriteGuard;
 use std::sync::{Arc, RwLock};
 use Directory;
 use Result;
+use indexer::LockType;
+
+/// Returns true iff the file is "managed".
+/// Non-managed file are not subject to garbage collection.
+///
+/// Filenames that starts by a "." -typically locks-
+/// are not managed.
+fn is_managed(path: &Path) -> bool {
+    path.to_str()
+        .map(|p_str| !p_str.starts_with("."))
+        .unwrap_or(true)
+}

 /// Wrapper of directories that keeps track of files created by Tantivy.
 ///
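In practice this means lock files are invisible to garbage collection while regular index files stay managed. For instance (file names purely illustrative):

    // Dot-files such as lock files are never garbage collected.
    assert!(!is_managed(Path::new(".tantivy-meta.lock")));
    assert!(is_managed(Path::new("some_segment.idx")));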
@@ -82,25 +96,34 @@ impl ManagedDirectory {
     pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
         info!("Garbage collect");
         let mut files_to_delete = vec![];
-
-        // It is crucial to get the living files after acquiring the
-        // read lock of meta informations. That way, we
-        // avoid the following scenario.
-        //
-        // 1) we get the list of living files.
-        // 2) someone creates a new file.
-        // 3) we start garbage collection and remove this file
-        // even though it is a living file.
-        //
-        // releasing the lock as .delete() will use it too.
         {
+            // releasing the lock as .delete() will use it too.
             let meta_informations_rlock = self.meta_informations
                 .read()
                 .expect("Managed directory rlock poisoned in garbage collect.");
-            let living_files = get_living_files();
-            for managed_path in &meta_informations_rlock.managed_paths {
-                if !living_files.contains(managed_path) {
-                    files_to_delete.push(managed_path.clone());
+
+            // It is crucial to get the living files after acquiring the
+            // read lock of meta informations. That way, we
+            // avoid the following scenario.
+            //
+            // 1) we get the list of living files.
+            // 2) someone creates a new file.
+            // 3) we start garbage collection and remove this file
+            // even though it is a living file.
+
+            // The point of this second "file" lock is to enforce the following scenario
+            // 1) process B tries to load a new set of searcher.
+            //    The list of segments is loaded
+            // 2) writer change meta.json (for instance after a merge or a commit)
+            // 3) gc kicks in.
+            // 4) gc removes a file that was useful for process B, before process B opened it.
+            if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) {
+                let living_files = get_living_files();
+                for managed_path in &meta_informations_rlock.managed_paths {
+                    if !living_files.contains(managed_path) {
+                        files_to_delete.push(managed_path.clone());
+                    }
                 }
             }
         }

@@ -156,7 +179,15 @@ impl ManagedDirectory {
     /// registering the filepath and creating the file
     /// will not lead to garbage files that will
     /// never get removed.
     ///
+    /// File starting by "." are reserved to locks.
+    /// They are not managed and cannot be subjected
+    /// to garbage collection.
     fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
+        // Files starting by "." (e.g. lock files) are not managed.
+        if !is_managed(filepath) {
+            return Ok(());
+        }
         let mut meta_wlock = self.meta_informations
             .write()
             .expect("Managed file lock poisoned");
@@ -173,7 +173,6 @@ impl Directory for RAMDirectory {
         let exists = self.fs
             .write(path_buf.clone(), &Vec::new())
            .map_err(|err| IOError::with_path(path.to_owned(), err))?;

         // force the creation of the file to mimic the MMap directory.
         if exists {
             Err(OpenWriteError::FileAlreadyExists(path_buf))

@@ -196,6 +195,9 @@ impl Directory for RAMDirectory {
     }

     fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
+        fail_point!("RAMDirectory::atomic_write", |msg| {
+            Err(io::Error::new(io::ErrorKind::Other, msg.unwrap_or("Undefined".to_string())))
+        });
         let path_buf = PathBuf::from(path);
         let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
         self.fs.write(path_buf, &Vec::new())?;
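This fail point stays inert unless a test arms it. The new test quoted further down toggles it like this:

    // Make every atomic_write on the RAMDirectory fail...
    fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
    // ...and switch the failure injection off again afterwards.
    fail::cfg("RAMDirectory::atomic_write", "off").unwrap();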
src/error.rs (13 lines changed)

@@ -6,6 +6,7 @@ use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteErro
 use fastfield::FastFieldNotAvailableError;
 use query;
 use schema;
+use indexer::LockType;
 use serde_json;
 use std::path::PathBuf;
 use std::sync::PoisonError;

@@ -19,6 +20,9 @@ pub enum TantivyError {
     /// File already exists, this is a problem when we try to write into a new file.
     #[fail(display = "file already exists: '{:?}'", _0)]
     FileAlreadyExists(PathBuf),
+    /// Failed to acquire file lock
+    #[fail(display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.", _0)]
+    LockFailure(LockType),
     /// IO Error.
     #[fail(display = "an IO error occurred: '{}'", _0)]
     IOError(#[cause] IOError),

@@ -91,13 +95,14 @@ impl From<schema::DocParsingError> for TantivyError {
     }
 }

 impl From<OpenWriteError> for TantivyError {
     fn from(error: OpenWriteError) -> TantivyError {
         match error {
-            OpenWriteError::FileAlreadyExists(filepath) => {
-                TantivyError::FileAlreadyExists(filepath)
-            }
-            OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
+            OpenWriteError::FileAlreadyExists(filepath) =>
+                TantivyError::FileAlreadyExists(filepath),
+            OpenWriteError::IOError(io_error) =>
+                TantivyError::IOError(io_error),
         }.into()
     }
 }
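The new LockFailure variant is what callers observe when a second IndexWriter is opened on the same directory. A sketch of the handling, modeled on the updated index-writer test further down:

    // Opening a second writer on the same directory now surfaces
    // the new LockFailure variant.
    match index.writer(40_000_000) {
        Err(TantivyError::LockFailure(_)) => { /* expected */ }
        _ => panic!("expected a lock failure"),
    }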
@@ -370,7 +370,7 @@ mod tests {
     pub fn generate_permutation() -> Vec<u64> {
         let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
         let mut rng = XorShiftRng::from_seed(seed);
-        let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
+        let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
         rng.shuffle(&mut permutation);
         permutation
     }
@@ -1,26 +1,147 @@
-use core::LOCKFILE_FILEPATH;
 use directory::error::OpenWriteError;
 use Directory;
 use TantivyError;
 use std::path::{Path, PathBuf};
 use std::thread;
 use std::time::Duration;
 use std::io::Write;

-/// The directory lock is a mechanism used to
-/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)
-///
-/// Only one lock can exist at a time for a given directory.
-/// The lock is release automatically on `Drop`.
-pub struct DirectoryLock {
-    directory: Box<Directory>,

 #[derive(Debug, Clone, Copy)]
 pub enum LockType {
     /// Only one process should be able to write tantivy's index at a time.
     /// This lock file, when present, is in charge of preventing other processes to open an IndexWriter.
     ///
     /// If the process is killed and this file remains, it is safe to remove it manually.
     ///
     /// Failing to acquire this lock usually means a misuse of tantivy's API,
     /// (creating more than one instance of the `IndexWriter`), are a spurious
     /// lock file remaining after a crash. In the latter case, removing the file after
     /// checking no process running tantivy is running is safe.
     IndexWriterLock,
     /// The meta lock file is here to protect the segment files being opened by
     /// `.load_searchers()` from being garbage collected.
     /// It makes it possible for another process to safely consume
     /// our index in-writing. Ideally, we may have prefered `RWLock` semantics
     /// here, but it is difficult to achieve on Windows.
     ///
     /// Opening segment readers is a very fast process.
     /// Right now if the lock cannot be acquire on the first attempt, the logic
     /// is very simplistic. We retry after `100ms` until we effectively
     /// acquire the lock.
     /// This lock should not have much contention in normal usage.
     MetaLock
 }

-impl DirectoryLock {
-    pub fn lock(mut directory: Box<Directory>) -> Result<DirectoryLock, OpenWriteError> {
-        directory.open_write(&*LOCKFILE_FILEPATH)?;
-        Ok(DirectoryLock { directory })

 /// Retry the logic of acquiring locks is pretty simple.
 /// We just retry `n` times after a given `duratio`, both
 /// depending on the type of lock.
 struct RetryPolicy {
     num_retries: usize,
     wait_in_ms: u64,
 }

 impl RetryPolicy {
     fn no_retry() -> RetryPolicy {
         RetryPolicy {
             num_retries: 0,
             wait_in_ms: 0,
         }
     }

     fn wait_and_retry(&mut self,) -> bool {
         if self.num_retries == 0 {
             false
         } else {
             self.num_retries -= 1;
             let wait_duration = Duration::from_millis(self.wait_in_ms);
             thread::sleep(wait_duration);
             true
         }
     }
 }

 impl LockType {

     fn retry_policy(&self) -> RetryPolicy {
         match *self {
             LockType::IndexWriterLock =>
                 RetryPolicy::no_retry(),
             LockType::MetaLock =>
                 RetryPolicy {
                     num_retries: 100,
                     wait_in_ms: 100,
                 }
         }
     }

     fn try_acquire_lock(&self, directory: &mut Directory) -> Result<DirectoryLock, TantivyError> {
         let path = self.filename();
         let mut write = directory
             .open_write(path)
             .map_err(|e|
                 match e {
                     OpenWriteError::FileAlreadyExists(_) =>
                         TantivyError::LockFailure(*self),
                     OpenWriteError::IOError(io_error) =>
                         TantivyError::IOError(io_error),
                 })?;
         write.flush()?;
         Ok(DirectoryLock {
             directory: directory.box_clone(),
             path: path.to_owned(),
         })
     }

     /// Acquire a lock in the given directory.
     pub fn acquire_lock(&self, directory: &Directory) -> Result<DirectoryLock, TantivyError> {
         let mut box_directory = directory.box_clone();
         let mut retry_policy = self.retry_policy();
         loop {
             let lock_result = self.try_acquire_lock(&mut *box_directory);
             match lock_result {
                 Ok(result) => {
                     return Ok(result);
                 }
                 Err(TantivyError::LockFailure(ref filepath)) => {
                     if !retry_policy.wait_and_retry() {
                         return Err(TantivyError::LockFailure(filepath.to_owned()));
                     }
                 }
                 Err(_) => {
                 }
             }
         }
     }

     fn filename(&self) -> &Path {
         match *self {
             LockType::MetaLock => {
                 Path::new(".tantivy-meta.lock")
             }
             LockType::IndexWriterLock => {
                 Path::new(".tantivy-indexer.lock")
             }
         }
     }
 }

 /// The `DirectoryLock` is an object that represents a file lock.
 /// See [`LockType`](struct.LockType.html)
 ///
 /// It is transparently associated to a lock file, that gets deleted
 /// on `Drop.` The lock is release automatically on `Drop`.
 pub struct DirectoryLock {
     directory: Box<Directory>,
     path: PathBuf,
 }

 impl Drop for DirectoryLock {
     fn drop(&mut self) {
-        if let Err(e) = self.directory.delete(&*LOCKFILE_FILEPATH) {
+        if let Err(e) = self.directory.delete(&*self.path) {
             error!("Failed to remove the lock file. {:?}", e);
         }
     }
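For reference, the two lock types are used elsewhere in this commit as follows; both lines are taken from the `impl Index` hunks above, and the comments summarize the retry policies defined here:

    // Writer lock: fail fast when another IndexWriter already holds it.
    let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;

    // Meta lock: retried every 100ms while searchers are being loaded, so a
    // concurrent garbage collection cannot delete the segment files being opened.
    let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?;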
@@ -301,25 +301,31 @@ fn index_documents(

     let last_docstamp: u64 = *(doc_opstamps.last().unwrap());

-    let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
-    let segment_reader = SegmentReader::open(segment)?;
-    let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
-    let may_have_deletes = compute_deleted_bitset(
-        &mut deleted_bitset,
-        &segment_reader,
-        &mut delete_cursor,
-        &doc_to_opstamps,
-        last_docstamp,
-    )?;
-
-    let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
-        if may_have_deletes {
-            Some(deleted_bitset)
-        } else {
-            None
-        }
-    });
+    let segment_entry: SegmentEntry;
+
+    if delete_cursor.get().is_some() {
+        let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
+        let segment_reader = SegmentReader::open(segment)?;
+        let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
+        let may_have_deletes = compute_deleted_bitset(
+            &mut deleted_bitset,
+            &segment_reader,
+            &mut delete_cursor,
+            &doc_to_opstamps,
+            last_docstamp,
+        )?;
+        segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
+            if may_have_deletes {
+                Some(deleted_bitset)
+            } else {
+                None
+            }
+        });
+    } else {
+        // if there are no delete operation in the queue, no need
+        // to even open the segment.
+        segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
+    }
     Ok(segment_updater.add_segment(generation, segment_entry))
 }

@@ -657,11 +663,26 @@ mod tests {
         let index = Index::create_in_ram(schema_builder.build());
         let _index_writer = index.writer(40_000_000).unwrap();
         match index.writer(40_000_000) {
             Err(TantivyError::FileAlreadyExists(_)) => {}
+            Err(TantivyError::LockFailure(_)) => {}
             _ => panic!("Expected FileAlreadyExists error"),
         }
     }

+    #[test]
+    fn test_lockfile_already_exists_error_msg() {
+        let schema_builder = schema::SchemaBuilder::default();
+        let index = Index::create_in_ram(schema_builder.build());
+        let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        match index.writer_with_num_threads(1, 3_000_000) {
+            Err(err) => {
+                let err_msg = err.to_string();
+                assert!(err_msg.contains("Lockfile"));
+                assert!(err_msg.contains("Possible causes:"))
+            },
+            _ => panic!("Expected LockfileAlreadyExists error"),
+        }
+    }
+
     #[test]
     fn test_set_merge_policy() {
         let schema_builder = schema::SchemaBuilder::default();

@@ -843,4 +864,33 @@ mod tests {
         assert_eq!(initial_table_size(1_000_000_000), 19);
     }

+    #[cfg(not(feature="no_fail"))]
+    #[test]
+    fn test_write_commit_fails() {
+        use fail;
+        let mut schema_builder = schema::SchemaBuilder::default();
+        let text_field = schema_builder.add_text_field("text", schema::TEXT);
+        let index = Index::create_in_ram(schema_builder.build());
+
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        for _ in 0..100 {
+            index_writer.add_document(doc!(text_field => "a"));
+        }
+        index_writer.commit().unwrap();
+        fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
+        for _ in 0..100 {
+            index_writer.add_document(doc!(text_field => "b"));
+        }
+        assert!(index_writer.commit().is_err());
+        index.load_searchers().unwrap();
+        let num_docs_containing = |s: &str| {
+            let searcher = index.searcher();
+            let term_a = Term::from_field_text(text_field, s);
+            searcher.doc_freq(&term_a)
+        };
+        assert_eq!(num_docs_containing("a"), 100);
+        assert_eq!(num_docs_containing("b"), 0);
+        fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
+    }
 }
@@ -16,6 +16,8 @@ mod segment_writer;
 mod stamper;

+pub(crate) use self::directory_lock::DirectoryLock;
+pub use self::directory_lock::LockType;

 pub use self::index_writer::IndexWriter;
 pub use self::log_merge_policy::LogMergePolicy;
 pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
@@ -1,7 +1,7 @@
 use super::segment_register::SegmentRegister;
 use core::SegmentId;
 use core::SegmentMeta;
-use core::{LOCKFILE_FILEPATH, META_FILEPATH};
+use core::META_FILEPATH;
 use error::TantivyError;
 use indexer::delete_queue::DeleteCursor;
 use indexer::SegmentEntry;

@@ -78,10 +78,13 @@ impl SegmentManager {
         registers_lock.committed.len() + registers_lock.uncommitted.len()
     }

+    /// List the files that are useful to the index.
+    ///
+    /// This does not include lock files, or files that are obsolete
+    /// but have not yet been deleted by the garbage collector.
     pub fn list_files(&self) -> HashSet<PathBuf> {
         let mut files = HashSet::new();
         files.insert(META_FILEPATH.clone());
-        files.insert(LOCKFILE_FILEPATH.clone());
         for segment_meta in SegmentMeta::all() {
             files.extend(segment_meta.list_files());
         }
@@ -174,6 +174,9 @@ extern crate tinysegmenter;
 #[macro_use]
 extern crate downcast;

+#[macro_use]
+extern crate fail;
+
 #[cfg(test)]
 mod functional_test;

@@ -949,3 +952,4 @@ mod tests {
         }
     }
 }
@@ -98,7 +98,7 @@ impl MultiFieldPostingsWriter {
         .iter()
         .map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
         .collect();
-    term_offsets.sort_by_key(|&(k, _, _)| k);
+    term_offsets.sort_unstable_by_key(|&(k, _, _)| k);

     let mut offsets: Vec<(Field, usize)> = vec![];
     let term_offsets_it = term_offsets
@@ -20,6 +20,7 @@ use std::str::FromStr;
 use tokenizer::TokenizerManager;
 use combine::Parser;
+use query::EmptyQuery;
 use query::query_parser::logical_ast::LogicalAST;

 /// Possible error that may happen when parsing a query.

@@ -58,6 +59,27 @@ impl From<ParseIntError> for QueryParserError {
     }
 }

+/// Recursively remove empty clause from the AST
+///
+/// Returns `None` iff the `logical_ast` ended up being empty.
+fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
+    match logical_ast {
+        LogicalAST::Clause(children) => {
+            let trimmed_children = children.into_iter()
+                .flat_map(|(occur, child)|
+                    trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) )
+                .collect::<Vec<_>>();
+            if trimmed_children.is_empty() {
+                None
+            } else {
+                Some(LogicalAST::Clause(trimmed_children))
+            }
+        },
+        _ => Some(logical_ast),
+    }
+}
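Together with the EmptyQuery fallback in convert_to_query (changed further down), this is what lets a query whose terms were all stripped by the analyzer parse successfully instead of erroring. A quick illustration, reusing the `with_stop_words` field and `make_query_parser` helper set up in the tests below:

    // The "the" token is removed by the stop word filter, the clause trims
    // to nothing, and the parser hands back an EmptyQuery instead of an error.
    let query_parser = make_query_parser();
    assert!(query_parser.parse_query("with_stop_words:the").is_ok());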
 /// Tantivy's Query parser
 ///
 /// The language covered by the current parser is extremely simple.

@@ -369,14 +391,15 @@ impl QueryParser {
                 asts.push(LogicalAST::Leaf(Box::new(ast)));
             }
         }
-        let result_ast = if asts.is_empty() {
-            // this should never happen
-            return Err(QueryParserError::SyntaxError);
-        } else if asts.len() == 1 {
-            asts[0].clone()
-        } else {
-            LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
-        };
+        let result_ast: LogicalAST =
+            if asts.len() == 1 {
+                asts.into_iter().next().unwrap()
+            } else {
+                LogicalAST::Clause(
+                    asts.into_iter()
+                        .map(|ast| (Occur::Should, ast))
+                        .collect())
+            };
         Ok(result_ast)
     }
     UserInputLeaf::All => {

@@ -429,19 +452,17 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
 }

 fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
-    match logical_ast {
-        LogicalAST::Clause(clause) => {
-            if clause.is_empty() {
-                Box::new(EmptyQuery)
-            } else {
-                let occur_subqueries = clause
-                    .into_iter()
-                    .map(|(occur, subquery)| (occur, convert_to_query(subquery)))
-                    .collect::<Vec<_>>();
-                Box::new(BooleanQuery::from(occur_subqueries))
-            }
-        }
-        LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal),
+    match trim_ast(logical_ast) {
+        Some(LogicalAST::Clause(trimmed_clause)) => {
+            let occur_subqueries = trimmed_clause
+                .into_iter()
+                .map(|(occur, subquery)| (occur, convert_to_query(subquery)))
+                .collect::<Vec<_>>();
+            assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming");
+            Box::new(BooleanQuery::from(occur_subqueries))
+        },
+        Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal),
+        None => Box::new(EmptyQuery)
     }
 }

@@ -454,12 +475,17 @@ mod test {
     use schema::Field;
     use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
     use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT};
-    use tokenizer::SimpleTokenizer;
-    use tokenizer::TokenizerManager;
+    use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager};
     use Index;

     fn make_query_parser() -> QueryParser {
         let mut schema_builder = SchemaBuilder::default();
+        let text_field_indexing = TextFieldIndexing::default()
+            .set_tokenizer("en_with_stop_words")
+            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+        let text_options = TextOptions::default()
+            .set_indexing_options(text_field_indexing)
+            .set_stored();
         let title = schema_builder.add_text_field("title", TEXT);
         let text = schema_builder.add_text_field("text", TEXT);
         schema_builder.add_i64_field("signed", INT_INDEXED);

@@ -468,9 +494,14 @@ mod test {
         schema_builder.add_text_field("notindexed_u64", STORED);
         schema_builder.add_text_field("notindexed_i64", STORED);
         schema_builder.add_text_field("nottokenized", STRING);
+        schema_builder.add_text_field("with_stop_words", text_options);
         let schema = schema_builder.build();
         let default_fields = vec![title, text];
         let tokenizer_manager = TokenizerManager::default();
+        tokenizer_manager.register("en_with_stop_words", SimpleTokenizer
+            .filter(LowerCaser)
+            .filter(StopWordFilter::remove(vec!["the".to_string()]))
+        );
         QueryParser::new(schema, default_fields, tokenizer_manager)
     }

@@ -739,6 +770,13 @@ mod test {
         );
     }

+    #[test]
+    pub fn test_query_parser_not_empty_but_no_tokens() {
+        let query_parser = make_query_parser();
+        assert!(query_parser.parse_query(" !, ").is_ok());
+        assert!(query_parser.parse_query("with_stop_words:the").is_ok());
+    }
+
     #[test]
     pub fn test_parse_query_to_ast_conjunction() {
         test_parse_query_to_logical_ast_helper(
@@ -15,6 +15,7 @@ use query::MatchingTerms;
 use schema::Field;
 use std::collections::HashMap;
 use SegmentLocalId;
+use error::TantivyError;

 #[derive(Debug)]
 pub struct HighlightSection {

@@ -79,6 +80,14 @@ const HIGHLIGHTEN_PREFIX: &str = "<b>";
 const HIGHLIGHTEN_POSTFIX: &str = "</b>";

 impl Snippet {

+    pub fn empty() -> Snippet {
+        Snippet {
+            fragments: String::new(),
+            highlighted: Vec::new()
+        }
+    }
+
     /// Returns a hignlightned html from the `Snippet`.
     pub fn to_html(&self) -> String {
         let mut html = String::new();

@@ -210,42 +219,55 @@ fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[D

 pub fn generate_snippet(
     searcher: &Searcher,
-    field: Field,
     query: &Query,
+    field: Field,
     doc_addresses: &[DocAddress],
     max_num_chars: usize) -> Result<Vec<Snippet>> {

+    let mut doc_address_ords: Vec<usize> = (0..doc_addresses.len()).collect();
+    doc_address_ords.sort_by_key(|k| doc_addresses[*k]);

-    // TODO sort doc_addresses
     let mut snippets = vec![];
     let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?;
-    for doc_address in doc_addresses {

+    for &doc_address_ord in &doc_address_ords {
+        let doc_address = doc_addresses[doc_address_ord];
         let segment_ord: u32 = doc_address.segment_ord();
-        let doc = searcher.doc(doc_address)?;
+        let doc = searcher.doc(&doc_address)?;

         let mut text = String::new();
         for value in doc.get_all(field) {
             text.push_str(value.text());
         }

         if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&segment_ord) {
-            if let Some(tokenizer) = searcher.index().tokenizer_for_field(field) {
-                if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) {
-                    let terms: BTreeMap<String, f32> = terms
-                        .iter()
-                        .map(|(term, score)| (term.text().to_string(), *score))
-                        .collect();
-                    let fragment_candidates = search_fragments(tokenizer,
-                                                               &text,
-                                                               terms,
-                                                               max_num_chars);
-                }
+            let tokenizer = searcher.index().tokenizer_for_field(field)?;
+            if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) {
+                let terms: BTreeMap<String, f32> = terms
+                    .iter()
+                    .map(|(term, score)| (term.text().to_string(), *score))
+                    .collect();
+                let fragment_candidates = search_fragments(tokenizer,
+                                                           &text,
+                                                           terms,
+                                                           max_num_chars);
+                let snippet = select_best_fragment_combination(fragment_candidates, &text);
+                snippets.push(snippet);
+            } else {
+                snippets.push(Snippet::empty());
             }
         } else {

         }
     }
-    // search_fragments(boxed_tokenizer, &text, terms, 3);
-    panic!("e");

+    // reorder the snippets
+    for i in 0..doc_addresses.len() {
+        snippets.swap(i, doc_address_ords[i]);
+    }

     Ok(snippets)
 }

 #[cfg(test)]
@@ -1,4 +1,5 @@
 use super::{Token, TokenFilter, TokenStream};
+use std::mem;

 /// Token filter that lowercase terms.
 #[derive(Clone)]

@@ -15,13 +16,22 @@ where
     }
 }

-pub struct LowerCaserTokenStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
+pub struct LowerCaserTokenStream<TailTokenStream> {
+    buffer: String,
     tail: TailTokenStream,
 }

+// writes a lowercased version of text into output.
+fn to_lowercase_unicode(text: &mut String, output: &mut String) {
+    output.clear();
+    for c in text.chars() {
+        // Contrary to the std, we do not take care of sigma special case.
+        // This will have an normalizationo effect, which is ok for search.
+        output.extend(c.to_lowercase());
+    }
+}
+
 impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
 where
     TailTokenStream: TokenStream,

@@ -36,7 +46,14 @@ where

     fn advance(&mut self) -> bool {
         if self.tail.advance() {
-            self.tail.token_mut().text.make_ascii_lowercase();
+            if self.token_mut().text.is_ascii() {
+                // fast track for ascii.
+                self.token_mut().text.make_ascii_lowercase();
+            } else {
+                to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
+                mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+            }
             true
         } else {
             false

@@ -49,6 +66,43 @@ where
     TailTokenStream: TokenStream,
 {
     fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
-        LowerCaserTokenStream { tail }
+        LowerCaserTokenStream {
+            tail,
+            buffer: String::with_capacity(100)
+        }
     }
 }

+#[cfg(test)]
+mod tests {
+    use tokenizer::Tokenizer;
+    use tokenizer::LowerCaser;
+    use tokenizer::TokenStream;
+    use tokenizer::SimpleTokenizer;
+
+    #[test]
+    fn test_to_lower_case() {
+        assert_eq!(lowercase_helper("Русский текст"),
+                   vec!["русский".to_string(), "текст".to_string()]);
+    }
+
+    fn lowercase_helper(text: &str) -> Vec<String> {
+        let mut tokens = vec![];
+        let mut token_stream = SimpleTokenizer
+            .filter(LowerCaser)
+            .token_stream(text);
+        while token_stream.advance() {
+            let token_text = token_stream.token().text.clone();
+            tokens.push(token_text);
+        }
+        tokens
+    }
+
+    #[test]
+    fn test_lowercaser() {
+        assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
+        assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
+    }
+}