mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-02 16:40:43 +00:00
Merge branch 'master' of github.com:tantivy-search/tantivy
This commit is contained in:
11
Cargo.toml
11
Cargo.toml
@@ -48,6 +48,7 @@ census = "0.1"
|
||||
fnv = "1.0.6"
|
||||
owned-read = "0.4"
|
||||
failure = "0.1"
|
||||
fail = "0.2"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.2"
|
||||
@@ -60,12 +61,20 @@ opt-level = 3
|
||||
debug = false
|
||||
lto = true
|
||||
debug-assertions = false
|
||||
overflow-checks = false
|
||||
|
||||
[profile.test]
|
||||
debug-assertions = true
|
||||
overflow-checks = true
|
||||
|
||||
[features]
|
||||
default = ["mmap"]
|
||||
# By default the `no_fail` feature is enabled, which turns fail points off. Tests that rely on fail points are run with `--no-default-features`.
|
||||
default = ["mmap", "no_fail"]
|
||||
mmap = ["fst/mmap", "atomicwrites"]
|
||||
lz4-compression = ["lz4"]
|
||||
no_fail = ["fail/no_fail"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
|
||||
|
||||
|
||||
@@ -78,6 +78,10 @@ To check out and run tests, you can simply run :
|
||||
cd tantivy
|
||||
cargo build
|
||||
|
||||
## Running tests
|
||||
|
||||
Some tests will not run with just `cargo test` because of `fail-rs`.
|
||||
To run the tests exhaustively, run `./run-tests.sh`.
|
||||
|
||||
# Contribute
|
||||
|
||||
|
||||
@@ -18,5 +18,5 @@ install:
|
||||
build: false
|
||||
|
||||
test_script:
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
|
||||
- REM SET RUST_BACKTRACE=1 & cargo build --examples
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1
|
||||
- REM SET RUST_BACKTRACE=1 & cargo build --examples
|
||||
|
||||
@@ -16,7 +16,7 @@ main() {
|
||||
return
|
||||
fi
|
||||
echo "Test"
|
||||
cross test --target $TARGET
|
||||
cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1
|
||||
fi
|
||||
for example in $(ls examples/*.rs)
|
||||
do
|
||||
|
||||
@@ -23,7 +23,6 @@ use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// this example assumes you understand the content in `basic_search`
|
||||
let index_path = TempDir::new("tantivy_stopwords_example_dir")?;
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
// This configures your custom options for how tantivy will
|
||||
@@ -31,36 +30,36 @@ fn main() -> tantivy::Result<()> {
|
||||
// to note is that we are setting the tokenizer to `stoppy`
|
||||
// which will be defined and registered below.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
|
||||
// Our first field is title.
|
||||
schema_builder.add_text_field("title", text_options);
|
||||
|
||||
// Our second field is body.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("body", text_options);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_dir(&index_path, schema.clone())?;
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
// This tokenizer lowers all of the text (to help with stop word matching)
|
||||
// then removes all instances of `the` and `and` from the corpus
|
||||
let tokenizer = SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec![
|
||||
"the".to_string(),
|
||||
"and".to_string(),
|
||||
]));
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec![
|
||||
"the".to_string(),
|
||||
"and".to_string(),
|
||||
]));
|
||||
|
||||
index.tokenizers().register("stoppy", tokenizer);
|
||||
|
||||
@@ -76,16 +75,16 @@ fn main() -> tantivy::Result<()> {
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
@@ -103,14 +102,9 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
|
||||
// this will have NO hits because it was filtered out
|
||||
// because the query is run through the analyzer you
|
||||
// actually will get an error here because the query becomes
|
||||
// empty
|
||||
assert!(query_parser.parse_query("the").is_err());
|
||||
|
||||
// this will have hits
|
||||
let query = query_parser.parse_query("is")?;
|
||||
// stop words are applied on the query as well.
|
||||
// The following will be equivalent to `title:frankenstein`
|
||||
let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
|
||||
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
|
||||
@@ -124,6 +118,4 @@ fn main() -> tantivy::Result<()> {
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
use tempdir::TempDir;
|
||||
}
|
||||
2
run-tests.sh
Executable file
2
run-tests.sh
Executable file
@@ -0,0 +1,2 @@
|
||||
#!/bin/bash
|
||||
cargo test --no-default-features --features mmap -- --test-threads 1
|
||||
@@ -266,14 +266,14 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_bitset_large() {
|
||||
let arr = generate_nonunique_unsorted(1_000_000, 50_000);
|
||||
let arr = generate_nonunique_unsorted(100_000, 5_000);
|
||||
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
|
||||
let mut bitset = BitSet::with_max_value(1_000_000);
|
||||
let mut bitset = BitSet::with_max_value(100_000);
|
||||
for el in arr {
|
||||
btreeset.insert(el);
|
||||
bitset.insert(el);
|
||||
}
|
||||
for i in 0..1_000_000 {
|
||||
for i in 0..100_000 {
|
||||
assert_eq!(btreeset.contains(&i), bitset.contains(i));
|
||||
}
|
||||
assert_eq!(btreeset.len(), bitset.len());
|
||||
|
||||
@@ -4,7 +4,6 @@ use core::InvertedIndexReader;
|
||||
use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use error::TantivyError;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fastfield::FacetReader;
|
||||
@@ -44,7 +43,8 @@ pub struct SegmentReader {
|
||||
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
|
||||
|
||||
segment_id: SegmentId,
|
||||
segment_meta: SegmentMeta,
|
||||
max_doc: DocId,
|
||||
num_docs: DocId,
|
||||
|
||||
termdict_composite: CompositeFile,
|
||||
postings_composite: CompositeFile,
|
||||
@@ -64,7 +64,7 @@ impl SegmentReader {
|
||||
/// Today, `tantivy` does not handle deletes, so it happens
|
||||
/// to also be the number of documents in the index.
|
||||
pub fn max_doc(&self) -> DocId {
|
||||
self.segment_meta.max_doc()
|
||||
self.max_doc
|
||||
}
|
||||
|
||||
/// Returns the number of documents.
|
||||
@@ -73,7 +73,7 @@ impl SegmentReader {
|
||||
/// Today, `tantivy` does not handle deletes so max doc and
|
||||
/// num_docs are the same.
|
||||
pub fn num_docs(&self) -> DocId {
|
||||
self.segment_meta.num_docs()
|
||||
self.num_docs
|
||||
}
|
||||
|
||||
/// Returns the schema of the index this segment belongs to.
|
||||
@@ -225,6 +225,8 @@ impl SegmentReader {
|
||||
let store_source = segment.open_read(SegmentComponent::STORE)?;
|
||||
let store_reader = StoreReader::from_source(store_source);
|
||||
|
||||
fail_point!("SegmentReader::open#middle");
|
||||
|
||||
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
|
||||
let postings_composite = CompositeFile::open(&postings_source)?;
|
||||
|
||||
@@ -260,7 +262,8 @@ impl SegmentReader {
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
segment_meta: segment.meta().clone(),
|
||||
max_doc: segment.meta().max_doc(),
|
||||
num_docs: segment.meta().num_docs(),
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_composite,
|
||||
@@ -432,6 +435,7 @@ mod test {
|
||||
use schema::{SchemaBuilder, Term, STORED, TEXT};
|
||||
use DocId;
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_alive_docs_iterator() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
|
||||
@@ -195,6 +195,9 @@ impl Directory for RAMDirectory {
|
||||
}
|
||||
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
fail_point!("RAMDirectory::atomic_write", |msg| {
|
||||
Err(io::Error::new(io::ErrorKind::Other, msg.unwrap_or("Undefined".to_string())))
|
||||
});
|
||||
let path_buf = PathBuf::from(path);
|
||||
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
self.fs.write(path_buf, &Vec::new())?;
|
||||
|
||||
@@ -370,7 +370,7 @@ mod tests {
|
||||
pub fn generate_permutation() -> Vec<u64> {
|
||||
let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
|
||||
let mut rng = XorShiftRng::from_seed(seed);
|
||||
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
|
||||
let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
|
||||
rng.shuffle(&mut permutation);
|
||||
permutation
|
||||
}
|
||||
|
||||
@@ -301,25 +301,31 @@ fn index_documents(
|
||||
|
||||
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
|
||||
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
|
||||
if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
let segment_entry: SegmentEntry;
|
||||
|
||||
if delete_cursor.get().is_some() {
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
|
||||
if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
} else {
|
||||
// if there are no delete operations in the queue, there is no need
|
||||
// to even open the segment.
|
||||
segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
|
||||
}
|
||||
Ok(segment_updater.add_segment(generation, segment_entry))
|
||||
}
|
||||
|
||||
@@ -858,4 +864,33 @@ mod tests {
|
||||
assert_eq!(initial_table_size(1_000_000_000), 19);
|
||||
}
|
||||
|
||||
|
||||
#[cfg(not(feature="no_fail"))]
|
||||
#[test]
|
||||
fn test_write_commit_fails() {
|
||||
use fail;
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
for _ in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
|
||||
for _ in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "b"));
|
||||
}
|
||||
assert!(index_writer.commit().is_err());
|
||||
index.load_searchers().unwrap();
|
||||
let num_docs_containing = |s: &str| {
|
||||
let searcher = index.searcher();
|
||||
let term_a = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term_a)
|
||||
};
|
||||
assert_eq!(num_docs_containing("a"), 100);
|
||||
assert_eq!(num_docs_containing("b"), 0);
|
||||
fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -173,6 +173,9 @@ extern crate tinysegmenter;
|
||||
#[macro_use]
|
||||
extern crate downcast;
|
||||
|
||||
#[macro_use]
|
||||
extern crate fail;
|
||||
|
||||
#[cfg(test)]
|
||||
mod functional_test;
|
||||
|
||||
@@ -946,3 +949,4 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ use std::str::FromStr;
|
||||
use tokenizer::TokenizerManager;
|
||||
use combine::Parser;
|
||||
use query::EmptyQuery;
|
||||
use query::query_parser::logical_ast::LogicalAST;
|
||||
|
||||
|
||||
/// Possible error that may happen when parsing a query.
|
||||
@@ -58,6 +59,27 @@ impl From<ParseIntError> for QueryParserError {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Recursively removes empty clauses from the AST.
|
||||
///
|
||||
/// Returns `None` iff the `logical_ast` ended up being empty.
|
||||
fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
|
||||
match logical_ast {
|
||||
LogicalAST::Clause(children) => {
|
||||
let trimmed_children = children.into_iter()
|
||||
.flat_map(|(occur, child)|
|
||||
trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) )
|
||||
.collect::<Vec<_>>();
|
||||
if trimmed_children.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(LogicalAST::Clause(trimmed_children))
|
||||
}
|
||||
},
|
||||
_ => Some(logical_ast),
|
||||
}
|
||||
}
|
||||
|
||||
/// Tantivy's Query parser
|
||||
///
|
||||
/// The language covered by the current parser is extremely simple.
|
||||
@@ -369,14 +391,15 @@ impl QueryParser {
|
||||
asts.push(LogicalAST::Leaf(Box::new(ast)));
|
||||
}
|
||||
}
|
||||
let result_ast = if asts.is_empty() {
|
||||
// this should never happen
|
||||
return Err(QueryParserError::SyntaxError);
|
||||
} else if asts.len() == 1 {
|
||||
asts[0].clone()
|
||||
} else {
|
||||
LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
|
||||
};
|
||||
let result_ast: LogicalAST =
|
||||
if asts.len() == 1 {
|
||||
asts.into_iter().next().unwrap()
|
||||
} else {
|
||||
LogicalAST::Clause(
|
||||
asts.into_iter()
|
||||
.map(|ast| (Occur::Should, ast))
|
||||
.collect())
|
||||
};
|
||||
Ok(result_ast)
|
||||
}
|
||||
UserInputLeaf::All => {
|
||||
@@ -429,19 +452,17 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
|
||||
}
|
||||
|
||||
fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
|
||||
match logical_ast {
|
||||
LogicalAST::Clause(clause) => {
|
||||
if clause.is_empty() {
|
||||
Box::new(EmptyQuery)
|
||||
} else {
|
||||
let occur_subqueries = clause
|
||||
.into_iter()
|
||||
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
|
||||
.collect::<Vec<_>>();
|
||||
Box::new(BooleanQuery::from(occur_subqueries))
|
||||
}
|
||||
}
|
||||
LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal),
|
||||
match trim_ast(logical_ast) {
|
||||
Some(LogicalAST::Clause(trimmed_clause)) => {
|
||||
let occur_subqueries = trimmed_clause
|
||||
.into_iter()
|
||||
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
|
||||
.collect::<Vec<_>>();
|
||||
assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming");
|
||||
Box::new(BooleanQuery::from(occur_subqueries))
|
||||
},
|
||||
Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal),
|
||||
None => Box::new(EmptyQuery)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -454,12 +475,17 @@ mod test {
|
||||
use schema::Field;
|
||||
use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
|
||||
use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT};
|
||||
use tokenizer::SimpleTokenizer;
|
||||
use tokenizer::TokenizerManager;
|
||||
use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager};
|
||||
use Index;
|
||||
|
||||
fn make_query_parser() -> QueryParser {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("en_with_stop_words")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
let title = schema_builder.add_text_field("title", TEXT);
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
schema_builder.add_i64_field("signed", INT_INDEXED);
|
||||
@@ -468,9 +494,14 @@ mod test {
|
||||
schema_builder.add_text_field("notindexed_u64", STORED);
|
||||
schema_builder.add_text_field("notindexed_i64", STORED);
|
||||
schema_builder.add_text_field("nottokenized", STRING);
|
||||
schema_builder.add_text_field("with_stop_words", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let default_fields = vec![title, text];
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
tokenizer_manager.register("en_with_stop_words", SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec!["the".to_string()]))
|
||||
);
|
||||
QueryParser::new(schema, default_fields, tokenizer_manager)
|
||||
}
|
||||
|
||||
@@ -739,6 +770,13 @@ mod test {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_query_parser_not_empty_but_no_tokens() {
|
||||
let query_parser = make_query_parser();
|
||||
assert!(query_parser.parse_query(" !, ").is_ok());
|
||||
assert!(query_parser.parse_query("with_stop_words:the").is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_conjunction() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
use std::mem;
|
||||
|
||||
/// Token filter that lowercase terms.
|
||||
#[derive(Clone)]
|
||||
@@ -15,13 +16,22 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LowerCaserTokenStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
pub struct LowerCaserTokenStream<TailTokenStream> {
|
||||
buffer: String,
|
||||
tail: TailTokenStream,
|
||||
}
|
||||
|
||||
// writes a lowercased version of text into output.
|
||||
fn to_lowercase_unicode(text: &mut String, output: &mut String) {
|
||||
output.clear();
|
||||
for c in text.chars() {
|
||||
// Contrary to std's `str::to_lowercase`, we do not handle the final-sigma special case.
|
||||
// This will have a normalization effect, which is ok for search.
|
||||
output.extend(c.to_lowercase());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
@@ -36,7 +46,14 @@ where
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.tail.advance() {
|
||||
self.tail.token_mut().text.make_ascii_lowercase();
|
||||
if self.token_mut().text.is_ascii() {
|
||||
// fast track for ascii.
|
||||
self.token_mut().text.make_ascii_lowercase();
|
||||
} else {
|
||||
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
|
||||
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
}
|
||||
true
|
||||
} else {
|
||||
false
|
||||
@@ -49,6 +66,43 @@ where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
|
||||
LowerCaserTokenStream { tail }
|
||||
LowerCaserTokenStream {
|
||||
tail,
|
||||
buffer: String::with_capacity(100)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tokenizer::Tokenizer;
|
||||
use tokenizer::LowerCaser;
|
||||
use tokenizer::TokenStream;
|
||||
use tokenizer::SimpleTokenizer;
|
||||
|
||||
#[test]
|
||||
fn test_to_lower_case() {
|
||||
assert_eq!(lowercase_helper("Русский текст"),
|
||||
vec!["русский".to_string(), "текст".to_string()]);
|
||||
}
|
||||
|
||||
fn lowercase_helper(text: &str) -> Vec<String> {
|
||||
let mut tokens = vec![];
|
||||
let mut token_stream = SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.token_stream(text);
|
||||
while token_stream.advance() {
|
||||
let token_text = token_stream.token().text.clone();
|
||||
tokens.push(token_text);
|
||||
}
|
||||
tokens
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_lowercaser() {
|
||||
assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]);
|
||||
assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]);
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user