Compare commits

4 Commits

Author         SHA1        Message                                                                       Date
Paul Masurel   488bceda10  Merge branch 'master' of github.com:tantivy-search/tantivy into dds/lenient  2018-10-30 09:59:48 +09:00
Dru Sellers    f2b8755e10  Update test name and method comments                                          2018-08-18 10:26:06 -05:00
Dru Sellers    fa269f1f34  Santize known poor queries                                                    2018-08-18 10:19:27 -05:00
Dru Sellers    e23a9303ce  Add parse_query_lenient to QueryParser                                        2018-08-18 09:30:45 -05:00
12 changed files with 250 additions and 600 deletions

View File

@@ -1,7 +1,3 @@
Tantivy 0.7.1
=====================
- Bugfix: NGramTokenizer panics on non ascii chars
- Added a space usage API
Tantivy 0.7
=====================

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.7.2"
version = "0.7.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -12,7 +12,7 @@ readme = "README.md"
keywords = ["search", "information", "retrieval"]
[dependencies]
base64 = "0.10.0"
base64 = "0.9.1"
byteorder = "1.0"
lazy_static = "1"
regex = "1.0"

View File

@@ -49,11 +49,6 @@ pub struct Index {
}
impl Index {
/// Examines the directory to see if it contains an index.
pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
dir.exists(&META_FILEPATH)
}
/// Creates a new index using the `RAMDirectory`.
///
/// The index will be allocated in anonymous memory.
@@ -70,28 +65,9 @@ impl Index {
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
if Index::exists(&mmap_directory) {
return Err(TantivyError::IndexAlreadyExists);
}
Index::create(mmap_directory, schema)
}
/// Opens or creates a new index in the provided directory
#[cfg(feature = "mmap")]
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
if Index::exists(&dir) {
let index = Index::open(dir)?;
if index.schema() == schema {
Ok(index)
} else {
Err(TantivyError::SchemaError("An index exists but the schema does not match.".to_string()))
}
} else {
Index::create(dir, schema)
}
}
/// Creates a new index in a temp directory.
///
/// The index will use the `MMapDirectory` in a newly created directory.
@@ -113,8 +89,6 @@ impl Index {
}
/// Create a new index from a directory.
///
/// This will overwrite existing meta.json
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema);
@@ -354,9 +328,8 @@ impl Clone for Index {
#[cfg(test)]
mod tests {
use schema::{Schema, SchemaBuilder, INT_INDEXED, TEXT};
use schema::{SchemaBuilder, INT_INDEXED, TEXT};
use Index;
use directory::RAMDirectory;
#[test]
fn test_indexer_for_field() {
@@ -372,52 +345,4 @@ mod tests {
);
}
#[test]
fn test_index_exists() {
let directory = RAMDirectory::create();
assert!(!Index::exists(&directory));
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
}
#[test]
fn open_or_create_should_create() {
let directory = RAMDirectory::create();
assert!(!Index::exists(&directory));
assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
}
#[test]
fn open_or_create_should_open() {
let directory = RAMDirectory::create();
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
}
#[test]
fn create_should_wipeoff_existing() {
let directory = RAMDirectory::create();
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
assert!(Index::create(directory.clone(), SchemaBuilder::default().build()).is_ok());
}
#[test]
fn open_or_create_exists_but_schema_does_not_match() {
let directory = RAMDirectory::create();
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
let err = Index::open_or_create(directory, SchemaBuilder::default().build());
assert_eq!(format!("{:?}", err.unwrap_err()), "SchemaError(\"An index exists but the schema does not match.\")");
}
fn throw_away_schema() -> Schema {
let mut schema_builder = SchemaBuilder::default();
let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED);
schema_builder.build()
}
}

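For orientation, here is a minimal sketch of the `exists` / `open_or_create` flow exercised by the tests above, using an on-disk `MmapDirectory` rather than the `RAMDirectory`; the path is hypothetical and the field mirrors `throw_away_schema`:

extern crate tantivy;
use tantivy::directory::MmapDirectory;
use tantivy::schema::{SchemaBuilder, INT_INDEXED};
use tantivy::Index;

fn main() {
    let mut schema_builder = SchemaBuilder::default();
    schema_builder.add_u64_field("num_likes", INT_INDEXED);
    let schema = schema_builder.build();

    // Hypothetical, already-existing directory: MmapDirectory::open errors on a missing path.
    let dir = MmapDirectory::open("/tmp/example_index").expect("directory should exist");

    // Opens the index if one is already there (failing with SchemaError on a
    // schema mismatch) and creates a fresh one otherwise.
    let _index = Index::open_or_create(dir, schema).expect("open or create index");
}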
View File

@@ -364,11 +364,6 @@ mod tests {
use super::*;
#[test]
fn test_open_non_existant_path() {
assert!(MmapDirectory::open(PathBuf::from("./nowhere")).is_err());
}
#[test]
fn test_open_empty() {
// empty file is actually an edge case because those

View File

@@ -20,9 +20,6 @@ pub enum TantivyError {
/// File already exists, this is a problem when we try to write into a new file.
#[fail(display = "file already exists: '{:?}'", _0)]
FileAlreadyExists(PathBuf),
/// Index already exists in this directory
#[fail(display = "index already exists")]
IndexAlreadyExists,
/// Failed to acquire file lock
#[fail(
display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",

View File

@@ -126,6 +126,7 @@ impl SegmentPostings {
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
let mut start = 0;
let end = arr.len();
debug_assert!(target >= arr[start]);
debug_assert!(target <= arr[end - 1]);
let mut jump = 1;
loop {
@@ -215,10 +216,11 @@ impl DocSet for SegmentPostings {
// we're in the right block now, start with an exponential search
let block_docs = self.block_cursor.docs();
debug_assert!(target >= self.doc());
let new_cur = self
.cur
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
if need_positions {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
.iter()
@@ -630,10 +632,8 @@ mod tests {
use schema::IndexRecordOption;
use schema::SchemaBuilder;
use schema::Term;
use super::exponential_search;
use schema::INT_INDEXED;
use DocId;
use SkipResult;
#[test]
fn test_empty_segment_postings() {
@@ -661,13 +661,6 @@ mod tests {
.0
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(0, &[1,2]), (0, 1));
assert_eq!(exponential_search(1, &[1,2]), (0, 1));
assert_eq!(exponential_search(7, &[1,2,3,4,5,6,7,8,9,10,11]), (3,7));
}
fn util_test_search_within_block(block: &[u32], target: u32) {
assert_eq!(
search_within_block(block, target),
@@ -699,7 +692,7 @@ mod tests {
#[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());
@@ -713,45 +706,14 @@ mod tests {
}
}
#[test]
fn test_skip_right_at_new_block() {
let mut doc_ids = (0..128).collect::<Vec<u32>>();
doc_ids.push(129);
doc_ids.push(130);
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(128), SkipResult::OverStep);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(129), SkipResult::Reached);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(131), SkipResult::End);
}
}
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
let mut schema_builder = SchemaBuilder::default();
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut last_doc = 0u32;
for &doc in docs {
for doc in docs {
for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64));
}
@@ -771,7 +733,7 @@ mod tests {
#[test]
fn test_block_segment_postings_skip() {
for i in 0..4 {
let mut block_postings = build_block_postings(&[3]);
let mut block_postings = build_block_postings(vec![3]);
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32)
@@ -781,7 +743,7 @@ mod tests {
BlockSegmentPostingsSkipResult::Terminated
);
}
let mut block_postings = build_block_postings(&[3]);
let mut block_postings = build_block_postings(vec![3]);
assert_eq!(
block_postings.skip_to(4u32),
BlockSegmentPostingsSkipResult::Terminated
@@ -794,7 +756,7 @@ mod tests {
for i in 0..1300 {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(&docs[..]);
let mut block_postings = build_block_postings(docs.clone());
for i in vec![0, 424, 10000] {
assert_eq!(
block_postings.skip_to(i),

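The body of `exponential_search` is truncated by the hunk above; the following standalone sketch is just one way to reproduce the expectations of the removed `test_exponentiel_search`, not the actual implementation. It returns a `(start, end)` window of `arr` that must contain `target` (if present), found by doubling the probe distance; the caller can then search within that narrowed window.

fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
    let mut start = 0;
    let end = arr.len();
    let mut jump = 1;
    loop {
        let probe = start + jump;
        // Ran past the end of the slice: the window is everything from `start` on.
        if probe >= end {
            return (start, end);
        }
        // Overshot the target: anything at or after `probe` is too large.
        if arr[probe] > target {
            return (start, probe);
        }
        start = probe;
        jump *= 2;
    }
}

fn main() {
    // Same expectations as the removed test.
    assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
    assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
    assert_eq!(exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), (3, 7));
}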
View File

@@ -177,9 +177,6 @@ impl QueryParser {
///
/// There is currently no lenient mode for the query parser,
/// which makes it a poor choice for a public-facing search engine.
///
/// Implementing a lenient mode for this query parser is tracked
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
pub fn parse_query(&self, query: &str) -> Result<Box<Query>, QueryParserError> {
let logical_ast = self.parse_query_to_logical_ast(query)?;
Ok(convert_to_query(logical_ast))
@@ -193,6 +190,61 @@ impl QueryParser {
self.compute_logical_ast(user_input_ast)
}
/// Parse a query
///
/// Note that `parse_query_lenient` will NOT return an error
/// if the input is not a valid query.
///
/// It will instead strip special characters from the query body and
/// retry parsing; if that still fails, it returns an `EmptyQuery`.
pub fn parse_query_lenient(&self, query: &str) -> Box<Query> {
if let Ok(logical_ast) = self.parse_query_to_logical_ast(query) {
return convert_to_query(logical_ast);
}
// try to clean up the query
if let Ok(logical_ast) = self.parse_lenient_query_to_logical_ast(query) {
return convert_to_query(logical_ast);
}
// we have no idea what you want, so here's nothing
Box::new(EmptyQuery)
}
/// Parse the user query into an AST.
fn parse_lenient_query_to_logical_ast(
&self,
query: &str,
) -> Result<LogicalAST, QueryParserError> {
// If we get here, we know the query input is poorly formed.
// Strip the special characters: \ + - & | ! ( ) { } [ ] ^ ~ * ? : /
let special_chars = "\\+-&|!(){}[]^~*?:/";
let mut scrubbed_query = query
.chars()
.filter(|c| !special_chars.contains(*c))
.collect::<String>();
// AND, OR and NOT are used by tantivy as logical operators. We need
// to escape them
let special_words = vec!["AND", "OR", "NOT"];
for word in special_words.iter() {
scrubbed_query = scrubbed_query.replace(word, &format!("{}", word));
}
// Escape the quotes if they are unbalanced (odd count)
let quote_count = scrubbed_query.chars().filter(|&c| c == '\"').count();
if quote_count % 2 == 1 {
scrubbed_query = scrubbed_query.replace("\"", "\\\"");
}
let (user_input_ast, _remaining) = parse_to_ast()
.parse(scrubbed_query.as_str())
.map_err(|_| QueryParserError::SyntaxError)?;
self.compute_logical_ast(user_input_ast)
}
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
self.schema
.get_field(field_name)
@@ -544,6 +596,26 @@ mod test {
assert!(query_parser.parse_query("toto").is_ok());
}
#[test]
pub fn test_parse_query_lenient_no_panics() {
let query_parser = make_query_parser();
query_parser.parse_query_lenient("toto");
query_parser.parse_query_lenient("");
query_parser.parse_query_lenient("+(happy");
}
#[test]
pub fn test_parse_query_lenient_escapes_bad_queries() {
let query_parser = make_query_parser();
let query = query_parser
.parse_lenient_query_to_logical_ast("+(happy")
.unwrap();
let query_str = format!("{:?}", query);
assert_eq!(query_str, "(Term([0, 0, 0, 0, 104, 97, 112, 112, 121]) Term([0, 0, 0, 1, 104, 97, 112, 112, 121]))");
}
#[test]
pub fn test_parse_nonindexed_field_yields_error() {
let query_parser = make_query_parser();

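For orientation, a minimal sketch of calling the new lenient entry point; it assumes `QueryParser::for_index` and a `TEXT` field as used elsewhere in tantivy, and the field name and query strings are purely illustrative:

extern crate tantivy;
use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() {
    let mut schema_builder = SchemaBuilder::default();
    let title = schema_builder.add_text_field("title", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let query_parser = QueryParser::for_index(&index, vec![title]);

    // A well-formed query parses exactly as with parse_query.
    let _ok = query_parser.parse_query_lenient("title:diary");

    // A malformed query such as "+(happy" no longer surfaces an error: the
    // special characters are stripped and parsing is retried, falling back
    // to an EmptyQuery if that retry also fails.
    let _lenient = query_parser.parse_query_lenient("+(happy");
}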
View File

@@ -80,9 +80,6 @@ impl UserInputBound {
pub enum UserInputAST {
Clause(Vec<UserInputAST>),
Unary(Occur, Box<UserInputAST>),
// Not(Box<UserInputAST>),
// Should(Box<UserInputAST>),
// Must(Box<UserInputAST>),
Leaf(Box<UserInputLeaf>),
}
@@ -92,7 +89,7 @@ impl UserInputAST {
}
fn compose(occur: Occur, asts: Vec<UserInputAST>) -> UserInputAST {
assert!(occur != Occur::MustNot);
assert_ne!(occur, Occur::MustNot);
assert!(!asts.is_empty());
if asts.len() == 1 {
asts.into_iter().next().unwrap() //< safe
@@ -114,42 +111,6 @@ impl UserInputAST {
}
}
/*
impl UserInputAST {
fn compose_occur(self, occur: Occur) -> UserInputAST {
match self {
UserInputAST::Not(other) => {
let new_occur = compose_occur(Occur::MustNot, occur);
other.simplify()
}
_ => {
self
}
}
}
pub fn simplify(self) -> UserInputAST {
match self {
UserInputAST::Clause(els) => {
if els.len() == 1 {
return els.into_iter().next().unwrap();
} else {
return self;
}
}
UserInputAST::Not(els) => {
if els.len() == 1 {
return els.into_iter().next().unwrap();
} else {
return self;
}
}
}
}
}
*/
impl From<UserInputLiteral> for UserInputLeaf {
fn from(literal: UserInputLiteral) -> UserInputLeaf {
UserInputLeaf::Literal(literal)

View File

@@ -14,7 +14,7 @@ use std::fmt;
/// - a field name
/// - a field type, itself wrapping up options describing
/// how the field should be indexed.
#[derive(Clone, Debug, Eq, PartialEq)]
#[derive(Clone, Debug)]
pub struct FieldEntry {
name: String,
field_type: FieldType,

View File

@@ -134,15 +134,6 @@ struct InnerSchema {
fields_map: HashMap<String, Field>, // transient
}
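// `fields_map` is marked transient above (it is derived from `fields`), which
// is why the manual PartialEq below compares `fields` alone.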
impl PartialEq for InnerSchema {
fn eq(&self, other: &InnerSchema) -> bool {
self.fields == other.fields
}
}
impl Eq for InnerSchema {}
/// Tantivy has a very strict schema.
/// You need to specify in advance, whether a field is indexed or not,
/// stored or not, and RAM-based or not.
@@ -163,7 +154,7 @@ impl Eq for InnerSchema {}
/// let schema = schema_builder.build();
///
/// ```
#[derive(Clone, Eq, PartialEq)]
#[derive(Clone)]
pub struct Schema(Arc<InnerSchema>);
impl Schema {

View File

@@ -157,34 +157,35 @@ pub use self::tokenizer::BoxedTokenizer;
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
pub use self::tokenizer_manager::TokenizerManager;
/// This is a function that can be used in tests and doc tests
/// to assert a token's correctness.
/// TODO: can this be wrapped in #[cfg(test)] so as not to be in the
/// public api?
pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
assert_eq!(
token.position, position,
"expected position {} but {:?}",
position, token
);
assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
assert_eq!(
token.offset_from, from,
"expected offset_from {} but {:?}",
from, token
);
assert_eq!(
token.offset_to, to,
"expected offset_to {} but {:?}",
to, token
);
}
#[cfg(test)]
pub mod tests {
pub mod test {
use super::assert_token;
use super::Token;
use super::TokenizerManager;
/// This is a function that can be used in tests and doc tests
/// to assert a token's correctness.
pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
assert_eq!(
token.position, position,
"expected position {} but {:?}",
position, token
);
assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
assert_eq!(
token.offset_from, from,
"expected offset_from {} but {:?}",
from, token
);
assert_eq!(
token.offset_to, to,
"expected offset_to {} but {:?}",
to, token
);
}
#[test]
fn test_raw_tokenizer() {
let tokenizer_manager = TokenizerManager::default();
@@ -223,6 +224,72 @@ pub mod tests {
assert_token(&tokens[3], 3, "payer", 17, 22);
}
#[test]
fn test_ngram_tokenizer() {
use super::{LowerCaser, NgramTokenizer};
use tokenizer::tokenizer::TokenStream;
use tokenizer::tokenizer::Tokenizer;
let tokenizer_manager = TokenizerManager::default();
tokenizer_manager.register("ngram12", NgramTokenizer::new(1, 2, false));
tokenizer_manager.register(
"ngram3",
NgramTokenizer::new(3, 3, false).filter(LowerCaser),
);
tokenizer_manager.register(
"edgegram5",
NgramTokenizer::new(2, 5, true).filter(LowerCaser),
);
let tokenizer = NgramTokenizer::new(1, 2, false);
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
tokenizer.token_stream("hello").process(&mut add_token);
}
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "he", 0, 2);
assert_token(&tokens[2], 1, "e", 1, 2);
assert_token(&tokens[3], 1, "el", 1, 3);
assert_token(&tokens[4], 2, "l", 2, 3);
assert_token(&tokens[5], 2, "ll", 2, 4);
assert_token(&tokens[6], 3, "l", 3, 4);
assert_token(&tokens[7], 3, "lo", 3, 5);
assert_token(&tokens[8], 4, "o", 4, 5);
let tokenizer = tokenizer_manager.get("ngram3").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
tokenizer.token_stream("Hello").process(&mut add_token);
}
assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "hel", 0, 3);
assert_token(&tokens[1], 1, "ell", 1, 4);
assert_token(&tokens[2], 2, "llo", 2, 5);
let tokenizer = tokenizer_manager.get("edgegram5").unwrap();
let mut tokens: Vec<Token> = vec![];
{
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
tokenizer
.token_stream("Frankenstein")
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "fr", 0, 2);
assert_token(&tokens[1], 0, "fra", 0, 3);
assert_token(&tokens[2], 0, "fran", 0, 4);
assert_token(&tokens[3], 0, "frank", 0, 5);
}
#[test]
fn test_tokenizer_empty() {
let tokenizer_manager = TokenizerManager::default();

View File

@@ -2,15 +2,14 @@ use super::{Token, TokenStream, Tokenizer};
/// Tokenize the text by splitting words into n-grams of the given size(s)
///
/// With this tokenizer, the `position` is always 0.
/// Beware however: in the presence of multiple values for the same field,
/// the position will be `POSITION_GAP * index of value`.
/// With this tokenizer, the `position` field expresses the starting offset of the ngram
/// rather than the `token` offset.
///
/// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
///
/// | Term | he | hel | el | ell | ll | llo | lo |
/// |----------|-----|-----|-----|-----|-----|-----|----|
/// | Position | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
/// | Position | 0 | 0 | 1 | 1 | 2 | 2 | 3 |
/// | Offsets | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5|
///
/// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
@@ -20,63 +19,24 @@ use super::{Token, TokenStream, Tokenizer};
/// | Position | 0 | 0 | 0 | 0 |
/// | Offsets | 0,2 | 0,3 | 0,4 | 0,5 |
///
/// Example 3: `hεllo` (non-ascii) would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
///
/// | Term | hε | hεl | hεll | hεllo |
/// |----------|-----|-----|-------|-------|
/// | Position | 0 | 0 | 0 | 0 |
/// | Offsets | 0,3 | 0,4 | 0,5 | 0,6 |
///
/// # Example
///
/// ```
/// # extern crate tantivy;
/// extern crate tantivy;
/// use tantivy::tokenizer::*;
/// use tantivy::tokenizer::assert_token;
///
/// # fn main() {
/// let tokenizer = NgramTokenizer::new(2, 3, false);
/// let mut stream = tokenizer.token_stream("hello");
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "he");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 2);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "hel");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 3);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "el");
/// assert_eq!(token.offset_from, 1);
/// assert_eq!(token.offset_to, 3);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "ell");
/// assert_eq!(token.offset_from, 1);
/// assert_eq!(token.offset_to, 4);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "ll");
/// assert_eq!(token.offset_from, 2);
/// assert_eq!(token.offset_to, 4);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "llo");
/// assert_eq!(token.offset_from, 2);
/// assert_eq!(token.offset_to, 5);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "lo");
/// assert_eq!(token.offset_from, 3);
/// assert_eq!(token.offset_to, 5);
/// }
///
/// assert_token(stream.next().unwrap(), 0, "he", 0, 2);
/// assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
/// assert_token(stream.next().unwrap(), 1, "el", 1, 3);
/// assert_token(stream.next().unwrap(), 1, "ell", 1, 4);
/// assert_token(stream.next().unwrap(), 2, "ll", 2, 4);
/// assert_token(stream.next().unwrap(), 2, "llo", 2, 5);
/// assert_token(stream.next().unwrap(), 3, "lo", 3, 5);
/// assert!(stream.next().is_none());
/// # }
/// ```
@@ -98,37 +58,23 @@ impl NgramTokenizer {
min_gram <= max_gram,
"min_gram must not be greater than max_gram"
);
NgramTokenizer {
min_gram,
max_gram,
prefix_only,
}
}
/// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
///
/// This is as opposed to only prefix ngrams.
pub fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
Self::new(min_gram, max_gram, false)
}
/// Create a `NGramTokenizer` which only generates tokens for the
/// prefix ngrams.
pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
Self::new(min_gram, max_gram, true)
}
}
/// TokenStream associate to the `NgramTokenizer`
pub struct NgramTokenStream<'a> {
/// parameters
ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
/// true if the NgramTokenStream is in prefix mode.
prefix_only: bool,
/// input
text: &'a str,
/// output
position: usize,
text_length: usize,
token: Token,
min_gram: usize,
max_gram: usize,
gram_size: usize,
prefix_only: bool,
}
impl<'a> Tokenizer<'a> for NgramTokenizer {
@@ -136,28 +82,65 @@ impl<'a> Tokenizer<'a> for NgramTokenizer {
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
NgramTokenStream {
ngram_charidx_iterator: StutteringIterator::new(
CodepointFrontiers::for_str(text),
self.min_gram,
self.max_gram),
prefix_only: self.prefix_only,
text,
position: 0,
text_length: text.len(),
token: Token::default(),
min_gram: self.min_gram,
max_gram: self.max_gram,
prefix_only: self.prefix_only,
gram_size: self.min_gram,
}
}
}
impl<'a> NgramTokenStream<'a> {
/// Get the next `(position, gram_size)` pair,
/// cycling `gram_size` through `min_gram..=max_gram`
/// and returning `None` when processing should stop.
fn chomp(&mut self) -> Option<(usize, usize)> {
// Have we exhausted the gram sizes for the current position?
if self.gram_size > self.max_gram {
if self.prefix_only {
return None;
}
// since we aren't just processing edges
// we need to reset the gram size
self.gram_size = self.min_gram;
// and move down the chain of letters
self.position += 1;
}
let result = if (self.position + self.gram_size) <= self.text_length {
Some((self.position, self.gram_size))
} else {
None
};
// increase the gram size for the next pass
self.gram_size += 1;
result
}
}
impl<'a> TokenStream for NgramTokenStream<'a> {
fn advance(&mut self) -> bool {
if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
if self.prefix_only && offset_from > 0 {
return false;
}
self.token.position = 0;
// clear out working token text
self.token.text.clear();
if let Some((position, size)) = self.chomp() {
self.token.position = position;
let offset_from = position;
let offset_to = offset_from + size;
self.token.offset_from = offset_from;
self.token.offset_to = offset_to;
self.token.text.clear();
self.token.text.push_str(&self.text[offset_from..offset_to]);
true
} else {
false
@@ -167,307 +150,8 @@ impl<'a> TokenStream for NgramTokenStream<'a> {
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
}
}
/// This iterator takes an underlying Iterator
/// and emits all of the pairs `(a,b)` such that
/// a and b are items emitted by the iterator at
/// an interval between `min_gram` and `max_gram`.
///
/// The pairs are emitted in order of appearance:
/// sorted by `a` first, then by `b`.
///
/// See `test_stutterring_iterator` for an example of its
/// output.
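///
/// For instance, with `min_gram = 1` and `max_gram = 2` over the offsets `0..10`,
/// the first pairs emitted are `(0, 1)`, `(0, 2)`, `(1, 2)`, `(1, 3)`, `(2, 3)`, ...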
struct StutteringIterator<T> {
underlying: T,
min_gram: usize,
max_gram: usize,
memory: Vec<usize>,
cursor: usize,
gram_len: usize
}
impl<T> StutteringIterator<T>
where T: Iterator<Item=usize> {
pub fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
assert!(min_gram > 0);
let memory: Vec<usize> = (&mut underlying).take(max_gram + 1).collect();
if memory.len() <= min_gram {
// returns an empty iterator
StutteringIterator {
underlying,
min_gram: 1,
max_gram: 0,
memory,
cursor: 0,
gram_len: 0,
}
} else {
StutteringIterator {
underlying,
min_gram,
max_gram: memory.len() - 1,
memory,
cursor: 0,
gram_len: min_gram,
}
}
}
}
impl<T> Iterator for StutteringIterator<T>
where T: Iterator<Item=usize> {
type Item = (usize, usize);
fn next(&mut self) -> Option<(usize, usize)> {
if self.gram_len > self.max_gram {
// we have exhausted all options
// starting at `self.memory[self.cursor]`.
//
// Time to advance.
self.gram_len = self.min_gram;
if let Some(next_val) = self.underlying.next() {
self.memory[self.cursor] = next_val;
} else {
self.max_gram -= 1;
}
self.cursor += 1;
if self.cursor >= self.memory.len() {
self.cursor = 0;
}
}
if self.max_gram < self.min_gram {
return None;
}
let start = self.memory[self.cursor % self.memory.len()];
let stop = self.memory[(self.cursor + self.gram_len) % self.memory.len()];
self.gram_len += 1;
Some((start, stop))
}
}
/// Emits all of the offsets where a codepoint starts
/// or a codepoint ends.
///
/// By convention, we emit [0] for the empty string.
struct CodepointFrontiers<'a> {
s: &'a str,
next_el: Option<usize>
}
impl<'a> CodepointFrontiers<'a> {
fn for_str(s: &'a str) -> Self {
CodepointFrontiers {
s,
next_el: Some(0)
}
}
}
impl<'a> Iterator for CodepointFrontiers<'a> {
type Item = usize;
fn next(&mut self) -> Option<usize> {
self.next_el
.map(|offset| {
if self.s.is_empty() {
self.next_el = None;
} else {
let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]);
self.s = &self.s[first_codepoint_width..];
self.next_el = Some(offset + first_codepoint_width);
}
offset
})
}
}
const CODEPOINT_UTF8_WIDTH: [u8; 16] = [
1, 1, 1, 1,
1, 1, 1, 1,
2, 2, 2, 2,
2, 2, 3, 4,
];
// Number of bytes to encode a codepoint in UTF-8 given
// the first byte.
//
// In UTF-8, the first byte's leading bits encode that width (`0xxxxxxx` = 1 byte,
// `110xxxxx` = 2, `1110xxxx` = 3, `11110xxx` = 4); here it is looked up from the
// byte's high nibble.
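//
// For example: b'h' = 0x68 has high nibble 0x6, so width 1; 'ε' is encoded as
// 0xCE 0xB5 and 0xCE has high nibble 0xC, so width 2; 'あ' starts with 0xE3
// (high nibble 0xE), so width 3.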
fn utf8_codepoint_width(b: u8) -> usize {
let higher_4_bits = (b as usize) >> 4;
CODEPOINT_UTF8_WIDTH[higher_4_bits] as usize
}
#[cfg(test)]
mod tests {
use tokenizer::tokenizer::{TokenStream, Tokenizer};
use super::NgramTokenizer;
use tokenizer::Token;
use tokenizer::tests::assert_token;
use super::CodepointFrontiers;
use super::StutteringIterator;
use super::utf8_codepoint_width;
fn test_helper<T: TokenStream>(mut tokenizer: T) -> Vec<Token> {
let mut tokens: Vec<Token> = vec![];
tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
tokens
}
#[test]
fn test_utf8_codepoint_width() {
// 0xxx
for i in 0..128 {
assert_eq!(utf8_codepoint_width(i), 1);
}
// 110xx
for i in (128 | 64)..(128 | 64 | 32) {
assert_eq!(utf8_codepoint_width(i), 2);
}
// 1110xx
for i in (128 | 64 | 32)..(128 | 64 | 32 | 16) {
assert_eq!(utf8_codepoint_width(i), 3);
}
// 1111xx
for i in (128 | 64 | 32 | 16)..256 {
assert_eq!(utf8_codepoint_width(i as u8), 4);
}
}
#[test]
fn test_codepoint_frontiers() {
assert_eq!(CodepointFrontiers::for_str("").collect::<Vec<_>>(), vec![0]);
assert_eq!(
CodepointFrontiers::for_str("abcd").collect::<Vec<_>>(),
vec![0,1,2,3,4]
);
assert_eq!(
CodepointFrontiers::for_str("aあ").collect::<Vec<_>>(),
vec![0,1,4]
);
}
#[test]
fn test_ngram_tokenizer_1_2_false() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "he", 0, 2);
assert_token(&tokens[2], 0, "e", 1, 2);
assert_token(&tokens[3], 0, "el", 1, 3);
assert_token(&tokens[4], 0, "l", 2, 3);
assert_token(&tokens[5], 0, "ll", 2, 4);
assert_token(&tokens[6], 0, "l", 3, 4);
assert_token(&tokens[7], 0, "lo", 3, 5);
assert_token(&tokens[8], 0, "o", 4, 5);
}
#[test]
fn test_ngram_tokenizer_min_max_equal() {
let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "hel", 0, 3);
assert_token(&tokens[1], 0, "ell", 1, 4);
assert_token(&tokens[2], 0, "llo", 2, 5);
}
#[test]
fn test_ngram_tokenizer_2_5_prefix() {
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "fr", 0, 2);
assert_token(&tokens[1], 0, "fra", 0, 3);
assert_token(&tokens[2], 0, "fran", 0, 4);
assert_token(&tokens[3], 0, "frank", 0, 5);
}
#[test]
fn test_ngram_non_ascii_1_2() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "hε", 0, 3);
assert_token(&tokens[2], 0, "ε", 1, 3);
assert_token(&tokens[3], 0, "εl", 1, 4);
assert_token(&tokens[4], 0, "l", 3, 4);
assert_token(&tokens[5], 0, "ll", 3, 5);
assert_token(&tokens[6], 0, "l", 4, 5);
assert_token(&tokens[7], 0, "lo", 4, 6);
assert_token(&tokens[8], 0, "o", 5, 6);
}
#[test]
fn test_ngram_non_ascii_2_5_prefix() {
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "hε", 0, 3);
assert_token(&tokens[1], 0, "hεl", 0, 4);
assert_token(&tokens[2], 0, "hεll", 0, 5);
assert_token(&tokens[3], 0, "hεllo", 0, 6);
}
#[test]
fn test_ngram_empty() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
assert!(tokens.is_empty());
let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
assert!(tokens.is_empty());
}
#[test]
#[should_panic(expected = "min_gram must be greater than 0")]
fn test_ngram_min_max_interval_empty() {
test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
}
#[test]
#[should_panic(expected = "min_gram must not be greater than max_gram")]
fn test_invalid_interval_should_panic_if_smaller() {
NgramTokenizer::all_ngrams(2, 1);
}
#[test]
fn test_stutterring_iterator_empty() {
let rg: Vec<usize> = vec![0];
let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
assert_eq!(it.next(), None);
}
#[test]
fn test_stutterring_iterator() {
let rg: Vec<usize> = (0..10).collect();
let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
assert_eq!(it.next(), Some((0, 1)));
assert_eq!(it.next(), Some((0, 2)));
assert_eq!(it.next(), Some((1, 2)));
assert_eq!(it.next(), Some((1, 3)));
assert_eq!(it.next(), Some((2, 3)));
assert_eq!(it.next(), Some((2, 4)));
assert_eq!(it.next(), Some((3, 4)));
assert_eq!(it.next(), Some((3, 5)));
assert_eq!(it.next(), Some((4, 5)));
assert_eq!(it.next(), Some((4, 6)));
assert_eq!(it.next(), Some((5, 6)));
assert_eq!(it.next(), Some((5, 7)));
assert_eq!(it.next(), Some((6, 7)));
assert_eq!(it.next(), Some((6, 8)));
assert_eq!(it.next(), Some((7, 8)));
assert_eq!(it.next(), Some((7, 9)));
assert_eq!(it.next(), Some((8, 9)));
assert_eq!(it.next(), None);
}
}