Compare commits

...

6 Commits

Author SHA1 Message Date
Paul Masurel
790baa7adf Integrated state into TermDict streamer 2019-08-16 10:29:28 +09:00
Paul Masurel
039c0a0863 Introducing a wrapper struct instead of Boxed<BoxableTokenizer> (#631)
Closes #629
2019-08-15 16:37:04 +09:00
Paul Masurel
b3b0138b82 Change for tantivy-py
Schema.convert_named_doc
Better Debug string for Terms and TermQueries
2019-08-14 17:44:25 +09:00
petr-tik
ea56160cdc Added cargo-fmt to CI runs (#627)
* Added cargo-fmt to CI runs

Closes #625

* Remove fmt from appveyor builds

Windows seems to have issues with install components through rustup.

Formatting should be equally informative regardless of the OS,
so best to keep it in Linux on Travis
2019-08-12 08:25:47 +09:00
petr-tik
028b0a749c Elastic unbounded range query (#624)
* Tidy up

fmt

remove unneccessary -> Result<()> followed by run.unwrap() in a test

* Adding support for elasticsearch-style unbounded queries

Extend the UserInputBound to include Unbounded, so we can reuse formatting and
internal query format

* Still working on elastic-style range queries

Fixes #498

Merge the elastic_range into range

Reformat to make code easier to follow, use optional() macro to return Some

* Fixed bugs

Made the range parser insensitive to whitespace between the ":" and the range.

Removed optional parsing of field.

Added a unit test for the range parser.

Derived PartialEq to compare the results of parsing as structs, instead of
strings. Found a bug with that unit test - "*}" was parsed as an
UserInputBound::Exclusive, instead of UserInputBound::Unbounded. Added an early
detection-and-return for * in the original range parser

* Correct failing test

Assume that we will use "{*" for Unbounded ranges

* Add a note in the changelog

cargo-fmt

* Moved parenthesis to a newline to make nested if-else more visible
2019-08-12 08:24:47 +09:00
Paul Masurel
941f06eb9f Added Schema.from_named_doc 2019-08-11 16:50:32 +09:00
21 changed files with 358 additions and 145 deletions

View File

@@ -47,6 +47,7 @@ matrix:
before_install:
- set -e
- rustup self update
- rustup component add rustfmt
install:
- sh ci/install.sh
@@ -60,6 +61,7 @@ before_script:
script:
- bash ci/script.sh
- cargo fmt --all -- --check
before_deploy:
- sh ci/before_deploy.sh

View File

@@ -5,7 +5,12 @@ Tantivy 0.11.0
- Various bugfixes in the query parser.
- Better handling of hyphens in query parser. (#609)
- Better handling of whitespaces.
- Closes #498 - add support for Elastic-style unbounded range queries for alphanumeric types eg. "title:>hello", "weight:>=70.5", "height:<200" (@petr-tik)
- API change around `Box<BoxableTokenizer>`. See detail in #629
## How to update?
`Box<dyn BoxableTokenizer>` has been replaced by a `BoxedTokenizer` struct.
Tantivy 0.10.1
=====================

View File

@@ -17,7 +17,7 @@ base64 = "0.10.0"
byteorder = "1.0"
once_cell = "0.2"
regex = "1.0"
tantivy-fst = "0.1"
tantivy-fst = {git="https://github.com/tantivy-search/fst"}
memmap = {version = "0.7", optional=true}
lz4 = {version="1.20", optional=true}
snap = {version="0.2"}

View File

@@ -173,11 +173,11 @@ impl Index {
}
/// Helper to access the tokenizer associated to a specific field.
pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<dyn BoxedTokenizer>> {
pub fn tokenizer_for_field(&self, field: Field) -> Result<BoxedTokenizer> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let tokenizer_manager: &TokenizerManager = self.tokenizers();
let tokenizer_name_opt: Option<Box<dyn BoxedTokenizer>> = match field_type {
let tokenizer_name_opt: Option<BoxedTokenizer> = match field_type {
FieldType::Str(text_options) => text_options
.get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())

View File

@@ -49,7 +49,7 @@ pub struct SegmentWriter {
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<Box<dyn BoxedTokenizer>>>,
tokenizers: Vec<Option<BoxedTokenizer>>,
}
impl SegmentWriter {

View File

@@ -14,6 +14,7 @@ use tantivy_fst::Automaton;
pub struct AutomatonWeight<A>
where
A: Automaton + Send + Sync + 'static,
A::State: Clone + Default + Sized,
{
field: Field,
automaton: A,
@@ -22,6 +23,7 @@ where
impl<A> AutomatonWeight<A>
where
A: Automaton + Send + Sync + 'static,
A::State: Clone + Default + Sized,
{
/// Create a new AutomationWeight
pub fn new(field: Field, automaton: A) -> AutomatonWeight<A> {
@@ -37,6 +39,7 @@ where
impl<A> Weight for AutomatonWeight<A>
where
A: Automaton + Send + Sync + 'static,
A::State: Clone + Default + Sized,
{
fn scorer(&self, reader: &SegmentReader) -> Result<Box<dyn Scorer>> {
let max_doc = reader.max_doc();

View File

@@ -83,28 +83,67 @@ parser! {
}
parser! {
/// Function that parses a range out of a Stream
/// Supports ranges like:
/// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10
/// [a TO *], [a TO c], [abc TO bcd}
fn range[I]()(I) -> UserInputLeaf
where [I: Stream<Item = char>] {
let range_term_val = || {
word().or(negative_number()).or(char('*').with(value("*".to_string())))
};
// check for unbounded range in the form of <5, <=10, >5, >=5
let elastic_unbounded_range = (choice([attempt(string(">=")),
attempt(string("<=")),
attempt(string("<")),
attempt(string(">"))])
.skip(spaces()),
range_term_val()).
map(|(comparison_sign, bound): (&str, String)|
match comparison_sign {
">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded),
"<=" => (UserInputBound::Unbounded, UserInputBound::Inclusive(bound)),
"<" => (UserInputBound::Unbounded, UserInputBound::Exclusive(bound)),
">" => (UserInputBound::Exclusive(bound), UserInputBound::Unbounded),
// default case
_ => (UserInputBound::Unbounded, UserInputBound::Unbounded)
});
let lower_bound = (one_of("{[".chars()), range_term_val())
.map(|(boundary_char, lower_bound): (char, String)|
if boundary_char == '{' { UserInputBound::Exclusive(lower_bound) }
else { UserInputBound::Inclusive(lower_bound) });
if lower_bound == "*" {
UserInputBound::Unbounded
} else if boundary_char == '{' {
UserInputBound::Exclusive(lower_bound)
} else {
UserInputBound::Inclusive(lower_bound)
});
let upper_bound = (range_term_val(), one_of("}]".chars()))
.map(|(higher_bound, boundary_char): (String, char)|
if boundary_char == '}' { UserInputBound::Exclusive(higher_bound) }
else { UserInputBound::Inclusive(higher_bound) });
(
optional(field()),
lower_bound
.skip((spaces(), string("TO"), spaces())),
upper_bound,
).map(|(field, lower, upper)| UserInputLeaf::Range {
field,
lower,
upper
if higher_bound == "*" {
UserInputBound::Unbounded
} else if boundary_char == '}' {
UserInputBound::Exclusive(higher_bound)
} else {
UserInputBound::Inclusive(higher_bound)
});
// return only lower and upper
let lower_to_upper = (lower_bound.
skip((spaces(),
string("TO"),
spaces())),
upper_bound);
(optional(field()).skip(spaces()),
// try elastic first, if it matches, the range is unbounded
attempt(elastic_unbounded_range).or(lower_to_upper))
.map(|(field, (lower, upper))|
// Construct the leaf from extracted field (optional)
// and bounds
UserInputLeaf::Range {
field,
lower,
upper
})
}
}
@@ -258,6 +297,49 @@ mod test {
);
}
#[test]
fn test_parse_elastic_query_ranges() {
test_parse_query_to_ast_helper("title: >a", "title:{\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("title:>=a", "title:[\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("title: <a", "title:{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("title:<=a", "title:{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("title:<=bsd", "title:{\"*\" TO \"bsd\"]");
test_parse_query_to_ast_helper("weight: >70", "weight:{\"70\" TO \"*\"}");
test_parse_query_to_ast_helper("weight:>=70", "weight:[\"70\" TO \"*\"}");
test_parse_query_to_ast_helper("weight: <70", "weight:{\"*\" TO \"70\"}");
test_parse_query_to_ast_helper("weight:<=70", "weight:{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: >60.7", "weight:{\"60.7\" TO \"*\"}");
test_parse_query_to_ast_helper("weight: <= 70", "weight:{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: <= 70.5", "weight:{\"*\" TO \"70.5\"]");
}
#[test]
fn test_range_parser() {
// testing the range() parser separately
let res = range().parse("title: <hello").unwrap().0;
let expected = UserInputLeaf::Range {
field: Some("title".to_string()),
lower: UserInputBound::Unbounded,
upper: UserInputBound::Exclusive("hello".to_string()),
};
let res2 = range().parse("title:{* TO hello}").unwrap().0;
assert_eq!(res, expected);
assert_eq!(res2, expected);
let expected_weight = UserInputLeaf::Range {
field: Some("weight".to_string()),
lower: UserInputBound::Inclusive("71.2".to_string()),
upper: UserInputBound::Unbounded,
};
let res3 = range().parse("weight: >=71.2").unwrap().0;
let res4 = range().parse("weight:[71.2 TO *}").unwrap().0;
assert_eq!(res3, expected_weight);
assert_eq!(res4, expected_weight);
}
#[test]
fn test_parse_query_to_triming_spaces() {
test_parse_query_to_ast_helper(" abc", "\"abc\"");
@@ -291,7 +373,7 @@ mod test {
test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:[\"*\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:{\"*\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}");
test_parse_query_to_ast_helper("foo:[1.1 TO *}", "foo:[\"1.1\" TO \"*\"}");
test_is_parse_err("abc + ");

View File

@@ -369,6 +369,7 @@ impl QueryParser {
match *bound {
UserInputBound::Inclusive(_) => Ok(Bound::Included(term)),
UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)),
UserInputBound::Unbounded => Ok(Bound::Unbounded),
}
}
@@ -628,7 +629,7 @@ mod test {
pub fn test_parse_query_untokenized() {
test_parse_query_to_logical_ast_helper(
"nottokenized:\"wordone wordtwo\"",
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
"Term(field=7,bytes=[119, 111, 114, 100, 111, 110, \
101, 32, 119, 111, 114, 100, 116, 119, 111])",
false,
);
@@ -672,7 +673,7 @@ mod test {
.is_ok());
test_parse_query_to_logical_ast_helper(
"unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
"Term(field=3,bytes=[0, 0, 0, 0, 0, 0, 9, 20])",
false,
);
@@ -693,19 +694,19 @@ mod test {
pub fn test_parse_query_to_ast_single_term() {
test_parse_query_to_logical_ast_helper(
"title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
"Term(field=0,bytes=[116, 111, 116, 111])",
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
"Term(field=0,bytes=[116, 111, 116, 111])",
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
"(+Term(field=0,bytes=[116, 111, 116, 111]) \
-(Term(field=0,bytes=[116, 105, 116, 105]) \
Term(field=1,bytes=[116, 105, 116, 105])))",
false,
);
assert_eq!(
@@ -720,14 +721,13 @@ mod test {
pub fn test_parse_query_to_ast_two_terms() {
test_parse_query_to_logical_ast_helper(
"title:a b",
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
Term([0, 0, 0, 1, 98])))",
"(Term(field=0,bytes=[97]) (Term(field=0,bytes=[98]) Term(field=1,bytes=[98])))",
false,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b\"",
"\"[(0, Term([0, 0, 0, 0, 97])), \
(1, Term([0, 0, 0, 0, 98]))]\"",
"\"[(0, Term(field=0,bytes=[97])), \
(1, Term(field=0,bytes=[98]))]\"",
false,
);
}
@@ -736,45 +736,43 @@ mod test {
pub fn test_parse_query_to_ast_ranges() {
test_parse_query_to_logical_ast_helper(
"title:[a TO b]",
"(Included(Term([0, 0, 0, 0, 97])) TO \
Included(Term([0, 0, 0, 0, 98])))",
"(Included(Term(field=0,bytes=[97])) TO Included(Term(field=0,bytes=[98])))",
false,
);
test_parse_query_to_logical_ast_helper(
"[a TO b]",
"((Included(Term([0, 0, 0, 0, 97])) TO \
Included(Term([0, 0, 0, 0, 98]))) \
(Included(Term([0, 0, 0, 1, 97])) TO \
Included(Term([0, 0, 0, 1, 98]))))",
"((Included(Term(field=0,bytes=[97])) TO \
Included(Term(field=0,bytes=[98]))) \
(Included(Term(field=1,bytes=[97])) TO \
Included(Term(field=1,bytes=[98]))))",
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO toto}",
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO \
Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
"(Excluded(Term(field=0,bytes=[116, 105, 116, 105])) TO \
Excluded(Term(field=0,bytes=[116, 111, 116, 111])))",
false,
);
test_parse_query_to_logical_ast_helper(
"title:{* TO toto}",
"(Unbounded TO \
Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
"(Unbounded TO Excluded(Term(field=0,bytes=[116, 111, 116, 111])))",
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO *}",
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO Unbounded)",
"(Excluded(Term(field=0,bytes=[116, 105, 116, 105])) TO Unbounded)",
false,
);
test_parse_query_to_logical_ast_helper(
"signed:{-5 TO 3}",
"(Excluded(Term([0, 0, 0, 2, 127, 255, 255, 255, 255, 255, 255, 251])) TO \
Excluded(Term([0, 0, 0, 2, 128, 0, 0, 0, 0, 0, 0, 3])))",
"(Excluded(Term(field=2,bytes=[127, 255, 255, 255, 255, 255, 255, 251])) TO \
Excluded(Term(field=2,bytes=[128, 0, 0, 0, 0, 0, 0, 3])))",
false,
);
test_parse_query_to_logical_ast_helper(
"float:{-1.5 TO 1.5}",
"(Excluded(Term([0, 0, 0, 10, 64, 7, 255, 255, 255, 255, 255, 255])) TO \
Excluded(Term([0, 0, 0, 10, 191, 248, 0, 0, 0, 0, 0, 0])))",
"(Excluded(Term(field=10,bytes=[64, 7, 255, 255, 255, 255, 255, 255])) TO \
Excluded(Term(field=10,bytes=[191, 248, 0, 0, 0, 0, 0, 0])))",
false,
);
@@ -879,19 +877,19 @@ mod test {
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper(
"title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
"Term(field=0,bytes=[116, 111, 116, 111])",
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
"Term(field=0,bytes=[116, 111, 116, 111])",
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
"(+Term(field=0,bytes=[116, 111, 116, 111]) \
-(Term(field=0,bytes=[116, 105, 116, 105]) \
Term(field=1,bytes=[116, 105, 116, 105])))",
true,
);
assert_eq!(
@@ -902,15 +900,15 @@ mod test {
);
test_parse_query_to_logical_ast_helper(
"title:a b",
"(+Term([0, 0, 0, 0, 97]) \
+(Term([0, 0, 0, 0, 98]) \
Term([0, 0, 0, 1, 98])))",
"(+Term(field=0,bytes=[97]) \
+(Term(field=0,bytes=[98]) \
Term(field=1,bytes=[98])))",
true,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b\"",
"\"[(0, Term([0, 0, 0, 0, 97])), \
(1, Term([0, 0, 0, 0, 98]))]\"",
"\"[(0, Term(field=0,bytes=[97])), \
(1, Term(field=0,bytes=[98]))]\"",
true,
);
}
@@ -919,10 +917,8 @@ mod test {
pub fn test_query_parser_hyphen() {
test_parse_query_to_logical_ast_helper(
"title:www-form-encoded",
"\"[(0, Term([0, 0, 0, 0, 119, 119, 119])), \
(1, Term([0, 0, 0, 0, 102, 111, 114, 109])), \
(2, Term([0, 0, 0, 0, 101, 110, 99, 111, 100, 101, 100]))]\"",
false,
"\"[(0, Term(field=0,bytes=[119, 119, 119])), (1, Term(field=0,bytes=[102, 111, 114, 109])), (2, Term(field=0,bytes=[101, 110, 99, 111, 100, 101, 100]))]\"",
false
);
}
}

View File

@@ -3,6 +3,7 @@ use std::fmt::{Debug, Formatter};
use crate::query::Occur;
#[derive(PartialEq)]
pub enum UserInputLeaf {
Literal(UserInputLiteral),
All,
@@ -35,6 +36,7 @@ impl Debug for UserInputLeaf {
}
}
#[derive(PartialEq)]
pub struct UserInputLiteral {
pub field_name: Option<String>,
pub phrase: String,
@@ -49,9 +51,11 @@ impl fmt::Debug for UserInputLiteral {
}
}
#[derive(PartialEq)]
pub enum UserInputBound {
Inclusive(String),
Exclusive(String),
Unbounded,
}
impl UserInputBound {
@@ -59,6 +63,7 @@ impl UserInputBound {
match *self {
UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word),
UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word),
UserInputBound::Unbounded => write!(formatter, "{{\"*\""),
}
}
@@ -66,6 +71,7 @@ impl UserInputBound {
match *self {
UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word),
UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word),
UserInputBound::Unbounded => write!(formatter, "\"*\"}}"),
}
}
@@ -73,6 +79,7 @@ impl UserInputBound {
match *self {
UserInputBound::Inclusive(ref contents) => contents,
UserInputBound::Exclusive(ref contents) => contents,
UserInputBound::Unbounded => &"*",
}
}
}

View File

@@ -338,39 +338,33 @@ mod tests {
use crate::collector::Count;
use crate::schema::{Document, Field, Schema, INDEXED};
use crate::Index;
use crate::Result;
use std::collections::Bound;
#[test]
fn test_range_query_simple() {
fn run() -> Result<()> {
let mut schema_builder = Schema::builder();
let year_field = schema_builder.add_u64_field("year", INDEXED);
let schema = schema_builder.build();
let mut schema_builder = Schema::builder();
let year_field = schema_builder.add_u64_field("year", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap();
for year in 1950u64..2017u64 {
let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
for _ in 0..num_docs_within_year {
index_writer.add_document(doc!(year_field => year));
}
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap();
for year in 1950u64..2017u64 {
let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
for _ in 0..num_docs_within_year {
index_writer.add_document(doc!(year_field => year));
}
index_writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960u64..1970u64);
// ... or `1960..=1969` if inclusive range is enabled.
let count = searcher.search(&docs_in_the_sixties, &Count)?;
assert_eq!(count, 2285);
Ok(())
index_writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
run().unwrap();
let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960u64..1970u64);
// ... or `1960..=1969` if inclusive range is enabled.
let count = searcher.search(&docs_in_the_sixties, &Count).unwrap();
assert_eq!(count, 2285);
}
#[test]

View File

@@ -12,7 +12,7 @@ mod tests {
use crate::collector::TopDocs;
use crate::docset::DocSet;
use crate::query::{Query, QueryParser, Scorer, TermQuery};
use crate::schema::{IndexRecordOption, Schema, STRING, TEXT};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
use crate::tests::assert_nearly_equals;
use crate::Index;
use crate::Term;
@@ -114,4 +114,16 @@ mod tests {
let reader = index.reader().unwrap();
assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1);
}
#[test]
fn test_term_query_debug() {
let term_query = TermQuery::new(
Term::from_field_text(Field(1), "hello"),
IndexRecordOption::WithFreqs,
);
assert_eq!(
format!("{:?}", term_query),
"TermQuery(Term(field=1,bytes=[104, 101, 108, 108, 111]))"
);
}
}

View File

@@ -7,6 +7,7 @@ use crate::Result;
use crate::Searcher;
use crate::Term;
use std::collections::BTreeSet;
use std::fmt;
/// A Term query matches all of the documents
/// containing a specific term.
@@ -61,12 +62,18 @@ use std::collections::BTreeSet;
/// Ok(())
/// }
/// ```
#[derive(Clone, Debug)]
#[derive(Clone)]
pub struct TermQuery {
term: Term,
index_record_option: IndexRecordOption,
}
impl fmt::Debug for TermQuery {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "TermQuery({:?})", self.term)
}
}
impl TermQuery {
/// Creates a new term query.
pub fn new(term: Term, segment_postings_options: IndexRecordOption) -> TermQuery {

View File

@@ -10,7 +10,7 @@ use serde_json::Value as JsonValue;
/// Possible error that may occur while parsing a field value
/// At this point the JSON is known to be valid.
#[derive(Debug)]
#[derive(Debug, PartialEq)]
pub enum ValueParsingError {
/// Encountered a numerical value that overflows or underflow its integer type.
OverflowError(String),

View File

@@ -246,6 +246,25 @@ impl Schema {
self.0.fields_map.get(field_name).cloned()
}
/// Create a named document off the doc.
pub fn convert_named_doc(
&self,
named_doc: NamedFieldDocument,
) -> Result<Document, DocParsingError> {
let mut document = Document::new();
for (field_name, values) in named_doc.0 {
if let Some(field) = self.get_field(&field_name) {
for value in values {
let field_value = FieldValue::new(field, value);
document.add(field_value);
}
} else {
return Err(DocParsingError::NoSuchFieldInSchema(field_name));
}
}
Ok(document)
}
/// Create a named document off the doc.
pub fn to_named_doc(&self, doc: &Document) -> NamedFieldDocument {
let mut field_map = BTreeMap::new();
@@ -360,7 +379,7 @@ impl<'de> Deserialize<'de> for Schema {
/// Error that may happen when deserializing
/// a document from JSON.
#[derive(Debug, Fail)]
#[derive(Debug, Fail, PartialEq)]
pub enum DocParsingError {
/// The payload given is not valid JSON.
#[fail(display = "The provided string is not valid JSON")]
@@ -369,7 +388,10 @@ pub enum DocParsingError {
#[fail(display = "The field '{:?}' could not be parsed: {:?}", _0, _1)]
ValueError(String, ValueParsingError),
/// The json-document contains a field that is not declared in the schema.
#[fail(display = "The json-document contains an unknown field: {:?}", _0)]
#[fail(
display = "The document contains a field that is not declared in the schema: {:?}",
_0
)]
NoSuchFieldInSchema(String),
}
@@ -381,6 +403,7 @@ mod tests {
use crate::schema::*;
use matches::{assert_matches, matches};
use serde_json;
use std::collections::BTreeMap;
#[test]
pub fn is_indexed_test() {
@@ -495,6 +518,54 @@ mod tests {
assert_eq!(doc, doc_serdeser);
}
#[test]
pub fn test_document_from_nameddoc() {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT);
let val = schema_builder.add_i64_field("val", INDEXED);
let schema = schema_builder.build();
let mut named_doc_map = BTreeMap::default();
named_doc_map.insert(
"title".to_string(),
vec![Value::from("title1"), Value::from("title2")],
);
named_doc_map.insert(
"val".to_string(),
vec![Value::from(14u64), Value::from(-1i64)],
);
let doc = schema
.convert_named_doc(NamedFieldDocument(named_doc_map))
.unwrap();
assert_eq!(
doc.get_all(title),
vec![
&Value::from("title1".to_string()),
&Value::from("title2".to_string())
]
);
assert_eq!(
doc.get_all(val),
vec![&Value::from(14u64), &Value::from(-1i64)]
);
}
#[test]
pub fn test_document_from_nameddoc_error() {
let schema = Schema::builder().build();
let mut named_doc_map = BTreeMap::default();
named_doc_map.insert(
"title".to_string(),
vec![Value::from("title1"), Value::from("title2")],
);
let err = schema
.convert_named_doc(NamedFieldDocument(named_doc_map))
.unwrap_err();
assert_eq!(
err,
DocParsingError::NoSuchFieldInSchema("title".to_string())
);
}
#[test]
pub fn test_parse_document() {
let mut schema_builder = Schema::builder();

View File

@@ -224,7 +224,12 @@ where
impl fmt::Debug for Term {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Term({:?})", &self.0[..])
write!(
f,
"Term(field={},bytes={:?})",
self.field().0,
self.value_bytes()
)
}
}

View File

@@ -63,7 +63,7 @@ impl FragmentCandidate {
fn try_add_token(&mut self, token: &Token, terms: &BTreeMap<String, f32>) {
self.stop_offset = token.offset_to;
if let Some(score) = terms.get(&token.text.to_lowercase()) {
if let Some(&score) = terms.get(&token.text.to_lowercase()) {
self.score += score;
self.highlighted
.push(HighlightSection::new(token.offset_from, token.offset_to));
@@ -142,7 +142,7 @@ impl Snippet {
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
/// has to be a valid string.
fn search_fragments<'a>(
tokenizer: &dyn BoxedTokenizer,
tokenizer: &BoxedTokenizer,
text: &'a str,
terms: &BTreeMap<String, f32>,
max_num_chars: usize,
@@ -150,7 +150,6 @@ fn search_fragments<'a>(
let mut token_stream = tokenizer.token_stream(text);
let mut fragment = FragmentCandidate::new(0);
let mut fragments: Vec<FragmentCandidate> = vec![];
while let Some(next) = token_stream.next() {
if (next.offset_to - fragment.start_offset) > max_num_chars {
if fragment.score > 0.0 {
@@ -254,7 +253,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// ```
pub struct SnippetGenerator {
terms_text: BTreeMap<String, f32>,
tokenizer: Box<dyn BoxedTokenizer>,
tokenizer: BoxedTokenizer,
field: Field,
max_num_chars: usize,
}
@@ -316,12 +315,8 @@ impl SnippetGenerator {
/// Generates a snippet for the given text.
pub fn snippet(&self, text: &str) -> Snippet {
let fragment_candidates = search_fragments(
&*self.tokenizer,
&text,
&self.terms_text,
self.max_num_chars,
);
let fragment_candidates =
search_fragments(&self.tokenizer, &text, &self.terms_text, self.max_num_chars);
select_best_fragment_combination(&fragment_candidates[..], &text)
}
}
@@ -331,7 +326,7 @@ mod tests {
use super::{search_fragments, select_best_fragment_combination};
use crate::query::QueryParser;
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
use crate::tokenizer::{box_tokenizer, SimpleTokenizer};
use crate::tokenizer::SimpleTokenizer;
use crate::Index;
use crate::SnippetGenerator;
use maplit::btreemap;
@@ -355,12 +350,12 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
let terms = btreemap! {
String::from("rust") => 1.0,
String::from("language") => 0.9
};
let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100);
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 100);
assert_eq!(fragments.len(), 7);
{
let first = &fragments[0];
@@ -382,13 +377,13 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_scored_fragment() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
{
let terms = btreemap! {
String::from("rust") =>1.0f32,
String::from("language") => 0.9f32
};
let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20);
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20);
{
let first = &fragments[0];
assert_eq!(first.score, 1.0);
@@ -397,13 +392,13 @@ Survey in 2016, 2017, and 2018."#;
let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT);
assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems")
}
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
{
let terms = btreemap! {
String::from("rust") =>0.9f32,
String::from("language") => 1.0f32
};
let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20);
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20);
//assert_eq!(fragments.len(), 7);
{
let first = &fragments[0];
@@ -417,14 +412,14 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_in_second_fragment() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
let text = "a b c d e f g";
let mut terms = BTreeMap::new();
terms.insert(String::from("c"), 1.0);
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 1);
{
@@ -441,14 +436,14 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_with_term_at_the_end_of_fragment() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
let text = "a b c d e f f g";
let mut terms = BTreeMap::new();
terms.insert(String::from("f"), 1.0);
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 2);
{
@@ -465,7 +460,7 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_with_second_fragment_has_the_highest_score() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
let text = "a b c d e f g";
@@ -473,7 +468,7 @@ Survey in 2016, 2017, and 2018."#;
terms.insert(String::from("f"), 1.0);
terms.insert(String::from("a"), 0.9);
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 7);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 7);
assert_eq!(fragments.len(), 2);
{
@@ -490,14 +485,14 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_with_term_not_in_text() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
let text = "a b c d";
let mut terms = BTreeMap::new();
terms.insert(String::from("z"), 1.0);
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 0);
@@ -508,12 +503,12 @@ Survey in 2016, 2017, and 2018."#;
#[test]
fn test_snippet_with_no_terms() {
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
let boxed_tokenizer = SimpleTokenizer.into();
let text = "a b c d";
let terms = BTreeMap::new();
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
assert_eq!(fragments.len(), 0);
let snippet = select_best_fragment_combination(&fragments[..], &text);

View File

@@ -2,7 +2,7 @@ use super::TermDictionary;
use crate::postings::TermInfo;
use crate::termdict::TermOrdinal;
use tantivy_fst::automaton::AlwaysMatch;
use tantivy_fst::map::{Stream, StreamBuilder};
use tantivy_fst::map::{Stream, StreamBuilder, StreamWithState};
use tantivy_fst::Automaton;
use tantivy_fst::{IntoStreamer, Streamer};
@@ -11,6 +11,7 @@ use tantivy_fst::{IntoStreamer, Streamer};
pub struct TermStreamerBuilder<'a, A = AlwaysMatch>
where
A: Automaton,
A::State: Clone,
{
fst_map: &'a TermDictionary,
stream_builder: StreamBuilder<'a, A>,
@@ -19,6 +20,7 @@ where
impl<'a, A> TermStreamerBuilder<'a, A>
where
A: Automaton,
A::State: Clone + Default + Sized,
{
pub(crate) fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a, A>) -> Self {
TermStreamerBuilder {
@@ -56,10 +58,11 @@ where
pub fn into_stream(self) -> TermStreamer<'a, A> {
TermStreamer {
fst_map: self.fst_map,
stream: self.stream_builder.into_stream(),
stream: self.stream_builder.with_state().into_stream(),
term_ord: 0u64,
current_key: Vec::with_capacity(100),
current_value: TermInfo::default(),
state: Default::default(),
}
}
}
@@ -69,27 +72,31 @@ where
pub struct TermStreamer<'a, A = AlwaysMatch>
where
A: Automaton,
A::State: Clone + Default + Sized,
{
fst_map: &'a TermDictionary,
stream: Stream<'a, A>,
stream: StreamWithState<'a, A>,
term_ord: TermOrdinal,
current_key: Vec<u8>,
current_value: TermInfo,
state: A::State,
}
impl<'a, A> TermStreamer<'a, A>
where
A: Automaton,
A::State: Clone + Default + Sized,
{
/// Advance position the stream on the next item.
/// Before the first call to `.advance()`, the stream
/// is an unitialized state.
pub fn advance(&mut self) -> bool {
if let Some((term, term_ord)) = self.stream.next() {
if let Some((term, term_ord, state)) = self.stream.next() {
self.current_key.clear();
self.current_key.extend_from_slice(term);
self.term_ord = term_ord;
self.current_value = self.fst_map.term_info_from_ord(term_ord);
self.state = state;
true
} else {
false
@@ -118,6 +125,10 @@ where
&self.current_key
}
pub fn state(&self) -> &A::State {
&self.state
}
/// Accesses the current value.
///
/// Calling `.value()` after the end of the stream will return the

View File

@@ -197,7 +197,11 @@ impl TermDictionary {
/// Returns a search builder, to stream all of the terms
/// within the Automaton
pub fn search<'a, A: Automaton + 'a>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
pub fn search<'a, A>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A>
where
A: Automaton + 'a,
A::State: Clone + Default + Sized,
{
let stream_builder = self.fst_index.search(automaton);
TermStreamerBuilder::<A>::new(self, stream_builder)
}

View File

@@ -155,7 +155,6 @@ pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub(crate) use self::tokenizer::box_tokenizer;
pub use self::tokenizer::BoxedTokenizer;
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

View File

@@ -56,8 +56,6 @@ pub trait Tokenizer<'a>: Sized + Clone {
/// # Example
///
/// ```rust
/// # extern crate tantivy;
///
/// use tantivy::tokenizer::*;
///
/// # fn main() {
@@ -80,7 +78,7 @@ pub trait Tokenizer<'a>: Sized + Clone {
}
/// A boxed tokenizer
pub trait BoxedTokenizer: Send + Sync {
trait BoxedTokenizerTrait: Send + Sync {
/// Tokenize a `&str`
fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a>;
@@ -92,7 +90,41 @@ pub trait BoxedTokenizer: Send + Sync {
fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b>;
/// Return a boxed clone of the tokenizer
fn boxed_clone(&self) -> Box<dyn BoxedTokenizer>;
fn boxed_clone(&self) -> BoxedTokenizer;
}
/// A boxed tokenizer
pub struct BoxedTokenizer(Box<dyn BoxedTokenizerTrait>);
impl<T> From<T> for BoxedTokenizer
where
T: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
fn from(tokenizer: T) -> BoxedTokenizer {
BoxedTokenizer(Box::new(BoxableTokenizer(tokenizer)))
}
}
impl BoxedTokenizer {
/// Tokenize a `&str`
pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
self.0.token_stream(text)
}
/// Tokenize an array`&str`
///
/// The resulting `TokenStream` is equivalent to what would be obtained if the &str were
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
/// to prevent accidental `PhraseQuery` to match accross two terms.
pub fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b> {
self.0.token_stream_texts(texts)
}
}
impl Clone for BoxedTokenizer {
fn clone(&self) -> BoxedTokenizer {
self.0.boxed_clone()
}
}
#[derive(Clone)]
@@ -100,7 +132,7 @@ struct BoxableTokenizer<A>(A)
where
A: for<'a> Tokenizer<'a> + Send + Sync;
impl<A> BoxedTokenizer for BoxableTokenizer<A>
impl<A> BoxedTokenizerTrait for BoxableTokenizer<A>
where
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
@@ -125,18 +157,11 @@ where
}
}
fn boxed_clone(&self) -> Box<dyn BoxedTokenizer> {
Box::new(self.clone())
fn boxed_clone(&self) -> BoxedTokenizer {
self.0.clone().into()
}
}
pub(crate) fn box_tokenizer<A>(a: A) -> Box<dyn BoxedTokenizer>
where
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
{
Box::new(BoxableTokenizer(a))
}
impl<'b> TokenStream for Box<dyn TokenStream + 'b> {
fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut();
@@ -161,7 +186,6 @@ impl<'b> TokenStream for Box<dyn TokenStream + 'b> {
/// # Example
///
/// ```
/// extern crate tantivy;
/// use tantivy::tokenizer::*;
///
/// # fn main() {
@@ -203,7 +227,6 @@ pub trait TokenStream {
/// and `.token()`.
///
/// ```
/// # extern crate tantivy;
/// # use tantivy::tokenizer::*;
/// #
/// # fn main() {

View File

@@ -1,4 +1,3 @@
use crate::tokenizer::box_tokenizer;
use crate::tokenizer::stemmer::Language;
use crate::tokenizer::BoxedTokenizer;
use crate::tokenizer::LowerCaser;
@@ -8,7 +7,6 @@ use crate::tokenizer::SimpleTokenizer;
use crate::tokenizer::Stemmer;
use crate::tokenizer::Tokenizer;
use std::collections::HashMap;
use std::ops::Deref;
use std::sync::{Arc, RwLock};
/// The tokenizer manager serves as a store for
@@ -25,16 +23,16 @@ use std::sync::{Arc, RwLock};
/// search engine.
#[derive(Clone)]
pub struct TokenizerManager {
tokenizers: Arc<RwLock<HashMap<String, Box<dyn BoxedTokenizer>>>>,
tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
}
impl TokenizerManager {
/// Registers a new tokenizer associated with a given name.
pub fn register<A>(&self, tokenizer_name: &str, tokenizer: A)
where
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
A: Into<BoxedTokenizer>,
{
let boxed_tokenizer = box_tokenizer(tokenizer);
let boxed_tokenizer = tokenizer.into();
self.tokenizers
.write()
.expect("Acquiring the lock should never fail")
@@ -42,13 +40,12 @@ impl TokenizerManager {
}
/// Accessing a tokenizer given its name.
pub fn get(&self, tokenizer_name: &str) -> Option<Box<dyn BoxedTokenizer>> {
pub fn get(&self, tokenizer_name: &str) -> Option<BoxedTokenizer> {
self.tokenizers
.read()
.expect("Acquiring the lock should never fail")
.get(tokenizer_name)
.map(Deref::deref)
.map(BoxedTokenizer::boxed_clone)
.cloned()
}
}