mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 17:42:55 +00:00
Merge branch 'issue/368b'
@@ -5,7 +5,7 @@ Tantivy 0.7
  greatly improving performance
- Tantivy errors now rely on the failure crate (@drusellers)
- Added support for `AND`, `OR`, `NOT` syntax in addition to the `+`,`-` syntax
- Added a snippet generator with highlight (@vigneshsarma, @fulmicoton)
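As a quick illustration of the new operator support (the schema, field name, and query strings below are illustrative, not part of this commit), both syntaxes parse with the 0.7 `QueryParser`:

    // Hedged sketch: a throwaway in-RAM index, used only to build a QueryParser.
    use tantivy::query::QueryParser;
    use tantivy::schema::{SchemaBuilder, TEXT};
    use tantivy::Index;

    fn parse_both_syntaxes() -> tantivy::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let body = schema_builder.add_text_field("body", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let query_parser = QueryParser::for_index(&index, vec![body]);
        // `+` / `-` syntax, unchanged from earlier releases.
        let _plus_minus = query_parser.parse_query("+sycamore +spring")?;
        // Textual operator syntax added in 0.7.
        let _textual = query_parser.parse_query("sycamore AND spring")?;
        Ok(())
    }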
Tantivy 0.6.1
=========================
@@ -47,6 +47,7 @@ census = "0.1"
fnv = "1.0.6"
owned-read = "0.4"
failure = "0.1"
htmlescape = "0.3.1"
fail = "0.2"

[target.'cfg(windows)'.dependencies]
@@ -60,7 +61,6 @@ opt-level = 3
debug = false
lto = true
debug-assertions = false
overflow-checks = false

[profile.test]
debug-assertions = true
examples/snippet.rs (new file, 73 lines)
@@ -0,0 +1,73 @@
// # Snippet example
//
// This example shows how to return a representative snippet of
// your hit result.
// Snippets are extracts of a target document, returned in HTML format.
// The keywords searched by the user are highlighted with a `<b>` tag.
extern crate tempdir;

// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::Index;
use tantivy::SnippetGenerator;
use tempdir::TempDir;

fn main() -> tantivy::Result<()> {
    // Let's create a temporary directory for the
    // sake of this example
    let index_path = TempDir::new("tantivy_example_dir")?;

    // # Defining the schema
    let mut schema_builder = SchemaBuilder::default();
    // Both fields are stored so they can be fetched back from the doc store below.
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT | STORED);
    let schema = schema_builder.build();

    // # Indexing documents
    let index = Index::create_in_dir(&index_path, schema.clone())?;

    let mut index_writer = index.writer(50_000_000)?;

    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    // we'll only need one doc for this example.
    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
    ));
    // ...
    index_writer.commit()?;

    index.load_searchers()?;

    let searcher = index.searcher();
    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    let query = query_parser.parse_query("sycamore spring")?;

    let mut top_collector = TopCollector::with_limit(10);
    searcher.search(&*query, &mut top_collector)?;

    let snippet_generator = SnippetGenerator::new(&*searcher, &*query, body)?;

    let doc_addresses = top_collector.docs();
    for doc_address in doc_addresses {
        let doc = searcher.doc(&doc_address)?;
        let snippet = snippet_generator.snippet_from_doc(&doc);
        println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
        println!("snippet: {}", snippet.to_html());
    }

    Ok(())
}
@@ -28,6 +28,9 @@ use num_cpus;
use std::path::Path;
use tokenizer::TokenizerManager;
use IndexWriter;
use schema::FieldType;
use schema::Field;
use tokenizer::BoxedTokenizer;

fn load_metas(directory: &Directory) -> Result<IndexMeta> {
    let meta_data = directory.atomic_read(&META_FILEPATH)?;
@@ -112,6 +115,34 @@ impl Index {
        &self.tokenizers
    }

    /// Helper to access the tokenizer associated to a specific field.
    pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> {
        let field_entry = self.schema.get_field_entry(field);
        let field_type = field_entry.field_type();
        let tokenizer_manager: &TokenizerManager = self.tokenizers();
        let tokenizer_name_opt: Option<Box<BoxedTokenizer>> =
            match field_type {
                FieldType::Str(text_options) => {
                    text_options
                        .get_indexing_options()
                        .map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
                        .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
                },
                _ => {
                    None
                }
            };
        match tokenizer_name_opt {
            Some(tokenizer) => {
                Ok(tokenizer)
            }
            None => {
                Err(TantivyError::SchemaError(format!("{:?} is not a text field.", field_entry.name())))
            }
        }
    }

    /// Opens a new directory from an index path.
    #[cfg(feature = "mmap")]
    pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
@@ -258,7 +289,7 @@ impl Index {
        let schema = self.schema();
        let num_searchers: usize = self.num_searchers.load(Ordering::Acquire);
        let searchers = (0..num_searchers)
-            .map(|_| Searcher::new(schema.clone(), segment_readers.clone()))
+            .map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone()))
            .collect();
        self.searcher_pool.publish_new_generation(searchers);
        Ok(())
@@ -296,3 +327,26 @@ impl Clone for Index {
        }
    }
}

#[cfg(test)]
mod tests {
    use Index;
    use schema::{SchemaBuilder, TEXT, INT_INDEXED};

    #[test]
    fn test_indexer_for_field() {
        let mut schema_builder = SchemaBuilder::default();
        let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED);
        let body_field = schema_builder.add_text_field("body", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        assert!(index.tokenizer_for_field(body_field).is_ok());
        assert_eq!(
            format!("{:?}", index.tokenizer_for_field(num_likes_field).err()),
            "Some(SchemaError(\"\\\"num_likes\\\" is not a text field.\"))"
        );
    }
}
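For reference, a minimal hedged sketch (the function name and sample text are illustrative) of what the new helper gives you: the tokenizer configured for a text field, ready to produce a token stream:

    use tantivy::schema::{SchemaBuilder, TEXT};
    use tantivy::Index;

    fn print_tokens() -> tantivy::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let body_field = schema_builder.add_text_field("body", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        // Fails with a SchemaError for non-text fields, as exercised in the test above.
        let tokenizer = index.tokenizer_for_field(body_field)?;
        let mut token_stream = tokenizer.token_stream("Of Mice and Men");
        // Each token carries its text plus byte offsets into the original string.
        while let Some(token) = token_stream.next() {
            println!("{} [{}..{}]", token.text, token.offset_from, token.offset_to);
        }
        Ok(())
    }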
@@ -10,6 +10,7 @@ use std::sync::Arc;
use termdict::TermMerger;
use DocAddress;
use Result;
use Index;

/// Holds a list of `SegmentReader`s ready for search.
///
@@ -18,17 +19,25 @@ use Result;
///
pub struct Searcher {
    schema: Schema,
    index: Index,
    segment_readers: Vec<SegmentReader>,
}

impl Searcher {
    /// Creates a new `Searcher`
-    pub(crate) fn new(schema: Schema, segment_readers: Vec<SegmentReader>) -> Searcher {
+    pub(crate) fn new(schema: Schema, index: Index, segment_readers: Vec<SegmentReader>) -> Searcher {
        Searcher {
            schema,
            index,
            segment_readers,
        }
    }

    /// Returns the `Index` associated to the `Searcher`
    pub fn index(&self) -> &Index {
        &self.index
    }

    /// Fetches a document from tantivy's store given a `DocAddress`.
    ///
    /// The searcher uses the segment ordinal to route the
@@ -770,23 +770,23 @@ mod tests {
        }
        {
            let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "af b");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
        }
        {
            let doc = searcher.doc(&DocAddress(0, 1)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c"));
        }
        {
            let doc = searcher.doc(&DocAddress(0, 2)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c d");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d"));
        }
        {
            let doc = searcher.doc(&DocAddress(0, 3)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "af b");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
        }
        {
            let doc = searcher.doc(&DocAddress(0, 4)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c g");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g"));
        }
        {
            let get_fast_vals = |terms: Vec<Term>| {
src/lib.rs (10 changed lines, Normal file → Executable file)
@@ -154,6 +154,7 @@ extern crate stable_deref_trait;
extern crate tempdir;
extern crate tempfile;
extern crate uuid;
extern crate htmlescape;

#[cfg(test)]
#[macro_use]
@@ -210,6 +211,9 @@ pub mod schema;
pub mod store;
pub mod termdict;

mod snippet;
pub use self::snippet::SnippetGenerator;

mod docset;
pub use self::docset::{DocSet, SkipResult};

@@ -893,11 +897,11 @@ mod tests {
        assert_eq!(document.len(), 3);
        let values = document.get_all(text_field);
        assert_eq!(values.len(), 2);
-        assert_eq!(values[0].text(), "tantivy");
-        assert_eq!(values[1].text(), "some other value");
+        assert_eq!(values[0].text(), Some("tantivy"));
+        assert_eq!(values[1].text(), Some("some other value"));
        let values = document.get_all(other_text_field);
        assert_eq!(values.len(), 1);
-        assert_eq!(values[0].text(), "short");
+        assert_eq!(values[0].text(), Some("short"));
    }

    #[test]
@@ -6,6 +6,7 @@ use query::Weight;
use schema::IndexRecordOption;
use schema::Term;
use Result;
use std::collections::BTreeSet;
use Searcher;

/// The boolean query combines a set of queries
@@ -40,6 +41,7 @@ impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {
}

impl Query for BooleanQuery {

    fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
        let sub_weights = self.subqueries
            .iter()
@@ -49,6 +51,12 @@ impl Query for BooleanQuery {
            .collect::<Result<_>>()?;
        Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
    }

    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
        for (_occur, subquery) in &self.subqueries {
            subquery.query_terms(term_set);
        }
    }
}

impl BooleanQuery {
@@ -27,7 +27,6 @@ mod weight;
mod vec_docset;

pub(crate) mod score_combiner;

pub use self::intersection::Intersection;
pub use self::union::Union;
@@ -6,6 +6,7 @@ use query::Query;
use query::Weight;
use schema::{Field, Term};
use Result;
use std::collections::BTreeSet;

/// `PhraseQuery` matches a specific sequence of words.
///
@@ -107,4 +108,10 @@ impl Query for PhraseQuery {
        )))
    }
}

    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
        for (_, query_term) in &self.phrase_terms {
            term_set.insert(query_term.clone());
        }
    }
}
@@ -30,6 +30,7 @@ impl PhraseWeight {
}

impl Weight for PhraseWeight {

    fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
        let similarity_weight = self.similarity_weight.clone();
        let field = self.phrase_terms[0].1.field();
@@ -5,6 +5,8 @@ use downcast;
use std::fmt;
use Result;
use SegmentLocalId;
use std::collections::BTreeSet;
use Term;

/// The `Query` trait defines a set of documents and a scoring method
/// for those documents.
@@ -58,6 +60,10 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug {
        Ok(result)
    }

    /// Extracts all of the terms associated with the query and inserts them
    /// into the term set given as argument.
    fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {}

    /// Search works as follows :
    ///
    /// First the weight object associated to the query is created.
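A small hedged sketch (hypothetical schema and query string) of the new hook, mirroring how `SnippetGenerator::new` consumes it later in this commit:

    use std::collections::BTreeSet;
    use tantivy::query::QueryParser;
    use tantivy::schema::{SchemaBuilder, TEXT};
    use tantivy::{Index, Term};

    fn collect_query_terms() -> tantivy::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let body = schema_builder.add_text_field("body", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let query = QueryParser::for_index(&index, vec![body]).parse_query("sycamore spring")?;
        // Every term of every subquery lands in the set, e.g. for highlighting.
        let mut terms: BTreeSet<Term> = BTreeSet::new();
        query.query_terms(&mut terms);
        for term in &terms {
            println!("{:?}", term);
        }
        Ok(())
    }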
@@ -274,6 +274,7 @@ impl RangeWeight {
}

impl Weight for RangeWeight {

    fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
        let max_doc = reader.max_doc();
        let mut doc_bitset = BitSet::with_max_value(max_doc);
@@ -6,6 +6,7 @@ use schema::IndexRecordOption;
use Result;
use Searcher;
use Term;
use std::collections::BTreeSet;

/// A Term query matches all of the documents
/// containing a specific term.
@@ -110,4 +111,7 @@ impl Query for TermQuery {
    fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
        Ok(Box::new(self.specialized_weight(searcher, scoring_enabled)))
    }
    fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
        term_set.insert(self.term.clone());
    }
}
@@ -443,8 +443,8 @@ mod tests {
            }"#,
        )
        .unwrap();
-        assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
-        assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
+        assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title"));
+        assert_eq!(doc.get_first(author_field).unwrap().text(), Some("fulmicoton"));
        assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
        assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
    }
@@ -74,10 +74,10 @@ impl Value {
    ///
    /// # Panics
    /// If the value is not of type `Str`
-    pub fn text(&self) -> &str {
+    pub fn text(&self) -> Option<&str> {
        match *self {
-            Value::Str(ref text) => text,
-            _ => panic!("This is not a text field."),
+            Value::Str(ref text) => Some(text),
+            _ => None,
        }
    }
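Since `text()` no longer panics, call sites now match on the returned `Option`; a minimal hedged sketch (the helper name is illustrative) of the new contract:

    use tantivy::schema::Value;

    /// Returns the first `Str` value, skipping non-text values instead of panicking.
    fn first_text(values: &[Value]) -> Option<&str> {
        values.iter().filter_map(Value::text).next()
    }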
src/snippet/mod.rs (new file, 479 lines)
@@ -0,0 +1,479 @@
use htmlescape::encode_minimal;
use std::collections::BTreeMap;
use tokenizer::{Token, TokenStream};
use Result;
use query::Query;
use Searcher;
use schema::Field;
use std::collections::BTreeSet;
use tokenizer::BoxedTokenizer;
use Document;
use std::cmp::Ordering;

const DEFAULT_MAX_NUM_CHARS: usize = 150;

#[derive(Debug)]
pub struct HighlightSection {
    start: usize,
    stop: usize,
}

impl HighlightSection {
    fn new(start: usize, stop: usize) -> HighlightSection {
        HighlightSection { start, stop }
    }
}

#[derive(Debug)]
pub struct FragmentCandidate {
    score: f32,
    start_offset: usize,
    stop_offset: usize,
    num_chars: usize,
    highlighted: Vec<HighlightSection>,
}
impl FragmentCandidate {
    /// Create a basic `FragmentCandidate`
    ///
    /// `score`, `num_chars` are set to 0
    /// and `highlighted` is set to empty vec
    /// stop_offset is set to start_offset, which is taken as a param.
    fn new(start_offset: usize) -> FragmentCandidate {
        FragmentCandidate {
            score: 0.0,
            start_offset: start_offset,
            stop_offset: start_offset,
            num_chars: 0,
            highlighted: vec![],
        }
    }

    /// Updates the `score` and `highlighted` fields of the object.
    ///
    /// Taking the token and terms, the token is added to the fragment.
    /// If the token is one of the terms, the score
    /// and highlighted fields are updated in the fragment.
    fn try_add_token(&mut self, token: &Token, terms: &BTreeMap<String, f32>) {
        self.stop_offset = token.offset_to;

        if let Some(score) = terms.get(&token.text.to_lowercase()) {
            self.score += score;
            self.highlighted
                .push(HighlightSection::new(token.offset_from, token.offset_to));
        }
    }
}
#[derive(Debug)]
pub struct Snippet {
    fragments: String,
    highlighted: Vec<HighlightSection>,
}

const HIGHLIGHTEN_PREFIX: &str = "<b>";
const HIGHLIGHTEN_POSTFIX: &str = "</b>";

impl Snippet {

    pub fn empty() -> Snippet {
        Snippet {
            fragments: String::new(),
            highlighted: Vec::new()
        }
    }

    /// Returns a highlighted HTML rendering of the `Snippet`.
    pub fn to_html(&self) -> String {
        let mut html = String::new();
        let mut start_from: usize = 0;

        for item in self.highlighted.iter() {
            html.push_str(&encode_minimal(&self.fragments[start_from..item.start]));
            html.push_str(HIGHLIGHTEN_PREFIX);
            html.push_str(&encode_minimal(&self.fragments[item.start..item.stop]));
            html.push_str(HIGHLIGHTEN_POSTFIX);
            start_from = item.stop;
        }
        html.push_str(&encode_minimal(
            &self.fragments[start_from..self.fragments.len()],
        ));
        html
    }
}
/// Returns a non-empty list of "good" fragments.
///
/// If no target term is within the text, then the function
/// should return an empty Vec.
///
/// If a target term is within the text, then the returned
/// list is required to be non-empty.
///
/// The returned list is non-empty and contains fewer
/// than 12 possibly overlapping fragments.
///
/// All fragments should contain at least one target term
/// and have at most `max_num_chars` characters (not bytes).
///
/// It is ok to emit overlapping fragments, for instance,
/// one short and one long containing the same keyword, in order
/// to leave optimization opportunity to the fragment selector
/// upstream.
///
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`
/// has to be a valid string.
fn search_fragments<'a>(
    tokenizer: &BoxedTokenizer,
    text: &'a str,
    terms: &BTreeMap<String, f32>,
    max_num_chars: usize,
) -> Vec<FragmentCandidate> {
    let mut token_stream = tokenizer.token_stream(text);
    let mut fragment = FragmentCandidate::new(0);
    let mut fragments: Vec<FragmentCandidate> = vec![];

    while let Some(next) = token_stream.next() {
        if (next.offset_to - fragment.start_offset) > max_num_chars {
            if fragment.score > 0.0 {
                fragments.push(fragment)
            };
            fragment = FragmentCandidate::new(next.offset_from);
        }
        fragment.try_add_token(next, &terms);
    }
    if fragment.score > 0.0 {
        fragments.push(fragment)
    }

    fragments
}
/// Returns a Snippet
///
/// Takes a vector of `FragmentCandidate`s and the text.
/// Figures out the best fragment from it and creates a snippet.
fn select_best_fragment_combination<'a>(
    fragments: Vec<FragmentCandidate>,
    text: &'a str,
) -> Snippet {
    let best_fragment_opt = fragments
        .iter()
        .max_by(|left, right| {
            let cmp_score = left.score.partial_cmp(&right.score).unwrap_or(Ordering::Equal);
            if cmp_score == Ordering::Equal {
                (right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset))
            } else {
                cmp_score
            }
        });
    if let Some(fragment) = best_fragment_opt {
        let fragment_text = &text[fragment.start_offset..fragment.stop_offset];
        let highlighted = fragment
            .highlighted
            .iter()
            .map(|item| {
                HighlightSection::new(
                    item.start - fragment.start_offset,
                    item.stop - fragment.start_offset,
                )
            }).collect();
        Snippet {
            fragments: fragment_text.to_string(),
            highlighted: highlighted,
        }
    } else {
        // when there are no fragments to choose from,
        // create an empty snippet for now
        Snippet {
            fragments: String::new(),
            highlighted: vec![],
        }
    }
}
/// `SnippetGenerator`
///
/// # Example
///
/// ```rust
/// # #[macro_use]
/// # extern crate tantivy;
/// # use tantivy::Index;
/// # use tantivy::schema::{SchemaBuilder, TEXT};
/// # use tantivy::query::QueryParser;
/// use tantivy::SnippetGenerator;
///
/// # fn main() -> tantivy::Result<()> {
/// # let mut schema_builder = SchemaBuilder::default();
/// # let text_field = schema_builder.add_text_field("text", TEXT);
/// # let schema = schema_builder.build();
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
/// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
/// # Je ne me sentis plus guidé par les haleurs :
/// # Des Peaux-Rouges criards les avaient pris pour cibles,
/// # Les ayant cloués nus aux poteaux de couleurs.
/// #
/// # J'étais insoucieux de tous les équipages,
/// # Porteur de blés flamands ou de cotons anglais.
/// # Quand avec mes haleurs ont fini ces tapages,
/// # Les Fleuves m'ont laissé descendre où je voulais.
/// # "#);
/// # index_writer.add_document(doc.clone());
/// # index_writer.commit()?;
/// # let query_parser = QueryParser::for_index(&index, vec![text_field]);
/// // ...
/// let query = query_parser.parse_query("haleurs flamands").unwrap();
/// # index.load_searchers()?;
/// # let searcher = index.searcher();
/// let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field)?;
/// snippet_generator.set_max_num_chars(100);
/// let snippet = snippet_generator.snippet_from_doc(&doc);
/// let snippet_html: String = snippet.to_html();
/// assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des");
/// # Ok(())
/// # }
/// ```
pub struct SnippetGenerator {
    terms_text: BTreeMap<String, f32>,
    tokenizer: Box<BoxedTokenizer>,
    field: Field,
    max_num_chars: usize
}
impl SnippetGenerator {
    /// Creates a new snippet generator
    pub fn new(searcher: &Searcher,
               query: &Query,
               field: Field) -> Result<SnippetGenerator> {
        let mut terms = BTreeSet::new();
        query.query_terms(&mut terms);
        let terms_text: BTreeMap<String, f32> = terms.into_iter()
            .filter(|term| term.field() == field)
            .map(|term| (term.text().to_string(), 1f32))
            .collect();
        let tokenizer = searcher.index().tokenizer_for_field(field)?;
        Ok(SnippetGenerator {
            terms_text,
            tokenizer,
            field,
            max_num_chars: DEFAULT_MAX_NUM_CHARS
        })
    }

    /// Sets a maximum number of chars.
    pub fn set_max_num_chars(&mut self, max_num_chars: usize) {
        self.max_num_chars = max_num_chars;
    }

    /// Generates a snippet for the given `Document`.
    ///
    /// This method extracts the text associated to the `SnippetGenerator`'s field
    /// and computes a snippet.
    pub fn snippet_from_doc(&self, doc: &Document) -> Snippet {
        let text: String = doc.get_all(self.field)
            .into_iter()
            .flat_map(|val| val.text())
            .collect::<Vec<&str>>()
            .join(" ");
        self.snippet(&text)
    }

    /// Generates a snippet for the given text.
    pub fn snippet(&self, text: &str) -> Snippet {
        let fragment_candidates = search_fragments(&*self.tokenizer,
                                                   &text,
                                                   &self.terms_text,
                                                   self.max_num_chars);
        select_best_fragment_combination(fragment_candidates, &text)
    }
}
#[cfg(test)]
mod tests {
    use super::{search_fragments, select_best_fragment_combination};
    use std::collections::BTreeMap;
    use std::iter::Iterator;
    use tokenizer::{box_tokenizer, SimpleTokenizer};
    use Index;
    use schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing};
    use SnippetGenerator;
    use query::QueryParser;

    const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by Mozilla which
describes it as a "safe, concurrent, practical language", supporting functional and
imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],
but its designers intend it to provide better memory safety while still maintaining
performance.

Rust is free and open-source software, released under an MIT License, or Apache License
2.0. Its designers have refined the language through the experiences of writing the Servo
web browser layout engine[14] and the Rust compiler. A large proportion of current commits
to the project are from community members.[15]

Rust won first place for "most loved programming language" in the Stack Overflow Developer
Survey in 2016, 2017, and 2018."#;
    #[test]
    fn test_snippet() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
        let mut terms = BTreeMap::new();
        terms.insert(String::from("rust"), 1.0);
        terms.insert(String::from("language"), 0.9);
        let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100);
        assert_eq!(fragments.len(), 7);
        {
            let first = fragments.iter().nth(0).unwrap();
            assert_eq!(first.score, 1.9);
            assert_eq!(first.stop_offset, 89);
        }
        let snippet = select_best_fragment_combination(fragments, &TEST_TEXT);
        assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a \"safe".to_owned());
        assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems programming <b>language</b> sponsored by Mozilla which\ndescribes it as a &quot;safe".to_owned())
    }
    #[test]
    fn test_snippet_in_second_fragment() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

        let text = "a b c d e f g";

        let mut terms = BTreeMap::new();
        terms.insert(String::from("c"), 1.0);

        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);

        assert_eq!(fragments.len(), 1);
        {
            let first = fragments.iter().nth(0).unwrap();
            assert_eq!(first.score, 1.0);
            assert_eq!(first.start_offset, 4);
            assert_eq!(first.stop_offset, 7);
        }

        let snippet = select_best_fragment_combination(fragments, &text);
        assert_eq!(snippet.fragments, "c d");
        assert_eq!(snippet.to_html(), "<b>c</b> d");
    }
    #[test]
    fn test_snippet_with_term_at_the_end_of_fragment() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

        let text = "a b c d e f f g";

        let mut terms = BTreeMap::new();
        terms.insert(String::from("f"), 1.0);

        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);

        assert_eq!(fragments.len(), 2);
        {
            let first = fragments.iter().nth(0).unwrap();
            assert_eq!(first.score, 1.0);
            assert_eq!(first.stop_offset, 11);
            assert_eq!(first.start_offset, 8);
        }

        let snippet = select_best_fragment_combination(fragments, &text);
        assert_eq!(snippet.fragments, "e f");
        assert_eq!(snippet.to_html(), "e <b>f</b>");
    }
    #[test]
    fn test_snippet_with_second_fragment_has_the_highest_score() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

        let text = "a b c d e f g";

        let mut terms = BTreeMap::new();
        terms.insert(String::from("f"), 1.0);
        terms.insert(String::from("a"), 0.9);

        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 7);

        assert_eq!(fragments.len(), 2);
        {
            let first = fragments.iter().nth(0).unwrap();
            assert_eq!(first.score, 0.9);
            assert_eq!(first.stop_offset, 7);
            assert_eq!(first.start_offset, 0);
        }

        let snippet = select_best_fragment_combination(fragments, &text);
        assert_eq!(snippet.fragments, "e f g");
        assert_eq!(snippet.to_html(), "e <b>f</b> g");
    }
    #[test]
    fn test_snippet_with_term_not_in_text() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

        let text = "a b c d";

        let mut terms = BTreeMap::new();
        terms.insert(String::from("z"), 1.0);

        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);

        assert_eq!(fragments.len(), 0);

        let snippet = select_best_fragment_combination(fragments, &text);
        assert_eq!(snippet.fragments, "");
        assert_eq!(snippet.to_html(), "");
    }
    #[test]
    fn test_snippet_with_no_terms() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

        let text = "a b c d";

        let terms = BTreeMap::new();
        let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
        assert_eq!(fragments.len(), 0);

        let snippet = select_best_fragment_combination(fragments, &text);
        assert_eq!(snippet.fragments, "");
        assert_eq!(snippet.to_html(), "");
    }
    #[test]
    fn test_snippet_generator() {
        let mut schema_builder = SchemaBuilder::default();
        let text_options = TextOptions::default()
            .set_indexing_options(TextFieldIndexing::default()
                .set_tokenizer("en_stem")
                .set_index_option(IndexRecordOption::Basic)
            );
        let text_field = schema_builder.add_text_field("text", text_options);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            // writing the segment
            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
            {
                let doc = doc!(text_field => TEST_TEXT);
                index_writer.add_document(doc);
            }
            index_writer.commit().unwrap();
        }
        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let query_parser = QueryParser::for_index(&index, vec![text_field]);
        let query = query_parser.parse_query("rust design").unwrap();
        let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field).unwrap();
        {
            let snippet = snippet_generator.snippet(TEST_TEXT);
            assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety");
        }
        {
            snippet_generator.set_max_num_chars(90);
            let snippet = snippet_generator.snippet(TEST_TEXT);
            assert_eq!(snippet.to_html(), "<b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to");
        }
    }
}
@@ -109,7 +109,7 @@ pub mod tests {
        let store = StoreReader::from_source(store_source);
        for i in 0..1_000 {
            assert_eq!(
-                *store.get(i).unwrap().get_first(field_title).unwrap().text(),
+                *store.get(i).unwrap().get_first(field_title).unwrap().text().unwrap(),
                format!("Doc {}", i)
            );
        }
@@ -152,6 +152,8 @@ pub use self::stemmer::Stemmer;
pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub use self::tokenizer::BoxedTokenizer;
pub(crate) use self::tokenizer::box_tokenizer;

pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
pub use self::tokenizer_manager::TokenizerManager;
@@ -1,6 +1,6 @@
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
-use tokenizer::tokenizer::box_tokenizer;
+use tokenizer::box_tokenizer;
use tokenizer::BoxedTokenizer;
use tokenizer::LowerCaser;
use tokenizer::RawTokenizer;