Added SnippetGenerator
@@ -62,7 +62,6 @@ opt-level = 3
 debug = false
 lto = true
 debug-assertions = false
-overflow-checks = false
 
 [profile.test]
 debug-assertions = true

@@ -35,7 +35,6 @@ fn main() -> tantivy::Result<()> {
     let title = schema.get_field("title").unwrap();
     let body = schema.get_field("body").unwrap();
 
-    let mut old_man_doc = Document::default();
     // we'll only need one doc for this example.
     index_writer.add_document(doc!(
         title => "Of Mice and Men",

@@ -60,12 +59,14 @@ fn main() -> tantivy::Result<()> {
     let mut top_collector = TopCollector::with_limit(10);
     searcher.search(&*query, &mut top_collector)?;
 
-    let snippet_generator =
+    let snippet_generator = SnippetGenerator::new(&*searcher, &*query, body)?;
 
     let doc_addresses = top_collector.docs();
     for doc_address in doc_addresses {
-        let retrieved_doc = searcher.doc(&doc_address)?;
-        // generate_snippet(&retrieved_doc, query
+        let doc = searcher.doc(&doc_address)?;
+        let snippet = snippet_generator.snippet_from_doc(&doc);
+        println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
+        println!("snippet: {}", snippet.to_html());
     }
 
     Ok(())

@@ -68,17 +68,6 @@ pub trait HasLen {
     }
 }
 
-
-pub fn is_stricly_sorted<T: Ord>(els: &[T]) -> bool {
-    if els.is_empty() {
-        true
-    } else {
-        els.iter()
-            .zip(els[1..].iter())
-            .all(|(left, right)| left < right)
-    }
-}
-
 const HIGHEST_BIT: u64 = 1 << 63;
 
 /// Maps a `i64` to `u64`

@@ -116,20 +105,12 @@ pub fn u64_to_i64(val: u64) -> i64 {
 pub(crate) mod test {
 
     pub use super::serialize::test::fixed_size_test;
-    use super::{compute_num_bits, i64_to_u64, u64_to_i64, is_stricly_sorted};
+    use super::{compute_num_bits, i64_to_u64, u64_to_i64};
 
     fn test_i64_converter_helper(val: i64) {
         assert_eq!(u64_to_i64(i64_to_u64(val)), val);
     }
 
-
-    #[test]
-    fn test_is_strictly_sorted() {
-        assert!(is_stricly_sorted::<u32>(&[]));
-        assert!(is_stricly_sorted(&[1]));
-        assert!(is_stricly_sorted(&[1, 2, 3]));
-        assert!(!is_stricly_sorted(&[1, 3, 2]));
-    }
     #[test]
     fn test_i64_converter() {
         assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());

@@ -115,6 +115,8 @@ impl Index {
         &self.tokenizers
     }
 
+
+    /// Helper to access the tokenizer associated to a specific field.
     pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> {
         let field_entry = self.schema.get_field_entry(field);
         let field_type = field_entry.field_type();

@@ -325,3 +327,26 @@ impl Clone for Index {
         }
     }
 }
+
+
+#[cfg(test)]
+mod tests {
+    use Index;
+    use schema::{SchemaBuilder, TEXT, INT_INDEXED};
+
+    #[test]
+    fn test_indexer_for_field() {
+        let mut schema_builder = SchemaBuilder::default();
+        let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED);
+        let body_field = schema_builder.add_text_field("body", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        assert!(index.tokenizer_for_field(body_field).is_ok());
+        assert_eq!(
+            format!("{:?}", index.tokenizer_for_field(num_likes_field).err()),
+            "Some(SchemaError(\"\\\"num_likes\\\" is not a text field.\"))"
+        );
+    }
+
+
+}

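The new `tokenizer_for_field` helper returns a `Result` instead of panicking when the field is not a text field. A minimal caller-side sketch (illustrative only, not code from this commit), using the same APIs as the test above:

extern crate tantivy;

use tantivy::Index;
use tantivy::schema::{SchemaBuilder, TEXT};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    // For a text field this yields the field's tokenizer; for a non-text
    // field it yields Err(SchemaError(..)), which `?` propagates.
    let _tokenizer = index.tokenizer_for_field(body)?;
    Ok(())
}
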
@@ -33,6 +33,7 @@ impl Searcher {
         }
     }
 
+    /// Returns the `Index` associated to the `Searcher`
    pub fn index(&self) -> &Index {
        &self.index
    }

@@ -770,23 +770,23 @@ mod tests {
         }
         {
             let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "af b");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
         }
         {
             let doc = searcher.doc(&DocAddress(0, 1)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c"));
         }
         {
             let doc = searcher.doc(&DocAddress(0, 2)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c d");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d"));
         }
         {
             let doc = searcher.doc(&DocAddress(0, 3)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "af b");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
         }
         {
             let doc = searcher.doc(&DocAddress(0, 4)).unwrap();
-            assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c g");
+            assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g"));
         }
         {
             let get_fast_vals = |terms: Vec<Term>| {

@@ -899,11 +899,11 @@ mod tests {
         assert_eq!(document.len(), 3);
         let values = document.get_all(text_field);
         assert_eq!(values.len(), 2);
-        assert_eq!(values[0].text(), "tantivy");
-        assert_eq!(values[1].text(), "some other value");
+        assert_eq!(values[0].text(), Some("tantivy"));
+        assert_eq!(values[1].text(), Some("some other value"));
         let values = document.get_all(other_text_field);
         assert_eq!(values.len(), 1);
-        assert_eq!(values[0].text(), "short");
+        assert_eq!(values[0].text(), Some("short"));
     }
 
     #[test]

@@ -60,6 +60,8 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug {
         Ok(result)
     }
 
+    /// Extract all of the terms associated to the query and insert them in the
+    /// term set given in arguments.
    fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {}
 
    /// Search works as follows :

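The new default `query_terms` method lets callers collect the `Term`s a query is built from; query types that do not override it contribute nothing to the set. A hedged sketch of the call pattern (the helper function below is hypothetical, not part of the commit), presumably the same hook `SnippetGenerator::new` uses to learn which terms to highlight:

use std::collections::BTreeSet;

use tantivy::Term;
use tantivy::query::Query;

// Gather the terms underlying an arbitrary query into an ordered set.
fn collect_terms(query: &Query) -> BTreeSet<Term> {
    let mut terms = BTreeSet::new();
    query.query_terms(&mut terms);
    terms
}
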
@@ -443,8 +443,8 @@ mod tests {
         }"#,
             )
             .unwrap();
-        assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
-        assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
+        assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title"));
+        assert_eq!(doc.get_first(author_field).unwrap().text(), Some("fulmicoton"));
         assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
         assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
     }

@@ -74,10 +74,10 @@ impl Value {
     ///
     /// # Panics
     /// If the value is not of type `Str`
-    pub fn text(&self) -> &str {
+    pub fn text(&self) -> Option<&str> {
         match *self {
-            Value::Str(ref text) => text,
-            _ => panic!("This is not a text field."),
+            Value::Str(ref text) => Some(text),
+            _ => None,
         }
     }
 

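`Value::text` now returns `Option<&str>` instead of panicking on non-text values, which is why the tests and the example elsewhere in this diff switch to `Some(..)` comparisons and `.unwrap()` calls. A small illustrative sketch of the caller-side pattern (not code from this commit; the `Value::U64` variant is assumed from the surrounding u64 field support):

extern crate tantivy;

use tantivy::schema::Value;

// With the new signature, a non-`Str` value yields `None` rather than a panic.
fn value_as_text(value: &Value) -> &str {
    value.text().unwrap_or("<not a text value>")
}

fn main() {
    let text_value = Value::Str("tantivy".to_string());
    let num_value = Value::U64(4);
    assert_eq!(value_as_text(&text_value), "tantivy");
    assert_eq!(value_as_text(&num_value), "<not a text value>");
}
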
@@ -7,6 +7,9 @@ use Searcher;
 use schema::Field;
 use std::collections::BTreeSet;
 use tokenizer::BoxedTokenizer;
+use Document;
 
+const DEFAULT_MAX_NUM_CHARS: usize = 150;
+
 #[derive(Debug)]
 pub struct HighlightSection {

@@ -189,16 +192,58 @@ fn select_best_fragment_combination<'a>(
     }
 }
 
-const DEFAULT_MAX_NUM_CHARS: usize = 150;
-
+/// `SnippetGenerator`
+///
+/// # Example
+///
+/// ```rust
+/// # #[macro_use]
+/// # extern crate tantivy;
+/// # use tantivy::Index;
+/// # use tantivy::schema::{SchemaBuilder, TEXT};
+/// # use tantivy::query::QueryParser;
+/// use tantivy::SnippetGenerator;
+///
+/// # fn main() -> tantivy::Result<()> {
+/// # let mut schema_builder = SchemaBuilder::default();
+/// # let text_field = schema_builder.add_text_field("text", TEXT);
+/// # let schema = schema_builder.build();
+/// # let index = Index::create_in_ram(schema);
+/// # let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
+/// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
+/// # Je ne me sentis plus guidé par les haleurs :
+/// # Des Peaux-Rouges criards les avaient pris pour cibles,
+/// # Les ayant cloués nus aux poteaux de couleurs.
+/// #
+/// # J'étais insoucieux de tous les équipages,
+/// # Porteur de blés flamands ou de cotons anglais.
+/// # Quand avec mes haleurs ont fini ces tapages,
+/// # Les Fleuves m'ont laissé descendre où je voulais.
+/// # "#);
+/// # index_writer.add_document(doc.clone());
+/// # index_writer.commit()?;
+/// # let query_parser = QueryParser::for_index(&index, vec![text_field]);
+/// // ...
+/// let query = query_parser.parse_query("haleurs flamands").unwrap();
+/// # index.load_searchers()?;
+/// # let searcher = index.searcher();
+/// let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field)?;
+/// snippet_generator.set_max_num_chars(100);
+/// let snippet = snippet_generator.snippet_from_doc(&doc);
+/// let snippet_html: String = snippet.to_html();
+/// assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des");
+/// # Ok(())
+/// # }
+/// ```
 pub struct SnippetGenerator {
     terms_text: BTreeMap<String, f32>,
     tokenizer: Box<BoxedTokenizer>,
+    field: Field,
     max_num_chars: usize
 }
 
 impl SnippetGenerator {
+    /// Creates a new snippet generator
     pub fn new(searcher: &Searcher,
                query: &Query,
                field: Field) -> Result<SnippetGenerator> {

@@ -212,14 +257,30 @@ impl SnippetGenerator {
         Ok(SnippetGenerator {
             terms_text,
             tokenizer,
+            field,
             max_num_chars: DEFAULT_MAX_NUM_CHARS
         })
     }
 
+    /// Sets a maximum number of chars.
     pub fn set_max_num_chars(&mut self, max_num_chars: usize) {
         self.max_num_chars = max_num_chars;
     }
 
+    /// Generates a snippet for the given `Document`.
+    ///
+    /// This method extract the text associated to the `SnippetGenerator`'s field
+    /// and computes a snippet.
+    pub fn snippet_from_doc(&self, doc: &Document) -> Snippet {
+        let text: String = doc.get_all(self.field)
+            .into_iter()
+            .flat_map(|val| val.text())
+            .collect::<Vec<&str>>()
+            .join(" ");
+        self.snippet(&text)
+    }
+
+    /// Generates a snippet for the given text.
     pub fn snippet(&self, text: &str) -> Snippet {
         let fragment_candidates = search_fragments(&*self.tokenizer,
                                                    &text,

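`snippet_from_doc` joins the stored values of the generator's target field and then delegates to `snippet`, which operates on any raw text. A short hedged sketch contrasting the two entry points (the helper functions are illustrative, not part of the commit):

extern crate tantivy;

use tantivy::{Document, SnippetGenerator};

// Highlight a stored document: the stored values of the generator's field
// are concatenated before fragments are searched.
fn highlight_doc(generator: &SnippetGenerator, doc: &Document) -> String {
    generator.snippet_from_doc(doc).to_html()
}

// Highlight arbitrary text that never went through the index.
fn highlight_text(generator: &SnippetGenerator, text: &str) -> String {
    generator.snippet(text).to_html()
}
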
@@ -109,7 +109,7 @@ pub mod tests {
         let store = StoreReader::from_source(store_source);
         for i in 0..1_000 {
             assert_eq!(
-                *store.get(i).unwrap().get_first(field_title).unwrap().text(),
+                *store.get(i).unwrap().get_first(field_title).unwrap().text().unwrap(),
                 format!("Doc {}", i)
             );
         }

@@ -153,7 +153,9 @@ pub use self::simple_tokenizer::SimpleTokenizer;
 pub use self::stemmer::Stemmer;
 pub use self::stop_word_filter::StopWordFilter;
 pub(crate) use self::token_stream_chain::TokenStreamChain;
-pub use self::tokenizer::{BoxedTokenizer, box_tokenizer};
+pub use self::tokenizer::BoxedTokenizer;
+pub(crate) use self::tokenizer::box_tokenizer;
+
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
 

@@ -130,7 +130,7 @@ where
     }
 }
 
-pub fn box_tokenizer<A>(a: A) -> Box<BoxedTokenizer>
+pub(crate) fn box_tokenizer<A>(a: A) -> Box<BoxedTokenizer>
 where
     A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
 {

@@ -1,6 +1,6 @@
 use std::collections::HashMap;
 use std::sync::{Arc, RwLock};
-use tokenizer::tokenizer::box_tokenizer;
+use tokenizer::box_tokenizer;
 use tokenizer::BoxedTokenizer;
 use tokenizer::JapaneseTokenizer;
 use tokenizer::LowerCaser;