mirror of https://github.com/quickwit-oss/tantivy.git
Extracting terms matching the query in the document
examples/snippet.rs (new file, 75 lines)
@@ -0,0 +1,75 @@
// # Snippet example
//
// This example shows how to return a representative snippet of
// your hit result.
// A snippet is an extract of a target document, returned in HTML format:
// the keywords searched by the user are highlighted with a `<b>` tag.
#[macro_use]
extern crate tantivy;
extern crate tempdir;

// ---
// Importing tantivy...
use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::Index;
use tempdir::TempDir;

fn main() -> tantivy::Result<()> {
    // Let's create a temporary directory for the
    // sake of this example.
    let index_path = TempDir::new("tantivy_example_dir")?;

    // # Defining the schema
    //
    // Both fields must be declared, since `get_field("title")` below would
    // panic otherwise; they are marked `STORED` so that their text can be
    // retrieved from the doc store when building the snippet.
    let mut schema_builder = SchemaBuilder::default();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT | STORED);
    let schema = schema_builder.build();

    // # Indexing documents
    let index = Index::create_in_dir(&index_path, schema.clone())?;

    let mut index_writer = index.writer(50_000_000)?;

    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    // We'll only need one doc for this example.
    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
    ));
    // ...
    index_writer.commit()?;

    index.load_searchers()?;

    let searcher = index.searcher();
    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    let query = query_parser.parse_query("sycamore spring")?;

    let mut top_collector = TopCollector::with_limit(10);

    searcher.search(&*query, &mut top_collector)?;

    let doc_addresses = top_collector.docs();

    for doc_address in doc_addresses {
        let retrieved_doc = searcher.doc(&doc_address)?;
        // `generate_snippet` is not defined anywhere in this commit (the
        // branch is a work in progress); the call is closed syntactically
        // here, but the example does not compile yet.
        generate_snippet(&retrieved_doc, &query);
    }

    Ok(())
}
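The example ends by calling `generate_snippet`, a helper this commit never defines, so the file does not compile as committed. Purely to illustrate the behavior the comments promise (an extract of the document with the query's keywords wrapped in `<b>` tags), here is a self-contained sketch; the function name and the whitespace tokenization are assumptions for illustration, not tantivy's API:

// Hypothetical illustration only, not tantivy's API: wrap each
// whitespace-separated word that matches a keyword in a `<b>` tag.
fn highlight_fragment(fragment: &str, keywords: &[&str]) -> String {
    fragment
        .split_whitespace()
        .map(|word| {
            // Compare on a lowercased, punctuation-free copy, but keep the
            // original word (punctuation included) in the output.
            let normalized: String = word
                .chars()
                .filter(|c| c.is_alphanumeric())
                .flat_map(|c| c.to_lowercase())
                .collect();
            if keywords.iter().any(|k| k.to_lowercase() == normalized) {
                format!("<b>{}</b>", word)
            } else {
                word.to_string()
            }
        })
        .collect::<Vec<String>>()
        .join(" ")
}

fn main() {
    let fragment = "fresh and green with every spring, carrying in their lower leaf junctures";
    println!("{}", highlight_fragment(fragment, &["spring", "sycamore"]));
    // prints: fresh and green with every <b>spring,</b> carrying in their lower leaf junctures
}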
@@ -68,6 +68,17 @@ pub trait HasLen {
    }
}

/// Returns `true` if and only if the slice is strictly increasing,
/// i.e. every element is smaller than its successor.
pub fn is_strictly_sorted<T: Ord>(els: &[T]) -> bool {
    if els.is_empty() {
        true
    } else {
        els.iter()
            .zip(els[1..].iter())
            .all(|(left, right)| left < right)
    }
}

const HIGHEST_BIT: u64 = 1 << 63;

/// Maps a `i64` to `u64`
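A side note on `is_strictly_sorted` above: it compares each element to its successor by zipping the slice with its own tail. An equivalent formulation, shown only as a sketch and not what the commit uses, relies on `slice::windows`:

// Equivalent alternative (sketch): `windows(2)` yields every adjacent pair;
// an empty or single-element slice yields no windows, so `all` returns true.
pub fn is_strictly_sorted<T: Ord>(els: &[T]) -> bool {
    els.windows(2).all(|w| w[0] < w[1])
}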
@@ -105,12 +116,20 @@ pub fn u64_to_i64(val: u64) -> i64 {
pub(crate) mod test {

    pub use super::serialize::test::fixed_size_test;
    use super::{compute_num_bits, i64_to_u64, is_strictly_sorted, u64_to_i64};

    fn test_i64_converter_helper(val: i64) {
        assert_eq!(u64_to_i64(i64_to_u64(val)), val);
    }

    #[test]
    fn test_is_strictly_sorted() {
        assert!(is_strictly_sorted::<u32>(&[]));
        assert!(is_strictly_sorted(&[1]));
        assert!(is_strictly_sorted(&[1, 2, 3]));
        assert!(!is_strictly_sorted(&[1, 3, 2]));
    }

    #[test]
    fn test_i64_converter() {
        assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
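One detail in the tests above: `is_strictly_sorted::<u32>(&[])` needs the turbofish because an empty slice literal gives the compiler nothing to infer `T` from, while the other calls infer `T` from their integer literals. A minimal sketch of the same inference situation, using a hypothetical helper:

// `smallest` is a hypothetical helper, only here to show the inference rule.
fn smallest<T: Ord>(els: &[T]) -> Option<&T> {
    els.iter().min()
}

fn main() {
    // `&[]` alone does not pin down `T`; a turbofish (or annotation) is needed.
    assert_eq!(smallest::<u32>(&[]), None);
    // With elements present, `T` is inferred from the literals.
    assert_eq!(smallest(&[3, 1, 2]), Some(&1));
}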
@@ -7,6 +7,11 @@ use query::{Scorer, Weight};
use schema::{Field, IndexRecordOption};
use termdict::{TermDictionary, TermStreamer};
use Result;
use query::weight::MatchingTerms;
use SkipResult;
use Term;
use DocId;
use DocSet;

/// A weight struct for Fuzzy Term and Regex Queries
pub struct AutomatonWeight<A>
@@ -36,6 +41,48 @@ impl<A> Weight for AutomatonWeight<A>
where
    A: Automaton,
{
    /// For every doc id registered in `matching_terms`, records each term
    /// accepted by the automaton that actually occurs in that document.
    fn matching_terms(&self,
                      reader: &SegmentReader,
                      matching_terms: &mut MatchingTerms) -> Result<()> {
        let inverted_index = reader.inverted_index(self.field);
        let term_dict = inverted_index.terms();
        let mut term_stream = self.automaton_stream(term_dict);

        let doc_ids = matching_terms.sorted_doc_ids();
        let mut docs_matching_current_term: Vec<DocId> = vec![];

        let mut term_buffer: Vec<u8> = vec![];

        // Stream over all the terms accepted by the automaton.
        while term_stream.advance() {
            docs_matching_current_term.clear();
            let term_info = term_stream.value();
            let mut segment_postings =
                inverted_index.read_postings_from_terminfo(term_info, IndexRecordOption::Basic);
            // `doc_ids` is sorted, so the postings can be walked once,
            // skipping forward to each candidate doc.
            for &doc_id in &doc_ids {
                match segment_postings.skip_next(doc_id) {
                    SkipResult::Reached => {
                        docs_matching_current_term.push(doc_id);
                    }
                    SkipResult::OverStep => {}
                    SkipResult::End => {
                        // The postings are exhausted: no later doc id can
                        // be reached either.
                        break;
                    }
                }
            }
            // Resolve the term bytes only if at least one doc matched.
            if !docs_matching_current_term.is_empty() {
                term_buffer.clear();
                let term_ord = term_stream.term_ord();
                inverted_index.terms().ord_to_term(term_ord, &mut term_buffer);
                let term = Term::from_field_bytes(self.field, &term_buffer[..]);
                for &doc_id in &docs_matching_current_term {
                    matching_terms.add_term(doc_id, term.clone());
                }
            }
        }
        Ok(())
    }

    fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
        let max_doc = reader.max_doc();
        let mut doc_bitset = BitSet::with_max_value(max_doc);
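Both `matching_terms` implementations in this commit intersect a sorted list of candidate doc ids with a postings list through `skip_next`. The following self-contained model uses a plain sorted slice and illustrative names, not tantivy's types, to show what the three `SkipResult` variants mean:

// Toy model of the skip_next contract, for illustration only.
#[derive(Debug, PartialEq)]
enum SkipResult {
    Reached,  // landed exactly on the target
    OverStep, // went past the target (target absent)
    End,      // the doc set is exhausted
}

struct ToyPostings<'a> {
    docs: &'a [u32], // sorted doc ids
    pos: usize,
}

impl<'a> ToyPostings<'a> {
    // Advance to the first doc id >= target.
    fn skip_next(&mut self, target: u32) -> SkipResult {
        while self.pos < self.docs.len() && self.docs[self.pos] < target {
            self.pos += 1;
        }
        if self.pos == self.docs.len() {
            SkipResult::End
        } else if self.docs[self.pos] == target {
            SkipResult::Reached
        } else {
            SkipResult::OverStep
        }
    }
}

fn main() {
    let mut postings = ToyPostings { docs: &[2, 5, 9], pos: 0 };
    assert_eq!(postings.skip_next(5), SkipResult::Reached);  // doc 5 is present
    assert_eq!(postings.skip_next(6), SkipResult::OverStep); // lands on 9
    assert_eq!(postings.skip_next(10), SkipResult::End);     // exhausted
}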
@@ -8,6 +8,8 @@ use query::Weight;
use schema::IndexRecordOption;
use Result;
use Term;
use SkipResult;
use query::weight::MatchingTerms;

pub struct TermWeight {
    term: Term,
@@ -38,6 +40,26 @@ impl Weight for TermWeight {
        }
    }

    /// Records this weight's term for every registered doc id whose
    /// postings actually contain it.
    fn matching_terms(&self,
                      reader: &SegmentReader,
                      matching_terms: &mut MatchingTerms) -> Result<()> {
        let doc_ids = matching_terms.sorted_doc_ids();
        let mut scorer = self.scorer(reader)?;
        for doc_id in doc_ids {
            match scorer.skip_next(doc_id) {
                SkipResult::Reached => {
                    matching_terms.add_term(doc_id, self.term.clone());
                }
                SkipResult::OverStep => {}
                SkipResult::End => {
                    // `doc_ids` is sorted: once the postings are exhausted,
                    // no later doc id can match.
                    break;
                }
            }
        }
        Ok(())
    }

    fn count(&self, reader: &SegmentReader) -> Result<u32> {
        if reader.num_deleted_docs() == 0 {
            let field = self.term.field();
@@ -1,6 +1,37 @@
use super::Scorer;
use core::SegmentReader;
use Result;
use DocId;
use std::collections::HashSet;
use Term;
use std::collections::BTreeMap;

/// For each document of interest, collects the set of query terms that the
/// document actually contains.
pub struct MatchingTerms {
    // A `BTreeMap` keeps the doc ids sorted, so `sorted_doc_ids` is a plain
    // ordered copy of the keys.
    doc_to_terms: BTreeMap<DocId, HashSet<Term>>,
}

impl MatchingTerms {
    pub fn from_doc_ids(doc_ids: &[DocId]) -> MatchingTerms {
        MatchingTerms {
            doc_to_terms: doc_ids
                .iter()
                .cloned()
                .map(|doc_id| (doc_id, HashSet::default()))
                .collect(),
        }
    }

    pub fn sorted_doc_ids(&self) -> Vec<DocId> {
        self.doc_to_terms.keys().cloned().collect()
    }

    /// Ignores doc ids that were not registered via `from_doc_ids`.
    pub fn add_term(&mut self, doc_id: DocId, term: Term) {
        if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) {
            terms.insert(term);
        }
    }
}

/// A Weight is the specialization of a Query
/// for a given set of segments.
@@ -11,6 +42,10 @@ pub trait Weight {
    /// See [`Query`](./trait.Query.html).
    fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;

    /// Records, for each doc id already registered in `matching_terms`, the
    /// terms of this weight that the document contains. The default
    /// implementation records nothing.
    fn matching_terms(&self, _reader: &SegmentReader, _matching_terms: &mut MatchingTerms) -> Result<()> {
        Ok(())
    }

    /// Returns the number of documents within the given `SegmentReader`.
    fn count(&self, reader: &SegmentReader) -> Result<u32> {
        Ok(self.scorer(reader)?.count())
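A short, self-contained demonstration of the design choice behind `MatchingTerms`: backing it with a `BTreeMap` rather than a `HashMap` means the keys iterate in ascending order, so `sorted_doc_ids` comes out sorted with no extra sort, which is what the `skip_next` loops above require. Plain std types stand in here; in tantivy, `DocId` is a `u32`:

use std::collections::{BTreeMap, HashSet};

fn main() {
    // Insert doc ids out of order, as a collector might produce them.
    let mut doc_to_terms: BTreeMap<u32, HashSet<String>> = BTreeMap::new();
    for &doc_id in &[9u32, 2, 5] {
        doc_to_terms.insert(doc_id, HashSet::new());
    }
    // BTreeMap keys iterate in ascending order: no explicit sort needed.
    let sorted_doc_ids: Vec<u32> = doc_to_terms.keys().cloned().collect();
    assert_eq!(sorted_doc_ids, vec![2, 5, 9]);
}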
@@ -1,10 +1,8 @@
use htmlescape::encode_minimal;
use schema::FieldValue;
use schema::Value;
use std::collections::BTreeMap;
use tokenizer::BoxedTokenizer;
use Document;
use tokenizer::{Token, TokenStream};
use Index;
use Term;