mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-02 16:40:43 +00:00
Fragments
This commit is contained in:
@@ -29,6 +29,9 @@ use num_cpus;
|
||||
use std::path::Path;
|
||||
use tokenizer::TokenizerManager;
|
||||
use IndexWriter;
|
||||
use schema::FieldType;
|
||||
use schema::Field;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
|
||||
fn load_metas(directory: &Directory) -> Result<IndexMeta> {
|
||||
let meta_data = directory.atomic_read(&META_FILEPATH)?;
|
||||
@@ -113,6 +116,22 @@ impl Index {
|
||||
&self.tokenizers
|
||||
}
|
||||
|
||||
/// Returns the tokenizer configured for `field`, if there is one.
///
/// The field's entry is looked up in the schema. For a `FieldType::Str` field
/// that has indexing options, the value returned by `tokenizer()` on those
/// options is resolved through the index's `TokenizerManager`.
///
/// Returns `None` when the field is not a `FieldType::Str`, when
/// `get_indexing_options()` yields `None`, or when the manager's `get`
/// yields `None` for the configured name.
pub fn tokenizer_for_field(&self, field: Field) -> Option<Box<BoxedTokenizer>> {
|
||||
let field_type = self.schema.get_field_entry(field).field_type();
|
||||
// NOTE: despite its name, `tokenizer` is the `TokenizerManager` (the registry
// returned by `self.tokenizers()`), not a single tokenizer.
let tokenizer: &TokenizerManager = self.tokenizers();
|
||||
match field_type {
|
||||
FieldType::Str(text_options) => {
|
||||
text_options.get_indexing_options()
|
||||
.map(|text_indexing_options| text_indexing_options.tokenizer())
|
||||
.and_then(|tokenizer_name| tokenizer.get(tokenizer_name))
|
||||
|
||||
},
|
||||
// Every other field type has no tokenizer to report.
_ => {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Opens a new directory from an index path.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
@@ -257,7 +276,7 @@ impl Index {
|
||||
let schema = self.schema();
|
||||
let num_searchers: usize = self.num_searchers.load(Ordering::Acquire);
|
||||
let searchers = (0..num_searchers)
|
||||
.map(|_| Searcher::new(schema.clone(), segment_readers.clone()))
|
||||
.map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone()))
|
||||
.collect();
|
||||
self.searcher_pool.publish_new_generation(searchers);
|
||||
Ok(())
|
||||
|
||||
@@ -10,6 +10,7 @@ use std::sync::Arc;
|
||||
use termdict::TermMerger;
|
||||
use DocAddress;
|
||||
use Result;
|
||||
use Index;
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
///
|
||||
@@ -18,17 +19,24 @@ use Result;
|
||||
///
|
||||
pub struct Searcher {
|
||||
/// Schema handed to `Searcher::new`.
schema: Schema,
|
||||
/// Index handed to `Searcher::new`; exposed read-only through `index()`.
index: Index,
|
||||
/// Segment readers handed to `Searcher::new`.
segment_readers: Vec<SegmentReader>,
|
||||
}
|
||||
|
||||
impl Searcher {
|
||||
/// Creates a new `Searcher`
|
||||
pub(crate) fn new(schema: Schema, segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
pub(crate) fn new(schema: Schema, index: Index, segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
Searcher {
|
||||
schema,
|
||||
index,
|
||||
segment_readers,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a reference to the `Index` stored in this searcher.
pub fn index(&self) -> &Index {
|
||||
&self.index
|
||||
}
|
||||
|
||||
/// Fetches a document from tantivy's store given a `DocAddress`.
|
||||
///
|
||||
/// The searcher uses the segment ordinal to route the
|
||||
|
||||
@@ -75,7 +75,7 @@ where
|
||||
inverted_index.terms().ord_to_term(term_ord, &mut term_buffer);
|
||||
let term = Term::from_field_bytes(self.field, &term_buffer[..]);
|
||||
for &doc_id in &docs_matching_current_term {
|
||||
matching_terms.add_term(doc_id, term.clone());
|
||||
matching_terms.add_term(doc_id, term.clone(), 1f32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ impl Weight for TermWeight {
|
||||
for doc_id in doc_ids {
|
||||
match scorer.skip_next(doc_id) {
|
||||
SkipResult::Reached => {
|
||||
matching_terms.add_term(doc_id, self.term.clone());
|
||||
matching_terms.add_term(doc_id, self.term.clone(), 1f32);
|
||||
}
|
||||
SkipResult::OverStep => {}
|
||||
SkipResult::End => {
|
||||
|
||||
@@ -5,9 +5,10 @@ use DocId;
|
||||
use std::collections::HashSet;
|
||||
use Term;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub struct MatchingTerms {
|
||||
doc_to_terms: BTreeMap<DocId, HashSet<Term>>
|
||||
doc_to_terms: BTreeMap<DocId, HashMap<Term, f32>>
|
||||
}
|
||||
|
||||
impl MatchingTerms {
|
||||
@@ -16,18 +17,22 @@ impl MatchingTerms {
|
||||
doc_to_terms: doc_ids
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|doc_id| (doc_id, HashSet::default()))
|
||||
.map(|doc_id| (doc_id, HashMap::default()))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the map of terms (each with its associated `f32` value) stored
/// for `doc_id`, or `None` if `doc_id` is not a key of `doc_to_terms`.
pub fn terms_for_doc(&self, doc_id: DocId) -> Option<&HashMap<Term, f32>> {
|
||||
self.doc_to_terms.get(&doc_id)
|
||||
}
|
||||
|
||||
/// Returns every doc id that is a key of `doc_to_terms`.
///
/// The result is sorted because `doc_to_terms` is a `BTreeMap`, whose `keys()`
/// iterator yields keys in their `Ord` order; no explicit sort is performed.
pub fn sorted_doc_ids(&self) -> Vec<DocId> {
|
||||
self.doc_to_terms.keys().cloned().collect()
|
||||
}
|
||||
|
||||
pub fn add_term(&mut self, doc_id: DocId, term: Term) {
|
||||
pub fn add_term(&mut self, doc_id: DocId, term: Term, score: f32) {
|
||||
if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) {
|
||||
terms.insert(term);
|
||||
terms.insert(term, score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,9 @@ use DocAddress;
|
||||
use DocId;
|
||||
use Searcher;
|
||||
use query::MatchingTerms;
|
||||
use schema::Field;
|
||||
use std::collections::HashMap;
|
||||
use SegmentLocalId;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct HighlightSection {
|
||||
@@ -189,28 +192,55 @@ fn select_best_fragment_combination<'a>(
|
||||
|
||||
|
||||
|
||||
fn matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result<()> {
|
||||
fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result<HashMap<SegmentLocalId, MatchingTerms>> {
|
||||
let weight = query.weight(searcher, false)?;
|
||||
let mut doc_groups = doc_addresses
|
||||
.iter()
|
||||
.group_by(|doc_address| doc_address.0);
|
||||
let mut matching_terms_per_segment: HashMap<SegmentLocalId, MatchingTerms> = HashMap::new();
|
||||
for (segment_ord, doc_addrs) in doc_groups.into_iter() {
|
||||
let doc_addrs_vec: Vec<DocId> = doc_addrs.map(|doc_addr| doc_addr.1).collect();
|
||||
let mut matching_terms = MatchingTerms::from_doc_ids(&doc_addrs_vec[..]);
|
||||
let segment_reader = searcher.segment_reader(segment_ord);
|
||||
weight.matching_terms(segment_reader, &mut matching_terms)?;
|
||||
matching_terms_per_segment.insert(segment_ord, matching_terms);
|
||||
}
|
||||
let terms = HashSet<(DocId, Vec<Term>)>;
|
||||
Ok(())
|
||||
Ok(matching_terms_per_segment)
|
||||
}
|
||||
|
||||
pub fn generate_snippet<'a>(
|
||||
doc: &'a [DocAddress],
|
||||
index: &Index,
|
||||
pub fn generate_snippet(
|
||||
doc_addresses: &[DocAddress],
|
||||
fields: &[Field],
|
||||
searcher: &Searcher,
|
||||
query: &Query,
|
||||
terms: Vec<Term>,
|
||||
max_num_chars: usize) -> Snippet {
|
||||
search_fragments(boxed_tokenizer, &text, terms, 3);
|
||||
max_num_chars: usize) -> Result<Vec<Snippet>> {
|
||||
// TODO sort doc_addresses
|
||||
let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?;
|
||||
for doc_address in doc_addresses {
|
||||
let doc = searcher.doc(doc_address)?;
|
||||
for &field in fields {
|
||||
let mut text = String::new();
|
||||
for value in doc.get_all(field) {
|
||||
text.push_str(value.text());
|
||||
}
|
||||
if let Some(tokenizer) = searcher.index().tokenizer_for_field(field) {
|
||||
if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&doc_address.segment_ord()) {
|
||||
if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) {
|
||||
let terms: BTreeMap<String, f32> = terms
|
||||
.iter()
|
||||
.map(|(term, score)| (term.text().to_string(), *score))
|
||||
.collect();
|
||||
search_fragments(tokenizer,
|
||||
&text,
|
||||
terms,
|
||||
max_num_chars);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// search_fragments(boxed_tokenizer, &text, terms, 3);
|
||||
panic!("e");
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -346,7 +376,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl
|
||||
|
||||
let text = "a b c d";
|
||||
|
||||
let mut terms = BTreeMap::new();
|
||||
let terms = BTreeMap::new();
|
||||
let fragments = search_fragments(boxed_tokenizer, &text, terms, 3);
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user