Fragments

This commit is contained in:
Paul Masurel
2018-09-07 09:57:12 +09:00
parent f570fe37d4
commit 9101bf5753
6 changed files with 80 additions and 18 deletions

View File

@@ -29,6 +29,9 @@ use num_cpus;
use std::path::Path;
use tokenizer::TokenizerManager;
use IndexWriter;
use schema::FieldType;
use schema::Field;
use tokenizer::BoxedTokenizer;
fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
@@ -113,6 +116,22 @@ impl Index {
&self.tokenizers
}
/// Looks up the tokenizer used to index `field`.
///
/// Returns `None` when the field is not a `FieldType::Str` field, when the text
/// field has no indexing options, or when the tokenizer manager has no
/// tokenizer registered under the configured name.
pub fn tokenizer_for_field(&self, field: Field) -> Option<Box<BoxedTokenizer>> {
    let field_entry = self.schema.get_field_entry(field);
    let tokenizer_manager: &TokenizerManager = self.tokenizers();
    if let FieldType::Str(text_options) = field_entry.field_type() {
        // A text field without indexing options has no tokenizer to report.
        let indexing_options = text_options.get_indexing_options()?;
        tokenizer_manager.get(indexing_options.tokenizer())
    } else {
        None
    }
}
/// Opens a new directory from an index path.
#[cfg(feature = "mmap")]
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
@@ -257,7 +276,7 @@ impl Index {
let schema = self.schema();
let num_searchers: usize = self.num_searchers.load(Ordering::Acquire);
let searchers = (0..num_searchers)
.map(|_| Searcher::new(schema.clone(), segment_readers.clone()))
.map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone()))
.collect();
self.searcher_pool.publish_new_generation(searchers);
Ok(())

View File

@@ -10,6 +10,7 @@ use std::sync::Arc;
use termdict::TermMerger;
use DocAddress;
use Result;
use Index;
/// Holds a list of `SegmentReader`s ready for search.
///
@@ -18,17 +19,24 @@ use Result;
///
pub struct Searcher {
// Schema passed to `Searcher::new`.
schema: Schema,
// The `Index` passed to `Searcher::new`; exposed through `Searcher::index()`.
index: Index,
// One reader per segment this searcher can see.
segment_readers: Vec<SegmentReader>,
}
impl Searcher {
/// Creates a new `Searcher`.
///
/// The arguments are stored as-is in the fields of the same name; the
/// constructor performs no other work.
pub(crate) fn new(schema: Schema, segment_readers: Vec<SegmentReader>) -> Searcher {
pub(crate) fn new(schema: Schema, index: Index, segment_readers: Vec<SegmentReader>) -> Searcher {
Searcher {
schema,
index,
segment_readers,
}
}
/// Returns a reference to the `Index` this searcher was created with.
pub fn index(&self) -> &Index {
&self.index
}
/// Fetches a document from tantivy's store given a `DocAddress`.
///
/// The searcher uses the segment ordinal to route the

View File

@@ -75,7 +75,7 @@ where
inverted_index.terms().ord_to_term(term_ord, &mut term_buffer);
let term = Term::from_field_bytes(self.field, &term_buffer[..]);
for &doc_id in &docs_matching_current_term {
matching_terms.add_term(doc_id, term.clone());
matching_terms.add_term(doc_id, term.clone(), 1f32);
}
}
}

View File

@@ -49,7 +49,7 @@ impl Weight for TermWeight {
for doc_id in doc_ids {
match scorer.skip_next(doc_id) {
SkipResult::Reached => {
matching_terms.add_term(doc_id, self.term.clone());
matching_terms.add_term(doc_id, self.term.clone(), 1f32);
}
SkipResult::OverStep => {}
SkipResult::End => {

View File

@@ -5,9 +5,10 @@ use DocId;
use std::collections::HashSet;
use Term;
use std::collections::BTreeMap;
use std::collections::HashMap;
/// Per-document collection of matching terms, keyed by `DocId`.
pub struct MatchingTerms {
// Ordered map from doc id to the terms recorded for that document
// (with an `f32` score per term in the `HashMap` form).
doc_to_terms: BTreeMap<DocId, HashSet<Term>>
doc_to_terms: BTreeMap<DocId, HashMap<Term, f32>>
}
impl MatchingTerms {
@@ -16,18 +17,22 @@ impl MatchingTerms {
doc_to_terms: doc_ids
.iter()
.cloned()
.map(|doc_id| (doc_id, HashSet::default()))
.map(|doc_id| (doc_id, HashMap::default()))
.collect()
}
}
/// Returns the terms recorded for `doc_id`, each with its score.
///
/// Returns `None` if `doc_id` is not a key of the underlying map.
pub fn terms_for_doc(&self, doc_id: DocId) -> Option<&HashMap<Term, f32>> {
self.doc_to_terms.get(&doc_id)
}
/// Returns every doc id held by this `MatchingTerms` in ascending order.
///
/// The order comes from iterating the keys of the underlying `BTreeMap`.
pub fn sorted_doc_ids(&self) -> Vec<DocId> {
self.doc_to_terms.keys().cloned().collect()
}
/// Records `term` (with `score`) as matching `doc_id`.
///
/// Does nothing when `doc_id` has no entry in the map. Inserting a term
/// that is already present replaces its previous score.
pub fn add_term(&mut self, doc_id: DocId, term: Term) {
pub fn add_term(&mut self, doc_id: DocId, term: Term, score: f32) {
// Only documents that already have an entry are updated.
if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) {
terms.insert(term);
terms.insert(term, score);
}
}
}

View File

@@ -12,6 +12,9 @@ use DocAddress;
use DocId;
use Searcher;
use query::MatchingTerms;
use schema::Field;
use std::collections::HashMap;
use SegmentLocalId;
#[derive(Debug)]
pub struct HighlightSection {
@@ -189,28 +192,55 @@ fn select_best_fragment_combination<'a>(
/// Builds one `MatchingTerms` per segment for the documents in `doc_addresses`.
///
/// The addresses are grouped by their segment ordinal (`doc_address.0`); for each
/// group a `MatchingTerms` is created from the group's doc ids (`doc_address.1`)
/// and filled in by `weight.matching_terms` against that segment's reader.
///
/// Errors from `query.weight` or `weight.matching_terms` are propagated with `?`.
fn matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result<()> {
fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result<HashMap<SegmentLocalId, MatchingTerms>> {
let weight = query.weight(searcher, false)?;
// NOTE(review): presumably itertools' `group_by`, which only groups *consecutive*
// items — if so, unsorted input would yield several groups for one segment and the
// later `insert` below would overwrite the earlier one. TODO confirm.
let mut doc_groups = doc_addresses
.iter()
.group_by(|doc_address| doc_address.0);
let mut matching_terms_per_segment: HashMap<SegmentLocalId, MatchingTerms> = HashMap::new();
for (segment_ord, doc_addrs) in doc_groups.into_iter() {
// Segment-local doc ids of this group.
let doc_addrs_vec: Vec<DocId> = doc_addrs.map(|doc_addr| doc_addr.1).collect();
let mut matching_terms = MatchingTerms::from_doc_ids(&doc_addrs_vec[..]);
let segment_reader = searcher.segment_reader(segment_ord);
weight.matching_terms(segment_reader, &mut matching_terms)?;
matching_terms_per_segment.insert(segment_ord, matching_terms);
}
let terms = HashSet<(DocId, Vec<Term>)>;
Ok(())
Ok(matching_terms_per_segment)
}
/// Work-in-progress entry point for snippet generation.
///
/// For every document address and every requested field, this concatenates all
/// values of the field into one string, looks up the field's tokenizer and the
/// terms matched in that document, and calls `search_fragments` with them.
///
/// NOTE(review): the value returned by `search_fragments` is discarded and the
/// function ends in `panic!("e")`, so no snippet is returned yet.
pub fn generate_snippet<'a>(
doc: &'a [DocAddress],
index: &Index,
pub fn generate_snippet(
doc_addresses: &[DocAddress],
fields: &[Field],
searcher: &Searcher,
query: &Query,
terms: Vec<Term>,
max_num_chars: usize) -> Snippet {
search_fragments(boxed_tokenizer, &text, terms, 3);
max_num_chars: usize) -> Result<Vec<Snippet>> {
// TODO sort doc_addresses
let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?;
for doc_address in doc_addresses {
let doc = searcher.doc(doc_address)?;
for &field in fields {
// All values of the field are appended back to back, with no separator.
let mut text = String::new();
for value in doc.get_all(field) {
text.push_str(value.text());
}
// Fields without a tokenizer, or documents without matching terms, are skipped.
if let Some(tokenizer) = searcher.index().tokenizer_for_field(field) {
if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&doc_address.segment_ord()) {
if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) {
// Re-key the scores by term text, as expected by `search_fragments`.
let terms: BTreeMap<String, f32> = terms
.iter()
.map(|(term, score)| (term.text().to_string(), *score))
.collect();
search_fragments(tokenizer,
&text,
terms,
max_num_chars);
}
}
}
}
}
// search_fragments(boxed_tokenizer, &text, terms, 3);
// Placeholder: the return value has not been implemented yet.
panic!("e");
}
#[cfg(test)]
@@ -346,7 +376,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl
let text = "a b c d";
let mut terms = BTreeMap::new();
let terms = BTreeMap::new();
let fragments = search_fragments(boxed_tokenizer, &text, terms, 3);
assert_eq!(fragments.len(), 0);