From 9101bf575343926256830ddfd9aa1b80004ab637 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 7 Sep 2018 09:57:12 +0900 Subject: [PATCH] Fragments --- src/core/index.rs | 21 +++++++++++- src/core/searcher.rs | 10 +++++- src/query/automaton_weight.rs | 2 +- src/query/term_query/term_weight.rs | 2 +- src/query/weight.rs | 13 +++++--- src/snippet/mod.rs | 50 +++++++++++++++++++++++------ 6 files changed, 80 insertions(+), 18 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index efdfedc5f..c6f465eef 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -29,6 +29,9 @@ use num_cpus; use std::path::Path; use tokenizer::TokenizerManager; use IndexWriter; +use schema::FieldType; +use schema::Field; +use tokenizer::BoxedTokenizer; fn load_metas(directory: &Directory) -> Result { let meta_data = directory.atomic_read(&META_FILEPATH)?; @@ -113,6 +116,22 @@ impl Index { &self.tokenizers } + pub fn tokenizer_for_field(&self, field: Field) -> Option> { + let field_type = self.schema.get_field_entry(field).field_type(); + let tokenizer: &TokenizerManager = self.tokenizers(); + match field_type { + FieldType::Str(text_options) => { + text_options.get_indexing_options() + .map(|text_indexing_options| text_indexing_options.tokenizer()) + .and_then(|tokenizer_name| tokenizer.get(tokenizer_name)) + + }, + _ => { + None + } + } + } + /// Opens a new directory from an index path. #[cfg(feature = "mmap")] pub fn open_in_dir>(directory_path: P) -> Result { @@ -257,7 +276,7 @@ impl Index { let schema = self.schema(); let num_searchers: usize = self.num_searchers.load(Ordering::Acquire); let searchers = (0..num_searchers) - .map(|_| Searcher::new(schema.clone(), segment_readers.clone())) + .map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone())) .collect(); self.searcher_pool.publish_new_generation(searchers); Ok(()) diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 8f36b58ea..9de6c857c 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -10,6 +10,7 @@ use std::sync::Arc; use termdict::TermMerger; use DocAddress; use Result; +use Index; /// Holds a list of `SegmentReader`s ready for search. /// @@ -18,17 +19,24 @@ use Result; /// pub struct Searcher { schema: Schema, + index: Index, segment_readers: Vec, } impl Searcher { /// Creates a new `Searcher` - pub(crate) fn new(schema: Schema, segment_readers: Vec) -> Searcher { + pub(crate) fn new(schema: Schema, index: Index, segment_readers: Vec) -> Searcher { Searcher { schema, + index, segment_readers, } } + + pub fn index(&self) -> &Index { + &self.index + } + /// Fetches a document from tantivy's store given a `DocAddress`. /// /// The searcher uses the segment ordinal to route the diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 54f8c5f8b..854ecb66e 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -75,7 +75,7 @@ where inverted_index.terms().ord_to_term(term_ord, &mut term_buffer); let term = Term::from_field_bytes(self.field, &term_buffer[..]); for &doc_id in &docs_matching_current_term { - matching_terms.add_term(doc_id, term.clone()); + matching_terms.add_term(doc_id, term.clone(), 1f32); } } } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 1a9075b5a..aa1b5e456 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -49,7 +49,7 @@ impl Weight for TermWeight { for doc_id in doc_ids { match scorer.skip_next(doc_id) { SkipResult::Reached => { - matching_terms.add_term(doc_id, self.term.clone()); + matching_terms.add_term(doc_id, self.term.clone(), 1f32); } SkipResult::OverStep => {} SkipResult::End => { diff --git a/src/query/weight.rs b/src/query/weight.rs index 5b603ab1c..8a12c01da 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -5,9 +5,10 @@ use DocId; use std::collections::HashSet; use Term; use std::collections::BTreeMap; +use std::collections::HashMap; pub struct MatchingTerms { - doc_to_terms: BTreeMap> + doc_to_terms: BTreeMap> } impl MatchingTerms { @@ -16,18 +17,22 @@ impl MatchingTerms { doc_to_terms: doc_ids .iter() .cloned() - .map(|doc_id| (doc_id, HashSet::default())) + .map(|doc_id| (doc_id, HashMap::default())) .collect() } } + pub fn terms_for_doc(&self, doc_id: DocId) -> Option<&HashMap> { + self.doc_to_terms.get(&doc_id) + } + pub fn sorted_doc_ids(&self) -> Vec { self.doc_to_terms.keys().cloned().collect() } - pub fn add_term(&mut self, doc_id: DocId, term: Term) { + pub fn add_term(&mut self, doc_id: DocId, term: Term, score: f32) { if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) { - terms.insert(term); + terms.insert(term, score); } } } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index cd194e0d8..c82777782 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -12,6 +12,9 @@ use DocAddress; use DocId; use Searcher; use query::MatchingTerms; +use schema::Field; +use std::collections::HashMap; +use SegmentLocalId; #[derive(Debug)] pub struct HighlightSection { @@ -189,28 +192,55 @@ fn select_best_fragment_combination<'a>( -fn matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result<()> { +fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result> { let weight = query.weight(searcher, false)?; let mut doc_groups = doc_addresses .iter() .group_by(|doc_address| doc_address.0); + let mut matching_terms_per_segment: HashMap = HashMap::new(); for (segment_ord, doc_addrs) in doc_groups.into_iter() { let doc_addrs_vec: Vec = doc_addrs.map(|doc_addr| doc_addr.1).collect(); let mut matching_terms = MatchingTerms::from_doc_ids(&doc_addrs_vec[..]); let segment_reader = searcher.segment_reader(segment_ord); weight.matching_terms(segment_reader, &mut matching_terms)?; + matching_terms_per_segment.insert(segment_ord, matching_terms); } - let terms = HashSet<(DocId, Vec)>; - Ok(()) + Ok(matching_terms_per_segment) } -pub fn generate_snippet<'a>( - doc: &'a [DocAddress], - index: &Index, +pub fn generate_snippet( + doc_addresses: &[DocAddress], + fields: &[Field], + searcher: &Searcher, query: &Query, - terms: Vec, - max_num_chars: usize) -> Snippet { - search_fragments(boxed_tokenizer, &text, terms, 3); + max_num_chars: usize) -> Result> { + // TODO sort doc_addresses + let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?; + for doc_address in doc_addresses { + let doc = searcher.doc(doc_address)?; + for &field in fields { + let mut text = String::new(); + for value in doc.get_all(field) { + text.push_str(value.text()); + } + if let Some(tokenizer) = searcher.index().tokenizer_for_field(field) { + if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&doc_address.segment_ord()) { + if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) { + let terms: BTreeMap = terms + .iter() + .map(|(term, score)| (term.text().to_string(), *score)) + .collect(); + search_fragments(tokenizer, + &text, + terms, + max_num_chars); + } + } + } + } + } + // search_fragments(boxed_tokenizer, &text, terms, 3); + panic!("e"); } #[cfg(test)] @@ -346,7 +376,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl let text = "a b c d"; - let mut terms = BTreeMap::new(); + let terms = BTreeMap::new(); let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); assert_eq!(fragments.len(), 0);