From 835cdc2fe8f5ecceaaa1c65619af96fd8114b3c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Mon, 27 Aug 2018 22:14:59 +0530 Subject: [PATCH] Initial version of snippet refer #368 --- src/snippet/mod.rs | 141 ++++++++++++++++++++++++++++++++++--- src/tokenizer/mod.rs | 2 +- src/tokenizer/tokenizer.rs | 2 +- 3 files changed, 132 insertions(+), 13 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 4cc1e41b9..4356e0a80 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,4 +1,4 @@ -use tokenizer::{TokenStream, Tokenizer}; +use tokenizer::{TokenStream, Tokenizer, Token}; use std::collections::BTreeMap; use Term; use Document; @@ -7,6 +7,7 @@ use schema::FieldValue; use schema::Value; use tokenizer::BoxedTokenizer; +#[derive(Debug)] pub struct HighlightSection { start: usize, stop: usize, @@ -21,6 +22,7 @@ impl HighlightSection { } } +#[derive(Debug)] pub struct FragmentCandidate { score: f32, start_offset: usize, @@ -29,13 +31,53 @@ pub struct FragmentCandidate { highlighted: Vec<HighlightSection>, } -pub struct Snippet { - fragments: Vec<FragmentCandidate>, +impl FragmentCandidate { + + fn new(start_offset: usize, end_offset: usize) -> FragmentCandidate { + FragmentCandidate{score: 0.0, + start_offset: start_offset, + stop_offset: end_offset, + num_chars: 0, + highlighted: vec![]} + } + + /// Updates `score` and `highlighted` fields of the objects. 
+ /// + /// + fn calculate_score(&mut self, token: &Token, terms: &BTreeMap<String, f32>) { + if let Some(score) = terms.get(&token.text.to_lowercase()) { + self.score += score; + self.highlighted.push(HighlightSection{start: token.offset_from, + stop: token.offset_to}); + } + } } +#[derive(Debug)] +pub struct Snippet { + fragments: String, + highlighted: Vec<HighlightSection>, +} + +const HIGHLIGHTEN_PREFIX:&str = "<b>"; +const HIGHLIGHTEN_POSTFIX:&str = "</b>"; + impl Snippet { - pub fn to_html() -> String { - unimplemented!(); + + /// Returns a highlighted HTML from the `Snippet`. + pub fn to_html(&self) -> String { + let mut html = String::new(); + let mut start_from: usize = 0; + + for item in self.highlighted.iter() { + html.push_str(&self.fragments[start_from..item.start]); + html.push_str(HIGHLIGHTEN_PREFIX); + html.push_str(&self.fragments[item.start..item.stop]); + html.push_str(HIGHLIGHTEN_POSTFIX); + start_from = item.stop; + } + html.push_str(&self.fragments[start_from..self.fragments.len()]); + html } } @@ -61,15 +103,61 @@ impl Snippet { /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\ /// has to be a valid string. 
fn search_fragments<'a>( - tokenizer: &BoxedTokenizer, + tokenizer: Box<BoxedTokenizer>, text: &'a str, terms: BTreeMap<String, f32>, max_num_chars: usize) -> Vec<FragmentCandidate> { - unimplemented!(); + let mut token_stream = tokenizer.token_stream(text); + let mut fragment = FragmentCandidate::new(0, 0); + let mut fragments:Vec<FragmentCandidate> = vec![]; + + loop { + if let Some(next) = token_stream.next() { + if (next.offset_to - fragment.start_offset) > max_num_chars { + let txt = &text[fragment.start_offset..fragment.stop_offset]; + if fragment.score > 0.0 { + fragments.push(fragment) + }; + fragment = FragmentCandidate::new(next.offset_from, next.offset_to); + } else { + fragment.calculate_score(next, &terms); + fragment.stop_offset = next.offset_to; + } + } else { + let txt = &text[fragment.start_offset..fragment.stop_offset]; + if fragment.score > 0.0 { + fragments.push(fragment) + }; + break; + } + } + + fragments } -fn select_best_fragment_combination(fragments_candidate: Vec<(&str, Vec<HighlightSection>)>, max_num_chars: usize) -> Snippet { - unimplemented!(); +/// Returns a Snippet +/// +/// Takes a vector of `FragmentCandidate`s and the text. +/// Figures out the best fragment from it and creates a snippet. 
+fn select_best_fragment_combination<'a>(fragments: Vec<FragmentCandidate>, + text: &'a str,) -> Snippet { + if let Some(init) = fragments.iter().nth(0) { + let fragment = fragments.iter().skip(1).fold(init, |acc, item| { + if item.score > init.score { item } else { init } + }); + let fragment_text = &text[fragment.start_offset..fragment.stop_offset]; + let highlighted = fragment.highlighted.iter().map(|item| { + HighlightSection{start: item.start-fragment.start_offset, + stop: item.stop-fragment.start_offset} + }).collect(); + Snippet{fragments: fragment_text.to_owned(), + highlighted: highlighted} + } else { + // when there are no fragments to choose from, + // for now create an empty snippet + Snippet{fragments: String::new(), + highlighted: vec![]} + } } pub fn generate_snippet<'a>( @@ -83,6 +171,37 @@ pub fn generate_snippet<'a>( #[cfg(test)] mod tests { + use tokenizer::{SimpleTokenizer, box_tokenizer}; + use std::iter::Iterator; + use std::collections::BTreeMap; + use super::{search_fragments, select_best_fragment_combination}; + #[test] - fn test_snippet() {} -} \ No newline at end of file + fn test_snippet() { + let tokenizer = SimpleTokenizer; + + let t = box_tokenizer(tokenizer); + + let text = "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe, concurrent, practical language\", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], but its designers intend it to provide better memory safety while still maintaining performance. + +Rust is free and open-source software, released under an MIT License, or Apache License 2.0. Its designers have refined the language through the experiences of writing the Servo web browser layout engine[14] and the Rust compiler. A large proportion of current commits to the project are from community members.[15] + +Rust won first place for \"most loved programming language\" in the Stack Overflow Developer Survey in 2016, 2017, and 2018. 
+"; + + let mut terms = BTreeMap::new(); + terms.insert(String::from("rust"), 1.0); + terms.insert(String::from("language"), 0.9); + + let fragments = search_fragments(t, &text, terms, 100); + assert_eq!(fragments.len(), 7); + { + let first = fragments.iter().nth(0).unwrap(); + assert_eq!(first.score, 1.9); + assert_eq!(first.stop_offset, 89); + } + let snippet = select_best_fragment_combination(fragments, &text); + assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned()); + assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems programming <b>language</b> sponsored by Mozilla which describes it as a \"safe".to_owned()) + } +} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index fd0bfbbde..d4a735bd2 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -153,7 +153,7 @@ pub use self::simple_tokenizer::SimpleTokenizer; pub use self::stemmer::Stemmer; pub use self::stop_word_filter::StopWordFilter; pub(crate) use self::token_stream_chain::TokenStreamChain; -pub use self::tokenizer::BoxedTokenizer; +pub use self::tokenizer::{BoxedTokenizer, box_tokenizer}; pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; pub use self::tokenizer_manager::TokenizerManager; diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index fcdf8f21b..e806b70d8 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -130,7 +130,7 @@ where } } -pub(crate) fn box_tokenizer<A>(a: A) -> Box<BoxedTokenizer> +pub fn box_tokenizer<A>(a: A) -> Box<BoxedTokenizer> where A: 'static + Send + Sync + for<'a> Tokenizer<'a>, {