mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-02 16:40:43 +00:00
@@ -1,4 +1,4 @@
|
||||
use tokenizer::{TokenStream, Tokenizer};
|
||||
use tokenizer::{TokenStream, Tokenizer, Token};
|
||||
use std::collections::BTreeMap;
|
||||
use Term;
|
||||
use Document;
|
||||
@@ -7,6 +7,7 @@ use schema::FieldValue;
|
||||
use schema::Value;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct HighlightSection {
|
||||
start: usize,
|
||||
stop: usize,
|
||||
@@ -21,6 +22,7 @@ impl HighlightSection {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FragmentCandidate {
|
||||
score: f32,
|
||||
start_offset: usize,
|
||||
@@ -29,13 +31,53 @@ pub struct FragmentCandidate {
|
||||
highlighted: Vec<HighlightSection>,
|
||||
}
|
||||
|
||||
pub struct Snippet {
|
||||
fragments: Vec<String>,
|
||||
impl FragmentCandidate {
|
||||
|
||||
fn new(start_offset: usize, end_offset: usize) -> FragmentCandidate {
|
||||
FragmentCandidate{score: 0.0,
|
||||
start_offset: start_offset,
|
||||
stop_offset: end_offset,
|
||||
num_chars: 0,
|
||||
highlighted: vec![]}
|
||||
}
|
||||
|
||||
/// Updates `score` and `highlighted` fields of the objects.
|
||||
///
|
||||
///
|
||||
fn calculate_score(&mut self, token: &Token, terms: &BTreeMap<String, f32>) {
|
||||
if let Some(score) = terms.get(&token.text.to_lowercase()) {
|
||||
self.score += score;
|
||||
self.highlighted.push(HighlightSection{start: token.offset_from,
|
||||
stop: token.offset_to});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Snippet {
|
||||
fragments: String,
|
||||
highlighted: Vec<HighlightSection>,
|
||||
}
|
||||
|
||||
const HIGHLIGHTEN_PREFIX:&str = "<b>";
|
||||
const HIGHLIGHTEN_POSTFIX:&str = "</b>";
|
||||
|
||||
impl Snippet {
|
||||
pub fn to_html() -> String {
|
||||
unimplemented!();
|
||||
|
||||
/// Returns a hignlightned html from the `Snippet`.
|
||||
pub fn to_html(&self) -> String {
|
||||
let mut html = String::new();
|
||||
let mut start_from: usize = 0;
|
||||
|
||||
for item in self.highlighted.iter() {
|
||||
html.push_str(&self.fragments[start_from..item.start]);
|
||||
html.push_str(HIGHLIGHTEN_PREFIX);
|
||||
html.push_str(&self.fragments[item.start..item.stop]);
|
||||
html.push_str(HIGHLIGHTEN_POSTFIX);
|
||||
start_from = item.stop;
|
||||
}
|
||||
html.push_str(&self.fragments[start_from..self.fragments.len()]);
|
||||
html
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,15 +103,61 @@ impl Snippet {
|
||||
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
|
||||
/// has to be a valid string.
|
||||
fn search_fragments<'a>(
|
||||
tokenizer: &BoxedTokenizer,
|
||||
tokenizer: Box<BoxedTokenizer>,
|
||||
text: &'a str,
|
||||
terms: BTreeMap<String, f32>,
|
||||
max_num_chars: usize) -> Vec<FragmentCandidate> {
|
||||
unimplemented!();
|
||||
let mut token_stream = tokenizer.token_stream(text);
|
||||
let mut fragment = FragmentCandidate::new(0, 0);
|
||||
let mut fragments:Vec<FragmentCandidate> = vec![];
|
||||
|
||||
loop {
|
||||
if let Some(next) = token_stream.next() {
|
||||
if (next.offset_to - fragment.start_offset) > max_num_chars {
|
||||
let txt = &text[fragment.start_offset..fragment.stop_offset];
|
||||
if fragment.score > 0.0 {
|
||||
fragments.push(fragment)
|
||||
};
|
||||
fragment = FragmentCandidate::new(next.offset_from, next.offset_to);
|
||||
} else {
|
||||
fragment.calculate_score(next, &terms);
|
||||
fragment.stop_offset = next.offset_to;
|
||||
}
|
||||
} else {
|
||||
let txt = &text[fragment.start_offset..fragment.stop_offset];
|
||||
if fragment.score > 0.0 {
|
||||
fragments.push(fragment)
|
||||
};
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fragments
|
||||
}
|
||||
|
||||
fn select_best_fragment_combination(fragments_candidate: Vec<(&str, Vec<FragmentCandidate>)>, max_num_chars: usize) -> Snippet {
|
||||
unimplemented!();
|
||||
/// Returns a Snippet
|
||||
///
|
||||
/// Takes a vector of `FragmentCandidate`s and the text.
|
||||
/// Figures out the best fragment from it and creates a snippet.
|
||||
fn select_best_fragment_combination<'a>(fragments: Vec<FragmentCandidate>,
|
||||
text: &'a str,) -> Snippet {
|
||||
if let Some(init) = fragments.iter().nth(0) {
|
||||
let fragment = fragments.iter().skip(1).fold(init, |acc, item| {
|
||||
if item.score > init.score { item } else { init }
|
||||
});
|
||||
let fragment_text = &text[fragment.start_offset..fragment.stop_offset];
|
||||
let highlighted = fragment.highlighted.iter().map(|item| {
|
||||
HighlightSection{start: item.start-fragment.start_offset,
|
||||
stop: item.stop-fragment.start_offset}
|
||||
}).collect();
|
||||
Snippet{fragments: fragment_text.to_owned(),
|
||||
highlighted: highlighted}
|
||||
} else {
|
||||
// when there no fragments to chose from,
|
||||
// for now create a empty snippet
|
||||
Snippet{fragments: String::new(),
|
||||
highlighted: vec![]}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn generate_snippet<'a>(
|
||||
@@ -83,6 +171,37 @@ pub fn generate_snippet<'a>(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tokenizer::{SimpleTokenizer, box_tokenizer};
|
||||
use std::iter::Iterator;
|
||||
use std::collections::BTreeMap;
|
||||
use super::{search_fragments, select_best_fragment_combination};
|
||||
|
||||
#[test]
|
||||
fn test_snippet() {}
|
||||
}
|
||||
fn test_snippet() {
|
||||
let tokenizer = SimpleTokenizer;
|
||||
|
||||
let t = box_tokenizer(tokenizer);
|
||||
|
||||
let text = "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe, concurrent, practical language\", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], but its designers intend it to provide better memory safety while still maintaining performance.
|
||||
|
||||
Rust is free and open-source software, released under an MIT License, or Apache License 2.0. Its designers have refined the language through the experiences of writing the Servo web browser layout engine[14] and the Rust compiler. A large proportion of current commits to the project are from community members.[15]
|
||||
|
||||
Rust won first place for \"most loved programming language\" in the Stack Overflow Developer Survey in 2016, 2017, and 2018.
|
||||
";
|
||||
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("rust"), 1.0);
|
||||
terms.insert(String::from("language"), 0.9);
|
||||
|
||||
let fragments = search_fragments(t, &text, terms, 100);
|
||||
assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = fragments.iter().nth(0).unwrap();
|
||||
assert_eq!(first.score, 1.9);
|
||||
assert_eq!(first.stop_offset, 89);
|
||||
}
|
||||
let snippet = select_best_fragment_combination(fragments, &text);
|
||||
assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned());
|
||||
assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems programming <b>language</b> sponsored by Mozilla which describes it as a \"safe".to_owned())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -153,7 +153,7 @@ pub use self::simple_tokenizer::SimpleTokenizer;
|
||||
pub use self::stemmer::Stemmer;
|
||||
pub use self::stop_word_filter::StopWordFilter;
|
||||
pub(crate) use self::token_stream_chain::TokenStreamChain;
|
||||
pub use self::tokenizer::BoxedTokenizer;
|
||||
pub use self::tokenizer::{BoxedTokenizer, box_tokenizer};
|
||||
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
pub use self::tokenizer_manager::TokenizerManager;
|
||||
|
||||
|
||||
@@ -130,7 +130,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn box_tokenizer<A>(a: A) -> Box<BoxedTokenizer>
|
||||
pub fn box_tokenizer<A>(a: A) -> Box<BoxedTokenizer>
|
||||
where
|
||||
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user