mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-30 15:10:40 +00:00
issue/191 Added an analyzer manager.
This commit is contained in:
@@ -179,7 +179,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// Here, if the user does not specify which
|
||||
// field they want to search, tantivy will search
|
||||
// in both title and body.
|
||||
let mut query_parser = QueryParser::new(index.schema(), vec![title, body]);
|
||||
let mut query_parser = QueryParser::for_index(index, vec![title, body]);
|
||||
|
||||
// QueryParser may fail if the query is not in the right
|
||||
// format. For user facing applications, this can be a problem.
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
/// The analyzer module contains all of the tools used to process
|
||||
/// text in `tantivy`.
|
||||
|
||||
use std::borrow::{Borrow, BorrowMut};
|
||||
use analyzer::TokenStreamChain;
|
||||
|
||||
/// Token
|
||||
pub struct Token {
|
||||
@@ -26,7 +30,7 @@ impl Default for Token {
|
||||
}
|
||||
}
|
||||
|
||||
pub trait Analyzer<'a>: Sized {
|
||||
pub trait Analyzer<'a>: Sized + Clone {
|
||||
type TokenStreamImpl: TokenStream;
|
||||
|
||||
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl;
|
||||
@@ -41,20 +45,49 @@ pub trait Analyzer<'a>: Sized {
|
||||
}
|
||||
}
|
||||
|
||||
pub trait BoxedAnalyzer {
|
||||
fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
|
||||
pub trait BoxedAnalyzer: Send + Sync {
|
||||
fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
|
||||
fn token_stream_texts<'b>(&mut self, texts: &'b [&'b str]) -> Box<TokenStream + 'b>;
|
||||
fn boxed_clone(&self) -> Box<BoxedAnalyzer>;
|
||||
}
|
||||
|
||||
struct BoxableAnalyzer<A>(A) where A: for <'a> Analyzer<'a>;
|
||||
#[derive(Clone)]
|
||||
struct BoxableAnalyzer<A>(A) where A: for <'a> Analyzer<'a> + Send + Sync;
|
||||
|
||||
impl<A> BoxedAnalyzer for BoxableAnalyzer<A> where A: 'static + for <'a> Analyzer<'a> {
|
||||
fn token_stream<'b>(&mut self, text: &'b str) -> Box<TokenStream + 'b> {
|
||||
impl<A> BoxedAnalyzer for BoxableAnalyzer<A> where A: 'static + Send + Sync + for <'a> Analyzer<'a> {
|
||||
fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a> {
|
||||
box self.0.token_stream(text)
|
||||
}
|
||||
|
||||
fn token_stream_texts<'b>(&mut self, texts: &'b [&'b str]) -> Box<TokenStream + 'b> {
|
||||
assert!(texts.len() > 0);
|
||||
if texts.len() == 1 {
|
||||
box self.0.token_stream(texts[0])
|
||||
}
|
||||
else {
|
||||
let mut offsets = vec!();
|
||||
let mut total_offset = 0;
|
||||
for text in texts {
|
||||
offsets.push(total_offset);
|
||||
total_offset += text.len();
|
||||
}
|
||||
let token_streams: Vec<_> = texts
|
||||
.iter()
|
||||
.map(|text| {
|
||||
self.0.token_stream(text)
|
||||
})
|
||||
.collect();
|
||||
box TokenStreamChain::new(offsets, token_streams)
|
||||
}
|
||||
}
|
||||
|
||||
fn boxed_clone(&self) -> Box<BoxedAnalyzer> {
|
||||
box self.clone()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn box_analyzer<A>(a: A) -> Box<BoxedAnalyzer>
|
||||
where A: 'static + for <'a> Analyzer<'a> {
|
||||
where A: 'static + Send + Sync + for <'a> Analyzer<'a> {
|
||||
box BoxableAnalyzer(a)
|
||||
}
|
||||
|
||||
@@ -102,7 +135,7 @@ pub trait TokenStream {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct ChainAnalyzer<HeadTokenFilterFactory, TailAnalyzer> {
|
||||
head: HeadTokenFilterFactory,
|
||||
tail: TailAnalyzer,
|
||||
@@ -117,13 +150,13 @@ impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a>
|
||||
type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
|
||||
|
||||
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
let tail_token_stream = self.tail.token_stream(text);
|
||||
let tail_token_stream = self.tail.token_stream(text );
|
||||
self.head.transform(tail_token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub trait TokenFilterFactory<TailTokenStream: TokenStream> {
|
||||
pub trait TokenFilterFactory<TailTokenStream: TokenStream>: Clone {
|
||||
type ResultTokenStream: TokenStream;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
|
||||
|
||||
66
src/analyzer/analyzer_manager.rs
Normal file
66
src/analyzer/analyzer_manager.rs
Normal file
@@ -0,0 +1,66 @@
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use analyzer::BoxedAnalyzer;
|
||||
use analyzer::Analyzer;
|
||||
use analyzer::box_analyzer;
|
||||
use analyzer::SimpleTokenizer;
|
||||
use analyzer::JapaneseTokenizer;
|
||||
use analyzer::RemoveLongFilter;
|
||||
use analyzer::LowerCaser;
|
||||
use analyzer::Stemmer;
|
||||
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct AnalyzerManager {
|
||||
analyzers: Arc< RwLock<HashMap<String, Box<BoxedAnalyzer> >> >
|
||||
}
|
||||
|
||||
impl AnalyzerManager {
|
||||
pub fn register<A>(&self, analyzer_name: &str, analyzer: A)
|
||||
where A: 'static + Send + Sync + for <'a> Analyzer<'a> {
|
||||
let boxed_analyzer = box_analyzer(analyzer);
|
||||
self.analyzers
|
||||
.write()
|
||||
.expect("Acquiring the lock should never fail")
|
||||
.insert(analyzer_name.to_string(), boxed_analyzer);
|
||||
}
|
||||
|
||||
pub fn get(&self, analyzer_name: &str) -> Option<Box<BoxedAnalyzer>> {
|
||||
self.analyzers
|
||||
.read()
|
||||
.expect("Acquiring the lock should never fail")
|
||||
.get(analyzer_name)
|
||||
.map(|boxed_analyzer| {
|
||||
boxed_analyzer.boxed_clone()
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for AnalyzerManager {
|
||||
/// Creates an `AnalyzerManager` prepopulated with
|
||||
/// the default analyzers of `tantivy`.
|
||||
/// - simple
|
||||
/// - en_stem
|
||||
/// - jp
|
||||
fn default() -> AnalyzerManager {
|
||||
let manager = AnalyzerManager {
|
||||
analyzers: Arc::new(RwLock::new(HashMap::new()))
|
||||
};
|
||||
manager.register("simple",
|
||||
SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
);
|
||||
manager.register("en_stem",
|
||||
SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new())
|
||||
);
|
||||
manager.register("ja",
|
||||
JapaneseTokenizer
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
);
|
||||
manager
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,10 @@
|
||||
use super::{Token, Analyzer, TokenStream};
|
||||
use tinysegmenter;
|
||||
|
||||
pub struct JPTokenizer;
|
||||
|
||||
/// Simple Japanese tokenizer based on the `tinysegmenter` crate.
|
||||
#[derive(Clone)]
|
||||
pub struct JapaneseTokenizer;
|
||||
|
||||
#[derive(Eq, PartialEq)]
|
||||
enum Cursor {
|
||||
@@ -10,13 +13,13 @@ enum Cursor {
|
||||
Terminated,
|
||||
}
|
||||
|
||||
pub struct JPTokenizerStream {
|
||||
pub struct JapaneseTokenizerStream {
|
||||
tokens: Vec<Token>,
|
||||
cursor: Cursor,
|
||||
}
|
||||
|
||||
impl<'a> Analyzer<'a> for JPTokenizer {
|
||||
type TokenStreamImpl = JPTokenizerStream;
|
||||
impl<'a> Analyzer<'a> for JapaneseTokenizer {
|
||||
type TokenStreamImpl = JapaneseTokenizerStream;
|
||||
|
||||
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
let mut tokens = vec![];
|
||||
@@ -25,21 +28,23 @@ impl<'a> Analyzer<'a> for JPTokenizer {
|
||||
for (pos, term) in tinysegmenter::tokenize(text).into_iter().enumerate() {
|
||||
offset_from = offset_to;
|
||||
offset_to = offset_from + term.len();
|
||||
tokens.push(Token {
|
||||
offset_from: offset_from,
|
||||
offset_to: offset_to,
|
||||
position: pos,
|
||||
term: term,
|
||||
});
|
||||
if term.chars().all(char::is_alphanumeric) {
|
||||
tokens.push(Token {
|
||||
offset_from: offset_from,
|
||||
offset_to: offset_to,
|
||||
position: pos,
|
||||
term: term,
|
||||
});
|
||||
}
|
||||
}
|
||||
JPTokenizerStream {
|
||||
JapaneseTokenizerStream {
|
||||
tokens: tokens,
|
||||
cursor: Cursor::HasNotStarted,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for JPTokenizerStream {
|
||||
impl<'a> TokenStream for JapaneseTokenizerStream {
|
||||
fn advance(&mut self) -> bool {
|
||||
let new_cursor = match self.cursor {
|
||||
Cursor::HasNotStarted => {
|
||||
@@ -1,6 +1,9 @@
|
||||
use super::{TokenFilterFactory, TokenStream, Token};
|
||||
use std::ascii::AsciiExt;
|
||||
|
||||
|
||||
/// Token filter that lowercases terms.
|
||||
#[derive(Clone)]
|
||||
pub struct LowerCaser;
|
||||
|
||||
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser
|
||||
|
||||
@@ -4,45 +4,31 @@ mod analyzer;
|
||||
mod simple_tokenizer;
|
||||
mod lower_caser;
|
||||
mod remove_long;
|
||||
mod remove_nonalphanum;
|
||||
mod stemmer;
|
||||
mod jp_tokenizer;
|
||||
mod analyzer_manager;
|
||||
mod japanese_tokenizer;
|
||||
mod token_stream_chain;
|
||||
|
||||
pub use self::analyzer::{box_analyzer, Analyzer, Token, TokenFilterFactory,
|
||||
TokenStream};
|
||||
pub use self::analyzer::{box_analyzer, Analyzer, Token, TokenFilterFactory, TokenStream};
|
||||
pub use self::analyzer::BoxedAnalyzer;
|
||||
pub use self::analyzer_manager::AnalyzerManager;
|
||||
pub use self::simple_tokenizer::SimpleTokenizer;
|
||||
pub use self::jp_tokenizer::JPTokenizer;
|
||||
pub use self::token_stream_chain::TokenStreamChain;
|
||||
pub use self::japanese_tokenizer::JapaneseTokenizer;
|
||||
pub use self::remove_long::RemoveLongFilter;
|
||||
pub use self::lower_caser::LowerCaser;
|
||||
pub use self::stemmer::Stemmer;
|
||||
pub use self::remove_nonalphanum::RemoveNonAlphaFilter;
|
||||
pub use self::analyzer::BoxedAnalyzer;
|
||||
|
||||
|
||||
pub fn en_pipeline<'a>() -> Box<BoxedAnalyzer> {
|
||||
box_analyzer(
|
||||
SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(20))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new())
|
||||
)
|
||||
}
|
||||
|
||||
pub fn jp_pipeline<'a>() -> Box<BoxedAnalyzer> {
|
||||
box_analyzer(
|
||||
JPTokenizer
|
||||
.filter(RemoveLongFilter::limit(20))
|
||||
.filter(RemoveNonAlphaFilter)
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::{en_pipeline, jp_pipeline, Token};
|
||||
use super::Token;
|
||||
use super::AnalyzerManager;
|
||||
|
||||
#[test]
|
||||
fn test_en_analyzer() {
|
||||
let mut en_analyzer = en_pipeline();
|
||||
let analyzer_manager = AnalyzerManager::default();
|
||||
assert!(analyzer_manager.get("en_doesnotexist").is_none());
|
||||
let mut en_analyzer = analyzer_manager.get("en_stem").unwrap();
|
||||
let mut tokens: Vec<String> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
|
||||
@@ -57,7 +43,9 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn test_jp_analyzer() {
|
||||
let mut en_analyzer = jp_pipeline();
|
||||
let analyzer_manager = AnalyzerManager::default();
|
||||
let mut en_analyzer = analyzer_manager.get("ja").unwrap();
|
||||
|
||||
let mut tokens: Vec<String> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
|
||||
@@ -73,7 +61,8 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer_empty() {
|
||||
let mut en_analyzer = en_pipeline();
|
||||
let analyzer_manager = AnalyzerManager::default();
|
||||
let mut en_analyzer = analyzer_manager.get("en_stem").unwrap();
|
||||
{
|
||||
let mut tokens: Vec<String> = vec![];
|
||||
{
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
use super::{TokenFilterFactory, TokenStream, Token};
|
||||
|
||||
|
||||
/// `RemoveLongFilter` removes tokens that are longer
|
||||
/// than a given number of bytes (in UTF-8 representation).
|
||||
///
|
||||
/// It is especially useful when indexing unconstrained content.
|
||||
/// e.g. Mail containing base-64 encoded pictures etc.
|
||||
#[derive(Clone)]
|
||||
pub struct RemoveLongFilter {
|
||||
length_limit: usize,
|
||||
}
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
use super::{TokenFilterFactory, TokenStream, Token};
|
||||
|
||||
|
||||
/// Token filter factory whose token streams only let through tokens
/// whose term is made exclusively of alphanumeric characters.
pub struct RemoveNonAlphaFilter;
|
||||
|
||||
impl<TailTokenStream> RemoveNonAlphaFilterStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
fn predicate(&self, token: &Token) -> bool {
|
||||
for c in token.term.chars() {
|
||||
if !c.is_alphanumeric() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveNonAlphaFilter
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
type ResultTokenStream = RemoveNonAlphaFilterStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, tail: TailTokenStream) -> Self::ResultTokenStream {
|
||||
RemoveNonAlphaFilterStream { tail: tail }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RemoveNonAlphaFilterStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
tail: TailTokenStream,
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenStream for RemoveNonAlphaFilterStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
loop {
|
||||
if self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@
|
||||
use std::str::CharIndices;
|
||||
use super::{Token, Analyzer, TokenStream};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SimpleTokenizer;
|
||||
|
||||
pub struct SimpleTokenStream<'a> {
|
||||
|
||||
@@ -2,14 +2,14 @@ use std::sync::Arc;
|
||||
use super::{TokenFilterFactory, TokenStream, Token};
|
||||
use rust_stemmers::{self, Algorithm};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Stemmer {
|
||||
stemmer: Arc<rust_stemmers::Stemmer>,
|
||||
stemmer_algorithm: Arc<Algorithm>,
|
||||
}
|
||||
|
||||
impl Stemmer {
|
||||
pub fn new() -> Stemmer {
|
||||
let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
|
||||
Stemmer { stemmer: Arc::new(inner_stemmer) }
|
||||
Stemmer { stemmer_algorithm: Arc::new(Algorithm::English) }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,7 +19,8 @@ impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
|
||||
type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
StemmerTokenStream::wrap(self.stemmer.clone(), token_stream)
|
||||
let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
|
||||
StemmerTokenStream::wrap(inner_stemmer, token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -28,7 +29,7 @@ pub struct StemmerTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
tail: TailTokenStream,
|
||||
stemmer: Arc<rust_stemmers::Stemmer>,
|
||||
stemmer: rust_stemmers::Stemmer,
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
|
||||
@@ -58,7 +59,7 @@ impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
|
||||
impl<TailTokenStream> StemmerTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
fn wrap(stemmer: Arc<rust_stemmers::Stemmer>,
|
||||
fn wrap(stemmer: rust_stemmers::Stemmer,
|
||||
tail: TailTokenStream)
|
||||
-> StemmerTokenStream<TailTokenStream> {
|
||||
StemmerTokenStream {
|
||||
|
||||
63
src/analyzer/token_stream_chain.rs
Normal file
63
src/analyzer/token_stream_chain.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
use analyzer::{TokenStream, Token};
|
||||
|
||||
pub struct TokenStreamChain<TTokenStream: TokenStream> {
|
||||
offsets: Vec<usize>,
|
||||
token_streams: Vec<TTokenStream>,
|
||||
position_shift: usize,
|
||||
stream_idx: usize,
|
||||
token: Token,
|
||||
}
|
||||
|
||||
|
||||
impl<'a, TTokenStream> TokenStreamChain<TTokenStream>
|
||||
where TTokenStream: TokenStream {
|
||||
|
||||
pub fn new(offsets: Vec<usize>,
|
||||
token_streams: Vec<TTokenStream>) -> TokenStreamChain<TTokenStream> {
|
||||
TokenStreamChain {
|
||||
offsets: offsets,
|
||||
stream_idx: 0,
|
||||
token_streams: token_streams,
|
||||
position_shift: 0,
|
||||
token: Token::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, TTokenStream> TokenStream for TokenStreamChain<TTokenStream>
|
||||
where TTokenStream: TokenStream {
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.stream_idx < self.token_streams.len() {
|
||||
let token_stream = &mut self.token_streams[self.stream_idx];
|
||||
if token_stream.advance() {
|
||||
let token = token_stream.token();
|
||||
let offset_offset = self.offsets[self.stream_idx];
|
||||
self.token.offset_from = token.offset_from + offset_offset;
|
||||
self.token.offset_from = token.offset_from + offset_offset;
|
||||
self.token.position = token.position + self.position_shift;
|
||||
self.token.term.clear();
|
||||
self.token.term.push_str(token.term.as_str());
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
self.stream_idx += 1;
|
||||
self.position_shift = self.token.position + 2;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
if self.stream_idx > self.token_streams.len() {
|
||||
panic!("You called .token(), after the end of the token stream has been reached");
|
||||
}
|
||||
&self.token
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
if self.stream_idx > self.token_streams.len() {
|
||||
panic!("You called .token(), after the end of the token stream has been reached");
|
||||
}
|
||||
&mut self.token
|
||||
}
|
||||
}
|
||||
@@ -103,7 +103,7 @@ mod tests {
|
||||
{
|
||||
// perform the query
|
||||
let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
|
||||
let mut query_parser = QueryParser::new(schema, vec![text_field]);
|
||||
let mut query_parser = QueryParser::for_index(index, vec![text_field]);
|
||||
let query = query_parser.parse_query("text:text").unwrap();
|
||||
query.search(&searcher, &mut facet_collectors).unwrap();
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ use directory::ManagedDirectory;
|
||||
use core::META_FILEPATH;
|
||||
use super::segment::create_segment;
|
||||
use indexer::segment_updater::save_new_metas;
|
||||
use analyzer::AnalyzerManager;
|
||||
|
||||
const NUM_SEARCHERS: usize = 12;
|
||||
|
||||
@@ -37,6 +38,7 @@ pub struct Index {
|
||||
directory: ManagedDirectory,
|
||||
schema: Schema,
|
||||
searcher_pool: Arc<Pool<Searcher>>,
|
||||
analyzers: AnalyzerManager
|
||||
}
|
||||
|
||||
|
||||
@@ -64,6 +66,10 @@ impl Index {
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
/// Returns a clone of the `AnalyzerManager` held by this index.
pub fn analyzers(&self) -> AnalyzerManager {
    let manager = &self.analyzers;
    manager.clone()
}
|
||||
|
||||
/// Creates a new index in a temp directory.
|
||||
///
|
||||
/// The index will use the `MMapDirectory` in a newly created directory.
|
||||
@@ -85,6 +91,7 @@ impl Index {
|
||||
directory: directory,
|
||||
schema: schema,
|
||||
searcher_pool: Arc::new(Pool::new()),
|
||||
analyzers: AnalyzerManager::default(),
|
||||
};
|
||||
try!(index.load_searchers());
|
||||
Ok(index)
|
||||
@@ -242,6 +249,7 @@ impl Clone for Index {
|
||||
directory: self.directory.clone(),
|
||||
schema: self.schema.clone(),
|
||||
searcher_pool: self.searcher_pool.clone(),
|
||||
analyzers: self.analyzers.clone()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,6 +36,12 @@ pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
|
||||
}
|
||||
|
||||
impl Segment {
|
||||
|
||||
/// Returns a reference to the `Index` held by this segment.
pub fn index(&self) -> &Index {
    let index: &Index = &self.index;
    index
}
|
||||
|
||||
|
||||
/// Returns our index's schema.
|
||||
pub fn schema(&self) -> Schema {
|
||||
self.index.schema()
|
||||
|
||||
@@ -14,6 +14,8 @@ use datastruct::stacker::Heap;
|
||||
use indexer::index_writer::MARGIN_IN_BYTES;
|
||||
use super::operation::AddOperation;
|
||||
use postings::MultiFieldPostingsWriter;
|
||||
use analyzer::BoxedAnalyzer;
|
||||
use schema::Value;
|
||||
|
||||
|
||||
/// A `SegmentWriter` is in charge of creating segment index from a
|
||||
@@ -29,6 +31,7 @@ pub struct SegmentWriter<'a> {
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FastFieldsWriter,
|
||||
doc_opstamps: Vec<u64>,
|
||||
analyzers: Vec<Option<Box<BoxedAnalyzer>>>
|
||||
}
|
||||
|
||||
|
||||
@@ -60,6 +63,18 @@ impl<'a> SegmentWriter<'a> {
|
||||
-> Result<SegmentWriter<'a>> {
|
||||
let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment));
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, heap);
|
||||
let analyzers = schema.fields()
|
||||
.iter()
|
||||
.map(|field_entry| field_entry.field_type())
|
||||
.map(|field_type| {
|
||||
match field_type {
|
||||
&FieldType::Str(ref text_options) => {
|
||||
segment.index().analyzers().get("simple")
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
Ok(SegmentWriter {
|
||||
heap: heap,
|
||||
max_doc: 0,
|
||||
@@ -68,6 +83,7 @@ impl<'a> SegmentWriter<'a> {
|
||||
segment_serializer: segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
analyzers: analyzers,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -117,17 +133,32 @@ impl<'a> SegmentWriter<'a> {
|
||||
let field_options = schema.get_field_entry(field);
|
||||
match *field_options.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
let num_tokens: u32 = if text_options.get_indexing_options().is_tokenized() {
|
||||
self.multifield_postings
|
||||
.index_text(doc_id, field, &field_values)
|
||||
} else {
|
||||
let num_field_values = field_values.len() as u32;
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_text(field, field_value.value().text());
|
||||
self.multifield_postings.suscribe(doc_id, &term);
|
||||
}
|
||||
num_field_values
|
||||
};
|
||||
let num_tokens: u32 =
|
||||
if text_options.get_indexing_options().is_tokenized() {
|
||||
if let Some(ref mut analyzer) = self.analyzers[field.0 as usize] {
|
||||
let texts: Vec<&str> = field_values.iter()
|
||||
.flat_map(|field_value| {
|
||||
match field_value.value() {
|
||||
&Value::Str(ref text) => Some(text.as_str()),
|
||||
_ => None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let mut token_stream = analyzer.token_stream_texts(&texts[..]);
|
||||
self.multifield_postings.index_text(doc_id, field, &mut token_stream)
|
||||
}
|
||||
else {
|
||||
0u32
|
||||
}
|
||||
|
||||
} else {
|
||||
let num_field_values = field_values.len() as u32;
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_text(field, field_value.value().text());
|
||||
self.multifield_postings.suscribe(doc_id, &term);
|
||||
}
|
||||
num_field_values
|
||||
};
|
||||
self.fieldnorms_writer
|
||||
.get_field_writer(field)
|
||||
.map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64));
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
use DocId;
|
||||
use schema::Term;
|
||||
use schema::FieldValue;
|
||||
use postings::PostingsSerializer;
|
||||
use std::io;
|
||||
use postings::Recorder;
|
||||
use Result;
|
||||
use schema::{Schema, Field};
|
||||
use analyzer::{en_pipeline, Token};
|
||||
use analyzer::Token;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::DerefMut;
|
||||
use datastruct::stacker::{HashMap, Heap};
|
||||
use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use analyzer::TokenStream;
|
||||
use schema::TextIndexingOptions;
|
||||
|
||||
fn posting_from_field_entry<'a>(field_entry: &FieldEntry,
|
||||
@@ -62,9 +62,9 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn index_text(&mut self, doc: DocId, field: Field, field_values: &[&FieldValue]) -> u32 {
|
||||
pub fn index_text(&mut self, doc: DocId, field: Field, token_stream: &mut TokenStream) -> u32 {
|
||||
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
|
||||
postings_writer.index_text(&mut self.term_index, doc, field, field_values, self.heap)
|
||||
postings_writer.index_text(&mut self.term_index, doc, field, token_stream, self.heap)
|
||||
}
|
||||
|
||||
pub fn suscribe(&mut self, doc: DocId, term: &Term) {
|
||||
@@ -140,39 +140,24 @@ pub trait PostingsWriter {
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()>;
|
||||
|
||||
|
||||
/// Tokenizes a text and subscribes all of its tokens.
|
||||
fn index_text<'a>(&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
field_values: &[&'a FieldValue],
|
||||
token_stream: &mut TokenStream,
|
||||
heap: &Heap)
|
||||
-> u32 {
|
||||
|
||||
let mut num_tokens: u32 = 0u32;
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
|
||||
term.set_field(field);
|
||||
let mut analyzer = en_pipeline();
|
||||
|
||||
let mut overall_position = 0u32;
|
||||
let mut sink = |token: &Token| {
|
||||
term.set_text(token.term.as_str());
|
||||
self.suscribe(term_index, doc_id, token.position as u32, &term, heap);
|
||||
};
|
||||
|
||||
for field_value in field_values {
|
||||
// TODO fix position when more than one value.
|
||||
let mut token_stream = analyzer.token_stream(field_value.value().text());
|
||||
let mut local_position = 0;
|
||||
num_tokens += {
|
||||
let mut sink = |token: &Token| {
|
||||
term.set_text(token.term.as_str());
|
||||
local_position = token.position as u32;
|
||||
self.suscribe(term_index, doc_id, overall_position + local_position, &term, heap);
|
||||
};
|
||||
token_stream.process(&mut sink)
|
||||
};
|
||||
overall_position += local_position + 2u32;
|
||||
}
|
||||
num_tokens
|
||||
token_stream.process(&mut sink)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -213,6 +198,7 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
|
||||
|
||||
fn suscribe(&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc: DocId,
|
||||
|
||||
@@ -8,11 +8,11 @@ use query::Occur;
|
||||
use query::TermQuery;
|
||||
use postings::SegmentPostingsOption;
|
||||
use query::PhraseQuery;
|
||||
use analyzer::{en_pipeline, BoxedAnalyzer};
|
||||
use schema::{Term, FieldType};
|
||||
use std::str::FromStr;
|
||||
use analyzer::AnalyzerManager;
|
||||
use std::num::ParseIntError;
|
||||
|
||||
use core::Index;
|
||||
|
||||
/// Possible error that may happen when parsing a query.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
@@ -74,7 +74,7 @@ pub struct QueryParser {
|
||||
schema: Schema,
|
||||
default_fields: Vec<Field>,
|
||||
conjunction_by_default: bool,
|
||||
analyzer: Box<BoxedAnalyzer>,
|
||||
analyzer_manager: AnalyzerManager,
|
||||
}
|
||||
|
||||
impl QueryParser {
|
||||
@@ -82,15 +82,25 @@ impl QueryParser {
|
||||
/// * schema - index Schema
|
||||
/// * default_fields - fields used to search if no field is specifically defined
|
||||
/// in the query.
|
||||
pub fn new(schema: Schema, default_fields: Vec<Field>) -> QueryParser {
|
||||
pub fn new(schema: Schema,
|
||||
default_fields: Vec<Field>,
|
||||
analyzer_manager: AnalyzerManager) -> QueryParser {
|
||||
QueryParser {
|
||||
schema: schema,
|
||||
default_fields: default_fields,
|
||||
conjunction_by_default: false,
|
||||
analyzer: en_pipeline(),
|
||||
analyzer_manager: analyzer_manager,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn for_index(index: Index,
|
||||
default_fields: Vec<Field>) -> QueryParser {
|
||||
QueryParser::new(
|
||||
index.schema(),
|
||||
default_fields,
|
||||
index.analyzers())
|
||||
}
|
||||
|
||||
/// Set the default way to compose queries to a conjunction.
|
||||
///
|
||||
/// By default a ,
|
||||
@@ -135,7 +145,7 @@ impl QueryParser {
|
||||
}
|
||||
Ok(ast)
|
||||
}
|
||||
|
||||
|
||||
fn compute_logical_ast_for_leaf(&mut self,
|
||||
field: Field,
|
||||
phrase: &str)
|
||||
@@ -143,6 +153,11 @@ impl QueryParser {
|
||||
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let mut analyzer = self.analyzer_manager
|
||||
.get("simple")
|
||||
.ok_or_else(|| {
|
||||
QueryParserError::FieldNotIndexed(field_entry.name().to_string())
|
||||
})?;
|
||||
if !field_type.is_indexed() {
|
||||
let field_name = field_entry.name().to_string();
|
||||
return Err(QueryParserError::FieldNotIndexed(field_name));
|
||||
@@ -161,7 +176,7 @@ impl QueryParser {
|
||||
FieldType::Str(ref str_options) => {
|
||||
let mut terms: Vec<Term> = Vec::new();
|
||||
if str_options.get_indexing_options().is_tokenized() {
|
||||
let mut token_stream = self.analyzer.token_stream(phrase);
|
||||
let mut token_stream = analyzer.token_stream(phrase);
|
||||
token_stream.process(&mut |token| {
|
||||
let term = Term::from_field_text(field, &token.term);
|
||||
terms.push(term);
|
||||
@@ -296,6 +311,7 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use schema::{SchemaBuilder, Term, TEXT, STRING, STORED, INT_INDEXED};
|
||||
use analyzer::AnalyzerManager;
|
||||
use query::Query;
|
||||
use schema::Field;
|
||||
use super::QueryParser;
|
||||
@@ -314,7 +330,8 @@ mod test {
|
||||
schema_builder.add_text_field("nottokenized", STRING);
|
||||
let schema = schema_builder.build();
|
||||
let default_fields = vec![title, text];
|
||||
QueryParser::new(schema, default_fields)
|
||||
let analyzer_manager = AnalyzerManager::default();
|
||||
QueryParser::new(schema, default_fields, analyzer_manager)
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user