Implement StopWords Filter (#292)

* Implement StopWords Filter

- added example doctest for alphanum_only.rs so that I could
drive my own test of the stopword filter

* Style Cop

* Switch HashSet Hasher to FNV for speed

* Update Change Log

* fix missed location renaming
This commit is contained in:
Dru Sellers
2018-05-09 20:40:42 -05:00
committed by Paul Masurel
parent 96b2c2971e
commit 82d87416c2
8 changed files with 165 additions and 23 deletions

View File

@@ -7,6 +7,8 @@ Tantivy 0.6
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
- Completely uncompressed
- Internally: One u64 fast field for indexes, one fast field for the bytes themselves.
- Add NGram token support (@drusellers)
- Add Stopword Filter support (@drusellers)
Tantivy 0.5.2
===========================
@@ -91,7 +93,7 @@ Tantivy 0.3
Special thanks to @Kodraus @lnicola @Ameobea @manuel-woelker @celaus
for their contribution to this release.
Thanks also to everyone in tantivy gitter chat
Thanks also to everyone in tantivy gitter chat
for their advice and company :)
https://gitter.im/tantivy-search/tantivy
@@ -99,9 +101,9 @@ https://gitter.im/tantivy-search/tantivy
Warning:
Tantivy 0.3 is NOT backward compatible with tantivy 0.2
Tantivy 0.3 is NOT backward compatible with tantivy 0.2
code and index format.
You should not expect backward compatibility before
You should not expect backward compatibility before
tantivy 1.0.
@@ -127,7 +129,7 @@ Thanks to @KodrAus ! (#108)
the natural ordering.
- Building binary targets for tantivy-cli (Thanks to @KodrAus)
- Misc invisible bug fixes, and code cleanup.
- Use
- Use

View File

@@ -42,6 +42,7 @@ rust-stemmers = "0.1.0"
downcast = { version="0.9" }
matches = "0.1"
bitpacking = "0.4"
fnv = "1.0.6"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"

View File

@@ -134,6 +134,7 @@ extern crate byteorder;
extern crate chan;
extern crate combine;
extern crate crossbeam;
extern crate fnv;
extern crate fst;
extern crate futures;
extern crate futures_cpupool;

View File

@@ -1,3 +1,28 @@
//! # Example
//! ```
//! extern crate tantivy;
//! use tantivy::tokenizer::*;
//!
//! # fn main() {
//!
//! let tokenizer = RawTokenizer
//! .filter(AlphaNumOnlyFilter);
//!
//! let mut stream = tokenizer.token_stream("hello there");
//! // is none because the raw tokenizer emits one token that
//! // contains a space
//! assert!(stream.next().is_none());
//!
//! let tokenizer = SimpleTokenizer
//! .filter(AlphaNumOnlyFilter);
//!
//! let mut stream = tokenizer.token_stream("hello there 💣");
//! assert!(stream.next().is_some());
//! assert!(stream.next().is_some());
//! // the "emoji" is dropped because it's not alphanumeric
//! assert!(stream.next().is_none());
//! # }
//! ```
use super::{Token, TokenFilter, TokenStream};
/// `TokenFilter` that removes all tokens that contain non
@@ -49,14 +74,12 @@ where
}
fn advance(&mut self) -> bool {
loop {
if self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
} else {
return false;
while self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
}
false
}
}

View File

@@ -137,6 +137,7 @@ mod raw_tokenizer;
mod remove_long;
mod simple_tokenizer;
mod stemmer;
mod stop_word_filter;
mod token_stream_chain;
mod tokenizer;
mod tokenizer_manager;
@@ -150,6 +151,7 @@ pub use self::raw_tokenizer::RawTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::stemmer::Stemmer;
pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain;
pub use self::tokenizer::BoxedTokenizer;
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

View File

@@ -68,7 +68,7 @@ impl NgramTokenizer {
}
pub struct NgramTokenStream<'a> {
text: &'a str,
location: usize,
position: usize,
text_length: usize,
token: Token,
min_gram: usize,
@@ -83,7 +83,7 @@ impl<'a> Tokenizer<'a> for NgramTokenizer {
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
NgramTokenStream {
text,
location: 0,
position: 0,
text_length: text.len(),
token: Token::default(),
min_gram: self.min_gram,
@@ -110,11 +110,11 @@ impl<'a> NgramTokenStream<'a> {
self.gram_size = self.min_gram;
// and move down the chain of letters
self.location += 1;
self.position += 1;
}
let result = if (self.location + self.gram_size) <= self.text_length {
Some((self.location, self.gram_size))
let result = if (self.position + self.gram_size) <= self.text_length {
Some((self.position, self.gram_size))
} else {
None
};

View File

@@ -1,3 +1,21 @@
//! # Example
//! ```
//! extern crate tantivy;
//! use tantivy::tokenizer::*;
//!
//! # fn main() {
//!
//! let tokenizer = SimpleTokenizer
//! .filter(RemoveLongFilter::limit(5));
//!
//! let mut stream = tokenizer.token_stream("toolong nice");
//! // because `toolong` is more than 5 characters, it is filtered
//! // out of the token stream.
//! assert_eq!(stream.next().unwrap().text, "nice");
//! assert!(stream.next().is_none());
//! # }
//! ```
//!
use super::{Token, TokenFilter, TokenStream};
/// `RemoveLongFilter` removes tokens that are longer
@@ -68,14 +86,12 @@ where
}
fn advance(&mut self) -> bool {
loop {
if self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
} else {
return false;
while self.tail.advance() {
if self.predicate(self.tail.token()) {
return true;
}
}
false
}
}

View File

@@ -0,0 +1,97 @@
//! # Example
//! ```
//! extern crate tantivy;
//! use tantivy::tokenizer::*;
//!
//! # fn main() {
//! let tokenizer = SimpleTokenizer
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]));
//!
//! let mut stream = tokenizer.token_stream("the fox is crafty");
//! assert_eq!(stream.next().unwrap().text, "fox");
//! assert_eq!(stream.next().unwrap().text, "crafty");
//! assert!(stream.next().is_none());
//! # }
//! ```
use super::{Token, TokenFilter, TokenStream};
use fnv::FnvHasher;
use std::collections::HashSet;
use std::hash::BuildHasherDefault;
use std::sync::Arc;
// Stop words are hashed with FNV instead of the standard library's default
// hasher; the choice was made for speed.
type StopWordHasher = BuildHasherDefault<FnvHasher>;
// Set of owned stop-word strings, hashed with `StopWordHasher`.
type StopWordHashSet = HashSet<String, StopWordHasher>;
/// `TokenFilter` that removes stop words from a token stream
#[derive(Clone)]
pub struct StopWordFilter {
    // Kept behind an `Arc` so that cloning the filter, or handing the set to
    // a freshly created token stream, only bumps a reference count instead of
    // copying every stop word.
    words: Arc<StopWordHashSet>,
}

impl StopWordFilter {
    /// Creates a `StopWordFilter` given a list of words to remove
    ///
    /// Duplicate entries in `words` are collapsed, since the words are stored
    /// in a set.
    pub fn remove(words: Vec<String>) -> StopWordFilter {
        let set: StopWordHashSet = words.into_iter().collect();
        StopWordFilter {
            words: Arc::new(set),
        }
    }
}

/// Token stream produced by `StopWordFilter::transform`.
///
/// It wraps another token stream (`tail`) and only surfaces the tokens whose
/// text is not contained in the stop-word set.
pub struct StopWordFilterStream<TailTokenStream>
where
    TailTokenStream: TokenStream,
{
    // Shared with the `StopWordFilter` that created this stream.
    words: Arc<StopWordHashSet>,
    tail: TailTokenStream,
}

impl<TailTokenStream> TokenFilter<TailTokenStream> for StopWordFilter
where
    TailTokenStream: TokenStream,
{
    type ResultTokenStream = StopWordFilterStream<TailTokenStream>;

    /// Wraps `token_stream` so that stop words are skipped.
    ///
    /// The stop-word set is shared with the returned stream rather than
    /// copied, so creating a stream does not depend on the number of words.
    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
        StopWordFilterStream::wrap(Arc::clone(&self.words), token_stream)
    }
}

impl<TailTokenStream> StopWordFilterStream<TailTokenStream>
where
    TailTokenStream: TokenStream,
{
    /// Returns `true` when `token` should be kept, i.e. when its text is not
    /// one of the stop words.
    fn predicate(&self, token: &Token) -> bool {
        !self.words.contains(&token.text)
    }

    /// Builds a stream that filters `tail` using the shared set `words`.
    fn wrap(
        words: Arc<StopWordHashSet>,
        tail: TailTokenStream,
    ) -> StopWordFilterStream<TailTokenStream> {
        StopWordFilterStream { words, tail }
    }
}
impl<TailTokenStream: TokenStream> TokenStream for StopWordFilterStream<TailTokenStream> {
    /// Current token, read straight from the wrapped stream.
    fn token(&self) -> &Token {
        self.tail.token()
    }

    /// Mutable access to the current token of the wrapped stream.
    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }

    /// Moves the wrapped stream forward until it sits on a token accepted by
    /// `predicate`, and returns `true`.
    ///
    /// Returns `false` as soon as the wrapped stream's own `advance` returns
    /// `false`.
    fn advance(&mut self) -> bool {
        loop {
            if !self.tail.advance() {
                return false;
            }
            if self.predicate(self.tail.token()) {
                return true;
            }
        }
    }
}