mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-03 00:50:41 +00:00
Implement StopWords Filter (#292)
* Implement StopWords Filter - added example doctest for alphanum_only.rs so that I could drive my own test of the stopword filter * Style Cop * Switch HashSet Hasher to FNV for speed * Update Change Log * fix missed location renaming
This commit is contained in:
committed by
Paul Masurel
parent
96b2c2971e
commit
82d87416c2
10
CHANGELOG.md
10
CHANGELOG.md
@@ -7,6 +7,8 @@ Tantivy 0.6
|
||||
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
|
||||
- Completely uncompressed
|
||||
- Internally: One u64 fast field for indexes, one fast field for the bytes themselves.
|
||||
- Add NGram token support (@drusellers)
|
||||
- Add Stopword Filter support (@drusellers)
|
||||
|
||||
Tantivy 0.5.2
|
||||
===========================
|
||||
@@ -91,7 +93,7 @@ Tantivy 0.3
|
||||
Special thanks to @Kodraus @lnicola @Ameobea @manuel-woelker @celaus
|
||||
for their contribution to this release.
|
||||
|
||||
Thanks also to everyone in tantivy gitter chat
|
||||
Thanks also to everyone in tantivy gitter chat
|
||||
for their advice and company :)
|
||||
|
||||
https://gitter.im/tantivy-search/tantivy
|
||||
@@ -99,9 +101,9 @@ https://gitter.im/tantivy-search/tantivy
|
||||
|
||||
Warning:
|
||||
|
||||
Tantivy 0.3 is NOT backward compatible with tantivy 0.2
|
||||
Tantivy 0.3 is NOT backward compatible with tantivy 0.2
|
||||
code and index format.
|
||||
You should not expect backward compatibility before
|
||||
You should not expect backward compatibility before
|
||||
tantivy 1.0.
|
||||
|
||||
|
||||
@@ -127,7 +129,7 @@ Thanks to @KodrAus ! (#108)
|
||||
the natural ordering.
|
||||
- Building binary targets for tantivy-cli (Thanks to @KodrAus)
|
||||
- Misc invisible bug fixes, and code cleanup.
|
||||
- Use
|
||||
- Use
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -42,6 +42,7 @@ rust-stemmers = "0.1.0"
|
||||
downcast = { version="0.9" }
|
||||
matches = "0.1"
|
||||
bitpacking = "0.4"
|
||||
fnv = "1.0.6"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.2"
|
||||
|
||||
@@ -134,6 +134,7 @@ extern crate byteorder;
|
||||
extern crate chan;
|
||||
extern crate combine;
|
||||
extern crate crossbeam;
|
||||
extern crate fnv;
|
||||
extern crate fst;
|
||||
extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
|
||||
@@ -1,3 +1,28 @@
|
||||
//! # Example
|
||||
//! ```
|
||||
//! extern crate tantivy;
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! # fn main() {
|
||||
//!
|
||||
//! let tokenizer = RawTokenizer
|
||||
//! .filter(AlphaNumOnlyFilter);
|
||||
//!
|
||||
//! let mut stream = tokenizer.token_stream("hello there");
|
||||
//! // is none because the raw tokenizer emits one token that
|
||||
//! // contains a space
|
||||
//! assert!(stream.next().is_none());
|
||||
//!
|
||||
//! let tokenizer = SimpleTokenizer
|
||||
//! .filter(AlphaNumOnlyFilter);
|
||||
//!
|
||||
//! let mut stream = tokenizer.token_stream("hello there 💣");
|
||||
//! assert!(stream.next().is_some());
|
||||
//! assert!(stream.next().is_some());
|
||||
//! // the "emoji" is dropped because it's not an alphanum
|
||||
//! assert!(stream.next().is_none());
|
||||
//! # }
|
||||
//! ```
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
|
||||
/// `TokenFilter` that removes all tokens that contain non
|
||||
@@ -49,14 +74,12 @@ where
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
loop {
|
||||
if self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
while self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,6 +137,7 @@ mod raw_tokenizer;
|
||||
mod remove_long;
|
||||
mod simple_tokenizer;
|
||||
mod stemmer;
|
||||
mod stop_word_filter;
|
||||
mod token_stream_chain;
|
||||
mod tokenizer;
|
||||
mod tokenizer_manager;
|
||||
@@ -150,6 +151,7 @@ pub use self::raw_tokenizer::RawTokenizer;
|
||||
pub use self::remove_long::RemoveLongFilter;
|
||||
pub use self::simple_tokenizer::SimpleTokenizer;
|
||||
pub use self::stemmer::Stemmer;
|
||||
pub use self::stop_word_filter::StopWordFilter;
|
||||
pub(crate) use self::token_stream_chain::TokenStreamChain;
|
||||
pub use self::tokenizer::BoxedTokenizer;
|
||||
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
|
||||
@@ -68,7 +68,7 @@ impl NgramTokenizer {
|
||||
}
|
||||
pub struct NgramTokenStream<'a> {
|
||||
text: &'a str,
|
||||
location: usize,
|
||||
position: usize,
|
||||
text_length: usize,
|
||||
token: Token,
|
||||
min_gram: usize,
|
||||
@@ -83,7 +83,7 @@ impl<'a> Tokenizer<'a> for NgramTokenizer {
|
||||
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
NgramTokenStream {
|
||||
text,
|
||||
location: 0,
|
||||
position: 0,
|
||||
text_length: text.len(),
|
||||
token: Token::default(),
|
||||
min_gram: self.min_gram,
|
||||
@@ -110,11 +110,11 @@ impl<'a> NgramTokenStream<'a> {
|
||||
self.gram_size = self.min_gram;
|
||||
|
||||
// and move down the chain of letters
|
||||
self.location += 1;
|
||||
self.position += 1;
|
||||
}
|
||||
|
||||
let result = if (self.location + self.gram_size) <= self.text_length {
|
||||
Some((self.location, self.gram_size))
|
||||
let result = if (self.position + self.gram_size) <= self.text_length {
|
||||
Some((self.position, self.gram_size))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
@@ -1,3 +1,21 @@
|
||||
//! # Example
|
||||
//! ```
|
||||
//! extern crate tantivy;
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! # fn main() {
|
||||
//!
|
||||
//! let tokenizer = SimpleTokenizer
|
||||
//! .filter(RemoveLongFilter::limit(5));
|
||||
//!
|
||||
//! let mut stream = tokenizer.token_stream("toolong nice");
|
||||
//! // because `toolong` is more than 5 characters, it is filtered
|
||||
//! // out of the token stream.
|
||||
//! assert_eq!(stream.next().unwrap().text, "nice");
|
||||
//! assert!(stream.next().is_none());
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
|
||||
/// `RemoveLongFilter` removes tokens that are longer
|
||||
@@ -68,14 +86,12 @@ where
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
loop {
|
||||
if self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
while self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
97
src/tokenizer/stop_word_filter.rs
Normal file
97
src/tokenizer/stop_word_filter.rs
Normal file
@@ -0,0 +1,97 @@
|
||||
//! # Example
|
||||
//! ```
|
||||
//! extern crate tantivy;
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! # fn main() {
|
||||
//! let tokenizer = SimpleTokenizer
|
||||
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]));
|
||||
//!
|
||||
//! let mut stream = tokenizer.token_stream("the fox is crafty");
|
||||
//! assert_eq!(stream.next().unwrap().text, "fox");
|
||||
//! assert_eq!(stream.next().unwrap().text, "crafty");
|
||||
//! assert!(stream.next().is_none());
|
||||
//! # }
|
||||
//! ```
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
use fnv::FnvHasher;
|
||||
use std::collections::HashSet;
|
||||
use std::hash::BuildHasherDefault;
|
||||
|
||||
// configure our hashers for SPEED
|
||||
type StopWordHasher = BuildHasherDefault<FnvHasher>;
|
||||
type StopWordHashSet = HashSet<String, StopWordHasher>;
|
||||
|
||||
/// `TokenFilter` that removes stop words from a token stream
|
||||
#[derive(Clone)]
|
||||
pub struct StopWordFilter {
|
||||
words: StopWordHashSet,
|
||||
}
|
||||
|
||||
impl StopWordFilter {
|
||||
/// Creates a `StopWordFilter` given a list of words to remove
|
||||
pub fn remove(words: Vec<String>) -> StopWordFilter {
|
||||
let mut set = StopWordHashSet::default();
|
||||
|
||||
for word in words {
|
||||
set.insert(word);
|
||||
}
|
||||
|
||||
StopWordFilter { words: set }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StopWordFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
words: StopWordHashSet,
|
||||
tail: TailTokenStream,
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for StopWordFilter
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
type ResultTokenStream = StopWordFilterStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
StopWordFilterStream::wrap(self.words.clone(), token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TailTokenStream> StopWordFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn predicate(&self, token: &Token) -> bool {
|
||||
!self.words.contains(&token.text)
|
||||
}
|
||||
|
||||
fn wrap(words: StopWordHashSet, tail: TailTokenStream) -> StopWordFilterStream<TailTokenStream> {
|
||||
StopWordFilterStream { words, tail }
|
||||
}
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenStream for StopWordFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user