From 7e5f697d0099081fa6d8aa1e89ed8f2cc1e9d771 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 9 Sep 2018 16:23:56 +0900 Subject: [PATCH] Closes #387 --- Cargo.toml | 1 - src/lib.rs | 2 - src/tokenizer/japanese_tokenizer.rs | 94 ----------------------------- src/tokenizer/mod.rs | 24 -------- src/tokenizer/tokenizer_manager.rs | 2 - 5 files changed, 123 deletions(-) delete mode 100644 src/tokenizer/japanese_tokenizer.rs diff --git a/Cargo.toml b/Cargo.toml index 098ab91c4..53a318fc2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,6 @@ keywords = ["search", "information", "retrieval"] base64 = "0.9.1" byteorder = "1.0" lazy_static = "1" -tinysegmenter = "0.1.0" regex = "1.0" fst = {version="0.3", default-features=false} fst-regex = { version="0.2" } diff --git a/src/lib.rs b/src/lib.rs index e5a75cd64..d6073eee1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -168,8 +168,6 @@ extern crate rand; #[cfg(all(test, feature = "unstable"))] extern crate test; -extern crate tinysegmenter; - #[macro_use] extern crate downcast; diff --git a/src/tokenizer/japanese_tokenizer.rs b/src/tokenizer/japanese_tokenizer.rs deleted file mode 100644 index 5b072e380..000000000 --- a/src/tokenizer/japanese_tokenizer.rs +++ /dev/null @@ -1,94 +0,0 @@ -use super::{Token, TokenStream, Tokenizer}; -use tinysegmenter; - -/// Simple japanese tokenizer based on the `tinysegmenter` crate. -#[derive(Clone)] -pub struct JapaneseTokenizer; - -#[derive(Eq, PartialEq)] -enum Cursor { - HasNotStarted, - Cursor(usize), - Terminated, -} - -pub struct JapaneseTokenizerStream { - tokens: Vec, - cursor: Cursor, -} - -impl<'a> Tokenizer<'a> for JapaneseTokenizer { - type TokenStreamImpl = JapaneseTokenizerStream; - - fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { - let mut tokens = vec![]; - let mut offset_from; - let mut offset_to = 0; - for (pos, term) in tinysegmenter::tokenize(text).into_iter().enumerate() { - offset_from = offset_to; - offset_to = offset_from + term.len(); - if term.chars().all(char::is_alphanumeric) { - tokens.push(Token { - offset_from, - offset_to, - position: pos, - text: term, - position_length: 1 - }); - } - } - JapaneseTokenizerStream { - tokens, - cursor: Cursor::HasNotStarted, - } - } -} - -impl<'a> TokenStream for JapaneseTokenizerStream { - fn advance(&mut self) -> bool { - let new_cursor = match self.cursor { - Cursor::HasNotStarted => { - if self.tokens.is_empty() { - Cursor::Terminated - } else { - Cursor::Cursor(0) - } - } - Cursor::Cursor(pos) => { - let new_pos = pos + 1; - if new_pos >= self.tokens.len() { - Cursor::Terminated - } else { - Cursor::Cursor(new_pos) - } - } - Cursor::Terminated => Cursor::Terminated, - }; - self.cursor = new_cursor; - self.cursor != Cursor::Terminated - } - - fn token(&self) -> &Token { - match self.cursor { - Cursor::Terminated => { - panic!("You called .token(), after the end of the token stream has been reached"); - } - Cursor::Cursor(i) => &self.tokens[i], - Cursor::HasNotStarted => { - panic!("You called .token(), before having called `.advance()`."); - } - } - } - - fn token_mut(&mut self) -> &mut Token { - match self.cursor { - Cursor::Terminated => { - panic!("You called .token(), after the end of the token stream has been reached"); - } - Cursor::Cursor(i) => &mut self.tokens[i], - Cursor::HasNotStarted => { - panic!("You called .token(), before having called `.advance()`."); - } - } - } -} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index fd0bfbbde..0b1c68339 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -130,7 +130,6 @@ //! mod alphanum_only; mod facet_tokenizer; -mod japanese_tokenizer; mod lower_caser; mod ngram_tokenizer; mod raw_tokenizer; @@ -144,7 +143,6 @@ mod tokenizer_manager; pub use self::alphanum_only::AlphaNumOnlyFilter; pub use self::facet_tokenizer::FacetTokenizer; -pub use self::japanese_tokenizer::JapaneseTokenizer; pub use self::lower_caser::LowerCaser; pub use self::ngram_tokenizer::NgramTokenizer; pub use self::raw_tokenizer::RawTokenizer; @@ -224,28 +222,6 @@ pub mod test { assert_token(&tokens[3], 3, "payer", 17, 22); } - #[test] - fn test_jp_tokenizer() { - let tokenizer_manager = TokenizerManager::default(); - let en_tokenizer = tokenizer_manager.get("ja").unwrap(); - - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { - tokens.push(token.clone()); - }; - en_tokenizer - .token_stream("野菜食べないとやばい!") - .process(&mut add_token); - } - assert_eq!(tokens.len(), 5); - assert_token(&tokens[0], 0, "野菜", 0, 6); - assert_token(&tokens[1], 1, "食べ", 6, 12); - assert_token(&tokens[2], 2, "ない", 12, 18); - assert_token(&tokens[3], 3, "と", 18, 21); - assert_token(&tokens[4], 4, "やばい", 21, 30); - } - #[test] fn test_ngram_tokenizer() { use super::{LowerCaser, NgramTokenizer}; diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index cbb46af3b..981962a66 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -2,7 +2,6 @@ use std::collections::HashMap; use std::sync::{Arc, RwLock}; use tokenizer::tokenizer::box_tokenizer; use tokenizer::BoxedTokenizer; -use tokenizer::JapaneseTokenizer; use tokenizer::LowerCaser; use tokenizer::RawTokenizer; use tokenizer::RemoveLongFilter; @@ -74,7 +73,6 @@ impl Default for TokenizerManager { .filter(LowerCaser) .filter(Stemmer::new()), ); - manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40))); manager } }