From 389cdffb4b55cd5ec75cdac7e9cdec5ffb06f0f0 Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Sun, 1 May 2016 15:08:44 +0900
Subject: [PATCH] Move analyzer and collector out of core into top-level modules

---
 src/core/analyzer.rs  |  95 --------------------
 src/core/collector.rs | 201 ------------------------------------------
 src/core/merger.rs    |   4 +-
 src/core/mod.rs       |   2 -
 src/core/searcher.rs  |   2 +-
 src/core/writer.rs    |   4 +-
 src/lib.rs            |   4 +-
 7 files changed, 7 insertions(+), 305 deletions(-)
 delete mode 100644 src/core/analyzer.rs
 delete mode 100644 src/core/collector.rs

diff --git a/src/core/analyzer.rs b/src/core/analyzer.rs
deleted file mode 100644
index 09bb9eff5..000000000
--- a/src/core/analyzer.rs
+++ /dev/null
@@ -1,95 +0,0 @@
-extern crate regex;
-
-use std::str::Chars;
-
-pub struct TokenIter<'a> {
-    chars: Chars<'a>,
-    term_buffer: String,
-}
-
-fn append_char_lowercase(c: char, term_buffer: &mut String) {
-    for c_lower in c.to_lowercase() {
-        term_buffer.push(c_lower);
-    }
-}
-
-pub trait StreamingIterator<'a, T> {
-    fn next(&'a mut self) -> Option<T>;
-}
-
-impl<'a, 'b> TokenIter<'b> {
-    fn consume_token(&'a mut self) -> Option<&'a str> {
-        loop {
-            match self.chars.next() {
-                Some(c) => {
-                    if c.is_alphanumeric() {
-                        append_char_lowercase(c, &mut self.term_buffer);
-                    }
-                    else {
-                        break;
-                    }
-                },
-                None => {
-                    break;
-                }
-            }
-        }
-        return Some(&self.term_buffer);
-    }
-}
-
-
-impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
-
-    fn next(&'a mut self,) -> Option<&'a str> {
-        self.term_buffer.clear();
-        // skipping non-letter characters.
-        loop {
-            match self.chars.next() {
-                Some(c) => {
-                    if c.is_alphanumeric() {
-                        append_char_lowercase(c, &mut self.term_buffer);
-                        return self.consume_token();
-                    }
-                }
-                None => { return None; }
-            }
-        }
-    }
-}
-
-pub struct SimpleTokenizer;
-
-
-impl SimpleTokenizer {
-    pub fn new() -> SimpleTokenizer {
-        SimpleTokenizer
-    }
-
-    pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
-        TokenIter {
-            term_buffer: String::new(),
-            chars: text.chars(),
-        }
-    }
-}
-
-
-#[test]
-fn test_tokenizer() {
-    let simple_tokenizer = SimpleTokenizer::new();
-    let mut term_reader = simple_tokenizer.tokenize("hello, happy tax payer!");
-    assert_eq!(term_reader.next().unwrap(), "hello");
-    assert_eq!(term_reader.next().unwrap(), "happy");
-    assert_eq!(term_reader.next().unwrap(), "tax");
-    assert_eq!(term_reader.next().unwrap(), "payer");
-    assert_eq!(term_reader.next(), None);
-}
-
-
-#[test]
-fn test_tokenizer_empty() {
-    let simple_tokenizer = SimpleTokenizer::new();
-    let mut term_reader = simple_tokenizer.tokenize("");
-    assert_eq!(term_reader.next(), None);
-}
diff --git a/src/core/collector.rs b/src/core/collector.rs
deleted file mode 100644
index d9dfc3960..000000000
--- a/src/core/collector.rs
+++ /dev/null
@@ -1,201 +0,0 @@
-use DocId;
-use core::reader::SegmentReader;
-use core::searcher::SegmentLocalId;
-use core::searcher::DocAddress;
-use fastfield::U32FastFieldReader;
-use schema::U32Field;
-use std::io;
-
-pub trait Collector {
-    fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>;
-    fn collect(&mut self, doc_id: DocId);
-}
-
-pub struct FirstNCollector {
-    docs: Vec<DocAddress>,
-    current_segment: u32,
-    limit: usize,
-}
-
-impl FirstNCollector {
-    pub fn with_limit(limit: usize) -> FirstNCollector {
-        FirstNCollector {
-            docs: Vec::new(),
-            limit: limit,
-            current_segment: 0,
-        }
-    }
-
-    pub fn docs(self,) -> Vec<DocAddress> {
-        self.docs
-    }
-}
-
-impl Collector for FirstNCollector {
-
-    fn set_segment(&mut self, segment_local_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
-        self.current_segment = segment_local_id;
-        Ok(())
-    }
-
-    fn collect(&mut self, doc_id: DocId) {
-        if self.docs.len() < self.limit {
-            self.docs.push(DocAddress(self.current_segment.clone(), doc_id));
-        }
-    }
-}
-
-pub struct CountCollector {
-    count: usize,
-}
-
-impl CountCollector {
-    pub fn new() -> CountCollector {
-        CountCollector {
-            count: 0,
-        }
-    }
-
-    pub fn count(&self,) -> usize {
-        self.count
-    }
-}
-
-impl Collector for CountCollector {
-
-    fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
-        Ok(())
-    }
-
-    fn collect(&mut self, _: DocId) {
-        self.count += 1;
-    }
-}
-
-pub struct MultiCollector<'a> {
-    collectors: Vec<&'a mut Collector>,
-}
-
-impl<'a> MultiCollector<'a> {
-    pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
-        MultiCollector {
-            collectors: collectors,
-        }
-    }
-}
-
-impl<'a> Collector for MultiCollector<'a> {
-
-    fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
-        for collector in self.collectors.iter_mut() {
-            try!(collector.set_segment(segment_local_id, segment));
-        }
-        Ok(())
-    }
-
-    fn collect(&mut self, doc_id: DocId) {
-        for collector in self.collectors.iter_mut() {
-            collector.collect(doc_id);
-        }
-    }
-}
-
-pub struct TestCollector {
-    offset: DocId,
-    segment_max_doc: DocId,
-    docs: Vec<DocId>,
-}
-
-impl TestCollector {
-    pub fn new() -> TestCollector {
-        TestCollector {
-            docs: Vec::new(),
-            offset: 0,
-            segment_max_doc: 0,
-        }
-    }
-
-    pub fn docs(self,) -> Vec<DocId> {
-        self.docs
-    }
-}
-
-impl Collector for TestCollector {
-
-    fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
-        self.offset += self.segment_max_doc;
-        self.segment_max_doc = reader.max_doc();
-        Ok(())
-    }
-
-    fn collect(&mut self, doc_id: DocId) {
-        self.docs.push(doc_id + self.offset);
-    }
-}
-
-
-pub struct FastFieldTestCollector {
-    vals: Vec<u32>,
-    u32_field: U32Field,
-    ff_reader: Option<U32FastFieldReader>,
-}
-
-impl FastFieldTestCollector {
-    pub fn for_field(u32_field: U32Field) -> FastFieldTestCollector {
-        FastFieldTestCollector {
-            vals: Vec::new(),
-            u32_field: u32_field,
-            ff_reader: None,
-        }
-    }
-
-    pub fn vals(&self,) -> &Vec<u32> {
-        &self.vals
-    }
-}
-
-impl Collector for FastFieldTestCollector {
-
-    fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
-        self.ff_reader = Some(try!(reader.get_fast_field_reader(&self.u32_field)));
-        Ok(())
-    }
-
-    fn collect(&mut self, doc_id: DocId) {
-        let val = self.ff_reader.as_ref().unwrap().get(doc_id);
-        self.vals.push(val);
-    }
-}
-
-
-
-#[cfg(test)]
-mod tests {
-
-    use super::*;
-    use test::Bencher;
-
-    #[bench]
-    fn build_collector(b: &mut Bencher) {
-        b.iter(|| {
-            let mut count_collector = CountCollector::new();
-            let docs: Vec<DocId> = (0..1_000_000).collect();
-            for doc in docs {
-                count_collector.collect(doc);
-            }
-            count_collector.count()
-        });
-    }
-
-    // #[bench]
-    // fn build_first_3_collector(b: &mut Bencher) {
-    //     b.iter(|| {
-    //         let mut first3collector = FirstNCollector::with_limit(3);
-    //         let docs: Vec<DocId> = (0..1_000_000).collect();
-    //         for doc in docs {
-    //             first3collector.collect(doc);
-    //         }
-    //         first3collector.docs()
-    //     });
-    // }
-}
diff --git a/src/core/merger.rs b/src/core/merger.rs
index 4d7456510..10a6723a1 100644
--- a/src/core/merger.rs
+++ b/src/core/merger.rs
@@ -215,8 +215,8 @@ mod tests {
     use schema::Term;
     use core::index::Index;
     use core::searcher::DocAddress;
-    use core::collector::FastFieldTestCollector;
-    use core::collector::TestCollector;
+    use collector::FastFieldTestCollector;
+    use collector::TestCollector;
 
     #[test]
     fn test_index_merger() {
diff --git a/src/core/mod.rs b/src/core/mod.rs
index d5fc86b7e..b97d15b35 100644
--- a/src/core/mod.rs
+++ b/src/core/mod.rs
@@ -1,9 +1,7 @@
 pub mod writer;
-pub mod analyzer;
 pub mod reader;
 pub mod codec;
 pub mod searcher;
-pub mod collector;
 pub mod index;
 pub mod merger;
 
diff --git a/src/core/searcher.rs b/src/core/searcher.rs
index ddc836f8c..35857dd80 100644
--- a/src/core/searcher.rs
+++ b/src/core/searcher.rs
@@ -3,7 +3,7 @@ use core::index::Index;
 use core::index::Segment;
 use DocId;
 use schema::{Document, Term};
-use core::collector::Collector;
+use collector::Collector;
 use std::io;
 use common::TimerTree;
 
diff --git a/src/core/writer.rs b/src/core/writer.rs
index 4d4f1ba2c..177131032 100644
--- a/src/core/writer.rs
+++ b/src/core/writer.rs
@@ -5,9 +5,9 @@ use schema::Term;
 use schema::TextFieldValue;
 use core::codec::*;
 use core::index::Index;
-use core::analyzer::SimpleTokenizer;
+use analyzer::SimpleTokenizer;
 use core::index::SerializableSegment;
-use core::analyzer::StreamingIterator;
+use analyzer::StreamingIterator;
 use core::index::Segment;
 use core::index::SegmentInfo;
 use postings::PostingsWriter;
diff --git a/src/lib.rs b/src/lib.rs
index 883b43e5e..746d02748 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -35,16 +35,16 @@ mod compression;
 mod fastfield;
 mod store;
 mod common;
+pub mod analyzer;
+pub mod collector;
 pub mod schema;
 
 pub use directory::Directory;
 
-pub use core::analyzer;
 pub use core::searcher::Searcher;
 pub use core::index::Index;
 pub use schema::Term;
 pub use schema::Document;
-pub use core::collector;
 pub use core::reader::SegmentReader;
 pub use core::searcher::SegmentLocalId;
 
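Note: this patch carries only the delete side of the move plus the import updates; the new top-level src/analyzer.rs and src/collector.rs are presumably added in a separate change, since lib.rs now declares them as public modules. Below is a minimal usage sketch of the relocated modules under their new paths, mirroring the deleted unit tests and bench. It assumes the crate root is named tantivy and that the moved code is otherwise unchanged; both are assumptions, not part of the patch.

// Usage sketch only; crate name `tantivy` and unchanged module contents
// are assumptions not confirmed by this patch.
extern crate tantivy;

// The traits must be in scope to call next() and collect().
use tantivy::analyzer::{SimpleTokenizer, StreamingIterator};
use tantivy::collector::{Collector, CountCollector};

fn main() {
    // `analyzer` is now a top-level module (was `core::analyzer`).
    // SimpleTokenizer lowercases alphanumeric runs and skips everything else.
    let tokenizer = SimpleTokenizer::new();
    let mut tokens = tokenizer.tokenize("hello, happy tax payer!");
    while let Some(token) = tokens.next() {
        println!("token: {}", token); // hello, happy, tax, payer
    }

    // `collector` is now a top-level module (was `core::collector`).
    // CountCollector simply counts the doc ids fed to it.
    let mut count_collector = CountCollector::new();
    for doc_id in 0..10 {
        count_collector.collect(doc_id);
    }
    assert_eq!(count_collector.count(), 10);
}

The crate root previously re-exported these modules via pub use core::analyzer and pub use core::collector, so the public paths tantivy::analyzer and tantivy::collector stay stable for users; the move just removes the indirection through core.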