diff --git a/Cargo.toml b/Cargo.toml
index 1a51fbc56..6017171ad 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,3 +8,4 @@ byteorder = "0.4.2"
 itertools = "0.4.5"
 memmap = "0.2.3"
 lazy_static = "0.1.*"
+regex = "0.1"
diff --git a/src/core/analyzer.rs b/src/core/analyzer.rs
index 598867b4c..50e6db624 100644
--- a/src/core/analyzer.rs
+++ b/src/core/analyzer.rs
@@ -1,6 +1,15 @@
+extern crate regex;
+
+use self::regex::Regex;
+
+lazy_static! {
+    static ref WORD_PTN: Regex = Regex::new(r"[a-zA-Z0-9]+").unwrap();
+}
+
 pub struct TokenIter<'a> {
-    text: &'a String,
+    text: &'a str,
+    token_it: Box<Iterator<Item=(usize, usize)> + 'a>,
 }
 
 
 impl<'a> Iterator for TokenIter<'a> {
@@ -8,13 +17,14 @@ impl<'a> Iterator for TokenIter<'a> {
     type Item = &'a str;
 
     fn next(&mut self) -> Option<&'a str> {
-        None
+        self.token_it.next().map(|(start, end)| &self.text[start..end])
     }
 }
 
 
-pub fn tokenize<'a>(text: &'a String)->TokenIter<'a> {
+pub fn tokenize<'a>(text: &'a str)->TokenIter<'a> {
     TokenIter {
-        text: text
+        text: text,
+        token_it: Box::new(WORD_PTN.find_iter(text)),
     }
 }
diff --git a/src/core/dictionary.rs b/src/core/dictionary.rs
deleted file mode 100644
index 8b1378917..000000000
--- a/src/core/dictionary.rs
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/src/core/postings.rs b/src/core/postings.rs
index 3c01f365c..b97a90064 100644
--- a/src/core/postings.rs
+++ b/src/core/postings.rs
@@ -1,8 +1,8 @@
 use std::fmt;
 use std::fmt::{Debug, Formatter};
-// use std::core::slice;
 use std::io::prelude::Read;
 use core::global::DocId;
+// use std::core::slice;
 // use core::schema::{Field, Term};
 // use std::slice;
 use std::vec;
@@ -13,6 +13,7 @@ pub trait Postings {
 }
 
 
+
 #[derive(Clone)]
 pub struct SimplePostings<R: Read> {
     reader: R,
diff --git a/src/core/schema.rs b/src/core/schema.rs
index 8b11acce7..0d8ea85ee 100644
--- a/src/core/schema.rs
+++ b/src/core/schema.rs
@@ -1,8 +1,8 @@
 use std::collections::HashMap;
 use std::sync::{Mutex, MutexGuard};
 
-#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
-pub struct Field(&'static str);
+#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
+pub struct Field(pub &'static str);
 
 
 #[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
@@ -12,7 +12,7 @@ pub struct FieldValue {
 }
 
 
-#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
+#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
 pub struct Term<'a> {
     pub field: &'a Field,
     pub text: &'a str,
@@ -32,10 +32,10 @@ impl Document {
         }
     }
 
 
-    pub fn set(&mut self, field: &Field, text: &String) {
+    pub fn set(&mut self, field: Field, text: &str) {
         self.add(FieldValue {
-            field: (*field).clone(),
-            text: (*text).clone()
+            field: field,
+            text: String::from(text)
         });
     }
diff --git a/src/core/writer.rs b/src/core/writer.rs
index c6723b889..a04361071 100644
--- a/src/core/writer.rs
+++ b/src/core/writer.rs
@@ -2,55 +2,86 @@ use std::io;
 
 use core::schema::Document;
 use core::schema::Term;
+use core::schema::Field ;
 use core::analyzer::tokenize;
-use std::collections::HashMap;
-//
-// struct TermDictionary {
-//     map: HashMap,
-// }
-//
-// struct TermId(usize);
-//
-// impl TermDictionary {
-//
-//     pub fn new() -> TermDictionary {
-//         TermDictionary {
-//             map: HashMap::new(),
-//         }
-//     }
-//
-//     pub fn term_id(&mut self, term: &Term) -> TermId {
-//         match self.map.get(term) {
-//             Some(usize) => { return TermId(usize); },
-//             None => {}
-//         }
-//         let term_id = self.map.len();
-//         self.map.insert(term, term_id);
-//         TermId(term_id)
-//
-//     }
-// }
+use std::collections::{HashMap, BTreeMap};
+use core::DocId;
 
-struct IndexWriter {
+pub struct PostingsWriter {
+    doc_ids: Vec<DocId>,
+}
+
+impl PostingsWriter {
+
+    pub fn new()->PostingsWriter {
+        PostingsWriter {
+            doc_ids: Vec::new(),
+        }
+    }
+
+    pub fn suscribe(&mut self, doc_id: DocId) {
+        self.doc_ids.push(doc_id);
+    }
+}
+
+struct FieldWriter {
+    postings: Vec<PostingsWriter>,
+    term_index: BTreeMap<String, usize>,
+}
+
+impl FieldWriter {
+    pub fn new() -> FieldWriter {
+        FieldWriter {
+            term_index: BTreeMap::new(),
+            postings: Vec::new()
+        }
+    }
+
+    pub fn get_postings_writer(&mut self, term_text: &str) -> &mut PostingsWriter {
+        match self.term_index.get(term_text) {
+            Some(unord_id) => {
+                return &mut self.postings[*unord_id];
+            },
+            None => {}
+        }
+        let unord_id = self.term_index.len();
+        self.postings.push(PostingsWriter::new());
+        self.term_index.insert(String::from(term_text), unord_id.clone());
+        &mut self.postings[unord_id]
+    }
+
+    pub fn suscribe(&mut self, doc: DocId, term_text: &str) {
+        self.get_postings_writer(term_text).suscribe(doc);
+    }
+}
+
+pub struct IndexWriter {
     max_doc: usize,
-
+    term_writers: HashMap<Field, FieldWriter>,
 }
 
 
 impl IndexWriter {
 
-    fn suscribe(&mut self, term: &Term, doc_id: usize) {
+    pub fn new() -> IndexWriter {
+        IndexWriter {
+            max_doc: 0,
+            term_writers: HashMap::new(),
+        }
+    }
+    fn get_field_writer(&mut self, field: &Field) -> &mut FieldWriter {
+        if !self.term_writers.contains_key(field) {
+            self.term_writers.insert((*field).clone(), FieldWriter::new());
+        }
+        self.term_writers.get_mut(field).unwrap()
     }
 
     pub fn add(&mut self, doc: Document) {
         let doc_id = self.max_doc;
         for field_value in doc {
+            let field = field_value.field;
+            let field_writer = self.get_field_writer(&field);
             for token in tokenize(&field_value.text) {
-                let term = Term {
-                    field: &field_value.field,
-                    text: &token
-                };
-                self.suscribe(&term, doc_id);
+                field_writer.suscribe(doc_id, token);
             }
         }
         self.max_doc += 1;
diff --git a/tests/core.rs b/tests/core.rs
index 1b6f0b38d..7214ab6df 100644
--- a/tests/core.rs
+++ b/tests/core.rs
@@ -4,6 +4,9 @@ extern crate itertools;
 use parici::core::DocId;
 use parici::core::postings::{VecPostings, intersection};
 use parici::core::postings::Postings;
+use parici::core::analyzer::tokenize;
+use parici::core::writer::IndexWriter;
+use parici::core::schema::{Field, Document};
 
 #[test]
 fn test_intersection() {
@@ -11,5 +14,19 @@ fn test_intersection() {
     let right = VecPostings::new(vec!(3, 4, 9, 18));
     let inter = intersection(&left, &right);
     let vals: Vec<DocId> = inter.iter().collect();
-    itertools::assert_equal(vals, vec!(3, 9));
+    assert_eq!(vals, vec!(3, 9));
 }
+
+#[test]
+fn test_tokenizer() {
+    let words: Vec<&str> = tokenize("hello happy tax payer!").collect();
+    assert_eq!(words, vec!("hello", "happy", "tax", "payer"));
+}
+
+#[test]
+fn test_indexing() {
+    let mut index_writer = IndexWriter::new();
+    let mut doc = Document::new();
+    doc.set(Field("text"), &String::from("toto"));
+    index_writer.add(doc);
+}