/// Iterator over the tokens of a borrowed piece of text.
///
/// A token is a maximal run of alphanumeric characters (as decided by
/// `char::is_alphanumeric`); every other character is treated as a separator
/// and is never part of a token. Tokens are sub-slices of the input, so
/// iterating does not allocate.
pub struct TokenIter<'a> {
    /// The part of the input that has not been tokenized yet.
    text: &'a str,
}

impl<'a> Iterator for TokenIter<'a> {
    type Item = &'a str;

    /// Returns the next token, or `None` once only separators (or nothing)
    /// remain.
    fn next(&mut self) -> Option<&'a str> {
        // Copy the reference out so the returned slice borrows for `'a`
        // instead of for the duration of the `&mut self` borrow.
        let text: &'a str = self.text;

        // Skip leading separators.
        let start = match text.find(char::is_alphanumeric) {
            Some(start) => start,
            None => {
                // Nothing but separators left: remember that we are done.
                self.text = "";
                return None;
            }
        };
        let candidate = &text[start..];

        // The token ends at the first separator, or at the end of the input.
        let token_len = candidate
            .find(|c: char| !c.is_alphanumeric())
            .unwrap_or(candidate.len());
        let (token, remaining) = candidate.split_at(token_len);
        self.text = remaining;
        Some(token)
    }
}

/// Splits `text` into tokens, lazily.
///
/// Accepts any `&str`; a `&String` argument keeps working through deref
/// coercion. See [`TokenIter`] for the definition of a token.
pub fn tokenize<'a>(text: &'a str) -> TokenIter<'a> {
    TokenIter { text: text }
}
b/src/core/dictionary.rs new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/src/core/dictionary.rs @@ -0,0 +1 @@ + diff --git a/src/core/directory.rs b/src/core/directory.rs new file mode 100644 index 000000000..14b604568 --- /dev/null +++ b/src/core/directory.rs @@ -0,0 +1,66 @@ + +extern crate memmap; + +use self::memmap::{Mmap, Protection}; +use std::path::PathBuf; +use std::fs::File; +use std::io; + +#[derive(Clone, Debug)] +struct SegmentId(String); + +struct IndexDirectory { + index_path: PathBuf, +} + +impl IndexDirectory { + + pub fn for_path(path: PathBuf)-> IndexDirectory { + IndexDirectory { + index_path: path, + } + } + + pub fn read_segment(&self, segment_id: &SegmentId) -> SegmentDirectory { + SegmentDirectory { + index_path: self.index_path.clone(), + segment_id: segment_id.clone() + } + } + + + +} + +enum SegmentComponent { + POSTINGS, + POSITIONS, +} + +struct SegmentDirectory { + index_path: PathBuf, + segment_id: SegmentId, +} + +impl SegmentDirectory { + + fn path_suffix(component: SegmentComponent)-> &'static str { + match component { + SegmentComponent::POSTINGS => ".pstgs", + SegmentComponent::POSITIONS => ".pos", + } + } + + fn get_file(&self, component: SegmentComponent) -> Result { + let mut res = self.index_path.clone(); + let SegmentId(ref segment_id_str) = self.segment_id; + let filename = String::new() + segment_id_str + "." 
+ SegmentDirectory::path_suffix(component); + res.push(filename); + File::open(res) + } + + pub fn open(&self, component: SegmentComponent) -> Result { + let file = try!(self.get_file(component)); + Mmap::open(&file, Protection::Read) + } +} diff --git a/src/core/global.rs b/src/core/global.rs new file mode 100644 index 000000000..a64f95b70 --- /dev/null +++ b/src/core/global.rs @@ -0,0 +1,2 @@ + +pub type DocId = usize; diff --git a/src/core/mod.rs b/src/core/mod.rs new file mode 100644 index 000000000..ff5784af8 --- /dev/null +++ b/src/core/mod.rs @@ -0,0 +1,9 @@ + +pub mod postings; +pub mod global; +pub mod schema; +pub mod directory; +pub mod writer; +pub mod analyzer; + +pub use core::global::DocId; diff --git a/src/core/postings.rs b/src/core/postings.rs new file mode 100644 index 000000000..3c01f365c --- /dev/null +++ b/src/core/postings.rs @@ -0,0 +1,175 @@ +use std::fmt; +use std::fmt::{Debug, Formatter}; +// use std::core::slice; +use std::io::prelude::Read; +use core::global::DocId; +// use core::schema::{Field, Term}; +// use std::slice; +use std::vec; + +pub trait Postings { + type IteratorType: Iterator; + fn iter(&self) -> Self::IteratorType; +} + + +#[derive(Clone)] +pub struct SimplePostings { + reader: R, +} + +pub struct SimplePostingsIterator { + reader: R +} + +impl Postings for SimplePostings { + + type IteratorType = SimplePostingsIterator; + + fn iter(&self) -> Self::IteratorType { + SimplePostingsIterator { + reader: self.reader.clone() + } + } +} + + +impl Iterator for SimplePostingsIterator { + + type Item=DocId; + + fn next(&mut self) -> Option { + let mut buf: [u8; 8] = [0; 8]; + match self.reader.read(&mut buf) { + Ok(num_bytes) => { + if num_bytes == 8 { + unsafe { + let val = *(*buf.as_ptr() as *const usize); + return Some(val) + } + } + else { + return None + } + }, + Err(_) => None + } + } +} + + +impl Debug for SimplePostings { + fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { + let posting_lists: Vec = 
self.iter().collect(); + write!(f, "Postings({:?})", posting_lists); + Ok(()) + } +} + +pub struct IntersectionPostings<'a, LeftPostingsType, RightPostingsType> +where LeftPostingsType: Postings + 'static, + RightPostingsType: Postings + 'static +{ + left: &'a LeftPostingsType, + right: &'a RightPostingsType, +} + +impl<'a, LeftPostingsType, RightPostingsType> Postings for IntersectionPostings<'a, LeftPostingsType, RightPostingsType> +where LeftPostingsType: Postings + 'static, + RightPostingsType: Postings + 'static { + + type IteratorType = IntersectionIterator; + + fn iter(&self) -> IntersectionIterator { + let mut left_it = self.left.iter(); + let mut right_it = self.right.iter(); + let next_left = left_it.next(); + let next_right = right_it.next(); + IntersectionIterator { + left: left_it, + right: right_it, + next_left: next_left, + next_right: next_right, + } + } + +} +pub fn intersection<'a, LeftPostingsType, RightPostingsType> (left: &'a LeftPostingsType, right: &'a RightPostingsType) -> IntersectionPostings<'a, LeftPostingsType, RightPostingsType> +where LeftPostingsType: Postings + 'static, + RightPostingsType: Postings + 'static { + IntersectionPostings { + left: left, + right: right + } +} + + +pub struct IntersectionIterator { + left: LeftPostingsType::IteratorType, + right: RightPostingsType::IteratorType, + + next_left: Option, + next_right: Option, +} + +impl +Iterator for IntersectionIterator { + + type Item = DocId; + + fn next(&mut self,) -> Option { + loop { + match (self.next_left, self.next_right) { + (_, None) => { + return None; + }, + (None, _) => { + return None; + }, + (Some(left_val), Some(right_val)) => { + if left_val < right_val { + self.next_left = self.left.next(); + } + else if right_val > right_val { + self.next_right = self.right.next(); + } + else { + self.next_left = self.left.next(); + self.next_right = self.right.next(); + return Some(left_val) + } + } + } + } + } +} + +#[derive(Debug)] +pub struct VecPostings { + postings: 
/// Name of a field of a document.
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq)]
pub struct Field(&'static str);

/// A field together with the text stored in it, owned by a [`Document`].
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq)]
pub struct FieldValue {
    pub field: Field,
    pub text: String,
}

/// A piece of text within a given field; borrows both parts.
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq)]
pub struct Term<'a> {
    pub field: &'a Field,
    pub text: &'a str,
}

/// An ordered collection of field values.
///
/// The same field may appear several times; values are kept in insertion
/// order and handed back in that order by `into_iter`.
#[derive(Clone, Debug, Default)]
pub struct Document {
    fields: Vec<FieldValue>,
}

impl Document {
    /// Creates a document without any field value.
    pub fn new() -> Document {
        Document::default()
    }

    /// Appends a value for `field`, copying both the field and the text.
    ///
    /// Despite its name this never replaces a previous value of the same
    /// field: it always pushes a new entry, exactly like [`Document::add`].
    /// `text` is a plain `&str`; passing a `&String` still works through
    /// deref coercion.
    pub fn set(&mut self, field: &Field, text: &str) {
        self.add(FieldValue {
            field: field.clone(),
            text: text.to_owned(),
        });
    }

    /// Appends an already-built field value.
    pub fn add(&mut self, field_value: FieldValue) {
        self.fields.push(field_value);
    }
}

impl IntoIterator for Document {
    type Item = FieldValue;
    type IntoIter = ::std::vec::IntoIter<FieldValue>;

    /// Consumes the document and yields its field values in insertion order.
    fn into_iter(self) -> Self::IntoIter {
        self.fields.into_iter()
    }
}
} +// +// struct TermId(usize); +// +// impl TermDictionary { +// +// pub fn new() -> TermDictionary { +// TermDictionary { +// map: HashMap::new(), +// } +// } +// +// pub fn term_id(&mut self, term: &Term) -> TermId { +// match self.map.get(term) { +// Some(usize) => { return TermId(usize); }, +// None => {} +// } +// let term_id = self.map.len(); +// self.map.insert(term, term_id); +// TermId(term_id) +// +// } +// } + +struct IndexWriter { + max_doc: usize, + +} + +impl IndexWriter { + + fn suscribe(&mut self, term: &Term, doc_id: usize) { + + } + + pub fn add(&mut self, doc: Document) { + let doc_id = self.max_doc; + for field_value in doc { + for token in tokenize(&field_value.text) { + let term = Term { + field: &field_value.field, + text: &token + }; + self.suscribe(&term, doc_id); + } + } + self.max_doc += 1; + } + + pub fn sync(&mut self,) -> Result<(), io::Error> { + Ok(()) + } + +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 000000000..d6f4668ce --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,4 @@ +#[macro_use] +extern crate lazy_static; + +pub mod core; diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 000000000..7db121b6a --- /dev/null +++ b/src/main.rs @@ -0,0 +1,4 @@ +extern crate parici; + +fn main() { +} diff --git a/tests/core.rs b/tests/core.rs new file mode 100644 index 000000000..1b6f0b38d --- /dev/null +++ b/tests/core.rs @@ -0,0 +1,15 @@ +extern crate parici; +extern crate itertools; + +use parici::core::DocId; +use parici::core::postings::{VecPostings, intersection}; +use parici::core::postings::Postings; + +#[test] +fn test_intersection() { + let left = VecPostings::new(vec!(1, 3, 9)); + let right = VecPostings::new(vec!(3, 4, 9, 18)); + let inter = intersection(&left, &right); + let vals: Vec = inter.iter().collect(); + itertools::assert_equal(vals, vec!(3, 9)); +}