first commit

This commit is contained in:
Paul Masurel
2016-01-10 13:00:52 +09:00
commit 3f09ec75df
13 changed files with 427 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
target
Cargo.lock

10
Cargo.toml Normal file
View File

@@ -0,0 +1,10 @@
[package]
name = "parici"
version = "0.1.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
[dependencies]
byteorder = "0.4.2"
itertools = "0.4.5"
memmap = "0.2.3"
lazy_static = "0.1.*"

20
src/core/analyzer.rs Normal file
View File

@@ -0,0 +1,20 @@
/// Iterator over the tokens of a borrowed piece of text.
///
/// NOTE: tokenization is not implemented yet. `next` always returns `None`,
/// so iterating yields no token whatever the text is.
pub struct TokenIter<'a> {
    // Text to tokenize. Stored for the future implementation; not read yet.
    text: &'a str,
}

impl<'a> Iterator for TokenIter<'a> {
    type Item = &'a str;

    /// Returns the next token. Placeholder: always `None` for now.
    fn next(&mut self) -> Option<&'a str> {
        None
    }
}

/// Creates a `TokenIter` over `text`.
///
/// Takes a `&str` so that callers do not need an owned `String`;
/// passing a `&String` keeps working through deref coercion.
pub fn tokenize<'a>(text: &'a str) -> TokenIter<'a> {
    TokenIter { text: text }
}

1
src/core/dictionary.rs Normal file
View File

@@ -0,0 +1 @@

66
src/core/directory.rs Normal file
View File

@@ -0,0 +1,66 @@
extern crate memmap;
use self::memmap::{Mmap, Protection};
use std::path::PathBuf;
use std::fs::File;
use std::io;
/// Identifier of a segment, stored as a string.
#[derive(Clone, Debug)]
struct SegmentId(String);
struct IndexDirectory {
index_path: PathBuf,
}
impl IndexDirectory {
pub fn for_path(path: PathBuf)-> IndexDirectory {
IndexDirectory {
index_path: path,
}
}
pub fn read_segment(&self, segment_id: &SegmentId) -> SegmentDirectory {
SegmentDirectory {
index_path: self.index_path.clone(),
segment_id: segment_id.clone()
}
}
}
/// Kinds of file that can be opened for a segment.
/// `SegmentDirectory::path_suffix` maps each variant to a file-name suffix.
enum SegmentComponent {
// Mapped to the `.pstgs` suffix.
POSTINGS,
// Mapped to the `.pos` suffix.
POSITIONS,
}
/// Access to the files of one segment of an index.
struct SegmentDirectory {
    /// Directory in which the segment files are looked up.
    index_path: PathBuf,
    /// Segment whose files are opened through this handle.
    segment_id: SegmentId,
}

impl SegmentDirectory {
    /// Returns the file-name suffix of `component`, leading dot included.
    fn path_suffix(component: SegmentComponent) -> &'static str {
        match component {
            SegmentComponent::POSTINGS => ".pstgs",
            SegmentComponent::POSITIONS => ".pos",
        }
    }

    /// Opens the file `<index_path>/<segment_id><suffix>` for reading.
    ///
    /// # Errors
    /// Returns the `io::Error` reported by `File::open`, e.g. when the file
    /// does not exist.
    fn get_file(&self, component: SegmentComponent) -> Result<File, io::Error> {
        let SegmentId(ref segment_id_str) = self.segment_id;
        // The suffix already starts with a dot; inserting another "." between
        // the id and the suffix would yield names such as `seg..pstgs`.
        let filename = format!(
            "{}{}",
            segment_id_str,
            SegmentDirectory::path_suffix(component)
        );
        File::open(self.index_path.join(filename))
    }

    /// Memory-maps the file of `component` with read protection.
    ///
    /// # Errors
    /// Propagates any error from opening the file or from creating the map.
    pub fn open(&self, component: SegmentComponent) -> Result<Mmap, io::Error> {
        let file = self.get_file(component)?;
        Mmap::open(&file, Protection::Read)
    }
}

2
src/core/global.rs Normal file
View File

@@ -0,0 +1,2 @@
/// Identifier of a document; an alias for `usize`.
pub type DocId = usize;

9
src/core/mod.rs Normal file
View File

@@ -0,0 +1,9 @@
/// Posting lists and their intersection.
pub mod postings;
/// Shared type aliases (`DocId`).
pub mod global;
/// Fields, field values, terms and documents.
pub mod schema;
/// Access to index and segment files.
pub mod directory;
/// `IndexWriter`, which consumes documents.
pub mod writer;
/// Text tokenization.
pub mod analyzer;
// Re-exported so that callers can write `core::DocId`.
pub use core::global::DocId;

175
src/core/postings.rs Normal file
View File

@@ -0,0 +1,175 @@
use std::fmt;
use std::fmt::{Debug, Formatter};
// use std::core::slice;
use std::io::prelude::Read;
use core::global::DocId;
// use core::schema::{Field, Term};
// use std::slice;
use std::vec;
/// A list of document ids that can be iterated over.
pub trait Postings {
/// Iterator type returned by `iter`; it yields `DocId`s.
type IteratorType: Iterator<Item=DocId>;
/// Returns an iterator over the document ids of this list.
// NOTE(review): the intersection code in this file appears to assume ids come out in ascending order — confirm.
fn iter(&self) -> Self::IteratorType;
}
/// Postings backed by a cloneable `Read` source.
#[derive(Clone)]
pub struct SimplePostings<R: Read + Clone> {
// Source of the encoded document ids; cloned for every iteration.
reader: R,
}
/// Iterator that decodes document ids from a `Read` source, 8 bytes per id.
pub struct SimplePostingsIterator<R: Read> {
// Reader the ids are consumed from.
reader: R
}
impl<R: Read + Clone> Postings for SimplePostings<R> {
    type IteratorType = SimplePostingsIterator<R>;

    /// Starts a new iteration over a clone of the underlying reader, so the
    /// reader held by `self` is not consumed.
    fn iter(&self) -> SimplePostingsIterator<R> {
        let fresh_reader = Clone::clone(&self.reader);
        SimplePostingsIterator {
            reader: fresh_reader,
        }
    }
}
impl<R: Read> Iterator for SimplePostingsIterator<R> {
    type Item = DocId;

    /// Decodes the next document id from the reader.
    ///
    /// Each id occupies 8 bytes in native byte order. Returns `None` when
    /// fewer than 8 bytes remain or when the reader reports an error.
    fn next(&mut self) -> Option<DocId> {
        let mut buf = [0u8; 8];
        // `read_exact` keeps reading until the buffer is full: a plain `read`
        // may legitimately return fewer bytes even though more data follows.
        match self.reader.read_exact(&mut buf) {
            // Safe, alignment-independent decoding. On targets where `usize`
            // is narrower than 64 bits the value is truncated.
            Ok(()) => Some(u64::from_ne_bytes(buf) as DocId),
            Err(_) => None,
        }
    }
}
impl<R: Read + Clone> Debug for SimplePostings<R> {
    /// Formats the postings as `Postings([id, id, ...])`.
    ///
    /// The whole list is materialized through `iter()` first. Any error
    /// reported by the formatter is returned to the caller instead of being
    /// dropped.
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        let posting_lists: Vec<DocId> = self.iter().collect();
        write!(f, "Postings({:?})", posting_lists)
    }
}
/// Postings made of two borrowed posting lists.
/// Its `iter` yields the document ids found in both of them.
pub struct IntersectionPostings<'a, LeftPostingsType, RightPostingsType>
where LeftPostingsType: Postings + 'static,
RightPostingsType: Postings + 'static
{
// First operand of the intersection.
left: &'a LeftPostingsType,
// Second operand of the intersection.
right: &'a RightPostingsType,
}
impl<'a, LeftPostingsType, RightPostingsType> Postings
    for IntersectionPostings<'a, LeftPostingsType, RightPostingsType>
where
    LeftPostingsType: Postings + 'static,
    RightPostingsType: Postings + 'static,
{
    type IteratorType = IntersectionIterator<LeftPostingsType, RightPostingsType>;

    /// Creates an iterator over both operands and primes it with the first
    /// element of each side.
    fn iter(&self) -> IntersectionIterator<LeftPostingsType, RightPostingsType> {
        let mut left_cursor = self.left.iter();
        let mut right_cursor = self.right.iter();
        // Heads are pulled eagerly: the iterator compares `next_left` with
        // `next_right` before advancing either side.
        let left_head = left_cursor.next();
        let right_head = right_cursor.next();
        IntersectionIterator {
            left: left_cursor,
            right: right_cursor,
            next_left: left_head,
            next_right: right_head,
        }
    }
}
/// Builds the intersection of two posting lists.
///
/// Both operands are only borrowed; nothing is read until `iter` is called
/// on the returned value.
pub fn intersection<'a, LeftPostingsType, RightPostingsType>(
    left: &'a LeftPostingsType,
    right: &'a RightPostingsType,
) -> IntersectionPostings<'a, LeftPostingsType, RightPostingsType>
where
    LeftPostingsType: Postings + 'static,
    RightPostingsType: Postings + 'static,
{
    let combined = IntersectionPostings {
        left: left,
        right: right,
    };
    combined
}
/// Iterator state for the intersection of two posting lists.
pub struct IntersectionIterator<LeftPostingsType: Postings, RightPostingsType: Postings> {
// Remaining elements of the left operand.
left: LeftPostingsType::IteratorType,
// Remaining elements of the right operand.
right: RightPostingsType::IteratorType,
// Current head of the left operand; `None` once it is exhausted.
next_left: Option<DocId>,
// Current head of the right operand; `None` once it is exhausted.
next_right: Option<DocId>,
}
impl<LeftPostingsType: Postings, RightPostingsType: Postings> Iterator
    for IntersectionIterator<LeftPostingsType, RightPostingsType>
{
    type Item = DocId;

    /// Returns the next document id present on both sides.
    ///
    /// This is a merge over the two heads: the side holding the smaller id is
    /// advanced until both heads are equal, or until one side runs out, in
    /// which case `None` is returned. It relies on both sides yielding ids in
    /// ascending order.
    fn next(&mut self) -> Option<DocId> {
        loop {
            let (left_val, right_val) = match (self.next_left, self.next_right) {
                (Some(left_val), Some(right_val)) => (left_val, right_val),
                // As soon as one side is exhausted no further match exists.
                _ => return None,
            };
            if left_val < right_val {
                self.next_left = self.left.next();
            } else if left_val > right_val {
                // The right side is behind: catch up. (Comparing `right_val`
                // with itself here would never advance it and would emit
                // ids that are absent from the right side.)
                self.next_right = self.right.next();
            } else {
                self.next_left = self.left.next();
                self.next_right = self.right.next();
                return Some(left_val);
            }
        }
    }
}
/// Postings backed by an in-memory vector of document ids.
#[derive(Debug)]
pub struct VecPostings {
    postings: Vec<DocId>,
}

impl VecPostings {
    /// Wraps `vals` as a posting list. The vector is taken as is: it is
    /// neither sorted nor deduplicated here.
    pub fn new(vals: Vec<DocId>) -> VecPostings {
        let doc_ids = vals;
        VecPostings { postings: doc_ids }
    }
}
impl Postings for VecPostings {
    type IteratorType = vec::IntoIter<DocId>;

    /// Iterates over a copy of the stored ids, leaving `self` untouched.
    fn iter(&self) -> vec::IntoIter<DocId> {
        let snapshot: Vec<DocId> = self.postings.to_vec();
        snapshot.into_iter()
    }
}
impl<'a, L: Postings + 'static, R: Postings + 'static> Debug for IntersectionPostings<'a, L, R> {
    /// Formats the intersection as `Postings([id, id, ...])`.
    ///
    /// The intersection is fully computed through `iter()` first. Any error
    /// reported by the formatter is returned to the caller instead of being
    /// dropped.
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        let posting_lists: Vec<DocId> = self.iter().collect();
        write!(f, "Postings({:?})", posting_lists)
    }
}

56
src/core/schema.rs Normal file
View File

@@ -0,0 +1,56 @@
use std::collections::HashMap;
use std::sync::{Mutex, MutexGuard};
/// A document field, identified by a `'static` string.
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct Field(&'static str);
/// A field together with the text stored for it (owned).
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct FieldValue {
// Field the text belongs to.
pub field: Field,
// Text associated with the field.
pub text: String,
}
/// A borrowed (field, text) pair.
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct Term<'a> {
// Field the term belongs to.
pub field: &'a Field,
// Text of the term.
pub text: &'a str,
}
/// A document: an ordered list of field values.
pub struct Document {
    fields: Vec<FieldValue>,
}

impl Document {
    /// Creates a document without any field value.
    pub fn new() -> Document {
        Document { fields: Vec::new() }
    }

    /// Appends a value built from copies of `field` and `text`.
    ///
    /// Despite its name this does not replace an earlier value of the same
    /// field: every call adds one more entry. `text` is a `&str`, so a string
    /// literal or a `&String` can be passed alike.
    pub fn set(&mut self, field: &Field, text: &str) {
        self.add(FieldValue {
            field: field.clone(),
            text: text.to_owned(),
        });
    }

    /// Appends `field_value` at the end of the document.
    pub fn add(&mut self, field_value: FieldValue) {
        self.fields.push(field_value);
    }
}
impl IntoIterator for Document {
type Item = FieldValue;
type IntoIter = ::std::vec::IntoIter<FieldValue>;
fn into_iter(self) -> Self::IntoIter {
self.fields.into_iter()
}
}

63
src/core/writer.rs Normal file
View File

@@ -0,0 +1,63 @@
use std::io;
use core::schema::Document;
use core::schema::Term;
use core::analyzer::tokenize;
use std::collections::HashMap;
//
// struct TermDictionary {
// map: HashMap<Term, usize>,
// }
//
// struct TermId(usize);
//
// impl TermDictionary {
//
// pub fn new() -> TermDictionary {
// TermDictionary {
// map: HashMap::new(),
// }
// }
//
// pub fn term_id(&mut self, term: &Term) -> TermId {
// match self.map.get(term) {
// Some(usize) => { return TermId(usize); },
// None => {}
// }
// let term_id = self.map.len();
// self.map.insert(term, term_id);
// TermId(term_id)
//
// }
// }
/// Consumes documents and hands out consecutive document ids.
struct IndexWriter {
    /// Id that the next added document will receive.
    max_doc: usize,
}

impl IndexWriter {
    /// Records that `_term` occurs in document `_doc_id`.
    /// Placeholder: nothing is stored yet.
    fn subscribe(&mut self, _term: &Term, _doc_id: usize) {}

    /// Adds `doc`: every token of every field value is passed to
    /// `subscribe` under the current document id, then the id counter is
    /// incremented.
    pub fn add(&mut self, doc: Document) {
        let doc_id = self.max_doc;
        for field_value in doc.into_iter() {
            let tokens = tokenize(&field_value.text);
            for token in tokens {
                let term = Term {
                    field: &field_value.field,
                    text: token,
                };
                self.subscribe(&term, doc_id);
            }
        }
        self.max_doc += 1;
    }

    /// Placeholder for persisting pending data; currently always succeeds
    /// without doing anything.
    pub fn sync(&mut self) -> Result<(), io::Error> {
        Ok(())
    }
}

4
src/lib.rs Normal file
View File

@@ -0,0 +1,4 @@
// Makes the macros of `lazy_static` available throughout the crate.
#[macro_use]
extern crate lazy_static;
/// Root module of the library.
pub mod core;

4
src/main.rs Normal file
View File

@@ -0,0 +1,4 @@
extern crate parici;
// Entry point of the binary; it currently does nothing.
fn main() {
}

15
tests/core.rs Normal file
View File

@@ -0,0 +1,15 @@
extern crate parici;
extern crate itertools;
use parici::core::DocId;
use parici::core::postings::{VecPostings, intersection};
use parici::core::postings::Postings;
/// The intersection of two ascending posting lists yields their common
/// document ids, in order.
#[test]
fn test_intersection() {
    let left_postings = VecPostings::new(vec![1, 3, 9]);
    let right_postings = VecPostings::new(vec![3, 4, 9, 18]);
    let common: Vec<DocId> = intersection(&left_postings, &right_postings)
        .iter()
        .collect();
    itertools::assert_equal(common, vec![3, 9]);
}
}