Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-23 02:29:57 +00:00)
first commit
.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@
target
Cargo.lock
Cargo.toml (new file, 10 lines)
@@ -0,0 +1,10 @@
[package]
name = "parici"
version = "0.1.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]

[dependencies]
byteorder = "0.4.2"
itertools = "0.4.5"
memmap = "0.2.3"
lazy_static = "0.1.*"
src/core/analyzer.rs (new file, 20 lines)
@@ -0,0 +1,20 @@
// Placeholder tokenizer: the iterator yields no tokens yet.
pub struct TokenIter<'a> {
    text: &'a str,
}

impl<'a> Iterator for TokenIter<'a> {

    type Item = &'a str;

    fn next(&mut self) -> Option<&'a str> {
        // Stub: no tokenization is implemented yet.
        None
    }
}

pub fn tokenize<'a>(text: &'a str) -> TokenIter<'a> {
    TokenIter {
        text: text
    }
}
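Since next() never yields anything, indexing currently sees no terms at all. Purely as an illustration of where this stub could go, and not as the analyzer this commit implements, here is a minimal whitespace-splitting iterator:

// Hypothetical sketch: splits on whitespace, borrowing slices of the input.
pub struct WhitespaceTokenIter<'a> {
    remaining: &'a str,
}

impl<'a> Iterator for WhitespaceTokenIter<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<&'a str> {
        // Skip leading whitespace, then cut the next run of
        // non-whitespace characters out of the remaining text.
        let trimmed = self.remaining.trim_left();
        if trimmed.is_empty() {
            return None;
        }
        let end = trimmed.find(char::is_whitespace).unwrap_or(trimmed.len());
        let (token, rest) = trimmed.split_at(end);
        self.remaining = rest;
        Some(token)
    }
}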
src/core/dictionary.rs (new file, one blank line)
@@ -0,0 +1 @@
src/core/directory.rs (new file, 66 lines)
@@ -0,0 +1,66 @@
extern crate memmap;

use self::memmap::{Mmap, Protection};
use std::path::PathBuf;
use std::fs::File;
use std::io;

#[derive(Clone, Debug)]
struct SegmentId(String);

struct IndexDirectory {
    index_path: PathBuf,
}

impl IndexDirectory {

    pub fn for_path(path: PathBuf) -> IndexDirectory {
        IndexDirectory {
            index_path: path,
        }
    }

    pub fn read_segment(&self, segment_id: &SegmentId) -> SegmentDirectory {
        SegmentDirectory {
            index_path: self.index_path.clone(),
            segment_id: segment_id.clone(),
        }
    }
}

enum SegmentComponent {
    Postings,
    Positions,
}

struct SegmentDirectory {
    index_path: PathBuf,
    segment_id: SegmentId,
}

impl SegmentDirectory {

    // Each suffix already carries its leading dot.
    fn path_suffix(component: SegmentComponent) -> &'static str {
        match component {
            SegmentComponent::Postings => ".pstgs",
            SegmentComponent::Positions => ".pos",
        }
    }

    fn get_file(&self, component: SegmentComponent) -> Result<File, io::Error> {
        let mut res = self.index_path.clone();
        let SegmentId(ref segment_id_str) = self.segment_id;
        // Builds e.g. "<segment_id>.pstgs"; the suffix supplies the dot.
        let filename = segment_id_str.clone() + SegmentDirectory::path_suffix(component);
        res.push(filename);
        File::open(res)
    }

    pub fn open(&self, component: SegmentComponent) -> Result<Mmap, io::Error> {
        let file = try!(self.get_file(component));
        Mmap::open(&file, Protection::Read)
    }
}
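A short usage sketch, assuming the code runs inside this module (the structs and SegmentComponent are private) and that a segment file such as /tmp/index/seg0.pstgs already exists on disk; the path and segment id are made up for illustration:

fn open_postings_example() -> Result<Mmap, io::Error> {
    let index = IndexDirectory::for_path(PathBuf::from("/tmp/index"));
    let segment = index.read_segment(&SegmentId("seg0".to_string()));
    // Memory-maps /tmp/index/seg0.pstgs read-only.
    segment.open(SegmentComponent::Postings)
}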
src/core/global.rs (new file, 2 lines)
@@ -0,0 +1,2 @@
pub type DocId = usize;
src/core/mod.rs (new file, 9 lines)
@@ -0,0 +1,9 @@
pub mod postings;
pub mod global;
pub mod schema;
pub mod directory;
pub mod writer;
pub mod analyzer;

pub use core::global::DocId;
src/core/postings.rs (new file, 175 lines)
@@ -0,0 +1,175 @@
use std::fmt;
use std::fmt::{Debug, Formatter};
use std::io::prelude::Read;
use core::global::DocId;
use std::vec;

pub trait Postings {
    type IteratorType: Iterator<Item=DocId>;
    fn iter(&self) -> Self::IteratorType;
}

#[derive(Clone)]
pub struct SimplePostings<R: Read + Clone> {
    reader: R,
}

pub struct SimplePostingsIterator<R: Read> {
    reader: R,
}

impl<R: Read + Clone> Postings for SimplePostings<R> {

    type IteratorType = SimplePostingsIterator<R>;

    fn iter(&self) -> Self::IteratorType {
        SimplePostingsIterator {
            reader: self.reader.clone(),
        }
    }
}

impl<R: Read> Iterator for SimplePostingsIterator<R> {

    type Item = DocId;

    fn next(&mut self) -> Option<DocId> {
        // Each doc id is stored as 8 native-endian bytes; a short read
        // or an I/O error terminates the iteration.
        let mut buf: [u8; 8] = [0; 8];
        match self.reader.read(&mut buf) {
            Ok(8) => {
                let val = unsafe { *(buf.as_ptr() as *const usize) };
                Some(val)
            },
            _ => None,
        }
    }
}

impl<R: Read + Clone> Debug for SimplePostings<R> {
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        let posting_list: Vec<DocId> = self.iter().collect();
        write!(f, "Postings({:?})", posting_list)
    }
}

pub struct IntersectionPostings<'a, LeftPostingsType, RightPostingsType>
    where LeftPostingsType: Postings + 'static,
          RightPostingsType: Postings + 'static
{
    left: &'a LeftPostingsType,
    right: &'a RightPostingsType,
}

impl<'a, LeftPostingsType, RightPostingsType> Postings for IntersectionPostings<'a, LeftPostingsType, RightPostingsType>
    where LeftPostingsType: Postings + 'static,
          RightPostingsType: Postings + 'static {

    type IteratorType = IntersectionIterator<LeftPostingsType, RightPostingsType>;

    fn iter(&self) -> IntersectionIterator<LeftPostingsType, RightPostingsType> {
        let mut left_it = self.left.iter();
        let mut right_it = self.right.iter();
        // Prime both cursors so the merge loop always has a lookahead.
        let next_left = left_it.next();
        let next_right = right_it.next();
        IntersectionIterator {
            left: left_it,
            right: right_it,
            next_left: next_left,
            next_right: next_right,
        }
    }
}

pub fn intersection<'a, LeftPostingsType, RightPostingsType>(left: &'a LeftPostingsType, right: &'a RightPostingsType) -> IntersectionPostings<'a, LeftPostingsType, RightPostingsType>
    where LeftPostingsType: Postings + 'static,
          RightPostingsType: Postings + 'static {
    IntersectionPostings {
        left: left,
        right: right,
    }
}

pub struct IntersectionIterator<LeftPostingsType: Postings, RightPostingsType: Postings> {
    left: LeftPostingsType::IteratorType,
    right: RightPostingsType::IteratorType,
    next_left: Option<DocId>,
    next_right: Option<DocId>,
}

impl<LeftPostingsType: Postings, RightPostingsType: Postings>
        Iterator for IntersectionIterator<LeftPostingsType, RightPostingsType> {

    type Item = DocId;

    fn next(&mut self) -> Option<DocId> {
        // Classic sorted-list merge: advance whichever side is behind,
        // emit a doc id when both sides agree, stop when either side ends.
        loop {
            match (self.next_left, self.next_right) {
                (_, None) => {
                    return None;
                },
                (None, _) => {
                    return None;
                },
                (Some(left_val), Some(right_val)) => {
                    if left_val < right_val {
                        self.next_left = self.left.next();
                    }
                    else if left_val > right_val {
                        self.next_right = self.right.next();
                    }
                    else {
                        self.next_left = self.left.next();
                        self.next_right = self.right.next();
                        return Some(left_val);
                    }
                }
            }
        }
    }
}

#[derive(Debug)]
pub struct VecPostings {
    postings: Vec<DocId>,
}

impl VecPostings {
    pub fn new(vals: Vec<DocId>) -> VecPostings {
        VecPostings {
            postings: vals,
        }
    }
}

impl Postings for VecPostings {
    type IteratorType = vec::IntoIter<DocId>;

    fn iter(&self) -> vec::IntoIter<DocId> {
        self.postings.clone().into_iter()
    }
}

impl<'a, L: Postings + 'static, R: Postings + 'static> Debug for IntersectionPostings<'a, L, R> {
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        let posting_list: Vec<DocId> = self.iter().collect();
        write!(f, "Postings({:?})", posting_list)
    }
}
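Because std::io::Cursor implements both Read and Clone, SimplePostings can be exercised against an in-memory buffer. A sketch, assuming it runs inside this module (the reader field is private) on a little-endian 64-bit target:

use std::io::Cursor;

fn simple_postings_example() {
    // Doc ids 1 and 5, each encoded as 8 little-endian bytes.
    let bytes: Vec<u8> = vec![1, 0, 0, 0, 0, 0, 0, 0,
                              5, 0, 0, 0, 0, 0, 0, 0];
    let postings = SimplePostings { reader: Cursor::new(bytes) };
    let doc_ids: Vec<DocId> = postings.iter().collect();
    assert_eq!(doc_ids, vec![1, 5]);
}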
src/core/schema.rs (new file, 56 lines)
@@ -0,0 +1,56 @@
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq)]
pub struct Field(&'static str);

#[derive(Clone, Debug, PartialEq, PartialOrd, Eq)]
pub struct FieldValue {
    pub field: Field,
    pub text: String,
}

#[derive(Clone, Debug, PartialEq, PartialOrd, Eq)]
pub struct Term<'a> {
    pub field: &'a Field,
    pub text: &'a str,
}

pub struct Document {
    fields: Vec<FieldValue>,
}

impl Document {

    pub fn new() -> Document {
        Document {
            fields: Vec::new(),
        }
    }

    pub fn set(&mut self, field: &Field, text: &str) {
        self.add(FieldValue {
            field: field.clone(),
            text: text.to_string(),
        });
    }

    pub fn add(&mut self, field_value: FieldValue) {
        self.fields.push(field_value);
    }
}

impl IntoIterator for Document {
    type Item = FieldValue;
    type IntoIter = ::std::vec::IntoIter<FieldValue>;

    fn into_iter(self) -> Self::IntoIter {
        self.fields.into_iter()
    }
}
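Taken together, a Document is just an ordered bag of (field, text) pairs. A usage sketch with a made-up field name, assuming it runs inside this module since Field's inner &'static str is private:

fn document_example() {
    let title = Field("title");
    let mut doc = Document::new();
    doc.set(&title, "Moby Dick");
    // Consuming iteration hands back the FieldValues in insertion order.
    for field_value in doc {
        println!("{:?}: {}", field_value.field, field_value.text);
    }
}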
src/core/writer.rs (new file, 63 lines)
@@ -0,0 +1,63 @@
use std::io;
use core::schema::Document;
use core::schema::Term;
use core::analyzer::tokenize;
use std::collections::HashMap;
//
// struct TermDictionary {
//     map: HashMap<Term, usize>,
// }
//
// struct TermId(usize);
//
// impl TermDictionary {
//
//     pub fn new() -> TermDictionary {
//         TermDictionary {
//             map: HashMap::new(),
//         }
//     }
//
//     pub fn term_id(&mut self, term: &Term) -> TermId {
//         match self.map.get(term) {
//             Some(term_id) => { return TermId(*term_id); },
//             None => {}
//         }
//         let term_id = self.map.len();
//         self.map.insert(term, term_id);
//         TermId(term_id)
//     }
// }

struct IndexWriter {
    max_doc: usize,
}

impl IndexWriter {

    fn subscribe(&mut self, term: &Term, doc_id: usize) {
        // Not implemented yet: this is where a (term, doc id) pair
        // would be recorded in the in-memory index.
    }

    pub fn add(&mut self, doc: Document) {
        let doc_id = self.max_doc;
        for field_value in doc {
            for token in tokenize(&field_value.text) {
                let term = Term {
                    field: &field_value.field,
                    text: token,
                };
                self.subscribe(&term, doc_id);
            }
        }
        self.max_doc += 1;
    }

    pub fn sync(&mut self) -> Result<(), io::Error> {
        Ok(())
    }
}
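subscribe is still a stub, and the commented-out TermDictionary hints at the intended direction. One plausible shape, offered purely as an assumption rather than as the author's plan, is an in-memory inverted index keyed by term text:

use std::collections::HashMap;
use core::global::DocId;

// Hypothetical helper: maps each term's text to its posting list.
struct InvertedIndex {
    postings: HashMap<String, Vec<DocId>>,
}

impl InvertedIndex {
    fn subscribe(&mut self, term_text: &str, doc_id: DocId) {
        // Doc ids are assigned in increasing order by IndexWriter::add,
        // so each posting list stays sorted by construction.
        self.postings
            .entry(term_text.to_string())
            .or_insert_with(Vec::new)
            .push(doc_id);
    }
}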
src/lib.rs (new file, 4 lines)
@@ -0,0 +1,4 @@
#[macro_use]
extern crate lazy_static;

pub mod core;
src/main.rs (new file, 4 lines)
@@ -0,0 +1,4 @@
extern crate parici;

fn main() {
}
tests/core.rs (new file, 15 lines)
@@ -0,0 +1,15 @@
extern crate parici;
extern crate itertools;

use parici::core::DocId;
use parici::core::postings::{VecPostings, intersection};
use parici::core::postings::Postings;

#[test]
fn test_intersection() {
    let left = VecPostings::new(vec!(1, 3, 9));
    let right = VecPostings::new(vec!(3, 4, 9, 18));
    let inter = intersection(&left, &right);
    let vals: Vec<DocId> = inter.iter().collect();
    itertools::assert_equal(vals, vec!(3, 9));
}