mirror of https://github.com/quickwit-oss/tantivy.git

second commit
@@ -8,3 +8,4 @@ byteorder = "0.4.2"
 itertools = "0.4.5"
 memmap = "0.2.3"
 lazy_static = "0.1.*"
+regex = "0.1"

@@ -1,6 +1,15 @@
+extern crate regex;
+
+use self::regex::Regex;
+
+lazy_static! {
+    static ref WORD_PTN: Regex = Regex::new(r"[a-zA-Z0-9]+").unwrap();
+}
+
+
 pub struct TokenIter<'a> {
-    text: &'a String,
+    text: &'a str,
+    token_it: Box<Iterator<Item=(usize, usize)> + 'a>,
 }

 impl<'a> Iterator for TokenIter<'a> {
@@ -8,13 +17,14 @@ impl<'a> Iterator for TokenIter<'a> {
     type Item = &'a str;

     fn next(&mut self) -> Option<&'a str> {
-        None
+        self.token_it.next().map(|(start, end)| &self.text[start..end])
     }

 }

-pub fn tokenize<'a>(text: &'a String)->TokenIter<'a> {
+pub fn tokenize<'a>(text: &'a str)->TokenIter<'a> {
     TokenIter {
-        text: text
+        text: text,
+        token_it: Box::new(WORD_PTN.find_iter(text)),
     }
 }

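Note: in regex 0.1, find_iter yields (usize, usize) byte offsets, which is
why TokenIter stores the text and slices self.text[start..end] in next().
Below is a minimal sketch of the same tokenizer against the modern regex
crate (1.x, where find_iter yields Match values instead); the once_cell
dependency is an assumption of mine, not part of this commit:

    use once_cell::sync::Lazy;
    use regex::Regex;

    // Same pattern as the commit's WORD_PTN, initialized lazily on first use.
    static WORD_PTN: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"[a-zA-Z0-9]+").unwrap());

    // Yields the alphanumeric tokens of `text`. regex 1.x returns Match
    // values, so we call m.as_str() instead of slicing by (start, end).
    pub fn tokenize(text: &str) -> impl Iterator<Item = &str> {
        WORD_PTN.find_iter(text).map(|m| m.as_str())
    }

    fn main() {
        let words: Vec<&str> = tokenize("hello happy tax payer!").collect();
        assert_eq!(words, vec!["hello", "happy", "tax", "payer"]);
    }
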
@@ -1 +0,0 @@
-

@@ -1,8 +1,8 @@
 use std::fmt;
 use std::fmt::{Debug, Formatter};
-// use std::core::slice;
 use std::io::prelude::Read;
 use core::global::DocId;
+// use std::core::slice;
 // use core::schema::{Field, Term};
 // use std::slice;
 use std::vec;

@@ -13,6 +13,7 @@ pub trait Postings {
 }



+#[derive(Clone)]
 pub struct SimplePostings<R: Read + Clone> {
     reader: R,

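Note: the derive(Clone) added above compiles because the struct's only
field, reader: R, is itself constrained by R: Clone. A small illustration,
assuming only std (Cursor implements both Read and Clone):

    use std::io::{Cursor, Read};

    // Mirrors the commit's struct: Clone can be derived because the only
    // field, reader: R, satisfies the R: Clone bound.
    #[derive(Clone)]
    pub struct SimplePostings<R: Read + Clone> {
        reader: R,
    }

    fn main() {
        let p = SimplePostings { reader: Cursor::new(vec![1u8, 2, 3]) };
        let _p2 = p.clone(); // independent copy of the underlying cursor
    }
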
@@ -1,8 +1,8 @@
 use std::collections::HashMap;
 use std::sync::{Mutex, MutexGuard};

-#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
-pub struct Field(&'static str);
+#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
+pub struct Field(pub &'static str);


 #[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]

@@ -12,7 +12,7 @@ pub struct FieldValue {
 }


-#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
+#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
 pub struct Term<'a> {
     pub field: &'a Field,
     pub text: &'a str,

@@ -32,10 +32,10 @@ impl Document {
         }
     }

-    pub fn set(&mut self, field: &Field, text: &String) {
+    pub fn set(&mut self, field: Field, text: &str) {
         self.add(FieldValue {
-            field: (*field).clone(),
-            text: (*text).clone()
+            field: field,
+            text: String::from(text)
         });
     }

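Note: the Hash derives added to Field and Term are what allow them to
serve as hash-map keys; the writer change below stores a
HashMap<Field, FieldWriter>. A minimal sketch, using only std and a
placeholder value type of my own:

    use std::collections::HashMap;

    // Same tuple struct as the commit, with its now-public field.
    #[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
    pub struct Field(pub &'static str);

    fn main() {
        // HashMap keys must implement Hash + Eq; without the added
        // derive(Hash), this map would not compile.
        let mut writers: HashMap<Field, Vec<u32>> = HashMap::new();
        writers.insert(Field("text"), vec![0, 1]);
        assert!(writers.contains_key(&Field("text")));
    }
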
@@ -2,55 +2,86 @@
 use std::io;
 use core::schema::Document;
 use core::schema::Term;
 use core::schema::Field ;
 use core::analyzer::tokenize;
-use std::collections::HashMap;
-//
-// struct TermDictionary {
-//     map: HashMap<Term, usize>,
-// }
-//
-// struct TermId(usize);
-//
-// impl TermDictionary {
-//
-//     pub fn new() -> TermDictionary {
-//         TermDictionary {
-//             map: HashMap::new(),
-//         }
-//     }
-//
-//     pub fn term_id(&mut self, term: &Term) -> TermId {
-//         match self.map.get(term) {
-//             Some(usize) => { return TermId(usize); },
-//             None => {}
-//         }
-//         let term_id = self.map.len();
-//         self.map.insert(term, term_id);
-//         TermId(term_id)
-//
-//     }
-// }
+use std::collections::{HashMap, BTreeMap};
+use core::DocId;

-struct IndexWriter {
+pub struct PostingsWriter {
+    doc_ids: Vec<DocId>,
+}
+
+impl PostingsWriter {
+    pub fn new()->PostingsWriter {
+        PostingsWriter {
+            doc_ids: Vec::new(),
+        }
+    }
+
+    pub fn suscribe(&mut self, doc_id: DocId) {
+        self.doc_ids.push(doc_id);
+    }
+}
+
+struct FieldWriter {
+    postings: Vec<PostingsWriter>,
+    term_index: BTreeMap<String, usize>,
+}
+
+impl FieldWriter {
+    pub fn new() -> FieldWriter {
+        FieldWriter {
+            term_index: BTreeMap::new(),
+            postings: Vec::new()
+        }
+    }
+
+    pub fn get_postings_writer(&mut self, term_text: &str) -> &mut PostingsWriter {
+        match self.term_index.get(term_text) {
+            Some(unord_id) => {
+                return &mut self.postings[*unord_id];
+            },
+            None => {}
+        }
+        let unord_id = self.term_index.len();
+        self.postings.push(PostingsWriter::new());
+        self.term_index.insert(String::from(term_text), unord_id.clone());
+        &mut self.postings[unord_id]
+    }
+
+    pub fn suscribe(&mut self, doc: DocId, term_text: &str) {
+        self.get_postings_writer(term_text).suscribe(doc);
+    }
+}
+
+pub struct IndexWriter {
     max_doc: usize,
     term_writers: HashMap<Field, FieldWriter>,
 }

 impl IndexWriter {

-    fn suscribe(&mut self, term: &Term, doc_id: usize) {
+    pub fn new() -> IndexWriter {
+        IndexWriter {
+            max_doc: 0,
+            term_writers: HashMap::new(),
+        }
+    }
+
+    fn get_field_writer(&mut self, field: &Field) -> &mut FieldWriter {
+        if !self.term_writers.contains_key(field) {
+            self.term_writers.insert((*field).clone(), FieldWriter::new());
+        }
+        self.term_writers.get_mut(field).unwrap()
+    }
+
     pub fn add(&mut self, doc: Document) {
         let doc_id = self.max_doc;
         for field_value in doc {
+            let field = field_value.field;
+            let field_writer = self.get_field_writer(&field);
             for token in tokenize(&field_value.text) {
-                let term = Term {
-                    field: &field_value.field,
-                    text: &token
-                };
-                self.suscribe(&term, doc_id);
+                field_writer.suscribe(doc_id, token);
             }
         }
         self.max_doc += 1;

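Note: get_postings_writer above is a small interning scheme: term_index
maps each term string to an unordered id, which indexes the parallel
postings vector, and the first occurrence of a term allocates a fresh
PostingsWriter. (suscribe is the commit's own spelling.) On current Rust
the lookup-or-insert step is often written with BTreeMap's entry API; a
hedged sketch of that alternative, not the committed code:

    use std::collections::BTreeMap;

    pub struct PostingsWriter {
        doc_ids: Vec<u32>, // the commit uses its DocId alias here
    }

    pub struct FieldWriter {
        postings: Vec<PostingsWriter>,
        term_index: BTreeMap<String, usize>,
    }

    impl FieldWriter {
        // Lookup-or-insert in one map traversal via the entry API,
        // instead of the commit's get followed by insert.
        pub fn get_postings_writer(&mut self, term_text: &str) -> &mut PostingsWriter {
            let postings = &mut self.postings;
            let unord_id = *self
                .term_index
                .entry(String::from(term_text))
                .or_insert_with(|| {
                    postings.push(PostingsWriter { doc_ids: Vec::new() });
                    postings.len() - 1
                });
            &mut self.postings[unord_id]
        }
    }

    fn main() {
        let mut fw = FieldWriter { postings: Vec::new(), term_index: BTreeMap::new() };
        fw.get_postings_writer("hello").doc_ids.push(0);
        fw.get_postings_writer("hello").doc_ids.push(2);
        assert_eq!(fw.term_index.len(), 1); // "hello" interned once
    }

One trade-off: entry takes an owned String, so this sketch allocates on
every call, while the commit's get-then-insert only allocates on a miss.
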
@@ -4,6 +4,9 @@ extern crate itertools;
 use parici::core::DocId;
 use parici::core::postings::{VecPostings, intersection};
 use parici::core::postings::Postings;
+use parici::core::analyzer::tokenize;
+use parici::core::writer::IndexWriter;
+use parici::core::schema::{Field, Document};

 #[test]
 fn test_intersection() {
@@ -11,5 +14,19 @@ fn test_intersection() {
     let right = VecPostings::new(vec!(3, 4, 9, 18));
     let inter = intersection(&left, &right);
     let vals: Vec<DocId> = inter.iter().collect();
-    itertools::assert_equal(vals, vec!(3, 9));
+    assert_eq!(vals, vec!(3, 9));
 }

+#[test]
+fn test_tokenizer() {
+    let words: Vec<&str> = tokenize("hello happy tax payer!").collect();
+    assert_eq!(words, vec!("hello", "happy", "tax", "payer"));
+}
+
+#[test]
+fn test_indexing() {
+    let mut index_writer = IndexWriter::new();
+    let mut doc = Document::new();
+    doc.set(Field("text"), &String::from("toto"));
+    index_writer.add(doc);
+}

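Note: test_intersection exercises intersection(&left, &right) over
VecPostings, whose implementation is not part of this diff. The classic
shape it implies is a two-pointer merge of two sorted doc-id lists; a
self-contained sketch with hypothetical plain-slice types, not the
crate's actual API (the left-hand list here is made up, the right-hand
values match the test):

    use std::cmp::Ordering;

    // Two-pointer intersection of two sorted, deduplicated doc-id lists.
    fn intersection(left: &[u32], right: &[u32]) -> Vec<u32> {
        let (mut i, mut j) = (0, 0);
        let mut out = Vec::new();
        while i < left.len() && j < right.len() {
            match left[i].cmp(&right[j]) {
                Ordering::Less => i += 1,
                Ordering::Greater => j += 1,
                Ordering::Equal => {
                    out.push(left[i]);
                    i += 1;
                    j += 1;
                }
            }
        }
        out
    }

    fn main() {
        let left = vec![1, 3, 9, 10];
        let right = vec![3, 4, 9, 18];
        assert_eq!(intersection(&left, &right), vec![3, 9]);
    }
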