second commit

Paul Masurel
2016-01-10 18:07:31 +09:00
parent 3f09ec75df
commit 52c0888c81
7 changed files with 107 additions and 48 deletions

View File

@@ -8,3 +8,4 @@ byteorder = "0.4.2"
itertools = "0.4.5"
memmap = "0.2.3"
lazy_static = "0.1.*"
regex = "0.1"
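
The new regex dependency backs the tokenizer rework below. As a rough standalone sketch of how it is meant to be used here, assuming the regex 0.1 era API in which find_iter yields (start, end) byte offsets rather than match objects:

    extern crate regex;
    use regex::Regex;

    fn main() {
        let word_ptn = Regex::new(r"[a-zA-Z0-9]+").unwrap();
        let text = "hello, happy tax payer";
        // In regex 0.1, find_iter yields (usize, usize) byte ranges into `text`.
        for (start, end) in word_ptn.find_iter(text) {
            println!("{}", &text[start..end]);
        }
    }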

View File

@@ -1,6 +1,15 @@
extern crate regex;
use self::regex::Regex;
lazy_static! {
    static ref WORD_PTN: Regex = Regex::new(r"[a-zA-Z0-9]+").unwrap();
}
pub struct TokenIter<'a> {
    text: &'a String,
    text: &'a str,
    token_it: Box<Iterator<Item=(usize, usize)> + 'a>,
}
impl<'a> Iterator for TokenIter<'a> {
@@ -8,13 +17,14 @@ impl<'a> Iterator for TokenIter<'a> {
    type Item = &'a str;
    fn next(&mut self) -> Option<&'a str> {
        None
        self.token_it.next().map(|(start, end)| &self.text[start..end])
    }
}
pub fn tokenize<'a>(text: &'a String)->TokenIter<'a> {
pub fn tokenize<'a>(text: &'a str)->TokenIter<'a> {
    TokenIter {
        text: text
        text: text,
        token_it: Box::new(WORD_PTN.find_iter(text)),
    }
}
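
With the signature switched from &String to &str, callers can hand the tokenizer either borrowed or owned text; a minimal usage sketch (the input string is illustrative):

    let text = String::from("Hello, happy tax payer!");
    // A &String coerces to &str, so owned and borrowed text both work,
    // and tokens are produced lazily as the regex scans the input.
    for token in tokenize(&text) {
        println!("{}", token);
    }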

View File

@@ -1 +0,0 @@

View File

@@ -1,8 +1,8 @@
use std::fmt;
use std::fmt::{Debug, Formatter};
// use std::core::slice;
use std::io::prelude::Read;
use core::global::DocId;
// use std::core::slice;
// use core::schema::{Field, Term};
// use std::slice;
use std::vec;
@@ -13,6 +13,7 @@ pub trait Postings {
}
#[derive(Clone)]
pub struct SimplePostings<R: Read + Clone> {
    reader: R,
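
For context, the intersection exercised in the tests further down reduces to a merge of two sorted DocId sequences. A standalone sketch of that idea, not the crate's actual implementation, with DocId assumed to be an unsigned integer alias:

    type DocId = usize; // assumption made for this sketch only

    fn intersect(left: &[DocId], right: &[DocId]) -> Vec<DocId> {
        let (mut i, mut j) = (0, 0);
        let mut out = Vec::new();
        // Advance whichever cursor sits on the smaller doc id; emit on equality.
        while i < left.len() && j < right.len() {
            if left[i] < right[j] {
                i += 1;
            } else if left[i] > right[j] {
                j += 1;
            } else {
                out.push(left[i]);
                i += 1;
                j += 1;
            }
        }
        out
    }
    // intersect(&[3, 5, 9], &[3, 4, 9, 18]) == vec![3, 9]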

View File

@@ -1,8 +1,8 @@
use std::collections::HashMap;
use std::sync::{Mutex, MutexGuard};
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct Field(&'static str);
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct Field(pub &'static str);
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
@@ -12,7 +12,7 @@ pub struct FieldValue {
}
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct Term<'a> {
    pub field: &'a Field,
    pub text: &'a str,
@@ -32,10 +32,10 @@ impl Document {
        }
    }
    pub fn set(&mut self, field: &Field, text: &String) {
    pub fn set(&mut self, field: Field, text: &str) {
        self.add(FieldValue {
            field: (*field).clone(),
            text: (*text).clone()
            field: field,
            text: String::from(text)
        });
    }
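
With Field now a public tuple struct and set taking the field by value plus any &str, building a document looks roughly like this (field names and text are illustrative):

    let mut doc = Document::new();
    // set takes the Field by value and copies the borrowed text into its own String.
    doc.set(Field("title"), "The quick brown fox");
    doc.set(Field("body"), "jumps over the lazy dog");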

View File

@@ -2,55 +2,86 @@
use std::io;
use core::schema::Document;
use core::schema::Term;
use core::schema::Field ;
use core::analyzer::tokenize;
use std::collections::HashMap;
//
// struct TermDictionary {
// map: HashMap<Term, usize>,
// }
//
// struct TermId(usize);
//
// impl TermDictionary {
//
// pub fn new() -> TermDictionary {
// TermDictionary {
// map: HashMap::new(),
// }
// }
//
// pub fn term_id(&mut self, term: &Term) -> TermId {
// match self.map.get(term) {
// Some(usize) => { return TermId(usize); },
// None => {}
// }
// let term_id = self.map.len();
// self.map.insert(term, term_id);
// TermId(term_id)
//
// }
// }
use std::collections::{HashMap, BTreeMap};
use core::DocId;
struct IndexWriter {
pub struct PostingsWriter {
    doc_ids: Vec<DocId>,
}
impl PostingsWriter {
    pub fn new()->PostingsWriter {
        PostingsWriter {
            doc_ids: Vec::new(),
        }
    }
    pub fn suscribe(&mut self, doc_id: DocId) {
        self.doc_ids.push(doc_id);
    }
}
struct FieldWriter {
    postings: Vec<PostingsWriter>,
    term_index: BTreeMap<String, usize>,
}
impl FieldWriter {
    pub fn new() -> FieldWriter {
        FieldWriter {
            term_index: BTreeMap::new(),
            postings: Vec::new()
        }
    }
    pub fn get_postings_writer(&mut self, term_text: &str) -> &mut PostingsWriter {
        match self.term_index.get(term_text) {
            Some(unord_id) => {
                return &mut self.postings[*unord_id];
            },
            None => {}
        }
        let unord_id = self.term_index.len();
        self.postings.push(PostingsWriter::new());
        self.term_index.insert(String::from(term_text), unord_id.clone());
        &mut self.postings[unord_id]
    }
    pub fn suscribe(&mut self, doc: DocId, term_text: &str) {
        self.get_postings_writer(term_text).suscribe(doc);
    }
}
pub struct IndexWriter {
    max_doc: usize,
    term_writers: HashMap<Field, FieldWriter>,
}
impl IndexWriter {
    fn suscribe(&mut self, term: &Term, doc_id: usize) {
    pub fn new() -> IndexWriter {
        IndexWriter {
            max_doc: 0,
            term_writers: HashMap::new(),
        }
    }
    fn get_field_writer(&mut self, field: &Field) -> &mut FieldWriter {
        if !self.term_writers.contains_key(field) {
            self.term_writers.insert((*field).clone(), FieldWriter::new());
        }
        self.term_writers.get_mut(field).unwrap()
    }
    pub fn add(&mut self, doc: Document) {
        let doc_id = self.max_doc;
        for field_value in doc {
            let field = field_value.field;
            let field_writer = self.get_field_writer(&field);
            for token in tokenize(&field_value.text) {
                let term = Term {
                    field: &field_value.field,
                    text: &token
                };
                self.suscribe(&term, doc_id);
                field_writer.suscribe(doc_id, token);
            }
        }
        self.max_doc += 1;
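
Taken together, the writer now builds a small in-memory inverted index: one FieldWriter per Field, a BTreeMap from term text to an unordered term id, and one PostingsWriter (a plain Vec<DocId>) per term. A rough usage sketch using only the API visible in this diff (field name and text are illustrative):

    let mut index_writer = IndexWriter::new();

    let mut doc = Document::new();
    doc.set(Field("text"), "hello happy tax payer");
    // add() assigns the current max_doc as the DocId, tokenizes every field
    // value, and records the DocId in the matching term's postings before
    // bumping max_doc for the next document.
    index_writer.add(doc);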

View File

@@ -4,6 +4,9 @@ extern crate itertools;
use parici::core::DocId;
use parici::core::postings::{VecPostings, intersection};
use parici::core::postings::Postings;
use parici::core::analyzer::tokenize;
use parici::core::writer::IndexWriter;
use parici::core::schema::{Field, Document};
#[test]
fn test_intersection() {
@@ -11,5 +14,19 @@ fn test_intersection() {
    let right = VecPostings::new(vec!(3, 4, 9, 18));
    let inter = intersection(&left, &right);
    let vals: Vec<DocId> = inter.iter().collect();
    itertools::assert_equal(vals, vec!(3, 9));
    assert_eq!(vals, vec!(3, 9));
}
#[test]
fn test_tokenizer() {
    let words: Vec<&str> = tokenize("hello happy tax payer!").collect();
    assert_eq!(words, vec!("hello", "happy", "tax", "payer"));
}
#[test]
fn test_indexing() {
    let mut index_writer = IndexWriter::new();
    let mut doc = Document::new();
    doc.set(Field("text"), &String::from("toto"));
    index_writer.add(doc);
}
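
One more edge case that the commit does not pin down but that follows directly from the [a-zA-Z0-9]+ pattern: input with no alphanumeric characters should produce no tokens. A purely illustrative extra test:

    #[test]
    fn test_tokenizer_no_tokens() {
        // The word pattern matches nothing here, so the iterator is empty.
        let words: Vec<&str> = tokenize("... !?").collect();
        assert!(words.is_empty());
    }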