diff --git a/TODO.md b/TODO.md index 74e514acb..67b011352 100644 --- a/TODO.md +++ b/TODO.md @@ -13,9 +13,10 @@ find solution to "I have a docaddress but the segment does not exist anymore pro pass over offset from previous block +test untokenized - +test field with more than one value doc values for other types use skip list for each blocks find a clear way to put the tokenized/untokenized thing upstream diff --git a/src/postings/writer.rs b/src/postings/writer.rs index a1a1b4891..38edfddd3 100644 --- a/src/postings/writer.rs +++ b/src/postings/writer.rs @@ -1,5 +1,5 @@ use DocId; -use std::collections::BTreeMap; +use std::collections::HashMap; use schema::Term; use postings::PostingsSerializer; use std::io; @@ -62,7 +62,7 @@ pub trait PostingsWriter { pub struct SpecializedPostingsWriter { postings: Vec>, - term_index: BTreeMap, // remove btree map + term_index: HashMap, } impl SpecializedPostingsWriter { @@ -70,7 +70,7 @@ impl SpecializedPostingsWriter { pub fn new() -> SpecializedPostingsWriter { SpecializedPostingsWriter { postings: Vec::new(), - term_index: BTreeMap::new(), + term_index: HashMap::new(), } } @@ -107,8 +107,13 @@ impl PostingsWriter for SpecializedPostingsWriter } fn serialize(&self, serializer: &mut PostingsSerializer) -> io::Result<()> { - for (term, postings_id) in &self.term_index { - let term_postings_writer = &self.postings[postings_id.clone()]; + let mut term_offsets: Vec<(Term, usize)> = self.term_index + .iter() + .map(|(k,v)| (k.clone(), *v)) // Get rid of the clone + .collect(); + term_offsets.sort(); + for (term, postings_id) in term_offsets { + let term_postings_writer = &self.postings[postings_id]; let term_docfreq = term_postings_writer.doc_freq(); try!(serializer.new_term(&term, term_docfreq)); try!(term_postings_writer.serialize(serializer));