test passing

This commit is contained in:
Paul Masurel
2016-05-19 11:08:50 +09:00
parent 9226338bf0
commit cdcc72a0c9
12 changed files with 152 additions and 46 deletions

View File

@@ -11,3 +11,4 @@ use skip list for each blocks
find a clear way to put the tokenized/untokenized thing upstream
index frequent bigrams
clean up compression
reconsider the first byte == field in the [u8] repr of a term.

View File

@@ -258,6 +258,10 @@ impl fmt::Debug for Segment {
impl Segment {
/// Returns the schema of this segment, as reported by the index it
/// refers to (`self.index.schema()`).
pub fn schema(&self,) -> Schema {
self.index.schema()
}
/// Returns a clone of this segment's `SegmentId`.
pub fn id(&self,) -> SegmentId {
self.segment_id.clone()
}

View File

@@ -74,7 +74,7 @@ impl<'a> PostingsMerger<'a> {
}
postings_merger
}
// pushes the term_reader associated with the given segment ordinal
// into the heap.
fn push_next_segment_el(&mut self, segment_ord: usize) {
@@ -96,7 +96,7 @@ impl<'a> PostingsMerger<'a> {
let offset = self.doc_offsets[heap_item.segment_ord];
let reader = &self.readers[heap_item.segment_ord];
let segment_postings = reader.read_postings(&heap_item.term_info);
let offset_postings = OffsetPostings::new(segment_postings, offset);
let offset_postings = OffsetPostings::new(segment_postings, offset);
segment_postings_list.push(offset_postings);
}
self.push_next_segment_el(heap_item.segment_ord);
@@ -223,11 +223,12 @@ mod tests {
use core::searcher::DocAddress;
use collector::FastFieldTestCollector;
use collector::TestCollector;
use schema::TextIndexingOptions;
#[test]
fn test_index_merger() {
let mut schema = schema::Schema::new();
let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed().set_stored();
let text_fieldtype = schema::TextOptions::new().set_indexing_options(TextIndexingOptions::TokenizedWithFreq).set_stored();
let text_field = schema.add_text_field("text", &text_fieldtype);
let score_fieldtype = schema::U32Options::new().set_fast();
let score_field = schema.add_u32_field("score", &score_fieldtype);

View File

@@ -161,7 +161,7 @@ impl SegmentWriter {
let doc_id = self.max_doc;
for field_value in doc.text_fields() {
let field_options = schema.text_field_options(&field_value.field);
if field_options.is_tokenized_indexed() {
if field_options.indexing_options().is_tokenized() {
let mut tokens = self.tokenizer.tokenize(&field_value.text);
let mut pos = 0u32;
loop {

View File

@@ -63,11 +63,9 @@ mod tests {
#[test]
fn test_indexing() {
let mut schema = schema::Schema::new();
let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed();
let text_field = schema.add_text_field("text", &text_fieldtype);
let text_field = schema.add_text_field("text", &schema::TEXT);
let index = Index::create_from_tempdir(schema).unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1).unwrap();
@@ -99,8 +97,7 @@ mod tests {
#[test]
fn test_searcher() {
let mut schema = schema::Schema::new();
let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed();
let text_field = schema.add_text_field("text", &text_fieldtype);
let text_field = schema.add_text_field("text", &schema::TEXT);
let index = Index::create_in_ram(schema);
{

View File

@@ -1,5 +1,4 @@
use compression::SIMDBlockDecoder;
use DocId;
pub enum FreqHandler {
FreqReader(SIMDBlockDecoder),
@@ -8,6 +7,11 @@ pub enum FreqHandler {
}
impl FreqHandler {
/// Builds the `FreqHandler::FreqReader` variant around a freshly
/// constructed `SIMDBlockDecoder`.
pub fn new_freq_reader() -> FreqHandler {
FreqHandler::FreqReader(SIMDBlockDecoder::new())
}
pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
match *self {
FreqHandler::FreqReader(ref mut block_decoder) => {

View File

@@ -50,7 +50,7 @@ impl<'a> SegmentPostings<'a> {
doc_freq: doc_freq as usize,
doc_offset: 0,
block_decoder: SIMDBlockDecoder::new(),
freq_reader: FreqHandler::NoFreq,
freq_reader: FreqHandler::new_freq_reader(),
remaining_data: data,
cur: Wrapping(usize::max_value()),
}

View File

@@ -1,6 +1,8 @@
use datastruct::FstMapBuilder;
use super::TermInfo;
use schema::Term;
use schema::Schema;
use schema::TextIndexingOptions;
use directory::WritePtr;
use compression::{NUM_DOCS_PER_BLOCK, SIMDBlockEncoder, CompositeEncoder};
use DocId;
@@ -23,8 +25,8 @@ pub struct PostingsSerializer {
doc_ids: Vec<DocId>,
term_freqs: Vec<u32>,
position_deltas: Vec<u32>,
is_termfreq_enabled: bool,
is_positions_enabled: bool,
schema: Schema,
text_indexing_options: TextIndexingOptions,
}
impl PostingsSerializer {
@@ -34,6 +36,7 @@ impl PostingsSerializer {
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS));
let schema = segment.schema();
Ok(PostingsSerializer {
terms_fst_builder: terms_fst_builder,
postings_write: postings_write,
@@ -46,13 +49,26 @@ impl PostingsSerializer {
doc_ids: Vec::new(),
term_freqs: Vec::new(),
position_deltas: Vec::new(),
is_positions_enabled: false,
is_termfreq_enabled: false,
schema: schema,
text_indexing_options: TextIndexingOptions::Unindexed,
})
}
/// Refreshes `self.text_indexing_options` for the given term.
///
/// When the term belongs to a text field, the options are looked up in
/// the schema for that field. Any other term falls back to
/// `TextIndexingOptions::Unindexed`.
pub fn load_indexing_options(&mut self, term: &Term) {
    let indexing_options = if let Some(text_field) = term.get_text_field() {
        // Text term: use whatever the schema declares for its field.
        self.schema.text_field_options(&text_field).indexing_options()
    } else {
        TextIndexingOptions::Unindexed
    };
    self.text_indexing_options = indexing_options;
}
pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
try!(self.close_term());
self.load_indexing_options(term);
self.doc_ids.clear();
self.last_doc_id_encoded = 0;
self.term_freqs.clear();
@@ -72,7 +88,7 @@ impl PostingsSerializer {
self.written_bytes_postings += block_encoded.len();
try!(self.postings_write.write_all(block_encoded));
}
if self.is_termfreq_enabled {
if self.text_indexing_options.is_termfreq_enabled() {
{
let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]);
self.written_bytes_postings += try!(VInt(block_encoded.len() as u64).serialize(&mut self.postings_write));
@@ -81,7 +97,7 @@ impl PostingsSerializer {
}
self.term_freqs.clear();
}
if self.is_positions_enabled {
if self.text_indexing_options.is_position_enabled() {
let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
try!(self.positions_write.write_all(positions_encoded));
self.written_bytes_positions += positions_encoded.len();
@@ -95,13 +111,13 @@ impl PostingsSerializer {
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) -> io::Result<()> {
self.doc_ids.push(doc_id);
if self.is_termfreq_enabled {
if self.text_indexing_options.is_termfreq_enabled() {
self.term_freqs.push(term_freq as u32);
}
if self.is_positions_enabled {
if self.text_indexing_options.is_position_enabled() {
self.position_deltas.extend_from_slice(position_deltas);
}
if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
{
// encode the positions
let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
@@ -109,7 +125,7 @@ impl PostingsSerializer {
try!(self.postings_write.write_all(block_encoded));
self.written_bytes_postings += block_encoded.len();
}
if self.is_termfreq_enabled {
if self.text_indexing_options.is_termfreq_enabled() {
// encode the term_freqs
let block_encoded: &[u8] = self.block_encoder.compress_block_unsorted(&self.term_freqs);
try!(self.postings_write.write_all(block_encoded));
@@ -120,7 +136,7 @@ impl PostingsSerializer {
}
Ok(())
}
pub fn close(mut self,) -> io::Result<()> {
try!(self.close_term());
try!(self.terms_fst_builder.finish());

View File

@@ -12,8 +12,9 @@ pub use self::text_field::TextFieldValue;
pub use self::text_field::TextOptions;
pub use self::text_field::FAST;
pub use self::text_field::TEXT;
pub use self::text_field::STRING;
pub use self::text_field::STORED;
pub use self::text_field::TextIndexingOptions;
pub use self::u32_field::U32Field;
pub use self::u32_field::U32FieldValue;

View File

@@ -33,19 +33,18 @@ pub struct U32FieldEntry {
/// # Examples
///
/// ```
/// use tantivy::schema::{Schema, TextOptions};
/// use tantivy::schema::*;
///
/// fn create_schema() -> Schema {
/// let mut schema = Schema::new();
/// let str_fieldtype = TextOptions::new();
/// let text_fieldtype = TextOptions::new().set_tokenized_indexed();
/// let id_field = schema.add_text_field("id", &str_fieldtype);
/// let url_field = schema.add_text_field("url", &str_fieldtype);
/// let body_field = schema.add_text_field("body", &text_fieldtype);
/// let id_field = schema.add_text_field("id", &str_fieldtype);
/// let url_field = schema.add_text_field("url", &str_fieldtype);
/// let title_field = schema.add_text_field("title", &text_fieldtype);
/// let body_field = schema.add_text_field("body", &text_fieldtype);
/// let id_field = schema.add_text_field("id", &STRING);
/// let url_field = schema.add_text_field("url", &STRING);
/// let body_field = schema.add_text_field("body", &TEXT);
/// let id_field = schema.add_text_field("id", &STRING);
/// let url_field = schema.add_text_field("url", &STRING);
/// let title_field = schema.add_text_field("title", &TEXT);
/// let body_field = schema.add_text_field("body", &TEXT);
/// schema
/// }
///

View File

@@ -24,6 +24,27 @@ impl Term {
}
}
/// Returns the first byte of the term's underlying data.
///
/// Indexing with `[0]` panics if `data` is empty.
/// NOTE(review): presumably every `Term` holds at least this one
/// leading byte — confirm against the constructors.
fn type_num(&self,) -> u8 {
self.data[0]
}
/// Returns `true` when the term is not a text term, i.e. the exact
/// negation of `is_text`.
pub fn is_u32(&self,) -> bool {
!self.is_text()
}
/// Returns `true` when the most significant bit of the term's leading
/// byte (`type_num`) is not set.
pub fn is_text(&self,) -> bool {
    // Mask selecting the top bit of the leading byte (decimal 128).
    const HIGH_BIT: u8 = 0b1000_0000;
    (self.type_num() & HIGH_BIT) == 0
}
/// Returns the text field of this term, or `None` when the term is
/// not a text term.
///
/// For a text term, the field is built from the term's leading byte.
pub fn get_text_field(&self,) -> Option<TextField> {
    if !self.is_text() {
        return None;
    }
    Some(TextField(self.type_num()))
}
pub fn from_field_text(field: &TextField, text: &str) -> Term {
let mut buffer = Vec::with_capacity(1 + text.len());
let TextField(field_idx) = *field;

View File

@@ -10,17 +10,74 @@ use std::ops::BitOr;
/// Handle identifying a text field, as a newtype over a `u8`.
///
/// NOTE(review): the `u8` is presumably the field's ordinal in the
/// schema — confirm.
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct TextField(pub u8);
/// Describes how a text field is indexed.
///
/// The predicates `is_tokenized`, `is_termfreq_enabled` and
/// `is_position_enabled` tell what each variant enables.
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash, RustcDecodable, RustcEncodable)]
pub enum TextIndexingOptions {
/// None of the predicates hold. This is the value set by
/// `TextOptions::new()` and the neutral element of `|`.
Unindexed,
/// Not tokenized; no term frequencies, no positions.
/// Used by the `STRING` preset.
Untokenized,
/// Tokenized; no term frequencies, no positions.
TokenizedNoFreq,
/// Tokenized, with term frequencies but no positions.
TokenizedWithFreq,
/// Tokenized, with term frequencies and positions.
/// Used by the `TEXT` preset.
TokenizedWithFreqAndPosition,
}
impl TextIndexingOptions {

    /// Returns `true` for the variants that record term frequencies:
    /// `TokenizedWithFreq` and `TokenizedWithFreqAndPosition`.
    pub fn is_termfreq_enabled(&self) -> bool {
        match *self {
            TextIndexingOptions::TokenizedWithFreq |
            TextIndexingOptions::TokenizedWithFreqAndPosition => true,
            _ => false,
        }
    }

    /// Returns `true` for every `Tokenized*` variant, and `false` for
    /// `Unindexed` and `Untokenized`.
    pub fn is_tokenized(&self,) -> bool {
        match *self {
            TextIndexingOptions::Unindexed |
            TextIndexingOptions::Untokenized => false,
            TextIndexingOptions::TokenizedNoFreq |
            TextIndexingOptions::TokenizedWithFreq |
            TextIndexingOptions::TokenizedWithFreqAndPosition => true,
        }
    }

    /// Returns `true` only for `TokenizedWithFreqAndPosition`.
    pub fn is_position_enabled(&self,) -> bool {
        *self == TextIndexingOptions::TokenizedWithFreqAndPosition
    }
}
impl BitOr for TextIndexingOptions {
type Output = TextIndexingOptions;
fn bitor(self, other: TextIndexingOptions) -> TextIndexingOptions {
use super::TextIndexingOptions::*;
if self == Unindexed {
other
}
else if other == Unindexed {
self
}
else if self == other {
self
}
else {
// make it possible
panic!("Combining {:?} and {:?} is ambiguous");
}
}
}
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
pub struct TextOptions {
tokenized_indexed: bool,
indexing_options: TextIndexingOptions,
stored: bool,
fast: bool,
}
impl TextOptions {
pub fn is_tokenized_indexed(&self,) -> bool {
self.tokenized_indexed
pub fn indexing_options(&self,) -> TextIndexingOptions {
self.indexing_options.clone()
}
pub fn is_stored(&self,) -> bool {
@@ -41,15 +98,15 @@ impl TextOptions {
self
}
pub fn set_tokenized_indexed(mut self,) -> TextOptions {
self.tokenized_indexed = true;
pub fn set_indexing_options(mut self, indexing_options: TextIndexingOptions) -> TextOptions {
self.indexing_options = indexing_options;
self
}
pub fn new() -> TextOptions {
TextOptions {
fast: false,
tokenized_indexed: false,
indexing_options: TextIndexingOptions::Unindexed,
stored: false,
}
}
@@ -94,12 +151,17 @@ pub struct TextFieldValue {
}
/// The field will be indexed without being tokenized
/// (`TextIndexingOptions::Untokenized`).
///
/// This preset is neither stored nor fast.
pub const STRING: TextOptions = TextOptions {
indexing_options: TextIndexingOptions::Untokenized,
stored: false,
fast: false,
};
/// The field will be tokenized and indexed
pub const TEXT: TextOptions = TextOptions {
tokenized_indexed: true,
indexing_options: TextIndexingOptions::TokenizedWithFreqAndPosition,
stored: false,
fast: false,
};
@@ -109,7 +171,7 @@ pub const TEXT: TextOptions = TextOptions {
/// Reading the stored fields of a document is relatively slow.
/// (100 microsecs)
pub const STORED: TextOptions = TextOptions {
tokenized_indexed: false,
indexing_options: TextIndexingOptions::Unindexed,
stored: true,
fast: false,
};
@@ -117,7 +179,7 @@ pub const STORED: TextOptions = TextOptions {
/// Fast fields are used for fields you need to access many times during
/// collection (e.g. for sorting or aggregates).
pub const FAST: TextOptions = TextOptions {
tokenized_indexed: false,
indexing_options: TextIndexingOptions::Unindexed,
stored: false,
fast: true
};
@@ -129,7 +191,7 @@ impl BitOr for TextOptions {
fn bitor(self, other: TextOptions) -> TextOptions {
let mut res = TextOptions::new();
res.tokenized_indexed = self.tokenized_indexed || other.tokenized_indexed;
res.indexing_options = self.indexing_options | other.indexing_options;
res.stored = self.stored || other.stored;
res.fast = self.fast || other.fast;
res
@@ -148,19 +210,19 @@ mod tests {
let field_options = STORED | FAST;
assert!(field_options.is_stored());
assert!(field_options.is_fast());
assert!(!field_options.is_tokenized_indexed());
assert!(!field_options.indexing_options().is_tokenized());
}
{
let field_options = STORED | TEXT;
assert!(field_options.is_stored());
assert!(!field_options.is_fast());
assert!(field_options.is_tokenized_indexed());
assert!(field_options.indexing_options().is_tokenized());
}
{
let mut schema = Schema::new();
let _body_field: TextField = schema.add_text_field("body", &TEXT);
let field = schema.text_field("body");
assert!(schema.text_field_options(&field).is_tokenized_indexed());
assert!(schema.text_field_options(&field).indexing_options().is_tokenized());
}
}
}