mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-30 15:10:40 +00:00
test passing
This commit is contained in:
1
TODO.md
1
TODO.md
@@ -11,3 +11,4 @@ use skip list for each blocks
|
||||
find a clear way to put the tokenized/untokenized thing upstream
|
||||
index frequent bigrams
|
||||
clean up compression
|
||||
reconsider the first byte == field in the [u8] repr of a term.
|
||||
|
||||
@@ -258,6 +258,10 @@ impl fmt::Debug for Segment {
|
||||
|
||||
impl Segment {
|
||||
|
||||
/// Returns the schema reported by this segment's `index`.
pub fn schema(&self) -> Schema {
    let index = &self.index;
    index.schema()
}
|
||||
|
||||
/// Returns a clone of this segment's `segment_id`.
pub fn id(&self) -> SegmentId {
    let segment_id = &self.segment_id;
    segment_id.clone()
}
|
||||
|
||||
@@ -74,7 +74,7 @@ impl<'a> PostingsMerger<'a> {
|
||||
}
|
||||
postings_merger
|
||||
}
|
||||
|
||||
|
||||
// pushes the term_reader associated with the given segment ordinal
|
||||
// into the heap.
|
||||
fn push_next_segment_el(&mut self, segment_ord: usize) {
|
||||
@@ -96,7 +96,7 @@ impl<'a> PostingsMerger<'a> {
|
||||
let offset = self.doc_offsets[heap_item.segment_ord];
|
||||
let reader = &self.readers[heap_item.segment_ord];
|
||||
let segment_postings = reader.read_postings(&heap_item.term_info);
|
||||
let offset_postings = OffsetPostings::new(segment_postings, offset);
|
||||
let offset_postings = OffsetPostings::new(segment_postings, offset);
|
||||
segment_postings_list.push(offset_postings);
|
||||
}
|
||||
self.push_next_segment_el(heap_item.segment_ord);
|
||||
@@ -223,11 +223,12 @@ mod tests {
|
||||
use core::searcher::DocAddress;
|
||||
use collector::FastFieldTestCollector;
|
||||
use collector::TestCollector;
|
||||
use schema::TextIndexingOptions;
|
||||
|
||||
#[test]
|
||||
fn test_index_merger() {
|
||||
let mut schema = schema::Schema::new();
|
||||
let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed().set_stored();
|
||||
let text_fieldtype = schema::TextOptions::new().set_indexing_options(TextIndexingOptions::TokenizedWithFreq).set_stored();
|
||||
let text_field = schema.add_text_field("text", &text_fieldtype);
|
||||
let score_fieldtype = schema::U32Options::new().set_fast();
|
||||
let score_field = schema.add_u32_field("score", &score_fieldtype);
|
||||
|
||||
@@ -161,7 +161,7 @@ impl SegmentWriter {
|
||||
let doc_id = self.max_doc;
|
||||
for field_value in doc.text_fields() {
|
||||
let field_options = schema.text_field_options(&field_value.field);
|
||||
if field_options.is_tokenized_indexed() {
|
||||
if field_options.indexing_options().is_tokenized() {
|
||||
let mut tokens = self.tokenizer.tokenize(&field_value.text);
|
||||
let mut pos = 0u32;
|
||||
loop {
|
||||
|
||||
@@ -63,11 +63,9 @@ mod tests {
|
||||
#[test]
|
||||
fn test_indexing() {
|
||||
let mut schema = schema::Schema::new();
|
||||
let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed();
|
||||
let text_field = schema.add_text_field("text", &text_fieldtype);
|
||||
let text_field = schema.add_text_field("text", &schema::TEXT);
|
||||
|
||||
let index = Index::create_from_tempdir(schema).unwrap();
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1).unwrap();
|
||||
@@ -99,8 +97,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_searcher() {
|
||||
let mut schema = schema::Schema::new();
|
||||
let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed();
|
||||
let text_field = schema.add_text_field("text", &text_fieldtype);
|
||||
let text_field = schema.add_text_field("text", &schema::TEXT);
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use compression::SIMDBlockDecoder;
|
||||
use DocId;
|
||||
|
||||
pub enum FreqHandler {
|
||||
FreqReader(SIMDBlockDecoder),
|
||||
@@ -8,6 +7,11 @@ pub enum FreqHandler {
|
||||
}
|
||||
|
||||
impl FreqHandler {
|
||||
|
||||
pub fn new_freq_reader() -> FreqHandler {
|
||||
FreqHandler::FreqReader(SIMDBlockDecoder::new())
|
||||
}
|
||||
|
||||
pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
|
||||
match *self {
|
||||
FreqHandler::FreqReader(ref mut block_decoder) => {
|
||||
|
||||
@@ -50,7 +50,7 @@ impl<'a> SegmentPostings<'a> {
|
||||
doc_freq: doc_freq as usize,
|
||||
doc_offset: 0,
|
||||
block_decoder: SIMDBlockDecoder::new(),
|
||||
freq_reader: FreqHandler::NoFreq,
|
||||
freq_reader: FreqHandler::new_freq_reader(),
|
||||
remaining_data: data,
|
||||
cur: Wrapping(usize::max_value()),
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use datastruct::FstMapBuilder;
|
||||
use super::TermInfo;
|
||||
use schema::Term;
|
||||
use schema::Schema;
|
||||
use schema::TextIndexingOptions;
|
||||
use directory::WritePtr;
|
||||
use compression::{NUM_DOCS_PER_BLOCK, SIMDBlockEncoder, CompositeEncoder};
|
||||
use DocId;
|
||||
@@ -23,8 +25,8 @@ pub struct PostingsSerializer {
|
||||
doc_ids: Vec<DocId>,
|
||||
term_freqs: Vec<u32>,
|
||||
position_deltas: Vec<u32>,
|
||||
is_termfreq_enabled: bool,
|
||||
is_positions_enabled: bool,
|
||||
schema: Schema,
|
||||
text_indexing_options: TextIndexingOptions,
|
||||
}
|
||||
|
||||
impl PostingsSerializer {
|
||||
@@ -34,6 +36,7 @@ impl PostingsSerializer {
|
||||
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
|
||||
let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
|
||||
let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS));
|
||||
let schema = segment.schema();
|
||||
Ok(PostingsSerializer {
|
||||
terms_fst_builder: terms_fst_builder,
|
||||
postings_write: postings_write,
|
||||
@@ -46,13 +49,26 @@ impl PostingsSerializer {
|
||||
doc_ids: Vec::new(),
|
||||
term_freqs: Vec::new(),
|
||||
position_deltas: Vec::new(),
|
||||
is_positions_enabled: false,
|
||||
is_termfreq_enabled: false,
|
||||
schema: schema,
|
||||
text_indexing_options: TextIndexingOptions::Unindexed,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn load_indexing_options(&mut self, term: &Term) {
|
||||
self.text_indexing_options = match term.get_text_field() {
|
||||
Some(text_field) => {
|
||||
let text_options = self.schema.text_field_options(&text_field);
|
||||
text_options.indexing_options()
|
||||
}
|
||||
None => {
|
||||
TextIndexingOptions::Unindexed
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
|
||||
try!(self.close_term());
|
||||
self.load_indexing_options(term);
|
||||
self.doc_ids.clear();
|
||||
self.last_doc_id_encoded = 0;
|
||||
self.term_freqs.clear();
|
||||
@@ -72,7 +88,7 @@ impl PostingsSerializer {
|
||||
self.written_bytes_postings += block_encoded.len();
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
}
|
||||
if self.is_termfreq_enabled {
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
{
|
||||
let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]);
|
||||
self.written_bytes_postings += try!(VInt(block_encoded.len() as u64).serialize(&mut self.postings_write));
|
||||
@@ -81,7 +97,7 @@ impl PostingsSerializer {
|
||||
}
|
||||
self.term_freqs.clear();
|
||||
}
|
||||
if self.is_positions_enabled {
|
||||
if self.text_indexing_options.is_position_enabled() {
|
||||
let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
|
||||
try!(self.positions_write.write_all(positions_encoded));
|
||||
self.written_bytes_positions += positions_encoded.len();
|
||||
@@ -95,13 +111,13 @@ impl PostingsSerializer {
|
||||
|
||||
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) -> io::Result<()> {
|
||||
self.doc_ids.push(doc_id);
|
||||
if self.is_termfreq_enabled {
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
self.term_freqs.push(term_freq as u32);
|
||||
}
|
||||
if self.is_positions_enabled {
|
||||
if self.text_indexing_options.is_position_enabled() {
|
||||
self.position_deltas.extend_from_slice(position_deltas);
|
||||
}
|
||||
if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
|
||||
if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
|
||||
{
|
||||
// encode the doc ids
|
||||
let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
@@ -109,7 +125,7 @@ impl PostingsSerializer {
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
self.written_bytes_postings += block_encoded.len();
|
||||
}
|
||||
if self.is_termfreq_enabled {
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
// encode the term_freqs
|
||||
let block_encoded: &[u8] = self.block_encoder.compress_block_unsorted(&self.term_freqs);
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
@@ -120,7 +136,7 @@ impl PostingsSerializer {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
pub fn close(mut self,) -> io::Result<()> {
|
||||
try!(self.close_term());
|
||||
try!(self.terms_fst_builder.finish());
|
||||
|
||||
@@ -12,8 +12,9 @@ pub use self::text_field::TextFieldValue;
|
||||
pub use self::text_field::TextOptions;
|
||||
pub use self::text_field::FAST;
|
||||
pub use self::text_field::TEXT;
|
||||
pub use self::text_field::STRING;
|
||||
pub use self::text_field::STORED;
|
||||
|
||||
pub use self::text_field::TextIndexingOptions;
|
||||
|
||||
pub use self::u32_field::U32Field;
|
||||
pub use self::u32_field::U32FieldValue;
|
||||
|
||||
@@ -33,19 +33,18 @@ pub struct U32FieldEntry {
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use tantivy::schema::{Schema, TextOptions};
|
||||
/// use tantivy::schema::*;
|
||||
///
|
||||
/// fn create_schema() -> Schema {
|
||||
/// let mut schema = Schema::new();
|
||||
/// let str_fieldtype = TextOptions::new();
|
||||
/// let text_fieldtype = TextOptions::new().set_tokenized_indexed();
|
||||
/// let id_field = schema.add_text_field("id", &str_fieldtype);
|
||||
/// let url_field = schema.add_text_field("url", &str_fieldtype);
|
||||
/// let body_field = schema.add_text_field("body", &text_fieldtype);
|
||||
/// let id_field = schema.add_text_field("id", &str_fieldtype);
|
||||
/// let url_field = schema.add_text_field("url", &str_fieldtype);
|
||||
/// let title_field = schema.add_text_field("title", &text_fieldtype);
|
||||
/// let body_field = schema.add_text_field("body", &text_fieldtype);
|
||||
/// let id_field = schema.add_text_field("id", &STRING);
|
||||
/// let url_field = schema.add_text_field("url", &STRING);
|
||||
/// let body_field = schema.add_text_field("body", &TEXT);
|
||||
/// let id_field = schema.add_text_field("id", &STRING);
|
||||
/// let url_field = schema.add_text_field("url", &STRING);
|
||||
/// let title_field = schema.add_text_field("title", &TEXT);
|
||||
/// let body_field = schema.add_text_field("body", &TEXT);
|
||||
/// schema
|
||||
/// }
|
||||
///
|
||||
|
||||
@@ -24,6 +24,27 @@ impl Term {
|
||||
}
|
||||
}
|
||||
|
||||
fn type_num(&self,) -> u8 {
|
||||
self.data[0]
|
||||
}
|
||||
|
||||
pub fn is_u32(&self,) -> bool {
|
||||
!self.is_text()
|
||||
}
|
||||
|
||||
pub fn is_text(&self,) -> bool {
|
||||
self.type_num() & 128 == 0
|
||||
}
|
||||
|
||||
pub fn get_text_field(&self,) -> Option<TextField> {
|
||||
if self.is_text() {
|
||||
Some(TextField(self.type_num()))
|
||||
}
|
||||
else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_field_text(field: &TextField, text: &str) -> Term {
|
||||
let mut buffer = Vec::with_capacity(1 + text.len());
|
||||
let TextField(field_idx) = *field;
|
||||
|
||||
@@ -10,17 +10,74 @@ use std::ops::BitOr;
|
||||
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
|
||||
pub struct TextField(pub u8);
|
||||
|
||||
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash, RustcDecodable, RustcEncodable)]
|
||||
pub enum TextIndexingOptions {
|
||||
Unindexed,
|
||||
Untokenized,
|
||||
TokenizedNoFreq,
|
||||
TokenizedWithFreq,
|
||||
TokenizedWithFreqAndPosition,
|
||||
}
|
||||
|
||||
impl TextIndexingOptions {
|
||||
pub fn is_termfreq_enabled(&self) -> bool {
|
||||
match *self {
|
||||
TextIndexingOptions::TokenizedWithFreq => true,
|
||||
TextIndexingOptions::TokenizedWithFreqAndPosition => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_tokenized(&self,) -> bool {
|
||||
match *self {
|
||||
TextIndexingOptions::TokenizedNoFreq => true,
|
||||
TextIndexingOptions::TokenizedWithFreq => true,
|
||||
TextIndexingOptions::TokenizedWithFreqAndPosition => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_position_enabled(&self,) -> bool {
|
||||
match *self {
|
||||
TextIndexingOptions::TokenizedWithFreqAndPosition => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl BitOr for TextIndexingOptions {
|
||||
type Output = TextIndexingOptions;
|
||||
|
||||
fn bitor(self, other: TextIndexingOptions) -> TextIndexingOptions {
|
||||
use super::TextIndexingOptions::*;
|
||||
if self == Unindexed {
|
||||
other
|
||||
}
|
||||
else if other == Unindexed {
|
||||
self
|
||||
}
|
||||
else if self == other {
|
||||
self
|
||||
}
|
||||
else {
|
||||
// make it possible
|
||||
panic!("Combining {:?} and {:?} is ambiguous");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
|
||||
pub struct TextOptions {
|
||||
tokenized_indexed: bool,
|
||||
indexing_options: TextIndexingOptions,
|
||||
stored: bool,
|
||||
fast: bool,
|
||||
}
|
||||
|
||||
impl TextOptions {
|
||||
pub fn is_tokenized_indexed(&self,) -> bool {
|
||||
self.tokenized_indexed
|
||||
|
||||
pub fn indexing_options(&self,) -> TextIndexingOptions {
|
||||
self.indexing_options.clone()
|
||||
}
|
||||
|
||||
pub fn is_stored(&self,) -> bool {
|
||||
@@ -41,15 +98,15 @@ impl TextOptions {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn set_tokenized_indexed(mut self,) -> TextOptions {
|
||||
self.tokenized_indexed = true;
|
||||
pub fn set_indexing_options(mut self, indexing_options: TextIndexingOptions) -> TextOptions {
|
||||
self.indexing_options = indexing_options;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn new() -> TextOptions {
|
||||
TextOptions {
|
||||
fast: false,
|
||||
tokenized_indexed: false,
|
||||
indexing_options: TextIndexingOptions::Unindexed,
|
||||
stored: false,
|
||||
}
|
||||
}
|
||||
@@ -94,12 +151,17 @@ pub struct TextFieldValue {
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// The field will be untokenized and indexed
|
||||
pub const STRING: TextOptions = TextOptions {
|
||||
indexing_options: TextIndexingOptions::Untokenized,
|
||||
stored: false,
|
||||
fast: false,
|
||||
};
|
||||
|
||||
|
||||
/// The field will be tokenized and indexed
|
||||
pub const TEXT: TextOptions = TextOptions {
|
||||
tokenized_indexed: true,
|
||||
indexing_options: TextIndexingOptions::TokenizedWithFreqAndPosition,
|
||||
stored: false,
|
||||
fast: false,
|
||||
};
|
||||
@@ -109,7 +171,7 @@ pub const TEXT: TextOptions = TextOptions {
|
||||
/// Reading the stored fields of a document is relatively slow.
|
||||
/// (100 microsecs)
|
||||
pub const STORED: TextOptions = TextOptions {
|
||||
tokenized_indexed: false,
|
||||
indexing_options: TextIndexingOptions::Unindexed,
|
||||
stored: true,
|
||||
fast: false,
|
||||
};
|
||||
@@ -117,7 +179,7 @@ pub const STORED: TextOptions = TextOptions {
|
||||
/// Fast fields are used for fields you need to access many times during
|
||||
/// collection. (e.g: for sort, aggregates).
|
||||
pub const FAST: TextOptions = TextOptions {
|
||||
tokenized_indexed: false,
|
||||
indexing_options: TextIndexingOptions::Unindexed,
|
||||
stored: false,
|
||||
fast: true
|
||||
};
|
||||
@@ -129,7 +191,7 @@ impl BitOr for TextOptions {
|
||||
|
||||
fn bitor(self, other: TextOptions) -> TextOptions {
|
||||
let mut res = TextOptions::new();
|
||||
res.tokenized_indexed = self.tokenized_indexed || other.tokenized_indexed;
|
||||
res.indexing_options = self.indexing_options | other.indexing_options;
|
||||
res.stored = self.stored || other.stored;
|
||||
res.fast = self.fast || other.fast;
|
||||
res
|
||||
@@ -148,19 +210,19 @@ mod tests {
|
||||
let field_options = STORED | FAST;
|
||||
assert!(field_options.is_stored());
|
||||
assert!(field_options.is_fast());
|
||||
assert!(!field_options.is_tokenized_indexed());
|
||||
assert!(!field_options.indexing_options().is_tokenized());
|
||||
}
|
||||
{
|
||||
let field_options = STORED | TEXT;
|
||||
assert!(field_options.is_stored());
|
||||
assert!(!field_options.is_fast());
|
||||
assert!(field_options.is_tokenized_indexed());
|
||||
assert!(field_options.indexing_options().is_tokenized());
|
||||
}
|
||||
{
|
||||
let mut schema = Schema::new();
|
||||
let _body_field: TextField = schema.add_text_field("body", &TEXT);
|
||||
let field = schema.text_field("body");
|
||||
assert!(schema.text_field_options(&field).is_tokenized_indexed());
|
||||
assert!(schema.text_field_options(&field).indexing_options().is_tokenized());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user