* Integrating bugfixes into master. Closes #274, Closes #280, Closes #289.
* Next version will be 0.6.
CHANGELOG.md
@@ -1,11 +1,18 @@
-Tantivy 0.5.2
+Tantivy 0.6
 ==========================
+
+- Removed C code. Tantivy is now pure Rust.
+- BM25
+- Approximate field norms encoded over 1 byte.
+- Compiles on stable rust
+
+Tantivy 0.5.2
+===========================
 - bugfix #274
 - bugfix #280
 - bugfix #289
 
 Tantivy 0.5.1
 ==========================
 - bugfix #254: tantivy failed if no documents in a segment contained a specific field.
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.5.2"
+version = "0.6.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -119,6 +119,19 @@ impl SegmentReader {
         }
     }
 
+    pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
+        &self,
+        field: Field,
+        idx: usize
+    ) -> fastfield::Result<FastFieldReader<Item>> {
+        if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
+            Ok(FastFieldReader::open(ff_source))
+        } else {
+            let field_entry = self.schema.get_field_entry(field);
+            Err(FastFieldNotAvailableError::new(field_entry))
+        }
+    }
+
     /// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
     /// May panic if the field is not a multivalued fastfield of the type `Item`.
     pub fn multi_fast_field_reader<Item: FastValue>(
@@ -128,14 +141,8 @@ impl SegmentReader {
         let field_entry = self.schema.get_field_entry(field);
         if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
         {
-            let idx_reader = self.fast_fields_composite
-                .open_read_with_idx(field, 0)
-                .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
-                .map(FastFieldReader::open)?;
-            let vals_reader = self.fast_fields_composite
-                .open_read_with_idx(field, 1)
-                .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
-                .map(FastFieldReader::open)?;
+            let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
+            let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
             Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
         } else {
             Err(FastFieldNotAvailableError::new(field_entry))
@@ -26,13 +26,31 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
         }
     }
 
+    /// Returns `(start, stop)`, such that the values associated
+    /// to the given document are `start..stop`.
+    fn range(&self, doc: DocId) -> (u64, u64) {
+        let start = self.idx_reader.get(doc);
+        let stop = self.idx_reader.get(doc + 1);
+        (start, stop)
+    }
+
+    /// Returns the number of values associated to a given document.
+    pub fn num_vals(&self, doc: DocId) -> usize {
+        let (start, stop) = self.range(doc);
+        (stop - start) as usize
+    }
+
+    /// Returns the overall number of values associated to documents.
+    pub(crate) fn total_num_vals(&self) -> u64 {
+        self.idx_reader.max_value()
+    }
+
     /// Returns the array of values associated to the given `doc`.
     pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
-        let start = self.idx_reader.get(doc) as u32;
-        let stop = self.idx_reader.get(doc + 1) as u32;
+        let (start, stop) = self.range(doc);
         let len = (stop - start) as usize;
         vals.resize(len, Item::default());
-        self.vals_reader.get_range(start, &mut vals[..]);
+        self.vals_reader.get_range(start as u32, &mut vals[..]);
     }
 }
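
For readers unfamiliar with the layout behind `range` and `get_vals` above: a multivalued fast field is backed by two flat columns, an offset index with one entry per document plus a sentinel, and a concatenated value array. The following standalone sketch mirrors that logic with plain `Vec<u64>`s instead of tantivy's bit-packed readers; the `MultiValuedColumn` name and its fields are invented for the illustration.

/// Minimal stand-in for a multivalued fast field: `offsets[doc]..offsets[doc + 1]`
/// delimits the slice of `values` that belongs to `doc`.
struct MultiValuedColumn {
    offsets: Vec<u64>, // length = num_docs + 1, monotonically increasing
    values: Vec<u64>,  // all values, concatenated in doc order
}

impl MultiValuedColumn {
    fn range(&self, doc: usize) -> (u64, u64) {
        (self.offsets[doc], self.offsets[doc + 1])
    }

    fn get_vals(&self, doc: usize, vals: &mut Vec<u64>) {
        let (start, stop) = self.range(doc);
        vals.clear();
        vals.extend_from_slice(&self.values[start as usize..stop as usize]);
    }
}

fn main() {
    // doc 0 -> [7, 8], doc 1 -> [], doc 2 -> [9]
    let column = MultiValuedColumn {
        offsets: vec![0, 2, 2, 3],
        values: vec![7, 8, 9],
    };
    let mut vals = Vec::new();
    column.get_vals(0, &mut vals);
    assert_eq!(vals, vec![7, 8]);
    column.get_vals(1, &mut vals);
    assert!(vals.is_empty());
}
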
@@ -3,6 +3,7 @@ use fastfield::value_to_u64;
 use fastfield::FastFieldSerializer;
 use itertools::Itertools;
 use postings::UnorderedTermId;
+use termdict::TermOrdinal;
 use schema::{Document, Field};
 use std::collections::HashMap;
 use std::io;
@@ -101,7 +102,7 @@ impl MultiValueIntFastFieldWriter {
     pub fn serialize(
         &self,
         serializer: &mut FastFieldSerializer,
-        mapping_opt: Option<&HashMap<UnorderedTermId, usize>>,
+        mapping_opt: Option<&HashMap<UnorderedTermId, TermOrdinal>>,
     ) -> io::Result<()> {
         {
             // writing the offset index
@@ -125,13 +126,13 @@ impl MultiValueIntFastFieldWriter {
                     1,
                 )?;
                 for val in &self.vals {
-                    let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64;
+                    let remapped_val = *mapping.get(val).expect("Missing term ordinal");
                    value_serializer.add_val(remapped_val)?;
                 }
             }
             None => {
                 let val_min_max = self.vals.iter().cloned().minmax();
-                let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0));
+                let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0u64));
                 value_serializer =
                     serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
                 for &val in &self.vals {
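
The `mapping_opt` change above exists so that the values written for a text or facet field are term ordinals (positions in the sorted term dictionary) rather than the unordered ids handed out during indexing. A minimal standalone sketch of that remap-then-write step, with bare `u64` ids standing in for `UnorderedTermId` and `TermOrdinal`, and a plain `Vec` in place of tantivy's serializer:

use std::collections::HashMap;

// Unordered ids recorded at indexing time are replaced by the term ordinals
// found in the unordered-id -> term-ordinal mapping.
fn remap_vals(vals: &[u64], mapping: &HashMap<u64, u64>) -> Vec<u64> {
    vals.iter()
        .map(|val| *mapping.get(val).expect("Missing term ordinal"))
        .collect()
}

fn main() {
    let mapping: HashMap<u64, u64> = vec![(10, 0), (3, 1), (7, 2)].into_iter().collect();
    // Values recorded per document, still expressed as unordered ids.
    let recorded = vec![7, 10, 10, 3];
    assert_eq!(remap_vals(&recorded, &mapping), vec![2, 0, 0, 1]);
}
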
@@ -71,6 +71,9 @@ impl<Item: FastValue> FastFieldReader<Item> {
     ///
     /// May panic if `start + output.len()` is greater than
     /// the segment's `maxdoc`.
+    ///
+    // TODO change start to `u64`.
+    // For multifastfield, start is an index in a second fastfield, not a `DocId`
     pub fn get_range(&self, start: u32, output: &mut [Item]) {
         let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
         self.bit_unpacker.get_range(start, output_u64);
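
For context on the hunk above, `get_range` fills `output` with the `output.len()` consecutive values stored at positions `start`, `start + 1`, and so on; the TODO notes that for multivalued fields `start` is an index into the values column rather than a `DocId`. A trivial stand-in over a plain `Vec<u64>` (no bit packing, no `transmute`), with invented names:

struct PlainFastFieldReader {
    values: Vec<u64>,
}

impl PlainFastFieldReader {
    fn get(&self, idx: u32) -> u64 {
        self.values[idx as usize]
    }

    /// Fills `output` with the values stored at `start..start + output.len()`.
    fn get_range(&self, start: u32, output: &mut [u64]) {
        let start = start as usize;
        output.copy_from_slice(&self.values[start..start + output.len()]);
    }
}

fn main() {
    let reader = PlainFastFieldReader { values: vec![10, 11, 12, 13] };
    let mut out = [0u64; 2];
    reader.get_range(1, &mut out);
    assert_eq!(out, [11, 12]);
    assert_eq!(reader.get(3), 13);
}
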
@@ -77,11 +77,21 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
 }
 
 impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
+
+    /// Creates a new fast field serializer.
+    ///
+    /// The serializer in fact encodes the values by bitpacking
+    /// `(val - min_value)`.
+    ///
+    /// It requires a `min_value` and a `max_value` to compute
+    /// the minimum number of bits required to encode
+    /// values.
     fn open(
         write: &'a mut W,
         min_value: u64,
         max_value: u64,
     ) -> io::Result<FastSingleFieldSerializer<'a, W>> {
         assert!(min_value <= max_value);
         min_value.serialize(write)?;
         let amplitude = max_value - min_value;
         amplitude.serialize(write)?;
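
As a rough illustration of the bitpacking scheme described in the doc comment above, the width of each packed value depends only on the amplitude `max_value - min_value`. This is a sketch of how the per-value bit width could be derived, not tantivy's actual `BitPacker`:

/// Number of bits needed to encode any value in `min_value..=max_value`
/// once it has been shifted down by `min_value`.
fn num_bits(min_value: u64, max_value: u64) -> u8 {
    assert!(min_value <= max_value);
    let amplitude = max_value - min_value;
    // 64 - leading_zeros is the position of the highest set bit.
    (64 - amplitude.leading_zeros()) as u8
}

fn main() {
    assert_eq!(num_bits(10, 10), 0); // constant column: 0 bits per value
    assert_eq!(num_bits(100, 103), 2); // amplitude 3 fits in 2 bits
    assert_eq!(num_bits(0, 255), 8);
}
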
@@ -1,13 +1,11 @@
-use super::multivalued::MultiValueIntFastFieldWriter;
-use common;
-use common::BinarySerializable;
-use common::VInt;
+use common::{self, VInt, BinarySerializable};
 use fastfield::FastFieldSerializer;
 use postings::UnorderedTermId;
-use schema::FieldType;
-use schema::{Cardinality, Document, Field, Schema};
+use schema::{FieldType, Cardinality, Document, Field, Schema};
 use std::collections::HashMap;
 use std::io;
+use super::multivalued::MultiValueIntFastFieldWriter;
+use termdict::TermOrdinal;
 
 /// The fastfieldswriter regroups all of the fast field writers.
 pub struct FastFieldsWriter {
@@ -93,7 +91,7 @@ impl FastFieldsWriter {
     pub fn serialize(
         &self,
         serializer: &mut FastFieldSerializer,
-        mapping: &HashMap<Field, HashMap<UnorderedTermId, usize>>,
+        mapping: &HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>,
     ) -> io::Result<()> {
         for field_writer in &self.single_value_writers {
             field_writer.serialize(serializer)?;
@@ -17,7 +17,8 @@ impl FieldNormsWriter {
             .fields()
             .iter()
             .enumerate()
-            .filter(|&(_, field_entry)| field_entry.is_indexed())
+            .filter(|&(_, field_entry)|
+                field_entry.is_indexed())
             .map(|(field, _)| Field(field as u32))
             .collect::<Vec<Field>>()
     }
@@ -195,7 +195,6 @@ pub fn advance_deletes(
     target_opstamp: u64,
 ) -> Result<Option<FileProtection>> {
     let mut file_protect: Option<FileProtection> = None;
-
     {
         if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
             // We are already up-to-date here.
@@ -236,7 +235,6 @@ pub fn advance_deletes(
         }
     }
     segment_entry.set_meta(segment.meta().clone());
-
     Ok(file_protect)
 }

File diff suppressed because it is too large.
@@ -1,13 +1,8 @@
 use datastruct::stacker::{Heap, TermHashMap};
 use postings::Recorder;
-use postings::UnorderedTermId;
 use postings::{FieldSerializer, InvertedIndexSerializer};
 use postings::{NothingRecorder, TFAndPositionRecorder, TermFrequencyRecorder};
-use schema::FieldEntry;
-use schema::FieldType;
-use schema::IndexRecordOption;
-use schema::Term;
-use schema::{Field, Schema};
+use schema::{FieldEntry, FieldType, Term, Field, Schema};
 use std::collections::HashMap;
 use std::io;
 use std::marker::PhantomData;
@@ -16,6 +11,9 @@ use tokenizer::Token;
 use tokenizer::TokenStream;
 use DocId;
 use Result;
+use schema::IndexRecordOption;
+use postings::UnorderedTermId;
+use termdict::TermOrdinal;
 
 fn posting_from_field_entry<'a>(
     field_entry: &FieldEntry,
@@ -44,6 +42,7 @@ fn posting_from_field_entry<'a>(
 
 pub struct MultiFieldPostingsWriter<'a> {
     heap: &'a Heap,
+    schema: Schema,
     term_index: TermHashMap<'a>,
     per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
 }
@@ -58,8 +57,8 @@ impl<'a> MultiFieldPostingsWriter<'a> {
             .iter()
             .map(|field_entry| posting_from_field_entry(field_entry, heap))
             .collect();
 
         MultiFieldPostingsWriter {
+            schema: schema.clone(),
             heap,
             term_index,
             per_field_postings_writers,
@@ -83,7 +82,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
     pub fn serialize(
         &self,
         serializer: &mut InvertedIndexSerializer,
-    ) -> Result<HashMap<Field, HashMap<UnorderedTermId, usize>>> {
+    ) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
         let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
         term_offsets.sort_by_key(|&(k, _, _)| k);
@@ -94,7 +93,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
             .map(|(key, _, _)| Term::wrap(key).field())
             .enumerate();
 
-        let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, usize>> =
+        let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
             HashMap::new();
 
         let mut prev_field = Field(u32::max_value());
@@ -110,17 +109,23 @@ impl<'a> MultiFieldPostingsWriter<'a> {
             let (field, start) = offsets[i];
             let (_, stop) = offsets[i + 1];
 
-            // populating the unordered term ord -> ordered term ord mapping
-            // for the field.
-            let mut mapping = HashMap::new();
-            for (term_ord, term_unord_id) in term_offsets[start..stop]
-                .iter()
-                .map(|&(_, _, bucket)| bucket)
-                .enumerate()
-            {
-                mapping.insert(term_unord_id, term_ord);
-            }
+            let field_entry = self.schema.get_field_entry(field);
+
+            match field_entry.field_type() {
+                FieldType::Str(_) | FieldType::HierarchicalFacet => {
+                    // populating the (unordered term ord) -> (ordered term ord) mapping
+                    // for the field.
+                    let mut unordered_term_ids = term_offsets[start..stop]
+                        .iter()
+                        .map(|&(_, _, bucket)| bucket);
+                    let mut mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
+                        .enumerate()
+                        .map(|(term_ord, unord_term_id)| (unord_term_id as UnorderedTermId, term_ord as TermOrdinal))
+                        .collect();
+                    unordered_term_mappings.insert(field, mapping);
+                }
+                FieldType::U64(_) | FieldType::I64(_) => {}
+            }
-            unordered_term_mappings.insert(field, mapping);
 
             let postings_writer = &self.per_field_postings_writers[field.0 as usize];
             let mut field_serializer =
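
To make the intent of the new `match` block concrete: the ids coming out of the indexing hash map reflect insertion order, while the serialized dictionary needs each term expressed as its ordinal in byte-lexicographic order. A small standalone sketch of that mapping step, using plain tuples and `u64` ids instead of tantivy's `TermHashMap` and type aliases:

use std::collections::HashMap;

// (term bytes, unordered id) pairs as they might come out of the indexing hash map.
fn unordered_to_ordered(mut terms: Vec<(Vec<u8>, u64)>) -> HashMap<u64, u64> {
    // Sort by the term bytes: the position after sorting is the term ordinal.
    terms.sort_by(|a, b| a.0.cmp(&b.0));
    terms
        .into_iter()
        .enumerate()
        .map(|(term_ord, (_term, unordered_id))| (unordered_id, term_ord as u64))
        .collect()
}

fn main() {
    let terms = vec![
        (b"zebra".to_vec(), 0),
        (b"apple".to_vec(), 1),
        (b"mango".to_vec(), 2),
    ];
    let mapping = unordered_to_ordered(terms);
    assert_eq!(mapping[&1], 0); // "apple" is the first term in sorted order
    assert_eq!(mapping[&2], 1);
    assert_eq!(mapping[&0], 2);
}
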
@@ -1,19 +1,17 @@
-use Result;
-use DocId;
 use super::TermInfo;
 use common::BinarySerializable;
-use common::CompositeWrite;
-use common::CountingWriter;
 use compression::VIntEncoder;
 use compression::{BlockEncoder, COMPRESSION_BLOCK_SIZE};
 use core::Segment;
 use directory::WritePtr;
-use schema::Field;
-use schema::FieldEntry;
-use schema::FieldType;
+use schema::{Field, FieldEntry, FieldType};
 use schema::Schema;
 use std::io::{self, Write};
-use termdict::TermDictionaryBuilder;
+use DocId;
+use Result;
+use common::{CompositeWrite, CountingWriter};
+use termdict::{TermOrdinal, TermDictionaryBuilder};
 
 
 /// `PostingsSerializer` is in charge of serializing
 /// postings on disk, in the
@@ -119,6 +117,7 @@ pub struct FieldSerializer<'a> {
     positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
     current_term_info: TermInfo,
     term_open: bool,
+    num_terms: TermOrdinal,
 }
 
 impl<'a> FieldSerializer<'a> {
@@ -157,6 +156,7 @@ impl<'a> FieldSerializer<'a> {
             positions_serializer_opt,
             current_term_info: TermInfo::default(),
             term_open: false,
+            num_terms: TermOrdinal::default(),
         })
     }
 
@@ -177,7 +177,7 @@ impl<'a> FieldSerializer<'a> {
     /// * term - the term. It needs to come after the previous term according
     /// to the lexicographical order.
     /// * doc_freq - returns the number of documents containing the term.
-    pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
+    pub fn new_term(&mut self, term: &[u8]) -> io::Result<TermOrdinal> {
         assert!(
             !self.term_open,
             "Called new_term, while the previous term was not closed."
@@ -185,7 +185,10 @@ impl<'a> FieldSerializer<'a> {
         self.term_open = true;
         self.postings_serializer.clear();
         self.current_term_info = self.current_term_info();
-        self.term_dictionary_builder.insert_key(term)
+        self.term_dictionary_builder.insert_key(term)?;
+        let term_ordinal = self.num_terms;
+        self.num_terms += 1;
+        Ok(term_ordinal)
     }
 
     /// Serialize the information that a document contains the current term,
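
The new `num_terms` counter hands out dense ordinals in insertion order; because `new_term` must be called in lexicographic term order, the returned ordinal is also the term's rank in the finished dictionary. A toy sketch of that contract (the `ToyDictBuilder` type is invented for the example):

/// Toy dictionary builder: terms must be inserted in sorted order,
/// and each insertion returns the term's ordinal (its rank).
struct ToyDictBuilder {
    last_term: Option<Vec<u8>>,
    num_terms: u64,
}

impl ToyDictBuilder {
    fn new() -> Self {
        ToyDictBuilder { last_term: None, num_terms: 0 }
    }

    fn new_term(&mut self, term: &[u8]) -> u64 {
        if let Some(prev) = &self.last_term {
            assert!(prev.as_slice() < term, "terms must be inserted in lexicographic order");
        }
        self.last_term = Some(term.to_vec());
        let ordinal = self.num_terms;
        self.num_terms += 1;
        ordinal
    }
}

fn main() {
    let mut builder = ToyDictBuilder::new();
    assert_eq!(builder.new_term(b"apple"), 0);
    assert_eq!(builder.new_term(b"mango"), 1);
    assert_eq!(builder.new_term(b"zebra"), 2);
}
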
@@ -1,12 +1,10 @@
-use core::Searcher;
-use core::SegmentReader;
-use docset::DocSet;
-use query::Query;
-use query::Scorer;
-use query::Weight;
 use DocId;
 use Result;
 use Score;
+use core::Searcher;
+use core::SegmentReader;
+use docset::DocSet;
+use query::{Query, Weight, Scorer};
 
 /// Query that matches all of the documents.
 ///
@@ -26,28 +24,46 @@ pub struct AllWeight;
 impl Weight for AllWeight {
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
         Ok(Box::new(AllScorer {
-            started: false,
+            state: State::NotStarted,
             doc: 0u32,
-            max_doc: reader.max_doc(),
+            max_doc: reader.max_doc()
         }))
     }
 }
 
+enum State {
+    NotStarted,
+    Started,
+    Finished
+}
+
 /// Scorer associated to the `AllQuery` query.
 pub struct AllScorer {
-    started: bool,
+    state: State,
     doc: DocId,
-    max_doc: DocId,
+    max_doc: DocId
 }
 
 impl DocSet for AllScorer {
     fn advance(&mut self) -> bool {
-        if self.started {
-            self.doc += 1u32;
-        } else {
-            self.started = true;
+        match self.state {
+            State::NotStarted => {
+                self.state = State::Started;
+                self.doc = 0;
+            }
+            State::Started => {
+                self.doc += 1u32;
+            }
+            State::Finished => {
+                return false;
+            }
         }
-        self.doc < self.max_doc
+        if self.doc < self.max_doc {
+            return true;
+        } else {
+            self.state = State::Finished;
+            return false;
+        }
     }
 
     fn doc(&self) -> DocId {
@@ -64,3 +80,49 @@ impl Scorer for AllScorer {
         1f32
     }
 }
+
+#[cfg(test)]
+mod tests {
+
+    use super::AllQuery;
+    use Index;
+    use schema::{TEXT, SchemaBuilder};
+    use query::Query;
+
+    #[test]
+    fn test_all_query() {
+        let mut schema_builder = SchemaBuilder::default();
+        let field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index
+            .writer_with_num_threads(1, 10_000_000)
+            .unwrap();
+        index_writer.add_document(doc!(field=>"aaa"));
+        index_writer.add_document(doc!(field=>"bbb"));
+        index_writer.commit().unwrap();
+        index_writer.add_document(doc!(field=>"ccc"));
+        index_writer.commit().unwrap();
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
+        let weight = AllQuery.weight(&searcher, false).unwrap();
+        {
+            let reader = searcher.segment_reader(0);
+            let mut scorer = weight.scorer(reader).unwrap();
+            assert!(scorer.advance());
+            assert_eq!(scorer.doc(), 0u32);
+            assert!(scorer.advance());
+            assert_eq!(scorer.doc(), 1u32);
+            assert!(!scorer.advance());
+        }
+        {
+            let reader = searcher.segment_reader(1);
+            let mut scorer = weight.scorer(reader).unwrap();
+            assert!(scorer.advance());
+            assert_eq!(scorer.doc(), 0u32);
+            assert!(!scorer.advance());
+        }
+    }
+}
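
The rewrite of `advance` above replaces the `started` boolean with an explicit three-state cursor, so an exhausted scorer parks itself in `Finished` and later calls return `false` without touching `doc` again. A standalone sketch of the same pattern over a plain doc-id range, not wired to tantivy's `DocSet` trait:

enum State {
    NotStarted,
    Started,
    Finished,
}

/// Cursor over the doc ids `0..max_doc`, mirroring the state machine above.
struct AllCursor {
    state: State,
    doc: u32,
    max_doc: u32,
}

impl AllCursor {
    fn advance(&mut self) -> bool {
        match self.state {
            State::NotStarted => {
                self.state = State::Started;
                self.doc = 0;
            }
            State::Started => {
                self.doc += 1;
            }
            State::Finished => {
                return false;
            }
        }
        if self.doc < self.max_doc {
            true
        } else {
            self.state = State::Finished;
            false
        }
    }
}

fn main() {
    // An empty segment is exhausted on the very first call.
    let mut empty = AllCursor { state: State::NotStarted, doc: 0, max_doc: 0 };
    assert!(!empty.advance());
    assert!(!empty.advance()); // stays exhausted, doc is never incremented again

    let mut cursor = AllCursor { state: State::NotStarted, doc: 0, max_doc: 2 };
    assert!(cursor.advance());
    assert_eq!(cursor.doc, 0);
    assert!(cursor.advance());
    assert_eq!(cursor.doc, 1);
    assert!(!cursor.advance());
}
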
@@ -212,6 +212,14 @@ mod tests {
         assert!(Facet::root().is_root());
     }
 
+    #[test]
+    fn test_from_path() {
+        assert_eq!(
+            Facet::from_path(vec!["top", "a", "firstdoc"]),
+            Facet::from("/top/a/firstdoc")
+        );
+    }
+
     #[test]
     fn test_facet_display() {
         {
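
The new `test_from_path` test pins down the equivalence between building a facet from path segments and parsing it from a `/`-separated string. A toy illustration of that invariant (the real `Facet` type keeps its own internal encoding, which this string-only version does not model):

fn facet_from_path(segments: &[&str]) -> String {
    let mut out = String::new();
    for segment in segments {
        out.push('/');
        out.push_str(segment);
    }
    out
}

fn main() {
    assert_eq!(facet_from_path(&["top", "a", "firstdoc"]), "/top/a/firstdoc");
}
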
@@ -4,6 +4,7 @@ use super::Field;
 use byteorder::{BigEndian, ByteOrder};
 use common;
 use std::str;
+use schema::Facet;
 
 /// Size (in bytes) of the buffer of a int field.
 const INT_TERM_LEN: usize = 4 + 8;
@@ -29,6 +30,16 @@ impl Term {
         Term::from_field_u64(field, val_u64)
     }
 
+    /// Creates a `Term` given a facet.
+    pub fn from_facet(field: Field, facet: &Facet) -> Term {
+        let bytes = facet.encoded_bytes();
+        let buffer = Vec::with_capacity(4 + bytes.len());
+        let mut term = Term(buffer);
+        term.set_field(field);
+        term.set_bytes(bytes);
+        term
+    }
+
     /// Builds a term given a field, and a string value
     ///
     /// Assuming the term has a field id of 2, and a text value of "abc",
@@ -91,10 +102,14 @@ impl Term {
         self.set_u64(common::i64_to_u64(val));
     }
 
+    fn set_bytes(&mut self, bytes: &[u8]) {
+        self.0.resize(4, 0u8);
+        self.0.extend(bytes);
+    }
+
     /// Set the texts only, keeping the field untouched.
     pub fn set_text(&mut self, text: &str) {
-        self.0.resize(4, 0u8);
-        self.0.extend(text.as_bytes());
+        self.set_bytes(text.as_bytes());
    }
 }
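
A standalone sketch of the buffer layout implied by `from_facet` and `set_bytes` above: the first 4 bytes of a term hold the field id and the remainder holds the value bytes, so `set_bytes` truncates back to the field prefix before appending. The big-endian field id below is an assumption suggested by the `byteorder::BigEndian` import, and `ToyTerm` is an invented name:

/// Toy version of the term buffer layout:
/// 4 bytes of field id, followed by the raw value bytes.
struct ToyTerm(Vec<u8>);

impl ToyTerm {
    fn with_field(field_id: u32) -> ToyTerm {
        let mut buffer = vec![0u8; 4];
        buffer[..4].copy_from_slice(&field_id.to_be_bytes());
        ToyTerm(buffer)
    }

    /// Replace the value bytes, keeping the 4-byte field prefix untouched.
    fn set_bytes(&mut self, bytes: &[u8]) {
        self.0.resize(4, 0u8);
        self.0.extend_from_slice(bytes);
    }
}

fn main() {
    let mut term = ToyTerm::with_field(2);
    term.set_bytes(b"abc");
    assert_eq!(term.0, vec![0, 0, 0, 2, b'a', b'b', b'c']);
    // Re-setting the value keeps the field prefix.
    term.set_bytes(b"z");
    assert_eq!(term.0, vec![0, 0, 0, 2, b'z']);
}
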
@@ -1,7 +1,8 @@
-use schema::Term;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use termdict::TermStreamer;
+use termdict::TermOrdinal;
+use schema::Term;
 
 pub struct HeapItem<'a> {
     pub streamer: TermStreamer<'a>,
@@ -28,6 +29,7 @@ impl<'a> Ord for HeapItem<'a> {
     }
 }
 
+
 /// Given a list of sorted term streams,
 /// returns an iterator over sorted unique terms.
 ///
@@ -43,7 +45,6 @@ pub struct TermMerger<'a> {
 impl<'a> TermMerger<'a> {
     /// Stream of merged term dictionary
     ///
-    ///
     pub fn new(streams: Vec<TermStreamer<'a>>) -> TermMerger<'a> {
         TermMerger {
             heap: BinaryHeap::new(),
@@ -58,6 +59,14 @@ impl<'a> TermMerger<'a> {
         }
     }
 
+    pub(crate) fn matching_segments<'b: 'a>(&'b self) -> Box<'b + Iterator<Item=(usize, TermOrdinal)>> {
+        Box::new(self.current_streamers
+            .iter()
+            .map(|heap_item| {
+                (heap_item.segment_ord, heap_item.streamer.term_ord())
+            }))
+    }
+
     fn advance_segments(&mut self) {
         let streamers = &mut self.current_streamers;
         let heap = &mut self.heap;
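
`matching_segments` reports, for the term the merger is currently positioned on, each (segment ordinal, term ordinal) pair that contains it. A condensed standalone sketch of the same idea over in-memory sorted term lists, using a binary search per segment instead of tantivy's heap-based streamers; all names are hypothetical:

/// For a target term, return (segment_ord, term_ord) for every sorted
/// term list that contains it. term_ord is the term's index in that list.
fn matching_segments(segments: &[Vec<&str>], term: &str) -> Vec<(usize, u64)> {
    segments
        .iter()
        .enumerate()
        .filter_map(|(segment_ord, terms)| {
            terms
                .binary_search(&term)
                .ok()
                .map(|term_ord| (segment_ord, term_ord as u64))
        })
        .collect()
}

fn main() {
    let seg0 = vec!["apple", "mango", "zebra"];
    let seg1 = vec!["mango", "pear"];
    assert_eq!(matching_segments(&[seg0, seg1], "mango"), vec![(0, 1), (1, 0)]);
}
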
@@ -59,6 +59,7 @@ pub use self::merger::TermMerger;
 pub use self::streamer::{TermStreamer, TermStreamerBuilder};
 pub use self::termdict::{TermDictionary, TermDictionaryBuilder};
 
+
 #[cfg(test)]
 mod tests {
     use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};
@@ -6,21 +6,21 @@ use tokenizer::TokenStreamChain;
 /// Token
 #[derive(Debug, Clone)]
 pub struct Token {
     /// Offset (byte index) of the first character of the token.
     /// Offsets shall not be modified by token filters.
     pub offset_from: usize,
     /// Offset (byte index) of the last character of the token + 1.
     /// The text that generated the token should be obtained by
     /// &text[token.offset_from..token.offset_to]
     pub offset_to: usize,
     /// Position, expressed in number of tokens.
     pub position: usize,
     /// Actual text content of the token.
     pub text: String,
 }
 
 impl Default for Token {
     fn default() -> Token {
         Token {
             offset_from: 0,
             offset_to: 0,