Several TermDict operations now return an io::Result

This commit is contained in:
Paul Masurel
2020-12-03 12:46:52 +09:00
parent e9e2984ac2
commit 2f342257d3
14 changed files with 65 additions and 122 deletions

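In short: `TermDictionary::get`, `term_ord`, `stream`, `TermStreamerBuilder::into_stream` and `InvertedIndexReader::get_term_info` now return `io::Result`, so callers add a `?` (or an `unwrap()`) where they previously got the value directly. A minimal sketch of adapted caller code, with the schema and document contents being illustrative rather than taken from the commit:

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, Term};

fn doc_freq_example() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(title => "of mice and men"));
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    let inverted_index = searcher.segment_reader(0).inverted_index(title)?;
    let term = Term::from_field_text(title, "mice");

    // Before this commit, get_term_info returned Option<TermInfo>;
    // it now returns io::Result<Option<TermInfo>>, hence the extra `?`.
    let doc_freq = inverted_index
        .get_term_info(&term)?
        .map(|term_info| term_info.doc_freq)
        .unwrap_or(0u32);
    assert_eq!(doc_freq, 1);
    Ok(())
}
```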
View File

@@ -61,7 +61,7 @@ fn main() -> tantivy::Result<()> {
let query_ords: HashSet<u64> = facets
.iter()
.filter_map(|key| facet_dict.term_ord(key.encoded_str()))
.filter_map(|key| facet_dict.term_ord(key.encoded_str()).unwrap())
.collect();
let mut facet_ords_buffer: Vec<u64> = Vec::with_capacity(20);

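The example keeps `filter_map` and unwraps the new `io::Result` inside the closure, since the closure must return an `Option`. A hypothetical variant that propagates the error instead could look like this (the helper name and signature are illustrative):

```rust
use std::collections::HashSet;
use std::io;
use tantivy::termdict::TermDictionary;

// Hypothetical helper mirroring the example's lookup, but bubbling up the
// io::Error with `?` instead of unwrapping inside `filter_map`.
fn facet_ords(facet_dict: &TermDictionary, keys: &[&[u8]]) -> io::Result<HashSet<u64>> {
    let mut ords = HashSet::new();
    for key in keys {
        // term_ord now returns io::Result<Option<TermOrdinal>>.
        if let Some(ord) = facet_dict.term_ord(key)? {
            ords.insert(ord);
        }
    }
    Ok(ords)
}
```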
View File

@@ -274,7 +274,7 @@ impl Collector for FacetCollector {
let mut collapse_facet_it = self.facets.iter().peekable();
collapse_facet_ords.push(0);
{
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
let mut facet_streamer = facet_reader.facet_dict().range().into_stream()?;
if facet_streamer.advance() {
'outer: loop {
// at the begining of this loop, facet_streamer

View File

@@ -66,7 +66,7 @@ impl InvertedIndexReader {
}
/// Returns the term info associated with the term.
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
pub fn get_term_info(&self, term: &Term) -> io::Result<Option<TermInfo>> {
self.termdict.get(term.value_bytes())
}
@@ -106,10 +106,9 @@ impl InvertedIndexReader {
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<BlockSegmentPostings>> {
Ok(self
.get_term_info(term)
self.get_term_info(term)?
.map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option))
.transpose()?)
.transpose()
}
/// Returns a block postings given a `term_info`.
@@ -181,7 +180,7 @@ impl InvertedIndexReader {
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<SegmentPostings>> {
self.get_term_info(term)
self.get_term_info(term)?
.map(move |term_info| self.read_postings_from_terminfo(&term_info, option))
.transpose()
}
@@ -191,7 +190,7 @@ impl InvertedIndexReader {
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<SegmentPostings>> {
self.get_term_info(term)
self.get_term_info(term)?
.map(|term_info| self.read_postings_from_terminfo(&term_info, option))
.transpose()
}
@@ -199,7 +198,7 @@ impl InvertedIndexReader {
/// Returns the number of documents containing the term.
pub fn doc_freq(&self, term: &Term) -> io::Result<u32> {
Ok(self
.get_term_info(term)
.get_term_info(term)?
.map(|term_info| term_info.doc_freq)
.unwrap_or(0u32))
}

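The hunks above rely on `Option::transpose` to turn `Option<io::Result<T>>` back into `io::Result<Option<T>>` once the inner lookup became fallible. A minimal, tantivy-free sketch of that pattern:

```rust
use std::io;

// Mapping a fallible operation over an Option yields Option<io::Result<T>>;
// `.transpose()` flips it into io::Result<Option<T>> so `?` keeps working.
// The body of the closure stands in for read_postings_from_terminfo.
fn maybe_read(input: Option<&str>) -> io::Result<Option<usize>> {
    input
        .map(|s| -> io::Result<usize> { Ok(s.len()) })
        .transpose()
}
```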
View File

@@ -12,7 +12,7 @@ pub use self::executor::Executor;
pub use self::index::Index;
pub use self::index_meta::{IndexMeta, SegmentMeta, SegmentMetaInventory};
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::{FieldSearcher, Searcher};
pub use self::searcher::Searcher;
pub use self::segment::Segment;
pub use self::segment::SerializableSegment;
pub use self::segment_component::SegmentComponent;

View File

@@ -1,17 +1,16 @@
use crate::collector::Collector;
use crate::core::Executor;
use crate::core::InvertedIndexReader;
use crate::core::SegmentReader;
use crate::query::Query;
use crate::schema::Document;
use crate::schema::Schema;
use crate::schema::{Field, Term};
use crate::schema::Term;
use crate::space_usage::SearcherSpaceUsage;
use crate::store::StoreReader;
use crate::termdict::TermMerger;
use crate::DocAddress;
use crate::Index;
use std::sync::Arc;
use std::{fmt, io};
/// Holds a list of `SegmentReader`s ready for search.
@@ -148,16 +147,6 @@ impl Searcher {
collector.merge_fruits(fruits)
}
/// Return the field searcher associated to a `Field`.
pub fn field(&self, field: Field) -> crate::Result<FieldSearcher> {
let inv_index_readers: Vec<Arc<InvertedIndexReader>> = self
.segment_readers
.iter()
.map(|segment_reader| segment_reader.inverted_index(field))
.collect::<crate::Result<Vec<_>>>()?;
Ok(FieldSearcher::new(inv_index_readers))
}
/// Summarize total space usage of this searcher.
pub fn space_usage(&self) -> io::Result<SearcherSpaceUsage> {
let mut space_usage = SearcherSpaceUsage::new();
@@ -168,32 +157,6 @@ impl Searcher {
}
}
/// **Experimental API** `FieldSearcher` only gives access to a stream over the terms of a field.
pub struct FieldSearcher {
inv_index_readers: Vec<Arc<InvertedIndexReader>>,
}
impl FieldSearcher {
fn new(inv_index_readers: Vec<Arc<InvertedIndexReader>>) -> FieldSearcher {
FieldSearcher { inv_index_readers }
}
/// Returns a Stream over all of the sorted unique terms of
/// for the given field.
///
/// This method does not take into account which documents are deleted, so
/// in presence of deletes some terms may not actually exist in any document
/// anymore.
pub fn terms(&self) -> TermMerger {
let term_streamers: Vec<_> = self
.inv_index_readers
.iter()
.map(|inverted_index| inverted_index.terms().stream())
.collect();
TermMerger::new(term_streamers)
}
}
impl fmt::Debug for Searcher {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let segment_ids = self

View File

@@ -514,7 +514,7 @@ impl IndexMerger {
for field_reader in &field_readers {
let terms = field_reader.terms();
field_term_streams.push(terms.stream());
field_term_streams.push(terms.stream()?);
max_term_ords.push(terms.num_terms() as u64);
}

View File

@@ -160,7 +160,7 @@ pub use self::docset::{DocSet, TERMINATED};
pub use crate::common::HasLen;
pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
pub use crate::core::{Executor, SegmentComponent};
pub use crate::core::{FieldSearcher, Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
pub use crate::core::{InvertedIndexReader, SegmentReader};
pub use crate::directory::Directory;
pub use crate::indexer::operation::UserOperation;

View File

@@ -469,7 +469,7 @@ mod tests {
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
let term = Term::from_field_u64(int_field, 0u64);
let term_info = inverted_index.get_term_info(&term).unwrap();
let term_info = inverted_index.get_term_info(&term).unwrap().unwrap();
inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
.unwrap()
@@ -513,7 +513,7 @@ mod tests {
{
let term = Term::from_field_u64(int_field, 0u64);
let inverted_index = segment_reader.inverted_index(int_field)?;
let term_info = inverted_index.get_term_info(&term).unwrap();
let term_info = inverted_index.get_term_info(&term)?.unwrap();
block_segments = inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?;
}
@@ -521,7 +521,7 @@ mod tests {
{
let term = Term::from_field_u64(int_field, 1u64);
let inverted_index = segment_reader.inverted_index(int_field)?;
let term_info = inverted_index.get_term_info(&term).unwrap();
let term_info = inverted_index.get_term_info(&term)?.unwrap();
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments)?;
}
assert_eq!(block_segments.docs(), &[1, 3, 5]);

View File

@@ -7,6 +7,7 @@ use crate::schema::{Field, IndexRecordOption};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::TantivyError;
use crate::{DocId, Score};
use std::io;
use std::sync::Arc;
use tantivy_fst::Automaton;
@@ -28,7 +29,10 @@ where
}
}
fn automaton_stream<'a>(&'a self, term_dict: &'a TermDictionary) -> TermStreamer<'a, &'a A> {
fn automaton_stream<'a>(
&'a self,
term_dict: &'a TermDictionary,
) -> io::Result<TermStreamer<'a, &'a A>> {
let automaton: &A = &*self.automaton;
let term_stream_builder = term_dict.search(automaton);
term_stream_builder.into_stream()
@@ -44,7 +48,7 @@ where
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field)?;
let term_dict = inverted_index.terms();
let mut term_stream = self.automaton_stream(term_dict);
let mut term_stream = self.automaton_stream(term_dict)?;
while term_stream.advance() {
let term_info = term_stream.value();
let mut block_segment_postings = inverted_index

View File

@@ -11,6 +11,7 @@ use crate::schema::{Field, IndexRecordOption, Term};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::{DocId, Score};
use std::collections::Bound;
use std::io;
use std::ops::Range;
fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
@@ -274,7 +275,7 @@ pub struct RangeWeight {
}
impl RangeWeight {
fn term_range<'a>(&self, term_dict: &'a TermDictionary) -> TermStreamer<'a> {
fn term_range<'a>(&self, term_dict: &'a TermDictionary) -> io::Result<TermStreamer<'a>> {
use std::collections::Bound::*;
let mut term_stream_builder = term_dict.range();
term_stream_builder = match self.left_bound {
@@ -298,7 +299,7 @@ impl Weight for RangeWeight {
let inverted_index = reader.inverted_index(self.field)?;
let term_dict = inverted_index.terms();
let mut term_range = self.term_range(term_dict);
let mut term_range = self.term_range(term_dict)?;
while term_range.advance() {
let term_info = term_range.value();
let mut block_segment_postings = inverted_index

View File

@@ -45,7 +45,7 @@ impl Weight for TermWeight {
} else {
let field = self.term.field();
let inv_index = reader.inverted_index(field)?;
let term_info = inv_index.get_term_info(&self.term);
let term_info = inv_index.get_term_info(&self.term)?;
Ok(term_info.map(|term_info| term_info.doc_freq).unwrap_or(0))
}
}

View File

@@ -1,3 +1,5 @@
use std::io;
use super::TermDictionary;
use crate::postings::TermInfo;
use crate::termdict::TermOrdinal;
@@ -59,14 +61,14 @@ where
/// Creates the stream corresponding to the range
/// of terms defined using the `TermStreamerBuilder`.
pub fn into_stream(self) -> TermStreamer<'a, A> {
TermStreamer {
pub fn into_stream(self) -> io::Result<TermStreamer<'a, A>> {
Ok(TermStreamer {
fst_map: self.fst_map,
stream: self.stream_builder.into_stream(),
term_ord: 0u64,
current_key: Vec::with_capacity(100),
current_value: TermInfo::default(),
}
})
}
}

View File

@@ -139,8 +139,8 @@ impl TermDictionary {
}
/// Returns the ordinal associated to a given term.
pub fn term_ord<K: AsRef<[u8]>>(&self, key: K) -> Option<TermOrdinal> {
self.fst_index.get(key)
pub fn term_ord<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermOrdinal>> {
Ok(self.fst_index.get(key))
}
/// Returns the term associated to a given term ordinal.
@@ -179,9 +179,10 @@ impl TermDictionary {
}
/// Lookups the value corresponding to the key.
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<TermInfo> {
self.term_ord(key)
.map(|term_ord| self.term_info_from_ord(term_ord))
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermInfo>> {
Ok(self
.term_ord(key)?
.map(|term_ord| self.term_info_from_ord(term_ord)))
}
/// Returns a range builder, to stream all of the terms
@@ -191,7 +192,7 @@ impl TermDictionary {
}
/// A stream of all the sorted terms. [See also `.stream_field()`](#method.stream_field)
pub fn stream(&self) -> TermStreamer<'_> {
pub fn stream(&self) -> io::Result<TermStreamer<'_>> {
self.range().into_stream()
}
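With `stream()` and `into_stream()` now fallible, iterating the dictionary starts with a `?`. A minimal sketch against an already-opened `TermDictionary` (the prefix-counting logic itself is illustrative):

```rust
use std::io;
use tantivy::termdict::TermDictionary;

// Count the terms starting with `prefix`. Creating the streamer is the only
// fallible step; advance()/key() behave as before.
fn count_terms_with_prefix(term_dict: &TermDictionary, prefix: &[u8]) -> io::Result<usize> {
    let mut stream = term_dict.range().ge(prefix).into_stream()?;
    let mut count = 0;
    while stream.advance() {
        if !stream.key().starts_with(prefix) {
            break;
        }
        count += 1;
    }
    Ok(count)
}
```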

View File

@@ -1,8 +1,8 @@
use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};
use crate::core::Index;
use crate::directory::{Directory, FileSlice, RAMDirectory};
use crate::postings::TermInfo;
use crate::schema::{Schema, TEXT};
use std::path::PathBuf;
use std::str;
@@ -21,7 +21,7 @@ fn make_term_info(term_ord: u64) -> TermInfo {
#[test]
fn test_empty_term_dictionary() {
let empty = TermDictionary::empty();
assert!(empty.stream().next().is_none());
assert!(empty.stream().unwrap().next().is_none());
}
#[test]
@@ -48,7 +48,7 @@ fn test_term_ordinals() -> crate::Result<()> {
let term_file = directory.open_read(&path)?;
let term_dict: TermDictionary = TermDictionary::open(term_file)?;
for (term_ord, term) in COUNTRIES.iter().enumerate() {
assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64);
assert_eq!(term_dict.term_ord(term)?, Some(term_ord as u64));
let mut bytes = vec![];
assert!(term_dict.ord_to_term(term_ord as u64, &mut bytes));
assert_eq!(bytes, term.as_bytes());
@@ -69,9 +69,9 @@ fn test_term_dictionary_simple() -> crate::Result<()> {
}
let file = directory.open_read(&path)?;
let term_dict: TermDictionary = TermDictionary::open(file)?;
assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32);
assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32);
let mut stream = term_dict.stream();
assert_eq!(term_dict.get("abc")?.unwrap().doc_freq, 34u32);
assert_eq!(term_dict.get("abcd")?.unwrap().doc_freq, 346u32);
let mut stream = term_dict.stream()?;
{
{
let (k, v) = stream.next().unwrap();
@@ -94,33 +94,6 @@ fn test_term_dictionary_simple() -> crate::Result<()> {
Ok(())
}
#[test]
fn test_term_iterator() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b d f"));
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a b c d f"));
index_writer.commit()?;
index_writer.add_document(doc!(text_field => "e f"));
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let field_searcher = searcher.field(text_field)?;
let mut term_it = field_searcher.terms();
let mut term_string = String::new();
while term_it.advance() {
//let term = Term::from_bytes(term_it.key());
term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
}
assert_eq!(&*term_string, "abcdef");
Ok(())
}
#[test]
fn test_term_dictionary_stream() -> crate::Result<()> {
let ids: Vec<_> = (0u32..10_000u32)
@@ -138,7 +111,7 @@ fn test_term_dictionary_stream() -> crate::Result<()> {
let term_file = FileSlice::from(buffer);
let term_dictionary: TermDictionary = TermDictionary::open(term_file)?;
{
let mut streamer = term_dictionary.stream();
let mut streamer = term_dictionary.stream()?;
let mut i = 0;
while let Some((streamer_k, streamer_v)) = streamer.next() {
let &(ref key, ref v) = &ids[i];
@@ -150,7 +123,7 @@ fn test_term_dictionary_stream() -> crate::Result<()> {
let &(ref key, ref val) = &ids[2047];
assert_eq!(
term_dictionary.get(key.as_bytes()),
term_dictionary.get(key.as_bytes())?,
Some(make_term_info(*val as u64))
);
Ok(())
@@ -168,7 +141,7 @@ fn test_stream_high_range_prefix_suffix() -> crate::Result<()> {
};
let term_dict_file = FileSlice::from(buffer);
let term_dictionary: TermDictionary = TermDictionary::open(term_dict_file)?;
let mut kv_stream = term_dictionary.stream();
let mut kv_stream = term_dictionary.stream()?;
assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
assert_eq!(kv_stream.value(), &make_term_info(1));
@@ -206,7 +179,7 @@ fn test_stream_range() -> crate::Result<()> {
let mut streamer = term_dictionary
.range()
.ge(target_key.as_bytes())
.into_stream();
.into_stream()?;
for j in 0..3 {
let (streamer_k, streamer_v) = streamer.next().unwrap();
let &(ref key, ref v) = &ids[i + j];
@@ -223,7 +196,7 @@ fn test_stream_range() -> crate::Result<()> {
let mut streamer = term_dictionary
.range()
.gt(target_key.as_bytes())
.into_stream();
.into_stream()?;
for j in 0..3 {
let (streamer_k, streamer_v) = streamer.next().unwrap();
let &(ref key, ref v) = &ids[i + j + 1];
@@ -242,7 +215,7 @@ fn test_stream_range() -> crate::Result<()> {
.range()
.ge(fst_key.as_bytes())
.lt(last_key.as_bytes())
.into_stream();
.into_stream()?;
for _ in 0..j {
assert!(streamer.next().is_some());
}
@@ -267,7 +240,7 @@ fn test_empty_string() -> crate::Result<()> {
};
let file = FileSlice::from(buffer);
let term_dictionary: TermDictionary = TermDictionary::open(file)?;
let mut stream = term_dictionary.stream();
let mut stream = term_dictionary.stream()?;
assert!(stream.advance());
assert!(stream.key().is_empty());
assert!(stream.advance());
@@ -300,70 +273,70 @@ fn test_stream_range_boundaries() -> crate::Result<()> {
res
};
{
let range = term_dictionary.range().backward().into_stream();
let range = term_dictionary.range().backward().into_stream()?;
assert_eq!(
value_list(range, true),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().ge([2u8]).into_stream();
let range = term_dictionary.range().ge([2u8]).into_stream()?;
assert_eq!(
value_list(range, false),
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().ge([2u8]).backward().into_stream();
let range = term_dictionary.range().ge([2u8]).backward().into_stream()?;
assert_eq!(
value_list(range, true),
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().gt([2u8]).into_stream();
let range = term_dictionary.range().gt([2u8]).into_stream()?;
assert_eq!(
value_list(range, false),
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().gt([2u8]).backward().into_stream();
let range = term_dictionary.range().gt([2u8]).backward().into_stream()?;
assert_eq!(
value_list(range, true),
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().lt([6u8]).into_stream();
let range = term_dictionary.range().lt([6u8]).into_stream()?;
assert_eq!(
value_list(range, false),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
);
}
{
let range = term_dictionary.range().lt([6u8]).backward().into_stream();
let range = term_dictionary.range().lt([6u8]).backward().into_stream()?;
assert_eq!(
value_list(range, true),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
);
}
{
let range = term_dictionary.range().le([6u8]).into_stream();
let range = term_dictionary.range().le([6u8]).into_stream()?;
assert_eq!(
value_list(range, false),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
);
}
{
let range = term_dictionary.range().le([6u8]).backward().into_stream();
let range = term_dictionary.range().le([6u8]).backward().into_stream()?;
assert_eq!(
value_list(range, true),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
);
}
{
let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream();
let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream()?;
assert_eq!(value_list(range, false), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
}
{
@@ -372,7 +345,7 @@ fn test_stream_range_boundaries() -> crate::Result<()> {
.ge([0u8])
.lt([5u8])
.backward()
.into_stream();
.into_stream()?;
assert_eq!(value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
}
Ok(())
@@ -410,7 +383,7 @@ fn test_automaton_search() -> crate::Result<()> {
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true);
let automaton = DFAWrapper(lev_automaton_builder.build_dfa("Spaen"));
let mut range = term_dict.search(automaton).into_stream();
let mut range = term_dict.search(automaton).into_stream()?;
// get the first finding
assert!(range.advance());