From 2f342257d3fcba1eff5b8c0b5b1844ad01bcf191 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 3 Dec 2020 12:46:52 +0900 Subject: [PATCH] Several TermDict operation now returns an io::Result --- examples/faceted_search_with_tweaked_score.rs | 2 +- src/collector/facet_collector.rs | 2 +- src/core/inverted_index_reader.rs | 13 ++- src/core/mod.rs | 2 +- src/core/searcher.rs | 43 +--------- src/indexer/merger.rs | 2 +- src/lib.rs | 2 +- src/postings/block_segment_postings.rs | 6 +- src/query/automaton_weight.rs | 8 +- src/query/range_query.rs | 5 +- src/query/term_query/term_weight.rs | 2 +- src/termdict/fst_termdict/streamer.rs | 8 +- src/termdict/fst_termdict/termdict.rs | 13 +-- src/termdict/tests.rs | 79 ++++++------------- 14 files changed, 65 insertions(+), 122 deletions(-) diff --git a/examples/faceted_search_with_tweaked_score.rs b/examples/faceted_search_with_tweaked_score.rs index 57331f822..bb9ad002b 100644 --- a/examples/faceted_search_with_tweaked_score.rs +++ b/examples/faceted_search_with_tweaked_score.rs @@ -61,7 +61,7 @@ fn main() -> tantivy::Result<()> { let query_ords: HashSet = facets .iter() - .filter_map(|key| facet_dict.term_ord(key.encoded_str())) + .filter_map(|key| facet_dict.term_ord(key.encoded_str()).unwrap()) .collect(); let mut facet_ords_buffer: Vec = Vec::with_capacity(20); diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 9e86472cb..6d91ef5d2 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -274,7 +274,7 @@ impl Collector for FacetCollector { let mut collapse_facet_it = self.facets.iter().peekable(); collapse_facet_ords.push(0); { - let mut facet_streamer = facet_reader.facet_dict().range().into_stream(); + let mut facet_streamer = facet_reader.facet_dict().range().into_stream()?; if facet_streamer.advance() { 'outer: loop { // at the begining of this loop, facet_streamer diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index e1d76edd7..2f4edf76d 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -66,7 +66,7 @@ impl InvertedIndexReader { } /// Returns the term info associated with the term. - pub fn get_term_info(&self, term: &Term) -> Option { + pub fn get_term_info(&self, term: &Term) -> io::Result> { self.termdict.get(term.value_bytes()) } @@ -106,10 +106,9 @@ impl InvertedIndexReader { term: &Term, option: IndexRecordOption, ) -> io::Result> { - Ok(self - .get_term_info(term) + self.get_term_info(term)? .map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option)) - .transpose()?) + .transpose() } /// Returns a block postings given a `term_info`. @@ -181,7 +180,7 @@ impl InvertedIndexReader { term: &Term, option: IndexRecordOption, ) -> io::Result> { - self.get_term_info(term) + self.get_term_info(term)? .map(move |term_info| self.read_postings_from_terminfo(&term_info, option)) .transpose() } @@ -191,7 +190,7 @@ impl InvertedIndexReader { term: &Term, option: IndexRecordOption, ) -> io::Result> { - self.get_term_info(term) + self.get_term_info(term)? .map(|term_info| self.read_postings_from_terminfo(&term_info, option)) .transpose() } @@ -199,7 +198,7 @@ impl InvertedIndexReader { /// Returns the number of documents containing the term. pub fn doc_freq(&self, term: &Term) -> io::Result { Ok(self - .get_term_info(term) + .get_term_info(term)? .map(|term_info| term_info.doc_freq) .unwrap_or(0u32)) } diff --git a/src/core/mod.rs b/src/core/mod.rs index d94112b32..e0fe08e6c 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -12,7 +12,7 @@ pub use self::executor::Executor; pub use self::index::Index; pub use self::index_meta::{IndexMeta, SegmentMeta, SegmentMetaInventory}; pub use self::inverted_index_reader::InvertedIndexReader; -pub use self::searcher::{FieldSearcher, Searcher}; +pub use self::searcher::Searcher; pub use self::segment::Segment; pub use self::segment::SerializableSegment; pub use self::segment_component::SegmentComponent; diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 8cfed0b4f..7123cfcf4 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -1,17 +1,16 @@ use crate::collector::Collector; use crate::core::Executor; -use crate::core::InvertedIndexReader; + use crate::core::SegmentReader; use crate::query::Query; use crate::schema::Document; use crate::schema::Schema; -use crate::schema::{Field, Term}; +use crate::schema::Term; use crate::space_usage::SearcherSpaceUsage; use crate::store::StoreReader; -use crate::termdict::TermMerger; use crate::DocAddress; use crate::Index; -use std::sync::Arc; + use std::{fmt, io}; /// Holds a list of `SegmentReader`s ready for search. @@ -148,16 +147,6 @@ impl Searcher { collector.merge_fruits(fruits) } - /// Return the field searcher associated to a `Field`. - pub fn field(&self, field: Field) -> crate::Result { - let inv_index_readers: Vec> = self - .segment_readers - .iter() - .map(|segment_reader| segment_reader.inverted_index(field)) - .collect::>>()?; - Ok(FieldSearcher::new(inv_index_readers)) - } - /// Summarize total space usage of this searcher. pub fn space_usage(&self) -> io::Result { let mut space_usage = SearcherSpaceUsage::new(); @@ -168,32 +157,6 @@ impl Searcher { } } -/// **Experimental API** `FieldSearcher` only gives access to a stream over the terms of a field. -pub struct FieldSearcher { - inv_index_readers: Vec>, -} - -impl FieldSearcher { - fn new(inv_index_readers: Vec>) -> FieldSearcher { - FieldSearcher { inv_index_readers } - } - - /// Returns a Stream over all of the sorted unique terms of - /// for the given field. - /// - /// This method does not take into account which documents are deleted, so - /// in presence of deletes some terms may not actually exist in any document - /// anymore. - pub fn terms(&self) -> TermMerger { - let term_streamers: Vec<_> = self - .inv_index_readers - .iter() - .map(|inverted_index| inverted_index.terms().stream()) - .collect(); - TermMerger::new(term_streamers) - } -} - impl fmt::Debug for Searcher { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let segment_ids = self diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 6ad3f61e7..a21d924df 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -514,7 +514,7 @@ impl IndexMerger { for field_reader in &field_readers { let terms = field_reader.terms(); - field_term_streams.push(terms.stream()); + field_term_streams.push(terms.stream()?); max_term_ords.push(terms.num_terms() as u64); } diff --git a/src/lib.rs b/src/lib.rs index fa14c9095..f66b54712 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -160,7 +160,7 @@ pub use self::docset::{DocSet, TERMINATED}; pub use crate::common::HasLen; pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; pub use crate::core::{Executor, SegmentComponent}; -pub use crate::core::{FieldSearcher, Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta}; +pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta}; pub use crate::core::{InvertedIndexReader, SegmentReader}; pub use crate::directory::Directory; pub use crate::indexer::operation::UserOperation; diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index 849453af5..9030d8a57 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -469,7 +469,7 @@ mod tests { let segment_reader = searcher.segment_reader(0); let inverted_index = segment_reader.inverted_index(int_field).unwrap(); let term = Term::from_field_u64(int_field, 0u64); - let term_info = inverted_index.get_term_info(&term).unwrap(); + let term_info = inverted_index.get_term_info(&term).unwrap().unwrap(); inverted_index .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic) .unwrap() @@ -513,7 +513,7 @@ mod tests { { let term = Term::from_field_u64(int_field, 0u64); let inverted_index = segment_reader.inverted_index(int_field)?; - let term_info = inverted_index.get_term_info(&term).unwrap(); + let term_info = inverted_index.get_term_info(&term)?.unwrap(); block_segments = inverted_index .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?; } @@ -521,7 +521,7 @@ mod tests { { let term = Term::from_field_u64(int_field, 1u64); let inverted_index = segment_reader.inverted_index(int_field)?; - let term_info = inverted_index.get_term_info(&term).unwrap(); + let term_info = inverted_index.get_term_info(&term)?.unwrap(); inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments)?; } assert_eq!(block_segments.docs(), &[1, 3, 5]); diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 83ee9c88e..6ca424e0a 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -7,6 +7,7 @@ use crate::schema::{Field, IndexRecordOption}; use crate::termdict::{TermDictionary, TermStreamer}; use crate::TantivyError; use crate::{DocId, Score}; +use std::io; use std::sync::Arc; use tantivy_fst::Automaton; @@ -28,7 +29,10 @@ where } } - fn automaton_stream<'a>(&'a self, term_dict: &'a TermDictionary) -> TermStreamer<'a, &'a A> { + fn automaton_stream<'a>( + &'a self, + term_dict: &'a TermDictionary, + ) -> io::Result> { let automaton: &A = &*self.automaton; let term_stream_builder = term_dict.search(automaton); term_stream_builder.into_stream() @@ -44,7 +48,7 @@ where let mut doc_bitset = BitSet::with_max_value(max_doc); let inverted_index = reader.inverted_index(self.field)?; let term_dict = inverted_index.terms(); - let mut term_stream = self.automaton_stream(term_dict); + let mut term_stream = self.automaton_stream(term_dict)?; while term_stream.advance() { let term_info = term_stream.value(); let mut block_segment_postings = inverted_index diff --git a/src/query/range_query.rs b/src/query/range_query.rs index 7d78bf4f2..55ea15720 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -11,6 +11,7 @@ use crate::schema::{Field, IndexRecordOption, Term}; use crate::termdict::{TermDictionary, TermStreamer}; use crate::{DocId, Score}; use std::collections::Bound; +use std::io; use std::ops::Range; fn map_bound TTo>( @@ -274,7 +275,7 @@ pub struct RangeWeight { } impl RangeWeight { - fn term_range<'a>(&self, term_dict: &'a TermDictionary) -> TermStreamer<'a> { + fn term_range<'a>(&self, term_dict: &'a TermDictionary) -> io::Result> { use std::collections::Bound::*; let mut term_stream_builder = term_dict.range(); term_stream_builder = match self.left_bound { @@ -298,7 +299,7 @@ impl Weight for RangeWeight { let inverted_index = reader.inverted_index(self.field)?; let term_dict = inverted_index.terms(); - let mut term_range = self.term_range(term_dict); + let mut term_range = self.term_range(term_dict)?; while term_range.advance() { let term_info = term_range.value(); let mut block_segment_postings = inverted_index diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index fb1e8e0fa..a7c583c29 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -45,7 +45,7 @@ impl Weight for TermWeight { } else { let field = self.term.field(); let inv_index = reader.inverted_index(field)?; - let term_info = inv_index.get_term_info(&self.term); + let term_info = inv_index.get_term_info(&self.term)?; Ok(term_info.map(|term_info| term_info.doc_freq).unwrap_or(0)) } } diff --git a/src/termdict/fst_termdict/streamer.rs b/src/termdict/fst_termdict/streamer.rs index b680d2ede..66ce02c2a 100644 --- a/src/termdict/fst_termdict/streamer.rs +++ b/src/termdict/fst_termdict/streamer.rs @@ -1,3 +1,5 @@ +use std::io; + use super::TermDictionary; use crate::postings::TermInfo; use crate::termdict::TermOrdinal; @@ -59,14 +61,14 @@ where /// Creates the stream corresponding to the range /// of terms defined using the `TermStreamerBuilder`. - pub fn into_stream(self) -> TermStreamer<'a, A> { - TermStreamer { + pub fn into_stream(self) -> io::Result> { + Ok(TermStreamer { fst_map: self.fst_map, stream: self.stream_builder.into_stream(), term_ord: 0u64, current_key: Vec::with_capacity(100), current_value: TermInfo::default(), - } + }) } } diff --git a/src/termdict/fst_termdict/termdict.rs b/src/termdict/fst_termdict/termdict.rs index 0dd54ec5d..240706dc6 100644 --- a/src/termdict/fst_termdict/termdict.rs +++ b/src/termdict/fst_termdict/termdict.rs @@ -139,8 +139,8 @@ impl TermDictionary { } /// Returns the ordinal associated to a given term. - pub fn term_ord>(&self, key: K) -> Option { - self.fst_index.get(key) + pub fn term_ord>(&self, key: K) -> io::Result> { + Ok(self.fst_index.get(key)) } /// Returns the term associated to a given term ordinal. @@ -179,9 +179,10 @@ impl TermDictionary { } /// Lookups the value corresponding to the key. - pub fn get>(&self, key: K) -> Option { - self.term_ord(key) - .map(|term_ord| self.term_info_from_ord(term_ord)) + pub fn get>(&self, key: K) -> io::Result> { + Ok(self + .term_ord(key)? + .map(|term_ord| self.term_info_from_ord(term_ord))) } /// Returns a range builder, to stream all of the terms @@ -191,7 +192,7 @@ impl TermDictionary { } /// A stream of all the sorted terms. [See also `.stream_field()`](#method.stream_field) - pub fn stream(&self) -> TermStreamer<'_> { + pub fn stream(&self) -> io::Result> { self.range().into_stream() } diff --git a/src/termdict/tests.rs b/src/termdict/tests.rs index c7d3afc7f..23b0cb0b9 100644 --- a/src/termdict/tests.rs +++ b/src/termdict/tests.rs @@ -1,8 +1,8 @@ use super::{TermDictionary, TermDictionaryBuilder, TermStreamer}; -use crate::core::Index; + use crate::directory::{Directory, FileSlice, RAMDirectory}; use crate::postings::TermInfo; -use crate::schema::{Schema, TEXT}; + use std::path::PathBuf; use std::str; @@ -21,7 +21,7 @@ fn make_term_info(term_ord: u64) -> TermInfo { #[test] fn test_empty_term_dictionary() { let empty = TermDictionary::empty(); - assert!(empty.stream().next().is_none()); + assert!(empty.stream().unwrap().next().is_none()); } #[test] @@ -48,7 +48,7 @@ fn test_term_ordinals() -> crate::Result<()> { let term_file = directory.open_read(&path)?; let term_dict: TermDictionary = TermDictionary::open(term_file)?; for (term_ord, term) in COUNTRIES.iter().enumerate() { - assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64); + assert_eq!(term_dict.term_ord(term)?, Some(term_ord as u64)); let mut bytes = vec![]; assert!(term_dict.ord_to_term(term_ord as u64, &mut bytes)); assert_eq!(bytes, term.as_bytes()); @@ -69,9 +69,9 @@ fn test_term_dictionary_simple() -> crate::Result<()> { } let file = directory.open_read(&path)?; let term_dict: TermDictionary = TermDictionary::open(file)?; - assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32); - assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32); - let mut stream = term_dict.stream(); + assert_eq!(term_dict.get("abc")?.unwrap().doc_freq, 34u32); + assert_eq!(term_dict.get("abcd")?.unwrap().doc_freq, 346u32); + let mut stream = term_dict.stream()?; { { let (k, v) = stream.next().unwrap(); @@ -94,33 +94,6 @@ fn test_term_dictionary_simple() -> crate::Result<()> { Ok(()) } -#[test] -fn test_term_iterator() -> crate::Result<()> { - let mut schema_builder = Schema::builder(); - let text_field = schema_builder.add_text_field("text", TEXT); - let index = Index::create_in_ram(schema_builder.build()); - { - let mut index_writer = index.writer_for_tests()?; - index_writer.add_document(doc!(text_field=>"a b d f")); - index_writer.commit()?; - index_writer.add_document(doc!(text_field=>"a b c d f")); - index_writer.commit()?; - index_writer.add_document(doc!(text_field => "e f")); - index_writer.commit()?; - } - let searcher = index.reader()?.searcher(); - - let field_searcher = searcher.field(text_field)?; - let mut term_it = field_searcher.terms(); - let mut term_string = String::new(); - while term_it.advance() { - //let term = Term::from_bytes(term_it.key()); - term_string.push_str(str::from_utf8(term_it.key()).expect("test")); - } - assert_eq!(&*term_string, "abcdef"); - Ok(()) -} - #[test] fn test_term_dictionary_stream() -> crate::Result<()> { let ids: Vec<_> = (0u32..10_000u32) @@ -138,7 +111,7 @@ fn test_term_dictionary_stream() -> crate::Result<()> { let term_file = FileSlice::from(buffer); let term_dictionary: TermDictionary = TermDictionary::open(term_file)?; { - let mut streamer = term_dictionary.stream(); + let mut streamer = term_dictionary.stream()?; let mut i = 0; while let Some((streamer_k, streamer_v)) = streamer.next() { let &(ref key, ref v) = &ids[i]; @@ -150,7 +123,7 @@ fn test_term_dictionary_stream() -> crate::Result<()> { let &(ref key, ref val) = &ids[2047]; assert_eq!( - term_dictionary.get(key.as_bytes()), + term_dictionary.get(key.as_bytes())?, Some(make_term_info(*val as u64)) ); Ok(()) @@ -168,7 +141,7 @@ fn test_stream_high_range_prefix_suffix() -> crate::Result<()> { }; let term_dict_file = FileSlice::from(buffer); let term_dictionary: TermDictionary = TermDictionary::open(term_dict_file)?; - let mut kv_stream = term_dictionary.stream(); + let mut kv_stream = term_dictionary.stream()?; assert!(kv_stream.advance()); assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes()); assert_eq!(kv_stream.value(), &make_term_info(1)); @@ -206,7 +179,7 @@ fn test_stream_range() -> crate::Result<()> { let mut streamer = term_dictionary .range() .ge(target_key.as_bytes()) - .into_stream(); + .into_stream()?; for j in 0..3 { let (streamer_k, streamer_v) = streamer.next().unwrap(); let &(ref key, ref v) = &ids[i + j]; @@ -223,7 +196,7 @@ fn test_stream_range() -> crate::Result<()> { let mut streamer = term_dictionary .range() .gt(target_key.as_bytes()) - .into_stream(); + .into_stream()?; for j in 0..3 { let (streamer_k, streamer_v) = streamer.next().unwrap(); let &(ref key, ref v) = &ids[i + j + 1]; @@ -242,7 +215,7 @@ fn test_stream_range() -> crate::Result<()> { .range() .ge(fst_key.as_bytes()) .lt(last_key.as_bytes()) - .into_stream(); + .into_stream()?; for _ in 0..j { assert!(streamer.next().is_some()); } @@ -267,7 +240,7 @@ fn test_empty_string() -> crate::Result<()> { }; let file = FileSlice::from(buffer); let term_dictionary: TermDictionary = TermDictionary::open(file)?; - let mut stream = term_dictionary.stream(); + let mut stream = term_dictionary.stream()?; assert!(stream.advance()); assert!(stream.key().is_empty()); assert!(stream.advance()); @@ -300,70 +273,70 @@ fn test_stream_range_boundaries() -> crate::Result<()> { res }; { - let range = term_dictionary.range().backward().into_stream(); + let range = term_dictionary.range().backward().into_stream()?; assert_eq!( value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] ); } { - let range = term_dictionary.range().ge([2u8]).into_stream(); + let range = term_dictionary.range().ge([2u8]).into_stream()?; assert_eq!( value_list(range, false), vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] ); } { - let range = term_dictionary.range().ge([2u8]).backward().into_stream(); + let range = term_dictionary.range().ge([2u8]).backward().into_stream()?; assert_eq!( value_list(range, true), vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] ); } { - let range = term_dictionary.range().gt([2u8]).into_stream(); + let range = term_dictionary.range().gt([2u8]).into_stream()?; assert_eq!( value_list(range, false), vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] ); } { - let range = term_dictionary.range().gt([2u8]).backward().into_stream(); + let range = term_dictionary.range().gt([2u8]).backward().into_stream()?; assert_eq!( value_list(range, true), vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] ); } { - let range = term_dictionary.range().lt([6u8]).into_stream(); + let range = term_dictionary.range().lt([6u8]).into_stream()?; assert_eq!( value_list(range, false), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32] ); } { - let range = term_dictionary.range().lt([6u8]).backward().into_stream(); + let range = term_dictionary.range().lt([6u8]).backward().into_stream()?; assert_eq!( value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32] ); } { - let range = term_dictionary.range().le([6u8]).into_stream(); + let range = term_dictionary.range().le([6u8]).into_stream()?; assert_eq!( value_list(range, false), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32] ); } { - let range = term_dictionary.range().le([6u8]).backward().into_stream(); + let range = term_dictionary.range().le([6u8]).backward().into_stream()?; assert_eq!( value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32] ); } { - let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream(); + let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream()?; assert_eq!(value_list(range, false), vec![0u32, 1u32, 2u32, 3u32, 4u32]); } { @@ -372,7 +345,7 @@ fn test_stream_range_boundaries() -> crate::Result<()> { .ge([0u8]) .lt([5u8]) .backward() - .into_stream(); + .into_stream()?; assert_eq!(value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32]); } Ok(()) @@ -410,7 +383,7 @@ fn test_automaton_search() -> crate::Result<()> { let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true); let automaton = DFAWrapper(lev_automaton_builder.build_dfa("Spaen")); - let mut range = term_dict.search(automaton).into_stream(); + let mut range = term_dict.search(automaton).into_stream()?; // get the first finding assert!(range.advance());