diff --git a/examples/iterating_docs_and_positions.rs b/examples/iterating_docs_and_positions.rs new file mode 100644 index 000000000..9d3937617 --- /dev/null +++ b/examples/iterating_docs_and_positions.rs @@ -0,0 +1,139 @@ +// # Iterating docs and positions. +// +// At its core, tantivy relies on a data structure +// called an inverted index. +// +// This example shows how to manually iterate through +// the list of documents containing a term, getting +// its term frequency, and accessing its positions. + + +// --- +// Importing tantivy... +#[macro_use] +extern crate tantivy; +use tantivy::schema::*; +use tantivy::Index; +use tantivy::{DocSet, DocId, Postings}; + +fn main() -> tantivy::Result<()> { + + + // We first create a schema for the sake of the + // example. Check the `basic_search` example for more information. + let mut schema_builder = SchemaBuilder::default(); + + // For this example, we need to make sure to index positions for our title + // field. `TEXT` precisely does this. + let title = schema_builder.add_text_field("title", TEXT | STORED); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema.clone()); + + let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?; + index_writer.add_document(doc!(title => "The Old Man and the Sea")); + index_writer.add_document(doc!(title => "Of Mice and Men")); + index_writer.add_document(doc!(title => "The modern Promotheus")); + index_writer.commit()?; + + index.load_searchers()?; + + let searcher = index.searcher(); + + // A tantivy index is actually a collection of segments. + // Similarly, a searcher just wraps a list of `segment_reader`s. + // + // (Because we indexed a very small number of documents over one thread + // there is actually only one segment here, but let's iterate through the list + // anyway) + for segment_reader in searcher.segment_readers() { + + // A segment contains different data structures.
+ // Inverted index stands for the combination of + // - the term dictionary + // - the inverted lists associated with each term and their positions + let inverted_index = segment_reader.inverted_index(title); + + // A `Term` is a text token associated with a field. + // Let's go through all docs containing the term `title:the` and access their positions + let term_the = Term::from_field_text(title, "the"); + + + // This segment posting object is like a cursor over the documents matching the term. + // The `IndexRecordOption` argument tells tantivy we will be interested in both term frequencies + // and positions. + // + // If you don't need all this information, you may get better performance by decompressing less + // information. + if let Some(mut segment_postings) = inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions) { + + // this buffer will be used to request positions + let mut positions: Vec = Vec::with_capacity(100); + while segment_postings.advance() { + + // the id of the document the cursor is currently positioned on. + let doc_id: DocId = segment_postings.doc(); //< do not try to access this before calling advance once. + + // This MAY contain deleted documents as well. + if segment_reader.is_deleted(doc_id) { + continue; + } + + // the number of times the term appears in the document. + let term_freq: u32 = segment_postings.term_freq(); + // accessing positions is slightly expensive and lazy, do not request + // for them if you don't need them for some documents. + segment_postings.positions(&mut positions); + + // By definition we should have `term_freq` positions. + assert_eq!(positions.len(), term_freq as usize); + + // This prints: + // ``` + // Doc 0: TermFreq 2: [0, 4] + // Doc 2: TermFreq 1: [0] + // ``` + println!("Doc {}: TermFreq {}: {:?}", doc_id, term_freq, positions); + } + } + } + + + // A `Term` is a text token associated with a field.
+ // Let's go through all docs containing the term `title:the` and access their positions + let term_the = Term::from_field_text(title, "the"); + + // Some other powerful operations (especially `.skip_to`) may be useful to consume these + // posting lists rapidly. + // You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait + // and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait + + // Also, for some VERY specific high performance use cases like an OLAP analysis of logs, + // you can get better performance by directly accessing the blocks of doc ids. + for segment_reader in searcher.segment_readers() { + + // A segment contains different data structures. + // Inverted index stands for the combination of + // - the term dictionary + // - the inverted lists associated with each term and their positions + let inverted_index = segment_reader.inverted_index(title); + + // This block segment postings object is like a cursor over blocks of doc ids matching the term. + // The `IndexRecordOption` argument tells tantivy which information we are interested in. Here we + // pass `IndexRecordOption::Basic`, since only the doc ids are read below. + // + // Requesting only what you need may give better performance, since less information has to be + // decompressed. + if let Some(mut block_segment_postings) = inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic) { + while block_segment_postings.advance() { + // Once again these docs MAY contain deleted documents as well. + let docs = block_segment_postings.docs(); + // Prints `Docs [0, 2]`. + println!("Docs {:?}", docs); + } + } + } + + Ok(()) +} + diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index 57716748a..b919e09b0 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -100,6 +100,20 @@ impl InvertedIndexReader { block_postings.reset(term_info.doc_freq, postings_reader); } + + /// Returns a block postings given a `Term`.
+ /// This method is for advanced usage only. + /// + /// Most users should prefer using `read_postings` instead. + pub fn read_block_postings( + &self, + term: &Term, + option: IndexRecordOption, + ) -> Option { + self.get_term_info(term) + .map(move|term_info| self.read_block_postings_from_terminfo(&term_info, option)) + } + /// Returns a block postings given a `term_info`. /// This method is for an advanced usage only. /// @@ -159,8 +173,8 @@ impl InvertedIndexReader { /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` /// with `DocId`s and frequencies. pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option { - let term_info = get!(self.get_term_info(term)); - Some(self.read_postings_from_terminfo(&term_info, option)) + self.get_term_info(term) + .map(move |term_info| self.read_postings_from_terminfo(&term_info, option)) } pub(crate) fn read_postings_no_deletes( @@ -168,8 +182,8 @@ impl InvertedIndexReader { term: &Term, option: IndexRecordOption, ) -> Option { - let term_info = get!(self.get_term_info(term)); - Some(self.read_postings_from_terminfo(&term_info, option)) + self.get_term_info(term) + .map(|term_info| self.read_postings_from_terminfo(&term_info, option)) } /// Returns the number of documents containing the term. diff --git a/src/macros.rs b/src/macros.rs index 5e3d9b023..87d4d926e 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -1,7 +1,3 @@ -macro_rules! get( - ($e:expr) => (match $e { Some(e) => e, None => return None }) -); - /// `doc!` is a shortcut that helps building `Document` /// objects. ///