tantivy/src/core/searcher.rs

use crate::collector::Collector;
use crate::core::Executor;
use crate::core::InvertedIndexReader;
use crate::core::SegmentReader;
use crate::query::Query;
use crate::schema::Document;
use crate::schema::Schema;
use crate::schema::{Field, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::StoreReader;
use crate::termdict::TermMerger;
use crate::DocAddress;
use crate::Index;
use std::fmt;
use std::sync::Arc;

/// Holds a list of `SegmentReader`s ready for search.
///
/// It guarantees that the `Segment` will not be removed before
/// the destruction of the `Searcher`.
pub struct Searcher {
    /// Schema handed out by `Searcher::schema`.
    schema: Schema,
    /// Index this searcher belongs to; it also provides the default search executor.
    index: Index,
    /// One reader per segment, addressed by segment ordinal.
    segment_readers: Vec<SegmentReader>,
    /// Store readers obtained from `segment_readers`, kept in the same order.
    store_readers: Vec<StoreReader>,
}

impl Searcher {
    /// Creates a new `Searcher`
    pub(crate) fn new(
        schema: Schema,
        index: Index,
        segment_readers: Vec<SegmentReader>,
    ) -> Searcher {
        let store_readers = segment_readers
            .iter()
            .map(SegmentReader::get_store_reader)
            .collect();
        Searcher {
            schema,
            index,
            segment_readers,
            store_readers,
        }
    }

    /// Returns the `Index` associated to the `Searcher`
    pub fn index(&self) -> &Index {
        &self.index
    }

    /// Fetches a document from tantivy's store given a `DocAddress`.
    ///
    /// The searcher uses the segment ordinal to route the
    /// the request to the right `Segment`.
    pub fn doc(&self, doc_address: DocAddress) -> crate::Result<Document> {
        let DocAddress(segment_local_id, doc_id) = doc_address;
        let store_reader = &self.store_readers[segment_local_id as usize];
        store_reader.get(doc_id)
    }

    /// Access the schema associated to the index of this searcher.
    pub fn schema(&self) -> &Schema {
        &self.schema
    }

    /// Returns the overall number of documents in the index.
    pub fn num_docs(&self) -> u64 {
        self.segment_readers
            .iter()
            .map(|segment_reader| u64::from(segment_reader.num_docs()))
            .sum::<u64>()
    }

    /// Return the overall number of documents containing
    /// the given term.
    pub fn doc_freq(&self, term: &Term) -> u64 {
        self.segment_readers
            .iter()
            .map(|segment_reader| {
                u64::from(segment_reader.inverted_index(term.field()).doc_freq(term))
            })
            .sum::<u64>()
    }

    /// Return the list of segment readers
    pub fn segment_readers(&self) -> &[SegmentReader] {
        &self.segment_readers
    }

    /// Returns the segment_reader associated with the given segment_ordinal
    pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
        &self.segment_readers[segment_ord as usize]
    }

    /// Runs a query on the segment readers wrapped by the searcher.
    ///
    /// Search works as follows :
    ///
    ///  First the weight object associated to the query is created.
    ///
    ///  Then, the query loops over the segments and for each segment :
    ///  - setup the collector and informs it that the segment being processed has changed.
    ///  - creates a SegmentCollector for collecting documents associated to the segment
    ///  - creates a `Scorer` object associated for this segment
    ///  - iterate through the matched documents and push them to the segment collector.
    ///
    ///  Finally, the Collector merges each of the child collectors into itself for result usability
    ///  by the caller.
    pub fn search<C: Collector>(
        &self,
        query: &dyn Query,
        collector: &C,
    ) -> crate::Result<C::Fruit> {
        let executor = self.index.search_executor();
        self.search_with_executor(query, collector, executor)
    }

    /// Same as [`search(...)`](#method.search) but multithreaded.
    ///
    /// The current implementation is rather naive :
    /// multithreading is by splitting search into as many task
    /// as there are segments.
    ///
    /// It is powerless at making search faster if your index consists in
    /// one large segment.
    ///
    /// Also, keep in my multithreading a single query on several
    /// threads will not improve your throughput. It can actually
    /// hurt it. It will however, decrease the average response time.
    pub fn search_with_executor<C: Collector>(
        &self,
        query: &dyn Query,
        collector: &C,
        executor: &Executor,
    ) -> crate::Result<C::Fruit> {
        let scoring_enabled = collector.requires_scoring();
        let weight = query.weight(self, scoring_enabled)?;
        let segment_readers = self.segment_readers();
        let fruits = executor.map(
            |(segment_ord, segment_reader)| {
                collector.collect_segment(weight.as_ref(), segment_ord as u32, segment_reader)
            },
            segment_readers.iter().enumerate(),
        )?;
        collector.merge_fruits(fruits)
    }

    /// Return the field searcher associated to a `Field`.
    pub fn field(&self, field: Field) -> FieldSearcher {
        let inv_index_readers = self
            .segment_readers
            .iter()
            .map(|segment_reader| segment_reader.inverted_index(field))
            .collect::<Vec<_>>();
        FieldSearcher::new(inv_index_readers)
    }

    /// Summarize total space usage of this searcher.
    pub fn space_usage(&self) -> SearcherSpaceUsage {
        let mut space_usage = SearcherSpaceUsage::new();
        for segment_reader in self.segment_readers.iter() {
            space_usage.add_segment(segment_reader.space_usage());
        }
        space_usage
    }
}

/// Gives access to the terms of a field through a set of
/// `InvertedIndexReader`s.
///
/// Instances are built by `Searcher::field`, which supplies the inverted
/// index reader of the requested field for each of the searcher's segments.
pub struct FieldSearcher {
    /// Inverted index readers whose term dictionaries are merged by `terms()`.
    inv_index_readers: Vec<Arc<InvertedIndexReader>>,
}

impl FieldSearcher {
    /// Wraps the given inverted index readers into a `FieldSearcher`.
    fn new(inv_index_readers: Vec<Arc<InvertedIndexReader>>) -> Self {
        Self { inv_index_readers }
    }

    /// Returns a stream over all of the sorted unique terms
    /// of the given field.
    ///
    /// One term stream is opened per inverted index reader and all of them
    /// are handed to a `TermMerger`.
    pub fn terms(&self) -> TermMerger<'_> {
        let mut term_streamers = Vec::with_capacity(self.inv_index_readers.len());
        for inverted_index in &self.inv_index_readers {
            term_streamers.push(inverted_index.terms().stream());
        }
        TermMerger::new(term_streamers)
    }
}

impl fmt::Debug for Searcher {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let segment_ids = self
            .segment_readers
            .iter()
            .map(SegmentReader::segment_id)
            .collect::<Vec<_>>();
        write!(f, "Searcher({:?})", segment_ids)
    }
}