mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
improve docs, rework exports (#2220)
* rework exports move snippet and advice make indexer pub, remove indexer reexports * add deprecation warning * add architecture overview
This commit is contained in:
@@ -10,7 +10,8 @@
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::{doc, Index, IndexWriter, Snippet, SnippetGenerator};
|
||||
use tantivy::snippet::{Snippet, SnippetGenerator};
|
||||
use tantivy::{doc, Index, IndexWriter};
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
|
||||
@@ -18,11 +18,11 @@ use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_L
|
||||
use crate::error::{DataCorruption, TantivyError};
|
||||
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN};
|
||||
use crate::indexer::segment_updater::save_metas;
|
||||
use crate::indexer::IndexWriter;
|
||||
use crate::reader::{IndexReader, IndexReaderBuilder};
|
||||
use crate::schema::document::Document;
|
||||
use crate::schema::{Field, FieldType, Schema};
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::IndexWriter;
|
||||
|
||||
fn load_metas(
|
||||
directory: &dyn Directory,
|
||||
|
||||
@@ -8,6 +8,8 @@ use std::sync::{Arc, RwLock, Weak};
|
||||
|
||||
use common::StableDeref;
|
||||
use fs4::FileExt;
|
||||
#[cfg(all(feature = "mmap", unix))]
|
||||
pub use memmap2::Advice;
|
||||
use memmap2::Mmap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tempfile::TempDir;
|
||||
@@ -21,8 +23,6 @@ use crate::directory::{
|
||||
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
|
||||
WatchCallback, WatchHandle, WritePtr,
|
||||
};
|
||||
#[cfg(unix)]
|
||||
use crate::Advice;
|
||||
|
||||
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
|
||||
@@ -63,10 +63,13 @@ impl MergeOperation {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the opstamp up to which we want to consume the delete queue and reflect their
|
||||
/// deletes.
|
||||
pub fn target_opstamp(&self) -> Opstamp {
|
||||
self.inner.target_opstamp
|
||||
}
|
||||
|
||||
/// Returns the list of segments to be merged.
|
||||
pub fn segment_ids(&self) -> &[SegmentId] {
|
||||
&self.inner.segment_ids[..]
|
||||
}
|
||||
|
||||
@@ -1,23 +1,29 @@
|
||||
pub mod delete_queue;
|
||||
//! Indexing and merging data.
|
||||
//!
|
||||
//! Contains code to create and merge segments.
|
||||
//! `IndexWriter` is the main entry point for that, which is created from
|
||||
//! [`Index::writer`](crate::Index::writer).
|
||||
|
||||
pub mod doc_id_mapping;
|
||||
pub(crate) mod delete_queue;
|
||||
|
||||
pub(crate) mod doc_id_mapping;
|
||||
mod doc_opstamp_mapping;
|
||||
mod flat_map_with_buffer;
|
||||
pub mod index_writer;
|
||||
mod index_writer_status;
|
||||
pub(crate) mod index_writer;
|
||||
pub(crate) mod index_writer_status;
|
||||
mod log_merge_policy;
|
||||
mod merge_operation;
|
||||
pub mod merge_policy;
|
||||
pub mod merger;
|
||||
pub(crate) mod merge_policy;
|
||||
pub(crate) mod merger;
|
||||
mod merger_sorted_index_test;
|
||||
pub mod operation;
|
||||
pub mod prepared_commit;
|
||||
pub(crate) mod operation;
|
||||
pub(crate) mod prepared_commit;
|
||||
mod segment_entry;
|
||||
mod segment_manager;
|
||||
mod segment_register;
|
||||
pub mod segment_serializer;
|
||||
pub mod segment_updater;
|
||||
mod segment_writer;
|
||||
pub(crate) mod segment_serializer;
|
||||
pub(crate) mod segment_updater;
|
||||
pub(crate) mod segment_writer;
|
||||
mod stamper;
|
||||
|
||||
use crossbeam_channel as channel;
|
||||
@@ -27,10 +33,10 @@ pub use self::index_writer::IndexWriter;
|
||||
pub use self::log_merge_policy::LogMergePolicy;
|
||||
pub use self::merge_operation::MergeOperation;
|
||||
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||
pub use self::operation::UserOperation;
|
||||
pub use self::prepared_commit::PreparedCommit;
|
||||
pub use self::segment_entry::SegmentEntry;
|
||||
pub use self::segment_manager::SegmentManager;
|
||||
pub use self::segment_serializer::SegmentSerializer;
|
||||
pub(crate) use self::segment_serializer::SegmentSerializer;
|
||||
pub use self::segment_updater::{merge_filtered_segments, merge_indices};
|
||||
pub use self::segment_writer::SegmentWriter;
|
||||
use crate::indexer::operation::AddOperation;
|
||||
|
||||
@@ -155,6 +155,8 @@ impl SegmentWriter {
|
||||
Ok(doc_opstamps)
|
||||
}
|
||||
|
||||
/// Returns an estimation of the current memory usage of the segment writer.
|
||||
/// If the mem usage exceeds the `memory_budget`, the segment will be serialized.
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.ctx.mem_usage()
|
||||
+ self.fieldnorms_writer.mem_usage()
|
||||
|
||||
64
src/lib.rs
64
src/lib.rs
@@ -103,7 +103,48 @@
|
||||
//! the example code (
|
||||
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
|
||||
//! [source code](https://github.com/quickwit-oss/tantivy/blob/main/examples/basic_search.rs))
|
||||
|
||||
//!
|
||||
//! # Tantivy Architecture Overview
|
||||
//!
|
||||
//! Tantivy is inspired by Lucene; the architecture is very similar.
|
||||
//!
|
||||
//! ## Core Concepts
|
||||
//!
|
||||
//! - **[Index]**: A collection of segments. The top level entry point for tantivy users to search
|
||||
//! and index data.
|
||||
//!
|
||||
//! - **[Segment]**: At the heart of Tantivy's indexing structure is the [Segment]. It contains
|
||||
//! documents and indices and is the atomic unit of indexing and search.
|
||||
//!
|
||||
//! - **[Schema](schema)**: A schema is a set of fields in an index. Each field has a specific data
|
||||
//! type and set of attributes.
|
||||
//!
|
||||
//! - **[IndexWriter]**: Responsible for creating and merging segments. It executes the indexing
|
||||
//! pipeline including tokenization, creating indices, and storing the index in the
|
||||
//! [Directory](directory).
|
||||
//!
|
||||
//! - **Searching**: [Searcher] searches the segments with anything that implements
|
||||
//! [Query](query::Query) and merges the results. See the list of [supported
|
||||
//! queries](query::Query#implementors). Custom Queries are supported by implementing the
|
||||
//! [Query](query::Query) trait.
|
||||
//!
|
||||
//! - **[Directory](directory)**: Abstraction over the storage where the index data is stored.
|
||||
//!
|
||||
//! - **[Tokenizer](tokenizer)**: Breaks down text into individual tokens. Users can implement or
|
||||
//! use provided tokenizers.
|
||||
//!
|
||||
//! ## Architecture Flow
|
||||
//!
|
||||
//! 1. **Document Addition**: Users create documents according to the defined schema. The document's
|
||||
//! fields are tokenized, processed, and added to the current segment. See
|
||||
//! [Document](schema::document) for the structure and usage.
|
||||
//!
|
||||
//! 2. **Segment Creation**: Once the memory limit threshold is reached or a commit is called, the
|
||||
//! segment is written to the Directory. Documents are searchable after `commit`.
|
||||
//!
|
||||
//! 3. **Merging**: To optimize space and search speed, segments might be merged. This operation is
|
||||
//! performed in the background. Customize the merge behaviour via
|
||||
//! [IndexWriter::set_merge_policy].
|
||||
#[cfg_attr(test, macro_use)]
|
||||
extern crate serde_json;
|
||||
#[macro_use]
|
||||
@@ -137,7 +178,7 @@ pub use crate::future_result::FutureResult;
|
||||
pub type Result<T> = std::result::Result<T, TantivyError>;
|
||||
|
||||
mod core;
|
||||
mod indexer;
|
||||
pub mod indexer;
|
||||
|
||||
#[allow(unused_doc_comments)]
|
||||
pub mod error;
|
||||
@@ -161,8 +202,7 @@ pub mod termdict;
|
||||
mod reader;
|
||||
|
||||
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer};
|
||||
mod snippet;
|
||||
pub use self::snippet::{Snippet, SnippetGenerator};
|
||||
pub mod snippet;
|
||||
|
||||
mod docset;
|
||||
use std::fmt;
|
||||
@@ -173,6 +213,11 @@ use once_cell::sync::Lazy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub use self::docset::{DocSet, TERMINATED};
|
||||
#[deprecated(
|
||||
since = "0.22.0",
|
||||
note = "Will be removed in tantivy 0.23. Use export from snippet module instead"
|
||||
)]
|
||||
pub use self::snippet::{Snippet, SnippetGenerator};
|
||||
#[doc(hidden)]
|
||||
pub use crate::core::json_utils;
|
||||
pub use crate::core::{
|
||||
@@ -181,8 +226,12 @@ pub use crate::core::{
|
||||
SegmentReader, SingleSegmentIndexWriter,
|
||||
};
|
||||
pub use crate::directory::Directory;
|
||||
pub use crate::indexer::operation::UserOperation;
|
||||
pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, PreparedCommit};
|
||||
pub use crate::indexer::IndexWriter;
|
||||
#[deprecated(
|
||||
since = "0.22.0",
|
||||
note = "Will be removed in tantivy 0.23. Use export from indexer module instead"
|
||||
)]
|
||||
pub use crate::indexer::{merge_filtered_segments, merge_indices, PreparedCommit};
|
||||
pub use crate::postings::Postings;
|
||||
#[allow(deprecated)]
|
||||
pub use crate::schema::DatePrecision;
|
||||
@@ -191,9 +240,6 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocumen
|
||||
/// Index format version.
|
||||
const INDEX_FORMAT_VERSION: u32 = 5;
|
||||
|
||||
#[cfg(all(feature = "mmap", unix))]
|
||||
pub use memmap2::Advice;
|
||||
|
||||
/// Structure version for the index.
|
||||
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct Version {
|
||||
|
||||
@@ -1,3 +1,59 @@
|
||||
//! [`SnippetGenerator`]
|
||||
//! Generates a text snippet for a given document, and some highlighted parts inside it.
|
||||
//! Imagine you are doing a text search in a document
|
||||
//! and want to show a preview of where in the document the search terms occur,
|
||||
//! along with some surrounding text to give context, and the search terms highlighted.
|
||||
//!
|
||||
//! [`SnippetGenerator`] serves this purpose.
|
||||
//! It scans a document and constructs a snippet, which consists of sections where the search terms
|
||||
//! have been found, stitched together with "..." in between sections if necessary.
|
||||
//!
|
||||
//! ## Example
|
||||
//!
|
||||
//! ```rust
|
||||
//! # use tantivy::query::QueryParser;
|
||||
//! # use tantivy::schema::{Schema, TEXT};
|
||||
//! # use tantivy::{doc, Index};
|
||||
//! use tantivy::snippet::SnippetGenerator;
|
||||
//!
|
||||
//! # fn main() -> tantivy::Result<()> {
|
||||
//! # let mut schema_builder = Schema::builder();
|
||||
//! # let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
//! # let schema = schema_builder.build();
|
||||
//! # let index = Index::create_in_ram(schema);
|
||||
//! # let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
|
||||
//! # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
|
||||
//! # Je ne me sentis plus guidé par les haleurs :
|
||||
//! # Des Peaux-Rouges criards les avaient pris pour cibles,
|
||||
//! # Les ayant cloués nus aux poteaux de couleurs.
|
||||
//! #
|
||||
//! # J'étais insoucieux de tous les équipages,
|
||||
//! # Porteur de blés flamands ou de cotons anglais.
|
||||
//! # Quand avec mes haleurs ont fini ces tapages,
|
||||
//! # Les Fleuves m'ont laissé descendre où je voulais.
|
||||
//! # "#);
|
||||
//! # index_writer.add_document(doc.clone())?;
|
||||
//! # index_writer.commit()?;
|
||||
//! # let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||
//! // ...
|
||||
//! let query = query_parser.parse_query("haleurs flamands").unwrap();
|
||||
//! # let reader = index.reader()?;
|
||||
//! # let searcher = reader.searcher();
|
||||
//! let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
|
||||
//! snippet_generator.set_max_num_chars(100);
|
||||
//! let snippet = snippet_generator.snippet_from_doc(&doc);
|
||||
//! let snippet_html: String = snippet.to_html();
|
||||
//! assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des");
|
||||
//! # Ok(())
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! You can also specify the maximum number of characters for the snippets generated with the
|
||||
//! `set_max_num_chars` method. By default, this limit is set to 150.
|
||||
//!
|
||||
//! SnippetGenerator needs to be created from the `Searcher` and the query, and the field on which
|
||||
//! the `SnippetGenerator` should generate the snippets.
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::ops::Range;
|
||||
@@ -16,7 +72,7 @@ const DEFAULT_SNIPPET_PREFIX: &str = "<b>";
|
||||
const DEFAULT_SNIPPET_POSTFIX: &str = "</b>";
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FragmentCandidate {
|
||||
pub(crate) struct FragmentCandidate {
|
||||
score: Score,
|
||||
start_offset: usize,
|
||||
stop_offset: usize,
|
||||
@@ -256,7 +312,7 @@ fn is_sorted(mut it: impl Iterator<Item = usize>) -> bool {
|
||||
/// # use tantivy::query::QueryParser;
|
||||
/// # use tantivy::schema::{Schema, TEXT};
|
||||
/// # use tantivy::{doc, Index};
|
||||
/// use tantivy::SnippetGenerator;
|
||||
/// use tantivy::snippet::SnippetGenerator;
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// # let mut schema_builder = Schema::builder();
|
||||
@@ -346,7 +402,7 @@ impl SnippetGenerator {
|
||||
})
|
||||
}
|
||||
|
||||
/// Sets a maximum number of chars.
|
||||
/// Sets a maximum number of chars. Default is 150.
|
||||
pub fn set_max_num_chars(&mut self, max_num_chars: usize) {
|
||||
self.max_num_chars = max_num_chars;
|
||||
}
|
||||
@@ -398,8 +454,9 @@ mod tests {
|
||||
use super::{collapse_overlapped_ranges, search_fragments, select_best_fragment_combination};
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
|
||||
use crate::snippet::SnippetGenerator;
|
||||
use crate::tokenizer::{NgramTokenizer, SimpleTokenizer};
|
||||
use crate::{Index, SnippetGenerator};
|
||||
use crate::Index;
|
||||
|
||||
const TEST_TEXT: &str = r#"Rust is a systems programming language sponsored by
|
||||
Mozilla which describes it as a "safe, concurrent, practical language", supporting functional and
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
//! The term dictionary's main role is to associate the sorted [`Term`s](crate::Term) to
|
||||
//! a [`TermInfo`](crate::postings::TermInfo) struct that contains some meta-information
|
||||
//! a [`TermInfo`] struct that contains some meta-information
|
||||
//! about the term.
|
||||
//!
|
||||
//! Internally, the term dictionary relies on the `fst` crate to store
|
||||
@@ -16,8 +16,7 @@
|
||||
//! `f64`-terms are transformed to `u64` using a mapping that preserves order, and are then treated
|
||||
//! as `u64`.
|
||||
//!
|
||||
//! A second datastructure makes it possible to access a
|
||||
//! [`TermInfo`](crate::postings::TermInfo).
|
||||
//! A second datastructure makes it possible to access a [`TermInfo`].
|
||||
|
||||
#[cfg(not(feature = "quickwit"))]
|
||||
mod fst_termdict;
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! ready for indexing. This is a separate crate from tantivy, so implementors don't need to update
|
||||
//! for each new tantivy version.
|
||||
//!
|
||||
//! To add support for a tokenizer, implement the [`Tokenizer`](crate::Tokenizer) trait.
|
||||
//! To add support for a tokenizer, implement the [`Tokenizer`] trait.
|
||||
//! Check out the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.
|
||||
|
||||
use std::borrow::{Borrow, BorrowMut};
|
||||
|
||||
Reference in New Issue
Block a user