update examples for literate docs (#1880)

2025-12-23 02:29:57 +00:00 · 2023-02-17 18:48:22 +08:00
parent 111f25a8f7
commit bf1449b22d
10 changed files with 510 additions and 102 deletions
--- a/examples/fuzzy_search.rs
+++ b/examples/fuzzy_search.rs
@@ -0,0 +1,170 @@
+// # Basic Example
+//
+// This example covers the basic functionalities of
+// tantivy.
+//
+// We will :
+// - define our schema
+// - create an index in a directory
+// - index a few documents into our index
+// - search for the best document matching a basic query
+// - retrieve the best document's original content.
+
+use std::collections::HashSet;
+
+// ---
+// Importing tantivy...
+use tantivy::collector::{Count, TopDocs};
+use tantivy::query::{FuzzyTermQuery, QueryParser};
+use tantivy::schema::*;
+use tantivy::{doc, DocId, Index, ReloadPolicy, Score, SegmentReader};
+use tempfile::TempDir;
+
+fn main() -> tantivy::Result<()> {
+    // Let's create a temporary directory for the
+    // sake of this example
+    let index_path = TempDir::new()?;
+
+    // # Defining the schema
+    //
+    // The Tantivy index requires a very strict schema.
+    // The schema declares which fields are in the index,
+    // and for each field, its type and "the way it should
+    // be indexed".
+
+    // First we need to define a schema ...
+    let mut schema_builder = Schema::builder();
+
+    // Our first field is title.
+    // We want full-text search for it, and we also want
+    // to be able to retrieve the document after the search.
+    //
+    // `TEXT | STORED` is some syntactic sugar to describe
+    // that.
+    //
+    // `TEXT` means the field should be tokenized and indexed,
+    // along with its term frequency and term positions.
+    //
+    // `STORED` means that the field will also be saved
+    // in a compressed, row-oriented key-value store.
+    // This store is useful for reconstructing the
+    // documents that were selected during the search phase.
+    let title = schema_builder.add_text_field("title", TEXT | STORED);
+
+    let schema = schema_builder.build();
+
+    // # Indexing documents
+    //
+    // Let's create a brand new index.
+    //
+    // This will actually just save a meta.json
+    // with our schema in the directory.
+    let index = Index::create_in_dir(&index_path, schema.clone())?;
+
+    // To insert a document we will need an index writer.
+    // There must be only one writer at a time.
+    // This single `IndexWriter` is already
+    // multithreaded.
+    //
+    // Here we give tantivy a budget of `50MB`.
+    // Using a bigger memory_arena for the indexer may increase
+    // throughput, but 50 MB is already plenty.
+    let mut index_writer = index.writer(50_000_000)?;
+
+    // Let's index our documents!
+    // We first need a handle on the title and the body field.
+
+    // ### Adding documents
+    //
+    index_writer.add_document(doc!(
+        title => "The Name of the Wind",
+    ))?;
+    index_writer.add_document(doc!(
+        title => "The Diary of Muadib",
+    ))?;
+    index_writer.add_document(doc!(
+        title => "A Dairy Cow",
+    ))?;
+    index_writer.add_document(doc!(
+        title => "The Diary of a Young Girl",
+    ))?;
+    index_writer.commit()?;
+
+    // ### Committing
+    //
+    // At this point our documents are not searchable.
+    //
+    //
+    // We need to call `.commit()` explicitly to force the
+    // `index_writer` to finish processing the documents in the queue,
+    // flush the current index to the disk, and advertise
+    // the existence of new documents.
+    //
+    // This call is blocking.
+    index_writer.commit()?;
+
+    // If `.commit()` returns correctly, then all of the
+    // documents that have been added are guaranteed to be
+    // persistently indexed.
+    //
+    // In the scenario of a crash or a power failure,
+    // tantivy behaves as if it has rolled back to its last
+    // commit.
+
+    // # Searching
+    //
+    // ### Searcher
+    //
+    // A reader is required first in order to search an index.
+    // It acts as a `Searcher` pool that reloads itself,
+    // depending on a `ReloadPolicy`.
+    //
+    // For a search server you will typically create one reader for the entire lifetime of your
+    // program, and acquire a new searcher for every single request.
+    //
+    // In the code below, we rely on the 'ON_COMMIT' policy: the reader
+    // will reload the index automatically after each commit.
+    let reader = index
+        .reader_builder()
+        .reload_policy(ReloadPolicy::OnCommit)
+        .try_into()?;
+
+    // We now need to acquire a searcher.
+    //
+    // A searcher points to a snapshotted, immutable version of the index.
+    //
+    // Some search experience might require more than
+    // one query. Using the same searcher ensures that all of these queries will run on the
+    // same version of the index.
+    //
+    // Acquiring a `searcher` is very cheap.
+    //
+    // You should acquire a searcher every time you start processing a request and
+    // and release it right after your query is finished.
+    let searcher = reader.searcher();
+
+    // ### FuzzyTermQuery
+    {
+        let term = Term::from_field_text(title, "Diary");
+        let query = FuzzyTermQuery::new(term, 2, true);
+
+        let (top_docs, count) = searcher
+            .search(&query, &(TopDocs::with_limit(5), Count))
+            .unwrap();
+        assert_eq!(count, 3);
+        assert_eq!(top_docs.len(), 3);
+        for (score, doc_address) in top_docs {
+            let retrieved_doc = searcher.doc(doc_address)?;
+            // Note that the score is not lower for the fuzzy hit.
+            // There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563
+            println!("score {score:?} doc {}", schema.to_json(&retrieved_doc));
+            // score 1.0 doc {"title":["The Diary of Muadib"]}
+            //
+            // score 1.0 doc {"title":["The Diary of a Young Girl"]}
+            //
+            // score 1.0 doc {"title":["A Dairy Cow"]}
+        }
+    }
+
+    Ok(())
+}