// # Basic Example // // This example covers the basic functionalities of // tantivy. // // We will : // - define our schema // = create an index in a directory // - index few documents in our index // - search for the best document matchings "sea whale" // - retrieve the best document original content. extern crate tempdir; // --- // Importing tantivy... #[macro_use] extern crate tantivy; use tantivy::collector::TopCollector; use tantivy::query::QueryParser; use tantivy::schema::*; use tantivy::Index; fn main() -> tantivy::Result<()> { // Let's create a temporary directory for the // sake of this example let index_path = TempDir::new("tantivy_example_dir")?; // # Defining the schema // // The Tantivy index requires a very strict schema. // The schema declares which fields are in the index, // and for each field, its type and "the way it should // be indexed". // first we need to define a schema ... let mut schema_builder = SchemaBuilder::default(); // Our first field is title. // We want full-text search for it, and we also want // to be able to retrieve the document after the search. // // `TEXT | STORED` is some syntactic sugar to describe // that. // // `TEXT` means the field should be tokenized and indexed, // along with its term frequency and term positions. // // `STORED` means that the field will also be saved // in a compressed, row-oriented key-value store. // This store is useful to reconstruct the // documents that were selected during the search phase. schema_builder.add_text_field("title", TEXT | STORED); // Our second field is body. // We want full-text search for it, but we do not // need to be able to be able to retrieve it // for our application. // // We can make our index lighter and // by omitting `STORED` flag. schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); // # Indexing documents // // Let's create a brand new index. // // This will actually just save a meta.json // with our schema in the directory. let index = Index::create_in_dir(&index_path, schema.clone())?; // To insert document we need an index writer. // There must be only one writer at a time. // This single `IndexWriter` is already // multithreaded. // // Here we give tantivy a budget of `50MB`. // Using a bigger heap for the indexer may increase // throughput, but 50 MB is already plenty. let mut index_writer = index.writer(50_000_000)?; // Let's index our documents! // We first need a handle on the title and the body field. // ### Adding documents // // We can create a document manually, by setting the fields // one by one in a Document object. let title = schema.get_field("title").unwrap(); let body = schema.get_field("body").unwrap(); let mut old_man_doc = Document::default(); old_man_doc.add_text(title, "The Old Man and the Sea"); old_man_doc.add_text( body, "He was an old man who fished alone in a skiff in the Gulf Stream and \ he had gone eighty-four days now without taking a fish.", ); // ... and add it to the `IndexWriter`. index_writer.add_document(old_man_doc); // For convenience, tantivy also comes with a macro to // reduce the boilerplate above. index_writer.add_document(doc!( title => "Of Mice and Men", body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ bank and runs deep and green. The water is warm too, for it has slipped twinkling \ over the yellow sands in the sunlight before reaching the narrow pool. On one \ side of the river the golden foothill slopes curve up to the strong and rocky \ Gabilan Mountains, but on the valley side the water is lined with trees—willows \ fresh and green with every spring, carrying in their lower leaf junctures the \ debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ limbs and branches that arch over the pool" )); index_writer.add_document(doc!( title => "Of Mice and Men", body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ bank and runs deep and green. The water is warm too, for it has slipped twinkling \ over the yellow sands in the sunlight before reaching the narrow pool. On one \ side of the river the golden foothill slopes curve up to the strong and rocky \ Gabilan Mountains, but on the valley side the water is lined with trees—willows \ fresh and green with every spring, carrying in their lower leaf junctures the \ debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ limbs and branches that arch over the pool" )); // Multivalued field just need to be repeated. index_writer.add_document(doc!( title => "Frankenstein", title => "The Modern Prometheus", body => "You will rejoice to hear that no disaster has accompanied the commencement of an \ enterprise which you have regarded with such evil forebodings. I arrived here \ yesterday, and my first task is to assure my dear sister of my welfare and \ increasing confidence in the success of my undertaking." )); // This is an example, so we will only index 3 documents // here. You can check out tantivy's tutorial to index // the English wikipedia. Tantivy's indexing is rather fast. // Indexing 5 million articles of the English wikipedia takes // around 3 minutes on my computer! // ### Committing // // At this point our documents are not searchable. // // // We need to call .commit() explicitly to force the // index_writer to finish processing the documents in the queue, // flush the current index to the disk, and advertise // the existence of new documents. // // This call is blocking. index_writer.commit()?; // If `.commit()` returns correctly, then all of the // documents that have been added are guaranteed to be // persistently indexed. // // In the scenario of a crash or a power failure, // tantivy behaves as if has rolled back to its last // commit. // # Searching // // ### Searcher // // Let's search our index. Start by reloading // searchers in the index. This should be done // after every `commit()`. index.load_searchers()?; // We now need to acquire a searcher. // Some search experience might require more than // one query. // // The searcher ensure that we get to work // with a consistent version of the index. // // Acquiring a `searcher` is very cheap. // // You should acquire a searcher every time you // start processing a request and // and release it right after your query is finished. let searcher = index.searcher(); // ### Query // The query parser can interpret human queries. // Here, if the user does not specify which // field they want to search, tantivy will search // in both title and body. let query_parser = QueryParser::for_index(&index, vec![title, body]); // QueryParser may fail if the query is not in the right // format. For user facing applications, this can be a problem. // A ticket has been opened regarding this problem. let query = query_parser.parse_query("sea whale")?; // A query defines a set of documents, as // well as the way they should be scored. // // A query created by the query parser is scored according // to a metric called Tf-Idf, and will consider // any document matching at least one of our terms. // ### Collectors // // We are not interested in all of the documents but // only in the top 10. Keeping track of our top 10 best documents // is the role of the TopCollector. let mut top_collector = TopCollector::with_limit(10); // We can now perform our query. searcher.search(&*query, &mut top_collector)?; // Our top collector now contains the 10 // most relevant doc ids... let doc_addresses = top_collector.docs(); // The actual documents still need to be // retrieved from Tantivy's store. // // Since the body field was not configured as stored, // the document returned will only contain // a title. for doc_address in doc_addresses { let retrieved_doc = searcher.doc(doc_address)?; println!("{}", schema.to_json(&retrieved_doc)); } Ok(()) } use tempdir::TempDir;