// tantivy/examples/stop_words.rs

// # Stop Words Example
//
// This example covers the basic usage of stop words
// with tantivy
//
// We will:
// - define our schema
// - create an index in a directory
// - add a few stop words
// - index a few documents in our index

extern crate tempdir;

// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::Index;

// Builds an in-RAM index whose text fields use a custom `stoppy` tokenizer,
// indexes three documents, then runs a query that contains a stop word and
// prints every hit as JSON.
//
// Returns an error if creating the index writer, committing, loading the
// searchers, parsing the query, searching, or fetching a document fails.
fn main() -> tantivy::Result<()> {
    // this example assumes you understand the content in `basic_search`
    let mut schema_builder = Schema::builder();

    // This configures your custom options for how tantivy will
    // store and process your content in the index; The key
    // to note is that we are setting the tokenizer to `stoppy`
    // which will be defined and registered below.
    let text_field_indexing = TextFieldIndexing::default()
        .set_tokenizer("stoppy")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let text_options = TextOptions::default()
        .set_indexing_options(text_field_indexing)
        .set_stored();

    // Both fields share the exact same options, so they are built once and
    // cloned for the first field. `add_text_field` hands back the `Field`
    // handle, which spares us a fallible lookup by name later on.
    let title = schema_builder.add_text_field("title", text_options.clone());
    let body = schema_builder.add_text_field("body", text_options);

    let schema = schema_builder.build();

    // The schema is cloned because it is still needed below to render the
    // retrieved documents as JSON.
    let index = Index::create_in_ram(schema.clone());

    // This tokenizer lowers all of the text (to help with stop word matching)
    // then removes all instances of `the` and `and` from the corpus
    let tokenizer = SimpleTokenizer
        .filter(LowerCaser)
        .filter(StopWordFilter::remove(vec![
            "the".to_string(),
            "and".to_string(),
        ]));

    // The name used here must match the one passed to `set_tokenizer` above.
    index.tokenizers().register("stoppy", tokenizer);

    let mut index_writer = index.writer(50_000_000)?;

    index_writer.add_document(doc!(
        title => "The Old Man and the Sea",
        body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
                 he had gone eighty-four days now without taking a fish."
    ));

    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
    ));

    index_writer.add_document(doc!(
        title => "Frankenstein",
        body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
                 enterprise which you have regarded with such evil forebodings.  I arrived here \
                 yesterday, and my first task is to assure my dear sister of my welfare and \
                 increasing confidence in the success of my undertaking."
    ));

    index_writer.commit()?;

    index.load_searchers()?;

    let searcher = index.searcher();

    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // stop words are applied on the query as well.
    // The following will be equivalent to `title:frankenstein`
    let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

    for (score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("\n==\nDocument score {}:", score);
        println!("{}", schema.to_json(&retrieved_doc));
    }

    Ok(())
}