mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 16:22:55 +00:00
543 lines
20 KiB
HTML
543 lines
20 KiB
HTML
<!DOCTYPE html>
|
||
|
||
<html>
|
||
<head>
|
||
<title>simple_search.rs</title>
|
||
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||
<link rel="stylesheet" media="all" href="docco.css" />
|
||
</head>
|
||
<body>
|
||
<div id="container">
|
||
<div id="background"></div>
|
||
|
||
<ul class="sections">
|
||
|
||
<li id="title">
|
||
<div class="annotation">
|
||
<h1>simple_search.rs</h1>
|
||
</div>
|
||
</li>
|
||
|
||
|
||
|
||
<li id="section-1">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-1">¶</a>
|
||
</div>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre><span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> tantivy;
|
||
<span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> tempdir;
|
||
|
||
<span class="hljs-meta">#[macro_use]</span>
|
||
<span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> serde_json;
|
||
|
||
<span class="hljs-keyword">use</span> std::path::Path;
|
||
<span class="hljs-keyword">use</span> tempdir::TempDir;
|
||
<span class="hljs-keyword">use</span> tantivy::Index;
|
||
<span class="hljs-keyword">use</span> tantivy::schema::*;
|
||
<span class="hljs-keyword">use</span> tantivy::collector::TopCollector;
|
||
<span class="hljs-keyword">use</span> tantivy::query::QueryParser;
|
||
|
||
<span class="hljs-function"><span class="hljs-keyword">fn</span> <span class="hljs-title">main</span></span>() {</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-2">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-2">¶</a>
|
||
</div>
|
||
<p>Let’s create a temporary directory for the
|
||
sake of this example</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">if</span> <span class="hljs-keyword">let</span> <span class="hljs-literal">Ok</span>(dir) = TempDir::new(<span class="hljs-string">"tantivy_example_dir"</span>) {
|
||
run_example(dir.path()).unwrap();
|
||
dir.close().unwrap();
|
||
}
|
||
}
|
||
|
||
|
||
<span class="hljs-function"><span class="hljs-keyword">fn</span> <span class="hljs-title">run_example</span></span>(index_path: &amp;Path) -&gt; tantivy::<span class="hljs-built_in">Result</span>&lt;()&gt; {</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-3">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-3">¶</a>
|
||
</div>
|
||
<h1 id="defining-the-schema">Defining the schema</h1>
|
||
<p>The Tantivy index requires a very strict schema.
|
||
The schema declares which fields are in the index,
|
||
and for each field, its type and “the way it should
|
||
be indexed”.</p>
|
||
|
||
</div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-4">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-4">¶</a>
|
||
</div>
|
||
<p>First, we need to define a schema …</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> schema_builder = SchemaBuilder::<span class="hljs-keyword">default</span>();</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-5">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-5">¶</a>
|
||
</div>
|
||
<p>Our first field is title.
|
||
We want full-text search for it, and we also want
|
||
to be able to retrieve the document after the search.</p>
|
||
<p>TEXT | STORED is some syntactic sugar to describe
|
||
that.</p>
|
||
<p><code>TEXT</code> means the field should be tokenized and indexed,
|
||
along with its term frequency and term positions.</p>
|
||
<p><code>STORED</code> means that the field will also be saved
|
||
in a compressed, row-oriented key-value store.
|
||
This store is useful to reconstruct the
|
||
documents that were selected during the search phase.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> schema_builder.add_text_field(<span class="hljs-string">"title"</span>, TEXT | STORED);</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-6">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-6">¶</a>
|
||
</div>
|
||
<p>Our second field is body.
|
||
We want full-text search for it, but we do not
|
||
need to be able to retrieve it
|
||
for our application. </p>
|
||
<p>We can make our index lighter
|
||
by omitting the <code>STORED</code> flag.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> schema_builder.add_text_field(<span class="hljs-string">"body"</span>, TEXT);
|
||
|
||
<span class="hljs-keyword">let</span> schema = schema_builder.build();</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-7">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-7">¶</a>
|
||
</div>
|
||
<h1 id="indexing-documents">Indexing documents</h1>
|
||
<p>Let’s create a brand new index.</p>
|
||
<p>This will actually just save a meta.json
|
||
with our schema in the directory.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> index = Index::create(index_path, schema.clone())?;</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-8">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-8">¶</a>
|
||
</div>
|
||
<p>To insert documents we need an index writer.
|
||
There must be only one writer at a time.
|
||
This single <code>IndexWriter</code> is already
|
||
multithreaded.</p>
|
||
<p>Here we use a buffer of 50MB per thread. Using a bigger
|
||
heap for the indexer can increase its throughput.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> index_writer = index.writer(<span class="hljs-number">50_000_000</span>)?;</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-9">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-9">¶</a>
|
||
</div>
|
||
<p>Let’s index our documents!
|
||
We first need a handle on the title and the body field.</p>
|
||
|
||
</div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-10">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-10">¶</a>
|
||
</div>
|
||
<h3 id="create-a-document-manually-">Create a document “manually”.</h3>
|
||
<p>We can create a document manually, by setting the fields
|
||
one by one in a Document object.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> title = schema.get_field(<span class="hljs-string">"title"</span>).unwrap();
|
||
<span class="hljs-keyword">let</span> body = schema.get_field(<span class="hljs-string">"body"</span>).unwrap();
|
||
|
||
<span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> old_man_doc = Document::<span class="hljs-keyword">default</span>();
|
||
old_man_doc.add_text(title, <span class="hljs-string">"The Old Man and the Sea"</span>);
|
||
old_man_doc.add_text(
|
||
body,
|
||
<span class="hljs-string">"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||
he had gone eighty-four days now without taking a fish."</span>,
|
||
);</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-11">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-11">¶</a>
|
||
</div>
|
||
<p>… and add it to the <code>IndexWriter</code>.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> index_writer.add_document(old_man_doc);</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-12">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-12">¶</a>
|
||
</div>
|
||
<h3 id="create-a-document-directly-from-json-">Create a document directly from json.</h3>
|
||
<p>Alternatively, we can use our schema to parse a
|
||
document object directly from json.
|
||
The document is a string, but we use the <code>json</code> macro
|
||
from <code>serde_json</code> for the convenience of multi-line support.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> json = json!({
|
||
<span class="hljs-string">"title"</span>: <span class="hljs-string">"Of Mice and Men"</span>,
|
||
<span class="hljs-string">"body"</span>: <span class="hljs-string">"A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||
limbs and branches that arch over the pool"</span>
|
||
});
|
||
<span class="hljs-keyword">let</span> mice_and_men_doc = schema.parse_document(&amp;json.to_string())?;
|
||
|
||
index_writer.add_document(mice_and_men_doc);</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-13">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-13">¶</a>
|
||
</div>
|
||
<p>Multi-valued fields are allowed; they are
|
||
expressed in JSON by an array.
|
||
The following document has two titles.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> json = json!({
|
||
<span class="hljs-string">"title"</span>: [<span class="hljs-string">"Frankenstein"</span>, <span class="hljs-string">"The Modern Prometheus"</span>],
|
||
<span class="hljs-string">"body"</span>: <span class="hljs-string">"You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||
increasing confidence in the success of my undertaking."</span>
|
||
});
|
||
<span class="hljs-keyword">let</span> frankenstein_doc = schema.parse_document(&amp;json.to_string())?;
|
||
|
||
index_writer.add_document(frankenstein_doc);</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-14">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-14">¶</a>
|
||
</div>
|
||
<p>This is an example, so we will only index 3 documents
|
||
here. You can check out tantivy’s tutorial to index
|
||
the English Wikipedia. Tantivy’s indexing is rather fast.
|
||
Indexing 5 million articles of the English Wikipedia takes
|
||
around 4 minutes on my computer!</p>
|
||
|
||
</div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-15">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-15">¶</a>
|
||
</div>
|
||
<h3 id="committing">Committing</h3>
|
||
<p>At this point our documents are not searchable.</p>
|
||
<p>We need to call .commit() explicitly to force the
|
||
index_writer to finish processing the documents in the queue,
|
||
flush the current index to the disk, and advertise
|
||
the existence of new documents.</p>
|
||
<p>This call is blocking.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> index_writer.commit()?;</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-16">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-16">¶</a>
|
||
</div>
|
||
<p>If <code>.commit()</code> returns correctly, then all of the
|
||
documents that have been added are guaranteed to be
|
||
persistently indexed.</p>
|
||
<p>In the scenario of a crash or a power failure,
|
||
tantivy behaves as if it has rolled back to its last
|
||
commit.</p>
|
||
|
||
</div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-17">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-17">¶</a>
|
||
</div>
|
||
<h1 id="searching">Searching</h1>
|
||
<p>Let’s search our index. Start by reloading
|
||
searchers in the index. This should be done
|
||
after every commit().</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> index.load_searchers()?;</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-18">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-18">¶</a>
|
||
</div>
|
||
<p>Afterwards create one (or more) searchers.</p>
|
||
<p>You should create a searcher
|
||
every time you start a “search query”.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> searcher = index.searcher();</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-19">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-19">¶</a>
|
||
</div>
|
||
<p>The query parser can interpret human queries.
|
||
Here, if the user does not specify which
|
||
field they want to search, tantivy will search
|
||
in both title and body.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> query_parser = QueryParser::for_index(index, <span class="hljs-built_in">vec!</span>[title, body]);</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-20">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-20">¶</a>
|
||
</div>
|
||
<p>QueryParser may fail if the query is not in the right
|
||
format. For user facing applications, this can be a problem.
|
||
A ticket has been opened regarding this problem.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> query = query_parser.parse_query(<span class="hljs-string">"sea whale"</span>)?;</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-21">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-21">¶</a>
|
||
</div>
|
||
<p>A query defines a set of documents, as
|
||
well as the way they should be scored.</p>
|
||
<p>A query created by the query parser is scored according
|
||
to a metric called Tf-Idf, and will consider
|
||
any document matching at least one of our terms.</p>
|
||
|
||
</div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-22">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-22">¶</a>
|
||
</div>
|
||
<h3 id="collectors">Collectors</h3>
|
||
<p>We are not interested in all of the documents but
|
||
only in the top 10. Keeping track of our top 10 best documents
|
||
is the role of the TopCollector.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> top_collector = TopCollector::with_limit(<span class="hljs-number">10</span>);</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-23">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-23">¶</a>
|
||
</div>
|
||
<p>We can now perform our query.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> searcher.search(&amp;*query, &amp;<span class="hljs-keyword">mut</span> top_collector)?;</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-24">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-24">¶</a>
|
||
</div>
|
||
<p>Our top collector now contains the 10
|
||
most relevant doc ids…</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> doc_addresses = top_collector.docs();</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-25">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-25">¶</a>
|
||
</div>
|
||
<p>The actual documents still need to be
|
||
retrieved from Tantivy’s store.</p>
|
||
<p>Since the body field was not configured as stored,
|
||
the document returned will only contain
|
||
a title.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre>
|
||
<span class="hljs-keyword">for</span> doc_address <span class="hljs-keyword">in</span> doc_addresses {
|
||
<span class="hljs-keyword">let</span> retrieved_doc = searcher.doc(&amp;doc_address)?;
|
||
<span class="hljs-built_in">println!</span>(<span class="hljs-string">"{}"</span>, schema.to_json(&amp;retrieved_doc));
|
||
}</pre></div></div>
|
||
|
||
</li>
|
||
|
||
|
||
<li id="section-26">
|
||
<div class="annotation">
|
||
|
||
<div class="pilwrap ">
|
||
<a class="pilcrow" href="#section-26">¶</a>
|
||
</div>
|
||
<p>Wait for indexing and merging threads to shut down.
|
||
Usually this isn’t needed, but in <code>main</code> we try to
|
||
delete the temporary directory and that fails on
|
||
Windows if the files are still open.</p>
|
||
|
||
</div>
|
||
|
||
<div class="content"><div class='highlight'><pre> index_writer.wait_merging_threads()?;
|
||
|
||
<span class="hljs-literal">Ok</span>(())
|
||
}</pre></div></div>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</div>
|
||
</body>
|
||
</html>
|