mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-04 09:30:42 +00:00
Remove mut in add_document and delete_term. Made stamper ordering rel… (#551)
* Remove mut in add_document and delete_term. Made stamper ordering relaxed. * Made batch operations &mut self -> &self * Added example
This commit is contained in:
10
CHANGELOG.md
10
CHANGELOG.md
@@ -9,14 +9,20 @@ Tantivy 0.10.0
|
||||
Minor
|
||||
---------
|
||||
- Small simplification of the code.
|
||||
Calling .freq() or .doc() when .advance() has never
|
||||
Calling .freq() or .doc() when .advance() has never been called
|
||||
on segment postings should panic from now on.
|
||||
- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
|
||||
- Fast fields are now preloaded when the `SegmentReader` is created.
|
||||
- `IndexMeta` is now public. (@hntd187)
|
||||
- `IndexWriter` `add_document`, `delete_term`. `IndexWriter` is `Sync`, making it possible to use it with a `
|
||||
Arc<RwLock<IndexWriter>>`. `add_document` and `delete_term` can
|
||||
only require a read lock. (@pmasurel)
|
||||
- Introducing `Opstamp` as an expressive type alias for `u64`. (@petr-tik)
|
||||
- Stamper now relies on `AtomicU64` on all platforms (@petr-tik)
|
||||
|
||||
## How to update?
|
||||
|
||||
Your existing indexes are usable as is. Your may or may need some
|
||||
Your existing indexes are usable as is, but you may need some
|
||||
trivial updates.
|
||||
|
||||
### Fast fields
|
||||
|
||||
107
examples/multiple_producer.rs
Normal file
107
examples/multiple_producer.rs
Normal file
@@ -0,0 +1,107 @@
|
||||
// # Indexing from different threads.
|
||||
//
|
||||
// It is fairly common to have to index from different threads.
|
||||
// Tantivy forbids to create more than one `IndexWriter` at a time.
|
||||
//
|
||||
// This `IndexWriter` itself has its own multithreaded layer, so managing your own
|
||||
// indexing threads will not help. However, it can still be useful for some applications.
|
||||
//
|
||||
// For instance, if preparing documents to send to tantivy before indexing is the bottleneck of
|
||||
// your application, it is reasonable to have multiple threads.
|
||||
//
|
||||
// Another very common reason to want to index from multiple threads, is implementing a webserver
|
||||
// with CRUD capabilities. The server framework will most likely handle request from
|
||||
// different threads.
|
||||
//
|
||||
// The recommended way to address both of these use case is to wrap your `IndexWriter` into a
|
||||
// `Arc<RwLock<IndexWriter>>`.
|
||||
//
|
||||
// While this is counterintuitive, adding and deleting documents do not require mutability
|
||||
// over the `IndexWriter`, so several threads will be able to do this operation concurrently.
|
||||
//
|
||||
// The example below does not represent an actual real-life use case (who would spawn thread to
|
||||
// index a single document?), but aims at demonstrating the mechanism that makes indexing
|
||||
// from several threads possible.
|
||||
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tantivy::schema::{Schema, STORED, TEXT};
|
||||
use tantivy::Opstamp;
|
||||
use tantivy::{Index, IndexWriter};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field("title", TEXT | STORED);
|
||||
let body = schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let index_writer: Arc<RwLock<IndexWriter>> = Arc::new(RwLock::new(index.writer(50_000_000)?));
|
||||
|
||||
// # First indexing thread.
|
||||
let index_writer_clone_1 = index_writer.clone();
|
||||
thread::spawn(move || {
|
||||
// we index 100 times the document... for the sake of the example.
|
||||
for i in 0..100 {
|
||||
let opstamp = {
|
||||
// A read lock is sufficient here.
|
||||
let index_writer_rlock = index_writer_clone_1.read().unwrap();
|
||||
index_writer_rlock.add_document(
|
||||
doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
))
|
||||
};
|
||||
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(20));
|
||||
}
|
||||
});
|
||||
|
||||
// # Second indexing thread.
|
||||
let index_writer_clone_2 = index_writer.clone();
|
||||
// For convenience, tantivy also comes with a macro to
|
||||
// reduce the boilerplate above.
|
||||
thread::spawn(move || {
|
||||
// we index 100 times the document... for the sake of the example.
|
||||
for i in 0..100 {
|
||||
// A read lock is sufficient here.
|
||||
let opstamp = {
|
||||
let index_writer_rlock = index_writer_clone_2.read().unwrap();
|
||||
index_writer_rlock.add_document(doc!(
|
||||
title => "Manufacturing consent",
|
||||
body => "Some great book description..."
|
||||
))
|
||||
};
|
||||
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(10));
|
||||
}
|
||||
});
|
||||
|
||||
// # In the main thread, we commit 10 times, once every 500ms.
|
||||
for _ in 0..10 {
|
||||
let opstamp: Opstamp = {
|
||||
// Committing or rollbacking on the other hand requires write lock. This will block other threads.
|
||||
let mut index_writer_wlock = index_writer.write().unwrap();
|
||||
index_writer_wlock.commit().unwrap()
|
||||
};
|
||||
println!("committed with opstamp {}", opstamp);
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -48,14 +48,14 @@ impl RetryPolicy {
|
||||
///
|
||||
/// It is transparently associated to a lock file, that gets deleted
|
||||
/// on `Drop.` The lock is released automatically on `Drop`.
|
||||
pub struct DirectoryLock(Box<Drop + Send + 'static>);
|
||||
pub struct DirectoryLock(Box<Drop + Send + Sync + 'static>);
|
||||
|
||||
struct DirectoryLockGuard {
|
||||
directory: Box<Directory>,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl<T: Drop + Send + 'static> From<Box<T>> for DirectoryLock {
|
||||
impl<T: Drop + Send + Sync + 'static> From<Box<T>> for DirectoryLock {
|
||||
fn from(underlying: Box<T>) -> Self {
|
||||
DirectoryLock(underlying)
|
||||
}
|
||||
|
||||
@@ -618,7 +618,7 @@ impl IndexWriter {
|
||||
///
|
||||
/// Like adds, the deletion itself will be visible
|
||||
/// only after calling `commit()`.
|
||||
pub fn delete_term(&mut self, term: Term) -> Opstamp {
|
||||
pub fn delete_term(&self, term: Term) -> Opstamp {
|
||||
let opstamp = self.stamper.stamp();
|
||||
let delete_operation = DeleteOperation { opstamp, term };
|
||||
self.delete_queue.push(delete_operation);
|
||||
@@ -646,7 +646,7 @@ impl IndexWriter {
|
||||
///
|
||||
/// Currently it represents the number of documents that
|
||||
/// have been added since the creation of the index.
|
||||
pub fn add_document(&mut self, document: Document) -> Opstamp {
|
||||
pub fn add_document(&self, document: Document) -> Opstamp {
|
||||
let opstamp = self.stamper.stamp();
|
||||
let add_operation = AddOperation { opstamp, document };
|
||||
let send_result = self.operation_sender.send(vec![add_operation]);
|
||||
@@ -663,7 +663,7 @@ impl IndexWriter {
|
||||
/// The total number of stamps generated by this method is `count + 1`;
|
||||
/// each operation gets a stamp from the `stamps` iterator and `last_opstamp`
|
||||
/// is for the batch itself.
|
||||
fn get_batch_opstamps(&mut self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
|
||||
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
|
||||
let Range { start, end } = self.stamper.stamps(count + 1u64);
|
||||
let last_opstamp = end - 1;
|
||||
let stamps = Range {
|
||||
@@ -689,7 +689,7 @@ impl IndexWriter {
|
||||
/// Like adds and deletes (see `IndexWriter.add_document` and
|
||||
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
|
||||
/// visible to readers only after calling `commit()`.
|
||||
pub fn run(&mut self, user_operations: Vec<UserOperation>) -> Opstamp {
|
||||
pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
|
||||
let count = user_operations.len() as u64;
|
||||
if count == 0 {
|
||||
return self.stamper.stamp();
|
||||
@@ -740,7 +740,7 @@ mod tests {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let operations = vec![
|
||||
UserOperation::Add(doc!(text_field=>"a")),
|
||||
UserOperation::Add(doc!(text_field=>"b")),
|
||||
@@ -802,7 +802,7 @@ mod tests {
|
||||
fn test_empty_operations_group() {
|
||||
let schema_builder = schema::Schema::builder();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let index_writer = index.writer(3_000_000).unwrap();
|
||||
let operations1 = vec![];
|
||||
let batch_opstamp1 = index_writer.run(operations1);
|
||||
assert_eq!(batch_opstamp1, 0u64);
|
||||
|
||||
Reference in New Issue
Block a user