mirror of https://github.com/quickwit-oss/tantivy.git
synced 2025-12-31 06:22:54 +00:00

Compare commits

16 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | e14701e9cd |  |
|  | 45e62d4329 |  |
|  | 76d2b4dab6 |  |
|  | 04e9606638 |  |
|  | a5c57ebbd9 |  |
|  | 96eaa5bc63 |  |
|  | f1d30ab196 |  |
|  | 4507df9255 |  |
|  | e8625548b7 |  |
|  | 50ed6fb534 |  |
|  | 76609deadf |  |
|  | 749e62c40b |  |
|  | 259ce567d1 |  |
|  | 4c93b096eb |  |
|  | 6a547b0b5f |  |
|  | e99d1a2355 |  |
@@ -1,5 +1,7 @@
Tantivy 0.9.0
=====================
*The 0.9.0 index format is not compatible with the previous index format.*
- Removed most unsafe code (@fulmicoton)
- Indexer memory footprint improved (VInt compression, inlining of the first block) (@fulmicoton)
- Stemming in other languages is now possible (@pentlander)
@@ -39,10 +39,10 @@ futures = "0.1"
futures-cpupool = "0.1"
owning_ref = "0.4"
stable_deref_trait = "1.0.0"
rust-stemmers = "1"
downcast = { version="0.9" }
rust-stemmers = "1.1"
downcast-rs = { version="1.0" }
matches = "0.1"
bitpacking = "0.5"
bitpacking = "0.6"
census = "0.2"
fnv = "1.0.6"
owned-read = "0.4"
examples/integer_range_search.rs (new file, 41 lines)
@@ -0,0 +1,41 @@
// # Searching a range on an indexed int field.
//
// Below is an example of creating an indexed integer field in your schema.
// You can use RangeQuery to get a Count of all occurrences in a given range.

#[macro_use]
extern crate tantivy;
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INT_INDEXED};
use tantivy::Index;
use tantivy::Result;

fn run() -> Result<()> {
    // For the sake of simplicity, this schema will only have 1 field.
    let mut schema_builder = Schema::builder();
    // INT_INDEXED is a shorthand for an indexed integer field.
    let year_field = schema_builder.add_u64_field("year", INT_INDEXED);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    {
        let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
        for year in 1950u64..2019u64 {
            index_writer.add_document(doc!(year_field => year));
        }
        index_writer.commit()?;
        // The index now contains one document for each year in the range.
    }
    index.load_searchers()?;
    let searcher = index.searcher();
    // The end of the range is excluded, i.e. here we search years up to and including 1969.
    let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970);
    // Uses a Count collector to count the total number of docs in the range.
    let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
    assert_eq!(num_60s_books, 10);
    Ok(())
}

fn main() {
    run().unwrap()
}
@@ -85,7 +85,7 @@ See the `custom_collector` example.

*/

use downcast;
use downcast_rs;
use DocId;
use Result;
use Score;
@@ -111,9 +111,9 @@ pub use self::facet_collector::FacetCollector;

/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.
pub trait Fruit: Send + downcast::Any {}
pub trait Fruit: Send + downcast_rs::Downcast {}

impl<T> Fruit for T where T: Send + downcast::Any {}
impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}

/// Collectors are in charge of collecting and retaining relevant
/// information from the document found and scored by the query.
@@ -358,10 +358,7 @@ where
}
}

#[allow(missing_docs)]
mod downcast_impl {
downcast!(super::Fruit);
}
impl_downcast!(Fruit);

#[cfg(test)]
pub mod tests;
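The hunk above (and several hunks below) migrates from the `downcast` crate to `downcast-rs`. The following is a minimal, self-contained sketch of the `downcast-rs` 1.0 pattern the new code relies on; the `Fruit`/`CountFruit` definitions here are only illustrative, not the actual tantivy types:

```rust
// Illustrative sketch of the downcast-rs API used after this migration.
#[macro_use]
extern crate downcast_rs;

use downcast_rs::Downcast;

// The object-safe trait now extends `Downcast` instead of `downcast::Any`.
trait Fruit: Downcast {}
// Replaces the old `mod downcast_impl { downcast!(super::Fruit); }` block.
impl_downcast!(Fruit);

struct CountFruit(usize);
impl Fruit for CountFruit {}

fn main() {
    let boxed: Box<dyn Fruit> = Box::new(CountFruit(42));
    // `is::<T>()` replaces the old `Downcast::<T>::is_type(..)`.
    assert!(boxed.is::<CountFruit>());
    // `downcast::<T>()` returns `Result<Box<T>, Box<dyn Fruit>>`.
    let typed = boxed.downcast::<CountFruit>().map_err(|_| ()).unwrap();
    assert_eq!(typed.0, 42);
}
```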
@@ -1,7 +1,6 @@
use super::Collector;
use super::SegmentCollector;
use collector::Fruit;
use downcast::Downcast;
use std::marker::PhantomData;
use DocId;
use Result;
@@ -37,11 +36,11 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
let typed_fruit: Vec<TCollector::Fruit> = children
.into_iter()
.map(|untyped_fruit| {
Downcast::<TCollector::Fruit>::downcast(untyped_fruit)
untyped_fruit
.downcast::<TCollector::Fruit>()
.map(|boxed_but_typed| *boxed_but_typed)
.map_err(|e| {
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
TantivyError::InvalidArgument(err_msg)
.map_err(|_| {
TantivyError::InvalidArgument("Failed to cast child fruit.".to_string())
})
})
.collect::<Result<_>>()?;
@@ -89,7 +88,10 @@ pub struct FruitHandle<TFruit: Fruit> {
impl<TFruit: Fruit> FruitHandle<TFruit> {
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
*Downcast::<TFruit>::downcast(boxed_fruit).expect("Failed")
*boxed_fruit
.downcast::<TFruit>()
.map_err(|_| ())
.expect("Failed to downcast collector fruit.")
}
}

@@ -64,7 +64,7 @@ pub struct BitUnpacker<Data>
where
Data: Deref<Target = [u8]>,
{
num_bits: usize,
num_bits: u64,
mask: u64,
data: Data,
}
@@ -80,13 +80,13 @@ where
(1u64 << num_bits) - 1u64
};
BitUnpacker {
num_bits: num_bits as usize,
num_bits: num_bits as u64,
mask,
data,
}
}

pub fn get(&self, idx: usize) -> u64 {
pub fn get(&self, idx: u64) -> u64 {
if self.num_bits == 0 {
return 0u64;
}
@@ -97,10 +97,10 @@ where
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
debug_assert!(
addr + 8 <= data.len(),
addr + 8 <= data.len() as u64,
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[(addr as usize)..]);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
}
@@ -129,7 +129,7 @@ mod test {
fn test_bitpacker_util(len: usize, num_bits: u8) {
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i), *val);
assert_eq!(bitunpacker.get(i as u64), *val);
}
}

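For reference, the address arithmetic in `BitUnpacker::get` shown above boils down to the following self-contained sketch. It is a simplified reimplementation (hypothetical `unpack` helper, `u64::from_le_bytes` instead of `byteorder`), not the tantivy source:

```rust
use std::convert::TryInto;

/// Reads the `idx`-th value of a bit-packed byte slice in which every value
/// occupies exactly `num_bits` bits, mirroring the arithmetic of
/// `BitUnpacker::get` above. The data is assumed to be padded with 7 extra
/// bytes so the unaligned 8-byte read never goes out of bounds.
fn unpack(data: &[u8], num_bits: u64, idx: u64) -> u64 {
    if num_bits == 0 {
        return 0;
    }
    let mask = (1u64 << num_bits) - 1;
    let addr_in_bits = idx * num_bits; // first bit of the value
    let addr = (addr_in_bits >> 3) as usize; // byte containing that bit
    let bit_shift = addr_in_bits & 7; // offset of the bit inside that byte
    debug_assert!(addr + 8 <= data.len(), "data must be padded with 7 bytes");
    let word = u64::from_le_bytes(data[addr..addr + 8].try_into().unwrap());
    (word >> bit_shift) & mask
}

fn main() {
    // Three 5-bit values (7, 1, 30) packed LSB-first, followed by 7 bytes of padding.
    let mut data = vec![0u8; 2 + 7];
    data[0] = 0b0010_0111; // bits 0..5 = 7, bits 5..8 = low bits of 1
    data[1] = 0b0111_1000; // bits 8..10 = rest of 1, bits 10..15 = 30
    assert_eq!(unpack(&data, 5, 0), 7);
    assert_eq!(unpack(&data, 5, 1), 1);
    assert_eq!(unpack(&data, 5, 2), 30);
}
```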
@@ -39,7 +39,7 @@ impl BinarySerializable for FileAddr {
/// A `CompositeWrite` is used to write a `CompositeFile`.
pub struct CompositeWrite<W = WritePtr> {
write: CountingWriter<W>,
offsets: HashMap<FileAddr, usize>,
offsets: HashMap<FileAddr, u64>,
}

impl<W: Write> CompositeWrite<W> {
@@ -3,7 +3,7 @@ use std::io::Write;

pub struct CountingWriter<W> {
underlying: W,
written_bytes: usize,
written_bytes: u64,
}

impl<W: Write> CountingWriter<W> {
@@ -14,11 +14,11 @@ impl<W: Write> CountingWriter<W> {
}
}

pub fn written_bytes(&self) -> usize {
pub fn written_bytes(&self) -> u64 {
self.written_bytes
}

pub fn finish(mut self) -> io::Result<(W, usize)> {
pub fn finish(mut self) -> io::Result<(W, u64)> {
self.flush()?;
Ok((self.underlying, self.written_bytes))
}
@@ -27,10 +27,16 @@ impl<W: Write> CountingWriter<W> {
impl<W: Write> Write for CountingWriter<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
let written_size = self.underlying.write(buf)?;
self.written_bytes += written_size;
self.written_bytes += written_size as u64;
Ok(written_size)
}

fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
self.underlying.write_all(buf)?;
self.written_bytes += buf.len() as u64;
Ok(())
}

fn flush(&mut self) -> io::Result<()> {
self.underlying.flush()
}
@@ -48,8 +54,8 @@ mod test {
let mut counting_writer = CountingWriter::wrap(buffer);
let bytes = (0u8..10u8).collect::<Vec<u8>>();
counting_writer.write_all(&bytes).unwrap();
let (w, len): (Vec<u8>, usize) = counting_writer.finish().unwrap();
assert_eq!(len, 10);
let (w, len): (Vec<u8>, u64) = counting_writer.finish().unwrap();
assert_eq!(len, 10u64);
assert_eq!(w.len(), 10);
}
}

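To see the end state of the hunk above in one piece, here is a consolidated sketch of a counting writer with `u64` byte accounting (simplified and self-contained, not the exact tantivy source):

```rust
use std::io::{self, Write};

/// Wraps a writer and counts the number of bytes written as a `u64`.
struct CountingWriter<W> {
    underlying: W,
    written_bytes: u64,
}

impl<W: Write> CountingWriter<W> {
    fn wrap(underlying: W) -> CountingWriter<W> {
        CountingWriter { underlying, written_bytes: 0 }
    }

    fn written_bytes(&self) -> u64 {
        self.written_bytes
    }

    fn finish(mut self) -> io::Result<(W, u64)> {
        self.flush()?;
        Ok((self.underlying, self.written_bytes))
    }
}

impl<W: Write> Write for CountingWriter<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let written_size = self.underlying.write(buf)?;
        self.written_bytes += written_size as u64;
        Ok(written_size)
    }

    // Overriding `write_all` forwards directly to the underlying writer and
    // accounts for all the bytes at once, as in the hunk above.
    fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
        self.underlying.write_all(buf)?;
        self.written_bytes += buf.len() as u64;
        Ok(())
    }

    fn flush(&mut self) -> io::Result<()> {
        self.underlying.flush()
    }
}

fn main() -> io::Result<()> {
    let mut writer = CountingWriter::wrap(Vec::new());
    writer.write_all(&(0u8..10u8).collect::<Vec<u8>>())?;
    assert_eq!(writer.written_bytes(), 10u64);
    let (buffer, len): (Vec<u8>, u64) = writer.finish()?;
    assert_eq!(len, 10u64);
    assert_eq!(buffer.len(), 10);
    Ok(())
}
```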
@@ -111,7 +111,6 @@ impl Index {
}

/// Opens or creates a new index in the provided directory
#[cfg(feature = "mmap")]
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
if Index::exists(&dir) {
let index = Index::open(dir)?;
@@ -1,5 +1,4 @@
use crossbeam::queue::MsQueue;
use std::mem;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
@@ -10,6 +9,12 @@ pub struct GenerationItem<T> {
item: T,
}

/// An object pool.
///
/// This is used in tantivy to create a pool of `Searcher`.
/// Objects are wrapped in a `LeasedItem` wrapper and are
/// released automatically back into the pool on `Drop`.
pub struct Pool<T> {
queue: Arc<MsQueue<GenerationItem<T>>>,
freshest_generation: AtomicUsize,
@@ -26,6 +31,10 @@ impl<T> Pool<T> {
}
}

/// Publishes a new generation of `Searcher`.
///
/// After publish, all newly acquired `Searcher`s will be
/// of the new generation.
pub fn publish_new_generation(&self, items: Vec<T>) {
let next_generation = self.next_generation.fetch_add(1, Ordering::SeqCst) + 1;
for item in items {
@@ -61,6 +70,10 @@ impl<T> Pool<T> {
self.freshest_generation.load(Ordering::Acquire)
}

/// Acquires a new searcher.
///
/// If no searcher is available, this method blocks until
/// a searcher is released.
pub fn acquire(&self) -> LeasedItem<T> {
let generation = self.generation();
loop {
@@ -107,9 +120,9 @@ impl<T> DerefMut for LeasedItem<T> {

impl<T> Drop for LeasedItem<T> {
fn drop(&mut self) {
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
.expect("Unwrapping a leased item should never fail");
self.recycle_queue.push(gen_item);
if let Some(gen_item) = self.gen_item.take() {
self.recycle_queue.push(gen_item);
}
}
}

@@ -39,7 +39,7 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
let (start, stop) = self.range(doc);
let len = (stop - start) as usize;
vals.resize(len, Item::default());
self.vals_reader.get_range(start as u32, &mut vals[..]);
self.vals_reader.get_range_u64(start, &mut vals[..]);
}
}

@@ -59,7 +59,29 @@ impl<Item: FastValue> FastFieldReader<Item> {
/// May panic if `doc` is greater than the segment
// `maxdoc`.
pub fn get(&self, doc: DocId) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc as usize))
self.get_u64(doc as u64)
}

pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
}

/// Internally, `multivalued` fast fields also use single-value fast fields.
/// It works as follows: a first column contains the list of start indexes
/// for each document, and a second column contains the actual values.
///
/// The values associated with a given doc are then
/// `second_column[first_column.get(doc)..first_column.get(doc+1)]`.
///
/// This means the single-value fast field reader can be indexed internally with
/// something other than a `DocId`. For this use case, we want to use `u64`
/// values.
///
/// See `get_range` for the actual documentation of this method.
pub(crate) fn get_range_u64(&self, start: u64, output: &mut [Item]) {
for (i, out) in output.iter_mut().enumerate() {
*out = self.get_u64(start + (i as u64));
}
}

/// Fills an output buffer with the fast field values
@@ -75,13 +97,8 @@ impl<Item: FastValue> FastFieldReader<Item> {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
///
// TODO change start to `u64`.
// For multifastfield, start is an index in a second fastfield, not a `DocId`
pub fn get_range(&self, start: u32, output: &mut [Item]) {
for (i, out) in output.iter_mut().enumerate() {
*out = self.get(start + i as u32);
}
pub fn get_range(&self, start: DocId, output: &mut [Item]) {
self.get_range_u64(start as u64, output);
}

/// Returns the minimum value for this fast field.
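The doc comment above describes the two-column layout used by multivalued fast fields. A small standalone sketch of that layout with plain vectors (the `starts`/`values` names are hypothetical, not tantivy types):

```rust
fn vals_for_doc<'a>(starts: &[u64], values: &'a [u64], doc: usize) -> &'a [u64] {
    // Values of doc `d` are values[starts[d]..starts[d + 1]], as described above.
    &values[starts[doc] as usize..starts[doc + 1] as usize]
}

fn main() {
    // First column: one entry per document plus a final sentinel, giving the
    // start of each document's values in the second column.
    let starts: Vec<u64> = vec![0, 2, 2, 5];
    // Second column: the values themselves, concatenated document after document.
    let values: Vec<u64> = vec![10, 11, 20, 21, 22];

    assert_eq!(vals_for_doc(&starts, &values, 0), &[10, 11]); // doc 0 has two values
    assert_eq!(vals_for_doc(&starts, &values, 1), &[] as &[u64]); // doc 1 has none
    assert_eq!(vals_for_doc(&starts, &values, 2), &[20, 21, 22]); // doc 2 has three
}
```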
@@ -1,4 +1,4 @@
use super::operation::AddOperation;
use super::operation::{AddOperation, UserOperation};
use super::segment_updater::SegmentUpdater;
use super::PreparedCommit;
use bit_set::BitSet;
@@ -26,6 +26,7 @@ use schema::Document;
use schema::IndexRecordOption;
use schema::Term;
use std::mem;
use std::ops::Range;
use std::sync::Arc;
use std::thread;
use std::thread::JoinHandle;
@@ -43,8 +44,8 @@ pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;

type DocumentSender = channel::Sender<AddOperation>;
type DocumentReceiver = channel::Receiver<AddOperation>;
type DocumentSender = channel::Sender<Vec<AddOperation>>;
type DocumentReceiver = channel::Receiver<Vec<AddOperation>>;

/// Split the thread memory budget into
/// - the heap size
@@ -258,7 +259,7 @@ pub fn advance_deletes(
write_delete_bitset(&delete_bitset, &mut delete_file)?;
}
}
segment_entry.set_meta((*segment.meta()).clone());
segment_entry.set_meta(segment.meta().clone());
Ok(())
}

@@ -266,7 +267,7 @@ fn index_documents(
memory_budget: usize,
segment: &Segment,
generation: usize,
document_iterator: &mut Iterator<Item = AddOperation>,
document_iterator: &mut Iterator<Item = Vec<AddOperation>>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> Result<bool> {
@@ -274,11 +275,11 @@ fn index_documents(
let segment_id = segment.id();
let table_size = initial_table_size(memory_budget);
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
for doc in document_iterator {
segment_writer.add_document(doc, &schema)?;

for documents in document_iterator {
for doc in documents {
segment_writer.add_document(doc, &schema)?;
}
let mem_usage = segment_writer.mem_usage();

if mem_usage >= memory_budget - MARGIN_IN_BYTES {
info!(
"Buffer limit reached, flushing segment with maxdoc={}.",
@@ -409,8 +410,12 @@ impl IndexWriter {
// this is a valid guarantee as the
// peeked document now belongs to
// our local iterator.
if let Some(operation) = document_iterator.peek() {
delete_cursor.skip_to(operation.opstamp);
if let Some(operations) = document_iterator.peek() {
if let Some(first) = operations.first() {
delete_cursor.skip_to(first.opstamp);
} else {
return Ok(());
}
} else {
// No more documents.
// Happens when there is a commit, or if the `IndexWriter`
@@ -643,25 +648,168 @@ impl IndexWriter {
pub fn add_document(&mut self, document: Document) -> u64 {
let opstamp = self.stamper.stamp();
let add_operation = AddOperation { opstamp, document };
let send_result = self.document_sender.send(add_operation);
let send_result = self.document_sender.send(vec![add_operation]);
if let Err(e) = send_result {
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
}
opstamp
}

/// Gets a range of stamps from the stamper and "pops" the last stamp
/// from the range, returning a tuple of the last opstamp and the popped
/// range.
///
/// The total number of stamps generated by this method is `count + 1`;
/// each operation gets a stamp from the `stamps` iterator and `last_opstamp`
/// is for the batch itself.
fn get_batch_opstamps(&mut self, count: u64) -> (u64, Range<u64>) {
let Range { start, end } = self.stamper.stamps(count + 1u64);
let last_opstamp = end - 1;
let stamps = Range {
start: start,
end: last_opstamp,
};
(last_opstamp, stamps)
}

/// Runs a group of document operations, ensuring that the operations are
/// assigned contiguous u64 opstamps and that add operations of the same
/// group are flushed into the same segment.
///
/// If the indexing pipeline is full, this call may block.
///
/// Each operation of the given `user_operations` will receive an in-order,
/// contiguous u64 opstamp. The entire batch itself is also given an
/// opstamp that is 1 greater than the last given operation. This
/// `batch_opstamp` is the return value of `run`. An empty group of
/// `user_operations`, an empty `Vec<UserOperation>`, still receives
/// a valid opstamp even though no changes were _actually_ made to the index.
///
/// Like adds and deletes (see `IndexWriter.add_document` and
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
/// visible to readers only after calling `commit()`.
pub fn run(&mut self, user_operations: Vec<UserOperation>) -> u64 {
let count = user_operations.len() as u64;
if count == 0 {
return self.stamper.stamp();
}
let (batch_opstamp, stamps) = self.get_batch_opstamps(count);

let mut adds: Vec<AddOperation> = Vec::new();

for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
match user_op {
UserOperation::Delete(term) => {
let delete_operation = DeleteOperation {
opstamp: opstamp,
term: term,
};
self.delete_queue.push(delete_operation);
}
UserOperation::Add(doc) => {
let add_operation = AddOperation {
opstamp: opstamp,
document: doc,
};
adds.push(add_operation);
}
}
}
let send_result = self.document_sender.send(adds);
if let Err(e) = send_result {
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
};

batch_opstamp
}
}

#[cfg(test)]
mod tests {

use super::super::operation::UserOperation;
use super::initial_table_size;
use directory::error::LockError;
use error::*;
use indexer::NoMergePolicy;
use schema::{self, Document};
use schema::{self, Document, IndexRecordOption};
use query::{TermQuery};
use collector::TopDocs;
use Index;
use Term;

#[test]
fn test_operations_group() {
// an operations group with 2 items should cause 3 opstamps: 0, 1, and 2.
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let operations = vec![
UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")),
];
let batch_opstamp1 = index_writer.run(operations);
assert_eq!(batch_opstamp1, 2u64);
}

#[test]
fn test_ordered_batched_operations() {
// * one delete for `doc!(field=>"a")`
// * one add for `doc!(field=>"a")`
// * one add for `doc!(field=>"b")`
// * one delete for `doc!(field=>"b")`
// after commit there is one doc with "a" and 0 docs with "b"
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b");
let operations = vec![
UserOperation::Delete(a_term),
UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")),
UserOperation::Delete(b_term),
];

index_writer.run(operations);
index_writer.commit().expect("failed to commit");
index.load_searchers().expect("failed to load searchers");

let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b");

let a_query = TermQuery::new(a_term, IndexRecordOption::Basic);
let b_query = TermQuery::new(b_term, IndexRecordOption::Basic);

let searcher = index.searcher();

let a_docs = searcher
.search(&a_query, &TopDocs::with_limit(1))
.expect("search for a failed");

let b_docs = searcher
.search(&b_query, &TopDocs::with_limit(1))
.expect("search for b failed");

assert_eq!(a_docs.len(), 1);
assert_eq!(b_docs.len(), 0);
}

#[test]
fn test_empty_operations_group() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer(3_000_000).unwrap();
let operations1 = vec![];
let batch_opstamp1 = index_writer.run(operations1);
assert_eq!(batch_opstamp1, 0u64);
let operations2 = vec![];
let batch_opstamp2 = index_writer.run(operations2);
assert_eq!(batch_opstamp2, 1u64);
}

#[test]
fn test_lockfile_stops_duplicates() {
let schema_builder = schema::Schema::builder();
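The doc comments above spell out the opstamp bookkeeping of `run`: a batch of `count` operations consumes `count + 1` stamps, the first `count` go to the operations in order, and the popped last one becomes the `batch_opstamp` returned to the caller. A tiny worked sketch of that split, using plain arithmetic rather than the tantivy types:

```rust
fn main() {
    // Suppose the stamper currently sits at 0 and we run a batch of 4 operations.
    let count: u64 = 4;
    let start: u64 = 0;

    // `stamps(count + 1)` hands out the half-open range start..start + count + 1.
    let end = start + count + 1; // 5
    let last_opstamp = end - 1; // 4: the stamp "popped" for the batch itself
    let op_stamps: Vec<u64> = (start..last_opstamp).collect(); // 0, 1, 2, 3

    assert_eq!(op_stamps, vec![0, 1, 2, 3]);
    assert_eq!(last_opstamp, 4);
    // This matches test_operations_group above: 2 operations yield opstamps 0 and 1,
    // and the batch opstamp returned by `run` is 2.
}
```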
@@ -14,3 +14,10 @@ pub struct AddOperation {
pub opstamp: u64,
pub document: Document,
}

/// UserOperation is an enum type that encapsulates other operation types.
#[derive(Eq, PartialEq, Debug)]
pub enum UserOperation {
Add(Document),
Delete(Term),
}

@@ -1,3 +1,4 @@
use std::ops::Range;
use std::sync::atomic::Ordering;
use std::sync::Arc;

@@ -60,6 +61,16 @@ impl Stamper {
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
}

/// Given a desired count `n`, `stamps` returns an iterator that
/// will supply `n` u64 stamps.
pub fn stamps(&self, n: u64) -> Range<u64> {
let start = self.0.fetch_add(n, Ordering::SeqCst);
Range {
start: start,
end: start + n,
}
}
}

#[cfg(test)]
@@ -78,5 +89,7 @@ mod test {

assert_eq!(stamper.stamp(), 10u64);
assert_eq!(stamper_clone.stamp(), 11u64);
assert_eq!(stamper.stamps(3u64), (12..15));
assert_eq!(stamper.stamp(), 15u64);
}
}

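The hunk above extends the `Stamper` with a `stamps(n)` method that atomically reserves a whole range of opstamps. A minimal standalone sketch of that idea using `AtomicU64` (simplified; the real type also supports cloning and shared ownership):

```rust
use std::ops::Range;
use std::sync::atomic::{AtomicU64, Ordering};

struct Stamper(AtomicU64);

impl Stamper {
    fn new(first_opstamp: u64) -> Stamper {
        Stamper(AtomicU64::new(first_opstamp))
    }

    /// Reserves a single stamp.
    fn stamp(&self) -> u64 {
        self.0.fetch_add(1, Ordering::SeqCst)
    }

    /// Reserves `n` consecutive stamps and returns them as a half-open range.
    fn stamps(&self, n: u64) -> Range<u64> {
        let start = self.0.fetch_add(n, Ordering::SeqCst);
        start..start + n
    }
}

fn main() {
    let stamper = Stamper::new(10);
    assert_eq!(stamper.stamp(), 10);
    assert_eq!(stamper.stamps(3), 11..14);
    assert_eq!(stamper.stamp(), 14);
}
```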
@@ -168,7 +168,7 @@ extern crate maplit;
extern crate test;

#[macro_use]
extern crate downcast;
extern crate downcast_rs;

#[macro_use]
extern crate fail;

@@ -34,10 +34,6 @@ const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
const LONG_SKIP_IN_BLOCKS: usize = 1_024;
const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64;

lazy_static! {
static ref BIT_PACKER: BitPacker4x = BitPacker4x::new();
}

#[cfg(test)]
pub mod tests {

@@ -1,4 +1,23 @@
use super::BIT_PACKER;
/// Positions works as a long sequence of compressed blocks.
/// All terms are chained one after the other.
///
/// When accessing the positions of a term, we get a positions_idx from the `Terminfo`.
/// This means we need to skip to the `nth` position efficiently.
///
/// This is done thanks to two levels of skipping that we refer to in the code
/// as `long_skip` and `short_skip`.
///
/// The `long_skip` makes it possible to skip every 1_024 compression blocks (= 131_072 positions).
/// Skipping offsets are simply stored one after the other, each as an offset over 8 bytes.
///
/// We find the number of long skips as `n / long_skip`.
///
/// Blocks are compressed using bitpacking, so `skip_read` contains the number of bits
/// (values can go from 0 bit to 32 bits) required to decompress every block.
///
/// A given block therefore takes `(128 x num_bit_for_the_block / num_bits_in_a_byte)` bytes,
/// so skipping a block without decompressing it is just a matter of advancing that many
/// bytes.
use bitpacking::{BitPacker, BitPacker4x};
use common::{BinarySerializable, FixedSize};
use directory::ReadOnlySource;
@@ -8,9 +27,65 @@ use positions::LONG_SKIP_INTERVAL;
use positions::LONG_SKIP_IN_BLOCKS;
use postings::compression::compressed_block_size;

struct Positions {
bit_packer: BitPacker4x,
skip_source: ReadOnlySource,
position_source: ReadOnlySource,
long_skip_source: ReadOnlySource,
}

impl Positions {
pub fn new(position_source: ReadOnlySource, skip_source: ReadOnlySource) -> Positions {
let skip_len = skip_source.len();
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
let (skip_source, long_skip_source) = body.split(body_split);
Positions {
bit_packer: BitPacker4x::new(),
skip_source,
long_skip_source,
position_source,
}
}

/// Returns the offset of the block associated to the given `long_skip_id`.
///
/// One `long_skip_id` means `LONG_SKIP_IN_BLOCKS` blocks.
fn long_skip(&self, long_skip_id: usize) -> u64 {
if long_skip_id == 0 {
return 0;
}
let long_skip_slice = self.long_skip_source.as_slice();
let mut long_skip_blocks: &[u8] = &long_skip_slice[(long_skip_id - 1) * 8..][..8];
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted")
}

fn reader(&self, offset: u64) -> PositionReader {
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
let small_skip = (offset % LONG_SKIP_INTERVAL) as usize;
let offset_num_bytes: u64 = self.long_skip(long_skip_id);
let mut position_read = OwnedRead::new(self.position_source.clone());
position_read.advance(offset_num_bytes as usize);
let mut skip_read = OwnedRead::new(self.skip_source.clone());
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
let mut position_reader = PositionReader {
bit_packer: self.bit_packer,
skip_read,
position_read,
inner_offset: 0,
buffer: Box::new([0u32; 128]),
ahead: None,
};
position_reader.skip(small_skip);
position_reader
}
}

pub struct PositionReader {
skip_read: OwnedRead,
position_read: OwnedRead,
bit_packer: BitPacker4x,
inner_offset: usize,
buffer: Box<[u32; 128]>,
ahead: Option<usize>, // if None, no block is loaded.
@@ -27,6 +102,7 @@ pub struct PositionReader {
// If the requested number of els ends exactly at a given block, the next
// block is not decompressed.
fn read_impl(
bit_packer: BitPacker4x,
mut position: &[u8],
buffer: &mut [u32; 128],
mut inner_offset: usize,
@@ -37,21 +113,23 @@ fn read_impl(
let mut output_len = output.len();
let mut ahead = 0;
loop {
let available_len = 128 - inner_offset;
let available_len = COMPRESSION_BLOCK_SIZE - inner_offset;
// We have enough elements in the current block.
// Let's copy the requested elements in the output buffer,
// and return.
if output_len <= available_len {
output[output_start..].copy_from_slice(&buffer[inner_offset..][..output_len]);
return ahead;
} else {
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
output_len -= available_len;
output_start += available_len;
inner_offset = 0;
let num_bits = num_bits[ahead];
BitPacker4x::new().decompress(position, &mut buffer[..], num_bits);
let block_len = compressed_block_size(num_bits);
position = &position[block_len..];
ahead += 1;
}
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
output_len -= available_len;
output_start += available_len;
inner_offset = 0;
let num_bits = num_bits[ahead];
bit_packer.decompress(position, &mut buffer[..], num_bits);
let block_len = compressed_block_size(num_bits);
position = &position[block_len..];
ahead += 1;
}
}

@@ -61,35 +139,7 @@ impl PositionReader {
skip_source: ReadOnlySource,
offset: u64,
) -> PositionReader {
let skip_len = skip_source.len();
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
let (skip_body, long_skips) = body.split(body_split);
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
let small_skip = (offset - (long_skip_id as u64) * (LONG_SKIP_INTERVAL as u64)) as usize;
let offset_num_bytes: u64 = {
if long_skip_id > 0 {
let mut long_skip_blocks: &[u8] =
&long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") * 16
} else {
0
}
};
let mut position_read = OwnedRead::new(position_source);
position_read.advance(offset_num_bytes as usize);
let mut skip_read = OwnedRead::new(skip_body);
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
let mut position_reader = PositionReader {
skip_read,
position_read,
inner_offset: 0,
buffer: Box::new([0u32; 128]),
ahead: None,
};
position_reader.skip(small_skip);
position_reader
Positions::new(position_source, skip_source).reader(offset)
}

/// Fills a buffer with the next `output.len()` integers.
@@ -101,10 +151,12 @@ impl PositionReader {
if self.ahead != Some(0) {
// the block currently available is not the block
// for the current position
BIT_PACKER.decompress(position_data, self.buffer.as_mut(), num_bits);
self.bit_packer.decompress(position_data, self.buffer.as_mut(), num_bits);
self.ahead = Some(0);
}
let block_len = compressed_block_size(num_bits);
self.ahead = Some(read_impl(
self.bit_packer,
&position_data[block_len..],
self.buffer.as_mut(),
self.inner_offset,
@@ -133,14 +185,13 @@ impl PositionReader {
}
});

let skip_len = self.skip_read.as_ref()[..num_blocks_to_advance]
let skip_len_in_bits = self.skip_read.as_ref()[..num_blocks_to_advance]
.iter()
.cloned()
.map(|num_bit| num_bit as usize)
.map(|num_bits| *num_bits as usize)
.sum::<usize>()
* (COMPRESSION_BLOCK_SIZE / 8);

* COMPRESSION_BLOCK_SIZE;
let skip_len_in_bytes = skip_len_in_bits / 8;
self.skip_read.advance(num_blocks_to_advance);
self.position_read.advance(skip_len);
self.position_read.advance(skip_len_in_bytes);
}
}

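The module documentation above describes the two-level skip scheme. The offset arithmetic in `Positions::reader` and `long_skip` reduces to the following sketch, using hypothetical constants and a plain byte slice instead of `ReadOnlySource` (little-endian 8-byte entries are assumed here for the sketch):

```rust
use std::convert::TryInto;

// Mirrors the constants above: 1_024 blocks of 128 positions per long skip.
const COMPRESSION_BLOCK_SIZE: usize = 128;
const LONG_SKIP_IN_BLOCKS: usize = 1_024;
const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64; // 131_072

/// Returns the byte offset recorded for `long_skip_id` (one 8-byte entry per
/// LONG_SKIP_IN_BLOCKS blocks; id 0 always starts at offset 0).
fn long_skip(long_skip_source: &[u8], long_skip_id: usize) -> u64 {
    if long_skip_id == 0 {
        return 0;
    }
    let entry = &long_skip_source[(long_skip_id - 1) * 8..][..8];
    u64::from_le_bytes(entry.try_into().unwrap())
}

fn main() {
    // Two long-skip entries: after the first 1_024 blocks we are at byte 4_000,
    // after the next 1_024 blocks at byte 9_000 (made-up numbers).
    let mut long_skip_source = Vec::new();
    long_skip_source.extend_from_slice(&4_000u64.to_le_bytes());
    long_skip_source.extend_from_slice(&9_000u64.to_le_bytes());

    // To position on the 200_000th position:
    let offset: u64 = 200_000;
    let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize; // 1
    let small_skip = (offset % LONG_SKIP_INTERVAL) as usize;   // positions left to skip block by block
    assert_eq!(long_skip_id, 1);
    assert_eq!(small_skip, 68_928);
    // Jump the compressed stream to the recorded byte offset, then `skip`
    // the remaining `small_skip` positions.
    assert_eq!(long_skip(&long_skip_source, long_skip_id), 4_000);
}
```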
@@ -1,29 +1,30 @@
use super::BIT_PACKER;
use bitpacking::BitPacker;
use common::BinarySerializable;
use common::CountingWriter;
use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
use std::io;
use std::io::{self, Write};
use bitpacking::BitPacker4x;

pub struct PositionSerializer<W: io::Write> {
write_stream: W,
bit_packer: BitPacker4x,
write_stream: CountingWriter<W>,
write_skiplist: W,
block: Vec<u32>,
buffer: Vec<u8>,
num_ints: u64,
long_skips: Vec<u64>,
cumulated_num_bits: u64,
}

impl<W: io::Write> PositionSerializer<W> {
pub fn new(write_stream: W, write_skiplist: W) -> PositionSerializer<W> {
PositionSerializer {
write_stream,
bit_packer: BitPacker4x::new(),
write_stream: CountingWriter::wrap(write_stream),
write_skiplist,
block: Vec::with_capacity(128),
buffer: vec![0u8; 128 * 4],
num_ints: 0u64,
long_skips: Vec::new(),
cumulated_num_bits: 0u64,
}
}

@@ -50,14 +51,13 @@ impl<W: io::Write> PositionSerializer<W> {
}

fn flush_block(&mut self) -> io::Result<()> {
let num_bits = BIT_PACKER.num_bits(&self.block[..]);
self.cumulated_num_bits += u64::from(num_bits);
let num_bits = self.bit_packer.num_bits(&self.block[..]);
self.write_skiplist.write_all(&[num_bits])?;
let written_len = BIT_PACKER.compress(&self.block[..], &mut self.buffer, num_bits);
let written_len = self.bit_packer.compress(&self.block[..], &mut self.buffer, num_bits);
self.write_stream.write_all(&self.buffer[..written_len])?;
self.block.clear();
if (self.num_ints % LONG_SKIP_INTERVAL) == 0u64 {
self.long_skips.push(self.cumulated_num_bits);
self.long_skips.push(self.write_stream.written_bytes());
}
Ok(())
}

@@ -123,12 +123,17 @@ impl SegmentPostings {
}
}

fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
fn linear_search(arr: &[u32], target: u32) -> usize {
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}

fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
let end = arr.len();
debug_assert!(arr.len() <= 128);
debug_assert!(target <= arr[end - 1]);
let mut begin = 0;
for &pivot in [1,3,7,15,31,63].iter().take_while(|&&el| el < end) {
for &pivot in &[1, 3, 7, 15, 31, 63] {
if pivot >= end {
break;
}
if arr[pivot] > target {
return (begin, pivot);
}
@@ -145,12 +150,8 @@ fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
/// The target is assumed greater or equal to the first element.
/// The target is assumed smaller or equal to the last element.
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
let (start, end) = exponential_search(target, block_docs);
start.wrapping_add(
block_docs[start..end]
.binary_search(&target)
.unwrap_or_else(|e| e),
)
let (start, end) = exponential_search(block_docs, target);
start + linear_search(&block_docs[start..end], target)
}

impl DocSet for SegmentPostings {
@@ -617,6 +618,7 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
mod tests {

use super::exponential_search;
use super::linear_search;
use super::search_within_block;
use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult;
@@ -632,6 +634,21 @@ mod tests {
use DocId;
use SkipResult;

#[test]
fn test_linear_search() {
let len: usize = 50;
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
for target in 1..*arr.last().unwrap() {
let res = linear_search(&arr[..], target);
if res > 0 {
assert!(arr[res - 1] < target);
}
if res < len {
assert!(arr[res] >= target);
}
}
}

#[test]
fn test_empty_segment_postings() {
let mut postings = SegmentPostings::empty();
@@ -660,10 +677,10 @@ mod tests {

#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
assert_eq!(
exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
(3, 7)
);
}

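The hunk above replaces the in-block binary search with an exponential probe followed by a branchless linear count. A condensed, standalone sketch of the same two-step search over one sorted block of at most 128 doc ids; the tail of the probe loop (the `begin` update and final fallback) is not shown in the hunk and is filled in here following the usual galloping pattern:

```rust
/// Counts how many elements of `arr` are strictly smaller than `target`,
/// i.e. (for a sorted slice) the index of the first element >= target.
fn linear_search(arr: &[u32], target: u32) -> usize {
    arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}

/// Narrows the search window with pivots 1, 3, 7, ... before the linear scan.
/// Assumes `target <= *arr.last().unwrap()` and distinct, sorted elements.
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
    let end = arr.len();
    let mut begin = 0;
    for &pivot in &[1, 3, 7, 15, 31, 63] {
        if pivot >= end {
            break;
        }
        if arr[pivot] > target {
            return (begin, pivot);
        }
        begin = pivot;
    }
    (begin, end)
}

fn search_within_block(block_docs: &[u32], target: u32) -> usize {
    let (start, end) = exponential_search(block_docs, target);
    start + linear_search(&block_docs[start..end], target)
}

fn main() {
    let block: Vec<u32> = (0..128).map(|i| i * 3).collect(); // 0, 3, 6, ...
    assert_eq!(search_within_block(&block, 0), 0);
    assert_eq!(search_within_block(&block, 7), 3); // first doc >= 7 is 9, at index 3
    assert_eq!(search_within_block(&block, 9), 3); // exact hit
    assert_eq!(search_within_block(&block, 381), 127);
}
```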
@@ -1,5 +1,4 @@
use core::SegmentReader;
use downcast::Downcast;
use query::intersect_scorers;
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
use query::term_query::TermScorer;
@@ -10,7 +9,6 @@ use query::RequiredOptionalScorer;
use query::Scorer;
use query::Union;
use query::Weight;
use std::borrow::Borrow;
use std::collections::HashMap;
use Result;

@@ -24,14 +22,11 @@ where
}

{
let is_all_term_queries = scorers.iter().all(|scorer| {
let scorer_ref: &Scorer = scorer.borrow();
Downcast::<TermScorer>::is_type(scorer_ref)
});
let is_all_term_queries = scorers.iter().all(|scorer| scorer.is::<TermScorer>());
if is_all_term_queries {
let scorers: Vec<TermScorer> = scorers
.into_iter()
.map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
.map(|scorer| *(scorer.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
.collect();
let scorer: Box<Scorer> = Box::new(Union::<TermScorer, TScoreCombiner>::from(scorers));
return scorer;

@@ -8,7 +8,6 @@ mod tests {

use super::*;
use collector::tests::TestCollector;
use downcast::Downcast;
use query::score_combiner::SumWithCoordsCombiner;
use query::term_query::TermScorer;
use query::Intersection;
@@ -72,7 +71,7 @@ mod tests {
let searcher = index.searcher();
let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(Downcast::<TermScorer>::is_type(&*scorer));
assert!(scorer.is::<TermScorer>());
}

#[test]
@@ -84,13 +83,13 @@ mod tests {
let query = query_parser.parse_query("+a +b +c").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(Downcast::<Intersection<TermScorer>>::is_type(&*scorer));
assert!(scorer.is::<Intersection<TermScorer>>());
}
{
let query = query_parser.parse_query("+a +(b c)").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(Downcast::<Intersection<Box<Scorer>>>::is_type(&*scorer));
assert!(scorer.is::<Intersection<Box<Scorer>>>());
}
}

@@ -103,16 +102,14 @@ mod tests {
let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(Downcast::<
RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>,
>::is_type(&*scorer));
assert!(scorer
.is::<RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>>());
}
{
let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, false).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
println!("{:?}", scorer.type_name());
assert!(Downcast::<TermScorer>::is_type(&*scorer));
assert!(scorer.is::<TermScorer>());
}
}

@@ -1,9 +1,7 @@
use docset::{DocSet, SkipResult};
use downcast::Downcast;
use query::term_query::TermScorer;
use query::EmptyScorer;
use query::Scorer;
use std::borrow::Borrow;
use DocId;
use Score;

@@ -26,13 +24,12 @@ pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
(Some(single_docset), None) => single_docset,
(Some(left), Some(right)) => {
{
let all_term_scorers = [&left, &right].iter().all(|&scorer| {
let scorer_ref: &Scorer = <Box<Scorer> as Borrow<Scorer>>::borrow(scorer);
Downcast::<TermScorer>::is_type(scorer_ref)
});
let all_term_scorers = [&left, &right]
.iter()
.all(|&scorer| scorer.is::<TermScorer>());
if all_term_scorers {
let left = *Downcast::<TermScorer>::downcast(left).unwrap();
let right = *Downcast::<TermScorer>::downcast(right).unwrap();
let left = *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap());
let right = *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap());
return Box::new(Intersection {
left,
right,

@@ -43,7 +43,7 @@ impl<TPostings: Postings> DocSet for PostingsWithOffset<TPostings> {

pub struct PhraseScorer<TPostings: Postings> {
intersection_docset: Intersection<PostingsWithOffset<TPostings>, PostingsWithOffset<TPostings>>,
num_docsets: usize,
num_terms: usize,
left: Vec<u32>,
right: Vec<u32>,
phrase_count: u32,
@@ -138,7 +138,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
.collect::<Vec<_>>();
PhraseScorer {
intersection_docset: Intersection::new(postings_with_offsets),
num_docsets,
num_terms: num_docsets,
left: Vec::with_capacity(100),
right: Vec::with_capacity(100),
phrase_count: 0u32,
@@ -165,7 +165,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
.positions(&mut self.left);
}
let mut intersection_len = self.left.len();
for i in 1..self.num_docsets - 1 {
for i in 1..self.num_terms - 1 {
{
self.intersection_docset
.docset_mut_specialized(i)
@@ -178,7 +178,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}

self.intersection_docset
.docset_mut_specialized(self.num_docsets - 1)
.docset_mut_specialized(self.num_terms - 1)
.positions(&mut self.right);
intersection_exists(&self.left[..intersection_len], &self.right[..])
}
@@ -190,7 +190,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
.positions(&mut self.left);
}
let mut intersection_len = self.left.len();
for i in 1..self.num_docsets - 1 {
for i in 1..self.num_terms - 1 {
{
self.intersection_docset
.docset_mut_specialized(i)
@@ -203,7 +203,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}

self.intersection_docset
.docset_mut_specialized(self.num_docsets - 1)
.docset_mut_specialized(self.num_terms - 1)
.positions(&mut self.right);
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
}

@@ -1,6 +1,6 @@
use super::Weight;
use core::searcher::Searcher;
use downcast;
use downcast_rs;
use std::collections::BTreeSet;
use std::fmt;
use Result;
@@ -39,7 +39,7 @@ use Term;
///
/// When implementing a new type of `Query`, it is normal to implement a
/// dedicated `Query`, `Weight` and `Scorer`.
pub trait Query: QueryClone + downcast::Any + fmt::Debug {
pub trait Query: QueryClone + downcast_rs::Downcast + fmt::Debug {
/// Create the weight associated to a query.
///
/// If scoring is not required, setting `scoring_enabled` to `false`
@@ -96,7 +96,4 @@ impl QueryClone for Box<Query> {
}
}

#[allow(missing_docs)]
mod downcast_impl {
downcast!(super::Query);
}
impl_downcast!(Query);

@@ -1,6 +1,6 @@
use common::BitSet;
use docset::{DocSet, SkipResult};
use downcast;
use downcast_rs;
use std::ops::DerefMut;
use DocId;
use Score;
@@ -8,7 +8,7 @@ use Score;
/// Scored set of documents matching a query within a specific segment.
///
/// See [`Query`](./trait.Query.html).
pub trait Scorer: downcast::Any + DocSet + 'static {
pub trait Scorer: downcast_rs::Downcast + DocSet + 'static {
/// Returns the score.
///
/// This method will perform a bit of computation and is not cached.
@@ -23,10 +23,7 @@ pub trait Scorer: downcast::Any + DocSet + 'static {
}
}

#[allow(missing_docs)]
mod downcast_impl {
downcast!(super::Scorer);
}
impl_downcast!(Scorer);

impl Scorer for Box<Scorer> {
fn score(&mut self) -> Score {

@@ -15,6 +15,7 @@ pub enum Language {
Finnish,
French,
German,
Greek,
Hungarian,
Italian,
Portuguese,
@@ -37,6 +38,7 @@ impl Language {
Finnish => Algorithm::Finnish,
French => Algorithm::French,
German => Algorithm::German,
Greek => Algorithm::Greek,
Hungarian => Algorithm::Hungarian,
Italian => Algorithm::Italian,
Portuguese => Algorithm::Portuguese,
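The last hunk adds Greek to the stemmer's `Language` enum. Assuming the 0.9 tokenizer API (a `Stemmer::new(Language)` constructor and the usual `TextAnalyzer` filter chain; check `tantivy::tokenizer` for the exact signatures), registering a non-English stemming pipeline would look roughly like this:

```rust
use tantivy::tokenizer::{Language, LowerCaser, SimpleTokenizer, Stemmer, TextAnalyzer};
use tantivy::Index;

// Hypothetical sketch: build an analyzer that lowercases and then stems Greek
// tokens, and register it under a custom name so schema fields can refer to it.
fn register_greek_stemmer(index: &Index) {
    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(LowerCaser)
        .filter(Stemmer::new(Language::Greek));
    index.tokenizers().register("el_stem", analyzer);
}
```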