mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-22 18:19:58 +00:00
Minor refactoring (#1266)
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
use std::{convert::TryInto, io};
|
||||
use std::convert::TryInto;
|
||||
use std::io;
|
||||
|
||||
pub struct BitPacker {
|
||||
mini_buffer: u64,
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
use super::bitpacker::BitPacker;
|
||||
use super::compute_num_bits;
|
||||
use crate::{minmax, BitUnpacker};
|
||||
|
||||
use super::{bitpacker::BitPacker, compute_num_bits};
|
||||
|
||||
const BLOCK_SIZE: usize = 128;
|
||||
|
||||
/// `BlockedBitpacker` compresses data in blocks of
|
||||
/// 128 elements, while keeping an index on it
|
||||
///
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct BlockedBitpacker {
|
||||
// bitpacked blocks
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
mod bitpacker;
|
||||
mod blocked_bitpacker;
|
||||
|
||||
pub use crate::bitpacker::BitPacker;
|
||||
pub use crate::bitpacker::BitUnpacker;
|
||||
pub use crate::bitpacker::{BitPacker, BitUnpacker};
|
||||
pub use crate::blocked_bitpacker::BlockedBitpacker;
|
||||
|
||||
/// Computes the number of bits that will be used for bitpacking.
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use ownedbytes::OwnedBytes;
|
||||
use std::convert::TryInto;
|
||||
use std::io::Write;
|
||||
use std::u64;
|
||||
use std::{fmt, io};
|
||||
use std::{fmt, io, u64};
|
||||
|
||||
use ownedbytes::OwnedBytes;
|
||||
|
||||
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||
pub struct TinySet(u64);
|
||||
@@ -187,7 +187,6 @@ fn num_buckets(max_val: u32) -> u32 {
|
||||
|
||||
impl BitSet {
|
||||
/// serialize a `BitSet`.
|
||||
///
|
||||
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
|
||||
writer.write_all(self.max_value.to_le_bytes().as_ref())?;
|
||||
for tinyset in self.tinysets.iter().cloned() {
|
||||
@@ -353,7 +352,6 @@ impl ReadOnlyBitSet {
|
||||
}
|
||||
|
||||
/// Iterate the tinyset on the fly from serialized data.
|
||||
///
|
||||
#[inline]
|
||||
fn iter_tinysets(&self) -> impl Iterator<Item = TinySet> + '_ {
|
||||
self.data.chunks_exact(8).map(move |chunk| {
|
||||
@@ -363,7 +361,6 @@ impl ReadOnlyBitSet {
|
||||
}
|
||||
|
||||
/// Iterate over the positions of the elements.
|
||||
///
|
||||
#[inline]
|
||||
pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
|
||||
self.iter_tinysets()
|
||||
@@ -415,14 +412,14 @@ impl<'a> From<&'a BitSet> for ReadOnlyBitSet {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::BitSet;
|
||||
use super::ReadOnlyBitSet;
|
||||
use super::TinySet;
|
||||
use std::collections::HashSet;
|
||||
|
||||
use ownedbytes::OwnedBytes;
|
||||
use rand::distributions::Bernoulli;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use std::collections::HashSet;
|
||||
|
||||
use super::{BitSet, ReadOnlyBitSet, TinySet};
|
||||
|
||||
#[test]
|
||||
fn test_read_serialized_bitset_full_multi() {
|
||||
@@ -710,10 +707,10 @@ mod tests {
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use test;
|
||||
|
||||
use super::{BitSet, TinySet};
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_pop(b: &mut test::Bencher) {
|
||||
b.iter(|| {
|
||||
|
||||
@@ -104,11 +104,12 @@ pub fn u64_to_f64(val: u64) -> f64 {
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
|
||||
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||
use super::{BinarySerializable, FixedSize};
|
||||
use proptest::prelude::*;
|
||||
use std::f64;
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, BinarySerializable, FixedSize};
|
||||
|
||||
fn test_i64_converter_helper(val: i64) {
|
||||
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
||||
}
|
||||
@@ -157,10 +158,10 @@ pub mod test {
|
||||
#[test]
|
||||
fn test_f64_order() {
|
||||
assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
|
||||
.contains(&f64_to_u64(f64::NAN))); //nan is not a number
|
||||
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa
|
||||
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent
|
||||
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and mantissa
|
||||
.contains(&f64_to_u64(f64::NAN))); // nan is not a number
|
||||
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); // same exponent, different mantissa
|
||||
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); // same mantissa, different exponent
|
||||
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); // different exponent and mantissa
|
||||
assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
|
||||
assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
|
||||
assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
use crate::Endianness;
|
||||
use crate::VInt;
|
||||
use std::io::{Read, Write};
|
||||
use std::{fmt, io};
|
||||
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
|
||||
use crate::{Endianness, VInt};
|
||||
|
||||
/// Trait for a simple binary serialization.
|
||||
pub trait BinarySerializable: fmt::Debug + Sized {
|
||||
@@ -202,8 +201,7 @@ impl BinarySerializable for String {
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
|
||||
use super::VInt;
|
||||
use super::*;
|
||||
use super::{VInt, *};
|
||||
use crate::serialize::BinarySerializable;
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
use super::BinarySerializable;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
use std::io::{Read, Write};
|
||||
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
|
||||
use super::BinarySerializable;
|
||||
|
||||
/// Wrapper over a `u64` that serializes as a variable int.
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
@@ -174,9 +175,7 @@ impl BinarySerializable for VInt {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::serialize_vint_u32;
|
||||
use super::BinarySerializable;
|
||||
use super::VInt;
|
||||
use super::{serialize_vint_u32, BinarySerializable, VInt};
|
||||
|
||||
fn aux_test_vint(val: u64) {
|
||||
let mut v = [14u8; 10];
|
||||
|
||||
@@ -54,7 +54,8 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite.html#tymethod.terminate_ref) directly
|
||||
/// Struct used to prevent from calling
|
||||
/// [`terminate_ref`](trait.TerminatingWrite.html#tymethod.terminate_ref) directly
|
||||
///
|
||||
/// The point is that while the type is public, it cannot be built by anyone
|
||||
/// outside of this module.
|
||||
@@ -64,9 +65,7 @@ pub struct AntiCallToken(());
|
||||
pub trait TerminatingWrite: Write {
|
||||
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
|
||||
fn terminate(mut self) -> io::Result<()>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
where Self: Sized {
|
||||
self.terminate_ref(AntiCallToken(()))
|
||||
}
|
||||
|
||||
@@ -97,9 +96,10 @@ impl<'a> TerminatingWrite for &'a mut Vec<u8> {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::CountingWriter;
|
||||
use std::io::Write;
|
||||
|
||||
use super::CountingWriter;
|
||||
|
||||
#[test]
|
||||
fn test_counting_writer() {
|
||||
let buffer: Vec<u8> = vec![];
|
||||
|
||||
@@ -91,8 +91,8 @@ fn main() -> tantivy::Result<()> {
|
||||
old_man_doc.add_text(title, "The Old Man and the Sea");
|
||||
old_man_doc.add_text(
|
||||
body,
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish.",
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \
|
||||
eighty-four days now without taking a fish.",
|
||||
);
|
||||
|
||||
// ... and add it to the `IndexWriter`.
|
||||
|
||||
@@ -12,8 +12,7 @@
|
||||
use tantivy::collector::{Collector, SegmentCollector};
|
||||
use tantivy::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::Field;
|
||||
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
|
||||
use tantivy::schema::{Field, Schema, FAST, INDEXED, TEXT};
|
||||
use tantivy::{doc, Index, Score, SegmentReader};
|
||||
|
||||
#[derive(Default)]
|
||||
|
||||
@@ -56,8 +56,9 @@ fn main() -> tantivy::Result<()> {
|
||||
// If it is `text`, let's make sure to keep it `raw` and let's avoid
|
||||
// running any text processing on it.
|
||||
// This is done by associating this field to the tokenizer named `raw`.
|
||||
// Rather than building our [`TextOptions`](//docs.rs/tantivy/~0/tantivy/schema/struct.TextOptions.html) manually,
|
||||
// We use the `STRING` shortcut. `STRING` stands for indexed (without term frequency or positions)
|
||||
// Rather than building our
|
||||
// [`TextOptions`](//docs.rs/tantivy/~0/tantivy/schema/struct.TextOptions.html) manually, We
|
||||
// use the `STRING` shortcut. `STRING` stands for indexed (without term frequency or positions)
|
||||
// and untokenized.
|
||||
//
|
||||
// Because we also want to be able to see this `id` in our returned documents,
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::doc;
|
||||
use tantivy::query::BooleanQuery;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::{DocId, Index, Score, SegmentReader};
|
||||
use tantivy::{doc, DocId, Index, Score, SegmentReader};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -87,7 +87,7 @@ fn main() -> tantivy::Result<()> {
|
||||
.unwrap()
|
||||
.get_first(title)
|
||||
.unwrap()
|
||||
.text()
|
||||
.as_text()
|
||||
.unwrap()
|
||||
.to_owned()
|
||||
})
|
||||
|
||||
@@ -52,11 +52,11 @@ fn main() -> tantivy::Result<()> {
|
||||
let term_the = Term::from_field_text(title, "the");
|
||||
|
||||
// This segment posting object is like a cursor over the documents matching the term.
|
||||
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies
|
||||
// and positions.
|
||||
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term
|
||||
// frequencies and positions.
|
||||
//
|
||||
// If you don't need all this information, you may get better performance by decompressing less
|
||||
// information.
|
||||
// If you don't need all this information, you may get better performance by decompressing
|
||||
// less information.
|
||||
if let Some(mut segment_postings) =
|
||||
inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)?
|
||||
{
|
||||
@@ -109,11 +109,11 @@ fn main() -> tantivy::Result<()> {
|
||||
let inverted_index = segment_reader.inverted_index(title)?;
|
||||
|
||||
// This segment posting object is like a cursor over the documents matching the term.
|
||||
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies
|
||||
// and positions.
|
||||
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term
|
||||
// frequencies and positions.
|
||||
//
|
||||
// If you don't need all this information, you may get better performance by decompressing less
|
||||
// information.
|
||||
// If you don't need all this information, you may get better performance by decompressing
|
||||
// less information.
|
||||
if let Some(mut block_segment_postings) =
|
||||
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)?
|
||||
{
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use tantivy::schema::{Schema, STORED, TEXT};
|
||||
use tantivy::{doc, Index, IndexWriter, Opstamp, TantivyError};
|
||||
|
||||
@@ -90,7 +91,8 @@ fn main() -> tantivy::Result<()> {
|
||||
// # In the main thread, we commit 10 times, once every 500ms.
|
||||
for _ in 0..10 {
|
||||
let opstamp: Opstamp = {
|
||||
// Committing or rollbacking on the other hand requires write lock. This will block other threads.
|
||||
// Committing or rollbacking on the other hand requires write lock. This will block
|
||||
// other threads.
|
||||
let mut index_writer_wlock = index_writer.write().unwrap();
|
||||
index_writer_wlock.commit()?
|
||||
};
|
||||
|
||||
@@ -57,7 +57,10 @@ fn main() -> tantivy::Result<()> {
|
||||
let doc = searcher.doc(doc_address)?;
|
||||
let snippet = snippet_generator.snippet_from_doc(&doc);
|
||||
println!("Document score {}:", score);
|
||||
println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
|
||||
println!(
|
||||
"title: {}",
|
||||
doc.get_first(title).unwrap().as_text().unwrap()
|
||||
);
|
||||
println!("snippet: {}", snippet.to_html());
|
||||
println!("custom highlighting: {}", highlight(snippet));
|
||||
}
|
||||
|
||||
@@ -6,8 +6,10 @@ use tantivy::collector::TopDocs;
|
||||
use tantivy::fastfield::FastFieldReader;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Field, Schema, FAST, TEXT};
|
||||
use tantivy::{doc, DocAddress, DocId, Index, IndexReader, SegmentReader};
|
||||
use tantivy::{Opstamp, Searcher, SearcherGeneration, SegmentId, Warmer};
|
||||
use tantivy::{
|
||||
doc, DocAddress, DocId, Index, IndexReader, Opstamp, Searcher, SearcherGeneration, SegmentId,
|
||||
SegmentReader, Warmer,
|
||||
};
|
||||
|
||||
// This example shows how warmers can be used to
|
||||
// load a values from an external sources using the Warmer API.
|
||||
@@ -90,7 +92,6 @@ impl Warmer for DynamicPriceColumn {
|
||||
/// This map represents a map (ProductId -> Price)
|
||||
///
|
||||
/// In practise, it could be fetching things from an external service, like a SQL table.
|
||||
///
|
||||
#[derive(Default, Clone)]
|
||||
pub struct ExternalPriceTable {
|
||||
prices: Arc<RwLock<HashMap<ProductId, Price>>>,
|
||||
|
||||
@@ -4,14 +4,14 @@ extern crate test;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use fastfield_codecs::{
|
||||
bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
|
||||
linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
|
||||
multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
},
|
||||
*,
|
||||
use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
|
||||
use fastfield_codecs::linearinterpol::{
|
||||
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::*;
|
||||
|
||||
fn get_data() -> Vec<u64> {
|
||||
let mut data: Vec<_> = (100..55000_u64)
|
||||
|
||||
@@ -1,13 +1,9 @@
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use common::BinarySerializable;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
use common::BinarySerializable;
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
|
||||
@@ -53,7 +53,8 @@ pub trait FastFieldCodecSerializer {
|
||||
pub trait FastFieldDataAccess {
|
||||
/// Return the value associated to the given position.
|
||||
///
|
||||
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
|
||||
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance
|
||||
/// reasons.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
@@ -82,12 +83,10 @@ impl FastFieldDataAccess for Vec<u64> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{
|
||||
bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
|
||||
linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
|
||||
multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
},
|
||||
use crate::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
|
||||
use crate::linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer};
|
||||
use crate::multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
};
|
||||
|
||||
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
|
||||
@@ -1,15 +1,10 @@
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::FixedSize;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
@@ -137,7 +132,7 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
// will be offset to 0
|
||||
offset = offset.max(calculated_value - actual_value);
|
||||
} else {
|
||||
//positive value no offset reuqired
|
||||
// positive value no offset reuqired
|
||||
rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
|
||||
}
|
||||
}
|
||||
@@ -171,7 +166,7 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 3 {
|
||||
return false; //disable compressor for this case
|
||||
return false; // disable compressor for this case
|
||||
}
|
||||
// On serialisation the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
@@ -211,8 +206,8 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within 50%
|
||||
// threshold.
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within
|
||||
// 50% threshold.
|
||||
// It is multiplied by 2 because in a log case scenario the line would be as much above as
|
||||
// below. So the offset would = max_distance
|
||||
//
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
#[macro_use]
|
||||
extern crate prettytable;
|
||||
use fastfield_codecs::{
|
||||
linearinterpol::LinearInterpolFastFieldSerializer,
|
||||
multilinearinterpol::MultiLinearInterpolFastFieldSerializer, FastFieldCodecSerializer,
|
||||
FastFieldStats,
|
||||
};
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats};
|
||||
use prettytable::{Cell, Row, Table};
|
||||
|
||||
fn main() {
|
||||
@@ -24,7 +22,7 @@ fn main() {
|
||||
);
|
||||
results.push(res);
|
||||
|
||||
//let best_estimation_codec = results
|
||||
// let best_estimation_codec = results
|
||||
//.iter()
|
||||
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
|
||||
//.unwrap();
|
||||
@@ -73,7 +71,7 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
current_cumulative
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
//let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
|
||||
// let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing concave"));
|
||||
|
||||
let mut current_cumulative = 0;
|
||||
|
||||
@@ -1,30 +1,22 @@
|
||||
/*!
|
||||
//! MultiLinearInterpol compressor uses linear interpolation to guess a values and stores the
|
||||
//! offset, but in blocks of 512.
|
||||
//!
|
||||
//! With a CHUNK_SIZE of 512 and 29 byte metadata per block, we get a overhead for metadata of 232 /
|
||||
//! 512 = 0,45 bits per element. The additional space required per element in a block is the the
|
||||
//! maximum deviation of the linear interpolation estimation function.
|
||||
//!
|
||||
//! E.g. if the maximum deviation of an element is 12, all elements cost 4bits.
|
||||
//!
|
||||
//! Size per block:
|
||||
//! Num Elements * Maximum Deviation from Interpolation + 29 Byte Metadata
|
||||
|
||||
MultiLinearInterpol compressor uses linear interpolation to guess a values and stores the offset, but in blocks of 512.
|
||||
|
||||
With a CHUNK_SIZE of 512 and 29 byte metadata per block, we get a overhead for metadata of 232 / 512 = 0,45 bits per element.
|
||||
The additional space required per element in a block is the the maximum deviation of the linear interpolation estimation function.
|
||||
|
||||
E.g. if the maximum deviation of an element is 12, all elements cost 4bits.
|
||||
|
||||
Size per block:
|
||||
Num Elements * Maximum Deviation from Interpolation + 29 Byte Metadata
|
||||
|
||||
*/
|
||||
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use common::CountingWriter;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::DeserializeFrom;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
use common::{BinarySerializable, CountingWriter, DeserializeFrom};
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
|
||||
const CHUNK_SIZE: u64 = 512;
|
||||
|
||||
@@ -252,11 +244,11 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
);
|
||||
if calculated_value > actual_value {
|
||||
// negative value we need to apply an offset
|
||||
// we ignore negative values in the max value calculation, because negative values
|
||||
// will be offset to 0
|
||||
// we ignore negative values in the max value calculation, because negative
|
||||
// values will be offset to 0
|
||||
offset = offset.max(calculated_value - actual_value);
|
||||
} else {
|
||||
//positive value no offset reuqired
|
||||
// positive value no offset reuqired
|
||||
rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
|
||||
}
|
||||
}
|
||||
@@ -350,8 +342,8 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
.unwrap();
|
||||
|
||||
// Estimate one block and extrapolate the cost to all blocks.
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within 50%
|
||||
// threshold.
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within
|
||||
// 50% threshold.
|
||||
// It is multiplied by 2 because in a log case scenario the line would be as much above as
|
||||
// below. So the offset would = max_distance
|
||||
//
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#![allow(clippy::return_self_not_must_use)]
|
||||
|
||||
use stable_deref_trait::StableDeref;
|
||||
use std::convert::TryInto;
|
||||
use std::mem;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::{fmt, io};
|
||||
use std::{fmt, io, mem};
|
||||
|
||||
use stable_deref_trait::StableDeref;
|
||||
|
||||
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
|
||||
/// this data as a static slice.
|
||||
@@ -102,7 +102,6 @@ impl OwnedBytes {
|
||||
}
|
||||
|
||||
/// Drops the left most `advance_len` bytes.
|
||||
///
|
||||
#[inline]
|
||||
pub fn advance(&mut self, advance_len: usize) {
|
||||
self.data = &self.data[advance_len..]
|
||||
@@ -163,8 +162,7 @@ impl PartialEq<str> for OwnedBytes {
|
||||
}
|
||||
|
||||
impl<'a, T: ?Sized> PartialEq<&'a T> for OwnedBytes
|
||||
where
|
||||
OwnedBytes: PartialEq<T>,
|
||||
where OwnedBytes: PartialEq<T>
|
||||
{
|
||||
fn eq(&self, other: &&'a T) -> bool {
|
||||
*self == **other
|
||||
|
||||
@@ -1,17 +1,20 @@
|
||||
use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
|
||||
use crate::Occur;
|
||||
use combine::error::StringStreamError;
|
||||
use combine::parser::char::{char, digit, space, spaces, string};
|
||||
use combine::parser::combinator::recognize;
|
||||
use combine::parser::range::{take_while, take_while1};
|
||||
use combine::parser::repeat::escaped;
|
||||
use combine::parser::Parser;
|
||||
use combine::{
|
||||
attempt, choice, eof, many, many1, one_of, optional, parser, satisfy, skip_many1, value,
|
||||
};
|
||||
use combine::{error::StringStreamError, parser::combinator::recognize};
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
|
||||
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to special characters.
|
||||
use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
|
||||
use crate::Occur;
|
||||
|
||||
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
|
||||
// special characters.
|
||||
const SPECIAL_CHARS: &[char] = &[
|
||||
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
|
||||
];
|
||||
@@ -363,9 +366,10 @@ mod test {
|
||||
|
||||
type TestParseResult = Result<(), StringStreamError>;
|
||||
|
||||
use super::*;
|
||||
use combine::parser::Parser;
|
||||
|
||||
use super::*;
|
||||
|
||||
pub fn nearly_equals(a: f64, b: f64) -> bool {
|
||||
(a - b).abs() < 0.0005 * (a + b).abs()
|
||||
}
|
||||
|
||||
@@ -1 +1,7 @@
|
||||
use_try_shorthand = true
|
||||
comment_width = 120
|
||||
format_strings = true
|
||||
group_imports = "StdExternalCrate"
|
||||
imports_granularity = "Module"
|
||||
normalize_comments = true
|
||||
where_single_line = true
|
||||
wrap_comments = true
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
use super::Collector;
|
||||
use crate::collector::SegmentCollector;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use crate::{DocId, Score, SegmentOrdinal, SegmentReader};
|
||||
|
||||
/// `CountCollector` collector only counts how many
|
||||
/// documents match the query.
|
||||
@@ -80,8 +77,7 @@ impl SegmentCollector for SegmentCountCollector {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{Count, SegmentCountCollector};
|
||||
use crate::collector::Collector;
|
||||
use crate::collector::SegmentCollector;
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
|
||||
#[test]
|
||||
fn test_count_collect_does_not_requires_scoring() {
|
||||
|
||||
@@ -8,8 +8,7 @@ pub(crate) struct CustomScoreTopCollector<TCustomScorer, TScore = Score> {
|
||||
}
|
||||
|
||||
impl<TCustomScorer, TScore> CustomScoreTopCollector<TCustomScorer, TScore>
|
||||
where
|
||||
TScore: Clone + PartialOrd,
|
||||
where TScore: Clone + PartialOrd
|
||||
{
|
||||
pub(crate) fn new(
|
||||
custom_scorer: TCustomScorer,
|
||||
@@ -114,8 +113,7 @@ where
|
||||
}
|
||||
|
||||
impl<F, TScore> CustomSegmentScorer<TScore> for F
|
||||
where
|
||||
F: 'static + FnMut(DocId) -> TScore,
|
||||
where F: 'static + FnMut(DocId) -> TScore
|
||||
{
|
||||
fn score(&mut self, doc: DocId) -> TScore {
|
||||
(self)(doc)
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use crate::{DocAddress, DocId, Score};
|
||||
|
||||
use super::{Collector, SegmentCollector};
|
||||
use crate::{DocAddress, DocId, Score};
|
||||
|
||||
/// Collectors that returns the set of DocAddress that matches the query.
|
||||
///
|
||||
|
||||
@@ -1,21 +1,14 @@
|
||||
use crate::collector::Collector;
|
||||
use crate::collector::SegmentCollector;
|
||||
use crate::fastfield::FacetReader;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::Field;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::btree_map;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::{btree_map, BTreeMap, BTreeSet, BinaryHeap};
|
||||
use std::iter::Peekable;
|
||||
use std::ops::Bound;
|
||||
use std::{u64, usize};
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::FacetReader;
|
||||
use crate::schema::{Facet, Field};
|
||||
use crate::{DocId, Score, SegmentOrdinal, SegmentReader};
|
||||
|
||||
struct Hit<'a> {
|
||||
count: u64,
|
||||
facet: &'a Facet,
|
||||
@@ -240,9 +233,7 @@ impl FacetCollector {
|
||||
/// If you need the correct number of unique documents for two such facets,
|
||||
/// just add them in separate `FacetCollector`.
|
||||
pub fn add_facet<T>(&mut self, facet_from: T)
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
where Facet: From<T> {
|
||||
let facet = Facet::from(facet_from);
|
||||
for old_facet in &self.facets {
|
||||
assert!(
|
||||
@@ -402,9 +393,7 @@ impl FacetCounts {
|
||||
/// Returns an iterator over all of the facet count pairs inside this result.
|
||||
/// See the documentation for [FacetCollector] for a usage example.
|
||||
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator<'_>
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
where Facet: From<T> {
|
||||
let facet = Facet::from(facet_from);
|
||||
let left_bound = Bound::Excluded(facet.clone());
|
||||
let right_bound = if facet.is_root() {
|
||||
@@ -423,9 +412,7 @@ impl FacetCounts {
|
||||
/// Returns a vector of top `k` facets with their counts, sorted highest-to-lowest by counts.
|
||||
/// See the documentation for [FacetCollector] for a usage example.
|
||||
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
where Facet: From<T> {
|
||||
let mut heap = BinaryHeap::with_capacity(k);
|
||||
let mut it = self.get(facet);
|
||||
|
||||
@@ -458,16 +445,18 @@ impl FacetCounts {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::iter;
|
||||
|
||||
use rand::distributions::Uniform;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::{thread_rng, Rng};
|
||||
|
||||
use super::{FacetCollector, FacetCounts};
|
||||
use crate::collector::Count;
|
||||
use crate::core::Index;
|
||||
use crate::query::{AllQuery, QueryParser, TermQuery};
|
||||
use crate::schema::{Document, Facet, FacetOptions, Field, IndexRecordOption, Schema};
|
||||
use crate::Term;
|
||||
use rand::distributions::Uniform;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::{thread_rng, Rng};
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_drilldown() -> crate::Result<()> {
|
||||
@@ -522,8 +511,9 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Tried to add a facet which is a descendant of \
|
||||
an already added facet.")]
|
||||
#[should_panic(
|
||||
expected = "Tried to add a facet which is a descendant of an already added facet."
|
||||
)]
|
||||
fn test_misused_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field::from_field_id(0));
|
||||
facet_collector.add_facet(Facet::from("/country"));
|
||||
@@ -700,13 +690,14 @@ mod tests {
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use test::Bencher;
|
||||
|
||||
use crate::collector::FacetCollector;
|
||||
use crate::query::AllQuery;
|
||||
use crate::schema::{Facet, Schema, INDEXED};
|
||||
use crate::Index;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn bench_facet_collector(b: &mut Bencher) {
|
||||
|
||||
@@ -17,7 +17,8 @@ use crate::schema::Field;
|
||||
use crate::{Score, SegmentReader, TantivyError};
|
||||
|
||||
/// The `FilterCollector` filters docs using a fast field value and a predicate.
|
||||
/// Only the documents for which the predicate returned "true" will be passed on to the next collector.
|
||||
/// Only the documents for which the predicate returned "true" will be passed on to the next
|
||||
/// collector.
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy::collector::{TopDocs, FilterCollector};
|
||||
@@ -58,8 +59,7 @@ use crate::{Score, SegmentReader, TantivyError};
|
||||
/// # }
|
||||
/// ```
|
||||
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
|
||||
where
|
||||
TPredicate: 'static + Clone,
|
||||
where TPredicate: 'static + Clone
|
||||
{
|
||||
field: Field,
|
||||
collector: TCollector,
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
use fastdivide::DividerU64;
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
|
||||
use crate::schema::{Field, Type};
|
||||
use crate::{DocId, Score};
|
||||
use fastdivide::DividerU64;
|
||||
|
||||
/// Histogram builds an histogram of the values of a fastfield for the
|
||||
/// collected DocSet.
|
||||
@@ -36,8 +37,8 @@ impl HistogramCollector {
|
||||
/// - `bucket_width`: the length of the interval that is associated to each buckets.
|
||||
/// - `num_buckets`: The overall number of buckets.
|
||||
///
|
||||
/// Together, this parameters define a partition of `[min_value, min_value + num_buckets * bucket_width)`
|
||||
/// into `num_buckets` intervals of width bucket that we call `bucket`.
|
||||
/// Together, this parameters define a partition of `[min_value, min_value + num_buckets *
|
||||
/// bucket_width)` into `num_buckets` intervals of width bucket that we call `bucket`.
|
||||
///
|
||||
/// # Disclaimer
|
||||
/// This function panics if the field given is of type f64.
|
||||
@@ -147,12 +148,13 @@ fn add_vecs(mut vals_list: Vec<Vec<u64>>, len: usize) -> Vec<u64> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use fastdivide::DividerU64;
|
||||
use query::AllQuery;
|
||||
|
||||
use super::{add_vecs, HistogramCollector, HistogramComputer};
|
||||
use crate::chrono::{TimeZone, Utc};
|
||||
use crate::schema::{Schema, FAST};
|
||||
use crate::{doc, query, Index};
|
||||
use fastdivide::DividerU64;
|
||||
use query::AllQuery;
|
||||
|
||||
#[test]
|
||||
fn test_add_histograms_simple() {
|
||||
|
||||
@@ -1,95 +1,90 @@
|
||||
/*!
|
||||
//! # Collectors
|
||||
//!
|
||||
//! Collectors define the information you want to extract from the documents matching the queries.
|
||||
//! In tantivy jargon, we call this information your search "fruit".
|
||||
//!
|
||||
//! Your fruit could for instance be :
|
||||
//! - [the count of matching documents](./struct.Count.html)
|
||||
//! - [the top 10 documents, by relevancy or by a fast field](./struct.TopDocs.html)
|
||||
//! - [facet counts](./struct.FacetCollector.html)
|
||||
//!
|
||||
//! At one point in your code, you will trigger the actual search operation by calling
|
||||
//! [the `search(...)` method of your `Searcher` object](../struct.Searcher.html#method.search).
|
||||
//! This call will look like this.
|
||||
//!
|
||||
//! ```verbatim
|
||||
//! let fruit = searcher.search(&query, &collector)?;
|
||||
//! ```
|
||||
//!
|
||||
//! Here the type of fruit is actually determined as an associated type of the collector
|
||||
//! (`Collector::Fruit`).
|
||||
//!
|
||||
//!
|
||||
//! # Combining several collectors
|
||||
//!
|
||||
//! A rich search experience often requires to run several collectors on your search query.
|
||||
//! For instance,
|
||||
//! - selecting the top-K products matching your query
|
||||
//! - counting the matching documents
|
||||
//! - computing several facets
|
||||
//! - computing statistics about the matching product prices
|
||||
//!
|
||||
//! A simple and efficient way to do that is to pass your collectors as one tuple.
|
||||
//! The resulting `Fruit` will then be a typed tuple with each collector's original fruits
|
||||
//! in their respective position.
|
||||
//!
|
||||
//! ```rust
|
||||
//! # use tantivy::schema::*;
|
||||
//! # use tantivy::*;
|
||||
//! # use tantivy::query::*;
|
||||
//! use tantivy::collector::{Count, TopDocs};
|
||||
//! #
|
||||
//! # fn main() -> tantivy::Result<()> {
|
||||
//! # let mut schema_builder = Schema::builder();
|
||||
//! # let title = schema_builder.add_text_field("title", TEXT);
|
||||
//! # let schema = schema_builder.build();
|
||||
//! # let index = Index::create_in_ram(schema);
|
||||
//! # let mut index_writer = index.writer(3_000_000)?;
|
||||
//! # index_writer.add_document(doc!(
|
||||
//! # title => "The Name of the Wind",
|
||||
//! # ))?;
|
||||
//! # index_writer.add_document(doc!(
|
||||
//! # title => "The Diary of Muadib",
|
||||
//! # ))?;
|
||||
//! # index_writer.commit()?;
|
||||
//! # let reader = index.reader()?;
|
||||
//! # let searcher = reader.searcher();
|
||||
//! # let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
//! # let query = query_parser.parse_query("diary")?;
|
||||
//! let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
|
||||
//! searcher.search(&query, &(Count, TopDocs::with_limit(2)))?;
|
||||
//! # Ok(())
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! The `Collector` trait is implemented for up to 4 collectors.
|
||||
//! If you have more than 4 collectors, you can either group them into
|
||||
//! tuples of tuples `(a,(b,(c,d)))`, or rely on [`MultiCollector`](./struct.MultiCollector.html).
|
||||
//!
|
||||
//! # Combining several collectors dynamically
|
||||
//!
|
||||
//! Combining collectors into a tuple is a zero-cost abstraction: everything
|
||||
//! happens as if you had manually implemented a single collector
|
||||
//! combining all of our features.
|
||||
//!
|
||||
//! Unfortunately it requires you to know at compile time your collector types.
|
||||
//! If on the other hand, the collectors depend on some query parameter,
|
||||
//! you can rely on `MultiCollector`'s.
|
||||
//!
|
||||
//!
|
||||
//! # Implementing your own collectors.
|
||||
//!
|
||||
//! See the `custom_collector` example.
|
||||
|
||||
# Collectors
|
||||
|
||||
Collectors define the information you want to extract from the documents matching the queries.
|
||||
In tantivy jargon, we call this information your search "fruit".
|
||||
|
||||
Your fruit could for instance be :
|
||||
- [the count of matching documents](./struct.Count.html)
|
||||
- [the top 10 documents, by relevancy or by a fast field](./struct.TopDocs.html)
|
||||
- [facet counts](./struct.FacetCollector.html)
|
||||
|
||||
At one point in your code, you will trigger the actual search operation by calling
|
||||
[the `search(...)` method of your `Searcher` object](../struct.Searcher.html#method.search).
|
||||
This call will look like this.
|
||||
|
||||
```verbatim
|
||||
let fruit = searcher.search(&query, &collector)?;
|
||||
```
|
||||
|
||||
Here the type of fruit is actually determined as an associated type of the collector (`Collector::Fruit`).
|
||||
|
||||
|
||||
# Combining several collectors
|
||||
|
||||
A rich search experience often requires to run several collectors on your search query.
|
||||
For instance,
|
||||
- selecting the top-K products matching your query
|
||||
- counting the matching documents
|
||||
- computing several facets
|
||||
- computing statistics about the matching product prices
|
||||
|
||||
A simple and efficient way to do that is to pass your collectors as one tuple.
|
||||
The resulting `Fruit` will then be a typed tuple with each collector's original fruits
|
||||
in their respective position.
|
||||
|
||||
```rust
|
||||
# use tantivy::schema::*;
|
||||
# use tantivy::*;
|
||||
# use tantivy::query::*;
|
||||
use tantivy::collector::{Count, TopDocs};
|
||||
#
|
||||
# fn main() -> tantivy::Result<()> {
|
||||
# let mut schema_builder = Schema::builder();
|
||||
# let title = schema_builder.add_text_field("title", TEXT);
|
||||
# let schema = schema_builder.build();
|
||||
# let index = Index::create_in_ram(schema);
|
||||
# let mut index_writer = index.writer(3_000_000)?;
|
||||
# index_writer.add_document(doc!(
|
||||
# title => "The Name of the Wind",
|
||||
# ))?;
|
||||
# index_writer.add_document(doc!(
|
||||
# title => "The Diary of Muadib",
|
||||
# ))?;
|
||||
# index_writer.commit()?;
|
||||
# let reader = index.reader()?;
|
||||
# let searcher = reader.searcher();
|
||||
# let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
# let query = query_parser.parse_query("diary")?;
|
||||
let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
|
||||
searcher.search(&query, &(Count, TopDocs::with_limit(2)))?;
|
||||
# Ok(())
|
||||
# }
|
||||
```
|
||||
|
||||
The `Collector` trait is implemented for up to 4 collectors.
|
||||
If you have more than 4 collectors, you can either group them into
|
||||
tuples of tuples `(a,(b,(c,d)))`, or rely on [`MultiCollector`](./struct.MultiCollector.html).
|
||||
|
||||
# Combining several collectors dynamically
|
||||
|
||||
Combining collectors into a tuple is a zero-cost abstraction: everything
|
||||
happens as if you had manually implemented a single collector
|
||||
combining all of our features.
|
||||
|
||||
Unfortunately it requires you to know at compile time your collector types.
|
||||
If on the other hand, the collectors depend on some query parameter,
|
||||
you can rely on `MultiCollector`'s.
|
||||
|
||||
|
||||
# Implementing your own collectors.
|
||||
|
||||
See the `custom_collector` example.
|
||||
|
||||
*/
|
||||
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use downcast_rs::impl_downcast;
|
||||
|
||||
use crate::{DocId, Score, SegmentOrdinal, SegmentReader};
|
||||
|
||||
mod count_collector;
|
||||
pub use self::count_collector::Count;
|
||||
|
||||
@@ -111,8 +106,7 @@ mod tweak_score_top_collector;
|
||||
pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
|
||||
|
||||
mod facet_collector;
|
||||
pub use self::facet_collector::FacetCollector;
|
||||
pub use self::facet_collector::FacetCounts;
|
||||
pub use self::facet_collector::{FacetCollector, FacetCounts};
|
||||
use crate::query::Weight;
|
||||
|
||||
mod docset_collector;
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
use super::Collector;
|
||||
use super::SegmentCollector;
|
||||
use crate::collector::Fruit;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use crate::TantivyError;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::Deref;
|
||||
|
||||
use super::{Collector, SegmentCollector};
|
||||
use crate::collector::Fruit;
|
||||
use crate::{DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
|
||||
pub struct MultiFruit {
|
||||
sub_fruits: Vec<Option<Box<dyn Fruit>>>,
|
||||
}
|
||||
@@ -104,7 +100,8 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
///
|
||||
/// If the type of the collectors is known, you can just group yours collectors
|
||||
/// in a tuple. See the
|
||||
/// [Combining several collectors section of the collector documentation](./index.html#combining-several-collectors).
|
||||
/// [Combining several collectors section of the collector
|
||||
/// documentation](./index.html#combining-several-collectors).
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy::collector::{Count, TopDocs, MultiCollector};
|
||||
@@ -248,10 +245,8 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::collector::{Count, TopDocs};
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::Index;
|
||||
use crate::Term;
|
||||
use crate::schema::{IndexRecordOption, Schema, TEXT};
|
||||
use crate::{Index, Term};
|
||||
|
||||
#[test]
|
||||
fn test_multi_collector() -> crate::Result<()> {
|
||||
|
||||
@@ -1,21 +1,13 @@
|
||||
use super::*;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::fastfield::BytesFastFieldReader;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::schema::Field;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::{DocAddress, Document, Searcher};
|
||||
|
||||
use crate::collector::{Count, FilterCollector, TopDocs};
|
||||
use crate::query::{AllQuery, QueryParser};
|
||||
use crate::schema::{Schema, FAST, TEXT};
|
||||
use crate::DateTime;
|
||||
use crate::{doc, Index};
|
||||
use std::str::FromStr;
|
||||
|
||||
use super::*;
|
||||
use crate::collector::{Count, FilterCollector, TopDocs};
|
||||
use crate::core::SegmentReader;
|
||||
use crate::fastfield::{BytesFastFieldReader, DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::query::{AllQuery, QueryParser};
|
||||
use crate::schema::{Field, Schema, FAST, TEXT};
|
||||
use crate::{doc, DateTime, DocAddress, DocId, Document, Index, Score, Searcher, SegmentOrdinal};
|
||||
|
||||
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
|
||||
compute_score: true,
|
||||
};
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
use crate::DocAddress;
|
||||
use crate::DocId;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use crate::{DocAddress, DocId, SegmentOrdinal, SegmentReader};
|
||||
|
||||
/// Contains a feature (field, score, etc.) of a document along with the document address.
|
||||
///
|
||||
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
|
||||
@@ -62,8 +60,7 @@ pub(crate) struct TopCollector<T> {
|
||||
}
|
||||
|
||||
impl<T> TopCollector<T>
|
||||
where
|
||||
T: PartialOrd + Clone,
|
||||
where T: PartialOrd + Clone
|
||||
{
|
||||
/// Creates a top collector, with a number of documents equal to "limit".
|
||||
///
|
||||
@@ -322,9 +319,10 @@ mod tests {
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::TopSegmentCollector;
|
||||
use test::Bencher;
|
||||
|
||||
use super::TopSegmentCollector;
|
||||
|
||||
#[bench]
|
||||
fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) {
|
||||
let mut top_collector = TopSegmentCollector::new(0, 400);
|
||||
|
||||
@@ -1,21 +1,18 @@
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fmt;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use super::Collector;
|
||||
use crate::collector::top_collector::{ComparableDoc, TopCollector};
|
||||
use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
|
||||
use crate::collector::top_collector::{ComparableDoc, TopCollector, TopSegmentCollector};
|
||||
use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
|
||||
use crate::collector::{
|
||||
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
|
||||
};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
|
||||
use crate::query::Weight;
|
||||
use crate::schema::Field;
|
||||
use crate::DocAddress;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use crate::{collector::custom_score_top_collector::CustomScoreTopCollector, fastfield::FastValue};
|
||||
use crate::{collector::top_collector::TopSegmentCollector, TantivyError};
|
||||
use std::fmt;
|
||||
use std::{collections::BinaryHeap, marker::PhantomData};
|
||||
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
|
||||
struct FastFieldConvertCollector<
|
||||
TCollector: Collector<Fruit = Vec<(u64, DocAddress)>>,
|
||||
@@ -217,11 +214,12 @@ impl TopDocs {
|
||||
|
||||
/// Set top-K to rank documents by a given fast field.
|
||||
///
|
||||
/// If the field is not a fast or does not exist, this method returns successfully (it is not aware of any schema).
|
||||
/// An error will be returned at the moment of search.
|
||||
/// If the field is not a fast or does not exist, this method returns successfully (it is not
|
||||
/// aware of any schema). An error will be returned at the moment of search.
|
||||
///
|
||||
/// If the field is a FAST field but not a u64 field, search will return successfully but it will return
|
||||
/// returns a monotonic u64-representation (ie. the order is still correct) of the requested field type.
|
||||
/// If the field is a FAST field but not a u64 field, search will return successfully but it
|
||||
/// will return returns a monotonic u64-representation (ie. the order is still correct) of
|
||||
/// the requested field type.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
@@ -296,14 +294,15 @@ impl TopDocs {
|
||||
|
||||
/// Set top-K to rank documents by a given fast field.
|
||||
///
|
||||
/// If the field is not a fast field, or its field type does not match the generic type, this method does not panic,
|
||||
/// but an explicit error will be returned at the moment of collection.
|
||||
/// If the field is not a fast field, or its field type does not match the generic type, this
|
||||
/// method does not panic, but an explicit error will be returned at the moment of
|
||||
/// collection.
|
||||
///
|
||||
/// Note that this method is a generic. The requested fast field type will be often
|
||||
/// inferred in your code by the rust compiler.
|
||||
///
|
||||
/// Implementation-wise, for performance reason, tantivy will manipulate the u64 representation of your fast
|
||||
/// field until the last moment.
|
||||
/// Implementation-wise, for performance reason, tantivy will manipulate the u64 representation
|
||||
/// of your fast field until the last moment.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
@@ -715,10 +714,7 @@ mod tests {
|
||||
use crate::collector::Collector;
|
||||
use crate::query::{AllQuery, Query, QueryParser};
|
||||
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
|
||||
use crate::Index;
|
||||
use crate::IndexWriter;
|
||||
use crate::Score;
|
||||
use crate::{DocAddress, DocId, SegmentReader};
|
||||
use crate::{DocAddress, DocId, Index, IndexWriter, Score, SegmentReader};
|
||||
|
||||
fn make_index() -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use crate::collector::top_collector::{TopCollector, TopSegmentCollector};
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::DocAddress;
|
||||
use crate::{DocId, Result, Score, SegmentReader};
|
||||
use crate::{DocAddress, DocId, Result, Score, SegmentReader};
|
||||
|
||||
pub(crate) struct TweakedScoreTopCollector<TScoreTweaker, TScore = Score> {
|
||||
score_tweaker: TScoreTweaker,
|
||||
@@ -9,8 +8,7 @@ pub(crate) struct TweakedScoreTopCollector<TScoreTweaker, TScore = Score> {
|
||||
}
|
||||
|
||||
impl<TScoreTweaker, TScore> TweakedScoreTopCollector<TScoreTweaker, TScore>
|
||||
where
|
||||
TScore: Clone + PartialOrd,
|
||||
where TScore: Clone + PartialOrd
|
||||
{
|
||||
pub fn new(
|
||||
score_tweaker: TScoreTweaker,
|
||||
@@ -118,8 +116,7 @@ where
|
||||
}
|
||||
|
||||
impl<F, TScore> ScoreSegmentTweaker<TScore> for F
|
||||
where
|
||||
F: 'static + FnMut(DocId, Score) -> TScore,
|
||||
where F: 'static + FnMut(DocId, Score) -> TScore
|
||||
{
|
||||
fn score(&mut self, doc: DocId, score: Score) -> TScore {
|
||||
(self)(doc, score)
|
||||
|
||||
@@ -57,7 +57,11 @@ impl Executor {
|
||||
let (idx, arg) = arg_with_idx;
|
||||
let fruit = f(arg);
|
||||
if let Err(err) = fruit_sender.send((idx, fruit)) {
|
||||
error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
|
||||
error!(
|
||||
"Failed to send search task. It probably means all search \
|
||||
threads have panicked. {:?}",
|
||||
err
|
||||
);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1,35 +1,27 @@
|
||||
use super::{segment::Segment, IndexSettings};
|
||||
use crate::core::Executor;
|
||||
use crate::core::IndexMeta;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::core::SegmentMetaInventory;
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::error::OpenReadError;
|
||||
use crate::directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
|
||||
use crate::directory::MmapDirectory;
|
||||
use crate::directory::INDEX_WRITER_LOCK;
|
||||
use crate::directory::{Directory, RamDirectory};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::error::TantivyError;
|
||||
use crate::indexer::index_writer::{HEAP_SIZE_MIN, MAX_NUM_THREAD};
|
||||
use crate::indexer::segment_updater::save_new_metas;
|
||||
use crate::reader::IndexReader;
|
||||
use crate::reader::IndexReaderBuilder;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::FieldType;
|
||||
use crate::schema::Schema;
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::IndexWriter;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::segment::Segment;
|
||||
use super::IndexSettings;
|
||||
use crate::core::{
|
||||
Executor, IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory, META_FILEPATH,
|
||||
};
|
||||
use crate::directory::error::OpenReadError;
|
||||
#[cfg(feature = "mmap")]
|
||||
use crate::directory::MmapDirectory;
|
||||
use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
|
||||
use crate::error::{DataCorruption, TantivyError};
|
||||
use crate::indexer::index_writer::{HEAP_SIZE_MIN, MAX_NUM_THREAD};
|
||||
use crate::indexer::segment_updater::save_new_metas;
|
||||
use crate::reader::{IndexReader, IndexReaderBuilder};
|
||||
use crate::schema::{Field, FieldType, Schema};
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::IndexWriter;
|
||||
|
||||
fn load_metas(
|
||||
directory: &dyn Directory,
|
||||
inventory: &SegmentMetaInventory,
|
||||
@@ -78,7 +70,6 @@ fn load_metas(
|
||||
/// let schema = schema_builder.build();
|
||||
/// let settings = IndexSettings{sort_by_field: Some(IndexSortByField{field:"number".to_string(), order:Order::Asc}), ..Default::default()};
|
||||
/// let index = Index::builder().schema(schema).settings(settings).create_in_ram();
|
||||
///
|
||||
/// ```
|
||||
pub struct IndexBuilder {
|
||||
schema: Option<Schema>,
|
||||
@@ -416,10 +407,9 @@ impl Index {
|
||||
TantivyError::LockFailure(
|
||||
err,
|
||||
Some(
|
||||
"Failed to acquire index lock. If you are using \
|
||||
a regular directory, this means there is already an \
|
||||
`IndexWriter` working on this `Directory`, in this process \
|
||||
or in a different process."
|
||||
"Failed to acquire index lock. If you are using a regular directory, this \
|
||||
means there is already an `IndexWriter` working on this `Directory`, in \
|
||||
this process or in a different process."
|
||||
.to_string(),
|
||||
),
|
||||
)
|
||||
@@ -462,13 +452,11 @@ impl Index {
|
||||
}
|
||||
|
||||
/// Accessor to the index settings
|
||||
///
|
||||
pub fn settings(&self) -> &IndexSettings {
|
||||
&self.settings
|
||||
}
|
||||
|
||||
/// Accessor to the index settings
|
||||
///
|
||||
pub fn settings_mut(&mut self) -> &mut IndexSettings {
|
||||
&mut self.settings
|
||||
}
|
||||
@@ -556,15 +544,9 @@ impl fmt::Debug for Index {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::schema::Field;
|
||||
use crate::schema::{Schema, INDEXED, TEXT};
|
||||
use crate::IndexReader;
|
||||
use crate::ReloadPolicy;
|
||||
use crate::{
|
||||
directory::{RamDirectory, WatchCallback},
|
||||
IndexSettings,
|
||||
};
|
||||
use crate::{Directory, Index};
|
||||
use crate::directory::{RamDirectory, WatchCallback};
|
||||
use crate::schema::{Field, Schema, INDEXED, TEXT};
|
||||
use crate::{Directory, Index, IndexReader, IndexSettings, ReloadPolicy};
|
||||
|
||||
#[test]
|
||||
fn test_indexer_for_field() {
|
||||
@@ -673,10 +655,12 @@ mod tests {
|
||||
#[cfg(feature = "mmap")]
|
||||
mod mmap_specific {
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use tempfile::TempDir;
|
||||
|
||||
use super::*;
|
||||
use crate::Directory;
|
||||
use std::path::PathBuf;
|
||||
use tempfile::TempDir;
|
||||
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy_mmap() -> crate::Result<()> {
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
use super::SegmentComponent;
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use crate::{core::SegmentId, store::Compressor};
|
||||
use crate::{Inventory, TrackedObject};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::{collections::HashSet, sync::atomic::AtomicBool};
|
||||
use std::{fmt, sync::Arc};
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Arc;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::SegmentComponent;
|
||||
use crate::core::SegmentId;
|
||||
use crate::schema::Schema;
|
||||
use crate::store::Compressor;
|
||||
use crate::{Inventory, Opstamp, TrackedObject};
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
@@ -282,7 +286,6 @@ impl Order {
|
||||
/// * the searchable segments,
|
||||
/// * the index `docstamp`
|
||||
/// * the schema
|
||||
///
|
||||
#[derive(Clone, Serialize)]
|
||||
pub struct IndexMeta {
|
||||
/// `IndexSettings` to configure index options.
|
||||
@@ -370,10 +373,8 @@ impl fmt::Debug for IndexMeta {
|
||||
mod tests {
|
||||
|
||||
use super::IndexMeta;
|
||||
use crate::{
|
||||
schema::{Schema, TEXT},
|
||||
IndexSettings, IndexSortByField, Order,
|
||||
};
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::{IndexSettings, IndexSortByField, Order};
|
||||
|
||||
#[test]
|
||||
fn test_serialize_metas() {
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
use std::io;
|
||||
|
||||
use common::BinarySerializable;
|
||||
|
||||
use crate::directory::FileSlice;
|
||||
use crate::positions::PositionReader;
|
||||
use crate::postings::TermInfo;
|
||||
use crate::postings::{BlockSegmentPostings, SegmentPostings};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::Term;
|
||||
use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo};
|
||||
use crate::schema::{IndexRecordOption, Term};
|
||||
use crate::termdict::TermDictionary;
|
||||
use common::BinarySerializable;
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
|
||||
@@ -8,6 +8,10 @@ mod segment_component;
|
||||
mod segment_id;
|
||||
mod segment_reader;
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub use self::executor::Executor;
|
||||
pub use self::index::{Index, IndexBuilder};
|
||||
pub use self::index_meta::{
|
||||
@@ -20,9 +24,6 @@ pub use self::segment_component::SegmentComponent;
|
||||
pub use self::segment_id::SegmentId;
|
||||
pub use self::segment_reader::SegmentReader;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use std::path::Path;
|
||||
|
||||
/// The meta file contains all the information about the list of segments and the schema
|
||||
/// of the index.
|
||||
pub static META_FILEPATH: Lazy<&'static Path> = Lazy::new(|| Path::new("meta.json"));
|
||||
|
||||
@@ -1,21 +1,14 @@
|
||||
use crate::collector::Collector;
|
||||
use crate::core::Executor;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::query::Query;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::Term;
|
||||
use crate::space_usage::SearcherSpaceUsage;
|
||||
use crate::store::StoreReader;
|
||||
use crate::DocAddress;
|
||||
use crate::Index;
|
||||
use crate::Opstamp;
|
||||
use crate::SegmentId;
|
||||
use crate::TrackedObject;
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::{fmt, io};
|
||||
|
||||
use crate::collector::Collector;
|
||||
use crate::core::{Executor, SegmentReader};
|
||||
use crate::query::Query;
|
||||
use crate::schema::{Document, Schema, Term};
|
||||
use crate::space_usage::SearcherSpaceUsage;
|
||||
use crate::store::StoreReader;
|
||||
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject};
|
||||
|
||||
/// Identifies the searcher generation accessed by a [Searcher].
|
||||
///
|
||||
/// While this might seem redundant, a [SearcherGeneration] contains
|
||||
@@ -69,7 +62,6 @@ impl SearcherGeneration {
|
||||
///
|
||||
/// It guarantees that the `Segment` will not be removed before
|
||||
/// the destruction of the `Searcher`.
|
||||
///
|
||||
pub struct Searcher {
|
||||
schema: Schema,
|
||||
index: Index,
|
||||
|
||||
@@ -1,15 +1,13 @@
|
||||
use super::SegmentComponent;
|
||||
use crate::core::Index;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::directory::error::{OpenReadError, OpenWriteError};
|
||||
use crate::directory::Directory;
|
||||
use crate::directory::{FileSlice, WritePtr};
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use super::SegmentComponent;
|
||||
use crate::core::{Index, SegmentId, SegmentMeta};
|
||||
use crate::directory::error::{OpenReadError, OpenWriteError};
|
||||
use crate::directory::{Directory, FileSlice, WritePtr};
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
|
||||
/// A segment is a piece of the index.
|
||||
#[derive(Clone)]
|
||||
pub struct Segment {
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use std::cmp::{Ord, Ordering};
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
use uuid::Uuid;
|
||||
use std::str::FromStr;
|
||||
#[cfg(test)]
|
||||
use std::sync::atomic;
|
||||
|
||||
#[cfg(test)]
|
||||
use once_cell::sync::Lazy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::error::Error;
|
||||
use std::str::FromStr;
|
||||
#[cfg(test)]
|
||||
use std::sync::atomic;
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Uuid identifying a segment.
|
||||
///
|
||||
|
||||
@@ -1,28 +1,19 @@
|
||||
use crate::core::InvertedIndexReader;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentComponent;
|
||||
use crate::core::SegmentId;
|
||||
use crate::directory::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::{fmt, io};
|
||||
|
||||
use fail::fail_point;
|
||||
|
||||
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::intersect_alive_bitsets;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::fastfield::FacetReader;
|
||||
use crate::fastfield::FastFieldReaders;
|
||||
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
|
||||
use crate::schema::FieldType;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::{Field, IndexRecordOption};
|
||||
use crate::schema::{Field, FieldType, IndexRecordOption, Schema};
|
||||
use crate::space_usage::SegmentSpaceUsage;
|
||||
use crate::store::StoreReader;
|
||||
use crate::termdict::TermDictionary;
|
||||
use crate::DocId;
|
||||
use crate::Opstamp;
|
||||
use fail::fail_point;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::{collections::HashMap, io};
|
||||
use crate::{DocId, Opstamp};
|
||||
|
||||
/// Entry point to access all of the datastructures of the `Segment`
|
||||
///
|
||||
@@ -130,7 +121,8 @@ impl SegmentReader {
|
||||
self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
|
||||
let field_name = self.schema.get_field_name(field);
|
||||
let err_msg = format!(
|
||||
"Field norm not found for field {:?}. Was the field set to record norm during indexing?",
|
||||
"Field norm not found for field {:?}. Was the field set to record norm during \
|
||||
indexing?",
|
||||
field_name
|
||||
);
|
||||
crate::TantivyError::SchemaError(err_msg)
|
||||
@@ -259,19 +251,24 @@ impl SegmentReader {
|
||||
let record_option = record_option_opt.unwrap();
|
||||
let postings_file = postings_file_opt.unwrap();
|
||||
|
||||
let termdict_file: FileSlice = self.termdict_composite.open_read(field)
|
||||
.ok_or_else(||
|
||||
DataCorruption::comment_only(format!("Failed to open field {:?}'s term dictionary in the composite file. Has the schema been modified?", field_entry.name()))
|
||||
)?;
|
||||
|
||||
let positions_file = self
|
||||
.positions_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| {
|
||||
let error_msg = format!("Failed to open field {:?}'s positions in the composite file. Has the schema been modified?", field_entry.name());
|
||||
DataCorruption::comment_only(error_msg)
|
||||
let termdict_file: FileSlice =
|
||||
self.termdict_composite.open_read(field).ok_or_else(|| {
|
||||
DataCorruption::comment_only(format!(
|
||||
"Failed to open field {:?}'s term dictionary in the composite file. Has the \
|
||||
schema been modified?",
|
||||
field_entry.name()
|
||||
))
|
||||
})?;
|
||||
|
||||
let positions_file = self.positions_composite.open_read(field).ok_or_else(|| {
|
||||
let error_msg = format!(
|
||||
"Failed to open field {:?}'s positions in the composite file. Has the schema been \
|
||||
modified?",
|
||||
field_entry.name()
|
||||
);
|
||||
DataCorruption::comment_only(error_msg)
|
||||
})?;
|
||||
|
||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||
TermDictionary::open(termdict_file)?,
|
||||
postings_file,
|
||||
|
||||
@@ -1,17 +1,14 @@
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::{TerminatingWrite, WritePtr};
|
||||
use crate::schema::Field;
|
||||
use crate::space_usage::FieldUsage;
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
use common::HasLen;
|
||||
use common::VInt;
|
||||
use std::collections::HashMap;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::iter::ExactSizeIterator;
|
||||
use std::ops::Range;
|
||||
|
||||
use common::{BinarySerializable, CountingWriter, HasLen, VInt};
|
||||
|
||||
use crate::directory::{FileSlice, TerminatingWrite, WritePtr};
|
||||
use crate::schema::Field;
|
||||
use crate::space_usage::{FieldUsage, PerFieldSpaceUsage};
|
||||
|
||||
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
|
||||
pub struct FileAddr {
|
||||
field: Field,
|
||||
@@ -186,13 +183,14 @@ impl CompositeFile {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
use common::{BinarySerializable, VInt};
|
||||
|
||||
use super::{CompositeFile, CompositeWrite};
|
||||
use crate::directory::{Directory, RamDirectory};
|
||||
use crate::schema::Field;
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
#[test]
|
||||
fn test_composite_file() -> crate::Result<()> {
|
||||
|
||||
@@ -1,18 +1,12 @@
|
||||
use crate::directory::directory_lock::Lock;
|
||||
use crate::directory::error::LockError;
|
||||
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use crate::directory::WatchHandle;
|
||||
use crate::directory::{FileHandle, WatchCallback};
|
||||
use crate::directory::{FileSlice, WritePtr};
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::marker::Send;
|
||||
use std::marker::Sync;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use std::thread;
|
||||
use std::marker::{Send, Sync};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::Duration;
|
||||
use std::{fmt, io, thread};
|
||||
|
||||
use crate::directory::directory_lock::Lock;
|
||||
use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError};
|
||||
use crate::directory::{FileHandle, FileSlice, WatchCallback, WatchHandle, WritePtr};
|
||||
|
||||
/// Retry the logic of acquiring locks is pretty simple.
|
||||
/// We just retry `n` times after a given `duratio`, both
|
||||
@@ -233,8 +227,7 @@ pub trait DirectoryClone {
|
||||
}
|
||||
|
||||
impl<T> DirectoryClone for T
|
||||
where
|
||||
T: 'static + Directory + Clone,
|
||||
where T: 'static + Directory + Clone
|
||||
{
|
||||
fn box_clone(&self) -> Box<dyn Directory> {
|
||||
Box::new(self.clone())
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use once_cell::sync::Lazy;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
/// A directory lock.
|
||||
///
|
||||
/// A lock is associated to a specific path and some
|
||||
@@ -11,7 +12,6 @@ use std::path::PathBuf;
|
||||
/// - [META_LOCK]
|
||||
///
|
||||
/// Check out these locks documentation for more information.
|
||||
///
|
||||
#[derive(Debug)]
|
||||
pub struct Lock {
|
||||
/// The lock needs to be associated with its own file `path`.
|
||||
|
||||
@@ -1,15 +1,17 @@
|
||||
use crate::Version;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::path::PathBuf;
|
||||
use std::{fmt, io};
|
||||
|
||||
use crate::Version;
|
||||
|
||||
/// Error while trying to acquire a directory lock.
|
||||
#[derive(Debug, Error)]
|
||||
pub enum LockError {
|
||||
/// Failed to acquired a lock as it is already held by another
|
||||
/// client.
|
||||
/// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
|
||||
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
|
||||
/// - In the context of a blocking lock, this means the lock was not released within some
|
||||
/// `timeout` period.
|
||||
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the
|
||||
/// call.
|
||||
#[error("Could not acquire lock as it is already held, possibly by a different process.")]
|
||||
LockBusy,
|
||||
/// Trying to acquire a lock failed with an `IoError`
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::{Arc, Weak};
|
||||
use std::{fmt, io};
|
||||
|
||||
use common::HasLen;
|
||||
use stable_deref_trait::StableDeref;
|
||||
|
||||
use crate::directory::OwnedBytes;
|
||||
use common::HasLen;
|
||||
use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::sync::{Arc, Weak};
|
||||
use std::{io, ops::Deref};
|
||||
|
||||
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
@@ -33,8 +33,7 @@ impl FileHandle for &'static [u8] {
|
||||
}
|
||||
|
||||
impl<B> From<B> for FileSlice
|
||||
where
|
||||
B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync,
|
||||
where B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync
|
||||
{
|
||||
fn from(bytes: B) -> FileSlice {
|
||||
FileSlice::new(Box::new(OwnedBytes::new(bytes)))
|
||||
@@ -44,7 +43,6 @@ where
|
||||
/// Logical slice of read only file in tantivy.
|
||||
///
|
||||
/// It can be cloned and sliced cheaply.
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct FileSlice {
|
||||
data: Arc<dyn FileHandle>,
|
||||
@@ -172,10 +170,12 @@ impl HasLen for FileSlice {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{FileHandle, FileSlice};
|
||||
use common::HasLen;
|
||||
use std::io;
|
||||
|
||||
use common::HasLen;
|
||||
|
||||
use super::{FileHandle, FileSlice};
|
||||
|
||||
#[test]
|
||||
fn test_file_slice() -> io::Result<()> {
|
||||
let file_slice = FileSlice::new(Box::new(b"abcdef".as_ref()));
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use crate::directory::{WatchCallback, WatchCallbackList, WatchHandle};
|
||||
use crc32fast::Hasher;
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::io::BufRead;
|
||||
use std::path::Path;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::{fs, io, thread};
|
||||
|
||||
use crc32fast::Hasher;
|
||||
|
||||
use crate::directory::{WatchCallback, WatchCallbackList, WatchHandle};
|
||||
|
||||
pub const POLLING_INTERVAL: Duration = Duration::from_millis(if cfg!(test) { 1 } else { 500 });
|
||||
|
||||
@@ -99,9 +99,8 @@ mod tests {
|
||||
|
||||
use std::mem;
|
||||
|
||||
use crate::directory::mmap_directory::atomic_write;
|
||||
|
||||
use super::*;
|
||||
use crate::directory::mmap_directory::atomic_write;
|
||||
|
||||
#[test]
|
||||
fn test_file_watcher_drop_watcher() -> crate::Result<()> {
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
use crate::directory::error::Incompatibility;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::{
|
||||
directory::{AntiCallToken, TerminatingWrite},
|
||||
Version, INDEX_FORMAT_VERSION,
|
||||
};
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
use common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen};
|
||||
use crc32fast::Hasher;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
use crate::directory::error::Incompatibility;
|
||||
use crate::directory::{AntiCallToken, FileSlice, TerminatingWrite};
|
||||
use crate::{Version, INDEX_FORMAT_VERSION};
|
||||
|
||||
const FOOTER_MAX_LEN: u32 = 50_000;
|
||||
|
||||
@@ -64,7 +63,9 @@ impl Footer {
|
||||
if footer_magic_byte != FOOTER_MAGIC_NUMBER {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Footer magic byte mismatch. File corrupted or index was created using old an tantivy version which is not supported anymore. Please use tantivy 0.15 or above to recreate the index.",
|
||||
"Footer magic byte mismatch. File corrupted or index was created using old an \
|
||||
tantivy version which is not supported anymore. Please use tantivy 0.15 or above \
|
||||
to recreate the index.",
|
||||
));
|
||||
}
|
||||
|
||||
@@ -73,7 +74,7 @@ impl Footer {
|
||||
io::ErrorKind::InvalidData,
|
||||
format!(
|
||||
"Footer seems invalid as it suggests a footer len of {}. File is corrupted, \
|
||||
or the index was created with a different & old version of tantivy.",
|
||||
or the index was created with a different & old version of tantivy.",
|
||||
footer_len
|
||||
),
|
||||
));
|
||||
@@ -154,12 +155,13 @@ impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use crate::directory::footer::Footer;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice};
|
||||
use common::BinarySerializable;
|
||||
use std::io;
|
||||
|
||||
use common::BinarySerializable;
|
||||
|
||||
use crate::directory::footer::{Footer, FOOTER_MAGIC_NUMBER};
|
||||
use crate::directory::{FileSlice, OwnedBytes};
|
||||
|
||||
#[test]
|
||||
fn test_deserialize_footer() {
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
@@ -183,8 +185,9 @@ mod tests {
|
||||
let err = Footer::extract_footer(fileslice).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"Footer magic byte mismatch. File corrupted or index was created using old an tantivy version which \
|
||||
is not supported anymore. Please use tantivy 0.15 or above to recreate the index."
|
||||
"Footer magic byte mismatch. File corrupted or index was created using old an tantivy \
|
||||
version which is not supported anymore. Please use tantivy 0.15 or above to recreate \
|
||||
the index."
|
||||
);
|
||||
}
|
||||
#[test]
|
||||
@@ -219,8 +222,8 @@ mod tests {
|
||||
assert_eq!(err.kind(), io::ErrorKind::InvalidData);
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"Footer seems invalid as it suggests a footer len of 50001. File is corrupted, \
|
||||
or the index was created with a different & old version of tantivy."
|
||||
"Footer seems invalid as it suggests a footer len of 50001. File is corrupted, or the \
|
||||
index was created with a different & old version of tantivy."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,24 +1,21 @@
|
||||
use std::collections::HashSet;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, RwLock, RwLockWriteGuard};
|
||||
use std::{io, result};
|
||||
|
||||
use crc32fast::Hasher;
|
||||
|
||||
use crate::core::MANAGED_FILEPATH;
|
||||
use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError};
|
||||
use crate::directory::footer::{Footer, FooterProxy};
|
||||
use crate::directory::GarbageCollectionResult;
|
||||
use crate::directory::Lock;
|
||||
use crate::directory::META_LOCK;
|
||||
use crate::directory::{DirectoryLock, FileHandle};
|
||||
use crate::directory::{FileSlice, WritePtr};
|
||||
use crate::directory::{WatchCallback, WatchHandle};
|
||||
use crate::directory::{
|
||||
DirectoryLock, FileHandle, FileSlice, GarbageCollectionResult, Lock, WatchCallback,
|
||||
WatchHandle, WritePtr, META_LOCK,
|
||||
};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::Directory;
|
||||
|
||||
use crc32fast::Hasher;
|
||||
use std::collections::HashSet;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::RwLockWriteGuard;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
/// Returns true iff the file is "managed".
|
||||
/// Non-managed file are not subject to garbage collection.
|
||||
///
|
||||
@@ -344,12 +341,14 @@ impl Clone for ManagedDirectory {
|
||||
#[cfg(test)]
|
||||
mod tests_mmap_specific {
|
||||
|
||||
use crate::directory::{Directory, ManagedDirectory, MmapDirectory, TerminatingWrite};
|
||||
use std::collections::HashSet;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use tempfile::TempDir;
|
||||
|
||||
use crate::directory::{Directory, ManagedDirectory, MmapDirectory, TerminatingWrite};
|
||||
|
||||
#[test]
|
||||
fn test_managed_directory() {
|
||||
let tempdir = TempDir::new().unwrap();
|
||||
|
||||
@@ -1,32 +1,28 @@
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::error::LockError;
|
||||
use crate::directory::error::{DeleteError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
use crate::directory::file_watcher::FileWatcher;
|
||||
use crate::directory::Directory;
|
||||
use crate::directory::DirectoryLock;
|
||||
use crate::directory::Lock;
|
||||
use crate::directory::WatchCallback;
|
||||
use crate::directory::WatchHandle;
|
||||
use crate::directory::{AntiCallToken, FileHandle, OwnedBytes};
|
||||
use crate::directory::{ArcBytes, WeakArcBytes};
|
||||
use crate::directory::{TerminatingWrite, WritePtr};
|
||||
use std::collections::HashMap;
|
||||
use std::convert::From;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{self, BufWriter, Read, Seek, SeekFrom, Write};
|
||||
use std::ops::Deref;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::{fmt, result};
|
||||
|
||||
use fs2::FileExt;
|
||||
use memmap2::Mmap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use stable_deref_trait::StableDeref;
|
||||
use std::convert::From;
|
||||
use std::fmt;
|
||||
use std::fs::OpenOptions;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, Seek, SeekFrom};
|
||||
use std::io::{BufWriter, Read, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::{collections::HashMap, ops::Deref};
|
||||
use tempfile::TempDir;
|
||||
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::error::{
|
||||
DeleteError, LockError, OpenDirectoryError, OpenReadError, OpenWriteError,
|
||||
};
|
||||
use crate::directory::file_watcher::FileWatcher;
|
||||
use crate::directory::{
|
||||
AntiCallToken, ArcBytes, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes,
|
||||
TerminatingWrite, WatchCallback, WatchHandle, WeakArcBytes, WritePtr,
|
||||
};
|
||||
|
||||
/// Create a default io error given a string.
|
||||
pub(crate) fn make_io_err(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, msg)
|
||||
@@ -320,8 +316,7 @@ impl Directory for MmapDirectory {
|
||||
|
||||
let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| {
|
||||
let msg = format!(
|
||||
"Failed to acquired write lock \
|
||||
on mmap cache while reading {:?}",
|
||||
"Failed to acquired write lock on mmap cache while reading {:?}",
|
||||
path
|
||||
);
|
||||
let io_err = make_io_err(msg);
|
||||
@@ -457,6 +452,7 @@ impl Directory for MmapDirectory {
|
||||
#[cfg(windows)]
|
||||
{
|
||||
use std::os::windows::fs::OpenOptionsExt;
|
||||
|
||||
use winapi::um::winbase;
|
||||
|
||||
open_opts
|
||||
@@ -476,15 +472,12 @@ mod tests {
|
||||
// There are more tests in directory/mod.rs
|
||||
// The following tests are specific to the MmapDirectory
|
||||
|
||||
use common::HasLen;
|
||||
|
||||
use super::*;
|
||||
use crate::indexer::LogMergePolicy;
|
||||
use crate::Index;
|
||||
use crate::ReloadPolicy;
|
||||
use crate::{
|
||||
schema::{Schema, SchemaBuilder, TEXT},
|
||||
IndexSettings,
|
||||
};
|
||||
use common::HasLen;
|
||||
use crate::schema::{Schema, SchemaBuilder, TEXT};
|
||||
use crate::{Index, IndexSettings, ReloadPolicy};
|
||||
|
||||
#[test]
|
||||
fn test_open_non_existent_path() {
|
||||
|
||||
@@ -1,8 +1,4 @@
|
||||
/*!
|
||||
|
||||
WORM (Write Once Read Many) directory abstraction.
|
||||
|
||||
*/
|
||||
//! WORM (Write Once Read Many) directory abstraction.
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
mod mmap_directory;
|
||||
@@ -22,19 +18,19 @@ pub mod error;
|
||||
|
||||
mod composite_file;
|
||||
|
||||
use std::io::BufWriter;
|
||||
use std::path::PathBuf;
|
||||
|
||||
pub use common::{AntiCallToken, TerminatingWrite};
|
||||
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::directory::DirectoryLock;
|
||||
pub use self::directory::{Directory, DirectoryClone};
|
||||
pub use self::directory::{Directory, DirectoryClone, DirectoryLock};
|
||||
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
|
||||
pub(crate) use self::file_slice::{ArcBytes, WeakArcBytes};
|
||||
pub use self::file_slice::{FileHandle, FileSlice};
|
||||
pub use self::owned_bytes::OwnedBytes;
|
||||
pub use self::ram_directory::RamDirectory;
|
||||
pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
|
||||
pub use common::AntiCallToken;
|
||||
pub use common::TerminatingWrite;
|
||||
use std::io::BufWriter;
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Outcome of the Garbage collection
|
||||
pub struct GarbageCollectionResult {
|
||||
@@ -50,11 +46,10 @@ pub struct GarbageCollectionResult {
|
||||
pub failed_to_delete_files: Vec<PathBuf>,
|
||||
}
|
||||
|
||||
pub use self::managed_directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
|
||||
pub use self::mmap_directory::MmapDirectory;
|
||||
|
||||
pub use self::managed_directory::ManagedDirectory;
|
||||
|
||||
/// Write object for Directory.
|
||||
///
|
||||
/// `WritePtr` are required to implement both Write
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
use crate::directory::FileHandle;
|
||||
use std::io;
|
||||
use std::ops::Range;
|
||||
|
||||
pub use ownedbytes::OwnedBytes;
|
||||
|
||||
use crate::directory::FileHandle;
|
||||
|
||||
impl FileHandle for OwnedBytes {
|
||||
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
|
||||
Ok(self.slice(range))
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use crate::directory::AntiCallToken;
|
||||
use crate::directory::WatchCallbackList;
|
||||
use crate::directory::{Directory, FileSlice, WatchCallback, WatchHandle};
|
||||
use crate::directory::{TerminatingWrite, WritePtr};
|
||||
use common::HasLen;
|
||||
use fail::fail_point;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::{fmt, result};
|
||||
|
||||
use common::HasLen;
|
||||
use fail::fail_point;
|
||||
|
||||
use super::FileHandle;
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use crate::directory::{
|
||||
AntiCallToken, Directory, FileSlice, TerminatingWrite, WatchCallback, WatchCallbackList,
|
||||
WatchHandle, WritePtr,
|
||||
};
|
||||
|
||||
/// Writer associated with the `RamDirectory`
|
||||
///
|
||||
@@ -40,7 +40,9 @@ impl Drop for VecWriter {
|
||||
fn drop(&mut self) {
|
||||
if !self.is_flushed {
|
||||
warn!(
|
||||
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.",
|
||||
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This \
|
||||
also occurs when the indexer crashed, so you may want to check the logs for the \
|
||||
root cause.",
|
||||
self.path
|
||||
)
|
||||
}
|
||||
@@ -123,7 +125,6 @@ impl fmt::Debug for RamDirectory {
|
||||
///
|
||||
/// It is mainly meant for unit testing.
|
||||
/// Writes are only made visible upon flushing.
|
||||
///
|
||||
#[derive(Clone, Default)]
|
||||
pub struct RamDirectory {
|
||||
fs: Arc<RwLock<InnerDirectory>>,
|
||||
@@ -233,11 +234,12 @@ impl Directory for RamDirectory {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::RamDirectory;
|
||||
use crate::Directory;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
use super::RamDirectory;
|
||||
use crate::Directory;
|
||||
|
||||
#[test]
|
||||
fn test_persist() {
|
||||
let msg_atomic: &'static [u8] = b"atomic is the way";
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
use super::*;
|
||||
use futures::channel::oneshot;
|
||||
use futures::executor::block_on;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -9,6 +6,11 @@ use std::sync::atomic::{AtomicBool, AtomicUsize};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use futures::channel::oneshot;
|
||||
use futures::executor::block_on;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
mod mmap_directory_tests {
|
||||
use crate::directory::MmapDirectory;
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use std::sync::{Arc, RwLock, Weak};
|
||||
|
||||
use futures::channel::oneshot;
|
||||
use futures::{Future, TryFutureExt};
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::Weak;
|
||||
|
||||
/// Cloneable wrapper for callbacks registered when watching files of a `Directory`.
|
||||
#[derive(Clone)]
|
||||
@@ -103,12 +102,14 @@ impl WatchCallbackList {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::directory::{WatchCallback, WatchCallbackList};
|
||||
use futures::executor::block_on;
|
||||
use std::mem;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use futures::executor::block_on;
|
||||
|
||||
use crate::directory::{WatchCallback, WatchCallbackList};
|
||||
|
||||
#[test]
|
||||
fn test_watch_event_router_simple() {
|
||||
let watch_event_router = WatchCallbackList::default();
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::borrow::{Borrow, BorrowMut};
|
||||
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::DocId;
|
||||
use std::borrow::Borrow;
|
||||
use std::borrow::BorrowMut;
|
||||
|
||||
/// Sentinel value returned when a DocSet has been entirely consumed.
|
||||
///
|
||||
|
||||
17
src/error.rs
17
src/error.rs
@@ -1,17 +1,14 @@
|
||||
//! Definition of Tantivy's error and result.
|
||||
|
||||
use std::io;
|
||||
|
||||
use crate::directory::error::{Incompatibility, LockError};
|
||||
use crate::fastfield::FastFieldNotAvailableError;
|
||||
use crate::query;
|
||||
use crate::{
|
||||
directory::error::{OpenDirectoryError, OpenReadError, OpenWriteError},
|
||||
schema,
|
||||
};
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::PoisonError;
|
||||
use std::{fmt, io};
|
||||
|
||||
use crate::directory::error::{
|
||||
Incompatibility, LockError, OpenDirectoryError, OpenReadError, OpenWriteError,
|
||||
};
|
||||
use crate::fastfield::FastFieldNotAvailableError;
|
||||
use crate::{query, schema};
|
||||
|
||||
/// Represents a `DataCorruption` error.
|
||||
///
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use crate::space_usage::ByteCount;
|
||||
use crate::DocId;
|
||||
use common::intersect_bitsets;
|
||||
use common::BitSet;
|
||||
use common::ReadOnlyBitSet;
|
||||
use ownedbytes::OwnedBytes;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
use common::{intersect_bitsets, BitSet, ReadOnlyBitSet};
|
||||
use ownedbytes::OwnedBytes;
|
||||
|
||||
use crate::space_usage::ByteCount;
|
||||
use crate::DocId;
|
||||
|
||||
/// Write a alive `BitSet`
|
||||
///
|
||||
/// where `alive_bitset` is the set of alive `DocId`.
|
||||
@@ -168,11 +168,12 @@ mod tests {
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::AliveBitSet;
|
||||
use rand::prelude::IteratorRandom;
|
||||
use rand::thread_rng;
|
||||
use test::Bencher;
|
||||
|
||||
use super::AliveBitSet;
|
||||
|
||||
fn get_alive() -> Vec<u32> {
|
||||
let mut data = (0..1_000_000_u32).collect::<Vec<u32>>();
|
||||
for _ in 0..(1_000_000) * 1 / 8 {
|
||||
|
||||
@@ -6,11 +6,12 @@ pub use self::writer::BytesFastFieldWriter;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::schema::{BytesOptions, IndexRecordOption, Schema, Value};
|
||||
use crate::{query::TermQuery, schema::FAST, schema::INDEXED, schema::STORED};
|
||||
use crate::{DocAddress, DocSet, Index, Searcher, Term};
|
||||
use std::ops::Deref;
|
||||
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::{BytesOptions, IndexRecordOption, Schema, Value, FAST, INDEXED, STORED};
|
||||
use crate::{DocAddress, DocSet, Index, Searcher, Term};
|
||||
|
||||
#[test]
|
||||
fn test_bytes() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -62,7 +63,7 @@ mod tests {
|
||||
assert_eq!(values.len(), 2);
|
||||
let values_bytes: Vec<&[u8]> = values
|
||||
.into_iter()
|
||||
.flat_map(|value| value.bytes_value())
|
||||
.flat_map(|value| value.as_bytes())
|
||||
.collect();
|
||||
assert_eq!(values_bytes, &[&b"tantivy"[..], &b"lucene"[..]]);
|
||||
Ok(())
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::directory::{FileSlice, OwnedBytes};
|
||||
use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, MultiValueLength};
|
||||
use crate::DocId;
|
||||
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
use std::io;
|
||||
|
||||
use crate::fastfield::serializer::CompositeFastFieldSerializer;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::schema::{Document, Field, Value};
|
||||
use crate::DocId;
|
||||
use crate::{
|
||||
fastfield::serializer::CompositeFastFieldSerializer, indexer::doc_id_mapping::DocIdMapping,
|
||||
};
|
||||
|
||||
/// Writer for byte array (as in, any number of bytes per document) fast fields
|
||||
///
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use crate::schema::FieldEntry;
|
||||
use std::result;
|
||||
|
||||
use crate::schema::FieldEntry;
|
||||
|
||||
/// `FastFieldNotAvailableError` is returned when the
|
||||
/// user requested for a fast field reader, and the field was not
|
||||
/// defined in the schema as a fast field.
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use std::str;
|
||||
|
||||
use super::MultiValuedFastFieldReader;
|
||||
use crate::error::DataCorruption;
|
||||
use crate::schema::Facet;
|
||||
use crate::termdict::TermDictionary;
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::termdict::{TermDictionary, TermOrdinal};
|
||||
use crate::DocId;
|
||||
use std::str;
|
||||
|
||||
/// The facet reader makes it possible to access the list of
|
||||
/// facets associated to a given document in a specific
|
||||
@@ -82,11 +82,8 @@ impl FacetReader {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::Index;
|
||||
use crate::{
|
||||
schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED},
|
||||
DocAddress, Document,
|
||||
};
|
||||
use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED};
|
||||
use crate::{DocAddress, Document, Index};
|
||||
|
||||
#[test]
|
||||
fn test_facet_only_indexed() -> crate::Result<()> {
|
||||
@@ -106,7 +103,7 @@ mod tests {
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert_eq!(&facet_ords, &[2u64]);
|
||||
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let value = doc.get_first(facet_field).and_then(Value::facet);
|
||||
let value = doc.get_first(facet_field).and_then(Value::as_facet);
|
||||
assert_eq!(value, None);
|
||||
Ok(())
|
||||
}
|
||||
@@ -129,7 +126,7 @@ mod tests {
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert_eq!(&facet_ords, &[2u64]);
|
||||
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::facet);
|
||||
let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::as_facet);
|
||||
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,51 +1,39 @@
|
||||
/*!
|
||||
Column oriented field storage for tantivy.
|
||||
//! Column oriented field storage for tantivy.
|
||||
//!
|
||||
//! It is the equivalent of `Lucene`'s `DocValues`.
|
||||
//!
|
||||
//! Fast fields is a column-oriented fashion storage of `tantivy`.
|
||||
//!
|
||||
//! It is designed for the fast random access of some document
|
||||
//! fields given a document id.
|
||||
//!
|
||||
//! `FastField` are useful when a field is required for all or most of
|
||||
//! the `DocSet` : for instance for scoring, grouping, filtering, or faceting.
|
||||
//!
|
||||
//!
|
||||
//! Fields have to be declared as `FAST` in the schema.
|
||||
//! Currently only 64-bits integers (signed or unsigned) are
|
||||
//! supported.
|
||||
//!
|
||||
//! They are stored in a bit-packed fashion so that their
|
||||
//! memory usage is directly linear with the amplitude of the
|
||||
//! values stored.
|
||||
//!
|
||||
//! Read access performance is comparable to that of an array lookup.
|
||||
|
||||
It is the equivalent of `Lucene`'s `DocValues`.
|
||||
|
||||
Fast fields is a column-oriented fashion storage of `tantivy`.
|
||||
|
||||
It is designed for the fast random access of some document
|
||||
fields given a document id.
|
||||
|
||||
`FastField` are useful when a field is required for all or most of
|
||||
the `DocSet` : for instance for scoring, grouping, filtering, or faceting.
|
||||
|
||||
|
||||
Fields have to be declared as `FAST` in the schema.
|
||||
Currently only 64-bits integers (signed or unsigned) are
|
||||
supported.
|
||||
|
||||
They are stored in a bit-packed fashion so that their
|
||||
memory usage is directly linear with the amplitude of the
|
||||
values stored.
|
||||
|
||||
Read access performance is comparable to that of an array lookup.
|
||||
*/
|
||||
|
||||
pub use self::alive_bitset::intersect_alive_bitsets;
|
||||
pub use self::alive_bitset::write_alive_bitset;
|
||||
pub use self::alive_bitset::AliveBitSet;
|
||||
pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet};
|
||||
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
|
||||
pub(crate) use self::reader::BitpackedFastFieldReader;
|
||||
pub use self::reader::DynamicFastFieldReader;
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::reader::{DynamicFastFieldReader, FastFieldReader};
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub use self::serializer::CompositeFastFieldSerializer;
|
||||
pub use self::serializer::FastFieldDataAccess;
|
||||
pub use self::serializer::FastFieldStats;
|
||||
pub use self::serializer::{CompositeFastFieldSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::FieldType;
|
||||
use crate::schema::Value;
|
||||
use crate::chrono::{NaiveDateTime, Utc};
|
||||
use crate::schema::{Cardinality, FieldType, Type, Value};
|
||||
use crate::DocId;
|
||||
use crate::{
|
||||
chrono::{NaiveDateTime, Utc},
|
||||
schema::Type,
|
||||
};
|
||||
|
||||
mod alive_bitset;
|
||||
mod bytes;
|
||||
@@ -213,22 +201,20 @@ fn value_to_u64(value: &Value) -> u64 {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::directory::CompositeFile;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::FAST;
|
||||
use crate::schema::{Document, IntOptions};
|
||||
use crate::{Index, SegmentId, SegmentReader};
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
use common::HasLen;
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::SeedableRng;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
use super::*;
|
||||
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::schema::{Document, Field, IntOptions, Schema, FAST};
|
||||
use crate::{Index, SegmentId, SegmentReader};
|
||||
|
||||
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -407,7 +393,7 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
//assert_eq!(file.len(), 17710 as usize); //bitpacked size
|
||||
// assert_eq!(file.len(), 17710 as usize); //bitpacked size
|
||||
assert_eq!(file.len(), 10175_usize); // linear interpol size
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
@@ -587,16 +573,16 @@ mod tests {
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::tests::FIELD;
|
||||
use super::tests::{generate_permutation, SCHEMA};
|
||||
use super::*;
|
||||
use crate::directory::CompositeFile;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
use test::{self, Bencher};
|
||||
|
||||
use super::tests::{generate_permutation, FIELD, SCHEMA};
|
||||
use super::*;
|
||||
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
|
||||
@@ -7,23 +7,17 @@ pub use self::writer::MultiValuedFastFieldWriter;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use chrono::Duration;
|
||||
use futures::executor::block_on;
|
||||
use proptest::strategy::Strategy;
|
||||
use proptest::{prop_oneof, proptest};
|
||||
use test_log::test;
|
||||
|
||||
use crate::collector::TopDocs;
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::FacetOptions;
|
||||
use crate::schema::IntOptions;
|
||||
use crate::schema::Schema;
|
||||
use crate::Document;
|
||||
use crate::Index;
|
||||
use crate::Term;
|
||||
use chrono::Duration;
|
||||
use futures::executor::block_on;
|
||||
use proptest::prop_oneof;
|
||||
use proptest::proptest;
|
||||
use proptest::strategy::Strategy;
|
||||
use test_log::test;
|
||||
use crate::schema::{Cardinality, Facet, FacetOptions, IntOptions, Schema};
|
||||
use crate::{Document, Index, Term};
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_u64() -> crate::Result<()> {
|
||||
@@ -110,7 +104,7 @@ mod tests {
|
||||
retrieved_doc
|
||||
.get_first(date_field)
|
||||
.expect("cannot find value")
|
||||
.date_value()
|
||||
.as_date()
|
||||
.unwrap()
|
||||
.timestamp(),
|
||||
first_time_stamp.timestamp()
|
||||
@@ -119,7 +113,7 @@ mod tests {
|
||||
retrieved_doc
|
||||
.get_first(time_i)
|
||||
.expect("cannot find value")
|
||||
.i64_value(),
|
||||
.as_i64(),
|
||||
Some(1i64)
|
||||
);
|
||||
}
|
||||
@@ -138,7 +132,7 @@ mod tests {
|
||||
retrieved_doc
|
||||
.get_first(date_field)
|
||||
.expect("cannot find value")
|
||||
.date_value()
|
||||
.as_date()
|
||||
.unwrap()
|
||||
.timestamp(),
|
||||
two_secs_ahead.timestamp()
|
||||
@@ -147,7 +141,7 @@ mod tests {
|
||||
retrieved_doc
|
||||
.get_first(time_i)
|
||||
.expect("cannot find value")
|
||||
.i64_value(),
|
||||
.as_i64(),
|
||||
Some(3i64)
|
||||
);
|
||||
}
|
||||
@@ -180,7 +174,7 @@ mod tests {
|
||||
retrieved_doc
|
||||
.get_first(date_field)
|
||||
.expect("cannot find value")
|
||||
.date_value()
|
||||
.as_date()
|
||||
.expect("value not of Date type")
|
||||
.timestamp(),
|
||||
(first_time_stamp + Duration::seconds(offset_sec)).timestamp()
|
||||
@@ -189,7 +183,7 @@ mod tests {
|
||||
retrieved_doc
|
||||
.get_first(time_i)
|
||||
.expect("cannot find value")
|
||||
.i64_value(),
|
||||
.as_i64(),
|
||||
Some(time_i_val)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -10,7 +10,6 @@ use crate::DocId;
|
||||
/// The `vals_reader` will access the concatenated list of all
|
||||
/// values for all reader.
|
||||
/// The `idx_reader` associated, for each document, the index of its first value.
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct MultiValuedFastFieldReader<Item: FastValue> {
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
use std::io;
|
||||
|
||||
use fnv::FnvHashMap;
|
||||
use tantivy_bitpacker::minmax;
|
||||
|
||||
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Document, Field};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::DocId;
|
||||
use crate::{fastfield::value_to_u64, indexer::doc_id_mapping::DocIdMapping};
|
||||
use fnv::FnvHashMap;
|
||||
use std::io;
|
||||
use tantivy_bitpacker::minmax;
|
||||
|
||||
/// Writer for multi-valued (as in, more than one value per document)
|
||||
/// int fast field.
|
||||
@@ -20,7 +22,8 @@ use tantivy_bitpacker::minmax;
|
||||
/// - add your document simply by calling `.add_document(...)`.
|
||||
///
|
||||
/// The `MultiValuedFastFieldWriter` can be acquired from the
|
||||
/// fastfield writer, by calling [`.get_multivalue_writer(...)`](./struct.FastFieldsWriter.html#method.get_multivalue_writer).
|
||||
/// fastfield writer, by calling
|
||||
/// [`.get_multivalue_writer(...)`](./struct.FastFieldsWriter.html#method.get_multivalue_writer).
|
||||
///
|
||||
/// Once acquired, writing is done by calling calls to
|
||||
/// `.add_document_vals(&[u64])` once per document.
|
||||
@@ -131,7 +134,6 @@ impl MultiValuedFastFieldWriter {
|
||||
/// During the serialization of the segment, terms gets sorted and
|
||||
/// `tantivy` builds a mapping to convert this `UnorderedTermId` into
|
||||
/// term ordinals.
|
||||
///
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
|
||||
@@ -1,25 +1,25 @@
|
||||
use super::FastValue;
|
||||
use crate::directory::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::FAST;
|
||||
use crate::DocId;
|
||||
use common::BinarySerializable;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::FastFieldCodecReader;
|
||||
use fastfield_codecs::FastFieldCodecSerializer;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::path::Path;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use fastfield_codecs::bitpacked::{
|
||||
BitpackedFastFieldReader as BitpackedReader, BitpackedFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::linearinterpol::{
|
||||
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer};
|
||||
|
||||
use super::FastValue;
|
||||
use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr};
|
||||
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
|
||||
use crate::schema::{Schema, FAST};
|
||||
use crate::DocId;
|
||||
|
||||
/// FastFieldReader is the trait to access fast field data.
|
||||
pub trait FastFieldReader<Item: FastValue>: Clone {
|
||||
/// Return the value associated to the given document.
|
||||
@@ -64,7 +64,6 @@ pub trait FastFieldReader<Item: FastValue>: Clone {
|
||||
#[derive(Clone)]
|
||||
/// DynamicFastFieldReader wraps different readers to access
|
||||
/// the various encoded fastfield data
|
||||
///
|
||||
pub enum DynamicFastFieldReader<Item: FastValue> {
|
||||
/// Bitpacked compressed fastfield data.
|
||||
Bitpacked(FastFieldReaderCodecWrapper<Item, BitpackedReader>),
|
||||
@@ -146,7 +145,6 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
/// Wrapper for accessing a fastfield.
|
||||
///
|
||||
/// Holds the data and the codec to the read the data.
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
|
||||
reader: CodecReader,
|
||||
@@ -162,7 +160,8 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
|
||||
assert_eq!(
|
||||
BitpackedFastFieldSerializer::ID,
|
||||
id,
|
||||
"Tried to open fast field as bitpacked encoded (id=1), but got serializer with different id"
|
||||
"Tried to open fast field as bitpacked encoded (id=1), but got serializer with \
|
||||
different id"
|
||||
);
|
||||
Self::open_from_bytes(bytes)
|
||||
}
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
use crate::directory::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::fastfield::MultiValuedFastFieldReader;
|
||||
use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError};
|
||||
use crate::fastfield::{BytesFastFieldReader, FastValue};
|
||||
use super::reader::DynamicFastFieldReader;
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
use crate::fastfield::{
|
||||
BitpackedFastFieldReader, BytesFastFieldReader, FastFieldNotAvailableError, FastValue,
|
||||
MultiValuedFastFieldReader,
|
||||
};
|
||||
use crate::schema::{Cardinality, Field, FieldType, Schema};
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
use crate::TantivyError;
|
||||
|
||||
use super::reader::DynamicFastFieldReader;
|
||||
|
||||
/// Provides access to all of the BitpackedFastFieldReader.
|
||||
///
|
||||
/// Internally, `FastFieldReaders` have preloaded fast field readers,
|
||||
@@ -131,10 +130,11 @@ impl FastFieldReaders {
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
|
||||
/// Returns the `u64` fast field reader reader associated to `field`, regardless of whether the given
|
||||
/// field is effectively of type `u64` or not.
|
||||
/// Returns the `u64` fast field reader reader associated to `field`, regardless of whether the
|
||||
/// given field is effectively of type `u64` or not.
|
||||
///
|
||||
/// If not, the fastfield reader will returns the u64-value associated to the original FastValue.
|
||||
/// If not, the fastfield reader will returns the u64-value associated to the original
|
||||
/// FastValue.
|
||||
pub fn u64_lenient(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -171,8 +171,8 @@ impl FastFieldReaders {
|
||||
self.typed_fast_field_multi_reader(field)
|
||||
}
|
||||
|
||||
/// Returns a `u64s` multi-valued fast field reader reader associated to `field`, regardless of whether the given
|
||||
/// field is effectively of type `u64` or not.
|
||||
/// Returns a `u64s` multi-valued fast field reader reader associated to `field`, regardless of
|
||||
/// whether the given field is effectively of type `u64` or not.
|
||||
///
|
||||
/// If `field` is not a u64 multi-valued fast field, this method returns an Error.
|
||||
pub fn u64s_lenient(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<u64>> {
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
use crate::directory::CompositeWrite;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
|
||||
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy;
|
||||
use std::io::{self, Write};
|
||||
|
||||
use common::{BinarySerializable, CountingWriter};
|
||||
pub use fastfield_codecs::bitpacked::{
|
||||
BitpackedFastFieldSerializer, BitpackedFastFieldSerializerLegacy,
|
||||
};
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
pub use fastfield_codecs::FastFieldCodecSerializer;
|
||||
pub use fastfield_codecs::FastFieldDataAccess;
|
||||
pub use fastfield_codecs::FastFieldStats;
|
||||
use std::io::{self, Write};
|
||||
pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
|
||||
use crate::directory::{CompositeWrite, WritePtr};
|
||||
use crate::schema::Field;
|
||||
|
||||
/// `CompositeFastFieldSerializer` is in charge of serializing
|
||||
/// fastfields on disk.
|
||||
@@ -58,7 +57,8 @@ impl CompositeFastFieldSerializer {
|
||||
Ok(CompositeFastFieldSerializer { composite_write })
|
||||
}
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically.
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
/// automatically.
|
||||
pub fn create_auto_detect_u64_fast_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
@@ -76,7 +76,8 @@ impl CompositeFastFieldSerializer {
|
||||
0,
|
||||
)
|
||||
}
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically.
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
/// automatically.
|
||||
pub fn create_auto_detect_u64_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
@@ -112,7 +113,8 @@ impl CompositeFastFieldSerializer {
|
||||
broken_estimation.1
|
||||
);
|
||||
}
|
||||
// removing nan values for codecs with broken calculations, and max values which disables codecs
|
||||
// removing nan values for codecs with broken calculations, and max values which disables
|
||||
// codecs
|
||||
estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
|
||||
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
||||
let (_ratio, name, id) = estimations[0];
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
|
||||
use common;
|
||||
use fnv::FnvHashMap;
|
||||
use tantivy_bitpacker::BlockedBitpacker;
|
||||
|
||||
use super::multivalued::MultiValuedFastFieldWriter;
|
||||
use super::serializer::FastFieldStats;
|
||||
use super::FastFieldDataAccess;
|
||||
@@ -6,11 +13,6 @@ use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use common;
|
||||
use fnv::FnvHashMap;
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use tantivy_bitpacker::BlockedBitpacker;
|
||||
|
||||
/// The fastfieldswriter regroup all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
@@ -324,7 +326,8 @@ struct WriterFastFieldAccessProvider<'map, 'bitp> {
|
||||
impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> {
|
||||
/// Return the value associated to the given doc.
|
||||
///
|
||||
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
|
||||
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance
|
||||
/// reasons.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
@@ -332,7 +335,9 @@ impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'b
|
||||
fn get_val(&self, doc: u64) -> u64 {
|
||||
if let Some(doc_id_map) = self.doc_id_map {
|
||||
self.vals
|
||||
.get(doc_id_map.get_old_doc_id(doc as u32) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
|
||||
.get(doc_id_map.get_old_doc_id(doc as u32) as usize) // consider extra
|
||||
// FastFieldReader wrapper for
|
||||
// non doc_id_map
|
||||
} else {
|
||||
self.vals.get(doc as usize)
|
||||
}
|
||||
|
||||
@@ -21,32 +21,24 @@ mod reader;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
|
||||
use self::code::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
pub use self::reader::{FieldNormReader, FieldNormReaders};
|
||||
pub use self::serializer::FieldNormsSerializer;
|
||||
pub use self::writer::FieldNormsWriter;
|
||||
|
||||
use self::code::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::directory::CompositeFile;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::fieldnorm::FieldNormsSerializer;
|
||||
use crate::fieldnorm::FieldNormsWriter;
|
||||
use crate::query::Query;
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::TextFieldIndexing;
|
||||
use crate::schema::TextOptions;
|
||||
use crate::schema::TEXT;
|
||||
use crate::Index;
|
||||
use crate::Term;
|
||||
use crate::TERMINATED;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::path::Path;
|
||||
|
||||
use crate::schema::{Field, Schema, STORED};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormsSerializer, FieldNormsWriter};
|
||||
use crate::query::{Query, TermQuery};
|
||||
use crate::schema::{
|
||||
Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, STORED, TEXT,
|
||||
};
|
||||
use crate::{Index, Term, TERMINATED};
|
||||
|
||||
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
use crate::directory::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::directory::{CompositeFile, FileSlice, OwnedBytes};
|
||||
use crate::schema::Field;
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
use crate::DocId;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Reader for the fieldnorm (for each document, the number of tokens indexed in the
|
||||
/// field) of all indexed fields in the index.
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use crate::directory::CompositeWrite;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
use crate::directory::{CompositeWrite, WritePtr};
|
||||
use crate::schema::Field;
|
||||
|
||||
/// The fieldnorms serializer is in charge of
|
||||
/// the serialization of field norms for all fields.
|
||||
pub struct FieldNormsSerializer {
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
use crate::{indexer::doc_id_mapping::DocIdMapping, DocId};
|
||||
|
||||
use super::fieldnorm_to_id;
|
||||
use super::FieldNormsSerializer;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::Schema;
|
||||
use std::cmp::Ordering;
|
||||
use std::{io, iter};
|
||||
|
||||
use super::{fieldnorm_to_id, FieldNormsSerializer};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::schema::{Field, Schema};
|
||||
use crate::DocId;
|
||||
|
||||
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
|
||||
/// of each document for each field with field norms.
|
||||
///
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
use crate::schema;
|
||||
use crate::Index;
|
||||
use crate::IndexSettings;
|
||||
use crate::IndexSortByField;
|
||||
use crate::Order;
|
||||
use crate::Searcher;
|
||||
use crate::{doc, schema::*};
|
||||
use rand::thread_rng;
|
||||
use rand::Rng;
|
||||
use std::collections::HashSet;
|
||||
|
||||
use rand::{thread_rng, Rng};
|
||||
|
||||
use crate::schema::*;
|
||||
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, Order, Searcher};
|
||||
|
||||
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
|
||||
assert!(searcher.segment_readers().len() < 20);
|
||||
assert_eq!(searcher.num_docs() as usize, vals.len());
|
||||
@@ -130,14 +126,12 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
|
||||
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
|
||||
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
|
||||
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
|
||||
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
|
||||
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est \
|
||||
laborum.";
|
||||
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod \
|
||||
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, \
|
||||
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo \
|
||||
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse \
|
||||
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat \
|
||||
non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.";
|
||||
fn get_text() -> String {
|
||||
use rand::seq::SliceRandom;
|
||||
let mut rng = thread_rng();
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use super::operation::DeleteOperation;
|
||||
use crate::Opstamp;
|
||||
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::{Arc, RwLock, Weak};
|
||||
|
||||
use super::operation::DeleteOperation;
|
||||
use crate::Opstamp;
|
||||
|
||||
// The DeleteQueue is similar in conceptually to a multiple
|
||||
// consumer single producer broadcast channel.
|
||||
//
|
||||
@@ -13,12 +13,10 @@ use std::sync::{Arc, RwLock, Weak};
|
||||
// which points to a specific place of the `DeleteQueue`.
|
||||
//
|
||||
// New consumer can be created in two ways
|
||||
// - calling `delete_queue.cursor()` returns a cursor, that
|
||||
// will include all future delete operation (and some or none
|
||||
// of the past operations... The client is in charge of checking the opstamps.).
|
||||
// - cloning an existing cursor returns a new cursor, that
|
||||
// is at the exact same position, and can now advance independently
|
||||
// from the original cursor.
|
||||
// - calling `delete_queue.cursor()` returns a cursor, that will include all future delete operation
|
||||
// (and some or none of the past operations... The client is in charge of checking the opstamps.).
|
||||
// - cloning an existing cursor returns a new cursor, that is at the exact same position, and can
|
||||
// now advance independently from the original cursor.
|
||||
#[derive(Default)]
|
||||
struct InnerDeleteQueue {
|
||||
writer: Vec<DeleteOperation>,
|
||||
@@ -179,8 +177,8 @@ pub struct DeleteCursor {
|
||||
|
||||
impl DeleteCursor {
|
||||
/// Skips operations and position it so that
|
||||
/// - either all of the delete operation currently in the
|
||||
/// queue are consume and the next get will return None.
|
||||
/// - either all of the delete operation currently in the queue are consume and the next get
|
||||
/// will return None.
|
||||
/// - the next get will return the first operation with an
|
||||
/// `opstamp >= target_opstamp`.
|
||||
pub fn skip_to(&mut self, target_opstamp: Opstamp) {
|
||||
|
||||
@@ -5,8 +5,8 @@ use crate::fastfield::AliveBitSet;
|
||||
use crate::{merge_filtered_segments, Directory, Index, IndexSettings, Segment, SegmentOrdinal};
|
||||
/// DemuxMapping can be used to reorganize data from multiple segments.
|
||||
///
|
||||
/// DemuxMapping is useful in a multitenant settings, in which each document might actually belong to a different tenant.
|
||||
/// It allows to reorganize documents as follows:
|
||||
/// DemuxMapping is useful in a multitenant settings, in which each document might actually belong
|
||||
/// to a different tenant. It allows to reorganize documents as follows:
|
||||
///
|
||||
/// e.g. if you have two tenant ids TENANT_A and TENANT_B and two segments with
|
||||
/// the documents (simplified)
|
||||
@@ -18,7 +18,8 @@ use crate::{merge_filtered_segments, Directory, Index, IndexSettings, Segment, S
|
||||
/// Seg 2 [TENANT_B, TENANT_B]
|
||||
///
|
||||
/// Demuxing is the tool for that.
|
||||
/// Semantically you can define a mapping from [old segment ordinal, old doc_id] -> [new segment ordinal].
|
||||
/// Semantically you can define a mapping from [old segment ordinal, old doc_id] -> [new segment
|
||||
/// ordinal].
|
||||
#[derive(Debug, Default)]
|
||||
pub struct DemuxMapping {
|
||||
/// [index old segment ordinal] -> [index doc_id] = new segment ordinal
|
||||
@@ -132,27 +133,24 @@ pub fn demux(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{
|
||||
collector::TopDocs,
|
||||
directory::RamDirectory,
|
||||
query::QueryParser,
|
||||
schema::{Schema, TEXT},
|
||||
DocAddress, Term,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use crate::collector::TopDocs;
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::{DocAddress, Term};
|
||||
|
||||
#[test]
|
||||
fn test_demux_map_to_deletebitset() {
|
||||
let max_value = 2;
|
||||
let mut demux_mapping = DemuxMapping::default();
|
||||
//segment ordinal 0 mapping
|
||||
// segment ordinal 0 mapping
|
||||
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
|
||||
doc_id_to_segment.set(0, 1);
|
||||
doc_id_to_segment.set(1, 0);
|
||||
demux_mapping.add(doc_id_to_segment);
|
||||
|
||||
//segment ordinal 1 mapping
|
||||
// segment ordinal 1 mapping
|
||||
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
|
||||
doc_id_to_segment.set(0, 1);
|
||||
doc_id_to_segment.set(1, 1);
|
||||
@@ -235,13 +233,13 @@ mod tests {
|
||||
let mut demux_mapping = DemuxMapping::default();
|
||||
{
|
||||
let max_value = 2;
|
||||
//segment ordinal 0 mapping
|
||||
// segment ordinal 0 mapping
|
||||
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
|
||||
doc_id_to_segment.set(0, 1);
|
||||
doc_id_to_segment.set(1, 0);
|
||||
demux_mapping.add(doc_id_to_segment);
|
||||
|
||||
//segment ordinal 1 mapping
|
||||
// segment ordinal 1 mapping
|
||||
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
|
||||
doc_id_to_segment.set(0, 1);
|
||||
doc_id_to_segment.set(1, 1);
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
//! This module is used when sorting the index by a property, e.g.
|
||||
//! to get mappings from old doc_id to new doc_id and vice versa, after sorting
|
||||
//!
|
||||
|
||||
use std::cmp::Reverse;
|
||||
use std::ops::Index;
|
||||
|
||||
use super::SegmentWriter;
|
||||
use crate::{
|
||||
schema::{Field, Schema},
|
||||
DocId, IndexSortByField, Order, SegmentOrdinal, TantivyError,
|
||||
};
|
||||
use std::{cmp::Reverse, ops::Index};
|
||||
use crate::schema::{Field, Schema};
|
||||
use crate::{DocId, IndexSortByField, Order, SegmentOrdinal, TantivyError};
|
||||
|
||||
/// Struct to provide mapping from new doc_id to old doc_id and segment.
|
||||
#[derive(Clone)]
|
||||
@@ -152,11 +151,12 @@ pub(crate) fn get_doc_id_mapping_from_field(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests_indexsorting {
|
||||
use crate::collector::TopDocs;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::{collector::TopDocs, query::QueryParser, schema::*};
|
||||
use crate::{schema::Schema, DocAddress};
|
||||
use crate::{Index, IndexSettings, IndexSortByField, Order};
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{Schema, *};
|
||||
use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order};
|
||||
|
||||
fn create_test_index(
|
||||
index_settings: Option<IndexSettings>,
|
||||
@@ -217,7 +217,7 @@ mod tests_indexsorting {
|
||||
];
|
||||
|
||||
for option in options {
|
||||
//let options = get_text_options();
|
||||
// let options = get_text_options();
|
||||
// no index_sort
|
||||
let index = create_test_index(None, option.clone())?;
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
@@ -318,7 +318,7 @@ mod tests_indexsorting {
|
||||
.doc(DocAddress::new(0, 3))?
|
||||
.get_first(my_string_field)
|
||||
.unwrap()
|
||||
.text(),
|
||||
.as_text(),
|
||||
Some("blublub")
|
||||
);
|
||||
}
|
||||
@@ -341,7 +341,7 @@ mod tests_indexsorting {
|
||||
.doc(DocAddress::new(0, 0))?
|
||||
.get_first(my_string_field)
|
||||
.unwrap()
|
||||
.text(),
|
||||
.as_text(),
|
||||
Some("blublub")
|
||||
);
|
||||
let doc = searcher.doc(DocAddress::new(0, 4))?;
|
||||
@@ -363,7 +363,7 @@ mod tests_indexsorting {
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 4))?;
|
||||
assert_eq!(
|
||||
doc.get_first(my_string_field).unwrap().text(),
|
||||
doc.get_first(my_string_field).unwrap().as_text(),
|
||||
Some("blublub")
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use crate::DocId;
|
||||
use crate::Opstamp;
|
||||
use crate::{DocId, Opstamp};
|
||||
|
||||
// Doc to opstamp is used to identify which
|
||||
// document should be deleted.
|
||||
|
||||
@@ -1,14 +1,19 @@
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
|
||||
use common::BitSet;
|
||||
use crossbeam::channel;
|
||||
use futures::executor::block_on;
|
||||
use futures::future::Future;
|
||||
use smallvec::smallvec;
|
||||
|
||||
use super::operation::{AddOperation, UserOperation};
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use crate::core::Index;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentComponent;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::directory::TerminatingWrite;
|
||||
use crate::directory::{DirectoryLock, GarbageCollectionResult};
|
||||
use super::{AddBatch, AddBatchReceiver, AddBatchSender, PreparedCommit};
|
||||
use crate::core::{Index, Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader};
|
||||
use crate::directory::{DirectoryLock, GarbageCollectionResult, TerminatingWrite};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::error::TantivyError;
|
||||
use crate::fastfield::write_alive_bitset;
|
||||
@@ -17,24 +22,9 @@ use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
||||
use crate::indexer::index_writer_status::IndexWriterStatus;
|
||||
use crate::indexer::operation::DeleteOperation;
|
||||
use crate::indexer::stamper::Stamper;
|
||||
use crate::indexer::MergePolicy;
|
||||
use crate::indexer::SegmentEntry;
|
||||
use crate::indexer::SegmentWriter;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::Term;
|
||||
use crate::indexer::{MergePolicy, SegmentEntry, SegmentWriter};
|
||||
use crate::schema::{Document, IndexRecordOption, Term};
|
||||
use crate::Opstamp;
|
||||
use common::BitSet;
|
||||
use crossbeam::channel;
|
||||
use futures::executor::block_on;
|
||||
use futures::future::Future;
|
||||
use smallvec::smallvec;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
|
||||
use super::{AddBatch, AddBatchReceiver, AddBatchSender};
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
// in the heap goes below MARGIN_IN_BYTES.
|
||||
@@ -392,7 +382,13 @@ impl IndexWriter {
|
||||
fn operation_receiver(&self) -> crate::Result<AddBatchReceiver> {
|
||||
self.index_writer_status
|
||||
.operation_receiver()
|
||||
.ok_or_else(|| crate::TantivyError::ErrorInThread("The index writer was killed. It can happen if an indexing worker encounterred an Io error for instance.".to_string()))
|
||||
.ok_or_else(|| {
|
||||
crate::TantivyError::ErrorInThread(
|
||||
"The index writer was killed. It can happen if an indexing worker \
|
||||
encounterred an Io error for instance."
|
||||
.to_string(),
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Spawns a new worker thread for indexing.
|
||||
@@ -653,7 +649,6 @@ impl IndexWriter {
|
||||
///
|
||||
/// Commit returns the `opstamp` of the last document
|
||||
/// that made it in the commit.
|
||||
///
|
||||
pub fn commit(&mut self) -> crate::Result<Opstamp> {
|
||||
self.prepare_commit()?.commit()
|
||||
}
|
||||
@@ -780,8 +775,7 @@ impl Drop for IndexWriter {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use futures::executor::block_on;
|
||||
use proptest::prelude::*;
|
||||
@@ -794,31 +788,20 @@ mod tests {
|
||||
use crate::error::*;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::QueryParser;
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::FacetOptions;
|
||||
use crate::schema::IntOptions;
|
||||
use crate::schema::TextFieldIndexing;
|
||||
use crate::schema::TextOptions;
|
||||
use crate::schema::STORED;
|
||||
use crate::schema::TEXT;
|
||||
use crate::schema::{self, IndexRecordOption, FAST, INDEXED, STRING};
|
||||
use crate::DocAddress;
|
||||
use crate::Index;
|
||||
use crate::ReloadPolicy;
|
||||
use crate::Term;
|
||||
use crate::{IndexSettings, IndexSortByField, Order};
|
||||
use crate::query::{QueryParser, TermQuery};
|
||||
use crate::schema::{
|
||||
self, Cardinality, Facet, FacetOptions, IndexRecordOption, IntOptions, TextFieldIndexing,
|
||||
TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
|
||||
};
|
||||
use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order, ReloadPolicy, Term};
|
||||
|
||||
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
|
||||
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
|
||||
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
|
||||
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
|
||||
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
|
||||
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est \
|
||||
laborum.";
|
||||
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
|
||||
eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad \
|
||||
minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip \
|
||||
ex ea commodo consequat. Duis aute irure dolor in reprehenderit in \
|
||||
voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur \
|
||||
sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt \
|
||||
mollit anim id est laborum.";
|
||||
|
||||
#[test]
|
||||
fn test_operations_group() {
|
||||
@@ -973,8 +956,8 @@ mod tests {
|
||||
let index_writer = index.writer(3_000_000).unwrap();
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \
|
||||
level_log_size: 0.75, del_docs_ratio_before_merge: 1.0 }"
|
||||
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, \
|
||||
min_layer_size: 10000, level_log_size: 0.75, del_docs_ratio_before_merge: 1.0 }"
|
||||
);
|
||||
let merge_policy = Box::new(NoMergePolicy::default());
|
||||
index_writer.set_merge_policy(merge_policy);
|
||||
@@ -1547,12 +1530,7 @@ mod tests {
|
||||
let store_reader = segment_reader.get_store_reader().unwrap();
|
||||
// test store iterator
|
||||
for doc in store_reader.iter(segment_reader.alive_bitset()) {
|
||||
let id = doc
|
||||
.unwrap()
|
||||
.get_first(id_field)
|
||||
.unwrap()
|
||||
.u64_value()
|
||||
.unwrap();
|
||||
let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap();
|
||||
assert!(expected_ids_and_num_occurences.contains_key(&id));
|
||||
}
|
||||
// test store random access
|
||||
@@ -1562,7 +1540,7 @@ mod tests {
|
||||
.unwrap()
|
||||
.get_first(id_field)
|
||||
.unwrap()
|
||||
.u64_value()
|
||||
.as_u64()
|
||||
.unwrap();
|
||||
assert!(expected_ids_and_num_occurences.contains_key(&id));
|
||||
let id2 = store_reader
|
||||
@@ -1570,7 +1548,7 @@ mod tests {
|
||||
.unwrap()
|
||||
.get_first(multi_numbers)
|
||||
.unwrap()
|
||||
.u64_value()
|
||||
.as_u64()
|
||||
.unwrap();
|
||||
assert_eq!(id, id2);
|
||||
}
|
||||
|
||||
@@ -90,10 +90,12 @@ impl Drop for IndexWriterBomb {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::IndexWriterStatus;
|
||||
use crossbeam::channel;
|
||||
use std::mem;
|
||||
|
||||
use crossbeam::channel;
|
||||
|
||||
use super::IndexWriterStatus;
|
||||
|
||||
#[test]
|
||||
fn test_bomb_goes_boom() {
|
||||
let (_tx, rx) = channel::bounded(10);
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use std::cmp;
|
||||
|
||||
use itertools::Itertools;
|
||||
|
||||
use super::merge_policy::{MergeCandidate, MergePolicy};
|
||||
use crate::core::SegmentMeta;
|
||||
use itertools::Itertools;
|
||||
use std::cmp;
|
||||
|
||||
const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
|
||||
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
|
||||
@@ -139,14 +141,14 @@ impl Default for LogMergePolicy {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::{
|
||||
core::{SegmentId, SegmentMeta, SegmentMetaInventory},
|
||||
schema,
|
||||
};
|
||||
use crate::{indexer::merge_policy::MergePolicy, schema::INDEXED};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use super::*;
|
||||
use crate::core::{SegmentId, SegmentMeta, SegmentMetaInventory};
|
||||
use crate::indexer::merge_policy::MergePolicy;
|
||||
use crate::schema;
|
||||
use crate::schema::INDEXED;
|
||||
|
||||
static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default);
|
||||
|
||||
use crate::Index;
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
use crate::Opstamp;
|
||||
use crate::SegmentId;
|
||||
use crate::{Inventory, TrackedObject};
|
||||
use std::collections::HashSet;
|
||||
use std::ops::Deref;
|
||||
|
||||
use crate::{Inventory, Opstamp, SegmentId, TrackedObject};
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct MergeOperationInventory(Inventory<InnerMergeOperation>);
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use std::fmt::Debug;
|
||||
use std::marker;
|
||||
|
||||
use crate::core::{SegmentId, SegmentMeta};
|
||||
|
||||
/// Set of segment suggested for a merge.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MergeCandidate(pub Vec<SegmentId>);
|
||||
@@ -39,8 +39,7 @@ impl MergePolicy for NoMergePolicy {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::core::{SegmentId, SegmentMeta};
|
||||
|
||||
/// `MergePolicy` useful for test purposes.
|
||||
///
|
||||
|
||||
@@ -1,41 +1,30 @@
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::fastfield::FastFieldDataAccess;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::fastfield::FastFieldStats;
|
||||
use crate::fastfield::MultiValueLength;
|
||||
use crate::fastfield::MultiValuedFastFieldReader;
|
||||
use crate::fieldnorm::FieldNormsSerializer;
|
||||
use crate::fieldnorm::FieldNormsWriter;
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
|
||||
use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
|
||||
use crate::indexer::SegmentSerializer;
|
||||
use crate::postings::Postings;
|
||||
use crate::postings::{InvertedIndexSerializer, SegmentPostings};
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::FieldType;
|
||||
use crate::schema::{Field, Schema};
|
||||
use crate::store::StoreWriter;
|
||||
use crate::termdict::TermMerger;
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::IndexSettings;
|
||||
use crate::IndexSortByField;
|
||||
use crate::{core::Segment, indexer::doc_id_mapping::expect_field_id_for_sort_field};
|
||||
use crate::{core::SegmentReader, Order};
|
||||
use crate::{
|
||||
docset::{DocSet, TERMINATED},
|
||||
SegmentOrdinal,
|
||||
};
|
||||
use crate::{DocId, InvertedIndexReader, SegmentComponent};
|
||||
use itertools::Itertools;
|
||||
use measure_time::debug_time;
|
||||
use std::cmp;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use itertools::Itertools;
|
||||
use measure_time::debug_time;
|
||||
use tantivy_bitpacker::minmax;
|
||||
|
||||
use crate::core::{Segment, SegmentReader};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::{
|
||||
AliveBitSet, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldDataAccess,
|
||||
FastFieldReader, FastFieldStats, MultiValueLength, MultiValuedFastFieldReader,
|
||||
};
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
|
||||
use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping};
|
||||
use crate::indexer::SegmentSerializer;
|
||||
use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
|
||||
use crate::schema::{Cardinality, Field, FieldType, Schema};
|
||||
use crate::store::StoreWriter;
|
||||
use crate::termdict::{TermMerger, TermOrdinal};
|
||||
use crate::{
|
||||
DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, SegmentComponent,
|
||||
SegmentOrdinal,
|
||||
};
|
||||
|
||||
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
|
||||
///
|
||||
/// We do not allow segments with more than
|
||||
@@ -46,8 +35,8 @@ fn estimate_total_num_tokens_in_single_segment(
|
||||
field: Field,
|
||||
) -> crate::Result<u64> {
|
||||
// There are no deletes. We can simply use the exact value saved into the posting list.
|
||||
// Note that this value is not necessarily exact as it could have been the result of a merge between
|
||||
// segments themselves containing deletes.
|
||||
// Note that this value is not necessarily exact as it could have been the result of a merge
|
||||
// between segments themselves containing deletes.
|
||||
if !reader.has_deletes() {
|
||||
return Ok(reader.inverted_index(field)?.total_num_tokens());
|
||||
}
|
||||
@@ -218,8 +207,8 @@ impl IndexMerger {
|
||||
// sort segments by their natural sort setting
|
||||
if max_doc >= MAX_DOC_LIMIT {
|
||||
let err_msg = format!(
|
||||
"The segment resulting from this merge would have {} docs,\
|
||||
which exceeds the limit {}.",
|
||||
"The segment resulting from this merge would have {} docs,which exceeds the limit \
|
||||
{}.",
|
||||
max_doc, MAX_DOC_LIMIT
|
||||
);
|
||||
return Err(crate::TantivyError::InvalidArgument(err_msg));
|
||||
@@ -295,10 +284,10 @@ impl IndexMerger {
|
||||
let field_type = field_entry.field_type();
|
||||
match field_type {
|
||||
FieldType::Facet(_) => {
|
||||
let term_ordinal_mapping = term_ord_mappings
|
||||
.remove(&field)
|
||||
.expect("Logic Error in Tantivy (Please report). Facet field should have required a\
|
||||
`term_ordinal_mapping`.");
|
||||
let term_ordinal_mapping = term_ord_mappings.remove(&field).expect(
|
||||
"Logic Error in Tantivy (Please report). Facet field should have required \
|
||||
a`term_ordinal_mapping`.",
|
||||
);
|
||||
self.write_hierarchical_facet_field(
|
||||
field,
|
||||
&term_ordinal_mapping,
|
||||
@@ -340,25 +329,29 @@ impl IndexMerger {
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &SegmentDocIdMapping,
|
||||
) -> crate::Result<()> {
|
||||
let (min_value, max_value) = self.readers.iter().filter_map(|reader|{
|
||||
let u64_reader: DynamicFastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(field)
|
||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||
let (min_value, max_value) = self
|
||||
.readers
|
||||
.iter()
|
||||
.filter_map(|reader| {
|
||||
let u64_reader: DynamicFastFieldReader<u64> =
|
||||
reader.fast_fields().typed_fast_field_reader(field).expect(
|
||||
"Failed to find a reader for single fast field. This is a tantivy bug and \
|
||||
it should never happen.",
|
||||
);
|
||||
compute_min_max_val(&u64_reader, reader)
|
||||
})
|
||||
.reduce(|a, b| {
|
||||
(a.0.min(b.0), a.1.max(b.1))
|
||||
}).expect("Unexpected error, empty readers in IndexMerger");
|
||||
.reduce(|a, b| (a.0.min(b.0), a.1.max(b.1)))
|
||||
.expect("Unexpected error, empty readers in IndexMerger");
|
||||
|
||||
let fast_field_readers = self
|
||||
.readers
|
||||
.iter()
|
||||
.map(|reader| {
|
||||
let u64_reader: DynamicFastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(field)
|
||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||
let u64_reader: DynamicFastFieldReader<u64> =
|
||||
reader.fast_fields().typed_fast_field_reader(field).expect(
|
||||
"Failed to find a reader for single fast field. This is a tantivy bug and \
|
||||
it should never happen.",
|
||||
);
|
||||
u64_reader
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
@@ -574,12 +567,20 @@ impl IndexMerger {
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &SegmentDocIdMapping,
|
||||
) -> crate::Result<Vec<u64>> {
|
||||
let reader_ordinal_and_field_accessors = self.readers.iter().map(|reader|{
|
||||
let u64s_reader: MultiValuedFastFieldReader<u64> = reader.fast_fields()
|
||||
.typed_fast_field_multi_reader(field)
|
||||
.expect("Failed to find index for multivalued field. This is a bug in tantivy, please report.");
|
||||
(reader, u64s_reader)
|
||||
}).collect::<Vec<_>>();
|
||||
let reader_ordinal_and_field_accessors = self
|
||||
.readers
|
||||
.iter()
|
||||
.map(|reader| {
|
||||
let u64s_reader: MultiValuedFastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_multi_reader(field)
|
||||
.expect(
|
||||
"Failed to find index for multivalued field. This is a bug in tantivy, \
|
||||
please report.",
|
||||
);
|
||||
(reader, u64s_reader)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
Self::write_1_n_fast_field_idx_generic(
|
||||
field,
|
||||
@@ -641,8 +642,8 @@ impl IndexMerger {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Creates a mapping if the segments are stacked. this is helpful to merge codelines between index
|
||||
/// sorting and the others
|
||||
/// Creates a mapping if the segments are stacked. this is helpful to merge codelines between
|
||||
/// index sorting and the others
|
||||
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocIdMapping> {
|
||||
let total_num_new_docs = self
|
||||
.readers
|
||||
@@ -697,8 +698,8 @@ impl IndexMerger {
|
||||
.fast_fields()
|
||||
.typed_fast_field_multi_reader(field)
|
||||
.expect(
|
||||
"Failed to find multivalued fast field reader. This is a bug in \
|
||||
tantivy. Please report.",
|
||||
"Failed to find multivalued fast field reader. This is a bug in tantivy. \
|
||||
Please report.",
|
||||
);
|
||||
for doc in reader.doc_ids_alive() {
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
@@ -792,8 +793,10 @@ impl IndexMerger {
|
||||
.readers
|
||||
.iter()
|
||||
.map(|reader| {
|
||||
let bytes_reader = reader.fast_fields().bytes(field)
|
||||
.expect("Failed to find index for bytes field. This is a bug in tantivy, please report.");
|
||||
let bytes_reader = reader.fast_fields().bytes(field).expect(
|
||||
"Failed to find index for bytes field. This is a bug in tantivy, please \
|
||||
report.",
|
||||
);
|
||||
(reader, bytes_reader)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
@@ -877,8 +880,8 @@ impl IndexMerger {
|
||||
// segment are stacked so that :
|
||||
// - Segment 0's doc ids become doc id [0, seg.max_doc]
|
||||
// - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg.max_doc]
|
||||
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc,
|
||||
// seg0.max_doc + seg1.max_doc + seg2.max_doc]
|
||||
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, seg0.max_doc + seg1.max_doc +
|
||||
// seg2.max_doc]
|
||||
//
|
||||
// This stacking applies only when the index is not sorted, in that case the
|
||||
// doc_ids are kmerged by their sort property
|
||||
@@ -1122,34 +1125,26 @@ impl IndexMerger {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::assert_nearly_equals;
|
||||
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
|
||||
use crate::collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
|
||||
use crate::collector::{Count, FacetCollector};
|
||||
use crate::core::Index;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::query::AllQuery;
|
||||
use crate::query::BooleanQuery;
|
||||
use crate::query::Scorer;
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::IntOptions;
|
||||
use crate::schema::Term;
|
||||
use crate::schema::TextFieldIndexing;
|
||||
use crate::schema::{Cardinality, TEXT};
|
||||
use crate::schema::{Document, FacetOptions};
|
||||
use crate::DocAddress;
|
||||
use crate::IndexSettings;
|
||||
use crate::IndexSortByField;
|
||||
use crate::IndexWriter;
|
||||
use crate::Searcher;
|
||||
use crate::{schema, DocSet, SegmentId};
|
||||
use crate::{schema::INDEXED, Order};
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use futures::executor::block_on;
|
||||
use schema::FAST;
|
||||
|
||||
use crate::collector::tests::{
|
||||
BytesFastFieldTestCollector, FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE,
|
||||
};
|
||||
use crate::collector::{Count, FacetCollector};
|
||||
use crate::core::Index;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::query::{AllQuery, BooleanQuery, Scorer, TermQuery};
|
||||
use crate::schema::{
|
||||
Cardinality, Document, Facet, FacetOptions, IndexRecordOption, IntOptions, Term,
|
||||
TextFieldIndexing, INDEXED, TEXT,
|
||||
};
|
||||
use crate::{
|
||||
assert_nearly_equals, schema, DocAddress, DocSet, IndexSettings, IndexSortByField,
|
||||
IndexWriter, Order, Searcher, SegmentId,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn test_index_merger_no_deletes() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
@@ -1253,23 +1248,29 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 0))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 1))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c"));
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("a b c"));
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 2))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d"));
|
||||
assert_eq!(
|
||||
doc.get_first(text_field).unwrap().as_text(),
|
||||
Some("a b c d")
|
||||
);
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 3))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
|
||||
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 4))?;
|
||||
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g"));
|
||||
assert_eq!(
|
||||
doc.get_first(text_field).unwrap().as_text(),
|
||||
Some("a b c g")
|
||||
);
|
||||
}
|
||||
{
|
||||
let get_fast_vals = |terms: Vec<Term>| {
|
||||
@@ -1674,7 +1675,7 @@ mod tests {
|
||||
index_builder = index_builder.settings(settings);
|
||||
}
|
||||
let index = index_builder.create_in_ram().unwrap();
|
||||
//let index = Index::create_in_ram(schema_builder.build());
|
||||
// let index = Index::create_in_ram(schema_builder.build());
|
||||
let reader = index.reader().unwrap();
|
||||
let mut int_val = 0;
|
||||
{
|
||||
@@ -1708,7 +1709,9 @@ mod tests {
|
||||
index_doc(&mut index_writer, &["/top/d"], &mut 0);
|
||||
index_doc(&mut index_writer, &["/top/e"], &mut 10);
|
||||
index_writer.commit().expect("committed");
|
||||
index_doc(&mut index_writer, &["/top/a"], &mut 5); // 5 is between 0 - 10 so the segments don' have disjunct ranges
|
||||
index_doc(&mut index_writer, &["/top/a"], &mut 5); // 5 is between 0 - 10 so the
|
||||
// segments don' have disjunct
|
||||
// ranges
|
||||
} else {
|
||||
index_doc(&mut index_writer, &["/top/d"], &mut int_val);
|
||||
index_doc(&mut index_writer, &["/top/e"], &mut int_val);
|
||||
@@ -2057,7 +2060,8 @@ mod tests {
|
||||
let mut term_scorer = term_query
|
||||
.specialized_weight(&searcher, true)?
|
||||
.specialized_scorer(segment_reader, 1.0)?;
|
||||
// the difference compared to before is instrinsic to the bm25 formula. no worries there.
|
||||
// the difference compared to before is instrinsic to the bm25 formula. no worries
|
||||
// there.
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
assert_eq!(term_scorer.doc(), doc);
|
||||
assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312);
|
||||
|
||||
@@ -1,19 +1,16 @@
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures::executor::block_on;
|
||||
|
||||
use crate::collector::TopDocs;
|
||||
use crate::core::Index;
|
||||
use crate::fastfield::MultiValuedFastFieldReader;
|
||||
use crate::fastfield::{AliveBitSet, FastFieldReader};
|
||||
use crate::fastfield::{AliveBitSet, FastFieldReader, MultiValuedFastFieldReader};
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{
|
||||
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, TextFieldIndexing,
|
||||
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, IntOptions,
|
||||
TextFieldIndexing, TextOptions,
|
||||
};
|
||||
use crate::schema::{IntOptions, TextOptions};
|
||||
use crate::DocAddress;
|
||||
use crate::IndexSortByField;
|
||||
use crate::Order;
|
||||
use crate::{DocSet, IndexSettings, Postings, Term};
|
||||
use futures::executor::block_on;
|
||||
use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term};
|
||||
|
||||
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
@@ -59,8 +56,8 @@ mod tests {
|
||||
index
|
||||
}
|
||||
|
||||
// force_disjunct_segment_sort_values forces the field, by which the index is sorted have disjunct
|
||||
// ranges between segments, e.g. values in segment [1-3] [10 - 20] [50 - 500]
|
||||
// force_disjunct_segment_sort_values forces the field, by which the index is sorted have
|
||||
// disjunct ranges between segments, e.g. values in segment [1-3] [10 - 20] [50 - 500]
|
||||
fn create_test_index(
|
||||
index_settings: Option<IndexSettings>,
|
||||
force_disjunct_segment_sort_values: bool,
|
||||
@@ -282,11 +279,11 @@ mod tests {
|
||||
};
|
||||
let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap();
|
||||
assert_eq!(
|
||||
doc.get_first(my_text_field).unwrap().text(),
|
||||
doc.get_first(my_text_field).unwrap().as_text(),
|
||||
Some("blubber")
|
||||
);
|
||||
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(1000));
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -465,17 +462,17 @@ mod tests {
|
||||
// access doc store
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(1));
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
|
||||
let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(2));
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
|
||||
let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(3));
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
|
||||
let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(10));
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
|
||||
let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(20));
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
|
||||
let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
|
||||
assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(1_000));
|
||||
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -483,20 +480,14 @@ mod tests {
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench_sorted_index_merge {
|
||||
|
||||
use crate::core::Index;
|
||||
//use cratedoc_id, readerdoc_id_mappinglet vals = reader.fate::schema;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::indexer::merger::IndexMerger;
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::IntOptions;
|
||||
use crate::schema::Schema;
|
||||
use crate::IndexSettings;
|
||||
use crate::IndexSortByField;
|
||||
use crate::IndexWriter;
|
||||
use crate::Order;
|
||||
use test::{self, Bencher};
|
||||
|
||||
use crate::core::Index;
|
||||
// use cratedoc_id, readerdoc_id_mappinglet vals = reader.fate::schema;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::indexer::merger::IndexMerger;
|
||||
use crate::schema::{Cardinality, Document, IntOptions, Schema};
|
||||
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
|
||||
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let int_options = IntOptions::default()
|
||||
@@ -544,13 +535,13 @@ mod bench_sorted_index_merge {
|
||||
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
|
||||
let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap();
|
||||
b.iter(|| {
|
||||
|
||||
let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, ordinal)|{
|
||||
let reader = &merger.readers[*ordinal as usize];
|
||||
let u64_reader: DynamicFastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(field)
|
||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||
let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, ordinal)| {
|
||||
let reader = &merger.readers[*ordinal as usize];
|
||||
let u64_reader: DynamicFastFieldReader<u64> =
|
||||
reader.fast_fields().typed_fast_field_reader(field).expect(
|
||||
"Failed to find a reader for single fast field. This is a tantivy bug and \
|
||||
it should never happen.",
|
||||
);
|
||||
(doc_id, reader, u64_reader)
|
||||
});
|
||||
// add values in order of the new doc_ids
|
||||
@@ -560,7 +551,6 @@ mod bench_sorted_index_merge {
|
||||
}
|
||||
|
||||
val
|
||||
|
||||
});
|
||||
|
||||
Ok(())
|
||||
@@ -572,7 +562,7 @@ mod bench_sorted_index_merge {
|
||||
order: Order::Desc,
|
||||
};
|
||||
let index = create_index(Some(sort_by_field.clone()));
|
||||
//let field = index.schema().get_field("intval").unwrap();
|
||||
// let field = index.schema().get_field("intval").unwrap();
|
||||
let segments = index.searchable_segments().unwrap();
|
||||
let merger: IndexMerger =
|
||||
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
|
||||
|
||||
@@ -23,8 +23,6 @@ mod stamper;
|
||||
use crossbeam::channel;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
use crate::indexer::operation::AddOperation;
|
||||
|
||||
pub use self::index_writer::IndexWriter;
|
||||
pub use self::log_merge_policy::LogMergePolicy;
|
||||
pub use self::merge_operation::MergeOperation;
|
||||
@@ -33,9 +31,9 @@ pub use self::prepared_commit::PreparedCommit;
|
||||
pub use self::segment_entry::SegmentEntry;
|
||||
pub use self::segment_manager::SegmentManager;
|
||||
pub use self::segment_serializer::SegmentSerializer;
|
||||
pub use self::segment_updater::merge_filtered_segments;
|
||||
pub use self::segment_updater::merge_indices;
|
||||
pub use self::segment_updater::{merge_filtered_segments, merge_indices};
|
||||
pub use self::segment_writer::SegmentWriter;
|
||||
use crate::indexer::operation::AddOperation;
|
||||
|
||||
/// Alias for the default merge policy, which is the `LogMergePolicy`.
|
||||
pub type DefaultMergePolicy = LogMergePolicy;
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use crate::schema::Document;
|
||||
use crate::schema::Term;
|
||||
use crate::schema::{Document, Term};
|
||||
use crate::Opstamp;
|
||||
|
||||
/// Timestamped Delete operation.
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use futures::executor::block_on;
|
||||
|
||||
use super::IndexWriter;
|
||||
use crate::Opstamp;
|
||||
use futures::executor::block_on;
|
||||
|
||||
/// A prepared commit
|
||||
pub struct PreparedCommit<'a> {
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use common::BitSet;
|
||||
use std::fmt;
|
||||
|
||||
use common::BitSet;
|
||||
|
||||
use crate::core::{SegmentId, SegmentMeta};
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
|
||||
/// A segment entry describes the state of
|
||||
/// a given segment, at a given instant.
|
||||
///
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
use std::collections::hash_set::HashSet;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
|
||||
use super::segment_register::SegmentRegister;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::core::{SegmentId, SegmentMeta};
|
||||
use crate::error::TantivyError;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::indexer::SegmentEntry;
|
||||
use std::collections::hash_set::HashSet;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use std::sync::RwLock;
|
||||
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
|
||||
|
||||
#[derive(Default)]
|
||||
struct SegmentRegisters {
|
||||
@@ -154,21 +153,23 @@ impl SegmentManager {
|
||||
let mut segment_entries = vec![];
|
||||
if registers_lock.uncommitted.contains_all(segment_ids) {
|
||||
for segment_id in segment_ids {
|
||||
let segment_entry = registers_lock.uncommitted
|
||||
.get(segment_id)
|
||||
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
|
||||
let segment_entry = registers_lock.uncommitted.get(segment_id).expect(
|
||||
"Segment id not found {}. Should never happen because of the contains all \
|
||||
if-block.",
|
||||
);
|
||||
segment_entries.push(segment_entry);
|
||||
}
|
||||
} else if registers_lock.committed.contains_all(segment_ids) {
|
||||
for segment_id in segment_ids {
|
||||
let segment_entry = registers_lock.committed
|
||||
.get(segment_id)
|
||||
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
|
||||
let segment_entry = registers_lock.committed.get(segment_id).expect(
|
||||
"Segment id not found {}. Should never happen because of the contains all \
|
||||
if-block.",
|
||||
);
|
||||
segment_entries.push(segment_entry);
|
||||
}
|
||||
} else {
|
||||
let error_msg = "Merge operation sent for segments that are not \
|
||||
all uncommited or commited."
|
||||
let error_msg = "Merge operation sent for segments that are not all uncommited or \
|
||||
commited."
|
||||
.to_string();
|
||||
return Err(TantivyError::InvalidArgument(error_msg));
|
||||
}
|
||||
@@ -193,8 +194,8 @@ impl SegmentManager {
|
||||
.ok_or_else(|| {
|
||||
warn!("couldn't find segment in SegmentManager");
|
||||
crate::TantivyError::InvalidArgument(
|
||||
"The segments that were merged could not be found in the SegmentManager. \
|
||||
This is not necessarily a bug, and can happen after a rollback for instance."
|
||||
"The segments that were merged could not be found in the SegmentManager. This \
|
||||
is not necessarily a bug, and can happen after a rollback for instance."
|
||||
.to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fmt::{self, Debug, Display, Formatter};
|
||||
|
||||
use crate::core::{SegmentId, SegmentMeta};
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::indexer::segment_entry::SegmentEntry;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::Display;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
|
||||
/// The segment register keeps track
|
||||
/// of the list of segment, their size as well
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentComponent;
|
||||
use crate::core::{Segment, SegmentComponent};
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::fieldnorm::FieldNormsSerializer;
|
||||
use crate::postings::InvertedIndexSerializer;
|
||||
|
||||
@@ -1,11 +1,21 @@
|
||||
use std::borrow::BorrowMut;
|
||||
use std::collections::HashSet;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::ops::Deref;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use fail::fail_point;
|
||||
use futures::channel::oneshot;
|
||||
use futures::executor::{ThreadPool, ThreadPoolBuilder};
|
||||
use futures::future::{Future, TryFutureExt};
|
||||
|
||||
use super::segment_manager::SegmentManager;
|
||||
use crate::core::Index;
|
||||
use crate::core::IndexMeta;
|
||||
use crate::core::IndexSettings;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::core::{
|
||||
Index, IndexMeta, IndexSettings, Segment, SegmentId, SegmentMeta, META_FILEPATH,
|
||||
};
|
||||
use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
@@ -14,27 +24,12 @@ use crate::indexer::merge_operation::MergeOperationInventory;
|
||||
use crate::indexer::merger::IndexMerger;
|
||||
use crate::indexer::segment_manager::SegmentsStatus;
|
||||
use crate::indexer::stamper::Stamper;
|
||||
use crate::indexer::SegmentEntry;
|
||||
use crate::indexer::SegmentSerializer;
|
||||
use crate::indexer::{DefaultMergePolicy, MergePolicy};
|
||||
use crate::indexer::{MergeCandidate, MergeOperation};
|
||||
use crate::indexer::{
|
||||
DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry,
|
||||
SegmentSerializer,
|
||||
};
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use crate::TantivyError;
|
||||
use fail::fail_point;
|
||||
use futures::channel::oneshot;
|
||||
use futures::executor::{ThreadPool, ThreadPoolBuilder};
|
||||
use futures::future::Future;
|
||||
use futures::future::TryFutureExt;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::collections::HashSet;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::ops::Deref;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use crate::{Opstamp, TantivyError};
|
||||
|
||||
const NUM_MERGE_THREADS: usize = 4;
|
||||
|
||||
@@ -702,10 +697,7 @@ mod tests {
|
||||
use crate::indexer::segment_updater::merge_filtered_segments;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::*;
|
||||
use crate::Directory;
|
||||
use crate::DocAddress;
|
||||
use crate::Index;
|
||||
use crate::Segment;
|
||||
use crate::{Directory, DocAddress, Index, Segment};
|
||||
|
||||
#[test]
|
||||
fn test_delete_during_merge() -> crate::Result<()> {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user