Mirror of https://github.com/quickwit-oss/tantivy.git
Synced 2026-01-05 00:32:55 +00:00

Compare commits: col-trait- ... column-tra (11 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 4a072e3c18 |  |
|  | 84e0c75598 |  |
|  | 08c4412d73 |  |
|  | 70e58adff9 |  |
|  | 0d1cd119e9 |  |
|  | d3dd620048 |  |
|  | e89c220b56 |  |
|  | a451f6d60d |  |
|  | f740ddeee3 |  |
|  | 7a26cc9022 |  |
|  | 54972caa7c |  |
@@ -62,7 +62,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
 pub struct AntiCallToken(());

 /// Trait used to indicate when no more write need to be done on a writer
-pub trait TerminatingWrite: Write + Send {
+pub trait TerminatingWrite: Write + Send + Sync {
     /// Indicate that the writer will no longer be used. Internally call terminate_ref.
     fn terminate(mut self) -> io::Result<()>
     where Self: Sized {
@@ -289,6 +289,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
     /// estimation for linear interpolation is hard because, you don't know
     /// where the local maxima are for the deviation of the calculated value and
     /// the offset is also unknown.
+    #[allow(clippy::question_mark)]
     fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
         if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
             return None;
@@ -300,9 +301,13 @@ impl FastFieldCodec for BlockwiseLinearCodec {
         // If this doesn't overflow the algorithm should be fine
         let theorethical_maximum_offset =
             fastfield_accessor.max_value() - fastfield_accessor.min_value();
-        fastfield_accessor
-            .max_value()
-            .checked_add(theorethical_maximum_offset)?;
+        if fastfield_accessor
+            .max_value()
+            .checked_add(theorethical_maximum_offset)
+            .is_none()
+        {
+            return None;
+        }

         let first_val_in_first_block = fastfield_accessor.get_val(0);
         let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals());
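This hunk, and the matching one in the linear codec further down, replaces a `?` on a discarded `checked_add` result with an explicit `is_none()` check, adding `#[allow(clippy::question_mark)]` because clippy would otherwise suggest folding the check back into `?`. A minimal, self-contained sketch of the guard (the numbers and the stub return value are hypothetical):

```rust
/// Sketch of the overflow guard used by the estimators: bail out with
/// None ("codec not applicable") when shifting values by `offset` would
/// overflow u64, instead of silently wrapping.
fn estimate(max_value: u64, offset: u64) -> Option<f32> {
    if max_value.checked_add(offset).is_none() {
        return None;
    }
    // ... a real estimator would compute a compression ratio here ...
    Some(1.0)
}

fn main() {
    assert_eq!(estimate(u64::MAX, 1), None); // would overflow
    assert!(estimate(10, 5).is_some());
}
```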
@@ -1,7 +1,3 @@
-use std::ops::Range;
-
-use crate::ColumnIter;
-
 pub trait Column<T = u64> {
     /// Return the value associated to the given idx.
     ///
@@ -12,18 +8,23 @@ pub trait Column<T = u64> {
     /// May panic if `idx` is greater than the column length.
     fn get_val(&self, idx: u64) -> T;

-    /// Returns an iterator over given doc range.
+    /// Fills an output buffer with the fast field values
+    /// associated with the `DocId` going from
+    /// `start` to `start + output.len()`.
+    ///
+    /// Regardless of the type of `Item`, this method works
+    /// - transmuting the output array
+    /// - extracting the `Item`s as if they were `u64`
+    /// - possibly converting the `u64` value to the right type.
     ///
     /// # Panics
     ///
-    /// May panic if `range.end()` is greater than
+    /// May panic if `start + output.len()` is greater than
     /// the segment's `maxdoc`.
     #[inline]
-    fn get_range(&self, range: Range<u64>) -> ColumnIter<'_, Self, T>
-    where
-        Self: Sized,
-    {
-        ColumnIter::new(self, range)
-    }
+    fn get_range(&self, start: u64, output: &mut [T]) {
+        for (out, idx) in output.iter_mut().zip(start..) {
+            *out = self.get_val(idx);
+        }
+    }

     /// Returns the minimum value for this fast field.
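The hunk above changes `Column::get_range` from returning a `ColumnIter` to filling a caller-provided buffer, with the default implementation shown in the diff. A self-contained sketch of how the new shape is used; `SliceColumn` is a hypothetical stand-in and the trait is abridged to the two methods involved:

```rust
// Toy stand-in for the `Column` trait from the diff, reduced to the
// lookup method plus the buffer-filling default shown above.
trait Column<T = u64> {
    fn get_val(&self, idx: u64) -> T;

    // Default implementation copied from the diff: fill the caller's
    // buffer instead of returning an iterator.
    fn get_range(&self, start: u64, output: &mut [T]) {
        for (out, idx) in output.iter_mut().zip(start..) {
            *out = self.get_val(idx);
        }
    }
}

struct SliceColumn<'a>(&'a [u64]);

impl<'a> Column for SliceColumn<'a> {
    fn get_val(&self, idx: u64) -> u64 {
        self.0[idx as usize]
    }
}

fn main() {
    let col = SliceColumn(&[10, 20, 30, 40, 50]);
    let mut buf = [0u64; 3];
    col.get_range(1, &mut buf); // fills buf with values at indices 1..4
    assert_eq!(buf, [20, 30, 40]);
}
```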
@@ -4,9 +4,6 @@ extern crate more_asserts;
 use std::io;
 use std::io::Write;
-use std::iter::FusedIterator;
-use std::marker::PhantomData;
-use std::ops::Range;

 use common::BinarySerializable;
 use ownedbytes::OwnedBytes;
@@ -93,102 +90,41 @@ pub struct FastFieldStats {
     pub num_vals: u64,
 }

-impl<'a> Column for &'a [u64] {
+struct VecColum<'a>(&'a [u64]);
+
+impl<'a> Column for VecColum<'a> {
     fn get_val(&self, position: u64) -> u64 {
-        self[position as usize]
+        self.0[position as usize]
     }

     fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
-        Box::new((self as &[u64]).iter().cloned())
+        Box::new(self.0.iter().cloned())
     }

     fn min_value(&self) -> u64 {
-        self.iter().min().unwrap_or(0)
+        self.0.iter().min().cloned().unwrap_or(0)
     }

     fn max_value(&self) -> u64 {
-        self.iter().max().unwrap_or(0)
+        self.0.iter().max().cloned().unwrap_or(0)
     }

     fn num_vals(&self) -> u64 {
-        self.len() as u64
+        self.0.len() as u64
     }
 }

-pub struct ColumnIter<'a, C: Column<I>, I> {
-    column: &'a C,
-    range: Range<u64>,
-    _phantom: PhantomData<I>,
-}
-
-impl<'a, C: Column<I>, I> ColumnIter<'a, C, I> {
-    #[inline]
-    pub fn new(col: &'a C, range: Range<u64>) -> Self {
-        Self {
-            column: col,
-            range,
-            _phantom: PhantomData,
-        }
-    }
-}
-
-impl<'a, C: Column<I>, I> Iterator for ColumnIter<'a, C, I> {
-    type Item = I;
-
-    #[inline]
-    fn next(&mut self) -> Option<Self::Item> {
-        Some(self.column.get_val(self.range.next()?))
-    }
-
-    #[inline]
-    fn fold<Acc, G>(self, init: Acc, mut g: G) -> Acc
-    where
-        G: FnMut(Acc, Self::Item) -> Acc,
-    {
-        self.range
-            .fold(init, move |acc, idx| g(acc, self.column.get_val(idx)))
-    }
-
-    #[inline]
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        let size = (self.range.end - self.range.start) as usize;
-        (size, Some(size))
-    }
-}
-
-impl<'a, C: Column<I>, I> ExactSizeIterator for ColumnIter<'a, C, I> {
-    #[inline]
-    fn len(&self) -> usize {
-        let size = (self.range.end - self.range.start) as usize;
-        size as usize
-    }
-}
-impl<'a, C: Column<I>, I> FusedIterator for ColumnIter<'a, C, I> {}
-
-impl Column for Vec<u64> {
-    fn get_val(&self, position: u64) -> u64 {
-        self[position as usize]
-    }
-    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
-        Box::new((self as &[u64]).iter().cloned())
-    }
-    fn min_value(&self) -> u64 {
-        self.iter().min().unwrap_or(0)
-    }
-
-    fn max_value(&self) -> u64 {
-        self.iter().max().unwrap_or(0)
-    }
-
-    fn num_vals(&self) -> u64 {
-        self.len() as u64
+impl<'a> From<&'a [u64]> for VecColum<'a> {
+    fn from(data: &'a [u64]) -> Self {
+        Self(data)
     }
 }

 #[cfg(test)]
 mod tests {
-    use proptest::arbitrary::any;
-    use proptest::proptest;
+    use proptest::prelude::*;
+    use proptest::strategy::Strategy;
+    use proptest::{prop_oneof, proptest};

     use crate::bitpacked::BitpackedCodec;
     use crate::blockwise_linear::BlockwiseLinearCodec;
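The same commit drops the blanket `impl Column for &[u64]` / `impl Column for Vec<u64>` in favor of a crate-private `VecColum` newtype, so the trait impl stops being part of the crate's public surface for standard types. The pattern in isolation (a sketch, not tantivy's code):

```rust
// Newtype wrapper: the trait impl lives on a local type rather than on
// `&[u64]` itself, so it can change or disappear without breaking users.
struct VecColum<'a>(&'a [u64]);

impl<'a> From<&'a [u64]> for VecColum<'a> {
    fn from(data: &'a [u64]) -> Self {
        Self(data)
    }
}

fn main() {
    let data: &[u64] = &[1, 2, 3];
    let col: VecColum = data.into(); // the tests construct it exactly like this
    assert_eq!(col.0.len(), 3);
}
```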
@@ -198,10 +134,10 @@ mod tests {
         data: &[u64],
         name: &str,
     ) -> Option<(f32, f32)> {
-        let estimation = Codec::estimate(&data)?;
+        let estimation = Codec::estimate(&VecColum::from(data))?;

         let mut out: Vec<u8> = Vec::new();
-        Codec::serialize(&mut out, &data).unwrap();
+        Codec::serialize(&mut out, &VecColum::from(data)).unwrap();

         let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
@@ -219,21 +155,32 @@ mod tests {
     }

     proptest! {
+        #![proptest_config(ProptestConfig::with_cases(100))]
         #[test]
-        fn test_proptest_small(data in proptest::collection::vec(any::<u64>(), 1..10)) {
+        fn test_proptest_small(data in proptest::collection::vec(num_strategy(), 1..10)) {
             create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
             create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
             create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
         }
+    }

+    proptest! {
+        #![proptest_config(ProptestConfig::with_cases(10))]
         #[test]
-        fn test_proptest_large(data in proptest::collection::vec(any::<u64>(), 1..6000)) {
+        fn test_proptest_large(data in proptest::collection::vec(num_strategy(), 1..6000)) {
             create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
             create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
             create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
         }
+
     }

+    fn num_strategy() -> impl Strategy<Value = u64> {
+        prop_oneof![
+            1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10) ),
+            1 => prop::num::u64::ANY.prop_map(|num| num % 10 ),
+            20 => prop::num::u64::ANY,
+        ]
+    }
+
     pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
         let mut data_and_names = vec![];
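The uniform `any::<u64>()` generator is swapped for a weighted `num_strategy`: `prop_oneof!` picks each branch with probability proportional to its weight, so roughly 1/22 of samples land within 10 of `u64::MAX`, 1/22 within 10 of zero, and the remaining 20/22 are uniform. The extremes exercise the estimators' overflow guards far more often than uniform sampling would. A sketch of wiring such a strategy into a property test (assumes the `proptest` crate; the property itself is illustrative):

```rust
use proptest::prelude::*;

// Weighted generator: mostly uniform u64s, with deliberate clusters
// near both ends of the range.
fn num_strategy() -> impl Strategy<Value = u64> {
    prop_oneof![
        1 => any::<u64>().prop_map(|num| u64::MAX - (num % 10)),
        1 => any::<u64>().prop_map(|num| num % 10),
        20 => any::<u64>(),
    ]
}

proptest! {
    #[test]
    fn offset_never_overflows_silently(v in num_strategy()) {
        // checked arithmetic turns would-be overflow into None
        let shifted = v.checked_add(10);
        prop_assert!(shifted.is_some() || v > u64::MAX - 10);
    }
}
```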
@@ -287,6 +234,7 @@ mod tests {
     #[test]
     fn estimation_good_interpolation_case() {
         let data = (10..=20000_u64).collect::<Vec<_>>();
+        let data: VecColum = data.as_slice().into();

         let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, 0.01);
@@ -300,8 +248,9 @@ mod tests {
     }
     #[test]
     fn estimation_test_bad_interpolation_case() {
-        let data = vec![200, 10, 10, 10, 10, 1000, 20];
+        let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];

+        let data: VecColum = data.into();
         let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, 0.32);
@@ -312,6 +261,7 @@ mod tests {
     fn estimation_test_bad_interpolation_case_monotonically_increasing() {
         let mut data: Vec<u64> = (200..=20000_u64).collect();
         data.push(1_000_000);
+        let data: VecColum = data.as_slice().into();

         // in this case the linear interpolation can't in fact not be worse than bitpacking,
         // but the estimator adds some threshold, which leads to estimated worse behavior
@@ -193,6 +193,7 @@ impl FastFieldCodec for LinearCodec {
     /// estimation for linear interpolation is hard because, you don't know
     /// where the local maxima for the deviation of the calculated value are and
     /// the offset to shift all values to >=0 is also unknown.
+    #[allow(clippy::question_mark)]
     fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
         if fastfield_accessor.num_vals() < 3 {
             return None; // disable compressor for this case
@@ -204,9 +205,13 @@ impl FastFieldCodec for LinearCodec {
         // If this doesn't overflow the algorithm should be fine
         let theorethical_maximum_offset =
             fastfield_accessor.max_value() - fastfield_accessor.min_value();
-        fastfield_accessor
-            .max_value()
-            .checked_add(theorethical_maximum_offset)?;
+        if fastfield_accessor
+            .max_value()
+            .checked_add(theorethical_maximum_offset)
+            .is_none()
+        {
+            return None;
+        }

         let first_val = fastfield_accessor.get_val(0);
         let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
@@ -254,6 +259,8 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {

 #[cfg(test)]
 mod tests {
+    use rand::RngCore;
+
     use super::*;
     use crate::tests::get_codec_test_datasets;
@@ -336,10 +343,9 @@ mod tests {
     #[test]
     fn linear_interpol_fast_field_rand() {
-        for _ in 0..5000 {
-            let mut data = (0..10_000)
-                .map(|_| rand::random::<u64>())
-                .collect::<Vec<_>>();
+        let mut rng = rand::thread_rng();
+        for _ in 0..50 {
+            let mut data = (0..10_000).map(|_| rng.next_u64()).collect::<Vec<_>>();
             create_and_validate(&data, "random");
             data.reverse();
             create_and_validate(&data, "random");
@@ -3,9 +3,33 @@ extern crate prettytable;
 use fastfield_codecs::bitpacked::BitpackedCodec;
 use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
 use fastfield_codecs::linear::LinearCodec;
-use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats};
+use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
 use prettytable::{Cell, Row, Table};

+struct Data<'a>(&'a [u64]);
+
+impl<'a> Column for Data<'a> {
+    fn get_val(&self, position: u64) -> u64 {
+        self.0[position as usize]
+    }
+
+    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
+        Box::new(self.0.iter().cloned())
+    }
+
+    fn min_value(&self) -> u64 {
+        *self.0.iter().min().unwrap_or(&0)
+    }
+
+    fn max_value(&self) -> u64 {
+        *self.0.iter().max().unwrap_or(&0)
+    }
+
+    fn num_vals(&self) -> u64 {
+        self.0.len() as u64
+    }
+}
+
 fn main() {
     let mut table = Table::new();
@@ -86,10 +110,11 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
 pub fn serialize_with_codec<C: FastFieldCodec>(
     data: &[u64],
 ) -> Option<(f32, f32, FastFieldCodecType)> {
+    let data = Data(data);
     let estimation = C::estimate(&data)?;
     let mut out = Vec::new();
     C::serialize(&mut out, &data).unwrap();
-    let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
+    let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
     Some((estimation, actual_compression, C::CODEC_TYPE))
 }
@@ -6,7 +6,7 @@ use std::{fmt, io, mem};
 use stable_deref_trait::StableDeref;

 /// An OwnedBytes simply wraps an object that owns a slice of data and exposes
-/// this data as a static slice.
+/// this data as a slice.
 ///
 /// The backing object is required to be `StableDeref`.
 #[derive(Clone)]
@@ -242,13 +242,13 @@ impl TermBuckets {
     fn increment_bucket(
         &mut self,
-        term_ids: impl Iterator<Item = u64>,
+        term_ids: &[u64],
         doc: DocId,
         sub_aggregation: &AggregationsWithAccessor,
         bucket_count: &BucketCount,
         blueprint: &Option<SegmentAggregationResultsCollector>,
     ) -> crate::Result<()> {
-        for term_id in term_ids {
+        for &term_id in term_ids {
             let entry = self.entries.entry(term_id as u32).or_insert_with(|| {
                 bucket_count.add_count(1);
@@ -432,30 +432,39 @@ impl SegmentTermCollector {
                 .as_multi()
                 .expect("unexpected fast field cardinatility");
             let mut iter = doc.chunks_exact(4);
+            let mut vals1 = vec![];
+            let mut vals2 = vec![];
+            let mut vals3 = vec![];
+            let mut vals4 = vec![];
             for docs in iter.by_ref() {
+                accessor.get_vals(docs[0], &mut vals1);
+                accessor.get_vals(docs[1], &mut vals2);
+                accessor.get_vals(docs[2], &mut vals3);
+                accessor.get_vals(docs[3], &mut vals4);
+
                 self.term_buckets.increment_bucket(
-                    accessor.get_vals(docs[0]),
+                    &vals1,
                     docs[0],
                     &bucket_with_accessor.sub_aggregation,
                     &bucket_with_accessor.bucket_count,
                     &self.blueprint,
                 )?;
                 self.term_buckets.increment_bucket(
-                    accessor.get_vals(docs[1]),
+                    &vals2,
                     docs[1],
                     &bucket_with_accessor.sub_aggregation,
                     &bucket_with_accessor.bucket_count,
                     &self.blueprint,
                 )?;
                 self.term_buckets.increment_bucket(
-                    accessor.get_vals(docs[2]),
+                    &vals3,
                     docs[2],
                     &bucket_with_accessor.sub_aggregation,
                     &bucket_with_accessor.bucket_count,
                     &self.blueprint,
                 )?;
                 self.term_buckets.increment_bucket(
-                    accessor.get_vals(docs[3]),
+                    &vals4,
                     docs[3],
                     &bucket_with_accessor.sub_aggregation,
                     &bucket_with_accessor.bucket_count,
@@ -463,8 +472,10 @@ impl SegmentTermCollector {
                 )?;
             }
             for &doc in iter.remainder() {
+                accessor.get_vals(doc, &mut vals1);
+
                 self.term_buckets.increment_bucket(
-                    accessor.get_vals(doc),
+                    &vals1,
                     doc,
                     &bucket_with_accessor.sub_aggregation,
                     &bucket_with_accessor.bucket_count,
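The two hunks above thread reusable `Vec` buffers through the unrolled loop instead of passing iterators: `get_vals` now writes into a caller-owned `Vec`, so the per-document allocation disappears. The pattern in isolation (a sketch; `MultiReader` is a hypothetical stand-in for tantivy's multi-value reader):

```rust
// Buffer-reuse pattern: allocate once outside the loop, overwrite per doc.
struct MultiReader {
    vals: Vec<u64>, // pretend backing storage
}

impl MultiReader {
    fn get_vals(&self, _doc: u32, out: &mut Vec<u64>) {
        out.clear(); // implementations overwrite rather than append
        out.extend_from_slice(&self.vals);
    }
}

fn main() {
    let reader = MultiReader { vals: vec![1, 2, 3] };
    let mut buf = Vec::new(); // allocated once...
    for doc in 0u32..4 {
        reader.get_vals(doc, &mut buf); // ...reused for every doc
        let _sum: u64 = buf.iter().sum();
    }
}
```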
@@ -1323,15 +1334,11 @@ mod bench {
             max_bucket_count: 1_000_001u32,
         };
         b.iter(|| {
-            collector
-                .increment_bucket(
-                    vals.iter().cloned(),
-                    0,
-                    &aggregations_with_accessor,
-                    &bucket_count,
-                    &None,
-                )
-                .unwrap();
+            for &val in &vals {
+                collector
+                    .increment_bucket(&[val], 0, &aggregations_with_accessor, &bucket_count, &None)
+                    .unwrap();
+            }
         })
     }
@@ -7,6 +7,7 @@ use std::sync::Arc;

 use super::segment::Segment;
 use super::IndexSettings;
+use crate::core::single_segment_index_writer::SingleSegmentIndexWriter;
 use crate::core::{
     Executor, IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory, META_FILEPATH,
 };
@@ -16,7 +17,7 @@ use crate::directory::MmapDirectory;
 use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
 use crate::error::{DataCorruption, TantivyError};
 use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
-use crate::indexer::segment_updater::save_new_metas;
+use crate::indexer::segment_updater::save_metas;
 use crate::reader::{IndexReader, IndexReaderBuilder};
 use crate::schema::{Field, FieldType, Schema};
 use crate::tokenizer::{TextAnalyzer, TokenizerManager};
@@ -47,6 +48,34 @@ fn load_metas(
         .map_err(From::from)
 }

+/// Save the index meta file.
+/// This operation is atomic :
+/// Either
+/// - it fails, in which case an error is returned,
+///   and the `meta.json` remains untouched,
+/// - it succeeds, and `meta.json` is written
+///   and flushed.
+///
+/// This method is not part of tantivy's public API
+fn save_new_metas(
+    schema: Schema,
+    index_settings: IndexSettings,
+    directory: &dyn Directory,
+) -> crate::Result<()> {
+    save_metas(
+        &IndexMeta {
+            index_settings,
+            segments: Vec::new(),
+            schema,
+            opstamp: 0u64,
+            payload: None,
+        },
+        directory,
+    )?;
+    directory.sync_directory()?;
+    Ok(())
+}
+
 /// IndexBuilder can be used to create an index.
 ///
 /// Use in conjunction with `SchemaBuilder`. Global index settings
@@ -135,6 +164,25 @@ impl IndexBuilder {
         self.create(mmap_directory)
     }

+    /// Dragons ahead!!!
+    ///
+    /// The point of this API is to let users create a simple index with a single segment
+    /// and without starting any thread.
+    ///
+    /// Do not use this method if you are not sure what you are doing.
+    ///
+    /// It expects an originally empty directory, and will not run any GC operation.
+    #[doc(hidden)]
+    pub fn single_segment_index_writer(
+        self,
+        dir: impl Into<Box<dyn Directory>>,
+        mem_budget: usize,
+    ) -> crate::Result<SingleSegmentIndexWriter> {
+        let index = self.create(dir)?;
+        let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
+        Ok(index_simple_writer)
+    }
+
     /// Creates a new index in a temp directory.
     ///
     /// The index will use the `MMapDirectory` in a newly created directory.
@@ -580,10 +628,12 @@ impl fmt::Debug for Index {

 #[cfg(test)]
 mod tests {
+    use crate::collector::Count;
     use crate::directory::{RamDirectory, WatchCallback};
-    use crate::schema::{Field, Schema, INDEXED, TEXT};
+    use crate::query::TermQuery;
+    use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, TEXT};
     use crate::tokenizer::TokenizerManager;
-    use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};
+    use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, Term};

     #[test]
     fn test_indexer_for_field() {
@@ -849,4 +899,28 @@ mod tests {
         );
         Ok(())
     }
+
+    #[test]
+    fn test_single_segment_index_writer() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let directory = RamDirectory::default();
+        let mut single_segment_index_writer = Index::builder()
+            .schema(schema)
+            .single_segment_index_writer(directory, 10_000_000)?;
+        for _ in 0..10 {
+            let doc = doc!(text_field=>"hello");
+            single_segment_index_writer.add_document(doc)?;
+        }
+        let index = single_segment_index_writer.finalize()?;
+        let searcher = index.reader()?.searcher();
+        let term_query = TermQuery::new(
+            Term::from_field_text(text_field, "hello"),
+            IndexRecordOption::Basic,
+        );
+        let count = searcher.search(&term_query, &Count)?;
+        assert_eq!(count, 10);
+        Ok(())
+    }
 }
@@ -7,6 +7,7 @@ mod segment;
 mod segment_component;
 mod segment_id;
 mod segment_reader;
+mod single_segment_index_writer;

 use std::path::Path;
@@ -23,6 +24,7 @@ pub use self::segment::Segment;
 pub use self::segment_component::SegmentComponent;
 pub use self::segment_id::SegmentId;
 pub use self::segment_reader::SegmentReader;
+pub use self::single_segment_index_writer::SingleSegmentIndexWriter;

 /// The meta file contains all the information about the list of segments and the schema
 /// of the index.
src/core/single_segment_index_writer.rs — new file, 47 lines
@@ -0,0 +1,47 @@
+use crate::indexer::operation::AddOperation;
+use crate::indexer::segment_updater::save_metas;
+use crate::indexer::SegmentWriter;
+use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
+
+#[doc(hidden)]
+pub struct SingleSegmentIndexWriter {
+    segment_writer: SegmentWriter,
+    segment: Segment,
+    opstamp: Opstamp,
+}
+
+impl SingleSegmentIndexWriter {
+    pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
+        let segment = index.new_segment();
+        let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
+        Ok(Self {
+            segment_writer,
+            segment,
+            opstamp: 0,
+        })
+    }
+
+    pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
+        let opstamp = self.opstamp;
+        self.opstamp += 1;
+        self.segment_writer
+            .add_document(AddOperation { opstamp, document })
+    }
+
+    pub fn finalize(self) -> crate::Result<Index> {
+        let max_doc = self.segment_writer.max_doc();
+        self.segment_writer.finalize()?;
+        let segment: Segment = self.segment.with_max_doc(max_doc);
+        let index = segment.index();
+        let index_meta = IndexMeta {
+            index_settings: index.settings().clone(),
+            segments: vec![segment.meta().clone()],
+            schema: index.schema(),
+            opstamp: 0,
+            payload: None,
+        };
+        save_metas(&index_meta, index.directory())?;
+        index.directory().sync_directory()?;
+        Ok(segment.index().clone())
+    }
+}
@@ -76,8 +76,7 @@ impl FacetReader {
     /// Return the list of facet ordinals associated to a document.
     pub fn facet_ords(&self, doc: DocId, output: &mut Vec<u64>) {
-        output.clear();
-        output.extend(self.term_ords.get_vals(doc))
+        self.term_ords.get_vals(doc, output);
     }
 }
@@ -477,7 +477,8 @@ mod tests {
         for (doc, i) in (-100i64..10_000i64).enumerate() {
             assert_eq!(fast_field_reader.get_val(doc as u64), i);
         }
-        let buffer: Vec<i64> = fast_field_reader.get_range(53..154).collect();
+        let mut buffer = vec![0i64; 100];
+        fast_field_reader.get_range(53, &mut buffer[..]);
         for i in 0..100 {
             assert_eq!(buffer[i], -100i64 + 53i64 + i as i64);
         }
@@ -606,7 +607,9 @@ mod tests {
         let mut all = vec![];

         for doc in docs {
-            all.extend(ff.get_vals(doc));
+            let mut out = vec![];
+            ff.get_vals(doc, &mut out);
+            all.extend(out);
         }
         all
     }
@@ -651,7 +654,8 @@ mod tests {
         vec![1, 0, 0, 0, 1, 2]
     );

-    let out = text_fast_field.get_vals(3u32).collect::<Vec<_>>();
+    let mut out = vec![];
+    text_fast_field.get_vals(3, &mut out);
     assert_eq!(out, vec![0, 1]);

     let inverted_index = segment_reader.inverted_index(text_field)?;
@@ -836,20 +840,22 @@ mod tests {
     let fast_fields = segment_reader.fast_fields();
     let date_fast_field = fast_fields.date(date_field).unwrap();
     let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
+    let mut dates = vec![];
     {
         assert_eq!(date_fast_field.get_val(0).into_timestamp_micros(), 1i64);
-        let dates = dates_fast_field.get_vals(0u32).collect::<Vec<_>>();
+        dates_fast_field.get_vals(0u32, &mut dates);
         assert_eq!(dates.len(), 2);
         assert_eq!(dates[0].into_timestamp_micros(), 2i64);
         assert_eq!(dates[1].into_timestamp_micros(), 3i64);
     }
     {
         assert_eq!(date_fast_field.get_val(1).into_timestamp_micros(), 4i64);
-        assert!(dates_fast_field.get_vals(1u32).next().is_none());
+        dates_fast_field.get_vals(1u32, &mut dates);
+        assert!(dates.is_empty());
     }
     {
         assert_eq!(date_fast_field.get_val(2).into_timestamp_micros(), 0i64);
-        let dates = dates_fast_field.get_vals(2u32).collect::<Vec<_>>();
+        dates_fast_field.get_vals(2u32, &mut dates);
         assert_eq!(dates.len(), 2);
         assert_eq!(dates[0].into_timestamp_micros(), 5i64);
         assert_eq!(dates[1].into_timestamp_micros(), 6i64);
@@ -972,98 +978,20 @@ mod tests {

 #[cfg(all(test, feature = "unstable"))]
 mod bench {
-    use std::collections::HashMap;
-    use std::path::Path;
-
+    use fastfield_codecs::Column;
     use test::{self, Bencher};

-    use super::tests::{generate_permutation, FIELD, SCHEMA};
+    use super::tests::generate_permutation;
     use super::*;
-    use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
-    use crate::schema::{NumericOptions, Schema};
-    use crate::Document;
-
-    fn multi_values(num_docs: usize, vals_per_doc: usize) -> Vec<Vec<u64>> {
-        let mut vals = vec![];
-        for _i in 0..num_docs {
-            let mut block = vec![];
-            for j in 0..vals_per_doc {
-                block.push(j as u64);
-            }
-            vals.push(block);
-        }
-
-        vals
-    }
-
-    #[bench]
-    fn bench_multi_value_fflookup(b: &mut Bencher) {
-        let num_docs = 100_000;
-
-        let path = Path::new("test");
-        let directory: RamDirectory = RamDirectory::create();
-        {
-            let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
-            let mut schema_builder = Schema::builder();
-            let field = schema_builder.add_u64_field("field", options);
-            let schema = schema_builder.build();
-
-            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
-            let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
-            for block in &multi_values(num_docs, 3) {
-                let mut doc = Document::new();
-                for val in block {
-                    doc.add_u64(field, *val);
-                }
-                fast_field_writers.add_document(&doc);
-            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new(), None)
-                .unwrap();
-            serializer.close().unwrap();
-        }
-        let file = directory.open_read(&path).unwrap();
-        {
-            let fast_fields_composite = CompositeFile::open(&file).unwrap();
-            let data_idx = fast_fields_composite.open_read_with_idx(*FIELD, 0).unwrap();
-            let idx_reader = DynamicFastFieldReader::<u64>::open(data_idx).unwrap();
-
-            let data_vals = fast_fields_composite.open_read_with_idx(*FIELD, 1).unwrap();
-            let vals_reader = DynamicFastFieldReader::<u64>::open(data_vals).unwrap();
-            let fast_field_reader = MultiValuedFastFieldReader::open(idx_reader, vals_reader);
-            b.iter(|| {
-                let mut sum = 0u64;
-                for i in 0u32..num_docs as u32 {
-                    sum += fast_field_reader.get_vals(i).sum::<u64>();
-                }
-                sum
-            });
-        }
-    }
+    use crate::fastfield::tests::generate_permutation_gcd;

     #[bench]
-    fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
+    fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
         let permutation = generate_permutation();
+        let n = permutation.len();
         b.iter(|| {
-            let n = test::black_box(7000u32);
             let mut a = 0u64;
             for i in (0u32..n / 7).map(|v| v * 7) {
                 a ^= permutation[i as usize];
             }
             a
         });
     }

     #[bench]
     fn bench_intfastfield_veclookup(b: &mut Bencher) {
         let permutation = generate_permutation();
         b.iter(|| {
             let n = test::black_box(1000u32);
             let mut a = 0u64;
-            for _ in 0u32..n {
+            for _ in 0..n {
                 a = permutation[a as usize];
             }
             a
@@ -1071,102 +999,83 @@ mod bench {
     }

     #[bench]
-    fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
-        let path = Path::new("test");
+    fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
         let permutation = generate_permutation();
-        let directory: RamDirectory = RamDirectory::create();
-        {
-            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
-            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
-            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new(), None)
-                .unwrap();
-            serializer.close().unwrap();
-        }
-        let file = directory.open_read(&path).unwrap();
-        {
-            let fast_fields_composite = CompositeFile::open(&file).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
-
-            b.iter(|| {
-                let n = test::black_box(7000u32);
-                let mut a = 0u64;
-                for i in (0u32..n / 7).map(|val| val * 7) {
-                    a ^= fast_field_reader.get_val(i as u64);
-                }
-                a
-            });
-        }
+        let n = permutation.len();
+        let column = DynamicFastFieldReader::from(permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for _ in 0..n {
+                a = column.get_val(a as u64);
+            }
+            a
+        });
     }

     #[bench]
-    fn bench_intfastfield_fflookup(b: &mut Bencher) {
-        let path = Path::new("test");
+    fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
         let permutation = generate_permutation();
-        let directory: RamDirectory = RamDirectory::create();
-        {
-            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
-            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
-            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new(), None)
-                .unwrap();
-            serializer.close().unwrap();
-        }
-        let file = directory.open_read(&path).unwrap();
-        {
-            let fast_fields_composite = CompositeFile::open(&file).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
-
-            b.iter(|| {
-                let mut a = 0u32;
-                for i in 0u64..permutation.len() as u64 {
-                    a = fast_field_reader.get_val(i) as u32;
-                }
-                a
-            });
-        }
+        let n = permutation.len();
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in (0..n / 7).map(|val| val * 7) {
+                a += permutation[i as usize];
+            }
+            a
+        });
     }

     #[bench]
-    fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
-        let path = Path::new("test");
+    fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
         let permutation = generate_permutation();
+        let n = permutation.len();
+        let column = DynamicFastFieldReader::from(permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in (0..n / 7).map(|val| val * 7) {
+                a += column.get_val(i as u64);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        let column = DynamicFastFieldReader::from(permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in 0u64..n as u64 {
+                a += column.get_val(i);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
+        let permutation = generate_permutation_gcd();
-        let directory: RamDirectory = RamDirectory::create();
-        {
-            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
-            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
-            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new(), None)
-                .unwrap();
-            serializer.close().unwrap();
-        }
-        let file = directory.open_read(&path).unwrap();
-        {
-            let fast_fields_composite = CompositeFile::open(&file).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
-
-            b.iter(|| {
-                let mut a = 0u32;
-                for i in 0u32..permutation.len() as u32 {
-                    a = fast_field_reader.get_val(i as u64) as u32;
-                }
-                a
-            });
-        }
-    }
+        let n = permutation.len();
+        let column = DynamicFastFieldReader::from(permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in 0..n as u64 {
+                a += column.get_val(i);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in 0..permutation.len() {
+                a += permutation[i as usize] as u64;
+            }
+            a
+        });
+    }
 }
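The renamed benchmarks all follow one shape: build the column outside the closure, keep only the access pattern under measurement inside `b.iter`, and return the accumulator so the loop is not optimized away. A minimal sketch of that shape (nightly-only, since `#[bench]` needs the unstable `test` crate; `generate_permutation` here is a stand-in for the test helper, not the real one):

```rust
#![feature(test)]
extern crate test;

use test::Bencher;

fn generate_permutation() -> Vec<u64> {
    (0..10_000u64).rev().collect() // stand-in data, not the real helper
}

#[bench]
fn bench_stride7_vec(b: &mut Bencher) {
    let permutation = generate_permutation(); // setup happens once, unmeasured
    let n = permutation.len();
    b.iter(|| {
        let mut a = 0u64;
        for i in (0..n / 7).map(|val| val * 7) {
            a += permutation[i]; // only the strided reads are timed
        }
        a // returning the result keeps the loop from being optimized out
    });
}
```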
@@ -36,17 +36,19 @@ mod tests {
         let searcher = index.reader()?.searcher();
         let segment_reader = searcher.segment_reader(0);
+        let mut vals = Vec::new();
         let multi_value_reader = segment_reader.fast_fields().u64s(field)?;
         {
-            let vals = multi_value_reader.get_vals(2u32).collect::<Vec<_>>();
+            multi_value_reader.get_vals(2, &mut vals);
             assert_eq!(&vals, &[4u64]);
         }
         {
-            let vals = multi_value_reader.get_vals(0u32).collect::<Vec<_>>();
+            multi_value_reader.get_vals(0, &mut vals);
             assert_eq!(&vals, &[1u64, 3u64]);
         }
         {
-            assert!(multi_value_reader.get_vals(1u32).next().is_none());
+            multi_value_reader.get_vals(1, &mut vals);
+            assert!(vals.is_empty());
         }
         Ok(())
     }
@@ -211,13 +213,15 @@ mod tests {
         let searcher = index.reader()?.searcher();
         let segment_reader = searcher.segment_reader(0);
+        let mut vals = Vec::new();
         let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
-        let vals = multi_value_reader.get_vals(2u32).collect::<Vec<_>>();
+        multi_value_reader.get_vals(2, &mut vals);
         assert_eq!(&vals, &[-4i64]);
-        let vals = multi_value_reader.get_vals(0u32).collect::<Vec<_>>();
+        multi_value_reader.get_vals(0, &mut vals);
         assert_eq!(&vals, &[1i64, 3i64]);
-        assert!(multi_value_reader.get_vals(1u32).next().is_none());
-        let vals = multi_value_reader.get_vals(3u32).collect::<Vec<_>>();
+        multi_value_reader.get_vals(1, &mut vals);
+        assert!(vals.is_empty());
+        multi_value_reader.get_vals(3, &mut vals);
         assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
         Ok(())
     }
@@ -241,13 +245,15 @@ mod tests {
         let searcher = index.reader()?.searcher();
         let segment_reader = searcher.segment_reader(0);
+        let mut vals = Vec::new();
         let multi_value_reader = segment_reader.fast_fields().bools(bool_field).unwrap();
-        let vals = multi_value_reader.get_vals(2u32).collect::<Vec<_>>();
+        multi_value_reader.get_vals(2, &mut vals);
         assert_eq!(&vals, &[false]);
-        let vals = multi_value_reader.get_vals(0u32).collect::<Vec<_>>();
+        multi_value_reader.get_vals(0, &mut vals);
         assert_eq!(&vals, &[true, false]);
-        assert!(multi_value_reader.get_vals(1u32).next().is_none());
-        let vals = multi_value_reader.get_vals(3u32).collect::<Vec<_>>();
+        multi_value_reader.get_vals(1, &mut vals);
+        assert!(vals.is_empty());
+        multi_value_reader.get_vals(3, &mut vals);
         assert_eq!(&vals, &[true, true, false]);
         Ok(())
     }
@@ -18,9 +18,7 @@ pub struct MultiValuedFastFieldReader<Item: FastValue> {
     vals_reader: DynamicFastFieldReader<Item>,
 }

-impl<Item: FastValue> MultiValuedFastFieldReader<Item>
-where DynamicFastFieldReader<Item>: Column<Item>
-{
+impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
     pub(crate) fn open(
         idx_reader: DynamicFastFieldReader<u64>,
         vals_reader: DynamicFastFieldReader<Item>,
@@ -43,9 +41,17 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
     /// Returns the array of values associated to the given `doc`.
     #[inline]
-    pub fn get_vals(&self, doc: DocId) -> impl Iterator<Item = Item> + '_ {
+    fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<Item>) {
+        let len = (range.end - range.start) as usize;
+        vals.resize(len, Item::make_zero());
+        self.vals_reader.get_range(range.start, &mut vals[..]);
+    }
+
+    /// Returns the array of values associated to the given `doc`.
+    #[inline]
+    pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
         let range = self.range(doc);
-        self.vals_reader.get_range(range)
+        self.get_vals_for_range(range, vals);
     }

     /// Returns the minimum value for this fast field.
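The new private `get_vals_for_range` sizes the output `Vec` first and then lets `get_range` overwrite it in place, so a buffer handed in by the caller is reused across documents. In isolation (a sketch; `read_range` stands in for `vals_reader.get_range`):

```rust
// Resize-then-fill: grow or shrink the Vec to the doc's value count,
// then overwrite its contents through a slice.
fn read_range(start: u64, out: &mut [u64]) {
    for (i, slot) in out.iter_mut().enumerate() {
        *slot = start + i as u64; // fake data source
    }
}

fn get_vals_for_range(range: std::ops::Range<u64>, vals: &mut Vec<u64>) {
    let len = (range.end - range.start) as usize;
    vals.resize(len, 0); // old contents don't matter; they get overwritten
    read_range(range.start, &mut vals[..]);
}

fn main() {
    let mut vals = Vec::new();
    get_vals_for_range(5..9, &mut vals);
    assert_eq!(vals, vec![5, 6, 7, 8]);
}
```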
@@ -40,39 +40,40 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
         mut bytes: OwnedBytes,
         codec_type: FastFieldCodecType,
     ) -> crate::Result<DynamicFastFieldReader<Item>> {
-        let reader =
-            match codec_type {
-                FastFieldCodecType::Bitpacked => DynamicFastFieldReader::Bitpacked(
-                    BitpackedCodec::open_from_bytes(bytes)?.into(),
-                ),
-                FastFieldCodecType::Linear => {
-                    DynamicFastFieldReader::Linear(LinearCodec::open_from_bytes(bytes)?.into())
-                }
-                FastFieldCodecType::BlockwiseLinear => DynamicFastFieldReader::BlockwiseLinear(
-                    BlockwiseLinearCodec::open_from_bytes(bytes)?.into(),
-                ),
-                FastFieldCodecType::Gcd => {
-                    let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
-                    match codec_type {
-                        FastFieldCodecType::Bitpacked => DynamicFastFieldReader::BitpackedGCD(
-                            open_gcd_from_bytes::<BitpackedCodec>(bytes)?.into(),
-                        ),
-                        FastFieldCodecType::Linear => DynamicFastFieldReader::LinearGCD(
-                            open_gcd_from_bytes::<LinearCodec>(bytes)?.into(),
-                        ),
-                        FastFieldCodecType::BlockwiseLinear => {
-                            DynamicFastFieldReader::BlockwiseLinearGCD(
-                                open_gcd_from_bytes::<BlockwiseLinearCodec>(bytes)?.into(),
-                            )
-                        }
-                        FastFieldCodecType::Gcd => return Err(DataCorruption::comment_only(
-                            "Gcd codec wrapped into another gcd codec. This combination is not \
-                             allowed.",
-                        )
-                        .into()),
-                    }
-                }
-            };
+        let reader = match codec_type {
+            FastFieldCodecType::Bitpacked => {
+                DynamicFastFieldReader::Bitpacked(BitpackedCodec::open_from_bytes(bytes)?.into())
+            }
+            FastFieldCodecType::Linear => {
+                DynamicFastFieldReader::Linear(LinearCodec::open_from_bytes(bytes)?.into())
+            }
+            FastFieldCodecType::BlockwiseLinear => DynamicFastFieldReader::BlockwiseLinear(
+                BlockwiseLinearCodec::open_from_bytes(bytes)?.into(),
+            ),
+            FastFieldCodecType::Gcd => {
+                let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
+                match codec_type {
+                    FastFieldCodecType::Bitpacked => DynamicFastFieldReader::BitpackedGCD(
+                        open_gcd_from_bytes::<BitpackedCodec>(bytes)?.into(),
+                    ),
+                    FastFieldCodecType::Linear => DynamicFastFieldReader::LinearGCD(
+                        open_gcd_from_bytes::<LinearCodec>(bytes)?.into(),
+                    ),
+                    FastFieldCodecType::BlockwiseLinear => {
+                        DynamicFastFieldReader::BlockwiseLinearGCD(
+                            open_gcd_from_bytes::<BlockwiseLinearCodec>(bytes)?.into(),
+                        )
+                    }
+                    FastFieldCodecType::Gcd => {
+                        return Err(DataCorruption::comment_only(
+                            "Gcd codec wrapped into another gcd codec. This combination is not \
+                             allowed.",
+                        )
+                        .into())
+                    }
+                }
+            }
+        };
         Ok(reader)
     }
@@ -96,6 +97,17 @@ impl<Item: FastValue> Column<Item> for DynamicFastFieldReader<Item> {
             Self::BlockwiseLinearGCD(reader) => reader.get_val(idx),
         }
     }
+    #[inline]
+    fn get_range(&self, start: u64, output: &mut [Item]) {
+        match self {
+            Self::Bitpacked(reader) => reader.get_range(start, output),
+            Self::Linear(reader) => reader.get_range(start, output),
+            Self::BlockwiseLinear(reader) => reader.get_range(start, output),
+            Self::BitpackedGCD(reader) => reader.get_range(start, output),
+            Self::LinearGCD(reader) => reader.get_range(start, output),
+            Self::BlockwiseLinearGCD(reader) => reader.get_range(start, output),
+        }
+    }
     fn min_value(&self) -> Item {
         match self {
             Self::Bitpacked(reader) => reader.min_value(),
@@ -155,6 +167,24 @@ impl<Item: FastValue, D: Column> FastFieldReaderCodecWrapper<Item, D> {
         let data = self.reader.get_val(idx);
         Item::from_u64(data)
     }
+
+    /// Internally `multivalued` also use SingleValue Fast fields.
+    /// It works as follows... A first column contains the list of start index
+    /// for each document, a second column contains the actual values.
+    ///
+    /// The values associated to a given doc, are then
+    /// `second_column[first_column.get(doc)..first_column.get(doc+1)]`.
+    ///
+    /// Which means single value fast field reader can be indexed internally with
+    /// something different from a `DocId`. For this use case, we want to use `u64`
+    /// values.
+    ///
+    /// See `get_range` for an actual documentation about this method.
+    pub(crate) fn get_range_u64(&self, start: u64, output: &mut [Item]) {
+        for (i, out) in output.iter_mut().enumerate() {
+            *out = self.get_u64(start + (i as u64));
+        }
+    }
 }

 impl<Item: FastValue, C: Column + Clone> Column<Item> for FastFieldReaderCodecWrapper<Item, C> {
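The doc comment above describes the two-column encoding of multi-valued fast fields: a first column of start offsets indexed by doc, a second column holding the actual values. A toy rendering of that layout:

```rust
// Two-column multi-value layout: values for `doc` live at
// values[offsets[doc]..offsets[doc + 1]].
fn main() {
    let offsets: Vec<u64> = vec![0, 2, 2, 5]; // doc 0 -> [0..2), doc 1 -> [2..2) (empty), doc 2 -> [2..5)
    let values: Vec<u64> = vec![7, 8, 4, 4, 9];

    let doc = 2usize;
    let range = offsets[doc] as usize..offsets[doc + 1] as usize;
    assert_eq!(&values[range], &[4, 4, 9]);

    let empty_doc = 1usize;
    let range = offsets[empty_doc] as usize..offsets[empty_doc + 1] as usize;
    assert!(values[range].is_empty());
}
```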
@@ -170,6 +200,23 @@ impl<Item: FastValue, C: Column + Clone> Column<Item> for FastFieldReaderCodecWrapper<Item, C> {
         self.get_u64(idx)
     }

+    /// Fills an output buffer with the fast field values
+    /// associated with the `DocId` going from
+    /// `start` to `start + output.len()`.
+    ///
+    /// Regardless of the type of `Item`, this method works
+    /// - transmuting the output array
+    /// - extracting the `Item`s as if they were `u64`
+    /// - possibly converting the `u64` value to the right type.
+    ///
+    /// # Panics
+    ///
+    /// May panic if `start + output.len()` is greater than
+    /// the segment's `maxdoc`.
+    fn get_range(&self, start: u64, output: &mut [Item]) {
+        self.get_range_u64(start, output);
+    }
+
     /// Returns the minimum value for this fast field.
     ///
     /// The max value does not take in account of possible
@@ -64,8 +64,8 @@ impl From<FastFieldCodecType> for FastFieldCodecEnableCheck {

 // use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
 // https://github.com/rust-lang/rust/pull/86176
-fn codec_estimation<C: FastFieldCodec, D: Column>(
-    fastfield_accessor: &D,
+fn codec_estimation<C: FastFieldCodec>(
+    fastfield_accessor: &impl Column,
     estimations: &mut Vec<(f32, FastFieldCodecType)>,
 ) {
     if let Some(ratio) = C::estimate(fastfield_accessor) {
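The linked PR (rust-lang/rust#86176, `explicit_generic_args_with_impl_trait`, stabilized in Rust 1.63) is what makes the new signature ergonomic: with `impl Column` in argument position, call sites may still name the remaining generic explicitly, so the `, _` placeholder in the turbofish disappears. A sketch with stand-in traits (hypothetical names, requires Rust 1.63 or later):

```rust
trait Codec {
    fn name() -> &'static str;
}
trait Column {
    fn num_vals(&self) -> u64;
}

struct Bitpacked;
impl Codec for Bitpacked {
    fn name() -> &'static str {
        "bitpacked"
    }
}

impl Column for Vec<u64> {
    fn num_vals(&self) -> u64 {
        self.len() as u64
    }
}

// `impl Trait` in argument position; only `C` is ever named by callers.
fn estimate<C: Codec>(accessor: &impl Column) -> (&'static str, u64) {
    (C::name(), accessor.num_vals())
}

fn main() {
    let data = vec![1u64, 2, 3];
    // No trailing `, _` turbofish argument needed for the accessor:
    let (codec, n) = estimate::<Bitpacked>(&data);
    assert_eq!((codec, n), ("bitpacked", 3));
}
```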
@@ -202,13 +202,13 @@ impl CompositeFastFieldSerializer {
         let mut estimations = vec![];

         if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) {
-            codec_estimation::<BitpackedCodec, _>(&fastfield_accessor, &mut estimations);
+            codec_estimation::<BitpackedCodec>(&fastfield_accessor, &mut estimations);
         }
         if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) {
-            codec_estimation::<LinearCodec, _>(&fastfield_accessor, &mut estimations);
+            codec_estimation::<LinearCodec>(&fastfield_accessor, &mut estimations);
         }
         if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) {
-            codec_estimation::<BlockwiseLinearCodec, _>(&fastfield_accessor, &mut estimations);
+            codec_estimation::<BlockwiseLinearCodec>(&fastfield_accessor, &mut estimations);
         }
         if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
         {
@@ -471,13 +471,15 @@ mod tests_indexsorting {
     let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
     let multifield = fast_fields.u64s(multi_numbers).unwrap();
-    let vals = multifield.get_vals(0u32).collect::<Vec<_>>();
+    let mut vals = vec![];
+    multifield.get_vals(0u32, &mut vals);
     assert_eq!(vals, &[] as &[u64]);

-    let vals = multifield.get_vals(1u32).collect::<Vec<_>>();
+    let mut vals = vec![];
+    multifield.get_vals(1u32, &mut vals);
     assert_eq!(vals, &[5, 6]);

-    let vals = multifield.get_vals(2u32).collect::<Vec<_>>();
+    let mut vals = vec![];
+    multifield.get_vals(2u32, &mut vals);
     assert_eq!(vals, &[3]);
     Ok(())
 }
@@ -174,9 +174,7 @@ fn index_documents(
     segment_updater: &mut SegmentUpdater,
     mut delete_cursor: DeleteCursor,
 ) -> crate::Result<()> {
-    let schema = segment.schema();
-
-    let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), schema)?;
+    let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone())?;
     for document_group in grouped_document_iterator {
         for doc in document_group {
             segment_writer.add_document(doc)?;
@@ -1535,11 +1533,13 @@ mod tests {
     let ff_reader = segment_reader.fast_fields().u64s(multi_numbers).unwrap();
     let bool_ff_reader = segment_reader.fast_fields().bools(multi_bools).unwrap();
     for doc in segment_reader.doc_ids_alive() {
-        let vals = ff_reader.get_vals(doc).collect::<Vec<_>>();
+        let mut vals = vec![];
+        ff_reader.get_vals(doc, &mut vals);
         assert_eq!(vals.len(), 2);
        assert_eq!(vals[0], vals[1]);

-        let bool_vals = bool_ff_reader.get_vals(doc).collect::<Vec<_>>();
+        let mut bool_vals = vec![];
+        bool_ff_reader.get_vals(doc, &mut bool_vals);
         assert_eq!(bool_vals.len(), 2);
         assert_ne!(bool_vals[0], bool_vals[1]);
@@ -133,7 +133,7 @@ impl TermOrdinalMapping {
     fn max_term_ord(&self) -> TermOrdinal {
         self.per_segment_new_term_ordinals
             .iter()
-            .flat_map(|term_ordinals| term_ordinals.iter().max())
+            .flat_map(|term_ordinals| term_ordinals.iter().max().cloned())
             .max()
             .unwrap_or_default()
     }
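The `.cloned()` becomes necessary once the blanket `Column` impls are gone and the per-segment ordinals are iterated as plain `Vec`s: `iter().max()` yields `Option<&u64>`, while the final `unwrap_or_default()` needs an owned `u64`. In miniature:

```rust
// iter().max() on a Vec<u64> gives Option<&u64>; .cloned() turns it into
// Option<u64> so the outer max()/unwrap_or_default() work on owned values.
fn main() {
    let per_segment: Vec<Vec<u64>> = vec![vec![3, 9], vec![], vec![4]];
    let max: u64 = per_segment
        .iter()
        .flat_map(|ords| ords.iter().max().cloned())
        .max()
        .unwrap_or_default();
    assert_eq!(max, 9);
}
```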
@@ -578,7 +578,6 @@ impl IndexMerger {
     stats: FastFieldStats,
 }
-
 impl<'a> Column for FieldIndexAccessProvider<'a> {
     #[inline]
     fn get_val(&self, doc: u64) -> u64 {
         self.offsets[doc as usize]
     }
@@ -669,13 +668,15 @@ impl IndexMerger {
     {
         let mut serialize_vals =
             fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?;
+        let mut vals = Vec::with_capacity(100);

         for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
             let term_ordinal_mapping: &[TermOrdinal] =
                 term_ordinal_mappings.get_segment(old_doc_addr.segment_ord as usize);

             let ff_reader = &fast_field_reader[old_doc_addr.segment_ord as usize];
-            for prev_term_ord in ff_reader.get_vals(old_doc_addr.doc_id) {
+            ff_reader.get_vals(old_doc_addr.doc_id, &mut vals);
+            for &prev_term_ord in &vals {
                 let new_term_ord = term_ordinal_mapping[prev_term_ord as usize];
                 serialize_vals.add_val(new_term_ord)?;
             }
@@ -728,6 +729,8 @@ impl IndexMerger {
         let mut max_value = u64::MIN;
         let mut num_vals = 0;

+        let mut vals = Vec::with_capacity(100);
+
         let mut ff_readers = Vec::new();

         // Our values are bitpacked and we need to know what should be
@@ -745,11 +748,12 @@ impl IndexMerger {
                 Please report.",
             );
             for doc in reader.doc_ids_alive() {
-                for val in ff_reader.get_vals(doc) {
+                ff_reader.get_vals(doc, &mut vals);
+                for &val in &vals {
                     min_value = cmp::min(val, min_value);
                     max_value = cmp::max(val, max_value);
-                    num_vals += 1;
                 }
+                num_vals += vals.len();
             }
             ff_readers.push(ff_reader);
             // TODO optimize when no deletes
@@ -780,7 +784,7 @@ impl IndexMerger {
         let new_doc_id: DocId =
             self.offsets
                 .iter()
-                .position(|offset| offset > pos)
+                .position(|&offset| offset > pos)
                 .expect("pos is out of bounds") as DocId
                 - 1u32;
@@ -792,10 +796,11 @@ impl IndexMerger {
         let num_vals = self.fast_field_readers[old_doc_addr.segment_ord as usize]
             .get_len(old_doc_addr.doc_id);
         assert!(num_vals >= pos_in_values);
+        let mut vals = Vec::new();
         self.fast_field_readers[old_doc_addr.segment_ord as usize]
-            .get_vals(old_doc_addr.doc_id)
-            .nth(pos_in_values as usize)
-            .expect("computation error in SortedDocIdMultiValueAccessProvider")
+            .get_vals(old_doc_addr.doc_id, &mut vals);
+
+        vals[pos_in_values as usize]
     }

     fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
@@ -805,7 +810,9 @@ impl IndexMerger {
                 .flat_map(|old_doc_addr| {
                     let ff_reader =
                         &self.fast_field_readers[old_doc_addr.segment_ord as usize];
-                    ff_reader.get_vals(old_doc_addr.doc_id)
+                    let mut vals = Vec::new();
+                    ff_reader.get_vals(old_doc_addr.doc_id, &mut vals);
+                    vals.into_iter()
                 }),
         )
     }
@@ -1968,32 +1975,49 @@ mod tests {
     }
     let reader = index.reader()?;
     let searcher = reader.searcher();
+    let mut vals: Vec<u64> = Vec::new();
+
     {
         let segment = searcher.segment_reader(0u32);
         let ff_reader = segment.fast_fields().u64s(int_field).unwrap();

-        assert_eq!(&ff_reader.get_vals(0).collect::<Vec<_>>(), &[1, 2]);
-        assert_eq!(&ff_reader.get_vals(1).collect::<Vec<_>>(), &[1, 2, 3]);
-        assert_eq!(&ff_reader.get_vals(2).collect::<Vec<_>>(), &[4, 5]);
-        assert_eq!(&ff_reader.get_vals(3).collect::<Vec<_>>(), &[1, 2]);
-        assert_eq!(&ff_reader.get_vals(4).collect::<Vec<_>>(), &[1, 5]);
-        assert_eq!(&ff_reader.get_vals(5).collect::<Vec<_>>(), &[3]);
-        assert_eq!(&ff_reader.get_vals(6).collect::<Vec<_>>(), &[17]);
+        ff_reader.get_vals(0, &mut vals);
+        assert_eq!(&vals, &[1, 2]);
+
+        ff_reader.get_vals(1, &mut vals);
+        assert_eq!(&vals, &[1, 2, 3]);
+
+        ff_reader.get_vals(2, &mut vals);
+        assert_eq!(&vals, &[4, 5]);
+
+        ff_reader.get_vals(3, &mut vals);
+        assert_eq!(&vals, &[1, 2]);
+
+        ff_reader.get_vals(4, &mut vals);
+        assert_eq!(&vals, &[1, 5]);
+
+        ff_reader.get_vals(5, &mut vals);
+        assert_eq!(&vals, &[3]);
+
+        ff_reader.get_vals(6, &mut vals);
+        assert_eq!(&vals, &[17]);
     }

     {
         let segment = searcher.segment_reader(1u32);
         let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
-        assert_eq!(&ff_reader.get_vals(0).collect::<Vec<_>>(), &[28, 27]);
+        ff_reader.get_vals(0, &mut vals);
+        assert_eq!(&vals, &[28, 27]);

-        assert_eq!(&ff_reader.get_vals(1).collect::<Vec<_>>(), &[1000]);
+        ff_reader.get_vals(1, &mut vals);
+        assert_eq!(&vals, &[1_000]);
     }

     {
         let segment = searcher.segment_reader(2u32);
         let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
-        assert_eq!(&ff_reader.get_vals(0).collect::<Vec<_>>(), &[20]);
+        ff_reader.get_vals(0, &mut vals);
+        assert_eq!(&vals, &[20]);
     }

     // Merging the segments
@@ -2010,16 +2034,35 @@ mod tests {
         let segment = searcher.segment_reader(0u32);
         let ff_reader = segment.fast_fields().u64s(int_field).unwrap();

-        assert_eq!(&ff_reader.get_vals(0).collect::<Vec<_>>(), &[1, 2]);
-        assert_eq!(&ff_reader.get_vals(1).collect::<Vec<_>>(), &[1, 2, 3]);
-        assert_eq!(&ff_reader.get_vals(2).collect::<Vec<_>>(), &[4, 5]);
-        assert_eq!(&ff_reader.get_vals(3).collect::<Vec<_>>(), &[1, 2]);
-        assert_eq!(&ff_reader.get_vals(4).collect::<Vec<_>>(), &[1, 5]);
-        assert_eq!(&ff_reader.get_vals(5).collect::<Vec<_>>(), &[3]);
-        assert_eq!(&ff_reader.get_vals(6).collect::<Vec<_>>(), &[17]);
-        assert_eq!(&ff_reader.get_vals(7).collect::<Vec<_>>(), &[28, 27]);
-        assert_eq!(&ff_reader.get_vals(8).collect::<Vec<_>>(), &[1_000]);
-        assert_eq!(&ff_reader.get_vals(9).collect::<Vec<_>>(), &[20]);
+        ff_reader.get_vals(0, &mut vals);
+        assert_eq!(&vals, &[1, 2]);
+
+        ff_reader.get_vals(1, &mut vals);
+        assert_eq!(&vals, &[1, 2, 3]);
+
+        ff_reader.get_vals(2, &mut vals);
+        assert_eq!(&vals, &[4, 5]);
+
+        ff_reader.get_vals(3, &mut vals);
+        assert_eq!(&vals, &[1, 2]);
+
+        ff_reader.get_vals(4, &mut vals);
+        assert_eq!(&vals, &[1, 5]);
+
+        ff_reader.get_vals(5, &mut vals);
+        assert_eq!(&vals, &[3]);
+
+        ff_reader.get_vals(6, &mut vals);
+        assert_eq!(&vals, &[17]);
+
+        ff_reader.get_vals(7, &mut vals);
+        assert_eq!(&vals, &[28, 27]);
+
+        ff_reader.get_vals(8, &mut vals);
+        assert_eq!(&vals, &[1_000]);
+
+        ff_reader.get_vals(9, &mut vals);
+        assert_eq!(&vals, &[20]);
     }
     Ok(())
 }
@@ -383,7 +383,9 @@ mod tests {
     assert_eq!(fast_field.get_val(5), 1_000u64);

     let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
-        fast_field.get_vals(doc_id).collect()
+        let mut vals = vec![];
+        fast_field.get_vals(doc_id, &mut vals);
+        vals
     };
     let fast_fields = segment_reader.fast_fields();
     let fast_field = fast_fields.u64s(multi_numbers).unwrap();
@@ -25,39 +25,10 @@ use crate::indexer::{
     DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry,
     SegmentSerializer,
 };
-use crate::schema::Schema;
 use crate::{FutureResult, Opstamp};

 const NUM_MERGE_THREADS: usize = 4;

-/// Save the index meta file.
-/// This operation is atomic :
-/// Either
-/// - it fails, in which case an error is returned,
-///   and the `meta.json` remains untouched,
-/// - it succeeds, and `meta.json` is written
-///   and flushed.
-///
-/// This method is not part of tantivy's public API
-pub fn save_new_metas(
-    schema: Schema,
-    index_settings: IndexSettings,
-    directory: &dyn Directory,
-) -> crate::Result<()> {
-    save_metas(
-        &IndexMeta {
-            index_settings,
-            segments: Vec::new(),
-            schema,
-            opstamp: 0u64,
-            payload: None,
-        },
-        directory,
-    )?;
-    directory.sync_directory()?;
-    Ok(())
-}
-
 /// Save the index meta file.
 /// This operation is atomic:
 /// Either
@@ -67,7 +38,7 @@ pub fn save_new_metas(
 /// and flushed.
 ///
 /// This method is not part of tantivy's public API
-fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
+pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
     info!("save metas");
     let mut buffer = serde_json::to_vec_pretty(metas)?;
     // Just adding a new line at the end of the buffer.
@@ -80,8 +80,8 @@ impl SegmentWriter {
     pub fn for_segment(
         memory_budget_in_bytes: usize,
         segment: Segment,
-        schema: Schema,
     ) -> crate::Result<SegmentWriter> {
+        let schema = segment.schema();
         let tokenizer_manager = segment.index().tokenizers().clone();
         let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
         let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
@@ -301,7 +301,7 @@ pub use self::docset::{DocSet, TERMINATED};
 pub use crate::core::{
     Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader,
     Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta,
-    SegmentReader,
+    SegmentReader, SingleSegmentIndexWriter,
 };
 pub use crate::directory::Directory;
 pub use crate::indexer::demuxer::*;
@@ -227,7 +227,7 @@ pub mod tests {
     {
         let mut segment_writer =
-            SegmentWriter::for_segment(3_000_000, segment.clone(), schema).unwrap();
+            SegmentWriter::for_segment(3_000_000, segment.clone()).unwrap();
         {
             // checking that position works if the field has two values
             let op = AddOperation {
@@ -116,7 +116,7 @@ pub(crate) struct IndexingPosition {
 /// and building a `Segment` in anonymous memory.
 ///
 /// `PostingsWriter` writes in a `MemoryArena`.
-pub(crate) trait PostingsWriter {
+pub(crate) trait PostingsWriter: Send + Sync {
     /// Record that a document contains a term at a given position.
     ///
     /// * doc - the document id
@@ -56,7 +56,7 @@ impl<'a> Iterator for VInt32Reader<'a> {
 /// * the document id
 /// * the term frequency
 /// * the term positions
-pub(crate) trait Recorder: Copy + Default + 'static {
+pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
     /// Returns the current document
     fn current_doc(&self) -> u32;
     /// Starts recording information about a new document
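Three hunks in this set (`TerminatingWrite`, `PostingsWriter`, `Recorder`) add `Send + Sync` supertraits, presumably so the corresponding objects can be shared across indexing threads. A zero-cost way to pin such a guarantee down at compile time (the recorder type below is a hypothetical stand-in):

```rust
// Compile-time assertion: the function has no body and costs nothing;
// calling it simply forces the bounds to be checked.
fn assert_send_sync<T: Send + Sync>() {}

#[derive(Copy, Clone, Default)]
struct MyRecorder; // stand-in for a Recorder implementation

fn main() {
    // Fails to compile if MyRecorder ever gains a non-Sync field (e.g. Rc).
    assert_send_sync::<MyRecorder>();
}
```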
@@ -639,7 +640,10 @@ Survey in 2016, 2017, and 2018."#;
 #[test]
 fn test_collapse_overlapped_ranges() {
     assert_eq!(&collapse_overlapped_ranges(&[0..1, 2..3,]), &[0..1, 2..3]);
-    assert_eq!(collapse_overlapped_ranges(&[0..1, 1..2,]), &[0..1, 1..2]);
+    assert_eq!(
+        collapse_overlapped_ranges(&vec![0..1, 1..2,]),
+        vec![0..1, 1..2]
+    );
     assert_eq!(collapse_overlapped_ranges(&[0..2, 1..2,]), vec![0..2]);
     assert_eq!(collapse_overlapped_ranges(&[0..2, 1..3,]), vec![0..3]);
     assert_eq!(collapse_overlapped_ranges(&[0..3, 1..2,]), vec![0..3]);