Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-10 19:12:54 +00:00)

Compare commits: col-trait-...column-tra (11 commits)
Commits in this comparison:

- 4a072e3c18
- 84e0c75598
- 08c4412d73
- 70e58adff9
- 0d1cd119e9
- d3dd620048
- e89c220b56
- a451f6d60d
- f740ddeee3
- 7a26cc9022
- 54972caa7c
@@ -62,7 +62,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
 pub struct AntiCallToken(());

 /// Trait used to indicate when no more writes need to be done on a writer
-pub trait TerminatingWrite: Write + Send {
+pub trait TerminatingWrite: Write + Send + Sync {
     /// Indicate that the writer will no longer be used. Internally calls terminate_ref.
     fn terminate(mut self) -> io::Result<()>
     where Self: Sized {
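The `+ Sync` bound is the only change in this hunk. A minimal sketch of what it buys (ours, not part of the diff): a writer can now be shared across indexing threads, for example behind an `Arc`:

```rust
use std::sync::Arc;
use std::thread;

// `thread::spawn` needs its closure to be `Send`; capturing an `Arc<W>`
// makes that require `W: Send + Sync` -- exactly the combination that the
// tightened `TerminatingWrite: Write + Send + Sync` bound now guarantees.
fn share_writer<W: Send + Sync + 'static>(writer: Arc<W>) {
    let clone = Arc::clone(&writer);
    thread::spawn(move || drop(clone)).join().unwrap();
}
```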
@@ -289,6 +289,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
 /// Estimation for linear interpolation is hard because you don't know
 /// where the local maxima are for the deviation of the calculated value and
 /// the offset is also unknown.
+#[allow(clippy::question_mark)]
 fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
     if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
         return None;
@@ -90,52 +90,41 @@ pub struct FastFieldStats {
     pub num_vals: u64,
 }

-impl<'a> Column for &'a [u64] {
+struct VecColum<'a>(&'a [u64]);
+
+impl<'a> Column for VecColum<'a> {
     fn get_val(&self, position: u64) -> u64 {
-        self[position as usize]
+        self.0[position as usize]
     }

     fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
-        Box::new((self as &[u64]).iter().cloned())
+        Box::new(self.0.iter().cloned())
     }

     fn min_value(&self) -> u64 {
-        self.iter().min().unwrap_or(0)
+        self.0.iter().min().cloned().unwrap_or(0)
     }

     fn max_value(&self) -> u64 {
-        self.iter().max().unwrap_or(0)
+        self.0.iter().max().cloned().unwrap_or(0)
     }

     fn num_vals(&self) -> u64 {
-        self.len() as u64
+        self.0.len() as u64
     }
 }

-impl Column for Vec<u64> {
-    fn get_val(&self, position: u64) -> u64 {
-        self[position as usize]
-    }
-    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
-        Box::new((self as &[u64]).iter().cloned())
-    }
-    fn min_value(&self) -> u64 {
-        self.iter().min().unwrap_or(0)
-    }
-
-    fn max_value(&self) -> u64 {
-        self.iter().max().unwrap_or(0)
-    }
-
-    fn num_vals(&self) -> u64 {
-        self.len() as u64
-    }
-}
+impl<'a> From<&'a [u64]> for VecColum<'a> {
+    fn from(data: &'a [u64]) -> Self {
+        Self(data)
+    }
+}

 #[cfg(test)]
 mod tests {
     use proptest::arbitrary::any;
-    use proptest::proptest;
+    use proptest::prelude::*;
+    use proptest::strategy::Strategy;
+    use proptest::{prop_oneof, proptest};

     use crate::bitpacked::BitpackedCodec;
     use crate::blockwise_linear::BlockwiseLinearCodec;
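Our reading of this hunk: implementing `Column` directly on `&[u64]` and `Vec<u64>` made those impls part of the crate's public surface, while the private `VecColum` newtype (spelling kept from the diff) plus a `From<&'a [u64]>` conversion confines the slice-backed implementation to test code. The test hunks below show the call sites switching to `VecColum::from(data)`.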
@@ -145,10 +134,10 @@ mod tests {
         data: &[u64],
         name: &str,
     ) -> Option<(f32, f32)> {
-        let estimation = Codec::estimate(&data)?;
+        let estimation = Codec::estimate(&VecColum::from(data))?;

         let mut out: Vec<u8> = Vec::new();
-        Codec::serialize(&mut out, &data).unwrap();
+        Codec::serialize(&mut out, &VecColum::from(data)).unwrap();

         let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
@@ -166,21 +155,32 @@ mod tests {
     }

     proptest! {
+        #![proptest_config(ProptestConfig::with_cases(100))]
         #[test]
-        fn test_proptest_small(data in proptest::collection::vec(any::<u64>(), 1..10)) {
+        fn test_proptest_small(data in proptest::collection::vec(num_strategy(), 1..10)) {
             create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
             create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
             create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
         }
+    }

+    proptest! {
+        #![proptest_config(ProptestConfig::with_cases(10))]
         #[test]
-        fn test_proptest_large(data in proptest::collection::vec(any::<u64>(), 1..6000)) {
+        fn test_proptest_large(data in proptest::collection::vec(num_strategy(), 1..6000)) {
             create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
             create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
             create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
         }
     }

+    fn num_strategy() -> impl Strategy<Value = u64> {
+        prop_oneof![
+            1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10)),
+            1 => prop::num::u64::ANY.prop_map(|num| num % 10),
+            20 => prop::num::u64::ANY,
+        ]
+    }
+
     pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
         let mut data_and_names = vec![];
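A note on the new strategy: `prop_oneof!` weights its arms 1 : 1 : 20, so roughly 1 in 22 generated values lands within 10 of `u64::MAX`, 1 in 22 lands in `0..10`, and the rest are drawn uniformly. This steers the property tests toward the extreme values where interpolation-based codecs are most likely to misbehave. Splitting the tests into two `proptest!` blocks also lets the large test run only 10 cases while the small one keeps 100.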
@@ -234,6 +234,7 @@ mod tests {
     #[test]
     fn estimation_good_interpolation_case() {
         let data = (10..=20000_u64).collect::<Vec<_>>();
+        let data: VecColum = data.as_slice().into();

         let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, 0.01);
@@ -247,8 +248,9 @@ mod tests {
     }
     #[test]
     fn estimation_test_bad_interpolation_case() {
-        let data = vec![200, 10, 10, 10, 10, 1000, 20];
+        let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];

+        let data: VecColum = data.into();
         let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, 0.32);
@@ -259,6 +261,7 @@ mod tests {
     fn estimation_test_bad_interpolation_case_monotonically_increasing() {
         let mut data: Vec<u64> = (200..=20000_u64).collect();
         data.push(1_000_000);
+        let data: VecColum = data.as_slice().into();

         // in this case the linear interpolation can't in fact be worse than bitpacking,
         // but the estimator adds some threshold, which leads to estimated worse behavior
@@ -193,6 +193,7 @@ impl FastFieldCodec for LinearCodec {
 /// Estimation for linear interpolation is hard because you don't know
 /// where the local maxima for the deviation of the calculated value are and
 /// the offset to shift all values to >=0 is also unknown.
+#[allow(clippy::question_mark)]
 fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
     if fastfield_accessor.num_vals() < 3 {
         return None; // disable compressor for this case
@@ -258,6 +259,8 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {

 #[cfg(test)]
 mod tests {
+    use rand::RngCore;
+
     use super::*;
     use crate::tests::get_codec_test_datasets;
@@ -340,10 +343,9 @@ mod tests {

     #[test]
     fn linear_interpol_fast_field_rand() {
-        for _ in 0..5000 {
-            let mut data = (0..10_000)
-                .map(|_| rand::random::<u64>())
-                .collect::<Vec<_>>();
+        let mut rng = rand::thread_rng();
+        for _ in 0..50 {
+            let mut data = (0..10_000).map(|_| rng.next_u64()).collect::<Vec<_>>();
             create_and_validate(&data, "random");
             data.reverse();
             create_and_validate(&data, "random");
@@ -3,9 +3,33 @@ extern crate prettytable;
 use fastfield_codecs::bitpacked::BitpackedCodec;
 use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
 use fastfield_codecs::linear::LinearCodec;
-use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats};
+use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
 use prettytable::{Cell, Row, Table};

+struct Data<'a>(&'a [u64]);
+
+impl<'a> Column for Data<'a> {
+    fn get_val(&self, position: u64) -> u64 {
+        self.0[position as usize]
+    }
+
+    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
+        Box::new(self.0.iter().cloned())
+    }
+
+    fn min_value(&self) -> u64 {
+        *self.0.iter().min().unwrap_or(&0)
+    }
+
+    fn max_value(&self) -> u64 {
+        *self.0.iter().max().unwrap_or(&0)
+    }
+
+    fn num_vals(&self) -> u64 {
+        self.0.len() as u64
+    }
+}
+
 fn main() {
     let mut table = Table::new();
@@ -86,10 +110,11 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
 pub fn serialize_with_codec<C: FastFieldCodec>(
     data: &[u64],
 ) -> Option<(f32, f32, FastFieldCodecType)> {
+    let data = Data(data);
     let estimation = C::estimate(&data)?;
     let mut out = Vec::new();
     C::serialize(&mut out, &data).unwrap();
-    let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
+    let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
     Some((estimation, actual_compression, C::CODEC_TYPE))
 }
@@ -6,7 +6,7 @@ use std::{fmt, io, mem};
 use stable_deref_trait::StableDeref;

 /// An OwnedBytes simply wraps an object that owns a slice of data and exposes
-/// this data as a static slice.
+/// this data as a slice.
 ///
 /// The backing object is required to be `StableDeref`.
 #[derive(Clone)]
@@ -7,6 +7,7 @@ use std::sync::Arc;

 use super::segment::Segment;
 use super::IndexSettings;
+use crate::core::single_segment_index_writer::SingleSegmentIndexWriter;
 use crate::core::{
     Executor, IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory, META_FILEPATH,
 };
@@ -16,7 +17,7 @@ use crate::directory::MmapDirectory;
 use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
 use crate::error::{DataCorruption, TantivyError};
 use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
-use crate::indexer::segment_updater::save_new_metas;
+use crate::indexer::segment_updater::save_metas;
 use crate::reader::{IndexReader, IndexReaderBuilder};
 use crate::schema::{Field, FieldType, Schema};
 use crate::tokenizer::{TextAnalyzer, TokenizerManager};
@@ -47,6 +48,34 @@ fn load_metas(
     .map_err(From::from)
 }

+/// Save the index meta file.
+/// This operation is atomic:
+/// Either
+/// - it fails, in which case an error is returned,
+///   and the `meta.json` remains untouched,
+/// - it succeeds, and `meta.json` is written
+///   and flushed.
+///
+/// This method is not part of tantivy's public API
+fn save_new_metas(
+    schema: Schema,
+    index_settings: IndexSettings,
+    directory: &dyn Directory,
+) -> crate::Result<()> {
+    save_metas(
+        &IndexMeta {
+            index_settings,
+            segments: Vec::new(),
+            schema,
+            opstamp: 0u64,
+            payload: None,
+        },
+        directory,
+    )?;
+    directory.sync_directory()?;
+    Ok(())
+}
+
 /// IndexBuilder can be used to create an index.
 ///
 /// Use in conjunction with `SchemaBuilder`. Global index settings
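This `save_new_metas` is not new logic: a later hunk in this diff deletes the identical `pub fn save_new_metas` from `segment_updater.rs`, so the function simply moves into `index.rs` and drops its `pub` visibility.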
@@ -135,6 +164,25 @@ impl IndexBuilder {
         self.create(mmap_directory)
     }

+    /// Dragons ahead!!!
+    ///
+    /// The point of this API is to let users create a simple index with a single segment
+    /// and without starting any thread.
+    ///
+    /// Do not use this method if you are not sure what you are doing.
+    ///
+    /// It expects an originally empty directory, and will not run any GC operation.
+    #[doc(hidden)]
+    pub fn single_segment_index_writer(
+        self,
+        dir: impl Into<Box<dyn Directory>>,
+        mem_budget: usize,
+    ) -> crate::Result<SingleSegmentIndexWriter> {
+        let index = self.create(dir)?;
+        let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
+        Ok(index_simple_writer)
+    }
+
     /// Creates a new index in a temp directory.
     ///
     /// The index will use the `MMapDirectory` in a newly created directory.
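A minimal usage sketch of the API above, assembled from the test added later in this diff (field name and memory budget are taken from that test):

```rust
use tantivy::directory::RamDirectory;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn build_single_segment_index() -> tantivy::Result<Index> {
    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("text", TEXT);
    let schema = schema_builder.build();
    // No indexing threads are spawned; all documents go into one segment.
    let mut writer = Index::builder()
        .schema(schema)
        .single_segment_index_writer(RamDirectory::default(), 10_000_000)?;
    writer.add_document(doc!(text_field => "hello"))?;
    writer.finalize() // writes meta.json, syncs the directory, returns the Index
}
```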
@@ -580,10 +628,12 @@ impl fmt::Debug for Index {

 #[cfg(test)]
 mod tests {
+    use crate::collector::Count;
     use crate::directory::{RamDirectory, WatchCallback};
-    use crate::schema::{Field, Schema, INDEXED, TEXT};
+    use crate::query::TermQuery;
+    use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, TEXT};
     use crate::tokenizer::TokenizerManager;
-    use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};
+    use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, Term};

     #[test]
     fn test_indexer_for_field() {
@@ -849,4 +899,28 @@ mod tests {
         );
         Ok(())
     }
+
+    #[test]
+    fn test_single_segment_index_writer() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let directory = RamDirectory::default();
+        let mut single_segment_index_writer = Index::builder()
+            .schema(schema)
+            .single_segment_index_writer(directory, 10_000_000)?;
+        for _ in 0..10 {
+            let doc = doc!(text_field=>"hello");
+            single_segment_index_writer.add_document(doc)?;
+        }
+        let index = single_segment_index_writer.finalize()?;
+        let searcher = index.reader()?.searcher();
+        let term_query = TermQuery::new(
+            Term::from_field_text(text_field, "hello"),
+            IndexRecordOption::Basic,
+        );
+        let count = searcher.search(&term_query, &Count)?;
+        assert_eq!(count, 10);
+        Ok(())
+    }
 }
@@ -7,6 +7,7 @@ mod segment;
 mod segment_component;
 mod segment_id;
 mod segment_reader;
+mod single_segment_index_writer;

 use std::path::Path;
@@ -23,6 +24,7 @@ pub use self::segment::Segment;
 pub use self::segment_component::SegmentComponent;
 pub use self::segment_id::SegmentId;
 pub use self::segment_reader::SegmentReader;
+pub use self::single_segment_index_writer::SingleSegmentIndexWriter;

 /// The meta file contains all the information about the list of segments and the schema
 /// of the index.
src/core/single_segment_index_writer.rs (new file, 47 lines)

@@ -0,0 +1,47 @@
+use crate::indexer::operation::AddOperation;
+use crate::indexer::segment_updater::save_metas;
+use crate::indexer::SegmentWriter;
+use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
+
+#[doc(hidden)]
+pub struct SingleSegmentIndexWriter {
+    segment_writer: SegmentWriter,
+    segment: Segment,
+    opstamp: Opstamp,
+}
+
+impl SingleSegmentIndexWriter {
+    pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
+        let segment = index.new_segment();
+        let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
+        Ok(Self {
+            segment_writer,
+            segment,
+            opstamp: 0,
+        })
+    }
+
+    pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
+        let opstamp = self.opstamp;
+        self.opstamp += 1;
+        self.segment_writer
+            .add_document(AddOperation { opstamp, document })
+    }
+
+    pub fn finalize(self) -> crate::Result<Index> {
+        let max_doc = self.segment_writer.max_doc();
+        self.segment_writer.finalize()?;
+        let segment: Segment = self.segment.with_max_doc(max_doc);
+        let index = segment.index();
+        let index_meta = IndexMeta {
+            index_settings: index.settings().clone(),
+            segments: vec![segment.meta().clone()],
+            schema: index.schema(),
+            opstamp: 0,
+            payload: None,
+        };
+        save_metas(&index_meta, index.directory())?;
+        index.directory().sync_directory()?;
+        Ok(segment.index().clone())
+    }
+}
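Worth noting in `finalize`: it follows the same atomic-save contract quoted in the `index.rs` hunks above. It flushes the segment writer, stamps the segment with its `max_doc`, writes a fresh `IndexMeta` through the now `pub(crate)` `save_metas`, and calls `sync_directory` before returning the `Index`.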
@@ -978,37 +978,20 @@ mod tests {

 #[cfg(all(test, feature = "unstable"))]
 mod bench {
-    use std::collections::HashMap;
-    use std::path::Path;
-
+    use fastfield_codecs::Column;
     use test::{self, Bencher};

-    use super::tests::{generate_permutation, FIELD, SCHEMA};
+    use super::tests::generate_permutation;
     use super::*;
-    use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
+    use crate::fastfield::tests::generate_permutation_gcd;

     #[bench]
-    fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
+    fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
         let permutation = generate_permutation();
+        let n = permutation.len();
         b.iter(|| {
-            let n = test::black_box(7000u32);
             let mut a = 0u64;
-            for i in (0u32..n / 7).map(|v| v * 7) {
-                a ^= permutation[i as usize];
-            }
-            a
-        });
-    }
-
-    #[bench]
-    fn bench_intfastfield_veclookup(b: &mut Bencher) {
-        let permutation = generate_permutation();
-        b.iter(|| {
-            let n = test::black_box(1000u32);
-            let mut a = 0u64;
-            for _ in 0u32..n {
+            for _ in 0..n {
                 a = permutation[a as usize];
             }
             a
@@ -1016,102 +999,83 @@
     }

     #[bench]
-    fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
-        let path = Path::new("test");
+    fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
         let permutation = generate_permutation();
-        let directory: RamDirectory = RamDirectory::create();
-        {
-            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
-            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
+        let n = permutation.len();
+        let column = DynamicFastFieldReader::from(permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for _ in 0..n {
+                a = column.get_val(a as u64);
             }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new(), None)
-                .unwrap();
-            serializer.close().unwrap();
-        }
-        let file = directory.open_read(&path).unwrap();
-        {
-            let fast_fields_composite = CompositeFile::open(&file).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
-
-            b.iter(|| {
-                let n = test::black_box(7000u32);
-                let mut a = 0u64;
-                for i in (0u32..n / 7).map(|val| val * 7) {
-                    a ^= fast_field_reader.get_val(i as u64);
-                }
-                a
-            });
-        }
+            a
+        });
     }

     #[bench]
-    fn bench_intfastfield_fflookup(b: &mut Bencher) {
-        let path = Path::new("test");
+    fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
         let permutation = generate_permutation();
-        let directory: RamDirectory = RamDirectory::create();
-        {
-            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
-            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
+        let n = permutation.len();
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in (0..n / 7).map(|val| val * 7) {
+                a += permutation[i as usize];
             }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new(), None)
-                .unwrap();
-            serializer.close().unwrap();
-        }
-        let file = directory.open_read(&path).unwrap();
-        {
-            let fast_fields_composite = CompositeFile::open(&file).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
-
-            b.iter(|| {
-                let mut a = 0u32;
-                for i in 0u64..permutation.len() as u64 {
-                    a = fast_field_reader.get_val(i) as u32;
-                }
-                a
-            });
-        }
+            a
+        });
     }

     #[bench]
-    fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
-        let path = Path::new("test");
+    fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        let column = DynamicFastFieldReader::from(permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in (0..n / 7).map(|val| val * 7) {
+                a += column.get_val(i as u64);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        let column = DynamicFastFieldReader::from(permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in 0u64..n as u64 {
+                a += column.get_val(i);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
         let permutation = generate_permutation_gcd();
-        let directory: RamDirectory = RamDirectory::create();
-        {
-            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
-            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
+        let n = permutation.len();
+        let column = DynamicFastFieldReader::from(permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in 0..n as u64 {
+                a += column.get_val(i);
             }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new(), None)
-                .unwrap();
-            serializer.close().unwrap();
-        }
-        let file = directory.open_read(&path).unwrap();
-        {
-            let fast_fields_composite = CompositeFile::open(&file).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
-
-            b.iter(|| {
-                let mut a = 0u32;
-                for i in 0u32..permutation.len() as u32 {
-                    a = fast_field_reader.get_val(i as u64) as u32;
-                }
-                a
-            });
-        }
+            a
+        });
     }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in 0..permutation.len() {
+                a += permutation[i as usize] as u64;
+            }
+            a
+        });
+    }
 }
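Net effect of the two bench hunks: instead of serializing through `CompositeFastFieldSerializer` and reopening via `CompositeFile`, the benchmarks now build a reader directly with `DynamicFastFieldReader::from(permutation)`. The new names describe the access pattern: `jumpy` chases data-dependent indices, `stride7` reads every 7th value, and `scan_all` reads sequentially, each with a plain `Vec` baseline where applicable.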
@@ -64,8 +64,8 @@ impl From<FastFieldCodecType> for FastFieldCodecEnableCheck {

 // use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
 // https://github.com/rust-lang/rust/pull/86176
-fn codec_estimation<C: FastFieldCodec, D: Column>(
-    fastfield_accessor: &D,
+fn codec_estimation<C: FastFieldCodec>(
+    fastfield_accessor: &impl Column,
     estimations: &mut Vec<(f32, FastFieldCodecType)>,
 ) {
     if let Some(ratio) = C::estimate(fastfield_accessor) {

@@ -202,13 +202,13 @@ impl CompositeFastFieldSerializer {
     let mut estimations = vec![];

     if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) {
-        codec_estimation::<BitpackedCodec, _>(&fastfield_accessor, &mut estimations);
+        codec_estimation::<BitpackedCodec>(&fastfield_accessor, &mut estimations);
     }
     if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) {
-        codec_estimation::<LinearCodec, _>(&fastfield_accessor, &mut estimations);
+        codec_estimation::<LinearCodec>(&fastfield_accessor, &mut estimations);
     }
     if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) {
-        codec_estimation::<BlockwiseLinearCodec, _>(&fastfield_accessor, &mut estimations);
+        codec_estimation::<BlockwiseLinearCodec>(&fastfield_accessor, &mut estimations);
     }
     if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
     {
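The comment above points at rust-lang/rust#86176 (`explicit_generic_args_with_impl_trait`). With explicit generic arguments now accepted alongside `impl Trait` parameters, the helper can drop its second type parameter, and every call site shrinks from `codec_estimation::<BitpackedCodec, _>(...)` to `codec_estimation::<BitpackedCodec>(...)`, as the second hunk shows.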
@@ -174,9 +174,7 @@ fn index_documents(
     segment_updater: &mut SegmentUpdater,
     mut delete_cursor: DeleteCursor,
 ) -> crate::Result<()> {
-    let schema = segment.schema();
-
-    let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), schema)?;
+    let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone())?;
     for document_group in grouped_document_iterator {
         for doc in document_group {
             segment_writer.add_document(doc)?;
@@ -133,7 +133,7 @@ impl TermOrdinalMapping {
     fn max_term_ord(&self) -> TermOrdinal {
         self.per_segment_new_term_ordinals
             .iter()
-            .flat_map(|term_ordinals| term_ordinals.iter().max())
+            .flat_map(|term_ordinals| term_ordinals.iter().max().cloned())
             .max()
             .unwrap_or_default()
     }
@@ -784,7 +784,7 @@ impl IndexMerger {
     let new_doc_id: DocId =
         self.offsets
             .iter()
-            .position(|offset| offset > pos)
+            .position(|&offset| offset > pos)
             .expect("pos is out of bounds") as DocId
             - 1u32;
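Both one-liners above look like fallout from removing the `Column` impls on `Vec<u64>`: previously `.iter()` on these vectors could resolve to `Column::iter`, which yields `u64` by value; it now resolves to the plain slice iterator yielding `&u64`, hence the added `.cloned()` and the `|&offset|` destructuring.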
@@ -25,39 +25,10 @@ use crate::indexer::{
     DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry,
     SegmentSerializer,
 };
-use crate::schema::Schema;
 use crate::{FutureResult, Opstamp};

 const NUM_MERGE_THREADS: usize = 4;

-/// Save the index meta file.
-/// This operation is atomic :
-/// Either
-/// - it fails, in which case an error is returned,
-/// and the `meta.json` remains untouched,
-/// - it succeeds, and `meta.json` is written
-/// and flushed.
-///
-/// This method is not part of tantivy's public API
-pub fn save_new_metas(
-    schema: Schema,
-    index_settings: IndexSettings,
-    directory: &dyn Directory,
-) -> crate::Result<()> {
-    save_metas(
-        &IndexMeta {
-            index_settings,
-            segments: Vec::new(),
-            schema,
-            opstamp: 0u64,
-            payload: None,
-        },
-        directory,
-    )?;
-    directory.sync_directory()?;
-    Ok(())
-}
-
 /// Save the index meta file.
 /// This operation is atomic:
 /// Either
@@ -67,7 +38,7 @@ pub fn save_new_metas(
 /// and flushed.
 ///
 /// This method is not part of tantivy's public API
-fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
+pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
     info!("save metas");
     let mut buffer = serde_json::to_vec_pretty(metas)?;
     // Just adding a new line at the end of the buffer.
@@ -80,8 +80,8 @@ impl SegmentWriter {
     pub fn for_segment(
         memory_budget_in_bytes: usize,
         segment: Segment,
-        schema: Schema,
     ) -> crate::Result<SegmentWriter> {
+        let schema = segment.schema();
         let tokenizer_manager = segment.index().tokenizers().clone();
         let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
         let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
@@ -301,7 +301,7 @@ pub use self::docset::{DocSet, TERMINATED};
 pub use crate::core::{
     Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader,
     Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta,
-    SegmentReader,
+    SegmentReader, SingleSegmentIndexWriter,
 };
 pub use crate::directory::Directory;
 pub use crate::indexer::demuxer::*;
@@ -227,7 +227,7 @@ pub mod tests {

     {
         let mut segment_writer =
-            SegmentWriter::for_segment(3_000_000, segment.clone(), schema).unwrap();
+            SegmentWriter::for_segment(3_000_000, segment.clone()).unwrap();
         {
             // checking that position works if the field has two values
             let op = AddOperation {
@@ -116,7 +116,7 @@ pub(crate) struct IndexingPosition {
 /// and building a `Segment` in anonymous memory.
 ///
 /// `PostingsWriter` writes in a `MemoryArena`.
-pub(crate) trait PostingsWriter {
+pub(crate) trait PostingsWriter: Send + Sync {
     /// Record that a document contains a term at a given position.
     ///
     /// * doc - the document id
@@ -56,7 +56,7 @@ impl<'a> Iterator for VInt32Reader<'a> {
 /// * the document id
 /// * the term frequency
 /// * the term positions
-pub(crate) trait Recorder: Copy + Default + 'static {
+pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
     /// Returns the current document
     fn current_doc(&self) -> u32;
     /// Starts recording information about a new document
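Taken together with the `TerminatingWrite: Write + Send + Sync` change at the top of this diff, the `Send + Sync` bounds added to `PostingsWriter` and `Recorder` make the segment-writing pipeline's components shareable across threads, which is presumably what keeps `SegmentWriter` usable both from the threaded `IndexWriter` workers and from the new `SingleSegmentIndexWriter`.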