Compare commits

3 Commits

Author | SHA1 | Message | Date
Paul Masurel | 1553901f51 | Introducing a column trait | 2022-08-27 22:48:05 +02:00
Paul Masurel | e8a6e123ae | Small refactoring estimate. | 2022-08-27 21:53:46 +02:00
Paul Masurel | 43a4c8287c | Removing Deserializer trait (and renaming the `Serializer` trait `FastFieldCodec`) | 2022-08-27 21:11:54 +02:00
21 changed files with 206 additions and 304 deletions

View File

@@ -62,7 +62,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
pub struct AntiCallToken(());
/// Trait used to indicate when no more writes need to be done on a writer
pub trait TerminatingWrite: Write + Send + Sync {
pub trait TerminatingWrite: Write + Send {
/// Indicates that the writer will no longer be used. Internally calls `terminate_ref`.
fn terminate(mut self) -> io::Result<()>
where Self: Sized {
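For context, `terminate` consumes the writer once all writes are done, and implementors supply only the internal `terminate_ref`. A minimal sketch of a conforming implementation follows; the `terminate_ref(&mut self, AntiCallToken) -> io::Result<()>` signature and the `CountingSink` type are assumptions inferred from the doc comments above, not part of this diff.

use std::io::{self, Write};

struct CountingSink(usize);

impl Write for CountingSink {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.0 += buf.len(); // discard the bytes, only count them
        Ok(buf.len())
    }
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}

// Hypothetical impl: flush on termination so no buffered state is lost.
// AntiCallToken cannot be constructed outside its module, which is what
// forces callers to go through terminate() rather than terminate_ref().
impl TerminatingWrite for CountingSink {
    fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
        self.flush()
    }
}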

View File

@@ -289,7 +289,6 @@ impl FastFieldCodec for BlockwiseLinearCodec {
/// estimation for linear interpolation is hard because you don't know
/// where the local maxima of the deviation from the calculated value are,
/// and the offset is also unknown.
#[allow(clippy::question_mark)]
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
return None;

View File

@@ -16,6 +16,7 @@ mod column;
pub use self::column::Column;
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
#[repr(u8)]
pub enum FastFieldCodecType {
@@ -90,41 +91,52 @@ pub struct FastFieldStats {
pub num_vals: u64,
}
struct VecColum<'a>(&'a [u64]);
impl<'a> Column for VecColum<'a> {
impl<'a> Column for &'a [u64] {
fn get_val(&self, position: u64) -> u64 {
self.0[position as usize]
self[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new(self.0.iter().cloned())
Box::new((self as &[u64]).iter().cloned())
}
fn min_value(&self) -> u64 {
self.0.iter().min().cloned().unwrap_or(0)
self.iter().min().unwrap_or(0)
}
fn max_value(&self) -> u64 {
self.0.iter().max().cloned().unwrap_or(0)
self.iter().max().unwrap_or(0)
}
fn num_vals(&self) -> u64 {
self.0.len() as u64
self.len() as u64
}
}
impl<'a> From<&'a [u64]> for VecColum<'a> {
fn from(data: &'a [u64]) -> Self {
Self(data)
impl Column for Vec<u64> {
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new((self as &[u64]).iter().cloned())
}
fn min_value(&self) -> u64 {
self.iter().min().unwrap_or(0)
}
fn max_value(&self) -> u64 {
self.iter().max().unwrap_or(0)
}
fn num_vals(&self) -> u64 {
self.len() as u64
}
}
#[cfg(test)]
mod tests {
use proptest::prelude::*;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use proptest::arbitrary::any;
use proptest::proptest;
use crate::bitpacked::BitpackedCodec;
use crate::blockwise_linear::BlockwiseLinearCodec;
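The `Column` trait definition itself lives in the new `column` module and is not shown in this compare; the sketch below is inferred from the two blanket impls above (the real definition may differ in bounds or default methods).

pub trait Column {
    /// Returns the value stored at `position`.
    fn get_val(&self, position: u64) -> u64;
    /// Iterates over all values in position order.
    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b>;
    fn min_value(&self) -> u64;
    fn max_value(&self) -> u64;
    fn num_vals(&self) -> u64;
}

Because `&[u64]` and `Vec<u64>` now implement `Column` directly, test code can hand plain slices to `estimate` and `serialize`, which is what makes the `VecColum` wrapper above redundant.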
@@ -134,10 +146,10 @@ mod tests {
data: &[u64],
name: &str,
) -> Option<(f32, f32)> {
let estimation = Codec::estimate(&VecColum::from(data))?;
let estimation = Codec::estimate(&data)?;
let mut out: Vec<u8> = Vec::new();
Codec::serialize(&mut out, &VecColum::from(data)).unwrap();
Codec::serialize(&mut out, &data).unwrap();
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
@@ -155,31 +167,20 @@ mod tests {
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(100))]
#[test]
fn test_proptest_small(data in proptest::collection::vec(num_strategy(), 1..10)) {
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_proptest_large(data in proptest::collection::vec(num_strategy(), 1..6000)) {
fn test_proptest_small(data in proptest::collection::vec(any::<u64>(), 1..10)) {
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
#[test]
fn test_proptest_large(data in proptest::collection::vec(any::<u64>(), 1..6000)) {
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
}
fn num_strategy() -> impl Strategy<Value = u64> {
prop_oneof![
1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10) ),
1 => prop::num::u64::ANY.prop_map(|num| num % 10 ),
20 => prop::num::u64::ANY,
]
}
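For readers unfamiliar with proptest's `prop_oneof!`: the removed `num_strategy` weights (1/1/20) meant about one value in eleven was forced to within 10 of either end of the `u64` range, deliberately stressing overflow paths, while `any::<u64>()` draws uniformly. A hedged standalone equivalent, with the hypothetical name `biased_u64`:

use proptest::prelude::*;

// Sketch of a biased u64 strategy in the style of the removed num_strategy.
fn biased_u64() -> impl Strategy<Value = u64> {
    prop_oneof![
        1 => any::<u64>().prop_map(|n| u64::MAX - (n % 10)), // near the top
        1 => any::<u64>().prop_map(|n| n % 10),              // near zero
        20 => any::<u64>(),                                  // uniform bulk
    ]
}

proptest! {
    #[test]
    fn roundtrip_u64(v in biased_u64()) {
        prop_assert_eq!(v, u64::from_le_bytes(v.to_le_bytes()));
    }
}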
pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
@@ -195,11 +196,6 @@ mod tests {
data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
data_and_names.push((vec![10], "single value"));
data_and_names.push((
vec![1572656989877777, 1170935903116329, 720575940379279, 0],
"overflow error",
));
data_and_names
}
@@ -234,7 +230,6 @@ mod tests {
#[test]
fn estimation_good_interpolation_case() {
let data = (10..=20000_u64).collect::<Vec<_>>();
let data: VecColum = data.as_slice().into();
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, 0.01);
@@ -248,9 +243,8 @@ mod tests {
}
#[test]
fn estimation_test_bad_interpolation_case() {
let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];
let data = vec![200, 10, 10, 10, 10, 1000, 20];
let data: VecColum = data.into();
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, 0.32);
@@ -261,7 +255,6 @@ mod tests {
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
let mut data: Vec<u64> = (200..=20000_u64).collect();
data.push(1_000_000);
let data: VecColum = data.as_slice().into();
// in this case the linear interpolation can't, in fact, be worse than bitpacking,
// but the estimator adds some threshold, which leads to a worse estimated behavior

View File

@@ -115,9 +115,9 @@ fn diff(val1: u64, val2: u64) -> f64 {
#[inline]
pub fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
if slope < 0.0 {
first_val.saturating_sub((pos as f32 * -slope) as u64)
first_val - (pos as f32 * -slope) as u64
} else {
first_val.saturating_add((pos as f32 * slope) as u64)
first_val + (pos as f32 * slope) as u64
}
}
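The behavioral difference between the two variants surfaces only on extreme inputs: the saturating calls clamp at the `u64` bounds, while plain `+`/`-` panics on overflow in debug builds and wraps in release. A standalone illustration:

fn main() {
    let first_val: u64 = 10;
    let delta: u64 = 20; // e.g. (pos as f32 * -slope) as u64
    assert_eq!(first_val.saturating_sub(delta), 0); // clamps instead of underflowing
    assert_eq!(first_val.checked_sub(delta), None); // the same overflow, made explicit
    // `first_val - delta` would panic here in a debug build:
    // "attempt to subtract with overflow"
}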
@@ -193,7 +193,6 @@ impl FastFieldCodec for LinearCodec {
/// estimation for linear interpolation is hard because you don't know
/// where the local maxima of the deviation from the calculated value are, and
/// the offset to shift all values to >=0 is also unknown.
#[allow(clippy::question_mark)]
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
if fastfield_accessor.num_vals() < 3 {
return None; // disable compressor for this case
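To make the difficulty described in the doc comment concrete, here is a simplified sketch of the estimation idea under stated assumptions (fit a line through the first and last values, size the bit width by the worst sampled deviation); `estimate_linear` is a hypothetical standalone function, not the codec's actual code:

/// Rough compression-ratio estimate for a linear model.
fn estimate_linear(data: &[u64]) -> Option<f32> {
    if data.len() < 3 {
        return None; // disable the codec, as in the hunk above
    }
    let first = data[0] as f64;
    let last = *data.last()? as f64;
    let slope = (last - first) / (data.len() as f64 - 1.0);
    // Worst absolute deviation from the interpolated line over a sample;
    // sampling is why local maxima can be missed, as the doc comment warns.
    let max_deviation = data
        .iter()
        .enumerate()
        .step_by((data.len() / 100).max(1)) // roughly 100 sampled positions
        .map(|(pos, &val)| ((first + slope * pos as f64) - val as f64).abs())
        .fold(0.0_f64, f64::max);
    // one sign bit plus enough bits to encode the worst deviation
    let bits_per_val = max_deviation.max(1.0).log2().ceil() + 1.0;
    Some((bits_per_val / 64.0) as f32)
}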
@@ -259,8 +258,6 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
#[cfg(test)]
mod tests {
use rand::RngCore;
use super::*;
use crate::tests::get_codec_test_datasets;
@@ -317,13 +314,6 @@ mod tests {
create_and_validate(&data, "large amplitude");
}
#[test]
fn overflow_error_test() {
let data = vec![1572656989877777, 1170935903116329, 720575940379279, 0];
create_and_validate(&data, "overflow test");
}
#[test]
fn linear_interpol_fast_concave_data() {
let data = vec![0, 1, 2, 5, 8, 10, 20, 50];
@@ -343,9 +333,10 @@ mod tests {
#[test]
fn linear_interpol_fast_field_rand() {
let mut rng = rand::thread_rng();
for _ in 0..50 {
let mut data = (0..10_000).map(|_| rng.next_u64()).collect::<Vec<_>>();
for _ in 0..5000 {
let mut data = (0..10_000)
.map(|_| rand::random::<u64>())
.collect::<Vec<_>>();
create_and_validate(&data, "random");
data.reverse();
create_and_validate(&data, "random");

View File

@@ -3,33 +3,9 @@ extern crate prettytable;
use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats};
use prettytable::{Cell, Row, Table};
struct Data<'a>(&'a [u64]);
impl<'a> Column for Data<'a> {
fn get_val(&self, position: u64) -> u64 {
self.0[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new(self.0.iter().cloned())
}
fn min_value(&self) -> u64 {
*self.0.iter().min().unwrap_or(&0)
}
fn max_value(&self) -> u64 {
*self.0.iter().max().unwrap_or(&0)
}
fn num_vals(&self) -> u64 {
self.0.len() as u64
}
}
fn main() {
let mut table = Table::new();
@@ -110,11 +86,10 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
pub fn serialize_with_codec<C: FastFieldCodec>(
data: &[u64],
) -> Option<(f32, f32, FastFieldCodecType)> {
let data = Data(data);
let estimation = C::estimate(&data)?;
let mut out = Vec::new();
C::serialize(&mut out, &data).unwrap();
let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
Some((estimation, actual_compression, C::CODEC_TYPE))
}
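Since `&[u64]` now implements `Column`, the helper can be fed any slice; a hypothetical call site (`print_estimate` is illustrative, the imports are those at the top of this file):

fn print_estimate(data: &[u64]) {
    if let Some((estimate, actual, codec_type)) = serialize_with_codec::<LinearCodec>(data) {
        println!("{codec_type:?}: estimated ratio {estimate:.3}, actual ratio {actual:.3}");
    } else {
        println!("LinearCodec declined to compress this data");
    }
}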

View File

@@ -6,7 +6,7 @@ use std::{fmt, io, mem};
use stable_deref_trait::StableDeref;
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
/// this data as a slice.
/// this data as a static slice.
///
/// The backing object is required to be `StableDeref`.
#[derive(Clone)]
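`StableDeref` is what makes the static exposure sound: the backing object's heap data does not move when the owner itself is moved, so a raw slice into it stays valid as long as the owner is kept alive. Below is a minimal sketch of the idea under that assumption; `BytesSketch` is hypothetical and is not the actual OwnedBytes implementation.

use std::ops::Deref;
use stable_deref_trait::StableDeref;

struct BytesSketch {
    data: &'static [u8], // really borrows from `_owner`, never truly 'static
    _owner: Box<dyn Deref<Target = [u8]> + Send + Sync>,
}

impl BytesSketch {
    fn new<T>(owner: T) -> BytesSketch
    where
        T: StableDeref + Deref<Target = [u8]> + Send + Sync + 'static,
    {
        // Sound only because T: StableDeref guarantees that moving `owner`
        // into the box does not move the bytes it dereferences to.
        let data = unsafe { std::mem::transmute::<&[u8], &'static [u8]>(&owner) };
        BytesSketch { data, _owner: Box::new(owner) }
    }

    fn as_slice(&self) -> &[u8] {
        self.data
    }
}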

View File

@@ -409,7 +409,7 @@ impl TopDocs {
/// # use tantivy::query::QueryParser;
/// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs;
/// use tantivy::fastfield::Column;
/// use tantivy::fastfield::FastFieldReader;
/// use tantivy::schema::Field;
///
/// fn create_schema() -> Schema {
@@ -458,7 +458,7 @@ impl TopDocs {
///
/// // We can now define our actual scoring function
/// move |doc: DocId, original_score: Score| {
/// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// let popularity: u64 = popularity_reader.get(doc);
/// // Well.. For the sake of the example we use a simple logarithm
/// // function.
/// let popularity_boost_score = ((2u64 + popularity) as Score).log2();
@@ -517,7 +517,7 @@ impl TopDocs {
/// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs;
/// use tantivy::schema::Field;
/// use fastfield_codecs::Column;
/// use tantivy::fastfield::FastFieldReader;
///
/// # fn create_schema() -> Schema {
/// # let mut schema_builder = Schema::builder();
@@ -569,8 +569,8 @@ impl TopDocs {
///
/// // We can now define our actual scoring function
/// move |doc: DocId| {
/// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// let boosted: u64 = boosted_reader.get_val(doc as u64);
/// let popularity: u64 = popularity_reader.get(doc);
/// let boosted: u64 = boosted_reader.get(doc);
/// Scores do not have to be `f64` in tantivy.
/// // Here we return a couple to get lexicographical order
/// // for free.

View File

@@ -7,7 +7,6 @@ use std::sync::Arc;
use super::segment::Segment;
use super::IndexSettings;
use crate::core::single_segment_index_writer::SingleSegmentIndexWriter;
use crate::core::{
Executor, IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory, META_FILEPATH,
};
@@ -17,7 +16,7 @@ use crate::directory::MmapDirectory;
use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::indexer::segment_updater::save_new_metas;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
@@ -48,34 +47,6 @@ fn load_metas(
.map_err(From::from)
}
/// Save the index meta file.
/// This operation is atomic:
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
fn save_new_metas(
schema: Schema,
index_settings: IndexSettings,
directory: &dyn Directory,
) -> crate::Result<()> {
save_metas(
&IndexMeta {
index_settings,
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
},
directory,
)?;
directory.sync_directory()?;
Ok(())
}
/// IndexBuilder can be used to create an index.
///
/// Use in conjunction with `SchemaBuilder`. Global index settings
@@ -164,25 +135,6 @@ impl IndexBuilder {
self.create(mmap_directory)
}
/// Dragons ahead!!!
///
/// The point of this API is to let users create a simple index with a single segment
/// and without starting any thread.
///
/// Do not use this method if you are not sure what you are doing.
///
/// It expects an originally empty directory, and will not run any GC operation.
#[doc(hidden)]
pub fn single_segment_index_writer(
self,
dir: impl Into<Box<dyn Directory>>,
mem_budget: usize,
) -> crate::Result<SingleSegmentIndexWriter> {
let index = self.create(dir)?;
let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
Ok(index_simple_writer)
}
/// Creates a new index in a temp directory.
///
/// The index will use the `MMapDirectory` in a newly created directory.
@@ -628,12 +580,10 @@ impl fmt::Debug for Index {
#[cfg(test)]
mod tests {
use crate::collector::Count;
use crate::directory::{RamDirectory, WatchCallback};
use crate::query::TermQuery;
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, TEXT};
use crate::schema::{Field, Schema, INDEXED, TEXT};
use crate::tokenizer::TokenizerManager;
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, Term};
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};
#[test]
fn test_indexer_for_field() {
@@ -899,28 +849,4 @@ mod tests {
);
Ok(())
}
#[test]
fn test_single_segment_index_writer() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let directory = RamDirectory::default();
let mut single_segment_index_writer = Index::builder()
.schema(schema)
.single_segment_index_writer(directory, 10_000_000)?;
for _ in 0..10 {
let doc = doc!(text_field=>"hello");
single_segment_index_writer.add_document(doc)?;
}
let index = single_segment_index_writer.finalize()?;
let searcher = index.reader()?.searcher();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "hello"),
IndexRecordOption::Basic,
);
let count = searcher.search(&term_query, &Count)?;
assert_eq!(count, 10);
Ok(())
}
}

View File

@@ -7,7 +7,6 @@ mod segment;
mod segment_component;
mod segment_id;
mod segment_reader;
mod single_segment_index_writer;
use std::path::Path;
@@ -24,7 +23,6 @@ pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::SegmentReader;
pub use self::single_segment_index_writer::SingleSegmentIndexWriter;
/// The meta file contains all the information about the list of segments and the schema
/// of the index.

View File

@@ -1,47 +0,0 @@
use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter;
use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
#[doc(hidden)]
pub struct SingleSegmentIndexWriter {
segment_writer: SegmentWriter,
segment: Segment,
opstamp: Opstamp,
}
impl SingleSegmentIndexWriter {
pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
let segment = index.new_segment();
let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
Ok(Self {
segment_writer,
segment,
opstamp: 0,
})
}
pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
let opstamp = self.opstamp;
self.opstamp += 1;
self.segment_writer
.add_document(AddOperation { opstamp, document })
}
pub fn finalize(self) -> crate::Result<Index> {
let max_doc = self.segment_writer.max_doc();
self.segment_writer.finalize()?;
let segment: Segment = self.segment.with_max_doc(max_doc);
let index = segment.index();
let index_meta = IndexMeta {
index_settings: index.settings().clone(),
segments: vec![segment.meta().clone()],
schema: index.schema(),
opstamp: 0,
payload: None,
};
save_metas(&index_meta, index.directory())?;
index.directory().sync_directory()?;
Ok(segment.index().clone())
}
}

View File

@@ -210,7 +210,7 @@ mod tests {
#[test]
fn test_fastfield_gcd_i64() -> crate::Result<()> {
for &code_type in ALL_CODECS {
test_fastfield_gcd_i64_with_codec(code_type, 5500)?;
test_fastfield_gcd_i64_with_codec(code_type, 5005)?;
}
Ok(())
}
@@ -251,7 +251,7 @@ mod tests {
#[test]
fn test_fastfield_gcd_u64() -> crate::Result<()> {
for &code_type in ALL_CODECS {
test_fastfield_gcd_u64_with_codec(code_type, 5500)?;
test_fastfield_gcd_u64_with_codec(code_type, 5005)?;
}
Ok(())
}
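Background for these tests: during serialization the fast-field writer looks for a divisor shared by all values and stores `(val - min) / gcd`, which shrinks the per-value bit width (second-granularity timestamps stored in milliseconds share a GCD of 1000, for instance). A standalone sketch of that normalization, assuming this shape rather than quoting tantivy's actual code:

/// Computes the greatest common divisor (Euclid's algorithm).
fn gcd(mut a: u64, mut b: u64) -> u64 {
    while b != 0 {
        let t = b;
        b = a % b;
        a = t;
    }
    a
}

/// Returns (min, gcd, normalized values) so that
/// original = min + normalized * gcd for every entry.
fn normalize(vals: &[u64]) -> Option<(u64, u64, Vec<u64>)> {
    let min = *vals.iter().min()?;
    let g = vals.iter().fold(0u64, |acc, &v| gcd(acc, v - min)).max(1);
    Some((min, g, vals.iter().map(|&v| (v - min) / g).collect()))
}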

View File

@@ -978,20 +978,37 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench {
use fastfield_codecs::Column;
use std::collections::HashMap;
use std::path::Path;
use test::{self, Bencher};
use super::tests::generate_permutation;
use super::tests::{generate_permutation, FIELD, SCHEMA};
use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::fastfield::tests::generate_permutation_gcd;
use crate::fastfield::FastFieldReader;
#[bench]
fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for _ in 0..n {
for i in (0u32..n / 7).map(|v| v * 7) {
a ^= permutation[i as usize];
}
a
});
}
#[bench]
fn bench_intfastfield_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
a
@@ -999,83 +1016,102 @@ mod bench {
}
#[bench]
fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for _ in 0..n {
a = column.get_val(a as u64);
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
a
});
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n / 7).map(|val| val * 7) {
a ^= fast_field_reader.get(i);
}
a
});
}
}
#[bench]
fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
fn bench_intfastfield_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation();
let n = permutation.len();
b.iter(|| {
let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) {
a += permutation[i as usize];
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
a
});
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| {
let mut a = 0u32;
for i in 0u32..permutation.len() as u32 {
a = fast_field_reader.get(i) as u32;
}
a
});
}
}
#[bench]
fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) {
a += column.get_val(i as u64);
}
a
});
}
#[bench]
fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for i in 0u64..n as u64 {
a += column.get_val(i);
}
a
});
}
#[bench]
fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation_gcd();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for i in 0..n as u64 {
a += column.get_val(i);
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
a
});
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
#[bench]
fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let mut a = 0u64;
for i in 0..permutation.len() {
a += permutation[i as usize] as u64;
}
a
});
b.iter(|| {
let mut a = 0u32;
for i in 0u32..permutation.len() as u32 {
a = fast_field_reader.get(i) as u32;
}
a
});
}
}
}
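A note on the recurring `test::black_box(7000u32)` in these benchmarks: it hides the loop bound from the optimizer so the lookup loop cannot be constant-folded away. A minimal standalone sketch of the pattern (nightly-only `test` feature, as in the code above; `bench_sum_stride7` is a made-up name):

#![feature(test)]
extern crate test;

#[cfg(test)]
mod bench {
    use test::{black_box, Bencher};

    #[bench]
    fn bench_sum_stride7(b: &mut Bencher) {
        let data: Vec<u64> = (0..10_000u64).collect();
        b.iter(|| {
            // black_box keeps `n` opaque, so the loop survives optimization
            let n = black_box(7000u32);
            let mut acc = 0u64;
            for i in (0u32..n / 7).map(|v| v * 7) {
                acc ^= data[i as usize];
            }
            acc
        });
    }
}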

View File

@@ -174,7 +174,9 @@ fn index_documents(
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> crate::Result<()> {
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone())?;
let schema = segment.schema();
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), schema)?;
for document_group in grouped_document_iterator {
for doc in document_group {
segment_writer.add_document(doc)?;

View File

@@ -133,7 +133,7 @@ impl TermOrdinalMapping {
fn max_term_ord(&self) -> TermOrdinal {
self.per_segment_new_term_ordinals
.iter()
.flat_map(|term_ordinals| term_ordinals.iter().max().cloned())
.flat_map(|term_ordinals| term_ordinals.iter().max())
.max()
.unwrap_or_default()
}
@@ -784,7 +784,7 @@ impl IndexMerger {
let new_doc_id: DocId =
self.offsets
.iter()
.position(|&offset| offset > pos)
.position(|offset| offset > pos)
.expect("pos is out of bounds") as DocId
- 1u32;

View File

@@ -480,11 +480,11 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench_sorted_index_merge {
use fastfield_codecs::Column;
use test::{self, Bencher};
use crate::core::Index;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::indexer::merger::IndexMerger;
use crate::schema::{Cardinality, NumericOptions, Schema};
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
@@ -546,7 +546,7 @@ mod bench_sorted_index_merge {
// add values in order of the new doc_ids
let mut val = 0;
for (doc_id, _reader, field_reader) in sorted_doc_ids {
val = field_reader.get_val(doc_id as u64);
val = field_reader.get(doc_id);
}
val

View File

@@ -25,10 +25,39 @@ use crate::indexer::{
DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry,
SegmentSerializer,
};
use crate::schema::Schema;
use crate::{FutureResult, Opstamp};
const NUM_MERGE_THREADS: usize = 4;
/// Save the index meta file.
/// This operation is atomic:
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_new_metas(
schema: Schema,
index_settings: IndexSettings,
directory: &dyn Directory,
) -> crate::Result<()> {
save_metas(
&IndexMeta {
index_settings,
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
},
directory,
)?;
directory.sync_directory()?;
Ok(())
}
/// Save the index meta file.
/// This operation is atomic:
/// Either
@@ -38,7 +67,7 @@ const NUM_MERGE_THREADS: usize = 4;
/// and flushed.
///
/// This method is not part of tantivy's public API
pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
info!("save metas");
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
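The atomicity promised by the doc comment comes from publishing the serialized buffer in one step; tantivy routes this through `Directory::atomic_write`. The classic file-system pattern behind such an operation looks like the generic sketch below (illustrative only, not tantivy's code):

use std::fs::{self, File};
use std::io::Write;
use std::path::Path;

/// Write `data` to `path` atomically: write a temp file, flush, then rename.
/// Readers see either the old meta.json or the complete new one, never a torn write.
fn atomic_write(path: &Path, data: &[u8]) -> std::io::Result<()> {
    let tmp = path.with_extension("tmp");
    let mut file = File::create(&tmp)?;
    file.write_all(data)?;
    file.sync_all()?; // ensure bytes hit disk before the rename publishes them
    fs::rename(&tmp, path) // rename is atomic on POSIX file systems
}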

View File

@@ -80,8 +80,8 @@ impl SegmentWriter {
pub fn for_segment(
memory_budget_in_bytes: usize,
segment: Segment,
schema: Schema,
) -> crate::Result<SegmentWriter> {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone();
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;

View File

@@ -301,7 +301,7 @@ pub use self::docset::{DocSet, TERMINATED};
pub use crate::core::{
Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader,
Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta,
SegmentReader, SingleSegmentIndexWriter,
SegmentReader,
};
pub use crate::directory::Directory;
pub use crate::indexer::demuxer::*;

View File

@@ -227,7 +227,7 @@ pub mod tests {
{
let mut segment_writer =
SegmentWriter::for_segment(3_000_000, segment.clone()).unwrap();
SegmentWriter::for_segment(3_000_000, segment.clone(), schema).unwrap();
{
// checking that position works if the field has two values
let op = AddOperation {

View File

@@ -116,7 +116,7 @@ pub(crate) struct IndexingPosition {
/// and building a `Segment` in anonymous memory.
///
/// `PostingsWriter` writes in a `MemoryArena`.
pub(crate) trait PostingsWriter: Send + Sync {
pub(crate) trait PostingsWriter {
/// Record that a document contains a term at a given position.
///
/// * doc - the document id

View File

@@ -56,7 +56,7 @@ impl<'a> Iterator for VInt32Reader<'a> {
/// * the document id
/// * the term frequency
/// * the term positions
pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
pub(crate) trait Recorder: Copy + Default + 'static {
/// Returns the current document
fn current_doc(&self) -> u32;
/// Starts recording information about a new document