Compare commits

...

45 Commits

Author SHA1 Message Date
Halvor Bø
d09720c989 Fixed bench 2021-07-07 00:32:01 +01:00
Halvor Bø
d9bcf6e26b Cleaned up variable names 2021-07-07 00:25:42 +01:00
Halvor Bø
7ba0d0df1c Made the implementation respect ordering 2021-07-06 23:58:15 +01:00
Halvor Bø
f1f922ea1b Proof of concept implementation of slop 2021-07-06 20:50:41 +01:00
Halvor Bø
c35343d361 Added todo 2021-07-05 23:30:34 +01:00
Halvor Bø
7fe421ae2a Initial 2021-07-05 23:04:09 +01:00
PSeitz
8176b0335a Merge pull request #1108 from PSeitz/pwnedbytes
move ownedbytes to own crate
2021-07-05 16:07:56 +02:00
Pascal Seitz
811ac98f36 more inlines 2021-07-05 15:49:42 +02:00
François Massot
f4b2e71800 Handle field names with any characters with a known set of special (#1109)
* Handle field names with any characters with a known set of special characters and an escape one

* Update field name validation rule to check only if it has at least one character and does not start with `-`

Closes #1087.
2021-07-05 22:31:36 +09:00
PSeitz
c431cfcf12 extend proptests, fix race condition (#1107)
* extend proptests, fix race condition
* cargo fmt
2021-07-05 18:28:56 +09:00
PSeitz
92f20bc5a2 use nightly image in coverage 2021-07-03 09:38:44 +02:00
PSeitz
57f931da3c Create coverage.yml 2021-07-03 09:35:07 +02:00
Pascal Seitz
9b662e6d03 move ownedbytes to own crate
fixes #1106
2021-07-02 16:51:59 +02:00
PSeitz
18377d949c Merge pull request #1105 from PSeitz/clippy
Fix clippy warnings
2021-07-02 10:01:19 +02:00
Pascal Seitz
e6427b2588 cleanup 2021-07-02 09:21:22 +02:00
Pascal Seitz
0062fe705d cargo fmt 2021-07-01 18:17:08 +02:00
Pascal Seitz
9b3e508753 fix clippy 2021-07-01 18:06:09 +02:00
Pascal Seitz
a1ac63ee1c fix clippy 2021-07-01 18:06:03 +02:00
Pascal Seitz
e496ae0470 clippy fixes 2021-07-01 17:43:50 +02:00
Pascal Seitz
1e4df54ab3 fix clippy 2021-07-01 17:41:53 +02:00
Pascal Seitz
2de249af74 clippy fixes 2021-07-01 17:37:37 +02:00
Pascal Seitz
10f056fbb4 apply clippy fixes 2021-07-01 17:08:44 +02:00
PSeitz
074b09d0c0 Merge pull request #1102 from PSeitz/proptests
extend proptests for sorting and merge
2021-07-01 16:23:53 +02:00
Pascal Seitz
86d0727659 add facet test
closes #1100
2021-07-01 15:36:17 +02:00
Pascal Seitz
be3e1b8718 cargo fmt 2021-07-01 14:02:09 +02:00
Pascal Seitz
8fdf59bdac add search test for proptest 2021-07-01 14:01:30 +02:00
Pascal Seitz
ebebce2102 cargo fmt 2021-07-01 10:47:20 +02:00
Pascal Seitz
8044ec38da test docstore in proptest 2021-07-01 10:15:42 +02:00
Pascal Seitz
7413f87265 use set instead of vec in proptest 2021-07-01 08:28:51 +02:00
PSeitz
aea2e77665 Merge pull request #1097 from PSeitz/multifastfield
Use dynamic fastfield codes for multivalues fixes #1093
2021-06-30 14:38:26 +02:00
Pascal Seitz
a15845f9fd add merge case to proptest, test multivalue fastfields
#1100
2021-06-30 13:13:33 +02:00
Pascal Seitz
94ac44df4f proptest with optional sorting 2021-06-30 12:06:03 +02:00
Pascal Seitz
f80d804a57 add random commits in proptest 2021-06-30 11:18:07 +02:00
Pascal Seitz
3b5c1d7817 use measure_time 0.7 2021-06-30 11:08:02 +02:00
Pascal Seitz
24274edf81 remove trait impl for &Vec 2021-06-30 09:50:47 +02:00
Paul Masurel
d58497529b Fixed CHANGELOG to include 0.15.2. 2021-06-30 16:34:47 +09:00
Pascal Seitz
130495abab cleanup 2021-06-30 08:57:55 +02:00
Pascal Seitz
9b743d60fb make docid mapping non optional
make docid mapping non optional
add trivial flag for docid mapping
add time measurements
2021-06-30 08:57:55 +02:00
Pascal Seitz
5c9e2ef036 wrap docidmapping in struct 2021-06-30 08:57:55 +02:00
Pascal Seitz
8526434b63 add dynamic fastfield case
add dynamic fastfield for single fast field unsorted
fix scary documentation bug
add num_len instead of len
2021-06-30 08:57:55 +02:00
Pascal Seitz
6ba302c481 Use dynamic fastfield codes for multivalues fixes #1093
Use dynamic fastfield codes for multivalues fixes (only sorting case covered)
Rename get to get_val due to conflict with Vec
use u64 precision instead of u32 for get_range, to allow use of the existing fast field interface (actually not sure if it would be better to have a different interface)
2021-06-30 08:57:55 +02:00
Paul Masurel
de92f094aa Closes #1101 fix delete documents with sort by field
Closes #1101

* fix delete documents with sort by field

Co-authored-by: Andre-Philippe Paquet <appaquet@gmail.com>
2021-06-30 15:51:32 +09:00
Evance Soumaoro
c82cee66de exposing min/max value interface on MultiValuedFastField Reader (#1096) 2021-06-23 17:38:50 +09:00
Paul Masurel
6eed05b1ce Revert "Exposing min/max value interface on MultiValuedFastField Reader (#1094)" (#1095)
This reverts commit bb488305c9.
2021-06-23 10:25:11 +09:00
Evance Soumaoro
bb488305c9 Exposing min/max value interface on MultiValuedFastField Reader (#1094)
Exposing min/max value interface on MultiValuedFastField Reader
2021-06-23 08:51:36 +09:00
94 changed files with 1856 additions and 1092 deletions

27
.github/workflows/coverage.yml vendored Normal file
View File

@@ -0,0 +1,27 @@
name: coverage
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
test:
name: coverage
runs-on: ubuntu-latest
container:
image: xd009642/tarpaulin:develop-nightly
options: --security-opt seccomp=unconfined
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Generate code coverage
run: |
cargo +nightly tarpaulin --verbose --all-features --workspace --timeout 120 --out Xml
- name: Upload to codecov.io
uses: codecov/codecov-action@v1
with:
# token: ${{secrets.CODECOV_TOKEN}} # not required for public repos
fail_ci_if_error: true

View File

@@ -1,3 +1,12 @@
Tantivy 0.15.3
=========================
- Major bugfix. Deleting documents was broken when the index was sorted by a field. (@appaquet, @fulmicoton) #1101
Tantivy 0.15.2
========================
- Major bugfix. DocStore still panics when a deleted doc is at the beginning of a block. (@appaquet) #1088
Tantivy 0.15.1
=========================
- Major bugfix. DocStore panics when first block is deleted. (@appaquet) #1077

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.15.1"
version = "0.16.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -37,6 +37,7 @@ tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
common = { version="0.1", path="./common" }
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
ownedbytes = { version="0.1", path="./ownedbytes" }
stable_deref_trait = "1.2"
rust-stemmers = "1.2"
downcast-rs = "1.2"
@@ -53,6 +54,7 @@ rayon = "1.5"
lru = "0.6.5"
fastdivide = "0.3"
itertools = "0.10.0"
measure_time = "0.7.0"
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
@@ -90,7 +92,7 @@ unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
[workspace]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs"]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

View File

@@ -1,7 +1,7 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::TokenizerManager;
const ALICE_TXT: &'static str = include_str!("alice.txt");
const ALICE_TXT: &str = include_str!("alice.txt");
pub fn criterion_benchmark(c: &mut Criterion) {
let tokenizer_manager = TokenizerManager::default();

View File

@@ -106,7 +106,7 @@ pub fn read_u32_vint_no_advance(data: &[u8]) -> (u32, usize) {
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
let mut buf = [0u8; 8];
let data = serialize_vint_u32(val, &mut buf);
writer.write_all(&data)
writer.write_all(data)
}
impl VInt {
@@ -181,8 +181,8 @@ mod tests {
fn aux_test_vint(val: u64) {
let mut v = [14u8; 10];
let num_bytes = VInt(val).serialize_into(&mut v);
for i in num_bytes..10 {
assert_eq!(v[i], 14u8);
for el in &v[num_bytes..10] {
assert_eq!(el, &14u8);
}
assert!(num_bytes > 0);
if num_bytes < 10 {
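
The hunk above is typical of the several "fix clippy" commits in this compare: an index-based loop over a range is replaced by iterating over the sub-slice directly. A tiny standalone Rust sketch (not part of the diff) contrasting the two forms:

fn main() {
    let v = [14u8; 10];
    let num_bytes = 3;
    // Index-based loop, the form clippy's `needless_range_loop` lint flags:
    for i in num_bytes..10 {
        assert_eq!(v[i], 14u8);
    }
    // Slice-based loop, the replacement used in the hunk above:
    for el in &v[num_bytes..10] {
        assert_eq!(el, &14u8);
    }
}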

View File

@@ -139,7 +139,7 @@ fn main() -> tantivy::Result<()> {
//
// Lets index a bunch of fake documents for the sake of
// this example.
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(

View File

@@ -12,7 +12,7 @@ fn main() -> tantivy::Result<()> {
let ingredient = schema_builder.add_facet_field("ingredient", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(30_000_000)?;
@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
let query = BooleanQuery::new_multiterms_query(
facets
.iter()
.map(|key| Term::from_facet(ingredient, &key))
.map(|key| Term::from_facet(ingredient, key))
.collect(),
);
let top_docs_by_custom_score =

View File

@@ -22,7 +22,7 @@ fn main() -> tantivy::Result<()> {
let title = schema_builder.add_text_field("title", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
index_writer.add_document(doc!(title => "The Old Man and the Sea"));

View File

@@ -82,7 +82,7 @@ fn main() -> tantivy::Result<()> {
}]
}"#;
let short_man_doc = schema.parse_document(&short_man_json)?;
let short_man_doc = schema.parse_document(short_man_json)?;
index_writer.add_document(short_man_doc);

View File

@@ -25,7 +25,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
// # Indexing documents
let index = Index::create_in_dir(&index_path, schema.clone())?;
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer = index.writer(50_000_000)?;

View File

@@ -1,4 +1,3 @@
use tantivy;
use tantivy::schema::*;
// # Document from json
@@ -22,7 +21,7 @@ fn main() -> tantivy::Result<()> {
}"#;
// We can parse our document
let _mice_and_men_doc = schema.parse_document(&mice_and_men_doc_json)?;
let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?;
// Multi-valued field are allowed, they are
// expressed in JSON by an array.
@@ -31,7 +30,7 @@ fn main() -> tantivy::Result<()> {
"title": ["Frankenstein", "The Modern Prometheus"],
"year": 1818
}"#;
let _frankenstein_doc = schema.parse_document(&frankenstein_json)?;
let _frankenstein_doc = schema.parse_document(frankenstein_json)?;
// Note that the schema is saved in your index directory.
//

View File

@@ -27,8 +27,7 @@ mod tests {
}
fn value_iter() -> impl Iterator<Item = u64> {
let data = (0..20_000).collect::<Vec<_>>();
data.into_iter()
0..20_000
}
fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
b: &mut Bencher,
@@ -38,7 +37,7 @@ mod tests {
S::serialize(
&mut bytes,
&data,
stats_from_vec(&data),
stats_from_vec(data),
data.iter().cloned(),
data.iter().cloned(),
)
@@ -56,7 +55,7 @@ mod tests {
S::serialize(
&mut bytes,
&data,
stats_from_vec(&data),
stats_from_vec(data),
data.iter().cloned(),
data.iter().cloned(),
)

View File

@@ -35,7 +35,7 @@ impl<'data> FastFieldCodecReader for BitpackedFastFieldReader {
}
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &data)
self.min_value_u64 + self.bit_unpacker.get(doc, data)
}
#[inline]
fn min_value(&self) -> u64 {
@@ -147,7 +147,7 @@ mod tests {
fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(
&data, name,
data, name,
);
}
@@ -165,7 +165,7 @@ mod tests {
fn bitpacked_fast_field_rand() {
for _ in 0..500 {
let mut data = (0..1 + rand::random::<u8>() as usize)
.map(|_| rand::random::<i64>() as u64 / 2 as u64)
.map(|_| rand::random::<i64>() as u64 / 2)
.collect::<Vec<_>>();
create_and_validate(&data, "rand");

View File

@@ -51,14 +51,14 @@ pub trait FastFieldCodecSerializer {
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess {
/// Return the value associated to the given document.
/// Return the value associated to the given position.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: u32) -> u64;
/// May panic if `position` is greater than the index.
fn get_val(&self, position: u64) -> u64;
}
#[derive(Debug, Clone)]
@@ -69,20 +69,14 @@ pub struct FastFieldStats {
}
impl<'a> FastFieldDataAccess for &'a [u64] {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
}
}
impl<'a> FastFieldDataAccess for &'a Vec<u64> {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
}
impl FastFieldDataAccess for Vec<u64> {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
}
@@ -100,15 +94,15 @@ mod tests {
data: &[u64],
name: &str,
) -> (f32, f32) {
if !S::is_applicable(&data, crate::tests::stats_from_vec(&data)) {
if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
return (f32::MAX, 0.0);
}
let estimation = S::estimate(&data, crate::tests::stats_from_vec(&data));
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
let mut out = vec![];
S::serialize(
&mut out,
&data,
crate::tests::stats_from_vec(&data),
crate::tests::stats_from_vec(data),
data.iter().cloned(),
data.iter().cloned(),
)
@@ -125,7 +119,7 @@ mod tests {
}
}
let actual_compression = data.len() as f32 / out.len() as f32;
return (estimation, actual_compression);
(estimation, actual_compression)
}
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
let mut data_and_names = vec![];
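
Several hunks in this compare hinge on the accessor rename shown above: `get(doc: u32)` becomes `get_val(position: u64)`, and the redundant `&Vec<u64>` impl is dropped. The following is a minimal, self-contained sketch of that trait shape with a toy impl and call site; it is an illustration, not the tantivy code itself:

pub trait FastFieldDataAccess {
    /// Return the value stored at `position`.
    fn get_val(&self, position: u64) -> u64;
}

impl FastFieldDataAccess for Vec<u64> {
    fn get_val(&self, position: u64) -> u64 {
        self[position as usize]
    }
}

fn main() {
    let data: Vec<u64> = vec![10, 20, 30];
    // Positions are u64 now, matching the widened interface above.
    assert_eq!(data.get_val(2), 30);
}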

View File

@@ -78,7 +78,7 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, &data)) - self.footer.offset
(calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
}
#[inline]
@@ -123,8 +123,8 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);
// calculate offset to ensure all values are positive
let mut offset = 0;
@@ -191,8 +191,8 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);
// let's sample at 0%, 5%, 10% .. 95%, 100%
@@ -205,7 +205,7 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
.iter()
.map(|pos| {
let calculated_value = get_calculated_value(first_val, *pos as u64, slope);
let actual_value = fastfield_accessor.get(*pos as u32);
let actual_value = fastfield_accessor.get_val(*pos as u64);
distance(calculated_value, actual_value)
})
.max()
@@ -243,7 +243,7 @@ mod tests {
crate::tests::create_and_validate::<
LinearInterpolFastFieldSerializer,
LinearInterpolFastFieldReader,
>(&data, name);
>(data, name);
}
#[test]
@@ -285,9 +285,7 @@ mod tests {
#[test]
fn linear_interpol_fast_field_rand() {
for _ in 0..5000 {
let mut data = (0..50 as usize)
.map(|_| rand::random::<u64>())
.collect::<Vec<_>>();
let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
create_and_validate(&data, "random");
data.reverse();
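
The `estimate` function above fits a line through the first and last value and samples positions at 0%, 5%, .. 100% to find the worst deviation from that line. A small standalone sketch of the same idea in plain Rust, with made-up data and no tantivy types:

fn main() {
    // A perfectly linear sequence: the value at position i is 7 + 3 * i.
    let data: Vec<u64> = (0..100u64).map(|i| 7 + 3 * i).collect();
    let first_val = data[0] as f64;
    let last_val = *data.last().unwrap() as f64;
    let slope = (last_val - first_val) / (data.len() as f64 - 1.0);
    // Sample a handful of positions and keep the largest deviation.
    let max_deviation = (0..=20usize)
        .map(|step| step * (data.len() - 1) / 20)
        .map(|pos| {
            let calculated = first_val + slope * pos as f64;
            (data[pos] as f64 - calculated).abs()
        })
        .fold(0.0f64, f64::max);
    // Linear data deviates (essentially) not at all, so this codec would
    // estimate a very good compression ratio for it.
    assert!(max_deviation < 1e-6);
}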

View File

@@ -30,7 +30,7 @@ fn main() {
//.unwrap();
let best_compression_ratio_codec = results
.iter()
.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
.cloned()
.unwrap();
@@ -41,6 +41,7 @@ fn main() {
} else {
(est.to_string(), comp.to_string())
};
#[allow(clippy::all)]
let style = if comp == best_compression_ratio_codec.1 {
"Fb"
} else {
@@ -96,23 +97,23 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
data: &[u64],
) -> (bool, f32, f32, &'static str) {
let is_applicable = S::is_applicable(&data, stats_from_vec(&data));
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
if !is_applicable {
return (false, 0.0, 0.0, S::NAME);
}
let estimation = S::estimate(&data, stats_from_vec(&data));
let estimation = S::estimate(&data, stats_from_vec(data));
let mut out = vec![];
S::serialize(
&mut out,
&data,
stats_from_vec(&data),
stats_from_vec(data),
data.iter().cloned(),
data.iter().cloned(),
)
.unwrap();
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
return (true, estimation, actual_compression, S::NAME);
(true, estimation, actual_compression, S::NAME)
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {

View File

@@ -196,8 +196,8 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
let mut first_function = Function {
end_pos: stats.num_vals,
@@ -309,9 +309,10 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val_in_first_block = fastfield_accessor.get(0);
let first_val_in_first_block = fastfield_accessor.get_val(0);
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
let last_val_in_first_block = fastfield_accessor.get(last_elem_in_first_chunk as u32 - 1);
let last_val_in_first_block =
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
let slope = get_slope(
first_val_in_first_block,
last_val_in_first_block,
@@ -328,7 +329,7 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
.map(|pos| {
let calculated_value =
get_calculated_value(first_val_in_first_block, *pos as u64, slope);
let actual_value = fastfield_accessor.get(*pos as u32);
let actual_value = fastfield_accessor.get_val(*pos as u64);
distance(calculated_value, actual_value)
})
.max()
@@ -367,7 +368,7 @@ mod tests {
crate::tests::create_and_validate::<
MultiLinearInterpolFastFieldSerializer,
MultiLinearInterpolFastFieldReader,
>(&data, name);
>(data, name);
}
#[test]

10
ownedbytes/Cargo.toml Normal file
View File

@@ -0,0 +1,10 @@
[package]
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes"
version = "0.1.0"
edition = "2018"
description = "Expose data as static slice"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
stable_deref_trait = "1.2.0"

290
ownedbytes/src/lib.rs Normal file
View File

@@ -0,0 +1,290 @@
use stable_deref_trait::StableDeref;
use std::convert::TryInto;
use std::mem;
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::{fmt, io};
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
/// this data as a static slice.
///
/// The backing object is required to be `StableDeref`.
#[derive(Clone)]
pub struct OwnedBytes {
data: &'static [u8],
box_stable_deref: Arc<dyn Deref<Target = [u8]> + Sync + Send>,
}
impl OwnedBytes {
/// Creates an empty `OwnedBytes`.
pub fn empty() -> OwnedBytes {
OwnedBytes::new(&[][..])
}
/// Creates an `OwnedBytes` instance given a `StableDeref` object.
pub fn new<T: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync>(
data_holder: T,
) -> OwnedBytes {
let box_stable_deref = Arc::new(data_holder);
let bytes: &[u8] = box_stable_deref.as_ref();
let data = unsafe { mem::transmute::<_, &'static [u8]>(bytes.deref()) };
OwnedBytes {
data,
box_stable_deref,
}
}
/// creates a fileslice that is just a view over a slice of the data.
pub fn slice(&self, range: Range<usize>) -> Self {
OwnedBytes {
data: &self.data[range],
box_stable_deref: self.box_stable_deref.clone(),
}
}
/// Returns the underlying slice of data.
/// `Deref` and `AsRef` are also available.
#[inline]
pub fn as_slice(&self) -> &[u8] {
self.data
}
/// Returns the len of the slice.
#[inline]
pub fn len(&self) -> usize {
self.data.len()
}
/// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
///
/// Left will hold `split_len` bytes.
///
/// This operation is cheap and does not require to copy any memory.
/// On the other hand, both `left` and `right` retain a handle over
/// the entire slice of memory. In other words, the memory will only
/// be released when both left and right are dropped.
pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
let right_box_stable_deref = self.box_stable_deref.clone();
let left = OwnedBytes {
data: &self.data[..split_len],
box_stable_deref: self.box_stable_deref,
};
let right = OwnedBytes {
data: &self.data[split_len..],
box_stable_deref: right_box_stable_deref,
};
(left, right)
}
/// Returns true iff this `OwnedBytes` is empty.
#[inline]
pub fn is_empty(&self) -> bool {
self.as_slice().is_empty()
}
/// Drops the left most `advance_len` bytes.
///
/// See also [.clip(clip_len: usize))](#method.clip).
#[inline]
pub fn advance(&mut self, advance_len: usize) {
self.data = &self.data[advance_len..]
}
/// Reads an `u8` from the `OwnedBytes` and advance by one byte.
#[inline]
pub fn read_u8(&mut self) -> u8 {
assert!(!self.is_empty());
let byte = self.as_slice()[0];
self.advance(1);
byte
}
/// Reads an `u64` encoded as little-endian from the `OwnedBytes` and advance by 8 bytes.
#[inline]
pub fn read_u64(&mut self) -> u64 {
assert!(self.len() > 7);
let octlet: [u8; 8] = self.as_slice()[..8].try_into().unwrap();
self.advance(8);
u64::from_le_bytes(octlet)
}
}
impl fmt::Debug for OwnedBytes {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// We truncate the bytes in order to make sure the debug string
// is not too long.
let bytes_truncated: &[u8] = if self.len() > 8 {
&self.as_slice()[..10]
} else {
self.as_slice()
};
write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
}
}
impl Deref for OwnedBytes {
type Target = [u8];
#[inline]
fn deref(&self) -> &Self::Target {
self.as_slice()
}
}
impl io::Read for OwnedBytes {
#[inline]
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
let read_len = {
let data = self.as_slice();
if data.len() >= buf.len() {
let buf_len = buf.len();
buf.copy_from_slice(&data[..buf_len]);
buf.len()
} else {
let data_len = data.len();
buf[..data_len].copy_from_slice(data);
data_len
}
};
self.advance(read_len);
Ok(read_len)
}
#[inline]
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
let read_len = {
let data = self.as_slice();
buf.extend(data);
data.len()
};
self.advance(read_len);
Ok(read_len)
}
#[inline]
fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
let read_len = self.read(buf)?;
if read_len != buf.len() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"failed to fill whole buffer",
));
}
Ok(())
}
}
impl AsRef<[u8]> for OwnedBytes {
#[inline]
fn as_ref(&self) -> &[u8] {
self.as_slice()
}
}
#[cfg(test)]
mod tests {
use std::io::{self, Read};
use super::OwnedBytes;
#[test]
fn test_owned_bytes_debug() {
let short_bytes = OwnedBytes::new(b"abcd".as_ref());
assert_eq!(
format!("{:?}", short_bytes),
"OwnedBytes([97, 98, 99, 100], len=4)"
);
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
assert_eq!(
format!("{:?}", long_bytes),
"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
);
}
#[test]
fn test_owned_bytes_read() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcdefghiklmnopqrstuvwxyz".as_ref());
{
let mut buf = [0u8; 5];
bytes.read_exact(&mut buf[..]).unwrap();
assert_eq!(&buf, b"abcde");
assert_eq!(bytes.as_slice(), b"fghiklmnopqrstuvwxyz")
}
{
let mut buf = [0u8; 2];
bytes.read_exact(&mut buf[..]).unwrap();
assert_eq!(&buf, b"fg");
assert_eq!(bytes.as_slice(), b"hiklmnopqrstuvwxyz")
}
Ok(())
}
#[test]
fn test_owned_bytes_read_right_at_the_end() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = [0u8; 5];
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
assert_eq!(&buf, b"abcde");
assert_eq!(bytes.as_slice(), b"");
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
assert_eq!(&buf, b"abcde");
Ok(())
}
#[test]
fn test_owned_bytes_read_incomplete() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = [0u8; 7];
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
assert_eq!(&buf[..5], b"abcde");
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
Ok(())
}
#[test]
fn test_owned_bytes_read_to_end() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = Vec::new();
bytes.read_to_end(&mut buf)?;
assert_eq!(buf.as_slice(), b"abcde".as_ref());
Ok(())
}
#[test]
fn test_owned_bytes_read_u8() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"\xFF".as_ref());
assert_eq!(bytes.read_u8(), 255);
assert_eq!(bytes.len(), 0);
Ok(())
}
#[test]
fn test_owned_bytes_read_u64() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"\0\xFF\xFF\xFF\xFF\xFF\xFF\xFF".as_ref());
assert_eq!(bytes.read_u64(), u64::MAX - 255);
assert_eq!(bytes.len(), 0);
Ok(())
}
#[test]
fn test_owned_bytes_split() {
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
let (left, right) = bytes.split(3);
assert_eq!(left.as_slice(), b"abc");
assert_eq!(right.as_slice(), b"defghi");
}
#[test]
fn test_owned_bytes_split_boundary() {
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
{
let (left, right) = bytes.clone().split(0);
assert_eq!(left.as_slice(), b"");
assert_eq!(right.as_slice(), b"abcdefghi");
}
{
let (left, right) = bytes.split(9);
assert_eq!(left.as_slice(), b"abcdefghi");
assert_eq!(right.as_slice(), b"");
}
}
}
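
A brief usage sketch of the new `ownedbytes` crate, assuming only the API defined in the file above: wrap an owned buffer, take cheap views, and split without copying.

use ownedbytes::OwnedBytes;

fn main() {
    // `Vec<u8>` is `StableDeref`, so it can back an `OwnedBytes`.
    let bytes = OwnedBytes::new(vec![1u8, 2, 3, 4, 5, 6, 7, 8]);
    // A slice is just a view sharing the same backing allocation.
    let view = bytes.slice(2..6);
    assert_eq!(view.as_slice(), &[3, 4, 5, 6]);
    // Splitting returns two handles over the same memory; nothing is copied.
    let (left, right) = bytes.split(4);
    assert_eq!(left.as_slice(), &[1, 2, 3, 4]);
    assert_eq!(right.as_slice(), &[5, 6, 7, 8]);
}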

View File

@@ -14,3 +14,5 @@ edition = "2018"
[dependencies]
combine = {version="4", default-features=false, features=[] }
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] }

View File

@@ -1,21 +1,44 @@
use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use crate::Occur;
use combine::parser::char::{char, digit, letter, space, spaces, string};
use combine::parser::char::{char, digit, space, spaces, string};
use combine::parser::range::{take_while, take_while1};
use combine::parser::repeat::escaped;
use combine::parser::Parser;
use combine::{
attempt, choice, eof, many, many1, one_of, optional, parser, satisfy, skip_many1, value,
};
use combine::{error::StringStreamError, parser::combinator::recognize};
use once_cell::sync::Lazy;
use regex::Regex;
fn field<'a>() -> impl Parser<&'a str, Output = String> {
(
(letter().or(char('_'))),
many(satisfy(|c: char| {
c.is_alphanumeric() || c == '_' || c == '-'
})),
)
.skip(char(':'))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to special characters.
const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
];
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*| )"#;
/// Parses a field_name
/// A field name must have at least one character and be followed by a colon.
/// All characters are allowed including special characters `SPECIAL_CHARS`, but these
/// need to be escaped with a backslash character '\'.
fn field_name<'a>() -> impl Parser<&'a str, Output = String> {
static ESCAPED_SPECIAL_CHARS_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(ESCAPED_SPECIAL_CHARS_PATTERN).unwrap());
recognize::<String, _, _>(escaped(
(
take_while1(|c| !SPECIAL_CHARS.contains(&c) && c != '-'),
take_while(|c| !SPECIAL_CHARS.contains(&c)),
),
'\\',
satisfy(|c| SPECIAL_CHARS.contains(&c)),
))
.skip(char(':'))
.map(|s| ESCAPED_SPECIAL_CHARS_RE.replace_all(&s, "$1").to_string())
.and_then(|s: String| match s.is_empty() {
true => Err(StringStreamError::UnexpectedParse),
_ => Ok(s),
})
}
fn word<'a>() -> impl Parser<&'a str, Output = String> {
@@ -98,7 +121,7 @@ fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
let term_val_with_field = negative_number().or(term_val());
(field(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
(field_name(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
field_name: Some(field_name),
phrase,
})
@@ -195,7 +218,7 @@ fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
);
(
optional(field()).skip(spaces()),
optional(field_name()).skip(spaces()),
// try elastic first, if it matches, the range is unbounded
attempt(elastic_unbounded_range).or(lower_to_upper),
)
@@ -464,21 +487,21 @@ mod test {
#[test]
fn test_parse_elastic_query_ranges() {
test_parse_query_to_ast_helper("title: >a", "title:{\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("title:>=a", "title:[\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("title: <a", "title:{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("title:<=a", "title:{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("title:<=bsd", "title:{\"*\" TO \"bsd\"]");
test_parse_query_to_ast_helper("title: >a", "\"title\":{\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("title:>=a", "\"title\":[\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("title: <a", "\"title\":{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("title:<=a", "\"title\":{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("title:<=bsd", "\"title\":{\"*\" TO \"bsd\"]");
test_parse_query_to_ast_helper("weight: >70", "weight:{\"70\" TO \"*\"}");
test_parse_query_to_ast_helper("weight:>=70", "weight:[\"70\" TO \"*\"}");
test_parse_query_to_ast_helper("weight: <70", "weight:{\"*\" TO \"70\"}");
test_parse_query_to_ast_helper("weight:<=70", "weight:{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: >60.7", "weight:{\"60.7\" TO \"*\"}");
test_parse_query_to_ast_helper("weight: >70", "\"weight\":{\"70\" TO \"*\"}");
test_parse_query_to_ast_helper("weight:>=70", "\"weight\":[\"70\" TO \"*\"}");
test_parse_query_to_ast_helper("weight: <70", "\"weight\":{\"*\" TO \"70\"}");
test_parse_query_to_ast_helper("weight:<=70", "\"weight\":{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: >60.7", "\"weight\":{\"60.7\" TO \"*\"}");
test_parse_query_to_ast_helper("weight: <= 70", "weight:{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: <= 70.5", "weight:{\"*\" TO \"70.5\"]");
test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
}
#[test]
@@ -491,22 +514,43 @@ mod test {
#[test]
fn test_field_name() -> TestParseResult {
assert_eq!(
super::field().parse("my-field-name:a")?,
("my-field-name".to_string(), "a")
super::field_name().parse(".my.field.name:a"),
Ok((".my.field.name".to_string(), "a"))
);
assert_eq!(
super::field().parse("my_field_name:a")?,
("my_field_name".to_string(), "a")
super::field_name().parse("my\\ field\\ name:a"),
Ok(("my field name".to_string(), "a"))
);
assert!(super::field().parse(":a").is_err());
assert!(super::field().parse("-my_field:a").is_err());
assert!(super::field_name().parse("my field:a").is_err());
assert_eq!(
super::field().parse("_my_field:a")?,
super::field_name().parse("\\(1\\+1\\):2"),
Ok(("(1+1)".to_string(), "2"))
);
assert_eq!(
super::field_name().parse("my_field_name:a"),
Ok(("my_field_name".to_string(), "a"))
);
assert!(super::field_name().parse("my_field_name").is_err());
assert!(super::field_name().parse(":a").is_err());
assert!(super::field_name().parse("-my_field:a").is_err());
assert_eq!(
super::field_name().parse("_my_field:a")?,
("_my_field".to_string(), "a")
);
Ok(())
}
#[test]
fn test_field_name_re() {
let escaped_special_chars_re = Regex::new(ESCAPED_SPECIAL_CHARS_PATTERN).unwrap();
for special_char in SPECIAL_CHARS.iter() {
assert_eq!(
escaped_special_chars_re.replace_all(&format!("\\{}", special_char), "$1"),
special_char.to_string()
);
}
}
#[test]
fn test_range_parser() {
// testing the range() parser separately
@@ -600,12 +644,14 @@ mod test {
#[test]
fn test_single_term_with_field() {
test_parse_query_to_ast_helper("abc:toto", "abc:\"toto\"");
test_parse_query_to_ast_helper("abc:toto", "\"abc\":\"toto\"");
}
#[test]
fn test_single_term_with_float() {
test_parse_query_to_ast_helper("abc:1.1", "abc:\"1.1\"");
test_parse_query_to_ast_helper("abc:1.1", "\"abc\":\"1.1\"");
test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":\"1.1\"");
test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":\"1.1\"");
}
#[test]
@@ -621,22 +667,27 @@ mod test {
#[test]
fn test_parse_test_query_other() {
test_parse_query_to_ast_helper("(+a +b) d", "(*(+\"a\" +\"b\") *\"d\")");
test_parse_query_to_ast_helper("+abc:toto", "abc:\"toto\"");
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+abc:\"toto\" -\"titi\")");
test_parse_query_to_ast_helper("-abc:toto", "(-abc:\"toto\")");
test_parse_query_to_ast_helper("abc:a b", "(*abc:\"a\" *\"b\")");
test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]");
test_parse_query_to_ast_helper("+abc:toto", "\"abc\":\"toto\"");
test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":\"toto\"");
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":\"toto\" -\"titi\")");
test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":\"toto\")");
test_is_parse_err("--abc:toto");
test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":\"a\" *\"b\")");
test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\"");
test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]");
}
#[test]
fn test_parse_query_with_range() {
test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:{\"*\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}");
test_parse_query_to_ast_helper("foo:[1.1 TO *}", "foo:[\"1.1\" TO \"*\"}");
test_parse_query_to_ast_helper("foo:{a TO z}", "\"foo\":{\"a\" TO \"z\"}");
test_parse_query_to_ast_helper("foo:[1 TO toto}", "\"foo\":[\"1\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[* TO toto}", "\"foo\":{\"*\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[1 TO *}", "\"foo\":[\"1\" TO \"*\"}");
test_parse_query_to_ast_helper(
"1.2.foo.bar:[1.1 TO *}",
"\"1.2.foo.bar\":[\"1.1\" TO \"*\"}",
);
test_is_parse_err("abc + ");
}
}
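
The new `field_name` parser above accepts any character as long as the special ones are escaped, then strips the backslashes with `ESCAPED_SPECIAL_CHARS_RE`. A standalone sketch of just that unescaping step, assuming only the `regex` crate and the pattern shown above:

use regex::Regex;

fn main() {
    // Same pattern as ESCAPED_SPECIAL_CHARS_PATTERN: a backslash followed by one
    // of the special characters, with the character itself captured as group 1.
    let escaped_special_chars_re =
        Regex::new(r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*| )"#).unwrap();
    // Replacing with `$1` keeps the captured character and drops the backslash.
    assert_eq!(
        escaped_special_chars_re.replace_all(r"my\ field\ name", "$1"),
        "my field name"
    );
    assert_eq!(
        escaped_special_chars_re.replace_all(r"\(1\+1\)", "$1"),
        "(1+1)"
    );
}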

View File

@@ -24,7 +24,7 @@ impl Debug for UserInputLeaf {
ref upper,
} => {
if let Some(ref field) = field {
write!(formatter, "{}:", field)?;
write!(formatter, "\"{}\":", field)?;
}
lower.display_lower(formatter)?;
write!(formatter, " TO ")?;
@@ -45,7 +45,7 @@ pub struct UserInputLiteral {
impl fmt::Debug for UserInputLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
match self.field_name {
Some(ref field_name) => write!(formatter, "{}:\"{}\"", field_name, self.phrase),
Some(ref field_name) => write!(formatter, "\"{}\":\"{}\"", field_name, self.phrase),
None => write!(formatter, "\"{}\"", self.phrase),
}
}
@@ -79,7 +79,7 @@ impl UserInputBound {
match *self {
UserInputBound::Inclusive(ref contents) => contents,
UserInputBound::Exclusive(ref contents) => contents,
UserInputBound::Unbounded => &"*",
UserInputBound::Unbounded => "*",
}
}
}

View File

@@ -297,10 +297,8 @@ impl Collector for FacetCollector {
if depth == collapse_depth + 1 {
collapsed_id = collapse_facet_ords.len();
collapse_facet_ords.push(facet_streamer.term_ord());
collapse_mapping.push(collapsed_id);
} else {
collapse_mapping.push(collapsed_id);
}
collapse_mapping.push(collapsed_id);
}
break;
}

View File

@@ -276,7 +276,7 @@ mod tests {
let mut collectors = MultiCollector::new();
let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2));
let count_handler = collectors.add_collector(Count);
let mut multifruits = searcher.search(&query, &mut collectors).unwrap();
let mut multifruits = searcher.search(&query, &collectors).unwrap();
assert_eq!(count_handler.extract(&mut multifruits), 5);
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);

View File

@@ -1080,7 +1080,7 @@ mod tests {
query: &str,
query_field: Field,
schema: Schema,
mut doc_adder: impl FnMut(&mut IndexWriter) -> (),
mut doc_adder: impl FnMut(&mut IndexWriter),
) -> (Index, Box<dyn Query>) {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();

View File

@@ -42,7 +42,7 @@ fn load_metas(
"Meta file does not contain valid utf8 file.".to_string(),
)
})?;
IndexMeta::deserialize(&meta_string, &inventory)
IndexMeta::deserialize(&meta_string, inventory)
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.to_path_buf(),
@@ -604,7 +604,7 @@ mod tests {
.is_ok());
assert!(Index::exists(&directory).unwrap());
assert!(Index::create(
directory.clone(),
directory,
Schema::builder().build(),
IndexSettings::default()
)

View File

@@ -369,7 +369,6 @@ mod tests {
schema::{Schema, TEXT},
IndexSettings, IndexSortByField, Order,
};
use serde_json;
#[test]
fn test_serialize_metas() {

View File

@@ -16,7 +16,6 @@ pub use self::index_meta::{
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::Searcher;
pub use self::segment::Segment;
pub use self::segment::SerializableSegment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::SegmentReader;

View File

@@ -1,13 +1,12 @@
use super::SegmentComponent;
use crate::core::Index;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::directory::error::{OpenReadError, OpenWriteError};
use crate::directory::Directory;
use crate::directory::{FileSlice, WritePtr};
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::schema::Schema;
use crate::Opstamp;
use crate::{core::Index, indexer::doc_id_mapping::DocIdMapping};
use std::fmt;
use std::path::PathBuf;
@@ -90,20 +89,3 @@ impl Segment {
Ok(write)
}
}
pub trait SerializableSegment {
/// Writes a view of a segment by pushing information
/// to the `SegmentSerializer`.
///
/// # Returns
/// The number of documents in the segment.
///
/// doc_id_map is used when index is created and sorted, to map to the new doc_id order.
/// It is not used by the `IndexMerger`, since the doc_id_mapping on cross-segments works
/// differently
fn write(
&self,
serializer: SegmentSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<u32>;
}

View File

@@ -22,7 +22,7 @@ use std::sync::atomic;
pub struct SegmentId(Uuid);
#[cfg(test)]
static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(|| atomic::AtomicUsize::default());
static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(atomic::AtomicUsize::default);
#[cfg(test)]
const ZERO_ARRAY: [u8; 8] = [0u8; 8];
@@ -108,6 +108,12 @@ impl fmt::Debug for SegmentId {
}
}
impl fmt::Display for SegmentId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Seg({:?})", self.short_uuid_string())
}
}
impl PartialOrd for SegmentId {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))

View File

@@ -1,4 +1,3 @@
use crate::common::HasLen;
use crate::core::InvertedIndexReader;
use crate::core::Segment;
use crate::core::SegmentComponent;
@@ -63,11 +62,9 @@ impl SegmentReader {
self.max_doc
}
/// Returns the number of documents.
/// Returns the number of alive documents.
/// Deleted documents are not counted.
///
/// Today, `tantivy` does not handle deletes so max doc and
/// num_docs are the same.
pub fn num_docs(&self) -> DocId {
self.num_docs
}
@@ -81,7 +78,7 @@ impl SegmentReader {
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
self.delete_bitset()
.map(|delete_set| delete_set.len() as DocId)
.map(|delete_set| delete_set.num_deleted() as DocId)
.unwrap_or(0u32)
}
@@ -329,6 +326,32 @@ mod test {
use crate::schema::{Schema, Term, STORED, TEXT};
use crate::DocId;
#[test]
fn test_num_alive() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("name", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let name = schema.get_field("name").unwrap();
{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"));
index_writer.add_document(doc!(name => "cap"));
// we should now have one segment with two docs
index_writer.delete_term(Term::from_field_text(name, "horse"));
index_writer.delete_term(Term::from_field_text(name, "cap"));
// ok, now we should have a deleted doc
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
assert_eq!(2, searcher.segment_reader(0).num_docs());
assert_eq!(4, searcher.segment_reader(0).max_doc());
Ok(())
}
#[test]
fn test_alive_docs_iterator() -> crate::Result<()> {
let mut schema_builder = Schema::builder();

View File

@@ -211,7 +211,7 @@ mod tests {
assert_eq!(right.read_bytes()?.as_slice(), b"");
}
{
let (left, right) = file_slice.clone().split_from_end(2);
let (left, right) = file_slice.split_from_end(2);
assert_eq!(left.read_bytes()?.as_slice(), b"abcd");
assert_eq!(right.read_bytes()?.as_slice(), b"ef");
}

View File

@@ -402,10 +402,8 @@ mod tests_mmap_specific {
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
assert!(managed_directory.garbage_collect(|| living_files).is_ok());
assert!(!managed_directory.exists(test_path1).unwrap());
} else {
assert!(!managed_directory.exists(test_path1).unwrap());
}
assert!(!managed_directory.exists(test_path1).unwrap());
}
#[test]
@@ -430,7 +428,7 @@ mod tests_mmap_specific {
assert_eq!(read_file.as_slice(), &[3u8, 4u8, 5u8]);
assert!(managed_directory.list_damaged().unwrap().is_empty());
let mut corrupted_path = tempdir_path.clone();
let mut corrupted_path = tempdir_path;
corrupted_path.push(test_path2);
let mut file = OpenOptions::new().write(true).open(&corrupted_path)?;
file.write_all(&[255u8])?;

View File

@@ -1,290 +1,11 @@
use crate::directory::FileHandle;
use stable_deref_trait::StableDeref;
use std::convert::TryInto;
use std::mem;
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::{fmt, io};
use std::io;
use std::ops::Range;
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
/// this data as a static slice.
///
/// The backing object is required to be `StableDeref`.
#[derive(Clone)]
pub struct OwnedBytes {
data: &'static [u8],
box_stable_deref: Arc<dyn Deref<Target = [u8]> + Sync + Send>,
}
pub use ownedbytes::OwnedBytes;
impl FileHandle for OwnedBytes {
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
Ok(self.slice(range))
}
}
impl OwnedBytes {
/// Creates an empty `OwnedBytes`.
pub fn empty() -> OwnedBytes {
OwnedBytes::new(&[][..])
}
/// Creates an `OwnedBytes` instance given a `StableDeref` object.
pub fn new<T: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync>(
data_holder: T,
) -> OwnedBytes {
let box_stable_deref = Arc::new(data_holder);
let bytes: &[u8] = box_stable_deref.as_ref();
let data = unsafe { mem::transmute::<_, &'static [u8]>(bytes.deref()) };
OwnedBytes {
data,
box_stable_deref,
}
}
/// creates a fileslice that is just a view over a slice of the data.
pub fn slice(&self, range: Range<usize>) -> Self {
OwnedBytes {
data: &self.data[range],
box_stable_deref: self.box_stable_deref.clone(),
}
}
/// Returns the underlying slice of data.
/// `Deref` and `AsRef` are also available.
#[inline]
pub fn as_slice(&self) -> &[u8] {
self.data
}
/// Returns the len of the slice.
#[inline]
pub fn len(&self) -> usize {
self.data.len()
}
/// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
///
/// Left will hold `split_len` bytes.
///
/// This operation is cheap and does not require to copy any memory.
/// On the other hand, both `left` and `right` retain a handle over
/// the entire slice of memory. In other words, the memory will only
/// be released when both left and right are dropped.
pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
let right_box_stable_deref = self.box_stable_deref.clone();
let left = OwnedBytes {
data: &self.data[..split_len],
box_stable_deref: self.box_stable_deref,
};
let right = OwnedBytes {
data: &self.data[split_len..],
box_stable_deref: right_box_stable_deref,
};
(left, right)
}
/// Returns true iff this `OwnedBytes` is empty.
#[inline]
pub fn is_empty(&self) -> bool {
self.as_slice().is_empty()
}
/// Drops the left most `advance_len` bytes.
///
/// See also [.clip(clip_len: usize))](#method.clip).
#[inline]
pub fn advance(&mut self, advance_len: usize) {
self.data = &self.data[advance_len..]
}
/// Reads an `u8` from the `OwnedBytes` and advance by one byte.
pub fn read_u8(&mut self) -> u8 {
assert!(!self.is_empty());
let byte = self.as_slice()[0];
self.advance(1);
byte
}
/// Reads an `u64` encoded as little-endian from the `OwnedBytes` and advance by 8 bytes.
pub fn read_u64(&mut self) -> u64 {
assert!(self.len() > 7);
let octlet: [u8; 8] = self.as_slice()[..8].try_into().unwrap();
self.advance(8);
u64::from_le_bytes(octlet)
}
}
impl fmt::Debug for OwnedBytes {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// We truncate the bytes in order to make sure the debug string
// is not too long.
let bytes_truncated: &[u8] = if self.len() > 8 {
&self.as_slice()[..10]
} else {
self.as_slice()
};
write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
}
}
impl Deref for OwnedBytes {
type Target = [u8];
fn deref(&self) -> &Self::Target {
self.as_slice()
}
}
impl io::Read for OwnedBytes {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
let read_len = {
let data = self.as_slice();
if data.len() >= buf.len() {
let buf_len = buf.len();
buf.copy_from_slice(&data[..buf_len]);
buf.len()
} else {
let data_len = data.len();
buf[..data_len].copy_from_slice(data);
data_len
}
};
self.advance(read_len);
Ok(read_len)
}
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
let read_len = {
let data = self.as_slice();
buf.extend(data);
data.len()
};
self.advance(read_len);
Ok(read_len)
}
fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
let read_len = self.read(buf)?;
if read_len != buf.len() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"failed to fill whole buffer",
));
}
Ok(())
}
}
impl AsRef<[u8]> for OwnedBytes {
fn as_ref(&self) -> &[u8] {
self.as_slice()
}
}
#[cfg(test)]
mod tests {
use std::io::{self, Read};
use super::OwnedBytes;
#[test]
fn test_owned_bytes_debug() {
let short_bytes = OwnedBytes::new(b"abcd".as_ref());
assert_eq!(
format!("{:?}", short_bytes),
"OwnedBytes([97, 98, 99, 100], len=4)"
);
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
assert_eq!(
format!("{:?}", long_bytes),
"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
);
}
#[test]
fn test_owned_bytes_read() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcdefghiklmnopqrstuvwxyz".as_ref());
{
let mut buf = [0u8; 5];
bytes.read_exact(&mut buf[..]).unwrap();
assert_eq!(&buf, b"abcde");
assert_eq!(bytes.as_slice(), b"fghiklmnopqrstuvwxyz")
}
{
let mut buf = [0u8; 2];
bytes.read_exact(&mut buf[..]).unwrap();
assert_eq!(&buf, b"fg");
assert_eq!(bytes.as_slice(), b"hiklmnopqrstuvwxyz")
}
Ok(())
}
#[test]
fn test_owned_bytes_read_right_at_the_end() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = [0u8; 5];
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
assert_eq!(&buf, b"abcde");
assert_eq!(bytes.as_slice(), b"");
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
assert_eq!(&buf, b"abcde");
Ok(())
}
#[test]
fn test_owned_bytes_read_incomplete() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = [0u8; 7];
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
assert_eq!(&buf[..5], b"abcde");
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
Ok(())
}
#[test]
fn test_owned_bytes_read_to_end() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
let mut buf = Vec::new();
bytes.read_to_end(&mut buf)?;
assert_eq!(buf.as_slice(), b"abcde".as_ref());
Ok(())
}
#[test]
fn test_owned_bytes_read_u8() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"\xFF".as_ref());
assert_eq!(bytes.read_u8(), 255);
assert_eq!(bytes.len(), 0);
Ok(())
}
#[test]
fn test_owned_bytes_read_u64() -> io::Result<()> {
let mut bytes = OwnedBytes::new(b"\0\xFF\xFF\xFF\xFF\xFF\xFF\xFF".as_ref());
assert_eq!(bytes.read_u64(), u64::MAX - 255);
assert_eq!(bytes.len(), 0);
Ok(())
}
#[test]
fn test_owned_bytes_split() {
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
let (left, right) = bytes.split(3);
assert_eq!(left.as_slice(), b"abc");
assert_eq!(right.as_slice(), b"defghi");
}
#[test]
fn test_owned_bytes_split_boundary() {
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
{
let (left, right) = bytes.clone().split(0);
assert_eq!(left.as_slice(), b"");
assert_eq!(right.as_slice(), b"abcdefghi");
}
{
let (left, right) = bytes.split(9);
assert_eq!(left.as_slice(), b"abcdefghi");
assert_eq!(right.as_slice(), b"");
}
}
}

View File

@@ -166,26 +166,26 @@ fn test_write_create_the_file(directory: &dyn Directory) {
fn test_directory_delete(directory: &dyn Directory) -> crate::Result<()> {
let test_path: &'static Path = Path::new("some_path_for_test");
assert!(directory.open_read(test_path).is_err());
let mut write_file = directory.open_write(&test_path)?;
let mut write_file = directory.open_write(test_path)?;
write_file.write_all(&[1, 2, 3, 4])?;
write_file.flush()?;
{
let read_handle = directory.open_read(&test_path)?.read_bytes()?;
let read_handle = directory.open_read(test_path)?.read_bytes()?;
assert_eq!(read_handle.as_slice(), &[1u8, 2u8, 3u8, 4u8]);
// Mapped files can't be deleted on Windows
if !cfg!(windows) {
assert!(directory.delete(&test_path).is_ok());
assert!(directory.delete(test_path).is_ok());
assert_eq!(read_handle.as_slice(), &[1u8, 2u8, 3u8, 4u8]);
}
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
}
if cfg!(windows) {
assert!(directory.delete(&test_path).is_ok());
assert!(directory.delete(test_path).is_ok());
}
assert!(directory.open_read(&test_path).is_err());
assert!(directory.delete(&test_path).is_err());
assert!(directory.open_read(test_path).is_err());
assert!(directory.delete(test_path).is_err());
Ok(())
}

View File

@@ -83,11 +83,11 @@ impl BytesFastFieldWriter {
&'a self,
doc_id_map: Option<&'b DocIdMapping>,
) -> impl Iterator<Item = &'b [u8]> {
let doc_id_iter = if let Some(doc_id_map) = doc_id_map {
Box::new(doc_id_map.iter_old_doc_ids().cloned()) as Box<dyn Iterator<Item = u32>>
let doc_id_iter: Box<dyn Iterator<Item = u32>> = if let Some(doc_id_map) = doc_id_map {
Box::new(doc_id_map.iter_old_doc_ids())
} else {
Box::new(self.doc_index.iter().enumerate().map(|el| el.0 as u32))
as Box<dyn Iterator<Item = u32>>
let max_doc = self.doc_index.len() as u32;
Box::new(0..max_doc)
};
doc_id_iter.map(move |doc_id| self.get_values_for_doc_id(doc_id))
}
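
The hunk above replaces the per-branch `as Box<dyn Iterator<Item = u32>>` casts with a single type annotation on the binding, and uses a plain `0..max_doc` range for the identity case. A tiny standalone sketch of that pattern with made-up values (not tantivy code):

fn main() {
    // Optional remapping of doc ids, standing in for `doc_id_map`.
    let doc_id_map: Option<Vec<u32>> = Some(vec![2, 0, 1]);
    let max_doc = 3u32;
    // Annotating the binding lets both branches coerce to the boxed trait
    // object without an explicit cast on each arm.
    let doc_id_iter: Box<dyn Iterator<Item = u32>> = if let Some(map) = doc_id_map {
        Box::new(map.into_iter())
    } else {
        Box::new(0..max_doc)
    };
    assert_eq!(doc_id_iter.collect::<Vec<_>>(), vec![2, 0, 1]);
}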

View File

@@ -91,6 +91,10 @@ impl DeleteBitSet {
b & (1u8 << shift) != 0
}
/// The number of deleted docs
pub fn num_deleted(&self) -> usize {
self.num_deleted
}
/// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount {
self.data.len()

View File

@@ -267,8 +267,8 @@ mod tests {
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 37 as usize);
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 37);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
@@ -298,8 +298,8 @@ mod tests {
fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
serializer.close()?;
}
let file = directory.open_read(&path)?;
assert_eq!(file.len(), 62 as usize);
let file = directory.open_read(path)?;
assert_eq!(file.len(), 62);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -334,8 +334,8 @@ mod tests {
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 35 as usize);
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35);
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -366,8 +366,8 @@ mod tests {
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 80043 as usize);
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 80043);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -405,9 +405,9 @@ mod tests {
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
let file = directory.open_read(path).unwrap();
//assert_eq!(file.len(), 17710 as usize); //bitpacked size
assert_eq!(file.len(), 10175 as usize); // linear interpol size
assert_eq!(file.len(), 10175_usize); // linear interpol size
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(i64_field).unwrap();
@@ -447,7 +447,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
let file = directory.open_read(path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(i64_field).unwrap();
@@ -480,7 +480,7 @@ mod tests {
fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
serializer.close()?;
}
let file = directory.open_read(&path)?;
let file = directory.open_read(path)?;
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();

View File

@@ -90,7 +90,7 @@ mod tests {
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser
.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()).to_string())
.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
@@ -121,7 +121,7 @@ mod tests {
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser
.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()).to_string())
.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))

View File

@@ -1,8 +1,6 @@
use std::ops::Range;
use crate::fastfield::{
BitpackedFastFieldReader, DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength,
};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
use crate::DocId;
/// Reader for a multivalued `u64` fast field.
@@ -16,13 +14,13 @@ use crate::DocId;
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
vals_reader: DynamicFastFieldReader<Item>,
}
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
vals_reader: DynamicFastFieldReader<Item>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader,
@@ -32,6 +30,7 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get(doc);
let stop = self.idx_reader.get(doc + 1);
@@ -39,20 +38,41 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
}
/// Returns the array of values associated to the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let range = self.range(doc);
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range_u64(range.start, &mut vals[..]);
self.vals_reader.get_range(range.start, &mut vals[..]);
}
/// Returns the minimum value for this fast field.
///
/// The min value does not take possible
/// deleted documents into account, and should be considered a lower bound
/// of the actual minimum value.
pub fn min_value(&self) -> Item {
self.vals_reader.min_value()
}
/// Returns the maximum value for this fast field.
///
/// The max value does not take possible
/// deleted documents into account, and should be considered an upper bound
/// of the actual maximum value.
pub fn max_value(&self) -> Item {
self.vals_reader.max_value()
}
/// Returns the number of values associated with the document `DocId`.
#[inline]
pub fn num_vals(&self, doc: DocId) -> usize {
let range = self.range(doc);
(range.end - range.start) as usize
}
/// Returns the overall number of values in this field.
#[inline]
pub fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
}
@@ -71,7 +91,7 @@ impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
mod tests {
use crate::core::Index;
use crate::schema::{Facet, Schema, INDEXED};
use crate::schema::{Cardinality, Facet, IntOptions, Schema, INDEXED};
#[test]
fn test_multifastfield_reader() {
@@ -126,4 +146,32 @@ mod tests {
assert_eq!(&vals[..], &[4]);
}
}
#[test]
fn test_multifastfield_reader_min_max() {
let mut schema_builder = Schema::builder();
let field_options = IntOptions::default()
.set_indexed()
.set_fast(Cardinality::MultiValues);
let item_field = schema_builder.add_i64_field("items", field_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index
.writer_for_tests()
.expect("Failed to create index writer.");
index_writer.add_document(doc!(
item_field => 2i64,
item_field => 3i64,
item_field => -2i64,
));
index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64));
index_writer.add_document(doc!(item_field => 4i64));
index_writer.commit().expect("Commit failed");
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let field_reader = segment_reader.fast_fields().i64s(item_field).unwrap();
assert_eq!(field_reader.min_value(), -2);
assert_eq!(field_reader.max_value(), 6);
}
}
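
For orientation, the multivalued fast field reads values through two columns: `idx_reader` holds one offset per document (the `range` method turns it into `start..stop`), and `vals_reader` holds the flattened values. Below is a minimal, self-contained sketch of that layout using plain vectors; the `MultiValuedSketch` type is illustrative only and not part of tantivy.

/// Minimal sketch of the two-column layout used by a multivalued fast field.
/// `idx[doc]..idx[doc + 1]` is the slice of `vals` belonging to `doc`.
struct MultiValuedSketch {
    idx: Vec<u64>,  // one entry per doc, plus a trailing sentinel
    vals: Vec<i64>, // all values, concatenated in doc order
}

impl MultiValuedSketch {
    fn get_vals(&self, doc: usize) -> &[i64] {
        let start = self.idx[doc] as usize;
        let stop = self.idx[doc + 1] as usize;
        &self.vals[start..stop]
    }
}

fn main() {
    // Three docs with values [2, 3, -2], [6, 3] and [4].
    let reader = MultiValuedSketch {
        idx: vec![0, 3, 5, 6],
        vals: vec![2, 3, -2, 6, 3, 4],
    };
    assert_eq!(reader.get_vals(0), &[2, 3, -2]);
    assert_eq!(reader.get_vals(2), &[4]);
    // min/max over the whole value column, mirroring min_value()/max_value().
    assert_eq!(reader.vals.iter().min(), Some(&-2));
    assert_eq!(reader.vals.iter().max(), Some(&6));
}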

View File

@@ -102,11 +102,11 @@ impl MultiValuedFastFieldWriter {
&'a self,
doc_id_map: Option<&'b DocIdMapping>,
) -> impl Iterator<Item = &'b [u64]> {
let doc_id_iter = if let Some(doc_id_map) = doc_id_map {
Box::new(doc_id_map.iter_old_doc_ids().cloned()) as Box<dyn Iterator<Item = u32>>
let doc_id_iter: Box<dyn Iterator<Item = u32>> = if let Some(doc_id_map) = doc_id_map {
Box::new(doc_id_map.iter_old_doc_ids())
} else {
Box::new(self.doc_index.iter().enumerate().map(|el| el.0 as u32))
as Box<dyn Iterator<Item = u32>>
let max_doc = self.doc_index.len() as DocId;
Box::new(0..max_doc)
};
doc_id_iter.map(move |doc_id| self.get_values_for_doc_id(doc_id))
}

View File

@@ -44,13 +44,13 @@ pub trait FastFieldReader<Item: FastValue>: Clone {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]);
fn get_range(&self, start: u64, output: &mut [Item]);
/// Returns the minimum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
/// The min value does not take possible
/// deleted documents into account, and should be considered a lower bound
/// of the actual minimum value.
fn min_value(&self) -> Item;
/// Returns the maximum value for this fast field.
@@ -120,7 +120,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
Self::MultiLinearInterpol(reader) => reader.get(doc),
}
}
fn get_range(&self, start: DocId, output: &mut [Item]) {
fn get_range(&self, start: u64, output: &mut [Item]) {
match self {
Self::Bitpacked(reader) => reader.get_range(start, output),
Self::LinearInterpol(reader) => reader.get_range(start, output),
@@ -226,8 +226,8 @@ impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]) {
self.get_range_u64(u64::from(start), output);
fn get_range(&self, start: u64, output: &mut [Item]) {
self.get_range_u64(start, output);
}
/// Returns the minimum value for this fast field.
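
As a usage note on the new `get_range(start: u64, ...)` signature: the start is now a plain value position rather than a `DocId`. The toy reader below is a stand-in that mirrors the documented semantics (fill `output` from `start`, panic if the window runs past the stored values); it is not the tantivy reader type.

// Toy stand-in mirroring the new `get_range(start: u64, ...)` semantics.
struct VecReader(Vec<u64>);

impl VecReader {
    fn get_range(&self, start: u64, output: &mut [u64]) {
        // Panics, like the real readers, if the requested window is out of bounds.
        let start = start as usize;
        output.copy_from_slice(&self.0[start..start + output.len()]);
    }
}

fn main() {
    let reader = VecReader((0..10u64).map(|i| i * 10).collect());
    let mut window = vec![0u64; 3];
    // Read three consecutive values starting at position 4.
    reader.get_range(4, &mut window);
    assert_eq!(window, vec![40, 50, 60]);
}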

View File

@@ -99,12 +99,19 @@ impl FastFieldReaders {
Ok(())
}
pub(crate) fn typed_fast_field_reader_with_idx<TFastValue: FastValue>(
&self,
field: Field,
index: usize,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, index)?;
DynamicFastFieldReader::open(fast_field_slice)
}
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, 0)?;
DynamicFastFieldReader::open(fast_field_slice)
self.typed_fast_field_reader_with_idx(field, 0)
}
pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
@@ -112,9 +119,7 @@ impl FastFieldReaders {
field: Field,
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let idx_reader = self.typed_fast_field_reader(field)?;
let fast_field_slice_vals = self.fast_field_data(field, 1)?;
let vals_reader: BitpackedFastFieldReader<TFastValue> =
BitpackedFastFieldReader::open(fast_field_slice_vals)?;
let vals_reader = self.typed_fast_field_reader_with_idx(field, 1)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
}

View File

@@ -46,11 +46,7 @@ fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
if !T::is_applicable(fastfield_accessor, stats.clone()) {
return;
}
let (ratio, name, id) = (
T::estimate(fastfield_accessor, stats.clone()),
T::NAME,
T::ID,
);
let (ratio, name, id) = (T::estimate(fastfield_accessor, stats), T::NAME, T::ID);
estimations.push((ratio, name, id));
}
@@ -71,7 +67,26 @@ impl CompositeFastFieldSerializer {
data_iter_1: impl Iterator<Item = u64>,
data_iter_2: impl Iterator<Item = u64>,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, 0);
self.create_auto_detect_u64_fast_field_with_idx(
field,
stats,
fastfield_accessor,
data_iter_1,
data_iter_2,
0,
)
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically.
pub fn create_auto_detect_u64_fast_field_with_idx(
&mut self,
field: Field,
stats: FastFieldStats,
fastfield_accessor: impl FastFieldDataAccess,
data_iter_1: impl Iterator<Item = u64>,
data_iter_2: impl Iterator<Item = u64>,
idx: usize,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
let mut estimations = vec![];
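
For context on `create_auto_detect_u64_fast_field_with_idx`: the codec choice works by collecting one size estimation per applicable codec and keeping the smallest. The sketch below shows only that selection step with an illustrative `Codec` struct; the real code goes through the `FastFieldCodecSerializer` estimations shown above.

// Schematic sketch of the "auto detect" selection: each candidate codec
// reports an estimated size ratio for the data, and the smallest estimate wins.
struct Codec {
    name: &'static str,
    estimate: fn(&[u64]) -> f32,
}

fn pick_best_codec(data: &[u64], codecs: &[Codec]) -> &'static str {
    let mut estimations: Vec<(f32, &'static str)> = codecs
        .iter()
        .map(|codec| ((codec.estimate)(data), codec.name))
        .collect();
    // Sort by ratio; the first entry is the codec expected to compress best.
    estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
    estimations[0].1
}

fn main() {
    let data: Vec<u64> = (0..100).collect();
    let codecs = [
        Codec { name: "bitpacked", estimate: |d| d.len() as f32 * 1.0 },
        Codec { name: "linearinterpol", estimate: |d| d.len() as f32 * 0.1 },
    ];
    assert_eq!(pick_best_codec(&data, &codecs), "linearinterpol");
}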

View File

@@ -7,7 +7,6 @@ use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
use crate::termdict::TermOrdinal;
use crate::DocId;
use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;
@@ -296,7 +295,7 @@ impl IntFastFieldWriter {
if let Some(doc_id_map) = doc_id_map {
let iter = doc_id_map
.iter_old_doc_ids()
.map(|doc_id| self.vals.get(*doc_id as usize));
.map(|doc_id| self.vals.get(doc_id as usize));
serializer.create_auto_detect_u64_fast_field(
self.field,
stats,
@@ -323,16 +322,17 @@ struct WriterFastFieldAccessProvider<'map, 'bitp> {
vals: &'bitp BlockedBitpacker,
}
impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> {
/// Return the value associated to the given document.
/// Return the value associated to the given doc.
///
/// This accessor should return as fast as possible.
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: DocId) -> u64 {
/// May panic if `doc` is greater than the number of documents in the index.
fn get_val(&self, doc: u64) -> u64 {
if let Some(doc_id_map) = self.doc_id_map {
self.vals.get(doc_id_map.get_old_doc_id(doc) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
self.vals
.get(doc_id_map.get_old_doc_id(doc as u32) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
} else {
self.vals.get(doc as usize)
}

View File

@@ -98,7 +98,7 @@ impl FieldNormsWriter {
let mut mapped_fieldnorm_values = vec![];
mapped_fieldnorm_values.resize(fieldnorm_values.len(), 0u8);
for (new_doc_id, old_doc_id) in doc_id_map.iter_old_doc_ids().enumerate() {
mapped_fieldnorm_values[new_doc_id] = fieldnorm_values[*old_doc_id as usize];
mapped_fieldnorm_values[new_doc_id] = fieldnorm_values[old_doc_id as usize];
}
fieldnorms_serializer.serialize_field(field, &mapped_fieldnorm_values)?;
} else {

View File

@@ -38,7 +38,7 @@ fn test_functional_store() -> crate::Result<()> {
for iteration in 0..500 {
dbg!(iteration);
let num_docs: usize = rng.gen_range(0..4);
if doc_set.len() >= 1 {
if !doc_set.is_empty() {
let doc_to_remove_id = rng.gen_range(0..doc_set.len());
let removed_doc_id = doc_set.swap_remove(doc_to_remove_id);
index_writer.delete_term(Term::from_field_u64(id_field, removed_doc_id));
@@ -88,19 +88,17 @@ fn test_functional_indexing() -> crate::Result<()> {
&searcher,
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
)?;
} else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
} else {
if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
index_writer.add_document(doc);
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
index_writer.add_document(doc);
}
}
Ok(())

View File

@@ -1,6 +1,6 @@
use super::operation::DeleteOperation;
use crate::Opstamp;
use std::mem;
use std::ops::DerefMut;
use std::sync::{Arc, RwLock, Weak};
@@ -105,7 +105,7 @@ impl DeleteQueue {
return None;
}
let delete_operations = mem::replace(&mut self_wlock.writer, vec![]);
let delete_operations = std::mem::take(&mut self_wlock.writer);
let new_block = Arc::new(Block {
operations: Arc::from(delete_operations.into_boxed_slice()),
@@ -286,7 +286,7 @@ mod tests {
operations_it.advance();
}
{
let mut operations_it = snapshot.clone();
let mut operations_it = snapshot;
assert_eq!(operations_it.get().unwrap().opstamp, 1);
operations_it.advance();
assert_eq!(operations_it.get().unwrap().opstamp, 2);

View File

@@ -2,13 +2,61 @@
//! to get mappings from old doc_id to new doc_id and vice versa, after sorting
//!
use super::SegmentWriter;
use super::{merger::SegmentReaderWithOrdinal, SegmentWriter};
use crate::{
schema::{Field, Schema},
DocId, IndexSortByField, Order, TantivyError,
};
use std::cmp::Reverse;
/// Struct to provide mapping from old doc_id to new doc_id and vice versa
use std::{cmp::Reverse, ops::Index};
/// Struct to provide mapping from new doc_id to old doc_id and segment.
#[derive(Clone)]
pub(crate) struct SegmentDocidMapping<'a> {
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
is_trivial: bool,
}
impl<'a> SegmentDocidMapping<'a> {
pub(crate) fn new(
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
is_trivial: bool,
) -> Self {
Self {
new_doc_id_to_old_and_segment,
is_trivial,
}
}
pub(crate) fn iter(&self) -> impl Iterator<Item = &(DocId, SegmentReaderWithOrdinal)> {
self.new_doc_id_to_old_and_segment.iter()
}
pub(crate) fn len(&self) -> usize {
self.new_doc_id_to_old_and_segment.len()
}
/// This flag means the segments are simply stacked in the order of their ordinal.
/// e.g. [(0, 1), .. (n, 1), (0, 2)..., (m, 2)]
///
/// This allows for some optimization.
pub(crate) fn is_trivial(&self) -> bool {
self.is_trivial
}
}
impl<'a> Index<usize> for SegmentDocidMapping<'a> {
type Output = (DocId, SegmentReaderWithOrdinal<'a>);
fn index(&self, idx: usize) -> &Self::Output {
&self.new_doc_id_to_old_and_segment[idx]
}
}
impl<'a> IntoIterator for SegmentDocidMapping<'a> {
type Item = (DocId, SegmentReaderWithOrdinal<'a>);
type IntoIter = std::vec::IntoIter<Self::Item>;
fn into_iter(self) -> Self::IntoIter {
self.new_doc_id_to_old_and_segment.into_iter()
}
}
/// Struct to provide mapping from old doc_id to new doc_id and vice versa within a segment.
pub struct DocIdMapping {
new_doc_id_to_old: Vec<DocId>,
old_doc_id_to_new: Vec<DocId>,
@@ -24,8 +72,8 @@ impl DocIdMapping {
self.new_doc_id_to_old[doc_id as usize]
}
/// iterate over old doc_ids in order of the new doc_ids
pub fn iter_old_doc_ids(&self) -> std::slice::Iter<'_, DocId> {
self.new_doc_id_to_old.iter()
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
self.new_doc_id_to_old.iter().cloned()
}
}
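
To illustrate the new `SegmentDocidMapping`, here is a small standalone sketch of its two shapes: the trivial mapping, where alive documents are stacked segment by segment in ordinal order, and a non-trivial one produced by index sorting. Plain `(old_doc_id, segment_ordinal)` tuples stand in for the `(DocId, SegmentReaderWithOrdinal)` pairs.

// (old_doc_id, segment_ordinal) pairs, in new-doc-id order.
type Mapping = Vec<(u32, u32)>;

// Trivial: alive docs of segment 0 first, then segment 1, and so on.
fn stacked(num_docs_per_segment: &[u32]) -> Mapping {
    num_docs_per_segment
        .iter()
        .enumerate()
        .flat_map(|(ordinal, &num_docs)| (0..num_docs).map(move |doc| (doc, ordinal as u32)))
        .collect()
}

fn main() {
    // Two segments with 3 and 2 documents: the trivial mapping just stacks them.
    let trivial = stacked(&[3, 2]);
    assert_eq!(trivial, vec![(0, 0), (1, 0), (2, 0), (0, 1), (1, 1)]);
    // A sorted (non-trivial) mapping may interleave segments, e.g. when merging
    // by a fast field value; stacking optimizations no longer apply to it.
    let sorted: Mapping = vec![(0, 1), (0, 0), (1, 0), (1, 1), (2, 0)];
    assert_ne!(sorted, stacked(&[3, 2]));
}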

View File

@@ -14,35 +14,27 @@ use crate::Opstamp;
// The doc to opstamp mapping stores precisely an array
// indexed by doc id and storing the opstamp of the document.
//
// This mapping is (for the moment) stricly increasing
// because of the way document id are allocated.
// This mapping is NOT necessarily increasing, because
// we might be sorting documents according to a fast field.
#[derive(Clone)]
pub enum DocToOpstampMapping<'a> {
WithMap(&'a [Opstamp]),
None,
}
impl<'a> From<&'a [u64]> for DocToOpstampMapping<'a> {
fn from(opstamps: &[Opstamp]) -> DocToOpstampMapping {
DocToOpstampMapping::WithMap(opstamps)
}
}
impl<'a> DocToOpstampMapping<'a> {
/// Given an opstamp return the limit doc id L
/// such that all doc id D such that
// D >= L iff opstamp(D) >= than `target_opstamp`.
//
// The edge case opstamp = some doc opstamp is in practise
// never called.
pub fn compute_doc_limit(&self, target_opstamp: Opstamp) -> DocId {
match *self {
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
match doc_opstamps.binary_search(&target_opstamp) {
Ok(doc_id) | Err(doc_id) => doc_id as DocId,
}
/// Assesses whether a document should be considered deleted, given that it contains
/// a term whose delete operation was issued at the opstamp `delete_opstamp`.
///
/// This function returns true if the `DocToOpstampMapping` is `None` or if
/// the `doc_opstamp` is earlier than the delete opstamp.
pub fn is_deleted(&self, doc_id: DocId, delete_opstamp: Opstamp) -> bool {
match self {
Self::WithMap(doc_opstamps) => {
let doc_opstamp = doc_opstamps[doc_id as usize];
doc_opstamp < delete_opstamp
}
DocToOpstampMapping::None => DocId::max_value(),
Self::None => true,
}
}
}
@@ -55,40 +47,17 @@ mod tests {
#[test]
fn test_doc_to_opstamp_mapping_none() {
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
assert_eq!(
doc_to_opstamp_mapping.compute_doc_limit(1),
u32::max_value()
);
assert!(doc_to_opstamp_mapping.is_deleted(1u32, 0u64));
assert!(doc_to_opstamp_mapping.is_deleted(1u32, 2u64));
}
#[test]
fn test_doc_to_opstamp_mapping_complex() {
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(&[][..]);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 0);
}
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(&[1u64][..]);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 1);
}
{
let doc_to_opstamp_mapping =
DocToOpstampMapping::from(&[1u64, 12u64, 17u64, 23u64][..]);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
for i in 2u64..13u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 1);
}
for i in 13u64..18u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 2);
}
for i in 18u64..24u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 3);
}
for i in 24u64..30u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 4);
}
}
fn test_doc_to_opstamp_mapping_with_map() {
let doc_to_opstamp_mapping = DocToOpstampMapping::WithMap(&[5u64, 1u64, 0u64, 4u64, 3u64]);
assert_eq!(doc_to_opstamp_mapping.is_deleted(0u32, 2u64), false);
assert_eq!(doc_to_opstamp_mapping.is_deleted(1u32, 2u64), true);
assert_eq!(doc_to_opstamp_mapping.is_deleted(2u32, 2u64), true);
assert_eq!(doc_to_opstamp_mapping.is_deleted(3u32, 2u64), false);
assert_eq!(doc_to_opstamp_mapping.is_deleted(4u32, 2u64), false);
}
}
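
A short worked sketch of the rule encoded by `is_deleted` (not the tantivy implementation): a delete issued at opstamp `d` affects exactly the documents whose own opstamp is strictly smaller than `d`, i.e. the documents added before the delete.

// Sketch of the per-document delete rule. `doc_opstamps[doc]` is the opstamp
// at which `doc` was added.
fn is_deleted(doc_opstamps: &[u64], doc: usize, delete_opstamp: u64) -> bool {
    doc_opstamps[doc] < delete_opstamp
}

fn main() {
    // Same data as the test above: docs added at opstamps 5, 1, 0, 4, 3.
    let doc_opstamps = [5u64, 1, 0, 4, 3];
    // A delete issued at opstamp 2 only hits the docs added at opstamps 1 and 0.
    let deleted: Vec<usize> = (0..doc_opstamps.len())
        .filter(|&doc| is_deleted(&doc_opstamps, doc, 2))
        .collect();
    assert_eq!(deleted, vec![1, 2]);
}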

View File

@@ -106,22 +106,18 @@ fn compute_deleted_bitset(
}
// A delete operation should only affect
// document that were inserted after it.
//
// Limit doc helps identify the first document
// that may be affected by the delete operation.
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
// documents that were inserted before it.
let inverted_index = segment_reader.inverted_index(delete_op.term.field())?;
if let Some(mut docset) =
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)?
{
let mut deleted_doc = docset.doc();
while deleted_doc != TERMINATED {
if deleted_doc < limit_doc {
delete_bitset.insert(deleted_doc);
let mut doc_matching_deleted_term = docset.doc();
while doc_matching_deleted_term != TERMINATED {
if doc_opstamps.is_deleted(doc_matching_deleted_term, delete_op.opstamp) {
delete_bitset.insert(doc_matching_deleted_term);
might_have_changed = true;
}
deleted_doc = docset.advance();
doc_matching_deleted_term = docset.advance();
}
}
delete_cursor.advance();
@@ -230,14 +226,8 @@ fn index_documents(
let segment_with_max_doc = segment.with_max_doc(max_doc);
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
let delete_bitset_opt = apply_deletes(
&segment_with_max_doc,
&mut delete_cursor,
&doc_opstamps,
last_docstamp,
)?;
let delete_bitset_opt =
apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;
let meta = segment_with_max_doc.meta().clone();
meta.untrack_temp_docstore();
@@ -247,19 +237,26 @@ fn index_documents(
Ok(true)
}
/// `doc_opstamps` is required to be non-empty.
fn apply_deletes(
segment: &Segment,
mut delete_cursor: &mut DeleteCursor,
doc_opstamps: &[Opstamp],
last_docstamp: Opstamp,
) -> crate::Result<Option<BitSet>> {
if delete_cursor.get().is_none() {
// if there are no delete operation in the queue, no need
// to even open the segment.
return Ok(None);
}
let max_doc_opstamp: Opstamp = doc_opstamps
.iter()
.cloned()
.max()
.expect("Empty DocOpstamp is forbidden");
let segment_reader = SegmentReader::open(segment)?;
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps);
let max_doc = segment.meta().max_doc();
let mut deleted_bitset = BitSet::with_max_value(max_doc);
@@ -268,7 +265,7 @@ fn apply_deletes(
&segment_reader,
&mut delete_cursor,
&doc_to_opstamps,
last_docstamp,
max_doc_opstamp,
)?;
Ok(if may_have_deletes {
Some(deleted_bitset)
@@ -358,7 +355,7 @@ impl IndexWriter {
// dropping the last reference to the segment_updater.
self.drop_sender();
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]);
let former_workers_handles = std::mem::take(&mut self.workers_join_handle);
for join_handle in former_workers_handles {
join_handle
.join()
@@ -628,7 +625,7 @@ impl IndexWriter {
// and recreate a new one.
self.recreate_document_channel();
let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
let former_workers_join_handle = std::mem::take(&mut self.workers_join_handle);
for worker_handle in former_workers_join_handle {
let indexing_worker_result = worker_handle
@@ -784,17 +781,44 @@ impl Drop for IndexWriter {
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::collections::HashSet;
use futures::executor::block_on;
use proptest::prelude::*;
use proptest::prop_oneof;
use proptest::strategy::Strategy;
use super::super::operation::UserOperation;
use crate::collector::TopDocs;
use crate::directory::error::LockError;
use crate::error::*;
use crate::fastfield::FastFieldReader;
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::query::TermQuery;
use crate::schema::{self, IndexRecordOption, STRING};
use crate::schema::Cardinality;
use crate::schema::Facet;
use crate::schema::IntOptions;
use crate::schema::TextFieldIndexing;
use crate::schema::TextOptions;
use crate::schema::STORED;
use crate::schema::TEXT;
use crate::schema::{self, IndexRecordOption, FAST, INDEXED, STRING};
use crate::DocAddress;
use crate::Index;
use crate::ReloadPolicy;
use crate::Term;
use crate::{IndexSettings, IndexSortByField, Order};
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
proident, sunt in culpa qui officia deserunt mollit anim id est \
laborum.";
#[test]
fn test_operations_group() {
@@ -1282,6 +1306,330 @@ mod tests {
assert!(commit_again.is_ok());
}
#[test]
fn test_delete_with_sort_by_field() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let id_field =
schema_builder.add_u64_field("id", schema::INDEXED | schema::STORED | schema::FAST);
let schema = schema_builder.build();
let settings = IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "id".to_string(),
order: Order::Desc,
}),
..Default::default()
};
let index = Index::builder()
.schema(schema)
.settings(settings)
.create_in_ram()?;
let index_reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;
// create and delete docs in same commit
for id in 0u64..5u64 {
index_writer.add_document(doc!(id_field => id));
}
for id in 2u64..4u64 {
index_writer.delete_term(Term::from_field_u64(id_field, id));
}
for id in 5u64..10u64 {
index_writer.add_document(doc!(id_field => id));
}
index_writer.commit()?;
index_reader.reload()?;
let searcher = index_reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
assert_eq!(segment_reader.num_docs(), 8);
assert_eq!(segment_reader.max_doc(), 10);
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
let in_order_alive_ids: Vec<u64> = segment_reader
.doc_ids_alive()
.map(|doc| fast_field_reader.get(doc))
.collect();
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
Ok(())
}
#[derive(Debug, Clone, Copy)]
enum IndexingOp {
AddDoc { id: u64 },
DeleteDoc { id: u64 },
Commit,
}
fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..10u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u64..10u64).prop_map(|id| IndexingOp::AddDoc { id }),
(0u64..2u64).prop_map(|_| IndexingOp::Commit),
]
}
fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
let mut existing_ids = HashMap::new();
let mut deleted_ids = HashSet::new();
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
*existing_ids.entry(id).or_insert(0) += 1;
deleted_ids.remove(&id);
}
IndexingOp::DeleteDoc { id } => {
existing_ids.remove(&id);
deleted_ids.insert(id);
}
_ => {}
}
}
(existing_ids, deleted_ids)
}
fn test_operation_strategy(
ops: &[IndexingOp],
sort_index: bool,
force_merge: bool,
) -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED);
let text_field = schema_builder.add_text_field(
"text_field",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored(),
);
let large_text_field = schema_builder.add_text_field("large_text_field", TEXT | STORED);
let multi_numbers = schema_builder.add_u64_field(
"multi_numbers",
IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_stored(),
);
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build();
let settings = if sort_index {
IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "id".to_string(),
order: Order::Asc,
}),
..Default::default()
}
} else {
IndexSettings {
..Default::default()
}
};
let index = Index::builder()
.schema(schema)
.settings(settings)
.create_in_ram()?;
let mut index_writer = index.writer_for_tests()?;
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
index_writer
.add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM));
}
IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(id_field, id));
}
IndexingOp::Commit => {
index_writer.commit()?;
}
}
}
index_writer.commit()?;
let searcher = index.reader()?.searcher();
if force_merge {
index_writer.wait_merging_threads()?;
let mut index_writer = index.writer_for_tests()?;
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids)).unwrap();
assert!(index_writer.wait_merging_threads().is_ok());
}
}
let ids: HashSet<u64> = searcher
.segment_readers()
.iter()
.flat_map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get(doc))
})
.collect();
let (expected_ids_and_num_occurences, deleted_ids) = expected_ids(ops);
assert_eq!(
ids,
expected_ids_and_num_occurences
.keys()
.cloned()
.collect::<HashSet<_>>()
);
// multivalue fast field tests
for segment_reader in searcher.segment_readers().iter() {
let ff_reader = segment_reader.fast_fields().u64s(multi_numbers).unwrap();
for doc in segment_reader.doc_ids_alive() {
let mut vals = vec![];
ff_reader.get_vals(doc, &mut vals);
assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]);
assert!(expected_ids_and_num_occurences.contains_key(&vals[0]));
}
}
// doc store tests
for segment_reader in searcher.segment_readers().iter() {
let store_reader = segment_reader.get_store_reader().unwrap();
// test store iterator
for doc in store_reader.iter(segment_reader.delete_bitset()) {
let id = doc
.unwrap()
.get_first(id_field)
.unwrap()
.u64_value()
.unwrap();
assert!(expected_ids_and_num_occurences.contains_key(&id));
}
// test store random access
for doc_id in segment_reader.doc_ids_alive() {
let id = store_reader
.get(doc_id)
.unwrap()
.get_first(id_field)
.unwrap()
.u64_value()
.unwrap();
assert!(expected_ids_and_num_occurences.contains_key(&id));
let id2 = store_reader
.get(doc_id)
.unwrap()
.get_first(multi_numbers)
.unwrap()
.u64_value()
.unwrap();
assert_eq!(id, id2);
}
}
// test search
let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(1000)).unwrap();
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
};
for (existing_id, count) in expected_ids_and_num_occurences {
assert_eq!(do_search(&existing_id.to_string()).len() as u64, count);
}
for existing_id in deleted_ids {
assert_eq!(do_search(&existing_id.to_string()).len(), 0);
}
// test facets
for segment_reader in searcher.segment_readers().iter() {
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
for doc_id in segment_reader.doc_ids_alive() {
let mut facet_ords = Vec::new();
facet_reader.facet_ords(doc_id, &mut facet_ords);
assert_eq!(facet_ords.len(), 1);
let mut facet = Facet::default();
facet_reader
.facet_from_ord(facet_ords[0], &mut facet)
.unwrap();
let id = ff_reader.get(doc_id);
let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string()));
assert_eq!(facet, facet_expected);
}
}
Ok(())
}
proptest! {
#[test]
fn test_delete_with_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, false).is_ok());
}
#[test]
fn test_delete_without_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], false, false).is_ok());
}
#[test]
fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, true).is_ok());
}
#[test]
fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], false, true).is_ok());
}
}
#[test]
fn test_delete_with_sort_by_field_last_opstamp_is_not_max() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let sort_by_field = schema_builder.add_u64_field("sort_by", FAST);
let id_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let settings = IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "sort_by".to_string(),
order: Order::Asc,
}),
..Default::default()
};
let index = Index::builder()
.schema(schema)
.settings(settings)
.create_in_ram()?;
let mut index_writer = index.writer_for_tests()?;
// We add a doc...
index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64));
// And remove it.
index_writer.delete_term(Term::from_field_u64(id_field, 0u64));
// We add another doc.
index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64));
// The expected result is a segment with
// maxdoc = 2
// numdoc = 1.
index_writer.commit()?;
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
assert_eq!(segment_reader.max_doc(), 2);
assert_eq!(segment_reader.num_deleted_docs(), 1);
Ok(())
}
#[test]
fn test_index_doc_missing_field() {
let mut schema_builder = schema::Schema::builder();

View File

@@ -1,4 +1,3 @@
use super::doc_id_mapping::DocIdMapping;
use crate::error::DataCorruption;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fastfield::DeleteBitSet;
@@ -10,6 +9,7 @@ use crate::fastfield::MultiValuedFastFieldReader;
use crate::fieldnorm::FieldNormsSerializer;
use crate::fieldnorm::FieldNormsWriter;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::indexer::doc_id_mapping::SegmentDocidMapping;
use crate::indexer::SegmentSerializer;
use crate::postings::Postings;
use crate::postings::{InvertedIndexSerializer, SegmentPostings};
@@ -19,17 +19,18 @@ use crate::schema::{Field, Schema};
use crate::store::StoreWriter;
use crate::termdict::TermMerger;
use crate::termdict::TermOrdinal;
use crate::IndexSortByField;
use crate::{common::HasLen, fastfield::MultiValueLength};
use crate::{common::MAX_DOC_LIMIT, IndexSettings};
use crate::{core::Segment, indexer::doc_id_mapping::expect_field_id_for_sort_field};
use crate::{core::SegmentReader, Order};
use crate::{core::SerializableSegment, IndexSortByField};
use crate::{
docset::{DocSet, TERMINATED},
SegmentOrdinal,
};
use crate::{DocId, InvertedIndexReader, SegmentComponent};
use itertools::Itertools;
use measure_time::debug_time;
use std::cmp;
use std::collections::HashMap;
use std::sync::Arc;
@@ -215,7 +216,7 @@ impl IndexMerger {
let mut readers_with_min_sort_values = readers
.into_iter()
.map(|reader| {
let accessor = Self::get_sort_field_accessor(&reader, &sort_by_field)?;
let accessor = Self::get_sort_field_accessor(&reader, sort_by_field)?;
Ok((reader, accessor.min_value()))
})
.collect::<crate::Result<Vec<_>>>()?;
@@ -233,33 +234,24 @@ impl IndexMerger {
fn write_fieldnorms(
&self,
mut fieldnorms_serializer: FieldNormsSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
for field in fields {
fieldnorms_data.clear();
if let Some(doc_id_mapping) = doc_id_mapping {
let fieldnorms_readers: Vec<FieldNormReader> = self
.readers
.iter()
.map(|reader| reader.get_fieldnorms_reader(field))
.collect::<Result<_, _>>()?;
for (doc_id, reader_with_ordinal) in doc_id_mapping {
let fieldnorms_reader =
&fieldnorms_readers[reader_with_ordinal.ordinal as usize];
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(*doc_id);
fieldnorms_data.push(fieldnorm_id);
}
} else {
for reader in &self.readers {
let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
for doc_id in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc_id);
fieldnorms_data.push(fieldnorm_id);
}
}
let fieldnorms_readers: Vec<FieldNormReader> = self
.readers
.iter()
.map(|reader| reader.get_fieldnorms_reader(field))
.collect::<Result<_, _>>()?;
for (doc_id, reader_with_ordinal) in doc_id_mapping.iter() {
let fieldnorms_reader = &fieldnorms_readers[reader_with_ordinal.ordinal as usize];
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(*doc_id);
fieldnorms_data.push(fieldnorm_id);
}
fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?;
}
fieldnorms_serializer.close()?;
@@ -270,8 +262,10 @@ impl IndexMerger {
&self,
fast_field_serializer: &mut CompositeFastFieldSerializer,
mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
debug_time!("write_fast_fields");
for (field, field_entry) in self.schema.fields() {
let field_type = field_entry.field_type();
match field_type {
@@ -319,7 +313,7 @@ impl IndexMerger {
&self,
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
let (min_value, max_value) = self.readers.iter().map(|reader|{
let u64_reader: DynamicFastFieldReader<u64> = reader
@@ -328,7 +322,7 @@ impl IndexMerger {
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset())
})
.filter_map(|x| x)
.flatten()
.reduce(|a, b| {
(a.0.min(b.0), a.1.max(b.1))
}).expect("Unexpected error, empty readers in IndexMerger");
@@ -344,68 +338,44 @@ impl IndexMerger {
u64_reader
})
.collect::<Vec<_>>();
if let Some(doc_id_mapping) = doc_id_mapping {
#[derive(Clone)]
struct SortedDocidFieldAccessProvider<'a> {
doc_id_mapping: &'a Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
}
impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> {
fn get(&self, doc: DocId) -> u64 {
let (doc_id, reader_with_ordinal) = self.doc_id_mapping[doc as usize];
self.fast_field_readers[reader_with_ordinal.ordinal as usize].get(doc_id)
}
}
let stats = FastFieldStats {
min_value,
max_value,
num_vals: doc_id_mapping.len() as u64,
};
let fastfield_accessor = SortedDocidFieldAccessProvider {
doc_id_mapping,
fast_field_readers: &fast_field_readers,
};
let iter = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
let fast_field_reader = &fast_field_readers[reader_with_ordinal.ordinal as usize];
fast_field_reader.get(*doc_id)
});
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
fastfield_accessor,
iter.clone(),
iter,
)?;
Ok(())
} else {
let u64_readers = self.readers.iter()
.filter(|reader|reader.max_doc() != reader.delete_bitset().map(|bit_set|bit_set.len() as u32).unwrap_or(0))
.map(|reader|{
let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
(reader.max_doc(), u64_reader, reader.delete_bitset())
}).collect::<Vec<_>>();
let mut fast_single_field_serializer =
fast_field_serializer.new_u64_fast_field(field, min_value, max_value)?;
for (max_doc, u64_reader, delete_bitset_opt) in u64_readers {
for doc_id in 0u32..max_doc {
let is_deleted = delete_bitset_opt
.map(|delete_bitset| delete_bitset.is_deleted(doc_id))
.unwrap_or(false);
if !is_deleted {
let val = u64_reader.get(doc_id);
fast_single_field_serializer.add_val(val)?;
}
}
}
fast_single_field_serializer.close_field()?;
Ok(())
let stats = FastFieldStats {
min_value,
max_value,
num_vals: doc_id_mapping.len() as u64,
};
#[derive(Clone)]
struct SortedDocidFieldAccessProvider<'a> {
doc_id_mapping: &'a SegmentDocidMapping<'a>,
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
}
impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> {
fn get_val(&self, doc: u64) -> u64 {
let (doc_id, reader_with_ordinal) = self.doc_id_mapping[doc as usize];
self.fast_field_readers[reader_with_ordinal.ordinal as usize].get(doc_id)
}
}
let fastfield_accessor = SortedDocidFieldAccessProvider {
doc_id_mapping,
fast_field_readers: &fast_field_readers,
};
let iter1 = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
let fast_field_reader = &fast_field_readers[reader_with_ordinal.ordinal as usize];
fast_field_reader.get(*doc_id)
});
let iter2 = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
let fast_field_reader = &fast_field_readers[reader_with_ordinal.ordinal as usize];
fast_field_reader.get(*doc_id)
});
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
fastfield_accessor,
iter1,
iter2,
)?;
Ok(())
}
/// Checks if the readers are disjunct for their sort property and in the correct order to be
@@ -434,7 +404,7 @@ impl IndexMerger {
reader: &SegmentReader,
sort_by_field: &IndexSortByField,
) -> crate::Result<impl FastFieldReader<u64>> {
let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required
let field_id = expect_field_id_for_sort_field(reader.schema(), sort_by_field)?; // for now expect fastfield, but not strictly required
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
Ok(value_accessor)
}
@@ -469,7 +439,7 @@ impl IndexMerger {
pub(crate) fn generate_doc_id_mapping(
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<(DocId, SegmentReaderWithOrdinal)>> {
) -> crate::Result<SegmentDocidMapping> {
let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
// Loading the field accessor on demand causes a 15x regression
@@ -505,7 +475,7 @@ impl IndexMerger {
})
.map(|(doc_id, reader_with_id, _)| (doc_id, reader_with_id))
.collect::<Vec<_>>();
Ok(sorted_doc_ids)
Ok(SegmentDocidMapping::new(sorted_doc_ids, false))
}
// Creating the index file to point into the data, generic over `BytesFastFieldReader` and
@@ -517,9 +487,9 @@ impl IndexMerger {
fn write_1_n_fast_field_idx_generic<T: MultiValueLength>(
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
reader_and_field_accessors: &[(&SegmentReader, T)],
) -> crate::Result<()> {
) -> crate::Result<Vec<u64>> {
let mut total_num_vals = 0u64;
// In the first pass, we compute the total number of vals.
//
@@ -548,56 +518,37 @@ impl IndexMerger {
};
// We can now create our `idx` serializer, and in a second pass,
// can effectively push the different indexes.
if let Some(doc_id_mapping) = doc_id_mapping {
// copying into a temp vec is not ideal, but the fast field codec api requires random
// access, which is used in the estimation. It's possible to 1. calculate random
// acccess on the fly or 2. change the codec api to make random access optional, but
// they both have also major drawbacks.
let mut offsets = vec![];
let mut offset = 0;
for (doc_id, reader) in doc_id_mapping {
let reader = &reader_and_field_accessors[reader.ordinal as usize].1;
offsets.push(offset);
offset += reader.get_len(*doc_id) as u64;
}
// copying into a temp vec is not ideal, but the fast field codec api requires random
// access, which is used in the estimation. It's possible to 1. calculate random
// access on the fly or 2. change the codec api to make random access optional, but
// both also have major drawbacks.
let mut offsets = vec![];
let mut offset = 0;
for (doc_id, reader) in doc_id_mapping.iter() {
let reader = &reader_and_field_accessors[reader.ordinal as usize].1;
offsets.push(offset);
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
&offsets,
offsets.iter().cloned(),
offsets.iter().cloned(),
)?;
} else {
let mut offsets = vec![];
let mut offset = 0;
for (segment_reader, u64s_reader) in reader_and_field_accessors.iter() {
for doc in segment_reader.doc_ids_alive() {
offsets.push(offset);
offset += u64s_reader.get_len(doc) as u64;
}
}
offsets.push(offset);
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
&offsets,
offsets.iter().cloned(),
offsets.iter().cloned(),
)?;
offset += reader.get_len(*doc_id) as u64;
}
offsets.push(offset);
Ok(())
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
&offsets[..],
offsets.iter().cloned(),
offsets.iter().cloned(),
)?;
Ok(offsets)
}
/// Returns the fastfield index (index for the data, not the data).
fn write_multi_value_fast_field_idx(
&self,
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<Vec<u64>> {
let reader_and_field_accessors = self.readers.iter().map(|reader|{
let u64s_reader: MultiValuedFastFieldReader<u64> = reader.fast_fields()
.typed_fast_field_multi_reader(field)
@@ -618,9 +569,11 @@ impl IndexMerger {
field: Field,
term_ordinal_mappings: &TermOrdinalMapping,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
// Multifastfield consists in 2 fastfields.
debug_time!("write_hierarchical_facet_field");
// Multifastfield consists of 2 fastfields.
// The first serves as an index into the second one and is strictly increasing.
// The second contains the actual values.
@@ -645,53 +598,61 @@ impl IndexMerger {
let mut serialize_vals =
fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?;
let mut vals = Vec::with_capacity(100);
if let Some(doc_id_mapping) = doc_id_mapping {
for (old_doc_id, reader_with_ordinal) in doc_id_mapping {
let term_ordinal_mapping: &[TermOrdinal] =
term_ordinal_mappings.get_segment(reader_with_ordinal.ordinal as usize);
let ff_reader = &fast_field_reader[reader_with_ordinal.ordinal as usize];
ff_reader.get_vals(*old_doc_id, &mut vals);
for &prev_term_ord in &vals {
let new_term_ord = term_ordinal_mapping[prev_term_ord as usize];
serialize_vals.add_val(new_term_ord)?;
}
}
} else {
for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
let term_ordinal_mapping: &[TermOrdinal] =
term_ordinal_mappings.get_segment(segment_ord);
let ff_reader = &fast_field_reader[segment_ord as usize];
// TODO optimize if no deletes
for doc in segment_reader.doc_ids_alive() {
ff_reader.get_vals(doc, &mut vals);
for &prev_term_ord in &vals {
let new_term_ord = term_ordinal_mapping[prev_term_ord as usize];
serialize_vals.add_val(new_term_ord)?;
}
}
for (old_doc_id, reader_with_ordinal) in doc_id_mapping.iter() {
let term_ordinal_mapping: &[TermOrdinal] =
term_ordinal_mappings.get_segment(reader_with_ordinal.ordinal as usize);
let ff_reader = &fast_field_reader[reader_with_ordinal.ordinal as usize];
ff_reader.get_vals(*old_doc_id, &mut vals);
for &prev_term_ord in &vals {
let new_term_ord = term_ordinal_mapping[prev_term_ord as usize];
serialize_vals.add_val(new_term_ord)?;
}
}
serialize_vals.close_field()?;
}
Ok(())
}
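
A tiny sketch of the term-ordinal remapping applied per segment above: each old ordinal read from the multivalued fast field is rewritten through the segment's old-to-merged ordinal table before being serialized. The `remap` helper is illustrative only.

// Rewrite every old ordinal through the segment's old-ordinal -> merged-ordinal table.
fn remap(vals: &[u64], term_ordinal_mapping: &[u64]) -> Vec<u64> {
    vals.iter().map(|&ord| term_ordinal_mapping[ord as usize]).collect()
}

fn main() {
    // In the merged dictionary, old ordinals 0, 1, 2 become 4, 0, 7.
    let mapping = [4u64, 0, 7];
    assert_eq!(remap(&[2, 0], &mapping), vec![7, 4]);
}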
/// Creates a mapping when the segments are simply stacked. This is helpful to share code paths
/// between index sorting and the other cases.
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocidMapping> {
let mapping: Vec<_> = self
.readers
.iter()
.enumerate()
.map(|(ordinal, reader)| {
let reader_with_ordinal = SegmentReaderWithOrdinal {
ordinal: ordinal as u32,
reader,
};
reader
.doc_ids_alive()
.map(move |doc_id| (doc_id, reader_with_ordinal))
})
.flatten()
.collect();
Ok(SegmentDocidMapping::new(mapping, true))
}
fn write_multi_fast_field(
&self,
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
// Multifastfield consists of 2 fastfields.
// The first serves as an index into the second one and is strictly increasing.
// The second contains the actual values.
// First we merge the idx fast field.
self.write_multi_value_fast_field_idx(field, fast_field_serializer, doc_id_mapping)?;
let offsets =
self.write_multi_value_fast_field_idx(field, fast_field_serializer, doc_id_mapping)?;
let mut min_value = u64::max_value();
let mut max_value = u64::min_value();
let mut num_vals = 0;
let mut vals = Vec::with_capacity(100);
@@ -717,6 +678,7 @@ impl IndexMerger {
min_value = cmp::min(val, min_value);
max_value = cmp::max(val, max_value);
}
num_vals += vals.len();
}
ff_readers.push(ff_reader);
// TODO optimize when no deletes
@@ -727,40 +689,76 @@ impl IndexMerger {
max_value = 0;
}
let fast_field_reader = self
.readers
.iter()
.map(|reader| {
let ff_reader : MultiValuedFastFieldReader<u64> = reader.fast_fields()
.typed_fast_field_multi_reader(field)
.expect("Failed to find index for multivalued field. This is a bug in tantivy, please report.");
ff_reader
})
.collect::<Vec<_>>();
// We can now initialize our serializer, and push it the different values
let mut serialize_vals =
fast_field_serializer.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
if let Some(doc_id_mapping) = doc_id_mapping {
for (doc_id, reader_with_ordinal) in doc_id_mapping {
let ff_reader = &fast_field_reader[reader_with_ordinal.ordinal as usize];
ff_reader.get_vals(*doc_id, &mut vals);
for &val in &vals {
serialize_vals.add_val(val)?;
}
}
} else {
for (reader, ff_reader) in self.readers.iter().zip(ff_readers) {
// TODO optimize if no deletes
for doc in reader.doc_ids_alive() {
ff_reader.get_vals(doc, &mut vals);
for &val in &vals {
serialize_vals.add_val(val)?;
}
}
let stats = FastFieldStats {
max_value,
num_vals: num_vals as u64,
min_value,
};
struct SortedDocidMultiValueAccessProvider<'a> {
doc_id_mapping: &'a SegmentDocidMapping<'a>,
fast_field_readers: &'a Vec<MultiValuedFastFieldReader<u64>>,
offsets: Vec<u64>,
}
impl<'a> FastFieldDataAccess for SortedDocidMultiValueAccessProvider<'a> {
fn get_val(&self, pos: u64) -> u64 {
// use the offsets index to find the doc_id that contains the position.
// the offsets are strictly increasing so we can do a simple search on it.
let new_docid = self
.offsets
.iter()
.position(|&offset| offset > pos)
.expect("pos is out of bounds")
- 1;
// now we need to find the position of `pos` in the multivalued bucket
let num_pos_covered_until_now = self.offsets[new_docid];
let pos_in_values = pos - num_pos_covered_until_now;
let (old_doc_id, reader_with_ordinal) = self.doc_id_mapping[new_docid as usize];
let num_vals = self.fast_field_readers[reader_with_ordinal.ordinal as usize]
.get_len(old_doc_id);
assert!(num_vals >= pos_in_values);
let mut vals = vec![];
self.fast_field_readers[reader_with_ordinal.ordinal as usize]
.get_vals(old_doc_id, &mut vals);
vals[pos_in_values as usize]
}
}
serialize_vals.close_field()?;
let fastfield_accessor = SortedDocidMultiValueAccessProvider {
doc_id_mapping,
fast_field_readers: &ff_readers,
offsets,
};
let iter1 = doc_id_mapping
.iter()
.map(|(doc_id, reader_with_ordinal)| {
let ff_reader = &ff_readers[reader_with_ordinal.ordinal as usize];
let mut vals = vec![];
ff_reader.get_vals(*doc_id, &mut vals);
vals.into_iter()
})
.flatten();
let iter2 = doc_id_mapping
.iter()
.map(|(doc_id, reader_with_ordinal)| {
let ff_reader = &ff_readers[reader_with_ordinal.ordinal as usize];
let mut vals = vec![];
ff_reader.get_vals(*doc_id, &mut vals);
vals.into_iter()
})
.flatten();
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
field,
stats,
fastfield_accessor,
iter1,
iter2,
1,
)?;
Ok(())
}
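
The `SortedDocidMultiValueAccessProvider::get_val` above resolves a flat value position via the `offsets` array. Here is a self-contained sketch of that lookup, with the per-segment readers simplified to per-document value vectors.

// `offsets[d]` is the position of the first value belonging to remapped doc `d`;
// the last entry is the total number of values.
fn value_at(offsets: &[u64], per_doc_vals: &[Vec<u64>], pos: u64) -> u64 {
    // Find the first offset strictly greater than `pos`; the bucket is the one before it.
    let doc = offsets
        .iter()
        .position(|&offset| offset > pos)
        .expect("pos is out of bounds")
        - 1;
    let pos_in_doc = (pos - offsets[doc]) as usize;
    // In the real accessor this indexes into the per-segment reader;
    // here we just read from per-document value vectors.
    per_doc_vals[doc][pos_in_doc]
}

fn main() {
    // Three docs holding 2, 1 and 3 values respectively.
    let per_doc_vals = vec![vec![10u64, 11], vec![20], vec![30, 31, 32]];
    let offsets = [0u64, 2, 3, 6]; // cumulative value counts, plus sentinel
    assert_eq!(value_at(&offsets, &per_doc_vals, 0), 10);
    assert_eq!(value_at(&offsets, &per_doc_vals, 2), 20);
    assert_eq!(value_at(&offsets, &per_doc_vals, 5), 32);
}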
@@ -768,7 +766,7 @@ impl IndexMerger {
&self,
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
let reader_and_field_accessors = self
.readers
@@ -787,24 +785,13 @@ impl IndexMerger {
&reader_and_field_accessors,
)?;
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1);
if let Some(doc_id_mapping) = doc_id_mapping {
for (doc_id, reader_with_ordinal) in doc_id_mapping {
let bytes_reader =
&reader_and_field_accessors[reader_with_ordinal.ordinal as usize].1;
let val = bytes_reader.get_bytes(*doc_id);
serialize_vals.write_all(val)?;
}
} else {
for segment_reader in &self.readers {
let bytes_reader = segment_reader.fast_fields().bytes(field)
.expect("Failed to find bytes field in fast field reader. This is a bug in tantivy. Please report.");
// TODO: optimize if no deletes
for doc in segment_reader.doc_ids_alive() {
let val = bytes_reader.get_bytes(doc);
serialize_vals.write_all(val)?;
}
}
for (doc_id, reader_with_ordinal) in doc_id_mapping.iter() {
let bytes_reader = &reader_and_field_accessors[reader_with_ordinal.ordinal as usize].1;
let val = bytes_reader.get_bytes(*doc_id);
serialize_vals.write_all(val)?;
}
serialize_vals.flush()?;
Ok(())
}
@@ -815,8 +802,9 @@ impl IndexMerger {
field_type: &FieldType,
serializer: &mut InvertedIndexSerializer,
fieldnorm_reader: Option<FieldNormReader>,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<Option<TermOrdinalMapping>> {
debug_time!("write_postings_for_field");
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
let mut delta_computer = DeltaComputer::new();
@@ -841,40 +829,23 @@ impl IndexMerger {
};
let mut merged_terms = TermMerger::new(field_term_streams);
let mut max_doc = 0;
// map from segment doc ids to the resulting merged segment doc id.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = Vec::with_capacity(self.readers.len());
if let Some(doc_id_mapping) = doc_id_mapping {
merged_doc_id_map = self
.readers
.iter()
.map(|reader| {
let mut segment_local_map = vec![];
segment_local_map.resize(reader.max_doc() as usize, None);
segment_local_map
})
.collect();
for (new_doc_id, (old_doc_id, segment_and_ordinal)) in doc_id_mapping.iter().enumerate()
{
let segment_map = &mut merged_doc_id_map[segment_and_ordinal.ordinal as usize];
segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
}
} else {
for reader in &self.readers {
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
for doc_id in 0..reader.max_doc() {
if reader.is_deleted(doc_id) {
segment_local_map.push(None);
} else {
segment_local_map.push(Some(max_doc));
max_doc += 1u32;
}
}
merged_doc_id_map.push(segment_local_map);
}
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = self
.readers
.iter()
.map(|reader| {
let mut segment_local_map = vec![];
segment_local_map.resize(reader.max_doc() as usize, None);
segment_local_map
})
.collect();
for (new_doc_id, (old_doc_id, segment_and_ordinal)) in doc_id_mapping.iter().enumerate() {
let segment_map = &mut merged_doc_id_map[segment_and_ordinal.ordinal as usize];
segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
}
// The total number of tokens will only be exact when there have been no deletes.
//
// Otherwise, we approximate by removing deleted documents proportionally.
@@ -970,7 +941,7 @@ impl IndexMerger {
// I think this is not strictly necessary; it would be possible to
// avoid the loading into a vec via some form of kmerge, but then the merge
// logic would deviate much more from the stacking case (unsorted index)
if doc_id_mapping.is_some() {
if !doc_id_mapping.is_trivial() {
doc_id_and_positions.push((
remapped_doc_id,
term_freq,
@@ -985,7 +956,7 @@ impl IndexMerger {
doc = segment_postings.advance();
}
}
if doc_id_mapping.is_some() {
if !doc_id_mapping.is_trivial() {
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _, _)| doc_id);
for (doc_id, term_freq, positions) in &doc_id_and_positions {
field_serializer.write_doc(*doc_id, *term_freq, positions);
@@ -1004,7 +975,7 @@ impl IndexMerger {
&self,
serializer: &mut InvertedIndexSerializer,
fieldnorm_readers: FieldNormReaders,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<HashMap<Field, TermOrdinalMapping>> {
let mut term_ordinal_mappings = HashMap::new();
for (field, field_entry) in self.schema.fields() {
@@ -1027,8 +998,10 @@ impl IndexMerger {
fn write_storable_fields(
&self,
store_writer: &mut StoreWriter,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
doc_id_mapping: &SegmentDocidMapping,
) -> crate::Result<()> {
debug_time!("write_storable_fields");
let store_readers: Vec<_> = self
.readers
.iter()
@@ -1039,8 +1012,8 @@ impl IndexMerger {
.enumerate()
.map(|(i, store)| store.iter_raw(self.readers[i].delete_bitset()))
.collect();
if let Some(doc_id_mapping) = doc_id_mapping {
for (old_doc_id, reader_with_ordinal) in doc_id_mapping {
if !doc_id_mapping.is_trivial() {
for (old_doc_id, reader_with_ordinal) in doc_id_mapping.iter() {
let doc_bytes_it = &mut document_iterators[reader_with_ordinal.ordinal as usize];
if let Some(doc_bytes_res) = doc_bytes_it.next() {
let doc_bytes = doc_bytes_res?;
@@ -1084,25 +1057,24 @@ impl IndexMerger {
}
Ok(())
}
}
impl SerializableSegment for IndexMerger {
fn write(
&self,
mut serializer: SegmentSerializer,
_: Option<&DocIdMapping>,
) -> crate::Result<u32> {
/// Writes the merged segment by pushing information
/// to the `SegmentSerializer`.
///
/// # Returns
/// The number of documents in the resulting segment.
pub fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
{
// If the documents are already sorted and stackable, we ignore the mapping and execute
// it as if there was no sorting
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
None
self.get_doc_id_from_concatenated_data()?
} else {
Some(self.generate_doc_id_mapping(sort_by_field)?)
self.generate_doc_id_mapping(sort_by_field)?
}
} else {
None
self.get_doc_id_from_concatenated_data()?
};
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {

View File

@@ -32,6 +32,12 @@ impl SegmentRegisters {
} else if self.committed.contains_all(segment_ids) {
Some(SegmentsStatus::Committed)
} else {
warn!(
"segment_ids: {:?}, committed_ids: {:?}, uncommitted_ids {:?}",
segment_ids,
self.committed.segment_ids(),
self.uncommitted.segment_ids()
);
None
}
}
@@ -58,21 +64,6 @@ impl Debug for SegmentManager {
}
}
pub fn get_mergeable_segments(
in_merge_segment_ids: &HashSet<SegmentId>,
segment_manager: &SegmentManager,
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = segment_manager.read();
(
registers_lock
.committed
.get_mergeable_segments(in_merge_segment_ids),
registers_lock
.uncommitted
.get_mergeable_segments(in_merge_segment_ids),
)
}
impl SegmentManager {
pub fn from_segments(
segment_metas: Vec<SegmentMeta>,
@@ -86,6 +77,20 @@ impl SegmentManager {
}
}
pub fn get_mergeable_segments(
&self,
in_merge_segment_ids: &HashSet<SegmentId>,
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = self.read();
(
registers_lock
.committed
.get_mergeable_segments(in_merge_segment_ids),
registers_lock
.uncommitted
.get_mergeable_segments(in_merge_segment_ids),
)
}
/// Returns all of the segment entries (committed or uncommitted)
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
let registers_lock = self.read();

View File

@@ -4,6 +4,7 @@ use crate::indexer::delete_queue::DeleteCursor;
use crate::indexer::segment_entry::SegmentEntry;
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt::Display;
use std::fmt::{self, Debug, Formatter};
/// The segment register keeps track
@@ -29,6 +30,16 @@ impl Debug for SegmentRegister {
Ok(())
}
}
impl Display for SegmentRegister {
fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
write!(f, "SegmentRegister(")?;
for k in self.segment_states.keys() {
write!(f, "{}, ", k.short_uuid_string())?;
}
write!(f, ")")?;
Ok(())
}
}
impl SegmentRegister {
pub fn clear(&mut self) {
@@ -46,6 +57,10 @@ impl SegmentRegister {
.collect()
}
pub fn segment_ids(&self) -> Vec<SegmentId> {
self.segment_states.keys().cloned().collect()
}
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
self.segment_states.values().cloned().collect()
}

View File

@@ -1,11 +1,10 @@
use super::segment_manager::{get_mergeable_segments, SegmentManager};
use super::segment_manager::SegmentManager;
use crate::core::Index;
use crate::core::IndexMeta;
use crate::core::IndexSettings;
use crate::core::Segment;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::core::SerializableSegment;
use crate::core::META_FILEPATH;
use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
use crate::indexer::delete_queue::DeleteCursor;
@@ -140,7 +139,7 @@ fn merge(
// ... we just serialize this index merger in our new segment to merge the segments.
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone(), true)?;
let num_docs = merger.write(segment_serializer, None)?;
let num_docs = merger.write(segment_serializer)?;
let merged_segment_id = merged_segment.id();
@@ -209,7 +208,7 @@ pub fn merge_segments<Dir: Directory>(
&segments[..],
)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
let num_docs = merger.write(segment_serializer, None)?;
let num_docs = merger.write(segment_serializer)?;
let segment_meta = merged_index.new_segment_meta(merged_segment_id, num_docs);
@@ -528,10 +527,14 @@ impl SegmentUpdater {
}))
}
async fn consider_merge_options(&self) {
pub(crate) fn get_mergeable_segments(&self) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let merge_segment_ids: HashSet<SegmentId> = self.merge_operations.segment_in_merge();
let (committed_segments, uncommitted_segments) =
get_mergeable_segments(&merge_segment_ids, &self.segment_manager);
self.segment_manager
.get_mergeable_segments(&merge_segment_ids)
}
async fn consider_merge_options(&self) {
let (committed_segments, uncommitted_segments) = self.get_mergeable_segments();
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.
@@ -717,7 +720,7 @@ mod tests {
let seg_ids = index.searchable_segment_ids()?;
// docs exist, should have at least 1 segment
assert!(seg_ids.len() > 0);
assert!(!seg_ids.is_empty());
let term_vals = vec!["a", "b", "c", "d", "e", "f"];
for term_val in term_vals {

View File

@@ -12,12 +12,12 @@ use crate::schema::Schema;
use crate::schema::Term;
use crate::schema::Value;
use crate::schema::{Field, FieldEntry};
use crate::store::StoreReader;
use crate::tokenizer::{BoxTokenStream, PreTokenizedStream};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp;
use crate::{core::Segment, store::StoreWriter};
use crate::{core::SerializableSegment, store::StoreReader};
use crate::{DocId, SegmentComponent};
/// Computes the initial size of the hash table.
@@ -36,6 +36,20 @@ fn initial_table_size(per_thread_memory_budget: usize) -> crate::Result<usize> {
}
}
fn remap_doc_opstamps(
opstamps: Vec<Opstamp>,
doc_id_mapping_opt: Option<&DocIdMapping>,
) -> Vec<Opstamp> {
if let Some(doc_id_mapping_opt) = doc_id_mapping_opt {
doc_id_mapping_opt
.iter_old_doc_ids()
.map(|doc| opstamps[doc as usize])
.collect()
} else {
opstamps
}
}
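A quick, self-contained illustration of the remapping above; the plain `&[u32]` slice stands in for `DocIdMapping::iter_old_doc_ids()` and is only a hypothetical simplification:

```rust
// Simplified stand-in for remap_doc_opstamps: `old_doc_ids` lists, for each new
// doc id, the old doc id it came from.
fn remap_opstamps(opstamps: Vec<u64>, old_doc_ids: Option<&[u32]>) -> Vec<u64> {
    match old_doc_ids {
        Some(mapping) => mapping.iter().map(|&doc| opstamps[doc as usize]).collect(),
        None => opstamps,
    }
}

fn main() {
    // After sorting, old doc 2 comes first, then doc 0, then doc 1.
    assert_eq!(
        remap_opstamps(vec![10, 11, 12], Some(&[2, 0, 1][..])),
        vec![12, 10, 11]
    );
    // Without a mapping, the opstamps are returned untouched.
    assert_eq!(remap_opstamps(vec![10, 11, 12], None), vec![10, 11, 12]);
}
```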
/// A `SegmentWriter` is in charge of creating segment index from a
/// set of documents.
///
@@ -112,14 +126,15 @@ impl SegmentWriter {
.clone()
.map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
.transpose()?;
write(
remap_and_write(
&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer,
mapping.as_ref(),
)?;
Ok(self.doc_opstamps)
let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping.as_ref());
Ok(doc_opstamps)
}
pub fn mem_usage(&self) -> usize {
@@ -176,7 +191,7 @@ impl SegmentWriter {
.process(&mut |token| {
term_buffer.set_text(&token.text);
let unordered_term_id =
multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, term_buffer);
unordered_term_id_opt = Some(unordered_term_id);
});
if let Some(unordered_term_id) = unordered_term_id_opt {
@@ -237,7 +252,7 @@ impl SegmentWriter {
.u64_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_u64(u64_val);
multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, term_buffer);
}
}
FieldType::Date(_) => {
@@ -248,7 +263,7 @@ impl SegmentWriter {
.date_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_i64(date_val.timestamp());
multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, term_buffer);
}
}
FieldType::I64(_) => {
@@ -259,7 +274,7 @@ impl SegmentWriter {
.i64_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_i64(i64_val);
multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, term_buffer);
}
}
FieldType::F64(_) => {
@@ -270,7 +285,7 @@ impl SegmentWriter {
.f64_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
multifield_postings.subscribe(doc_id, &term_buffer);
multifield_postings.subscribe(doc_id, term_buffer);
}
}
FieldType::Bytes(_) => {
@@ -281,7 +296,7 @@ impl SegmentWriter {
.bytes_value()
.ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
self.multifield_postings.subscribe(doc_id, &term_buffer);
self.multifield_postings.subscribe(doc_id, term_buffer);
}
}
}
@@ -315,8 +330,12 @@ impl SegmentWriter {
}
}
// This method is used as a trick to workaround the borrow checker
fn write(
/// This method is used as a trick to work around the borrow checker.
/// Writes a view of a segment by pushing information
/// to the `SegmentSerializer`.
///
/// `doc_id_map` is used to map to the new doc_id order.
fn remap_and_write(
multifield_postings: &MultiFieldPostingsWriter,
fast_field_writers: &FastFieldsWriter,
fieldnorms_writer: &FieldNormsWriter,
@@ -340,6 +359,7 @@ fn write(
&term_ord_map,
doc_id_map,
)?;
// Finalize the temporary doc store and create a version of it that reflects the doc_id_map.
if let Some(doc_id_map) = doc_id_map {
let store_write = serializer
@@ -356,31 +376,16 @@ fn write(
.segment()
.open_read(SegmentComponent::TempStore)?,
)?;
for old_doc_id in doc_id_map.iter_old_doc_ids() {
let doc_bytes = store_read.get_document_bytes(*old_doc_id)?;
let doc_bytes = store_read.get_document_bytes(old_doc_id)?;
serializer.get_store_writer().store_bytes(&doc_bytes)?;
}
}
serializer.close()?;
Ok(())
}
impl SerializableSegment for SegmentWriter {
fn write(
&self,
serializer: SegmentSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<u32> {
let max_doc = self.max_doc;
write(
&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
serializer,
doc_id_map,
)?;
Ok(max_doc)
}
serializer.close()?;
Ok(())
}
#[cfg(test)]

View File

@@ -1,6 +1,13 @@
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
#![cfg_attr(all(feature = "unstable", test), feature(test))]
#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
#![cfg_attr(
feature = "cargo-clippy",
allow(
clippy::module_inception,
clippy::needless_range_loop,
clippy::bool_assert_comparison
)
)]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
@@ -100,7 +107,6 @@
#[cfg_attr(test, macro_use)]
extern crate serde_json;
#[macro_use]
extern crate log;
@@ -934,7 +940,7 @@ mod tests {
let id = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
let index_reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;
@@ -973,7 +979,7 @@ mod tests {
let searcher = index_reader.searcher();
let segment_ids: Vec<SegmentId> = searcher
.segment_readers()
.into_iter()
.iter()
.map(|reader| reader.segment_id())
.collect();
block_on(index_writer.merge(&segment_ids)).unwrap();

View File

@@ -46,7 +46,7 @@ pub mod tests {
fn create_positions_data(vals: &[u32]) -> crate::Result<OwnedBytes> {
let mut positions_buffer = vec![];
let mut serializer = PositionSerializer::new(&mut positions_buffer);
serializer.write_positions_delta(&vals);
serializer.write_positions_delta(vals);
serializer.close_term()?;
serializer.close()?;
Ok(OwnedBytes::new(positions_buffer))
@@ -169,7 +169,7 @@ pub mod tests {
let positions_delta: Vec<u32> = (0..2_000_000).collect();
let positions_data = create_positions_data(&positions_delta[..])?;
assert_eq!(positions_data.len(), 5003499);
let mut position_reader = PositionReader::open(positions_data.clone())?;
let mut position_reader = PositionReader::open(positions_data)?;
let mut buf = [0u32; 256];
position_reader.read(128, &mut buf);
for i in 0..256 {

View File

@@ -57,7 +57,7 @@ mod sse2 {
fn test_linear_search_sse2_128_u32() {
let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
for el in 0u32..128u32 {
block[el as usize] = el * 2 + 1 << 18;
block[el as usize] = (el * 2 + 1) << 18;
}
let target = block[64] + 1;
assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
@@ -91,7 +91,7 @@ fn exponential_search(arr: &[u32], target: u32) -> Range<usize> {
#[inline(never)]
fn galloping(block_docs: &[u32], target: u32) -> usize {
let range = exponential_search(&block_docs, target);
let range = exponential_search(block_docs, target);
range.start + linear_search(&block_docs[range], target)
}
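For reference, the galloping search above doubles an exponential probe until it overshoots the target and then finishes inside the last window (tantivy does the final step with the linear/SSE2 search shown earlier). A minimal standalone sketch of the same idea, using a binary search for the final step rather than tantivy's internals:

```rust
/// Returns the index of the first element >= `target` in a sorted slice,
/// galloping to locate the window and finishing with a binary search.
fn galloping_lower_bound(sorted: &[u32], target: u32) -> usize {
    if sorted.is_empty() || sorted[0] >= target {
        return 0;
    }
    // Double the probe offset until we pass the target or run off the end.
    let mut bound = 1;
    while bound < sorted.len() && sorted[bound] < target {
        bound *= 2;
    }
    let lo = bound / 2;
    let hi = sorted.len().min(bound + 1);
    // Binary search for the first element >= target within [lo, hi).
    lo + sorted[lo..hi].partition_point(|&x| x < target)
}

fn main() {
    let docs = [1u32, 3, 7, 7, 10, 25, 40];
    assert_eq!(galloping_lower_bound(&docs, 7), 2);
    assert_eq!(galloping_lower_bound(&docs, 8), 4);
    assert_eq!(galloping_lower_bound(&docs, 100), docs.len());
}
```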

View File

@@ -13,11 +13,7 @@ use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
if let Some(first) = it.next() {
Some(it.fold(first, Score::max))
} else {
None
}
it.next().map(|first| it.fold(first, Score::max))
}
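The refactored `max_score` keeps the same behaviour: fold the remaining scores onto the first one, returning `None` for an empty iterator. The fold is needed because `f32` does not implement `Ord`, so `Iterator::max` is unavailable. A self-contained illustration of the pattern:

```rust
fn max_score<I: Iterator<Item = f32>>(mut it: I) -> Option<f32> {
    // `Iterator::max` requires `Ord`, which `f32` does not implement,
    // hence the explicit fold over `f32::max`.
    it.next().map(|first| it.fold(first, f32::max))
}

fn main() {
    assert_eq!(max_score(vec![0.3_f32, 1.2, 0.7].into_iter()), Some(1.2));
    assert_eq!(max_score(std::iter::empty::<f32>()), None);
}
```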
/// `BlockSegmentPostings` is a cursor iterating over blocks
@@ -482,11 +478,11 @@ mod tests {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(&docs[..]);
for i in vec![0, 424, 10000] {
block_postings.seek(i);
for i in &[0, 424, 10000] {
block_postings.seek(*i);
let docs = block_postings.docs();
assert!(docs[0] <= i);
assert!(docs.last().cloned().unwrap_or(0u32) >= i);
assert!(docs[0] <= *i);
assert!(docs.last().cloned().unwrap_or(0u32) >= *i);
}
block_postings.seek(100_000);
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);

View File

@@ -85,13 +85,13 @@ impl BlockDecoder {
) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
.decompress_sorted(offset, compressed_data, &mut self.output.0, num_bits)
}
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress(&compressed_data, &mut self.output.0, num_bits)
.decompress(compressed_data, &mut self.output.0, num_bits)
}
#[inline]
@@ -303,7 +303,7 @@ pub mod tests {
assert!(encoded_data.len() <= expected_length);
let mut decoder = BlockDecoder::default();
let consumed_num_bytes =
decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len(), PADDING_VALUE);
decoder.uncompress_vint_sorted(encoded_data, *offset, input.len(), PADDING_VALUE);
assert_eq!(consumed_num_bytes, encoded_data.len());
assert_eq!(input, decoder.output_array());
for i in input.len()..COMPRESSION_BLOCK_SIZE {

View File

@@ -54,7 +54,7 @@ pub mod tests {
use crate::DocId;
use crate::HasLen;
use crate::Score;
use std::{iter, mem};
use std::mem;
#[test]
pub fn test_position_write() -> crate::Result<()> {
@@ -153,8 +153,8 @@ pub mod tests {
#[test]
pub fn test_drop_token_that_are_too_long() -> crate::Result<()> {
let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1);
exceeding_token_text.push_str(" hello");
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default().set_indexing_options(
@@ -164,7 +164,7 @@ pub mod tests {
);
let text_field = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
@@ -229,7 +229,7 @@ pub mod tests {
segment_writer.add_document(op, &schema).unwrap();
}
for i in 2..1000 {
let mut text: String = iter::repeat("e ").take(i).collect();
let mut text: String = "e ".repeat(i);
text.push_str(" a");
let op = AddOperation {
opstamp: 2u64,
@@ -409,7 +409,7 @@ pub mod tests {
.unwrap();
for i in 0..num_docs / 2 - 1 {
assert!(segment_postings.seek(i * 2 + 1) > (i * 1) * 2);
assert!(segment_postings.seek(i * 2 + 1) > i * 2);
assert_eq!(segment_postings.doc(), (i + 1) * 2);
}
}
@@ -431,13 +431,10 @@ pub mod tests {
.read_postings(&term_2, IndexRecordOption::Basic)?
.unwrap();
assert_eq!(segment_postings.seek(i), i);
assert_eq!(segment_postings.doc(), i);
if i % 2 == 0 {
assert_eq!(segment_postings.seek(i), i);
assert_eq!(segment_postings.doc(), i);
assert!(segment_reader.is_deleted(i));
} else {
assert_eq!(segment_postings.seek(i), i);
assert_eq!(segment_postings.doc(), i);
}
}

View File

@@ -231,13 +231,7 @@ pub trait PostingsWriter {
// We skip all tokens with a len greater than u16.
if token.text.len() <= MAX_TOKEN_LEN {
term_buffer.set_text(token.text.as_str());
self.subscribe(
term_index,
doc_id,
token.position as u32,
&term_buffer,
heap,
);
self.subscribe(term_index, doc_id, token.position as u32, term_buffer, heap);
} else {
warn!(
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \

View File

@@ -282,7 +282,7 @@ impl Recorder for TfAndPositionRecorder {
doc_id_and_positions
.push((doc_id_map.get_new_doc_id(doc), buffer_positions.to_vec()));
} else {
serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions);
serializer.write_doc(doc, buffer_positions.len() as u32, buffer_positions);
}
}
if doc_id_map.is_some() {

View File

@@ -107,7 +107,7 @@ impl SegmentPostings {
let fieldnorm_reader = fieldnorms.map(FieldNormReader::for_test);
let average_field_norm = fieldnorms
.map(|fieldnorms| {
if fieldnorms.len() == 0 {
if fieldnorms.is_empty() {
return 0.0;
}
let total_num_tokens: u64 = fieldnorms
@@ -184,7 +184,7 @@ impl DocSet for SegmentPostings {
// At this point we are on the block that might contain our document.
let output = self.block_cursor.docs_aligned();
self.cur = self.block_searcher.search_in_block(&output, target);
self.cur = self.block_searcher.search_in_block(output, target);
// The last block is not full and is padded with the value TERMINATED,
// so that we are guaranteed to have at least one doc in the block (a real one or the padding).

View File

@@ -356,7 +356,7 @@ impl<W: Write> PostingsSerializer<W> {
// encode the doc ids
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
.compress_block_sorted(self.block.doc_ids(), self.last_doc_id_encoded);
self.last_doc_id_encoded = self.block.last_doc();
self.skip_write
.write_doc(self.last_doc_id_encoded, num_bits);
@@ -366,7 +366,7 @@ impl<W: Write> PostingsSerializer<W> {
if self.mode.has_freq() {
let (num_bits, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_unsorted(&self.block.term_freqs());
.compress_block_unsorted(self.block.term_freqs());
self.postings_write.extend(block_encoded);
self.skip_write.write_term_freq(num_bits);
if self.mode.has_positions() {
@@ -426,7 +426,7 @@ impl<W: Write> PostingsSerializer<W> {
{
let block_encoded = self
.block_encoder
.compress_vint_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
.compress_vint_sorted(self.block.doc_ids(), self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
}
// ... Idem for term frequencies

View File

@@ -280,11 +280,8 @@ mod tests {
if v.len() >= 10 {
break;
}
match len_to_capacity(i) {
CapacityResult::NeedAlloc(cap) => {
v.push((i, cap));
}
_ => {}
if let CapacityResult::NeedAlloc(cap) = len_to_capacity(i) {
v.push((i, cap));
}
}
assert_eq!(

View File

@@ -151,7 +151,7 @@ impl TermHashMap {
pub fn iter(&self) -> Iter<'_> {
Iter {
inner: self.occupied.iter(),
hashmap: &self,
hashmap: self,
}
}
@@ -261,8 +261,8 @@ mod tests {
}
let mut vanilla_hash_map = HashMap::new();
let mut iter_values = hash_map.iter();
while let Some((key, addr, _)) = iter_values.next() {
let iter_values = hash_map.iter();
for (key, addr, _) in iter_values {
let val: u32 = hash_map.heap.read(addr);
vanilla_hash_map.insert(key.to_owned(), val);
}

View File

@@ -251,7 +251,7 @@ mod tests {
impl PartialEq for Float {
fn eq(&self, other: &Self) -> bool {
self.cmp(&other) == Ordering::Equal
self.cmp(other) == Ordering::Equal
}
}
@@ -289,7 +289,7 @@ mod tests {
if !nearly_equals(score, limit) {
checkpoints.push((doc, score));
}
return limit;
limit
});
checkpoints
}
@@ -368,10 +368,10 @@ mod tests {
.iter()
.map(|posting_list| {
posting_list
.into_iter()
.iter()
.cloned()
.flat_map(|(doc, term_freq)| {
(0 as u32..REPEAT as u32).map(move |offset| {
(0_u32..REPEAT as u32).map(move |offset| {
(
doc * (REPEAT as u32) + offset,
if offset == 0 { term_freq } else { 1 },

View File

@@ -329,7 +329,7 @@ impl MoreLikeThis {
continue;
}
let doc_freq = searcher.doc_freq(&term)?;
let doc_freq = searcher.doc_freq(term)?;
// ignore terms with less than min_doc_frequency
if self

View File

@@ -260,7 +260,7 @@ mod tests {
.with_document(DocAddress::new(0, 0));
let top_docs = searcher.search(&query, &TopDocs::with_limit(5)).unwrap();
let mut doc_ids: Vec<_> = top_docs.iter().map(|item| item.1.doc_id).collect();
doc_ids.sort();
doc_ids.sort_unstable();
assert_eq!(doc_ids.len(), 3);
assert_eq!(doc_ids, vec![0, 1, 3]);
@@ -276,7 +276,7 @@ mod tests {
.with_document(DocAddress::new(0, 4));
let top_docs = searcher.search(&query, &TopDocs::with_limit(5)).unwrap();
let mut doc_ids: Vec<_> = top_docs.iter().map(|item| item.1.doc_id).collect();
doc_ids.sort();
doc_ids.sort_unstable();
assert_eq!(doc_ids.len(), 2);
assert_eq!(doc_ids, vec![3, 4]);

View File

@@ -179,6 +179,87 @@ pub mod tests {
assert_nearly_equals!(scores[1], 0.46844664);
}
#[ignore]
#[test]
pub fn test_phrase_score_with_slop() {
let index = create_index(&["a c b", "a b c a b"]);
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader().unwrap().searcher();
let test_query = |texts: Vec<&str>| {
let terms: Vec<Term> = texts
.iter()
.map(|text| Term::from_field_text(text_field, text))
.collect();
let mut phrase_query = PhraseQuery::new(terms);
phrase_query.slop(1);
searcher
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
.expect("search should succeed")
.scores()
.to_vec()
};
let scores = test_query(vec!["a", "b"]);
assert_nearly_equals!(scores[0], 0.40618482);
assert_nearly_equals!(scores[1], 0.46844664);
}
#[test]
pub fn test_phrase_score_with_slop_size() {
let index = create_index(&["a b e c", "a e e e c", "a e e e e c"]);
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader().unwrap().searcher();
let test_query = |texts: Vec<&str>| {
let terms: Vec<Term> = texts
.iter()
.map(|text| Term::from_field_text(text_field, text))
.collect();
let mut phrase_query = PhraseQuery::new(terms);
phrase_query.slop(3);
searcher
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
.expect("search should succeed")
.scores()
.to_vec()
};
let scores = test_query(vec!["a", "c"]);
assert_nearly_equals!(scores[0], 0.29086056);
assert_nearly_equals!(scores[1], 0.26706287);
}
#[test]
pub fn test_phrase_score_with_slop_ordering() {
let index = create_index(&[
"a e b e c",
"a e e e e e b e e e e c",
"a c b",
"a c e b e",
"a e c b",
"a e b c",
]);
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader().unwrap().searcher();
let test_query = |texts: Vec<&str>| {
let terms: Vec<Term> = texts
.iter()
.map(|text| Term::from_field_text(text_field, text))
.collect();
let mut phrase_query = PhraseQuery::new(terms);
phrase_query.slop(3);
searcher
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
.expect("search should succeed")
.scores()
.to_vec()
};
let scores = test_query(vec!["a", "b", "c"]);
// The first and last documents match.
assert_nearly_equals!(scores[0], 0.23091172);
assert_nearly_equals!(scores[1], 0.25024384);
}
#[test] // motivated by #234
pub fn test_phrase_query_docfreq_order() {
let mut schema_builder = Schema::builder();

View File

@@ -26,6 +26,7 @@ use crate::schema::{Field, Term};
pub struct PhraseQuery {
field: Field,
phrase_terms: Vec<(usize, Term)>,
slop: u32,
}
impl PhraseQuery {
@@ -56,9 +57,15 @@ impl PhraseQuery {
PhraseQuery {
field,
phrase_terms: terms,
slop: 0,
}
}
/// Sets the slop allowed for the phrase.
pub fn slop(&mut self, value: u32) {
self.slop = value;
}
/// The `Field` this `PhraseQuery` is targeting.
pub fn field(&self) -> Field {
self.field
@@ -97,11 +104,11 @@ impl PhraseQuery {
}
let terms = self.phrase_terms();
let bm25_weight = Bm25Weight::for_terms(searcher, &terms)?;
Ok(PhraseWeight::new(
self.phrase_terms.clone(),
bm25_weight,
scoring_enabled,
))
let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, scoring_enabled);
if self.slop > 0 {
weight.slop(self.slop);
}
Ok(weight)
}
}
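A hedged usage sketch of the new `slop` setter, assuming an existing index with an indexed text field (the `index` and `text_field` handles are placeholders, not part of this diff):

```rust
use tantivy::collector::TopDocs;
use tantivy::query::PhraseQuery;
use tantivy::schema::{Field, Term};
use tantivy::Index;

fn search_with_slop(index: &Index, text_field: Field) -> tantivy::Result<()> {
    let terms = vec![
        Term::from_field_text(text_field, "a"),
        Term::from_field_text(text_field, "b"),
    ];
    let mut phrase_query = PhraseQuery::new(terms);
    // Allow one extra position between "a" and "b", so "a c b" also matches.
    phrase_query.slop(1);
    let searcher = index.reader()?.searcher();
    let top_docs = searcher.search(&phrase_query, &TopDocs::with_limit(10))?;
    println!("{} documents matched", top_docs.len());
    Ok(())
}
```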

View File

@@ -51,6 +51,7 @@ pub struct PhraseScorer<TPostings: Postings> {
fieldnorm_reader: FieldNormReader,
similarity_weight: Bm25Weight,
scoring_enabled: bool,
slop: u32,
}
/// Returns true iff the two sorted arrays contain a common element
@@ -130,12 +131,82 @@ fn intersection(left: &mut [u32], right: &[u32]) -> usize {
count
}
/// Intersects two sorted arrays `left` and `right` and outputs the
/// resulting array in `left`.
///
/// The condition for a match is that the value stored in `left` is less than or equal to
/// the value in `right`, and that the distance from the corresponding value in `left_begin`
/// to the value in `right` is within `max_distance_to_begin`.
///
/// Returns the length of the intersection.
fn intersection_with_distance(
left: &mut [u32],
left_begin: &mut [u32],
right: &mut [u32],
max_distance_to_begin: u32,
) -> usize {
// TODO: Improve variable names?
let mut left_i = 0;
let mut right_i = 0;
let mut count = 0;
let left_len = left.len();
let right_len = right.len();
// Whether the most recently recorded match is still temporary, i.e. a closer match may replace it.
let mut is_temporary = false;
while left_i < left_len && right_i < right_len {
let left_val = left[left_i];
let right_val = right[right_i];
match left_val.cmp(&right_val) {
Ordering::Less => {
if right_val - left_begin[left_i] <= max_distance_to_begin {
if is_temporary {
// If the value was temporary we have found a closer match.
count -= 1;
};
left[count] = left_val;
left_begin[count] = left_begin[left_i];
right[count] = right_val;
count += 1;
left_i += 1;
// Still possible to find a closer match.
is_temporary = true;
} else {
left_i += 1;
}
}
Ordering::Equal => {
if is_temporary {
// If the previous match was only temporary, we have found an exact match and replace it.
count -= 1;
is_temporary = false;
}
left[count] = left_val;
left_begin[count] = left_begin[left_i];
right[count] = right_val;
count += 1;
left_i += 1;
right_i += 1;
}
Ordering::Greater => {
right_i += 1;
// Given the constraint that left cannot be greater than right, we know that the last
// recorded value is final.
is_temporary = false;
}
}
}
// Copy the matched values from `right` into `left`, so that the caller can
// intersect them against the positions of the next term.
for i in 0..count {
left[i] = right[i];
}
count
}
impl<TPostings: Postings> PhraseScorer<TPostings> {
pub fn new(
term_postings: Vec<(usize, TPostings)>,
similarity_weight: Bm25Weight,
fieldnorm_reader: FieldNormReader,
scoring_enabled: bool,
slop: u32,
) -> PhraseScorer<TPostings> {
let max_offset = term_postings
.iter()
@@ -158,6 +229,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
similarity_weight,
fieldnorm_reader,
scoring_enabled,
slop,
};
if scorer.doc() != TERMINATED && !scorer.phrase_match() {
scorer.advance();
@@ -170,6 +242,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
fn phrase_match(&mut self) -> bool {
// Need to add support for slop in phrase_count and phrase_exists.
if self.scoring_enabled {
let count = self.compute_phrase_count();
self.phrase_count = count;
@@ -180,29 +253,25 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
fn phrase_exists(&mut self) -> bool {
self.intersection_docset
.docset_mut_specialized(0)
.positions(&mut self.left);
let mut intersection_len = self.left.len();
for i in 1..self.num_terms - 1 {
{
self.intersection_docset
.docset_mut_specialized(i)
.positions(&mut self.right);
}
intersection_len = intersection(&mut self.left[..intersection_len], &self.right[..]);
if intersection_len == 0 {
return false;
}
}
self.intersection_docset
.docset_mut_specialized(self.num_terms - 1)
.positions(&mut self.right);
let intersection_len = if self.has_slop() {
self.compute_match_with_slop()
} else {
self.compute_match()
};
intersection_exists(&self.left[..intersection_len], &self.right[..])
}
fn compute_phrase_count(&mut self) -> u32 {
let intersection_len = if self.has_slop() {
self.compute_match_with_slop()
} else {
self.compute_match()
};
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
}
/// Computes match without slop.
fn compute_match(&mut self) -> usize {
{
self.intersection_docset
.docset_mut_specialized(0)
@@ -217,14 +286,52 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
intersection_len = intersection(&mut self.left[..intersection_len], &self.right[..]);
if intersection_len == 0 {
return 0u32;
return 0;
}
}
self.intersection_docset
.docset_mut_specialized(self.num_terms - 1)
.positions(&mut self.right);
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
intersection_len
}
/// Computes match with slop.
fn compute_match_with_slop(&mut self) -> usize {
{
self.intersection_docset
.docset_mut_specialized(0)
.positions(&mut self.left);
}
let mut intersection_len = self.left.len();
// The values in `left` are advanced to the matching positions in `right` on each round,
// while `left_begin` keeps the original start positions, to achieve ordered slop.
let mut left_begin_vec = self.left.clone();
let left_begin = &mut left_begin_vec[..];
for i in 1..self.num_terms {
{
self.intersection_docset
.docset_mut_specialized(i)
.positions(&mut self.right);
}
intersection_len = intersection_with_distance(
&mut self.left[..intersection_len],
&mut left_begin[..intersection_len],
&mut self.right[..],
self.slop,
);
// Update the left to be equal to the right. Merge the initial left.
if intersection_len == 0 {
return 0;
}
}
self.intersection_docset
.docset_mut_specialized(self.num_terms - 1)
.positions(&mut self.right);
intersection_len
}
fn has_slop(&self) -> bool {
self.slop > 0
}
}
@@ -267,18 +374,28 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
#[cfg(test)]
mod tests {
use super::{intersection, intersection_count};
use super::{intersection, intersection_count, intersection_with_distance};
fn test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) {
test_intersection_aux(left, right, expected);
test_intersection_aux(right, left, expected);
test_intersection_aux(left, right, expected, 0);
test_intersection_aux(right, left, expected, 0);
}
fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32]) {
fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32], slop: u32) {
let mut left_vec = Vec::from(left);
let left_mut = &mut left_vec[..];
assert_eq!(intersection_count(left_mut, right), expected.len());
let count = intersection(left_mut, right);
if slop == 0 {
let left_mut = &mut left_vec[..];
assert_eq!(intersection_count(left_mut, right), expected.len());
let count = intersection(left_mut, right);
assert_eq!(&left_mut[..count], expected);
return;
}
let mut right_vec = Vec::from(right);
let right_mut = &mut right_vec[..];
let mut left_begin_vec = Vec::from(left);
let left_begin_mut = &mut left_begin_vec[..];
let count = intersection_with_distance(left_mut, left_begin_mut, right_mut, slop);
assert_eq!(&left_mut[..count], expected);
}
@@ -290,6 +407,51 @@ mod tests {
test_intersection_sym(&[5, 7], &[1, 5, 10, 12], &[5]);
test_intersection_sym(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12]);
}
#[test]
fn test_slop() {
// The slop is not symmetric. It does not allow the phrase terms to be out of order.
test_intersection_aux(&[1], &[2], &[2], 1);
test_intersection_aux(&[1], &[3], &[], 1);
test_intersection_aux(&[1], &[3], &[3], 2);
test_intersection_aux(&[], &[2], &[], 100000);
test_intersection_aux(&[5, 7, 11], &[1, 5, 10, 12], &[5, 12], 1);
test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 1);
test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10);
}
fn test_merge(
left: &[u32],
right: &[u32],
left_begin: &[u32],
expected_left: &[u32],
expected_left_begin: &[u32],
slop: u32,
) {
let mut left_vec = Vec::from(left);
let left_mut = &mut left_vec[..];
let mut right_vec = Vec::from(right);
let right_mut = &mut right_vec[..];
let mut left_begin_vec = Vec::from(left_begin);
let left_begin_mut = &mut left_begin_vec[..];
let count = intersection_with_distance(left_mut, left_begin_mut, right_mut, slop);
assert_eq!(&left_mut[..count], expected_left);
assert_eq!(&left_begin_mut[..count], expected_left_begin);
}
#[test]
fn test_merge_slop() {
test_merge(&[1, 2], &[1], &[0, 1], &[1], &[0], 1);
test_merge(&[3], &[4], &[2], &[4], &[2], 2);
test_merge(&[3], &[4], &[2], &[4], &[2], 2);
test_merge(
&[1, 5, 6, 9, 10, 12],
&[6, 8, 9, 12],
&[0, 1, 2, 3, 4, 5],
&[6, 9, 12],
&[2, 3, 5],
10,
);
}
}
#[cfg(all(test, feature = "unstable"))]

View File

@@ -16,6 +16,7 @@ pub struct PhraseWeight {
phrase_terms: Vec<(usize, Term)>,
similarity_weight: Bm25Weight,
scoring_enabled: bool,
slop: u32,
}
impl PhraseWeight {
@@ -25,10 +26,12 @@ impl PhraseWeight {
similarity_weight: Bm25Weight,
scoring_enabled: bool,
) -> PhraseWeight {
let slop = 0;
PhraseWeight {
phrase_terms,
similarity_weight,
scoring_enabled,
slop,
}
}
@@ -48,43 +51,41 @@ impl PhraseWeight {
) -> crate::Result<Option<PhraseScorer<SegmentPostings>>> {
let similarity_weight = self.similarity_weight.boost_by(boost);
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
let mut term_postings_list = Vec::new();
if reader.has_deletes() {
let mut term_postings_list = Vec::new();
for &(offset, ref term) in &self.phrase_terms {
if let Some(postings) = reader
.inverted_index(term.field())?
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)?
.read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
{
term_postings_list.push((offset, postings));
} else {
return Ok(None);
}
}
Ok(Some(PhraseScorer::new(
term_postings_list,
similarity_weight,
fieldnorm_reader,
self.scoring_enabled,
)))
} else {
let mut term_postings_list = Vec::new();
for &(offset, ref term) in &self.phrase_terms {
if let Some(postings) = reader
.inverted_index(term.field())?
.read_postings_no_deletes(&term, IndexRecordOption::WithFreqsAndPositions)?
.read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)?
{
term_postings_list.push((offset, postings));
} else {
return Ok(None);
}
}
Ok(Some(PhraseScorer::new(
term_postings_list,
similarity_weight,
fieldnorm_reader,
self.scoring_enabled,
)))
}
Ok(Some(PhraseScorer::new(
term_postings_list,
similarity_weight,
fieldnorm_reader,
self.scoring_enabled,
self.slop,
)))
}
pub fn slop(&mut self, slop: u32) {
self.slop = slop;
}
}

View File

@@ -722,7 +722,7 @@ mod test {
let is_not_indexed_err = |query: &str| {
let result: Result<Box<dyn Query>, QueryParserError> = query_parser.parse_query(query);
if let Err(QueryParserError::FieldNotIndexed(field_name)) = result {
Some(field_name.clone())
Some(field_name)
} else {
None
}

View File

@@ -91,8 +91,8 @@ impl RangeQuery {
RangeQuery {
field,
value_type,
left_bound: map_bound(&left_bound, &verify_and_unwrap_term),
right_bound: map_bound(&right_bound, &verify_and_unwrap_term),
left_bound: map_bound(left_bound, &verify_and_unwrap_term),
right_bound: map_bound(right_bound, &verify_and_unwrap_term),
}
}
@@ -493,7 +493,7 @@ mod tests {
let year = schema.get_field("year").unwrap();
index_writer.add_document(doc!(
title => "hemoglobin blood",
year => 1990 as i64
year => 1990_i64
));
index_writer.commit()?;
let reader = index.reader()?;

View File

@@ -58,7 +58,7 @@ pub struct RegexQuery {
impl RegexQuery {
/// Creates a new RegexQuery from a given pattern
pub fn from_pattern(regex_pattern: &str, field: Field) -> crate::Result<Self> {
let regex = Regex::new(&regex_pattern)
let regex = Regex::new(regex_pattern)
.map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_string()))?;
Ok(RegexQuery::from_regex(regex, field))
}
@@ -169,9 +169,9 @@ mod test {
verify_regex_query(matching_one, matching_zero, reader.clone());
let matching_one = RegexQuery::from_regex(r1.clone(), field);
let matching_zero = RegexQuery::from_regex(r2.clone(), field);
let matching_one = RegexQuery::from_regex(r1, field);
let matching_zero = RegexQuery::from_regex(r2, field);
verify_regex_query(matching_one, matching_zero, reader.clone());
verify_regex_query(matching_one, matching_zero, reader);
}
}

View File

@@ -194,8 +194,8 @@ mod tests {
.collect();
let mut fieldnorms: Vec<u32> = vec![];
for i in 0..term_doc_freq {
let (tf, num_extra_terms) = term_freqs_fieldnorms[i];
for item in term_freqs_fieldnorms.iter().take(term_doc_freq) {
let (tf, num_extra_terms) = item;
fieldnorms.push(tf + num_extra_terms);
}
let average_fieldnorm = fieldnorms
@@ -253,7 +253,7 @@ mod tests {
}
fn test_block_wand_aux(term_query: &TermQuery, searcher: &Searcher) -> crate::Result<()> {
let term_weight = term_query.specialized_weight(&searcher, true)?;
let term_weight = term_query.specialized_weight(searcher, true)?;
for reader in searcher.segment_readers() {
let mut block_max_scores = vec![];
let mut block_max_scores_b = vec![];
@@ -309,7 +309,7 @@ mod tests {
}
writer.commit()?;
let term_query = TermQuery::new(
Term::from_field_text(text_field, &"bbbb"),
Term::from_field_text(text_field, "bbbb"),
IndexRecordOption::WithFreqs,
);
let segment_ids: Vec<SegmentId>;

View File

@@ -319,7 +319,7 @@ mod tests {
let res: Box<dyn DocSet> = Box::new(Union::<_, DoNothingCombiner>::from(
docs_list
.iter()
.map(|docs| docs.clone())
.cloned()
.map(VecDocSet::from)
.map(|docset| ConstScorer::new(docset, 1.0))
.collect::<Vec<_>>(),

View File

@@ -253,7 +253,7 @@ mod tests {
threads.push(thread::spawn(move || {
assert_eq!(start_1_recv.recv(), Ok("start"));
let _leased_searcher = &pool1.acquire();
let _leased_searcher = pool1.acquire();
assert!(event_send1.send("1 acquired").is_ok());
assert_eq!(start_1_recv.recv(), Ok("stop"));
assert!(event_send1.send("1 stopped").is_ok());
@@ -262,7 +262,7 @@ mod tests {
threads.push(thread::spawn(move || {
assert_eq!(start_2_recv.recv(), Ok("start"));
let _leased_searcher = &pool2.acquire();
let _leased_searcher = pool2.acquire();
assert!(event_send2.send("2 acquired").is_ok());
assert_eq!(start_2_recv.recv(), Ok("stop"));
mem::drop(_leased_searcher);
@@ -271,7 +271,7 @@ mod tests {
threads.push(thread::spawn(move || {
assert_eq!(start_3_recv.recv(), Ok("start"));
let _leased_searcher = &pool3.acquire();
let _leased_searcher = pool3.acquire();
assert!(event_send3.send("3 acquired").is_ok());
assert_eq!(start_3_recv.recv(), Ok("stop"));
mem::drop(_leased_searcher);

View File

@@ -39,7 +39,7 @@ pub enum FacetParseError {
/// belonging to a facet also belongs to the ancestor of
/// its facet. In the example above, `/electronics/tv_and_video/`
/// and `/electronics`.
#[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
#[derive(Clone, Default, Eq, Hash, PartialEq, Ord, PartialOrd)]
pub struct Facet(String);
impl Facet {

View File

@@ -317,7 +317,7 @@ mod tests {
assert_eq!("title", field_value.name);
match field_value.field_type {
FieldType::Str(_) => assert!(true),
FieldType::Str(_) => {}
_ => panic!("expected FieldType::Str"),
}
}

View File

@@ -151,20 +151,13 @@ pub use self::flags::{FAST, INDEXED, STORED};
pub use self::int_options::Cardinality;
pub use self::int_options::IntOptions;
use once_cell::sync::Lazy;
use regex::Regex;
/// Regular expression representing the restriction on a valid field names.
pub const FIELD_NAME_PATTERN: &str = r#"^[_a-zA-Z][_\-a-zA-Z0-9]*$"#;
/// Validator for a potential `field_name`.
/// Returns true iff the name can be use for a field name.
/// Returns true if the name can be used as a field name.
///
/// A field name must start by a letter `[a-zA-Z]`.
/// The other characters can be any alphanumic character `[a-ZA-Z0-9]` or `_`.
/// A field name may contain any character. It must have at least one character
/// and must not start with `-`.
pub fn is_valid_field_name(field_name: &str) -> bool {
static FIELD_NAME_PTN: Lazy<Regex> = Lazy::new(|| Regex::new(FIELD_NAME_PATTERN).unwrap());
FIELD_NAME_PTN.is_match(field_name)
field_name.len() > 0 && !field_name.starts_with('-')
}
#[cfg(test)]
@@ -174,16 +167,8 @@ mod tests {
#[test]
fn test_is_valid_name() {
assert!(is_valid_field_name("text"));
assert!(is_valid_field_name("text0"));
assert!(!is_valid_field_name("0text"));
assert!(is_valid_field_name("field-name"));
assert!(is_valid_field_name("field_name"));
assert!(!is_valid_field_name("field!name"));
assert!(is_valid_field_name("シャボン玉"));
assert!(!is_valid_field_name("-fieldname"));
assert!(is_valid_field_name("_fieldname"));
assert!(!is_valid_field_name(""));
assert!(!is_valid_field_name("シャボン玉"));
assert!(is_valid_field_name("my_text_field"));
}
}

View File

@@ -779,7 +779,7 @@ mod tests {
}
]"#;
let tmp_schema: Schema =
serde_json::from_str(&schema_content).expect("error while reading json");
serde_json::from_str(schema_content).expect("error while reading json");
for (_field, field_entry) in tmp_schema.fields() {
schema_builder.add_field(field_entry.clone());
}

View File

@@ -137,7 +137,7 @@ fn search_fragments<'a>(
};
fragment = FragmentCandidate::new(next.offset_from);
}
fragment.try_add_token(next, &terms);
fragment.try_add_token(next, terms);
}
if fragment.score > 0.0 {
fragments.push(fragment)
@@ -286,8 +286,8 @@ impl SnippetGenerator {
/// Generates a snippet for the given text.
pub fn snippet(&self, text: &str) -> Snippet {
let fragment_candidates =
search_fragments(&self.tokenizer, &text, &self.terms_text, self.max_num_chars);
select_best_fragment_combination(&fragment_candidates[..], &text)
search_fragments(&self.tokenizer, text, &self.terms_text, self.max_num_chars);
select_best_fragment_combination(&fragment_candidates[..], text)
}
}
@@ -301,9 +301,8 @@ mod tests {
use crate::SnippetGenerator;
use maplit::btreemap;
use std::collections::BTreeMap;
use std::iter::Iterator;
const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by
const TEST_TEXT: &str = r#"Rust is a systems programming language sponsored by
Mozilla which describes it as a "safe, concurrent, practical language", supporting functional and
imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],
but its designers intend it to provide better memory safety while still maintaining
@@ -330,7 +329,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.score, 1.9);
assert_eq!(first.stop_offset, 89);
}
let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT);
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
assert_eq!(
snippet.fragments,
"Rust is a systems programming language sponsored by\n\
@@ -356,7 +355,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.score, 1.0);
assert_eq!(first.stop_offset, 17);
}
let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT);
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems")
}
{
@@ -371,7 +370,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.score, 0.9);
assert_eq!(first.stop_offset, 17);
}
let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT);
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
assert_eq!(snippet.to_html(), "programming <b>language</b>")
}
}
@@ -383,17 +382,17 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new();
terms.insert(String::from("c"), 1.0);
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3);
assert_eq!(fragments.len(), 1);
{
let first = fragments.iter().nth(0).unwrap();
let first = &fragments[0];
assert_eq!(first.score, 1.0);
assert_eq!(first.start_offset, 4);
assert_eq!(first.stop_offset, 7);
}
let snippet = select_best_fragment_combination(&fragments[..], &text);
let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragments, "c d");
assert_eq!(snippet.to_html(), "<b>c</b> d");
}
@@ -405,17 +404,17 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new();
terms.insert(String::from("f"), 1.0);
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3);
assert_eq!(fragments.len(), 2);
{
let first = fragments.iter().nth(0).unwrap();
let first = &fragments[0];
assert_eq!(first.score, 1.0);
assert_eq!(first.stop_offset, 11);
assert_eq!(first.start_offset, 8);
}
let snippet = select_best_fragment_combination(&fragments[..], &text);
let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragments, "e f");
assert_eq!(snippet.to_html(), "e <b>f</b>");
}
@@ -428,17 +427,17 @@ Survey in 2016, 2017, and 2018."#;
terms.insert(String::from("f"), 1.0);
terms.insert(String::from("a"), 0.9);
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 7);
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 7);
assert_eq!(fragments.len(), 2);
{
let first = fragments.iter().nth(0).unwrap();
let first = &fragments[0];
assert_eq!(first.score, 0.9);
assert_eq!(first.stop_offset, 7);
assert_eq!(first.start_offset, 0);
}
let snippet = select_best_fragment_combination(&fragments[..], &text);
let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragments, "e f g");
assert_eq!(snippet.to_html(), "e <b>f</b> g");
}
@@ -450,11 +449,11 @@ Survey in 2016, 2017, and 2018."#;
let mut terms = BTreeMap::new();
terms.insert(String::from("z"), 1.0);
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3);
assert_eq!(fragments.len(), 0);
let snippet = select_best_fragment_combination(&fragments[..], &text);
let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragments, "");
assert_eq!(snippet.to_html(), "");
}
@@ -464,10 +463,10 @@ Survey in 2016, 2017, and 2018."#;
let text = "a b c d";
let terms = BTreeMap::new();
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3);
assert_eq!(fragments.len(), 0);
let snippet = select_best_fragment_combination(&fragments[..], &text);
let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragments, "");
assert_eq!(snippet.to_html(), "");
}

View File

@@ -296,7 +296,7 @@ mod test {
#[test]
fn test_empty() {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let searcher_space_usage = searcher.space_usage().unwrap();
@@ -325,7 +325,7 @@ mod test {
let mut schema_builder = Schema::builder();
let name = schema_builder.add_u64_field("name", FAST | INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests().unwrap();
@@ -362,7 +362,7 @@ mod test {
let mut schema_builder = Schema::builder();
let name = schema_builder.add_text_field("name", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests().unwrap();
@@ -401,7 +401,7 @@ mod test {
let mut schema_builder = Schema::builder();
let name = schema_builder.add_text_field("name", STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests().unwrap();
@@ -439,7 +439,7 @@ mod test {
let mut schema_builder = Schema::builder();
let name = schema_builder.add_u64_field("name", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;

View File

@@ -40,7 +40,7 @@ impl fmt::Debug for Checkpoint {
#[cfg(test)]
mod tests {
use std::{io, iter};
use std::io;
use futures::executor::block_on;
use proptest::strategy::{BoxedStrategy, Strategy};
@@ -134,9 +134,7 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let long_text: String = iter::repeat("abcdefghijklmnopqrstuvwxyz")
.take(1_000)
.collect();
let long_text: String = "abcdefghijklmnopqrstuvwxyz".repeat(1_000);
for _ in 0..20 {
index_writer.add_document(doc!(body=>long_text.clone()));
}
@@ -219,8 +217,7 @@ mod tests {
) -> Option<Checkpoint> {
checkpoints
.into_iter()
.filter(|checkpoint| checkpoint.doc_range.end > target)
.next()
.find(|checkpoint| checkpoint.doc_range.end > target)
}
fn test_skip_index_aux(skip_index: SkipIndex, checkpoints: &[Checkpoint]) {

View File

@@ -163,7 +163,7 @@ impl StoreReader {
let mut curr_checkpoint = checkpoint_block_iter.next();
let mut curr_block = curr_checkpoint
.as_ref()
.map(|checkpoint| self.read_block(&checkpoint).map_err(|e| e.kind())); // map error in order to enable cloning
.map(|checkpoint| self.read_block(checkpoint).map_err(|e| e.kind())); // map error in order to enable cloning
let mut block_start_pos = 0;
let mut num_skipped = 0;
let mut reset_block_pos = false;
@@ -177,7 +177,7 @@ impl StoreReader {
curr_checkpoint = checkpoint_block_iter.next();
curr_block = curr_checkpoint
.as_ref()
.map(|checkpoint| self.read_block(&checkpoint).map_err(|e| e.kind()));
.map(|checkpoint| self.read_block(checkpoint).map_err(|e| e.kind()));
reset_block_pos = true;
num_skipped = 0;
}

View File

@@ -64,7 +64,7 @@ impl StoreWriter {
pub fn store_bytes(&mut self, serialized_document: &[u8]) -> io::Result<()> {
let doc_num_bytes = serialized_document.len();
VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?;
self.current_block.write_all(&serialized_document)?;
self.current_block.write_all(serialized_document)?;
self.doc += 1;
if self.current_block.len() > BLOCK_SIZE {
self.write_and_compress_block()?;

View File

@@ -25,7 +25,7 @@ fn test_empty_term_dictionary() {
#[test]
fn test_term_ordinals() -> crate::Result<()> {
const COUNTRIES: [&'static str; 7] = [
const COUNTRIES: [&str; 7] = [
"San Marino",
"Serbia",
"Slovakia",
@@ -74,7 +74,7 @@ fn test_term_dictionary_simple() -> crate::Result<()> {
{
{
let (k, v) = stream.next().unwrap();
assert_eq!(k.as_ref(), "abc".as_bytes());
assert_eq!(k, "abc".as_bytes());
assert_eq!(v.doc_freq, 34u32);
}
assert_eq!(stream.key(), "abc".as_bytes());
@@ -114,7 +114,7 @@ fn test_term_dictionary_stream() -> crate::Result<()> {
let mut i = 0;
while let Some((streamer_k, streamer_v)) = streamer.next() {
let &(ref key, ref v) = &ids[i];
assert_eq!(streamer_k.as_ref(), key.as_bytes());
assert_eq!(streamer_k, key.as_bytes());
assert_eq!(streamer_v, &make_term_info(*v as u64));
i += 1;
}
@@ -182,7 +182,7 @@ fn test_stream_range() -> crate::Result<()> {
for j in 0..3 {
let (streamer_k, streamer_v) = streamer.next().unwrap();
let &(ref key, ref v) = &ids[i + j];
assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key);
assert_eq!(str::from_utf8(streamer_k).unwrap(), key);
assert_eq!(streamer_v.doc_freq, *v);
assert_eq!(streamer_v, &make_term_info(*v as u64));
}
@@ -199,7 +199,7 @@ fn test_stream_range() -> crate::Result<()> {
for j in 0..3 {
let (streamer_k, streamer_v) = streamer.next().unwrap();
let &(ref key, ref v) = &ids[i + j + 1];
assert_eq!(streamer_k.as_ref(), key.as_bytes());
assert_eq!(streamer_k, key.as_bytes());
assert_eq!(streamer_v.doc_freq, *v);
}
}
@@ -230,10 +230,10 @@ fn test_empty_string() -> crate::Result<()> {
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
term_dictionary_builder
.insert(&[], &make_term_info(1 as u64))
.insert(&[], &make_term_info(1_u64))
.unwrap();
term_dictionary_builder
.insert(&[1u8], &make_term_info(2 as u64))
.insert(&[1u8], &make_term_info(2_u64))
.unwrap();
term_dictionary_builder.finish()?
};
@@ -266,7 +266,7 @@ fn test_stream_range_boundaries_forward() -> crate::Result<()> {
let term_dictionary = stream_range_test_dict()?;
let value_list = |mut streamer: TermStreamer<'_>| {
let mut res: Vec<u32> = vec![];
while let Some((_, ref v)) = streamer.next() {
while let Some((_, v)) = streamer.next() {
res.push(v.doc_freq);
}
res
@@ -308,7 +308,7 @@ fn test_stream_range_boundaries_backward() -> crate::Result<()> {
let term_dictionary = stream_range_test_dict()?;
let value_list_backward = |mut streamer: TermStreamer<'_>| {
let mut res: Vec<u32> = vec![];
while let Some((_, ref v)) = streamer.next() {
while let Some((_, v)) = streamer.next() {
res.push(v.doc_freq);
}
res.reverse();
@@ -393,7 +393,7 @@ fn test_automaton_search() -> crate::Result<()> {
use crate::query::DfaWrapper;
use levenshtein_automata::LevenshteinAutomatonBuilder;
const COUNTRIES: [&'static str; 7] = [
const COUNTRIES: [&str; 7] = [
"San Marino",
"Serbia",
"Slovakia",

View File

@@ -4036,7 +4036,7 @@ mod tests {
for (characters, folded) in foldings {
for &c in characters {
assert_eq!(
folding_using_raw_tokenizer_helper(&c),
folding_using_raw_tokenizer_helper(c),
folded,
"testing that character \"{}\" becomes \"{}\"",
c,

View File

@@ -437,8 +437,7 @@ mod tests {
#[test]
fn test_stutterring_iterator() {
let rg: Vec<usize> = (0..10).collect();
let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
let mut it = StutteringIterator::new(0..10, 1, 2);
assert_eq!(it.next(), Some((0, 1)));
assert_eq!(it.next(), Some((0, 2)));
assert_eq!(it.next(), Some((1, 2)));

View File

@@ -1,4 +1,3 @@
use fail;
use std::path::Path;
use tantivy::directory::{Directory, ManagedDirectory, RamDirectory, TerminatingWrite};
use tantivy::doc;