Compare commits

1 commit

Author: Paul Masurel
Commit: 3a8a83da80 ("tracing"), 2023-10-16 19:23:47 +09:00

176 changed files with 3131 additions and 8990 deletions

View File

@@ -3,6 +3,8 @@ name: Coverage
on:
push:
branches: [main]
pull_request:
branches: [main]
# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
@@ -15,11 +17,11 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Install Rust
run: rustup toolchain install nightly-2023-09-10 --profile minimal --component llvm-tools-preview
run: rustup toolchain install nightly --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2
- uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage
run: cargo +nightly-2023-09-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
run: cargo +nightly llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
continue-on-error: true

View File

@@ -39,13 +39,6 @@ jobs:
- name: Check Formatting
run: cargo +nightly fmt --all -- --check
- name: Check Stable Compilation
run: cargo build --all-features
- name: Check Bench Compilation
run: cargo +nightly bench --no-run --profile=dev --all-features
- uses: actions-rs/clippy-check@v1
with:

View File

@@ -1,9 +1,3 @@
Tantivy 0.21.1
================================
#### Bugfixes
- Range queries on fast fields with less values on that field than documents had an invalid end condition, leading to missing results. [#2226](https://github.com/quickwit-oss/tantivy/issues/2226)(@appaquet @PSeitz)
- Increase the minimum memory budget from 3MB to 15MB to avoid single doc segments (API fix). [#2176](https://github.com/quickwit-oss/tantivy/issues/2176)(@PSeitz)
Tantivy 0.21
================================
#### Bugfixes

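For context on the two changelog entries above, here is a minimal sketch of the kind of setup they touch: a u64 fast field queried through the query parser's range syntax, with a writer budget at the new 15 MB floor. The field name and counts are illustrative, not taken from the linked issues.

use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    // Illustrative schema: one u64 field that is both indexed and a fast field.
    let mut schema_builder = Schema::builder();
    let year = schema_builder.add_u64_field("year", INDEXED | FAST);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema);
    // 15_000_000 matches the new minimum memory budget mentioned above.
    let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
    for year_val in 1950u64..1960u64 {
        index_writer.add_document(doc!(year => year_val))?;
    }
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    // Inclusive range over the fast field: 1955..=1959 matches 5 documents.
    let query = QueryParser::for_index(&index, vec![]).parse_query("year:[1955 TO 1959]")?;
    assert_eq!(searcher.search(&query, &Count)?, 5);
    Ok(())
}
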
View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.22.0-dev"
version = "0.21.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -19,37 +19,40 @@ oneshot = "0.1.5"
base64 = "0.21.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
tracing = "0.1"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"
tantivy-fst = "0.5"
memmap2 = { version = "0.9.0", optional = true }
tantivy-fst = "0.4.0"
memmap2 = { version = "0.7.1", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false }
zstd = { version = "0.12", optional = true, default-features = false }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79"
num_cpus = "1.13.1"
fs4 = { version = "0.7.0", optional = true }
fs4 = { version = "0.6.3", optional = true }
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker4x"] }
bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
census = "0.4.0"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true }
murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.12.0"
lru = "0.11.0"
fastdivide = "0.4.0"
itertools = "0.12.0"
itertools = "0.11.0"
measure_time = "0.8.2"
async-trait = "0.1.53"
arc-swap = "1.5.0"
columnar = { version= "0.2", path="./columnar", package ="tantivy-columnar" }
@@ -61,7 +64,6 @@ common = { version= "0.6", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
@@ -73,13 +75,15 @@ matches = "0.1.9"
pretty_assertions = "1.2.1"
proptest = "1.0.0"
test-log = "0.2.10"
env_logger = "0.10.0"
futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"
rand_distr = "0.4.3"
[target.'cfg(not(windows))'.dev-dependencies]
criterion = { version = "0.5", default-features = false }
criterion = "0.5"
pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
[dev-dependencies.fail]
version = "0.5.0"
@@ -112,11 +116,6 @@ unstable = [] # useful for benches.
quickwit = ["sstable", "futures-util"]
# Compares only the hash of a string when indexing data.
# Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.
# Uses 64bit ahash.
compare_hash_only = ["stacker/compare_hash_only"]
[workspace]
members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"]

View File

@@ -1,99 +1,14 @@
use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion, Throughput};
use tantivy::schema::{TantivyDocument, FAST, INDEXED, STORED, STRING, TEXT};
use tantivy::{tokenizer, Index, IndexWriter};
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use pprof::criterion::{Output, PProfProfiler};
use tantivy::schema::{FAST, INDEXED, STORED, STRING, TEXT};
use tantivy::Index;
const HDFS_LOGS: &str = include_str!("hdfs.json");
const GH_LOGS: &str = include_str!("gh.json");
const WIKI: &str = include_str!("wiki.json");
fn benchmark(
b: &mut Bencher,
input: &str,
schema: tantivy::schema::Schema,
commit: bool,
parse_json: bool,
is_dynamic: bool,
) {
if is_dynamic {
benchmark_dynamic_json(b, input, schema, commit, parse_json)
} else {
_benchmark(b, input, schema, commit, parse_json, |schema, doc_json| {
TantivyDocument::parse_json(&schema, doc_json).unwrap()
})
}
}
fn get_index(schema: tantivy::schema::Schema) -> Index {
let mut index = Index::create_in_ram(schema.clone());
let ff_tokenizer_manager = tokenizer::TokenizerManager::default();
ff_tokenizer_manager.register(
"raw",
tokenizer::TextAnalyzer::builder(tokenizer::RawTokenizer::default())
.filter(tokenizer::RemoveLongFilter::limit(255))
.build(),
);
index.set_fast_field_tokenizers(ff_tokenizer_manager.clone());
index
}
fn _benchmark(
b: &mut Bencher,
input: &str,
schema: tantivy::schema::Schema,
commit: bool,
include_json_parsing: bool,
create_doc: impl Fn(&tantivy::schema::Schema, &str) -> TantivyDocument,
) {
if include_json_parsing {
let lines: Vec<&str> = input.trim().split('\n').collect();
b.iter(|| {
let index = get_index(schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = create_doc(&schema, doc_json);
index_writer.add_document(doc).unwrap();
}
if commit {
index_writer.commit().unwrap();
}
})
} else {
let docs: Vec<_> = input
.trim()
.split('\n')
.map(|doc_json| create_doc(&schema, doc_json))
.collect();
b.iter_batched(
|| docs.clone(),
|docs| {
let index = get_index(schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc).unwrap();
}
if commit {
index_writer.commit().unwrap();
}
},
BatchSize::SmallInput,
)
}
}
fn benchmark_dynamic_json(
b: &mut Bencher,
input: &str,
schema: tantivy::schema::Schema,
commit: bool,
parse_json: bool,
) {
let json_field = schema.get_field("json").unwrap();
_benchmark(b, input, schema, commit, parse_json, |_schema, doc_json| {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
tantivy::doc!(json_field=>json_val)
})
fn get_lines(input: &str) -> Vec<&str> {
input.trim().split('\n').collect()
}
pub fn hdfs_index_benchmark(c: &mut Criterion) {
@@ -104,14 +19,7 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
schema_builder.add_text_field("severity", STRING);
schema_builder.build()
};
let schema_only_fast = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_u64_field("timestamp", FAST);
schema_builder.add_text_field("body", FAST);
schema_builder.add_text_field("severity", FAST);
schema_builder.build()
};
let _schema_with_store = {
let schema_with_store = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_u64_field("timestamp", INDEXED | STORED);
schema_builder.add_text_field("body", TEXT | STORED);
@@ -120,39 +28,74 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
};
let dynamic_schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", TEXT | FAST);
schema_builder.add_json_field("json", TEXT);
schema_builder.build()
};
let mut group = c.benchmark_group("index-hdfs");
group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64));
group.sample_size(20);
let benches = [
("only-indexed-".to_string(), schema, false),
//("stored-".to_string(), _schema_with_store, false),
("only-fast-".to_string(), schema_only_fast, false),
("dynamic-".to_string(), dynamic_schema, true),
];
for (prefix, schema, is_dynamic) in benches {
for commit in [false, true] {
let suffix = if commit { "with-commit" } else { "no-commit" };
for parse_json in [false] {
// for parse_json in [false, true] {
let suffix = if parse_json {
format!("{}-with-json-parsing", suffix)
} else {
format!("{}", suffix)
};
let bench_name = format!("{}{}", prefix, suffix);
group.bench_function(bench_name, |b| {
benchmark(b, HDFS_LOGS, schema.clone(), commit, parse_json, is_dynamic)
});
group.bench_function("index-hdfs-no-commit", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
}
})
});
group.bench_function("index-hdfs-with-commit", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
}
pub fn gh_index_benchmark(c: &mut Criterion) {
@@ -161,24 +104,38 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
schema_builder.add_json_field("json", TEXT | FAST);
schema_builder.build()
};
let dynamic_schema_fast = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", FAST);
schema_builder.build()
};
let mut group = c.benchmark_group("index-gh");
group.throughput(Throughput::Bytes(GH_LOGS.len() as u64));
group.bench_function("index-gh-no-commit", |b| {
benchmark_dynamic_json(b, GH_LOGS, dynamic_schema.clone(), false, false)
let lines = get_lines(GH_LOGS);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-gh-fast", |b| {
benchmark_dynamic_json(b, GH_LOGS, dynamic_schema_fast.clone(), false, false)
});
group.bench_function("index-gh-fast-with-commit", |b| {
benchmark_dynamic_json(b, GH_LOGS, dynamic_schema_fast.clone(), true, false)
group.bench_function("index-gh-with-commit", |b| {
let lines = get_lines(GH_LOGS);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
}
@@ -193,10 +150,33 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
group.throughput(Throughput::Bytes(WIKI.len() as u64));
group.bench_function("index-wiki-no-commit", |b| {
benchmark_dynamic_json(b, WIKI, dynamic_schema.clone(), false, false)
let lines = get_lines(WIKI);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-wiki-with-commit", |b| {
benchmark_dynamic_json(b, WIKI, dynamic_schema.clone(), true, false)
let lines = get_lines(WIKI);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
}
@@ -207,12 +187,12 @@ criterion_group! {
}
criterion_group! {
name = gh_benches;
config = Criterion::default();
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = gh_index_benchmark
}
criterion_group! {
name = wiki_benches;
config = Criterion::default();
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = wiki_index_benchmark
}
criterion_main!(benches, gh_benches, wiki_benches);

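The _benchmark helper in the hunk above relies on criterion's iter_batched so that cloning the pre-parsed documents happens in the setup closure and stays out of the measurement. A standalone sketch of that pattern with criterion 0.5; the process function is a made-up stand-in for "index this batch of documents".

use criterion::{criterion_group, criterion_main, BatchSize, Criterion};

// Hypothetical workload standing in for "add this batch of documents to an index".
fn process(batch: Vec<u64>) -> u64 {
    batch.into_iter().sum()
}

fn bench_process(c: &mut Criterion) {
    let batch: Vec<u64> = (0..10_000).collect();
    c.bench_function("process-batch", |b| {
        b.iter_batched(
            || batch.clone(),        // setup: runs outside the timed section
            |batch| process(batch),  // routine: the only part that is measured
            BatchSize::SmallInput,
        )
    });
}

criterion_group!(benches, bench_process);
criterion_main!(benches);
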
View File

@@ -15,7 +15,7 @@ homepage = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker1x"] }
bitpacking = {version="0.8", default-features=false, features = ["bitpacker1x"]}
[dev-dependencies]
rand = "0.8"

View File

@@ -367,7 +367,7 @@ mod test {
let mut output: Vec<u32> = Vec::new();
for len in [0, 1, 2, 32, 33, 34, 64] {
for start_idx in 0u32..32u32 {
output.resize(len, 0);
output.resize(len as usize, 0);
bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
for i in 0..len {
let expected = (start_idx + i as u32) & mask;

View File

@@ -9,7 +9,8 @@ description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]
[dependencies]
itertools = "0.12.0"
itertools = "0.11.0"
fnv = "1.0.7"
fastdivide = "0.4.0"
stacker = { version= "0.2", path = "../stacker", package="tantivy-stacker"}

View File

@@ -8,6 +8,7 @@ license = "MIT"
columnar = {path="../", package="tantivy-columnar"}
serde_json = "1"
serde_json_borrow = {git="https://github.com/PSeitz/serde_json_borrow/"}
serde = "1"
[workspace]
members = []

View File

@@ -1,8 +1,3 @@
//! # `column_index`
//!
//! `column_index` provides rank and select operations to associate positions when not all
//! documents have exactly one element.
mod merge;
mod multivalued_index;
mod optional_index;
@@ -46,10 +41,10 @@ impl ColumnIndex {
pub fn is_multivalue(&self) -> bool {
matches!(self, ColumnIndex::Multivalued(_))
}
/// Returns the cardinality of the column index.
///
/// By convention, if the column contains no docs, we consider that it is
/// full.
// Returns the cardinality of the column index.
//
// By convention, if the column contains no docs, we consider that it is
// full.
#[inline]
pub fn get_cardinality(&self) -> Cardinality {
match self {

View File

@@ -215,12 +215,12 @@ mod bench {
let vals: Vec<RowId> = (0..TOTAL_NUM_VALUES)
.map(|_| rng.gen_bool(fill_ratio))
.enumerate()
.filter(|(_pos, val)| *val)
.filter(|(pos, val)| *val)
.map(|(pos, _)| pos as RowId)
.collect();
serialize_optional_index(&&vals[..], TOTAL_NUM_VALUES, &mut out).unwrap();
open_optional_index(OwnedBytes::new(out)).unwrap()
let codec = open_optional_index(OwnedBytes::new(out)).unwrap();
codec
}
fn random_range_iterator(
@@ -242,7 +242,7 @@ mod bench {
}
fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> {
let ratio = percent / 100.0;
let ratio = percent as f32 / 100.0;
let step_size = (1f32 / ratio) as u32;
let deviation = step_size - 1;
random_range_iterator(0, num_values, step_size, deviation)

View File

@@ -30,7 +30,6 @@ impl<'a> SerializableColumnIndex<'a> {
}
}
/// Serialize a column index.
pub fn serialize_column_index(
column_index: SerializableColumnIndex,
output: &mut impl Write,
@@ -52,7 +51,6 @@ pub fn serialize_column_index(
Ok(column_index_num_bytes)
}
/// Open a serialized column index.
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
if bytes.is_empty() {
return Err(io::Error::new(

View File

@@ -269,8 +269,7 @@ impl StrOrBytesColumnWriter {
dictionaries: &mut [DictionaryBuilder],
arena: &mut MemoryArena,
) {
let unordered_id =
dictionaries[self.dictionary_id as usize].get_or_allocate_id(bytes, arena);
let unordered_id = dictionaries[self.dictionary_id as usize].get_or_allocate_id(bytes);
self.column_writer.record(doc, unordered_id, arena);
}

View File

@@ -338,7 +338,7 @@ impl ColumnarWriter {
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
.numerical_field_hash_map
.iter()
.map(|(column_name, addr)| {
.map(|(column_name, addr, _)| {
let numerical_column_writer: NumericalColumnWriter =
self.numerical_field_hash_map.read(addr);
let column_type = numerical_column_writer.numerical_type().into();
@@ -348,27 +348,27 @@ impl ColumnarWriter {
columns.extend(
self.bytes_field_hash_map
.iter()
.map(|(term, addr)| (term, ColumnType::Bytes, addr)),
.map(|(term, addr, _)| (term, ColumnType::Bytes, addr)),
);
columns.extend(
self.str_field_hash_map
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::Str, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnType::Str, addr)),
);
columns.extend(
self.bool_field_hash_map
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::Bool, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnType::Bool, addr)),
);
columns.extend(
self.ip_addr_field_hash_map
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::IpAddr, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnType::IpAddr, addr)),
);
columns.extend(
self.datetime_field_hash_map
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::DateTime, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnType::DateTime, addr)),
);
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
@@ -437,7 +437,6 @@ impl ColumnarWriter {
&mut symbol_byte_buffer,
),
buffers,
&self.arena,
&mut column_serializer,
)?;
column_serializer.finalize()?;
@@ -491,7 +490,6 @@ impl ColumnarWriter {
// Serialize [Dictionary, Column, dictionary num bytes U32::LE]
// Column: [Column Index, Column Values, column index num bytes U32::LE]
#[allow(clippy::too_many_arguments)]
fn serialize_bytes_or_str_column(
cardinality: Cardinality,
num_docs: RowId,
@@ -499,7 +497,6 @@ fn serialize_bytes_or_str_column(
dictionary_builder: &DictionaryBuilder,
operation_it: impl Iterator<Item = ColumnOperation<UnorderedId>>,
buffers: &mut SpareBuffers,
arena: &MemoryArena,
wrt: impl io::Write,
) -> io::Result<()> {
let SpareBuffers {
@@ -508,8 +505,7 @@ fn serialize_bytes_or_str_column(
..
} = buffers;
let mut counting_writer = CountingWriter::wrap(wrt);
let term_id_mapping: TermIdMapping =
dictionary_builder.serialize(arena, &mut counting_writer)?;
let term_id_mapping: TermIdMapping = dictionary_builder.serialize(&mut counting_writer)?;
let dictionary_num_bytes: u32 = counting_writer.written_bytes() as u32;
let mut wrt = counting_writer.finish();
let operation_iterator = operation_it.map(|symbol: ColumnOperation<UnorderedId>| {

View File

@@ -1,7 +1,7 @@
use std::io;
use fnv::FnvHashMap;
use sstable::SSTable;
use stacker::{MemoryArena, SharedArenaHashMap};
pub(crate) struct TermIdMapping {
unordered_to_ord: Vec<OrderedId>,
@@ -31,38 +31,29 @@ pub struct OrderedId(pub u32);
/// mapping.
#[derive(Default)]
pub(crate) struct DictionaryBuilder {
dict: SharedArenaHashMap,
dict: FnvHashMap<Vec<u8>, UnorderedId>,
memory_consumption: usize,
}
impl DictionaryBuilder {
/// Get or allocate an unordered id.
/// (This ID is simply an auto-incremented id.)
pub fn get_or_allocate_id(&mut self, term: &[u8], arena: &mut MemoryArena) -> UnorderedId {
let next_id = self.dict.len() as u32;
let unordered_id = self
.dict
.mutate_or_create(term, arena, |unordered_id: Option<u32>| {
if let Some(unordered_id) = unordered_id {
unordered_id
} else {
next_id
}
});
UnorderedId(unordered_id)
pub fn get_or_allocate_id(&mut self, term: &[u8]) -> UnorderedId {
if let Some(term_id) = self.dict.get(term) {
return *term_id;
}
let new_id = UnorderedId(self.dict.len() as u32);
self.dict.insert(term.to_vec(), new_id);
self.memory_consumption += term.len();
self.memory_consumption += 40; // Term Metadata + HashMap overhead
new_id
}
/// Serialize the dictionary into an fst, and returns the
/// `UnorderedId -> TermOrdinal` map.
pub fn serialize<'a, W: io::Write + 'a>(
&self,
arena: &MemoryArena,
wrt: &mut W,
) -> io::Result<TermIdMapping> {
let mut terms: Vec<(&[u8], UnorderedId)> = self
.dict
.iter(arena)
.map(|(k, v)| (k, arena.read(v)))
.collect();
pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result<TermIdMapping> {
let mut terms: Vec<(&[u8], UnorderedId)> =
self.dict.iter().map(|(k, v)| (k.as_slice(), *v)).collect();
terms.sort_unstable_by_key(|(key, _)| *key);
// TODO Remove the allocation.
let mut unordered_to_ord: Vec<OrderedId> = vec![OrderedId(0u32); terms.len()];
@@ -77,7 +68,7 @@ impl DictionaryBuilder {
}
pub(crate) fn mem_usage(&self) -> usize {
self.dict.mem_usage()
self.memory_consumption
}
}
@@ -87,13 +78,12 @@ mod tests {
#[test]
fn test_dictionary_builder() {
let mut arena = MemoryArena::default();
let mut dictionary_builder = DictionaryBuilder::default();
let hello_uid = dictionary_builder.get_or_allocate_id(b"hello", &mut arena);
let happy_uid = dictionary_builder.get_or_allocate_id(b"happy", &mut arena);
let tax_uid = dictionary_builder.get_or_allocate_id(b"tax", &mut arena);
let hello_uid = dictionary_builder.get_or_allocate_id(b"hello");
let happy_uid = dictionary_builder.get_or_allocate_id(b"happy");
let tax_uid = dictionary_builder.get_or_allocate_id(b"tax");
let mut buffer = Vec::new();
let id_mapping = dictionary_builder.serialize(&arena, &mut buffer).unwrap();
let id_mapping = dictionary_builder.serialize(&mut buffer).unwrap();
assert_eq!(id_mapping.to_ord(hello_uid), OrderedId(1));
assert_eq!(id_mapping.to_ord(happy_uid), OrderedId(0));
assert_eq!(id_mapping.to_ord(tax_uid), OrderedId(2));

View File

@@ -1,22 +1,3 @@
//! # Tantivy-Columnar
//!
//! `tantivy-columnar`provides a columnar storage for tantivy.
//! The crate allows for efficient read operations on specific columns rather than entire records.
//!
//! ## Overview
//!
//! - **columnar**: Reading, writing, and merging multiple columns:
//! - **[ColumnarWriter]**: Makes it possible to create a new columnar.
//! - **[ColumnarReader]**: The ColumnarReader makes it possible to access a set of columns
//! associated to field names.
//! - **[merge_columnar]**: Contains the functionalities to merge multiple ColumnarReader or
//! segments into a single one.
//!
//! - **column**: A single column, which contains
//! - [column_index]: Resolves the rows for a document id. Manages the cardinality of the
//! column.
//! - [column_values]: Stores the values of a column in a dense format.
#![cfg_attr(all(feature = "unstable", test), feature(test))]
#[cfg(test)]
@@ -31,7 +12,7 @@ use std::io;
mod block_accessor;
mod column;
pub mod column_index;
mod column_index;
pub mod column_values;
mod columnar;
mod dictionary;

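As a companion to the crate-level overview in the hunk above: a write-then-read round trip through ColumnarWriter and ColumnarReader. The serialize, ColumnarReader::open and read_columns calls mirror ones visible elsewhere in this diff; ColumnarWriter::default() and record_str are assumed from the crate's tests, so treat the exact signatures as approximate.

use tantivy_columnar::{ColumnarReader, ColumnarWriter};

fn main() {
    let mut columnar_writer = ColumnarWriter::default();
    // Record a string value for row 1 in a column named "my_string" (assumed signature).
    columnar_writer.record_str(1u32, "my_string", "hello");

    // Serialize 5 rows total, with no row-id remapping.
    let mut buffer: Vec<u8> = Vec::new();
    columnar_writer.serialize(5, None, &mut buffer).unwrap();

    // Re-open the buffer and look the column up by name.
    let columnar = ColumnarReader::open(buffer).unwrap();
    assert_eq!(columnar.num_columns(), 1);
    let cols = columnar.read_columns("my_string").unwrap();
    assert_eq!(cols.len(), 1);
}
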
View File

@@ -26,7 +26,7 @@ fn test_dataframe_writer_str() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 73);
assert_eq!(cols[0].num_bytes(), 87);
}
#[test]
@@ -40,7 +40,7 @@ fn test_dataframe_writer_bytes() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 73);
assert_eq!(cols[0].num_bytes(), 87);
}
#[test]
@@ -330,9 +330,9 @@ fn bytes_strategy() -> impl Strategy<Value = &'static [u8]> {
// A random column value
fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
prop_oneof![
10 => string_strategy().prop_map(ColumnValue::Str),
1 => bytes_strategy().prop_map(ColumnValue::Bytes),
40 => num_strategy().prop_map(ColumnValue::Numerical),
10 => string_strategy().prop_map(|s| ColumnValue::Str(s)),
1 => bytes_strategy().prop_map(|b| ColumnValue::Bytes(b)),
40 => num_strategy().prop_map(|n| ColumnValue::Numerical(n)),
1 => (1u16..3u16).prop_map(|ip_addr_byte| ColumnValue::IpAddr(Ipv6Addr::new(
127,
0,
@@ -343,7 +343,7 @@ fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
0,
ip_addr_byte
))),
1 => any::<bool>().prop_map(ColumnValue::Bool),
1 => any::<bool>().prop_map(|b| ColumnValue::Bool(b)),
1 => (0_679_723_993i64..1_679_723_995i64)
.prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) })
]
@@ -419,8 +419,8 @@ fn build_columnar_with_mapping(
columnar_writer
.serialize(num_docs, old_to_new_row_ids_opt, &mut buffer)
.unwrap();
ColumnarReader::open(buffer).unwrap()
let columnar_reader = ColumnarReader::open(buffer).unwrap();
columnar_reader
}
fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
@@ -746,7 +746,7 @@ proptest! {
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().flatten().cloned().collect();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
}
@@ -772,7 +772,7 @@ fn test_columnar_merging_empty_columnar() {
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect();
columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
}
@@ -809,7 +809,7 @@ fn test_columnar_merging_number_columns() {
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect();
columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
}

View File

@@ -1,14 +1,11 @@
#![allow(deprecated)]
use std::fmt;
use std::io::{Read, Write};
use serde::{Deserialize, Serialize};
use time::format_description::well_known::Rfc3339;
use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
use crate::BinarySerializable;
/// Precision with which datetimes are truncated when stored in fast fields. This setting is only
/// relevant for fast fields. In the docstore, datetimes are always saved with nanosecond precision.
#[derive(
@@ -167,15 +164,3 @@ impl fmt::Debug for DateTime {
f.write_str(&utc_rfc3339)
}
}
impl BinarySerializable for DateTime {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
let timestamp_micros = self.into_timestamp_micros();
<i64 as BinarySerializable>::serialize(&timestamp_micros, writer)
}
fn deserialize<R: Read>(reader: &mut R) -> std::io::Result<Self> {
let timestamp_micros = <i64 as BinarySerializable>::deserialize(reader)?;
Ok(Self::from_timestamp_micros(timestamp_micros))
}
}

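The BinarySerializable impl in the hunk above stores a DateTime as its microsecond timestamp. A tiny round trip through the two conversion methods it relies on, assuming the type is used via its tantivy::DateTime re-export:

use tantivy::DateTime;

fn main() {
    let micros: i64 = 1_655_900_030_530_000;
    let dt = DateTime::from_timestamp_micros(micros);
    assert_eq!(dt.into_timestamp_micros(), micros);
}
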
View File

@@ -1,112 +0,0 @@
use crate::replace_in_place;
/// Separates the different segments of a json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
pub const JSON_PATH_SEGMENT_SEP_STR: &str =
unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) };
/// Create a new JsonPathWriter, that creates flattened json paths for tantivy.
#[derive(Clone, Debug, Default)]
pub struct JsonPathWriter {
path: String,
indices: Vec<usize>,
expand_dots: bool,
}
impl JsonPathWriter {
pub fn new() -> Self {
JsonPathWriter {
path: String::new(),
indices: Vec::new(),
expand_dots: false,
}
}
/// When expand_dots is enabled, json object like
/// `{"k8s.node.id": 5}` is processed as if it was
/// `{"k8s": {"node": {"id": 5}}}`.
/// This option has the merit of allowing users to
/// write queries like `k8s.node.id:5`.
/// On the other, enabling that feature can lead to
/// ambiguity.
#[inline]
pub fn set_expand_dots(&mut self, expand_dots: bool) {
self.expand_dots = expand_dots;
}
/// Push a new segment to the path.
#[inline]
pub fn push(&mut self, segment: &str) {
let len_path = self.path.len();
self.indices.push(len_path);
if !self.path.is_empty() {
self.path.push_str(JSON_PATH_SEGMENT_SEP_STR);
}
self.path.push_str(segment);
if self.expand_dots {
// This might include the separation byte, which is ok because it is not a dot.
let appended_segment = &mut self.path[len_path..];
// The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
// valid single byte ut8 strings.
// By utf-8 design, they cannot be part of another codepoint.
unsafe {
replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment.as_bytes_mut())
};
}
}
/// Remove the last segment. Does nothing if the path is empty.
#[inline]
pub fn pop(&mut self) {
if let Some(last_idx) = self.indices.pop() {
self.path.truncate(last_idx);
}
}
/// Clear the path.
#[inline]
pub fn clear(&mut self) {
self.path.clear();
self.indices.clear();
}
/// Get the current path.
#[inline]
pub fn as_str(&self) -> &str {
&self.path
}
}
impl From<JsonPathWriter> for String {
#[inline]
fn from(value: JsonPathWriter) -> Self {
value.path
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn json_path_writer_test() {
let mut writer = JsonPathWriter::new();
writer.push("root");
assert_eq!(writer.as_str(), "root");
writer.push("child");
assert_eq!(writer.as_str(), "root\u{1}child");
writer.pop();
assert_eq!(writer.as_str(), "root");
writer.push("k8s.node.id");
assert_eq!(writer.as_str(), "root\u{1}k8s.node.id");
writer.set_expand_dots(true);
writer.pop();
writer.push("k8s.node.id");
assert_eq!(writer.as_str(), "root\u{1}k8s\u{1}node\u{1}id");
}
}

View File

@@ -9,7 +9,6 @@ mod byte_count;
mod datetime;
pub mod file_slice;
mod group_by;
mod json_path_writer;
mod serialize;
mod vint;
mod writer;
@@ -19,7 +18,6 @@ pub use byte_count::ByteCount;
pub use datetime::DatePrecision;
pub use datetime::{DateTime, DateTimePrecision};
pub use group_by::GroupByIteratorExtended;
pub use json_path_writer::JsonPathWriter;
pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{
@@ -118,7 +116,6 @@ pub fn u64_to_f64(val: u64) -> f64 {
///
/// This function assumes that the needle is rarely contained in the bytes string
/// and offers a fast path if the needle is not present.
#[inline]
pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
if !bytes.contains(&needle) {
return;

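The replace_in_place helper touched above is what JsonPathWriter uses to rewrite dots into the JSON path separator byte. A minimal usage sketch, assuming the signature shown in this hunk and the external crate name tantivy_common:

use tantivy_common::replace_in_place; // crate name assumed from the tantivy-common package

fn main() {
    let mut path = b"k8s.node.id".to_vec();
    // Fast path: if b'.' were absent, the function would return without rewriting anything.
    replace_in_place(b'.', 1u8, &mut path);
    assert_eq!(path, b"k8s\x01node\x01id".to_vec());
}
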
View File

@@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::io::{Read, Write};
use std::{fmt, io};
@@ -250,43 +249,6 @@ impl BinarySerializable for String {
}
}
impl<'a> BinarySerializable for Cow<'a, str> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
Ok(Cow::Owned(result))
}
}
impl<'a> BinarySerializable for Cow<'a, [u8]> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
for it in self.iter() {
it.serialize(writer)?;
}
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
let num_items = VInt::deserialize(reader)?.val();
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = u8::deserialize(reader)?;
items.push(item);
}
Ok(Cow::Owned(items))
}
}
#[cfg(test)]
pub mod test {

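The Cow impls in the hunk above follow the same VInt length-prefix convention as the String impl that also lives in this file. A quick round trip through the String impl; the sketch assumes BinarySerializable is used through the common crate's external name, tantivy_common:

use tantivy_common::BinarySerializable;

fn main() -> std::io::Result<()> {
    let mut buf: Vec<u8> = Vec::new();
    // serialize writes a VInt-encoded length followed by the raw UTF-8 bytes.
    "hello".to_string().serialize(&mut buf)?;
    let decoded = String::deserialize(&mut &buf[..])?;
    assert_eq!(decoded, "hello");
    Ok(())
}
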
View File

@@ -12,7 +12,7 @@ use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Create Schema
@@ -132,10 +132,10 @@ fn main() -> tantivy::Result<()> {
let stream = Deserializer::from_str(data).into_iter::<Value>();
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
let mut num_indexed = 0;
for value in stream {
let doc = TantivyDocument::parse_json(&schema, &serde_json::to_string(&value.unwrap())?)?;
let doc = schema.parse_document(&serde_json::to_string(&value.unwrap())?)?;
index_writer.add_document(doc)?;
num_indexed += 1;
if num_indexed > 4 {

View File

@@ -15,7 +15,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -75,7 +75,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
@@ -87,7 +87,7 @@ fn main() -> tantivy::Result<()> {
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let mut old_man_doc = TantivyDocument::default();
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(
body,
@@ -164,7 +164,7 @@ fn main() -> tantivy::Result<()> {
// will reload the index automatically after each commit.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
// We now need to acquire a searcher.
@@ -217,8 +217,8 @@ fn main() -> tantivy::Result<()> {
// the document returned will only contain
// a title.
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
// We can also get an explanation to understand

View File

@@ -13,7 +13,7 @@ use columnar::Column;
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, IndexWriter, Score, SegmentReader};
use tantivy::{doc, Index, Score, SegmentReader};
#[derive(Default)]
struct Stats {
@@ -142,7 +142,7 @@ fn main() -> tantivy::Result<()> {
// this example.
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
product_name => "Super Broom 2000",
product_description => "While it is ok for short distance travel, this broom \

View File

@@ -6,7 +6,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -62,7 +62,7 @@ fn main() -> tantivy::Result<()> {
//
// Here we use a buffer of 50MB per thread. Using a bigger
// memory arena for the indexer can increase its throughput.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
@@ -103,8 +103,8 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (_, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())

View File

@@ -4,8 +4,8 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Document, OwnedValue, Schema, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -22,18 +22,16 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// The dates are passed as string in the RFC3339 format
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"occurred_at": "2022-06-22T12:53:50.53Z",
"event": "pull-request"
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"occurred_at": "2022-06-22T13:00:00.22Z",
"event": "comment"
@@ -60,13 +58,13 @@ fn main() -> tantivy::Result<()> {
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
let retrieved_doc = searcher.doc(doc_address)?;
assert!(matches!(
retrieved_doc.get_first(occurred_at),
Some(OwnedValue::Date(_))
Some(Value::Date(_))
));
assert_eq!(
retrieved_doc.to_json(&schema),
schema.to_json(&retrieved_doc),
r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
);
}

View File

@@ -11,7 +11,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexReader, IndexWriter};
use tantivy::{doc, Index, IndexReader};
// A simple helper function to fetch a single document
// given its id from our index.
@@ -19,7 +19,7 @@ use tantivy::{doc, Index, IndexReader, IndexWriter};
fn extract_doc_given_isbn(
reader: &IndexReader,
isbn_term: &Term,
) -> tantivy::Result<Option<TantivyDocument>> {
) -> tantivy::Result<Option<Document>> {
let searcher = reader.searcher();
// This is the simplest query you can think of.
@@ -69,10 +69,10 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// Let's add a couple of documents, for the sake of the example.
let mut old_man_doc = TantivyDocument::default();
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
index_writer.add_document(doc!(
isbn => "978-0099908401",
@@ -94,7 +94,7 @@ fn main() -> tantivy::Result<()> {
// Oops our frankenstein doc seems misspelled
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!(
frankenstein_doc_misspelled.to_json(&schema),
schema.to_json(&frankenstein_doc_misspelled),
r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
);
@@ -136,7 +136,7 @@ fn main() -> tantivy::Result<()> {
// No more typo!
let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!(
frankenstein_new_doc.to_json(&schema),
schema.to_json(&frankenstein_new_doc),
r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
);

View File

@@ -17,7 +17,7 @@
use tantivy::collector::FacetCollector;
use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the sake of this example
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(30_000_000)?;
let mut index_writer = index.writer(30_000_000)?;
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.

View File

@@ -12,7 +12,7 @@ use std::collections::HashSet;
use tantivy::collector::TopDocs;
use tantivy::query::BooleanQuery;
use tantivy::schema::*;
use tantivy::{doc, DocId, Index, IndexWriter, Score, SegmentReader};
use tantivy::{doc, DocId, Index, Score, SegmentReader};
fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(30_000_000)?;
let mut index_writer = index.writer(30_000_000)?;
index_writer.add_document(doc!(
title => "Fried egg",
@@ -91,10 +91,11 @@ fn main() -> tantivy::Result<()> {
.iter()
.map(|(_, doc_id)| {
searcher
.doc::<TantivyDocument>(*doc_id)
.doc(*doc_id)
.unwrap()
.get_first(title)
.and_then(|v| v.as_str())
.unwrap()
.as_text()
.unwrap()
.to_owned()
})

View File

@@ -14,7 +14,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -66,7 +66,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
@@ -123,7 +123,7 @@ fn main() -> tantivy::Result<()> {
// will reload the index automatically after each commit.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
// We now need to acquire a searcher.
@@ -151,10 +151,10 @@ fn main() -> tantivy::Result<()> {
assert_eq!(count, 3);
assert_eq!(top_docs.len(), 3);
for (score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
// Note that the score is not lower for the fuzzy hit.
// There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("score {score:?} doc {}", retrieved_doc.to_json(&schema));
println!("score {score:?} doc {}", schema.to_json(&retrieved_doc));
// score 1.0 doc {"title":["The Diary of Muadib"]}
//
// score 1.0 doc {"title":["The Diary of a Young Girl"]}

View File

@@ -21,7 +21,7 @@ fn main() -> tantivy::Result<()> {
}"#;
// We can parse our document
let _mice_and_men_doc = TantivyDocument::parse_json(&schema, mice_and_men_doc_json)?;
let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?;
// Multi-valued field are allowed, they are
// expressed in JSON by an array.
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
"title": ["Frankenstein", "The Modern Prometheus"],
"year": 1818
}"#;
let _frankenstein_doc = TantivyDocument::parse_json(&schema, frankenstein_json)?;
let _frankenstein_doc = schema.parse_document(frankenstein_json)?;
// Note that the schema is saved in your index directory.
//

View File

@@ -5,7 +5,7 @@
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, IndexWriter, Result};
use tantivy::{doc, Index, Result};
fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
{
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 6_000_000)?;
let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
for year in 1950u64..2019u64 {
index_writer.add_document(doc!(year_field => year))?;
}

View File

@@ -6,7 +6,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -22,22 +22,20 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// ### IPv4
// Adding documents that contain an IPv4 address. Notice that the IP addresses are passed as
// `String`. Since the field is of type ip, we parse the IP address from the string and store it
// internally as IPv6.
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"ip": "192.168.0.33",
"event_type": "login"
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"ip": "192.168.0.80",
"event_type": "checkout"
@@ -46,8 +44,7 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc)?;
// ### IPv6
// Adding a document that contains an IPv6 address.
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
"event_type": "checkout"

View File

@@ -10,7 +10,7 @@
// ---
// Importing tantivy...
use tantivy::schema::*;
use tantivy::{doc, DocSet, Index, IndexWriter, Postings, TERMINATED};
use tantivy::{doc, DocSet, Index, Postings, TERMINATED};
fn main() -> tantivy::Result<()> {
// We first create a schema for the sake of the
@@ -24,7 +24,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
index_writer.add_document(doc!(title => "Of Mice and Men"))?;
index_writer.add_document(doc!(title => "The modern Promotheus"))?;

View File

@@ -7,7 +7,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -20,9 +20,8 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let doc = TantivyDocument::parse_json(
&schema,
let mut index_writer = index.writer(50_000_000)?;
let doc = schema.parse_document(
r#"{
"timestamp": "2022-02-22T23:20:50.53Z",
"event_type": "click",
@@ -34,8 +33,7 @@ fn main() -> tantivy::Result<()> {
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"timestamp": "2022-02-22T23:20:51.53Z",
"event_type": "click",

View File

@@ -1,7 +1,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy, Result};
use tantivy::{doc, Index, ReloadPolicy, Result};
use tempfile::TempDir;
fn main() -> Result<()> {
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
@@ -51,7 +51,7 @@ fn main() -> Result<()> {
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
let searcher = reader.searcher();
@@ -67,12 +67,8 @@ fn main() -> Result<()> {
let mut titles = top_docs
.into_iter()
.map(|(_score, doc_address)| {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let title = doc
.get_first(title)
.and_then(|v| v.as_str())
.unwrap()
.to_owned();
let doc = searcher.doc(doc_address)?;
let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
Ok(title)
})
.collect::<Result<Vec<_>>>()?;

View File

@@ -13,7 +13,7 @@ use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn pre_tokenize_text(text: &str) -> Vec<Token> {
@@ -38,7 +38,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_dir(&index_path, schema.clone())?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// We can create a document manually, by setting the fields
// one by one in a Document object.
@@ -83,7 +83,7 @@ fn main() -> tantivy::Result<()> {
}]
}"#;
let short_man_doc = TantivyDocument::parse_json(&schema, short_man_json)?;
let short_man_doc = schema.parse_document(short_man_json)?;
index_writer.add_document(short_man_doc)?;
@@ -94,7 +94,7 @@ fn main() -> tantivy::Result<()> {
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
let searcher = reader.searcher();
@@ -115,8 +115,8 @@ fn main() -> tantivy::Result<()> {
// Note that the tokens are not stored along with the original text
// in the document store
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("Document: {}", schema.to_json(&retrieved_doc));
}
// In contrary to the previous query, when we search for the "man" term we

View File

@@ -10,8 +10,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::snippet::{Snippet, SnippetGenerator};
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index, Snippet, SnippetGenerator};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -28,7 +27,7 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// we'll only need one doc for this example.
index_writer.add_document(doc!(
@@ -55,10 +54,13 @@ fn main() -> tantivy::Result<()> {
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
for (score, doc_address) in top_docs {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let doc = searcher.doc(doc_address)?;
let snippet = snippet_generator.snippet_from_doc(&doc);
println!("Document score {score}:");
println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());
println!(
"title: {}",
doc.get_first(title).unwrap().as_text().unwrap()
);
println!("snippet: {}", snippet.to_html());
println!("custom highlighting: {}", highlight(snippet));
}

View File

@@ -15,7 +15,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// this example assumes you understand the content in `basic_search`
@@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> {
index.tokenizers().register("stoppy", tokenizer);
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
@@ -105,9 +105,9 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
let retrieved_doc = searcher.doc(doc_address)?;
println!("\n==\nDocument score {score}:");
println!("{}", retrieved_doc.to_json(&schema));
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())

View File

@@ -6,8 +6,8 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{
doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration, SegmentId,
SegmentReader, Warmer,
doc, DocAddress, DocId, Index, Opstamp, Searcher, SearcherGeneration, SegmentId, SegmentReader,
Warmer,
};
// This example shows how warmers can be used to
@@ -143,7 +143,7 @@ fn main() -> tantivy::Result<()> {
const SNEAKERS: ProductId = 23222;
let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;

View File

@@ -185,7 +185,7 @@ fn term_or_phrase(inp: &str) -> IResult<&str, UserInputLeaf> {
fn term_or_phrase_infallible(inp: &str) -> JResult<&str, Option<UserInputLeaf>> {
map(
// ~* for slop/prefix, ) inside group or ast tree, ^ if boost
tuple_infallible((simple_term_infallible(")^"), slop_or_prefix_val)),
tuple_infallible((simple_term_infallible("*)^"), slop_or_prefix_val)),
|((delimiter_phrase, (slop, prefix)), errors)| {
let leaf = if let Some((delimiter, phrase)) = delimiter_phrase {
Some(
@@ -1113,9 +1113,6 @@ mod test {
test_parse_query_to_ast_helper("'www-form-encoded'", "'www-form-encoded'");
test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded");
test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded");
test_parse_query_to_ast_helper("mr james bo?d", "(*mr *james *bo?d)");
test_parse_query_to_ast_helper("mr james bo*", "(*mr *james *bo*)");
test_parse_query_to_ast_helper("mr james b*d", "(*mr *james *b*d)");
}
#[test]


@@ -48,7 +48,7 @@ mod bench {
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
let few_terms_data = vec!["INFO", "ERROR", "WARN", "DEBUG"];
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
@@ -85,7 +85,7 @@ mod bench {
if cardinality == Cardinality::Sparse {
doc_with_value /= 20;
}
let _val_max = 1_000_000.0;
let val_max = 1_000_000.0;
for _ in 0..doc_with_value {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
let json = if rng.gen_bool(0.1) {


@@ -73,9 +73,9 @@ impl AggregationLimits {
/// Creates a new `ResourceLimitGuard` that will release the memory when dropped.
pub fn new_guard(&self) -> ResourceLimitGuard {
ResourceLimitGuard {
// The counter which is shared between the aggregations for one request.
/// The counter which is shared between the aggregations for one request.
memory_consumption: Arc::clone(&self.memory_consumption),
// The memory_limit in bytes
/// The memory_limit in bytes
memory_limit: self.memory_limit,
allocated_with_the_guard: 0,
}
@@ -134,142 +134,3 @@ impl Drop for ResourceLimitGuard {
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
}
}
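
The guard construction and the `Drop` implementation above follow the usual RAII accounting pattern: a counter shared by all aggregations of a request is increased while memory is tracked and decreased again when the guard is dropped. A minimal, self-contained sketch of that pattern (type and method names here are illustrative, not tantivy's API):

use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

struct MemoryGuard {
    // Counter shared between all consumers of the same memory budget.
    counter: Arc<AtomicU64>,
    allocated: u64,
}

impl MemoryGuard {
    fn track(counter: Arc<AtomicU64>, num_bytes: u64) -> Self {
        // Record the allocation in the shared counter.
        counter.fetch_add(num_bytes, Ordering::Relaxed);
        MemoryGuard { counter, allocated: num_bytes }
    }
}

impl Drop for MemoryGuard {
    fn drop(&mut self) {
        // Release the tracked bytes as soon as the guard goes out of scope.
        self.counter.fetch_sub(self.allocated, Ordering::Relaxed);
    }
}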
#[cfg(test)]
mod tests {
use crate::aggregation::tests::exec_request_with_query;
// https://github.com/quickwit-oss/quickwit/issues/3837
#[test]
fn test_agg_limits_with_empty_merge() {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::bucket::tests::get_test_index_from_docs;
let docs = vec![
vec![r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb", "text2": "bbb" }"#],
vec![r#"{ "text": "aaa", "text2": "bbb" }"#],
];
let index = get_test_index_from_docs(false, &docs).unwrap();
{
let elasticsearch_compatible_json = json!(
{
"1": {
"terms": {"field": "text2", "min_doc_count": 0},
"aggs": {
"2":{
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": "2015-01-01T00:00:00Z",
"max": "2015-01-10T00:00:00Z"
}
}
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request_with_query(agg_req, &index, Some(("text", "bbb"))).unwrap();
let expected_res = json!({
"1": {
"buckets": [
{
"2": {
"buckets": [
{ "doc_count": 0, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" },
{ "doc_count": 1, "key": 1420156800000.0, "key_as_string": "2015-01-02T00:00:00Z" },
{ "doc_count": 0, "key": 1420243200000.0, "key_as_string": "2015-01-03T00:00:00Z" },
{ "doc_count": 0, "key": 1420329600000.0, "key_as_string": "2015-01-04T00:00:00Z" },
{ "doc_count": 0, "key": 1420416000000.0, "key_as_string": "2015-01-05T00:00:00Z" },
{ "doc_count": 0, "key": 1420502400000.0, "key_as_string": "2015-01-06T00:00:00Z" },
{ "doc_count": 0, "key": 1420588800000.0, "key_as_string": "2015-01-07T00:00:00Z" },
{ "doc_count": 0, "key": 1420675200000.0, "key_as_string": "2015-01-08T00:00:00Z" },
{ "doc_count": 0, "key": 1420761600000.0, "key_as_string": "2015-01-09T00:00:00Z" },
{ "doc_count": 0, "key": 1420848000000.0, "key_as_string": "2015-01-10T00:00:00Z" }
]
},
"doc_count": 1,
"key": "bbb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
});
assert_eq!(res, expected_res);
}
}
// https://github.com/quickwit-oss/quickwit/issues/3837
#[test]
fn test_agg_limits_with_empty_data() {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::bucket::tests::get_test_index_from_docs;
let docs = vec![vec![r#"{ "text": "aaa", "text2": "bbb" }"#]];
let index = get_test_index_from_docs(false, &docs).unwrap();
{
// Empty result since there are no docs with dates
let elasticsearch_compatible_json = json!(
{
"1": {
"terms": {"field": "text2", "min_doc_count": 0},
"aggs": {
"2":{
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": "2015-01-01T00:00:00Z",
"max": "2015-01-10T00:00:00Z"
}
}
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request_with_query(agg_req, &index, Some(("text", "bbb"))).unwrap();
let expected_res = json!({
"1": {
"buckets": [
{
"2": {
"buckets": [
{ "doc_count": 0, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" },
{ "doc_count": 0, "key": 1420156800000.0, "key_as_string": "2015-01-02T00:00:00Z" },
{ "doc_count": 0, "key": 1420243200000.0, "key_as_string": "2015-01-03T00:00:00Z" },
{ "doc_count": 0, "key": 1420329600000.0, "key_as_string": "2015-01-04T00:00:00Z" },
{ "doc_count": 0, "key": 1420416000000.0, "key_as_string": "2015-01-05T00:00:00Z" },
{ "doc_count": 0, "key": 1420502400000.0, "key_as_string": "2015-01-06T00:00:00Z" },
{ "doc_count": 0, "key": 1420588800000.0, "key_as_string": "2015-01-07T00:00:00Z" },
{ "doc_count": 0, "key": 1420675200000.0, "key_as_string": "2015-01-08T00:00:00Z" },
{ "doc_count": 0, "key": 1420761600000.0, "key_as_string": "2015-01-09T00:00:00Z" },
{ "doc_count": 0, "key": 1420848000000.0, "key_as_string": "2015-01-10T00:00:00Z" }
]
},
"doc_count": 0,
"key": "bbb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
});
assert_eq!(res, expected_res);
}
}
}


@@ -103,8 +103,7 @@ impl AggregationWithAccessor {
field: field_name, ..
}) => {
let (accessor, column_type) =
// Only DateTime is supported for DateHistogram
get_ff_reader(reader, field_name, Some(&[ColumnType::DateTime]))?;
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
}
Terms(TermsAggregation {


@@ -9,7 +9,7 @@ use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_v
use crate::aggregation::DistributedAggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, IndexWriter, Term};
use crate::{Index, Term};
fn get_avg_req(field_name: &str) -> Aggregation {
serde_json::from_value(json!({
@@ -586,7 +586,7 @@ fn test_aggregation_on_json_object() {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"color": "red"})))
.unwrap();
@@ -624,72 +624,13 @@ fn test_aggregation_on_json_object() {
);
}
#[test]
fn test_aggregation_on_nested_json_object() {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json.blub", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"color.dot": "red", "color": {"nested":"red"} })))
.unwrap();
index_writer
.add_document(doc!(json => json!({"color.dot": "blue", "color": {"nested":"blue"} })))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let agg: Aggregations = serde_json::from_value(json!({
"jsonagg1": {
"terms": {
"field": "json\\.blub.color\\.dot",
}
},
"jsonagg2": {
"terms": {
"field": "json\\.blub.color.nested",
}
}
}))
.unwrap();
let aggregation_collector = get_collector(agg);
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
assert_eq!(
&aggregation_res_json,
&serde_json::json!({
"jsonagg1": {
"buckets": [
{"doc_count": 1, "key": "blue"},
{"doc_count": 1, "key": "red"}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
},
"jsonagg2": {
"buckets": [
{"doc_count": 1, "key": "blue"},
{"doc_count": 1, "key": "red"}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
})
);
}
#[test]
fn test_aggregation_on_json_object_empty_columns() {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Empty column when accessing color
index_writer
.add_document(doc!(json => json!({"price": 10.0})))
@@ -807,7 +748,7 @@ fn test_aggregation_on_json_object_mixed_types() {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))


@@ -132,7 +132,6 @@ impl DateHistogramAggregationReq {
hard_bounds: self.hard_bounds,
extended_bounds: self.extended_bounds,
keyed: self.keyed,
is_normalized_to_ns: false,
})
}
@@ -244,15 +243,15 @@ fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
}
#[cfg(test)]
pub mod tests {
mod tests {
use pretty_assertions::assert_eq;
use super::*;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request;
use crate::indexer::NoMergePolicy;
use crate::schema::{Schema, FAST, STRING};
use crate::{Index, IndexWriter, TantivyDocument};
use crate::schema::{Schema, FAST};
use crate::Index;
#[test]
fn test_parse_into_millisecs() {
@@ -307,8 +306,7 @@ pub mod tests {
) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
schema_builder.add_date_field("date", FAST);
schema_builder.add_text_field("text", FAST | STRING);
schema_builder.add_text_field("text2", FAST | STRING);
schema_builder.add_text_field("text", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
@@ -316,7 +314,7 @@ pub mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for values in segment_and_docs {
for doc_str in values {
let doc = TantivyDocument::parse_json(&schema, doc_str)?;
let doc = schema.parse_document(doc_str)?;
index_writer.add_document(doc)?;
}
// writing the segment
@@ -328,7 +326,7 @@ pub mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() > 1 {
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}


@@ -122,14 +122,11 @@ pub struct HistogramAggregation {
/// Whether to return the buckets as a hash map
#[serde(default)]
pub keyed: bool,
/// Whether date time values have already been normalized to nanoseconds. Defaults to false.
#[serde(default)]
pub is_normalized_to_ns: bool,
}
impl HistogramAggregation {
pub(crate) fn normalize_date_time(&mut self) {
if !self.is_normalized_to_ns {
pub(crate) fn normalize(&mut self, column_type: ColumnType) {
if column_type.is_date_time() {
// values are provided in ms, but the fastfield is in nano seconds
self.interval *= 1_000_000.0;
self.offset = self.offset.map(|off| off * 1_000_000.0);
@@ -141,7 +138,6 @@ impl HistogramAggregation {
min: bounds.min * 1_000_000.0,
max: bounds.max * 1_000_000.0,
});
self.is_normalized_to_ns = true;
}
}
@@ -374,7 +370,7 @@ impl SegmentHistogramCollector {
Ok(IntermediateBucketResult::Histogram {
buckets,
is_date_agg: self.column_type == ColumnType::DateTime,
column_type: Some(self.column_type),
})
}
@@ -385,9 +381,7 @@ impl SegmentHistogramCollector {
accessor_idx: usize,
) -> crate::Result<Self> {
req.validate()?;
if field_type == ColumnType::DateTime {
req.normalize_date_time();
}
req.normalize(field_type);
let sub_aggregation_blueprint = if sub_aggregation.is_empty() {
None
@@ -445,7 +439,6 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
// memory check upfront
let (_, first_bucket_num, last_bucket_num) =
generate_bucket_pos_with_opt_minmax(histogram_req, min_max);
// It's based on user input, so we need to account for overflows
let added_buckets = ((last_bucket_num.saturating_sub(first_bucket_num)).max(0) as u64)
.saturating_sub(buckets.len() as u64);
@@ -489,7 +482,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
// Convert to BucketEntry
pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
buckets: Vec<IntermediateHistogramBucketEntry>,
is_date_agg: bool,
column_type: Option<ColumnType>,
histogram_req: &HistogramAggregation,
sub_aggregation: &Aggregations,
limits: &AggregationLimits,
@@ -498,8 +491,8 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
// The request used in the call to final is not yet normalized.
// Normalization is changing the precision from milliseconds to nanoseconds.
let mut histogram_req = histogram_req.clone();
if is_date_agg {
histogram_req.normalize_date_time();
if let Some(column_type) = column_type {
histogram_req.normalize(column_type);
}
let mut buckets = if histogram_req.min_doc_count() == 0 {
// With min_doc_count != 0, we may need to add buckets, so that there are no
@@ -523,7 +516,7 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
// If we have a date type on the histogram buckets, we add the `key_as_string` field as rfc3339
// and normalize from nanoseconds to milliseconds
if is_date_agg {
if column_type == Some(ColumnType::DateTime) {
for bucket in buckets.iter_mut() {
if let crate::aggregation::Key::F64(ref mut val) = bucket.key {
let key_as_string = format_date(*val as i64)?;

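The normalization touched in these hunks is a plain unit conversion: the aggregation request expresses interval, offset and bounds in milliseconds, while the date fast field stores nanoseconds, so each value is scaled by 1_000_000. A simplified sketch of that conversion (the struct shape is illustrative, not the exact tantivy types):

const MS_TO_NS: f64 = 1_000_000.0;

struct Bounds { min: f64, max: f64 }

struct HistogramRequest {
    interval: f64,
    offset: Option<f64>,
    bounds: Option<Bounds>,
}

impl HistogramRequest {
    // Scale a request given in milliseconds to the nanosecond resolution of the date fast field.
    fn to_nanoseconds(mut self) -> Self {
        self.interval *= MS_TO_NS;
        self.offset = self.offset.map(|off| off * MS_TO_NS);
        self.bounds = self.bounds.map(|b| Bounds { min: b.min * MS_TO_NS, max: b.max * MS_TO_NS });
        self
    }
}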

@@ -601,7 +601,7 @@ mod tests {
use crate::aggregation::AggregationLimits;
use crate::indexer::NoMergePolicy;
use crate::schema::{Schema, FAST, STRING};
use crate::{Index, IndexWriter};
use crate::Index;
#[test]
fn terms_aggregation_test_single_segment() -> crate::Result<()> {
@@ -1473,7 +1473,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();


@@ -117,7 +117,7 @@ mod tests {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST};
use crate::{Index, IndexWriter};
use crate::Index;
#[test]
fn terms_aggregation_missing_mixed_type_mult_seg_sub_agg() -> crate::Result<()> {
@@ -126,7 +126,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))
@@ -186,7 +186,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))?;
index_writer.add_document(doc!(score => 5.0))?;
@@ -231,7 +231,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
@@ -278,7 +278,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.add_document(doc!(score => 5.0))?;
@@ -323,7 +323,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -385,7 +385,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -427,7 +427,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))


@@ -172,16 +172,10 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
Range(_) => IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(
Default::default(),
)),
Histogram(_) => {
Histogram(_) | DateHistogram(_) => {
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Histogram {
buckets: Vec::new(),
is_date_agg: false,
})
}
DateHistogram(_) => {
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Histogram {
buckets: Vec::new(),
is_date_agg: true,
column_type: None,
})
}
Average(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Average(
@@ -349,8 +343,8 @@ pub enum IntermediateBucketResult {
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
Histogram {
/// Whether the column_type of the underlying `Column` is DateTime
is_date_agg: bool,
/// The column_type of the underlying `Column`
column_type: Option<ColumnType>,
/// The buckets
buckets: Vec<IntermediateHistogramBucketEntry>,
},
@@ -405,7 +399,7 @@ impl IntermediateBucketResult {
Ok(BucketResult::Range { buckets })
}
IntermediateBucketResult::Histogram {
is_date_agg,
column_type,
buckets,
} => {
let histogram_req = &req
@@ -414,7 +408,7 @@ impl IntermediateBucketResult {
.expect("unexpected aggregation, expected histogram aggregation");
let buckets = intermediate_histogram_buckets_to_final_buckets(
buckets,
is_date_agg,
column_type,
histogram_req,
req.sub_aggregation(),
limits,
@@ -463,11 +457,11 @@ impl IntermediateBucketResult {
(
IntermediateBucketResult::Histogram {
buckets: buckets_left,
is_date_agg: _,
..
},
IntermediateBucketResult::Histogram {
buckets: buckets_right,
is_date_agg: _,
..
},
) => {
let buckets: Result<Vec<IntermediateHistogramBucketEntry>, TantivyError> =


@@ -71,7 +71,7 @@ mod tests {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST};
use crate::{Index, IndexWriter};
use crate::Index;
#[test]
fn test_max_agg_with_missing() -> crate::Result<()> {
@@ -79,7 +79,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();


@@ -88,7 +88,7 @@ mod tests {
use crate::aggregation::AggregationCollector;
use crate::query::AllQuery;
use crate::schema::{NumericOptions, Schema};
use crate::{Index, IndexWriter};
use crate::Index;
#[test]
fn test_metric_aggregations() {
@@ -96,7 +96,7 @@ mod tests {
let field_options = NumericOptions::default().set_fast();
let field = schema_builder.add_f64_field("price", field_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for i in 0..3 {
index_writer


@@ -300,7 +300,7 @@ mod tests {
use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, IndexWriter, Term};
use crate::{Index, Term};
#[test]
fn test_aggregation_stats_empty_index() -> crate::Result<()> {
@@ -494,7 +494,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();
@@ -541,7 +541,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();


@@ -319,7 +319,7 @@ mod tests {
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::{Index, IndexWriter, Term};
use crate::{Index, Term};
pub fn get_test_index_with_num_docs(
merge_segments: bool,
@@ -451,7 +451,7 @@ mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() > 1 {
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -565,7 +565,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}


@@ -495,8 +495,8 @@ mod tests {
use crate::collector::Count;
use crate::core::Index;
use crate::query::{AllQuery, QueryParser, TermQuery};
use crate::schema::{Facet, FacetOptions, IndexRecordOption, Schema, TantivyDocument};
use crate::{IndexWriter, Term};
use crate::schema::{Document, Facet, FacetOptions, IndexRecordOption, Schema};
use crate::Term;
fn test_collapse_mapping_aux(
facet_terms: &[&str],
@@ -559,7 +559,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(facet_field=>Facet::from("/facet/a")))
.unwrap();
@@ -588,7 +588,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| {
@@ -601,7 +601,7 @@ mod tests {
})
.collect();
for i in 0..num_facets * 10 {
let mut doc = TantivyDocument::new();
let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone());
index_writer.add_document(doc).unwrap();
}
@@ -732,25 +732,24 @@ mod tests {
let index = Index::create_in_ram(schema);
let uniform = Uniform::new_inclusive(1, 100_000);
let mut docs: Vec<TantivyDocument> =
vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(uniform)),
);
doc
})
.collect();
let mut docs: Vec<Document> = vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(uniform)),
);
doc
})
.collect();
docs[..].shuffle(&mut thread_rng());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc).unwrap();
}
@@ -781,7 +780,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)]
let docs: Vec<Document> = vec![("b", 2), ("a", 2), ("c", 4)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
@@ -829,7 +828,7 @@ mod bench {
use crate::collector::FacetCollector;
use crate::query::AllQuery;
use crate::schema::{Facet, Schema, INDEXED};
use crate::{Index, IndexWriter};
use crate::Index;
#[bench]
fn bench_facet_collector(b: &mut Bencher) {
@@ -848,7 +847,7 @@ mod bench {
// 40425 docs
docs[..].shuffle(&mut thread_rng());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc).unwrap();
}


@@ -12,7 +12,8 @@ use std::marker::PhantomData;
use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType};
use crate::collector::{Collector, SegmentCollector};
use crate::{DocId, Score, SegmentReader};
use crate::schema::Field;
use crate::{DocId, Score, SegmentReader, TantivyError};
/// The `FilterCollector` filters docs using a fast field value and a predicate.
///
@@ -49,13 +50,13 @@ use crate::{DocId, Score, SegmentReader};
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// let no_filter_collector = FilterCollector::new("price".to_string(), |value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let no_filter_collector = FilterCollector::new(price, |value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new("price".to_string(), |value| value < 5u64, TopDocs::with_limit(2));
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, |value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
///
/// assert_eq!(filtered_top_docs.len(), 0);
@@ -69,7 +70,7 @@ use crate::{DocId, Score, SegmentReader};
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue>
where TPredicate: 'static + Clone
{
field: String,
field: Field,
collector: TCollector,
predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,
@@ -82,7 +83,7 @@ where
TPredicate: Fn(TPredicateValue) -> bool + Send + Sync + Clone,
{
/// Create a new `FilterCollector`.
pub fn new(field: String, predicate: TPredicate, collector: TCollector) -> Self {
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self {
field,
predicate,
@@ -109,7 +110,18 @@ where
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let column_opt = segment_reader.fast_fields().column_opt(&self.field)?;
let schema = segment_reader.schema();
let field_entry = schema.get_field_entry(self.field);
if !field_entry.is_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
)));
}
let column_opt = segment_reader
.fast_fields()
.column_opt(field_entry.name())?;
let segment_collector = self
.collector
@@ -217,7 +229,7 @@ where
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// let filter_collector = BytesFilterCollector::new("barcode".to_string(), |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
/// let filter_collector = BytesFilterCollector::new(barcode, |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
@@ -228,7 +240,7 @@ where
pub struct BytesFilterCollector<TCollector, TPredicate>
where TPredicate: 'static + Clone
{
field: String,
field: Field,
collector: TCollector,
predicate: TPredicate,
}
@@ -239,7 +251,7 @@ where
TPredicate: Fn(&[u8]) -> bool + Send + Sync + Clone,
{
/// Create a new `BytesFilterCollector`.
pub fn new(field: String, predicate: TPredicate, collector: TCollector) -> Self {
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self {
field,
predicate,
@@ -262,7 +274,10 @@ where
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let column_opt = segment_reader.fast_fields().bytes(&self.field)?;
let schema = segment_reader.schema();
let field_name = schema.get_field_name(self.field);
let column_opt = segment_reader.fast_fields().bytes(field_name)?;
let segment_collector = self
.collector

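Condensed, the doctest above amounts to the following usage of the `Field`-based constructor shown in this version (schema setup and indexing elided, as in the doctest):

// Resolve the fast field once from the schema, then filter hits on its value.
let price = schema.get_field("price").unwrap();
let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;

// Keep only hits whose `price` fast field value exceeds 20_120.
let collector = FilterCollector::new(price, |value: u64| value > 20_120u64, TopDocs::with_limit(2));
let top_docs = searcher.search(&query, &collector)?;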

@@ -97,7 +97,7 @@ pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
mod top_collector;
mod top_score_collector;
pub use self::top_score_collector::{TopDocs, TopNComputer};
pub use self::top_score_collector::TopDocs;
mod custom_score_top_collector;
pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};


@@ -7,9 +7,7 @@ use crate::query::{AllQuery, QueryParser};
use crate::schema::{Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{
doc, DateTime, DocAddress, DocId, Index, Score, Searcher, SegmentOrdinal, TantivyDocument,
};
use crate::{doc, DateTime, DocAddress, DocId, Document, Index, Score, Searcher, SegmentOrdinal};
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
compute_score: true,
@@ -42,7 +40,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
let query_parser = QueryParser::for_index(&index, vec![title]);
let query = query_parser.parse_query("diary")?;
let filter_some_collector = FilterCollector::new(
"price".to_string(),
price,
&|value: u64| value > 20_120u64,
TopDocs::with_limit(2),
);
@@ -51,11 +49,8 @@ pub fn test_filter_collector() -> crate::Result<()> {
assert_eq!(top_docs.len(), 1);
assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(
"price".to_string(),
&|value| value < 5u64,
TopDocs::with_limit(2),
);
let filter_all_collector: FilterCollector<_, _, u64> =
FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
assert_eq!(filtered_top_docs.len(), 0);
@@ -66,8 +61,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
> 0
}
let filter_dates_collector =
FilterCollector::new("date".to_string(), &date_filter, TopDocs::with_limit(5));
let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?;
assert_eq!(filtered_date_docs.len(), 2);
@@ -286,8 +280,8 @@ fn make_test_searcher() -> crate::Result<Searcher> {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.add_document(Document::default())?;
index_writer.add_document(Document::default())?;
index_writer.commit()?;
Ok(index.reader()?.searcher())
}


@@ -1,7 +1,7 @@
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::marker::PhantomData;
use super::top_score_collector::TopNComputer;
use crate::{DocAddress, DocId, SegmentOrdinal, SegmentReader};
/// Contains a feature (field, score, etc.) of a document along with the document address.
@@ -20,14 +20,6 @@ pub(crate) struct ComparableDoc<T, D> {
pub feature: T,
pub doc: D,
}
impl<T: std::fmt::Debug, D: std::fmt::Debug> std::fmt::Debug for ComparableDoc<T, D> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ComparableDoc")
.field("feature", &self.feature)
.field("doc", &self.doc)
.finish()
}
}
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
@@ -99,13 +91,18 @@ where T: PartialOrd + Clone
if self.limit == 0 {
return Ok(Vec::new());
}
let mut top_collector = TopNComputer::new(self.limit + self.offset);
let mut top_collector = BinaryHeap::new();
for child_fruit in children {
for (feature, doc) in child_fruit {
top_collector.push(ComparableDoc { feature, doc });
if top_collector.len() < (self.limit + self.offset) {
top_collector.push(ComparableDoc { feature, doc });
} else if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
}
}
}
}
Ok(top_collector
.into_sorted_vec()
.into_iter()
@@ -114,7 +111,7 @@ where T: PartialOrd + Clone
.collect())
}
pub(crate) fn for_segment<F: PartialOrd + Clone>(
pub(crate) fn for_segment<F: PartialOrd>(
&self,
segment_id: SegmentOrdinal,
_: &SegmentReader,
@@ -139,18 +136,20 @@ where T: PartialOrd + Clone
/// The Top Collector keeps track of the K documents
/// sorted by type `T`.
///
/// The implementation is based on repeatedly truncating at the median after K * 2 documents
/// The implementation is based on a `BinaryHeap`.
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n + K)`.
/// is `O(n log K)`.
pub(crate) struct TopSegmentCollector<T> {
topn_computer: TopNComputer<T, DocId>,
limit: usize,
heap: BinaryHeap<ComparableDoc<T, DocId>>,
segment_ord: u32,
}
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
impl<T: PartialOrd> TopSegmentCollector<T> {
fn new(segment_ord: SegmentOrdinal, limit: usize) -> TopSegmentCollector<T> {
TopSegmentCollector {
topn_computer: TopNComputer::new(limit),
limit,
heap: BinaryHeap::with_capacity(limit),
segment_ord,
}
}
@@ -159,7 +158,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
pub fn harvest(self) -> Vec<(T, DocAddress)> {
let segment_ord = self.segment_ord;
self.topn_computer
self.heap
.into_sorted_vec()
.into_iter()
.map(|comparable_doc| {
@@ -174,13 +173,33 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
.collect()
}
/// Returns true if the number of collected documents has reached the limit.
#[inline]
pub(crate) fn at_capacity(&self) -> bool {
self.heap.len() >= self.limit
}
/// Collects a document scored by the given feature
///
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it
/// will compare the lowest scoring item with the given one and keep whichever is greater.
#[inline]
pub fn collect(&mut self, doc: DocId, feature: T) {
self.topn_computer.push(ComparableDoc { feature, doc });
if self.at_capacity() {
// It's ok to unwrap as long as a limit of 0 is forbidden.
if let Some(limit_feature) = self.heap.peek().map(|head| head.feature.clone()) {
if limit_feature < feature {
if let Some(mut head) = self.heap.peek_mut() {
head.feature = feature;
head.doc = doc;
}
}
}
} else {
// we have not reached capacity yet, so we can just push the
// element.
self.heap.push(ComparableDoc { feature, doc });
}
}
}


@@ -1,3 +1,4 @@
use std::collections::BinaryHeap;
use std::fmt;
use std::marker::PhantomData;
use std::sync::Arc;
@@ -85,15 +86,12 @@ where
/// The `TopDocs` collector keeps track of the top `K` documents
/// sorted by their score.
///
/// The implementation is based on repeatedly truncating at the median after K * 2 documents
/// with pattern-defeating quicksort.
/// The theoretical complexity for collecting the top `K` out of `N` documents
/// is `O(N + K)`.
/// The implementation is based on a `BinaryHeap`.
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
///
/// This collector does not guarantee stable sorting in case of a tie on the
/// document score; for stable sorting, `PartialOrd` needs to resolve the tie on other
/// fields, such as the docid.
/// Only then is it suitable for pagination.
/// This collector guarantees a stable sorting in case of a tie on the
/// document score. As such, it is suitable to implement pagination.
///
/// ```rust
/// use tantivy::collector::TopDocs;
@@ -663,35 +661,50 @@ impl Collector for TopDocs {
reader: &SegmentReader,
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let heap_len = self.0.limit + self.0.offset;
let mut top_n = TopNComputer::new(heap_len);
let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> = BinaryHeap::with_capacity(heap_len);
if let Some(alive_bitset) = reader.alive_bitset() {
let mut threshold = Score::MIN;
top_n.threshold = Some(threshold);
weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
weight.for_each_pruning(threshold, reader, &mut |doc, score| {
if alive_bitset.is_deleted(doc) {
return threshold;
}
let doc = ComparableDoc {
let heap_item = ComparableDoc {
feature: score,
doc,
};
top_n.push(doc);
threshold = top_n.threshold.unwrap_or(Score::MIN);
if heap.len() < heap_len {
heap.push(heap_item);
if heap.len() == heap_len {
threshold = heap.peek().map(|el| el.feature).unwrap_or(Score::MIN);
}
return threshold;
}
*heap.peek_mut().unwrap() = heap_item;
threshold = heap.peek().map(|el| el.feature).unwrap_or(Score::MIN);
threshold
})?;
} else {
weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
let doc = ComparableDoc {
let heap_item = ComparableDoc {
feature: score,
doc,
};
top_n.push(doc);
top_n.threshold.unwrap_or(Score::MIN)
if heap.len() < heap_len {
heap.push(heap_item);
// TODO the threshold is suboptimal for heap.len == heap_len
if heap.len() == heap_len {
return heap.peek().map(|el| el.feature).unwrap_or(Score::MIN);
} else {
return Score::MIN;
}
}
*heap.peek_mut().unwrap() = heap_item;
heap.peek().map(|el| el.feature).unwrap_or(Score::MIN)
})?;
}
let fruit = top_n
let fruit = heap
.into_sorted_vec()
.into_iter()
.map(|cid| {
@@ -723,81 +736,9 @@ impl SegmentCollector for TopScoreSegmentCollector {
}
}
/// Fast TopN Computation
///
/// For TopN == 0, it will be relatively expensive.
pub struct TopNComputer<Score, DocId> {
buffer: Vec<ComparableDoc<Score, DocId>>,
top_n: usize,
pub(crate) threshold: Option<Score>,
}
impl<Score, DocId> TopNComputer<Score, DocId>
where
Score: PartialOrd + Clone,
DocId: Ord + Clone,
{
/// Create a new `TopNComputer`.
/// Internally it will allocate a buffer of size `2 * top_n`.
pub fn new(top_n: usize) -> Self {
let vec_cap = top_n.max(1) * 2;
TopNComputer {
buffer: Vec::with_capacity(vec_cap),
top_n,
threshold: None,
}
}
#[inline]
pub(crate) fn push(&mut self, doc: ComparableDoc<Score, DocId>) {
if let Some(last_median) = self.threshold.clone() {
if doc.feature < last_median {
return;
}
}
if self.buffer.len() == self.buffer.capacity() {
let median = self.truncate_top_n();
self.threshold = Some(median);
}
// This is faster since it avoids inlining the buffer-resizing code from vec.push()
// (this is in the hot path)
// TODO: Replace with `push_within_capacity` when it's stabilized
let uninit = self.buffer.spare_capacity_mut();
// This cannot panic, because truncate_top_n will always remove at least one element, since
// the min capacity is 2.
uninit[0].write(doc);
// This is safe because the write above would already have panicked if there were no spare slot
unsafe {
self.buffer.set_len(self.buffer.len() + 1);
}
}
#[inline(never)]
fn truncate_top_n(&mut self) -> Score {
// Use select_nth_unstable to find the top nth score
let (_, median_el, _) = self.buffer.select_nth_unstable(self.top_n);
let median_score = median_el.feature.clone();
// Remove all elements below the top_n
self.buffer.truncate(self.top_n);
median_score
}
pub(crate) fn into_sorted_vec(mut self) -> Vec<ComparableDoc<Score, DocId>> {
if self.buffer.len() > self.top_n {
self.truncate_top_n();
}
self.buffer.sort_unstable();
self.buffer
}
}
#[cfg(test)]
mod tests {
use super::{TopDocs, TopNComputer};
use crate::collector::top_collector::ComparableDoc;
use super::TopDocs;
use crate::collector::Collector;
use crate::query::{AllQuery, Query, QueryParser};
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
@@ -826,78 +767,6 @@ mod tests {
}
}
#[test]
fn test_empty_topn_computer() {
let mut computer: TopNComputer<u32, u32> = TopNComputer::new(0);
computer.push(ComparableDoc {
feature: 1u32,
doc: 1u32,
});
computer.push(ComparableDoc {
feature: 1u32,
doc: 2u32,
});
computer.push(ComparableDoc {
feature: 1u32,
doc: 3u32,
});
assert!(computer.into_sorted_vec().is_empty());
}
#[test]
fn test_topn_computer() {
let mut computer: TopNComputer<u32, u32> = TopNComputer::new(2);
computer.push(ComparableDoc {
feature: 1u32,
doc: 1u32,
});
computer.push(ComparableDoc {
feature: 2u32,
doc: 2u32,
});
computer.push(ComparableDoc {
feature: 3u32,
doc: 3u32,
});
computer.push(ComparableDoc {
feature: 2u32,
doc: 4u32,
});
computer.push(ComparableDoc {
feature: 1u32,
doc: 5u32,
});
assert_eq!(
computer.into_sorted_vec(),
&[
ComparableDoc {
feature: 3u32,
doc: 3u32,
},
ComparableDoc {
feature: 2u32,
doc: 2u32,
}
]
);
}
#[test]
fn test_topn_computer_no_panic() {
for top_n in 0..10 {
let mut computer: TopNComputer<u32, u32> = TopNComputer::new(top_n);
for _ in 0..1 + top_n * 2 {
computer.push(ComparableDoc {
feature: 1u32,
doc: 1u32,
});
}
let _vals = computer.into_sorted_vec();
}
}
#[test]
fn test_top_collector_not_at_capacity_without_offset() -> crate::Result<()> {
let index = make_index()?;
@@ -983,25 +852,20 @@ mod tests {
// using AllQuery to get a constant score
let searcher = index.reader().unwrap().searcher();
let page_0 = searcher.search(&AllQuery, &TopDocs::with_limit(1)).unwrap();
let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap();
let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap();
// precondition for the test to be meaningful: we did get documents
// with the same score
assert!(page_0.iter().all(|result| result.0 == page_1[0].0));
assert!(page_1.iter().all(|result| result.0 == page_1[0].0));
assert!(page_2.iter().all(|result| result.0 == page_2[0].0));
// sanity check since we're relying on make_index()
assert_eq!(page_0.len(), 1);
assert_eq!(page_1.len(), 2);
assert_eq!(page_2.len(), 3);
assert_eq!(page_1, &page_2[..page_1.len()]);
assert_eq!(page_0, &page_2[..page_0.len()]);
}
#[test]

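For reference on the complexity trade-off discussed in the doc comments above: the `O(n log K)` strategy keeps a bounded min-heap of the best `K` entries and evicts the current minimum whenever a better candidate arrives, while the `TopNComputer` variant buffers up to `2 * K` entries and periodically truncates at the median with `select_nth_unstable`. A generic, self-contained sketch of the heap variant (plain `u64` scores instead of tantivy's `ComparableDoc`):

use std::cmp::Reverse;
use std::collections::BinaryHeap;

// Keep the `k` largest values seen so far; O(n log k) overall.
fn top_k(values: impl IntoIterator<Item = u64>, k: usize) -> Vec<u64> {
    // `Reverse` turns std's max-heap into a min-heap, so `peek_mut` exposes the
    // smallest retained value, i.e. the current eviction threshold.
    let mut heap: BinaryHeap<Reverse<u64>> = BinaryHeap::with_capacity(k);
    for value in values {
        if heap.len() < k {
            heap.push(Reverse(value));
        } else if let Some(mut head) = heap.peek_mut() {
            if head.0 < value {
                *head = Reverse(value);
            }
        }
    }
    // Return the survivors in descending order.
    let mut result: Vec<u64> = heap.into_iter().map(|Reverse(v)| v).collect();
    result.sort_unstable_by(|a, b| b.cmp(a));
    result
}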

@@ -18,12 +18,10 @@ use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_L
use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::indexer::IndexWriter;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::document::Document;
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{merge_field_meta_data, FieldMetadata, SegmentReader};
use crate::IndexWriter;
fn load_metas(
directory: &dyn Directory,
@@ -186,11 +184,11 @@ impl IndexBuilder {
///
/// It expects an originally empty directory, and will not run any GC operation.
#[doc(hidden)]
pub fn single_segment_index_writer<D: Document>(
pub fn single_segment_index_writer(
self,
dir: impl Into<Box<dyn Directory>>,
mem_budget: usize,
) -> crate::Result<SingleSegmentIndexWriter<D>> {
) -> crate::Result<SingleSegmentIndexWriter> {
let index = self.create(dir)?;
let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
Ok(index_simple_writer)
@@ -490,28 +488,6 @@ impl Index {
self.inventory.all()
}
/// Returns the list of fields that have been indexed in the Index.
/// The field list includes the fields defined in the schema as well as the fields
/// that have been indexed as a part of a JSON field.
/// The returned field name is the full field name, including the name of the JSON field.
///
/// The returned field names can be used in queries.
///
/// Notice: If your data contains JSON fields this is **very expensive**, as it requires
/// browsing through the inverted index term dictionary and the columnar field dictionary.
///
/// Disclaimer: Some fields may not be listed here. For instance, if the schema contains a JSON
/// field that is neither indexed nor a fast field but is stored, that field may
/// not be listed.
pub fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
let segments = self.searchable_segments()?;
let fields_metadata: Vec<Vec<FieldMetadata>> = segments
.into_iter()
.map(|segment| SegmentReader::open(&segment)?.fields_metadata())
.collect::<Result<_, _>>()?;
Ok(merge_field_meta_data(fields_metadata, &self.schema()))
}
/// Creates a new segment_meta (Advanced user only).
///
/// As long as the `SegmentMeta` lives, the files associated with the
@@ -555,11 +531,11 @@ impl Index {
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
/// If the memory arena per thread is too small or too big, returns
/// `TantivyError::InvalidArgument`
pub fn writer_with_num_threads<D: Document>(
pub fn writer_with_num_threads(
&self,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter<D>> {
) -> crate::Result<IndexWriter> {
let directory_lock = self
.directory
.acquire_lock(&INDEX_WRITER_LOCK)
@@ -588,8 +564,8 @@ impl Index {
/// That index writer simply has a single thread and a memory budget of 15 MB.
/// Using a single thread gives us a deterministic allocation of DocId.
#[cfg(test)]
pub fn writer_for_tests<D: Document>(&self) -> crate::Result<IndexWriter<D>> {
self.writer_with_num_threads(1, MEMORY_BUDGET_NUM_BYTES_MIN)
pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
self.writer_with_num_threads(1, 15_000_000)
}
/// Creates a multithreaded writer
@@ -603,10 +579,7 @@ impl Index {
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// If the memory arena per thread is too small or too big, returns
/// `TantivyError::InvalidArgument`
pub fn writer<D: Document>(
&self,
memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter<D>> {
pub fn writer(&self, memory_budget_in_bytes: usize) -> crate::Result<IndexWriter> {
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads;
if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {

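In practice, the constraints described in these doc comments (one lock per directory, the overall memory budget split across indexing threads, a per-thread minimum) all surface at writer-creation time; the call sites earlier in this diff follow the same pattern, sketched here:

// The overall budget is divided among the indexing threads;
// only one writer can hold the directory lock at a time.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
// Alternative (instead of the call above): pin the thread count explicitly,
// e.g. for a deterministic DocId allocation in tests:
// let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;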

@@ -1,12 +1,11 @@
use std::io;
use common::BinarySerializable;
use fnv::FnvHashSet;
use crate::directory::FileSlice;
use crate::positions::PositionReader;
use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo};
use crate::schema::{IndexRecordOption, Term, Type, JSON_END_OF_PATH};
use crate::schema::{IndexRecordOption, Term};
use crate::termdict::TermDictionary;
/// The inverted index reader is in charge of accessing
@@ -70,28 +69,6 @@ impl InvertedIndexReader {
&self.termdict
}
/// Returns the fields and types encoded in the dictionary in lexicographic order.
/// Only valid on JSON fields.
///
/// Notice: This requires a full scan and is therefore **very expensive**.
/// TODO: Move to sstable to use the index.
pub fn list_encoded_fields(&self) -> io::Result<Vec<(String, Type)>> {
let mut stream = self.termdict.stream()?;
let mut fields = Vec::new();
let mut fields_set = FnvHashSet::default();
while let Some((term, _term_info)) = stream.next() {
if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) {
if !fields_set.contains(&term[..index + 2]) {
fields_set.insert(term[..index + 2].to_vec());
let typ = Type::from_code(term[index + 1]).unwrap();
fields.push((String::from_utf8_lossy(&term[..index]).to_string(), typ));
}
}
}
Ok(fields)
}
/// Resets the block segment to another position of the postings
/// file.
///

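The scan above relies on the layout of JSON keys in the term dictionary: the encoded path, a `JSON_END_OF_PATH` marker byte, a one-byte type code, and then the value bytes. A small decoding sketch of that layout (simplified; the marker byte is passed in rather than imported):

// Split a serialized JSON dictionary key into (path bytes, type code, value bytes).
fn decode_json_term(term: &[u8], json_end_of_path: u8) -> Option<(&[u8], u8, &[u8])> {
    let end_of_path = term.iter().position(|&byte| byte == json_end_of_path)?;
    let type_code = *term.get(end_of_path + 1)?;
    let value = &term[end_of_path + 2..];
    Some((&term[..end_of_path], type_code, value))
}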

@@ -1,11 +1,11 @@
use columnar::MonotonicallyMappableToU64;
use common::{replace_in_place, JsonPathWriter};
use common::replace_in_place;
use murmurhash32::murmurhash2;
use rustc_hash::FxHashMap;
use crate::fastfield::FastValue;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::term::JSON_PATH_SEGMENT_SEP;
use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{Field, Type, DATE_TIME_PRECISION_INDEXED};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset};
@@ -57,41 +57,31 @@ struct IndexingPositionsPerPath {
}
impl IndexingPositionsPerPath {
fn get_position_from_id(&mut self, id: u32) -> &mut IndexingPosition {
self.positions_per_path.entry(id).or_default()
fn get_position(&mut self, term: &Term) -> &mut IndexingPosition {
self.positions_per_path
.entry(murmurhash2(term.serialized_term()))
.or_default()
}
}
/// Convert JSON_PATH_SEGMENT_SEP to a dot.
pub fn json_path_sep_to_dot(path: &mut str) {
// This is safe since we are replacing an ASCII character with another ASCII character.
unsafe {
replace_in_place(JSON_PATH_SEGMENT_SEP, b'.', path.as_bytes_mut());
}
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn index_json_values<'a, V: Value<'a>>(
pub(crate) fn index_json_values<'a>(
doc: DocId,
json_visitors: impl Iterator<Item = crate::Result<V::ObjectIter>>,
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
text_analyzer: &mut TextAnalyzer,
expand_dots_enabled: bool,
term_buffer: &mut Term,
postings_writer: &mut dyn PostingsWriter,
json_path_writer: &mut JsonPathWriter,
ctx: &mut IndexingContext,
) -> crate::Result<()> {
json_path_writer.clear();
json_path_writer.set_expand_dots(expand_dots_enabled);
let mut json_term_writer = JsonTermWriter::wrap(term_buffer, expand_dots_enabled);
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
for json_visitor_res in json_visitors {
let json_visitor = json_visitor_res?;
index_json_object::<V>(
for json_value_res in json_values {
let json_value = json_value_res?;
index_json_object(
doc,
json_visitor,
json_value,
text_analyzer,
term_buffer,
json_path_writer,
&mut json_term_writer,
postings_writer,
ctx,
&mut positions_per_path,
@@ -100,154 +90,93 @@ pub(crate) fn index_json_values<'a, V: Value<'a>>(
Ok(())
}
#[allow(clippy::too_many_arguments)]
fn index_json_object<'a, V: Value<'a>>(
fn index_json_object(
doc: DocId,
json_visitor: V::ObjectIter,
json_value: &serde_json::Map<String, serde_json::Value>,
text_analyzer: &mut TextAnalyzer,
term_buffer: &mut Term,
json_path_writer: &mut JsonPathWriter,
json_term_writer: &mut JsonTermWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
positions_per_path: &mut IndexingPositionsPerPath,
) {
for (json_path_segment, json_value_visitor) in json_visitor {
json_path_writer.push(json_path_segment);
for (json_path_segment, json_value) in json_value {
json_term_writer.push_path_segment(json_path_segment);
index_json_value(
doc,
json_value_visitor,
json_value,
text_analyzer,
term_buffer,
json_path_writer,
json_term_writer,
postings_writer,
ctx,
positions_per_path,
);
json_path_writer.pop();
json_term_writer.pop_path_segment();
}
}
#[allow(clippy::too_many_arguments)]
fn index_json_value<'a, V: Value<'a>>(
fn index_json_value(
doc: DocId,
json_value: V,
json_value: &serde_json::Value,
text_analyzer: &mut TextAnalyzer,
term_buffer: &mut Term,
json_path_writer: &mut JsonPathWriter,
json_term_writer: &mut JsonTermWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
positions_per_path: &mut IndexingPositionsPerPath,
) {
let set_path_id = |term_buffer: &mut Term, unordered_id: u32| {
term_buffer.truncate_value_bytes(0);
term_buffer.append_bytes(&unordered_id.to_be_bytes());
};
let set_type = |term_buffer: &mut Term, typ: Type| {
term_buffer.append_bytes(&[typ.to_code()]);
};
match json_value.as_value() {
ReferenceValue::Leaf(leaf) => match leaf {
ReferenceValueLeaf::Null => {}
ReferenceValueLeaf::Str(val) => {
let mut token_stream = text_analyzer.token_stream(val);
let unordered_id = ctx
.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str());
// TODO: make sure the chain position works out.
set_path_id(term_buffer, unordered_id);
set_type(term_buffer, Type::Str);
let indexing_position = positions_per_path.get_position_from_id(unordered_id);
match json_value {
serde_json::Value::Null => {}
serde_json::Value::Bool(val_bool) => {
json_term_writer.set_fast_value(*val_bool);
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
serde_json::Value::Number(number) => {
if let Some(number_i64) = number.as_i64() {
json_term_writer.set_fast_value(number_i64);
} else if let Some(number_u64) = number.as_u64() {
json_term_writer.set_fast_value(number_u64);
} else if let Some(number_f64) = number.as_f64() {
json_term_writer.set_fast_value(number_f64);
}
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
serde_json::Value::String(text) => match infer_type_from_str(text) {
TextOrDateTime::Text(text) => {
let mut token_stream = text_analyzer.token_stream(text);
// TODO make sure the chain position works out.
json_term_writer.close_path_and_set_type(Type::Str);
let indexing_position = positions_per_path.get_position(json_term_writer.term());
postings_writer.index_text(
doc,
&mut *token_stream,
term_buffer,
json_term_writer.term_buffer,
ctx,
indexing_position,
);
}
ReferenceValueLeaf::U64(val) => {
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::I64(val) => {
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::F64(val) => {
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::Bool(val) => {
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::Date(val) => {
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::PreTokStr(_) => {
unimplemented!(
"Pre-tokenized string support in dynamic fields is not yet implemented"
)
}
ReferenceValueLeaf::Bytes(_) => {
unimplemented!("Bytes support in dynamic fields is not yet implemented")
}
ReferenceValueLeaf::Facet(_) => {
unimplemented!("Facet support in dynamic fields is not yet implemented")
}
ReferenceValueLeaf::IpAddr(_) => {
unimplemented!("IP address support in dynamic fields is not yet implemented")
TextOrDateTime::DateTime(dt) => {
json_term_writer.set_fast_value(DateTime::from_utc(dt));
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
},
ReferenceValue::Array(elements) => {
for val in elements {
serde_json::Value::Array(arr) => {
for val in arr {
index_json_value(
doc,
val,
text_analyzer,
term_buffer,
json_path_writer,
json_term_writer,
postings_writer,
ctx,
positions_per_path,
);
}
}
ReferenceValue::Object(object) => {
index_json_object::<V>(
serde_json::Value::Object(map) => {
index_json_object(
doc,
object,
map,
text_analyzer,
term_buffer,
json_path_writer,
json_term_writer,
postings_writer,
ctx,
positions_per_path,
@@ -256,6 +185,21 @@ fn index_json_value<'a, V: Value<'a>>(
}
}
enum TextOrDateTime<'a> {
Text(&'a str),
DateTime(OffsetDateTime),
}
fn infer_type_from_str(text: &str) -> TextOrDateTime {
match OffsetDateTime::parse(text, &Rfc3339) {
Ok(dt) => {
let dt_utc = dt.to_offset(UtcOffset::UTC);
TextOrDateTime::DateTime(dt_utc)
}
Err(_) => TextOrDateTime::Text(text),
}
}
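For illustration, here is a minimal, hypothetical sketch of the inference above, assuming the `time` crate with RFC 3339 parsing enabled (as used in this file): a string that parses as RFC 3339 is indexed as a date fast value, anything else stays text and goes through the tokenizer.

```rust
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;

fn main() {
    // Parses as RFC 3339: treated as a DateTime fast value.
    assert!(OffsetDateTime::parse("2023-10-16T19:23:47Z", &Rfc3339).is_ok());
    // Does not parse: kept as text and tokenized.
    assert!(OffsetDateTime::parse("hello world", &Rfc3339).is_err());
}
```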
// Tries to infer a JSON type from a string.
pub fn convert_to_fast_value_and_get_term(
json_term_writer: &mut JsonTermWriter,
@@ -328,7 +272,7 @@ pub struct JsonTermWriter<'a> {
/// In other words,
/// - `k8s.node` ends up as `["k8s", "node"]`.
/// - `k8s\.node` ends up as `["k8s.node"]`.
pub fn split_json_path(json_path: &str) -> Vec<String> {
fn split_json_path(json_path: &str) -> Vec<String> {
let mut escaped_state: bool = false;
let mut json_path_segments = Vec::new();
let mut buffer = String::new();
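A minimal standalone sketch of the escaping rule documented above (not the tantivy implementation, whose body is only partially visible in this hunk): an unescaped `.` closes a segment, while `\.` keeps the dot inside the segment.

```rust
// Sketch of the documented splitting rule for JSON paths.
fn split_json_path_sketch(json_path: &str) -> Vec<String> {
    let mut segments = Vec::new();
    let mut buffer = String::new();
    let mut escaped = false;
    for ch in json_path.chars() {
        if escaped {
            buffer.push(ch);
            escaped = false;
        } else if ch == '\\' {
            escaped = true;
        } else if ch == '.' {
            segments.push(std::mem::take(&mut buffer));
        } else {
            buffer.push(ch);
        }
    }
    segments.push(buffer);
    segments
}

fn main() {
    assert_eq!(split_json_path_sketch("k8s.node"), vec!["k8s", "node"]);
    assert_eq!(split_json_path_sketch(r"k8s\.node"), vec!["k8s.node"]);
}
```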
@@ -368,13 +312,17 @@ pub(crate) fn encode_column_name(
json_path: &str,
expand_dots_enabled: bool,
) -> String {
let mut path = JsonPathWriter::default();
path.push(field_name);
path.set_expand_dots(expand_dots_enabled);
for segment in split_json_path(json_path) {
path.push(&segment);
let mut column_key: String = String::with_capacity(field_name.len() + json_path.len() + 1);
column_key.push_str(field_name);
for mut segment in split_json_path(json_path) {
column_key.push_str(JSON_PATH_SEGMENT_SEP_STR);
if expand_dots_enabled {
// We need to replace `.` by JSON_PATH_SEGMENT_SEP.
unsafe { replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, segment.as_bytes_mut()) };
}
column_key.push_str(&segment);
}
path.into()
column_key
}
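The resulting column key concatenates the field name and the split path segments. Purely as an illustration (not the tantivy helper itself), and assuming `JSON_PATH_SEGMENT_SEP` is the `'\u{1}'` byte mentioned in a comment later in this diff, a hypothetical field `attributes` with path `k8s.node` would be encoded roughly like this:

```rust
// Illustrative sketch only: assemble a column key from a field name and
// already-split path segments, using '\u{1}' as the assumed separator.
fn encode_column_name_sketch(field_name: &str, segments: &[&str]) -> String {
    let mut key = String::with_capacity(field_name.len() + 16);
    key.push_str(field_name);
    for segment in segments {
        key.push('\u{1}');
        key.push_str(segment);
    }
    key
}

fn main() {
    assert_eq!(
        encode_column_name_sketch("attributes", &["k8s", "node"]),
        "attributes\u{1}k8s\u{1}node"
    );
}
```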
impl<'a> JsonTermWriter<'a> {
@@ -414,7 +362,6 @@ impl<'a> JsonTermWriter<'a> {
self.term_buffer.append_bytes(&[typ.to_code()]);
}
// TODO: Remove this function and use JsonPathWriter instead.
pub fn push_path_segment(&mut self, segment: &str) {
// the path stack should never be empty.
self.trim_to_end_of_path();

View File

@@ -25,7 +25,7 @@ pub use self::searcher::{Searcher, SearcherGeneration};
pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::{merge_field_meta_data, FieldMetadata, SegmentReader};
pub use self::segment_reader::SegmentReader;
pub use self::single_segment_index_writer::SingleSegmentIndexWriter;
/// The meta file contains all the information about the list of segments and the schema

View File

@@ -5,8 +5,7 @@ use std::{fmt, io};
use crate::collector::Collector;
use crate::core::{Executor, SegmentReader};
use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
use crate::schema::document::DocumentDeserialize;
use crate::schema::{Schema, Term};
use crate::schema::{Document, Schema, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::{CacheStats, StoreReader};
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject};
@@ -84,7 +83,7 @@ impl Searcher {
///
/// The searcher uses the segment ordinal to route the
/// request to the right `Segment`.
pub fn doc<D: DocumentDeserialize>(&self, doc_address: DocAddress) -> crate::Result<D> {
pub fn doc(&self, doc_address: DocAddress) -> crate::Result<Document> {
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get(doc_address.doc_id)
}
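As a hedged usage sketch of the non-generic accessor shown in this hunk (the 0.21-style signature where `Searcher::doc` returns a `Document`), fetching a stored document by address could look like this; the field name and memory budget are illustrative only:

```rust
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, DocAddress, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);

    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(title => "tantivy"))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    // The segment ordinal (here 0) routes the request to the right segment.
    let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
    assert!(doc.get_first(title).is_some());
    Ok(())
}
```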
@@ -104,10 +103,7 @@ impl Searcher {
/// Fetches a document in an asynchronous manner.
#[cfg(feature = "quickwit")]
pub async fn doc_async<D: DocumentDeserialize>(
&self,
doc_address: DocAddress,
) -> crate::Result<D> {
pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result<Document> {
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get_async(doc_address.doc_id).await
}

View File

@@ -1,17 +1,12 @@
use std::collections::HashMap;
use std::ops::BitOrAssign;
use std::sync::{Arc, RwLock};
use std::{fmt, io};
use fnv::FnvHashMap;
use itertools::Itertools;
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::json_utils::json_path_sep_to_dot;
use crate::schema::{Field, IndexRecordOption, Schema, Type};
use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader;
@@ -285,103 +280,6 @@ impl SegmentReader {
Ok(inv_idx_reader)
}
/// Returns the list of fields that have been indexed in the segment.
/// The field list includes the fields defined in the schema as well as the fields
/// that have been indexed as part of a JSON field.
/// The returned field name is the full field name, including the name of the JSON field.
///
/// The returned field names can be used in queries.
///
/// Notice: If your data contains JSON fields, this is **very expensive**, as it requires
/// browsing through the inverted index term dictionary and the columnar field dictionary.
///
/// Disclaimer: Some fields may not be listed here. For instance, if the schema contains a JSON
/// field that is neither indexed nor a fast field but is stored, it is possible for the field
/// to not be listed.
pub fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
let mut indexed_fields: Vec<FieldMetadata> = Vec::new();
let mut map_to_canonical = FnvHashMap::default();
for (field, field_entry) in self.schema().fields() {
let field_name = field_entry.name().to_string();
let is_indexed = field_entry.is_indexed();
if is_indexed {
let is_json = field_entry.field_type().value_type() == Type::Json;
if is_json {
let inv_index = self.inverted_index(field)?;
let encoded_fields_in_index = inv_index.list_encoded_fields()?;
let mut build_path = |field_name: &str, mut json_path: String| {
// In this case we need to map the potential fast field to the field name
// accepted by the query parser.
let create_canonical =
!field_entry.is_expand_dots_enabled() && json_path.contains('.');
if create_canonical {
// Without expand_dots enabled, dots need to be escaped.
let escaped_json_path = json_path.replace('.', "\\.");
let full_path = format!("{}.{}", field_name, escaped_json_path);
let full_path_unescaped = format!("{}.{}", field_name, &json_path);
map_to_canonical.insert(full_path_unescaped, full_path.to_string());
full_path
} else {
// With expand dots enabled, we can use '.' instead of '\u{1}'.
json_path_sep_to_dot(&mut json_path);
format!("{}.{}", field_name, json_path)
}
};
indexed_fields.extend(
encoded_fields_in_index
.into_iter()
.map(|(name, typ)| (build_path(&field_name, name), typ))
.map(|(field_name, typ)| FieldMetadata {
indexed: true,
stored: false,
field_name,
fast: false,
typ,
}),
);
} else {
indexed_fields.push(FieldMetadata {
indexed: true,
stored: false,
field_name: field_name.to_string(),
fast: false,
typ: field_entry.field_type().value_type(),
});
}
}
}
let mut fast_fields: Vec<FieldMetadata> = self
.fast_fields()
.columnar()
.iter_columns()?
.map(|(mut field_name, handle)| {
json_path_sep_to_dot(&mut field_name);
// map to canonical path, to avoid similar but different entries.
// Eventually we should just accept '.'-separated paths for all cases.
let field_name = map_to_canonical
.get(&field_name)
.unwrap_or(&field_name)
.to_string();
FieldMetadata {
indexed: false,
stored: false,
field_name,
fast: true,
typ: Type::from(handle.column_type()),
}
})
.collect();
// Since the type is encoded differently in the fast field and in the inverted index,
// the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
// If we are sure that the order is the same, we can remove this sort.
indexed_fields.sort_unstable();
fast_fields.sort_unstable();
let merged = merge_field_meta_data(vec![indexed_fields, fast_fields], &self.schema);
Ok(merged)
}
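A hedged sketch of consuming the result, assuming the 0.22-dev side of this hunk (where `fields_metadata` and the crate-level `FieldMetadata` export exist) and an already-built `Searcher`:

```rust
use tantivy::{FieldMetadata, Searcher};

// Print one line per field known to each segment. `field_name` is the full
// path, including JSON sub-paths as described above.
fn print_segment_fields(searcher: &Searcher) -> tantivy::Result<()> {
    for segment_reader in searcher.segment_readers() {
        let fields: Vec<FieldMetadata> = segment_reader.fields_metadata()?;
        for meta in fields {
            println!(
                "{} ({:?}) indexed={} fast={} stored={}",
                meta.field_name, meta.typ, meta.indexed, meta.fast, meta.stored
            );
        }
    }
    Ok(())
}
```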
/// Returns the segment id
pub fn segment_id(&self) -> SegmentId {
self.segment_id
@@ -432,65 +330,6 @@ impl SegmentReader {
}
}
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
/// FieldMetadata
pub struct FieldMetadata {
/// The field name
// Notice: Don't reorder the declaration of 1.field_name 2.typ, as it is used for ordering by
// field_name then typ.
pub field_name: String,
/// The field type
// Notice: Don't reorder the declaration of 1.field_name 2.typ, as it is used for ordering by
// field_name then typ.
pub typ: Type,
/// Is the field indexed for search
pub indexed: bool,
/// Is the field stored in the doc store
pub stored: bool,
/// Is the field stored in the columnar storage
pub fast: bool,
}
impl BitOrAssign for FieldMetadata {
fn bitor_assign(&mut self, rhs: Self) {
assert!(self.field_name == rhs.field_name);
assert!(self.typ == rhs.typ);
self.indexed |= rhs.indexed;
self.stored |= rhs.stored;
self.fast |= rhs.fast;
}
}
// Maybe too slow for the high cardinality case
fn is_field_stored(field_name: &str, schema: &Schema) -> bool {
schema
.find_field(field_name)
.map(|(field, _path)| schema.get_field_entry(field).is_stored())
.unwrap_or(false)
}
/// Helper to merge the field metadata from multiple segments.
pub fn merge_field_meta_data(
field_metadatas: Vec<Vec<FieldMetadata>>,
schema: &Schema,
) -> Vec<FieldMetadata> {
let mut merged_field_metadata = Vec::new();
for (_key, mut group) in &field_metadatas
.into_iter()
.kmerge_by(|left, right| left < right)
// TODO: Remove allocation
.group_by(|el| (el.field_name.to_string(), el.typ))
{
let mut merged: FieldMetadata = group.next().unwrap();
for el in group {
merged |= el;
}
// Currently is_field_stored is maybe too slow for the high cardinality case
merged.stored = is_field_stored(&merged.field_name, schema);
merged_field_metadata.push(merged);
}
merged_field_metadata
}
fn intersect_alive_bitset(
left_opt: Option<AliveBitSet>,
right_opt: Option<AliveBitSet>,
@@ -514,127 +353,9 @@ impl fmt::Debug for SegmentReader {
#[cfg(test)]
mod test {
use super::*;
use crate::core::Index;
use crate::schema::{Schema, SchemaBuilder, Term, STORED, TEXT};
use crate::{DocId, FieldMetadata, IndexWriter};
#[test]
fn test_merge_field_meta_data_same() {
let schema = SchemaBuilder::new().build();
let field_metadata1 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: true,
};
let field_metadata2 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: true,
};
let res = merge_field_meta_data(
vec![vec![field_metadata1.clone()], vec![field_metadata2]],
&schema,
);
assert_eq!(res, vec![field_metadata1]);
}
#[test]
fn test_merge_field_meta_data_different() {
let schema = SchemaBuilder::new().build();
let field_metadata1 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: false,
stored: false,
fast: true,
};
let field_metadata2 = FieldMetadata {
field_name: "b".to_string(),
typ: crate::schema::Type::Str,
indexed: false,
stored: false,
fast: true,
};
let field_metadata3 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: false,
};
let res = merge_field_meta_data(
vec![
vec![field_metadata1.clone(), field_metadata2.clone()],
vec![field_metadata3],
],
&schema,
);
let field_metadata_expected1 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: true,
};
assert_eq!(res, vec![field_metadata_expected1, field_metadata2.clone()]);
}
#[test]
fn test_merge_field_meta_data_merge() {
use pretty_assertions::assert_eq;
let get_meta_data = |name: &str, typ: Type| FieldMetadata {
field_name: name.to_string(),
typ,
indexed: false,
stored: false,
fast: true,
};
let schema = SchemaBuilder::new().build();
let mut metas = vec![get_meta_data("d", Type::Str), get_meta_data("e", Type::U64)];
metas.sort();
let res = merge_field_meta_data(vec![vec![get_meta_data("e", Type::Str)], metas], &schema);
assert_eq!(
res,
vec![
get_meta_data("d", Type::Str),
get_meta_data("e", Type::Str),
get_meta_data("e", Type::U64),
]
);
}
#[test]
fn test_merge_field_meta_data_bitxor() {
let field_metadata1 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: false,
stored: false,
fast: true,
};
let field_metadata2 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: false,
};
let field_metadata_expected = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: true,
};
let mut res1 = field_metadata1.clone();
res1 |= field_metadata2.clone();
let mut res2 = field_metadata2.clone();
res2 |= field_metadata1;
assert_eq!(res1, field_metadata_expected);
assert_eq!(res2, field_metadata_expected);
}
use crate::schema::{Schema, Term, STORED, TEXT};
use crate::DocId;
#[test]
fn test_num_alive() -> crate::Result<()> {
@@ -645,7 +366,7 @@ mod test {
let name = schema.get_field("name").unwrap();
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?;
index_writer.add_document(doc!(name => "horse"))?;
index_writer.add_document(doc!(name => "jockey"))?;
@@ -671,7 +392,7 @@ mod test {
let name = schema.get_field("name").unwrap();
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?;
index_writer.add_document(doc!(name => "horse"))?;
index_writer.add_document(doc!(name => "jockey"))?;
@@ -681,7 +402,7 @@ mod test {
}
{
let mut index_writer2: IndexWriter = index.writer(50_000_000)?;
let mut index_writer2 = index.writer(50_000_000)?;
index_writer2.delete_term(Term::from_field_text(name, "horse"));
index_writer2.delete_term(Term::from_field_text(name, "cap"));

View File

@@ -1,20 +1,16 @@
use std::marker::PhantomData;
use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter;
use crate::schema::document::Document;
use crate::{Directory, Index, IndexMeta, Opstamp, Segment, TantivyDocument};
use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
#[doc(hidden)]
pub struct SingleSegmentIndexWriter<D: Document = TantivyDocument> {
pub struct SingleSegmentIndexWriter {
segment_writer: SegmentWriter,
segment: Segment,
opstamp: Opstamp,
_phantom: PhantomData<D>,
}
impl<D: Document> SingleSegmentIndexWriter<D> {
impl SingleSegmentIndexWriter {
pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
let segment = index.new_segment();
let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
@@ -22,7 +18,6 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
segment_writer,
segment,
opstamp: 0,
_phantom: PhantomData,
})
}
@@ -30,7 +25,7 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
self.segment_writer.mem_usage()
}
pub fn add_document(&mut self, document: D) -> crate::Result<()> {
pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
let opstamp = self.opstamp;
self.opstamp += 1;
self.segment_writer

View File

@@ -1,13 +1,12 @@
use crate::collector::Count;
use crate::directory::{RamDirectory, WatchCallback};
use crate::indexer::{LogMergePolicy, NoMergePolicy};
use crate::json_utils::JsonTermWriter;
use crate::indexer::NoMergePolicy;
use crate::query::TermQuery;
use crate::schema::{Field, IndexRecordOption, Schema, Type, INDEXED, STRING, TEXT};
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
use crate::tokenizer::TokenizerManager;
use crate::{
Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, Postings,
ReloadPolicy, SegmentId, TantivyDocument, Term,
Directory, Document, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, SegmentId,
Term,
};
#[test]
@@ -122,7 +121,7 @@ fn test_index_on_commit_reload_policy() -> crate::Result<()> {
let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
@@ -148,7 +147,7 @@ mod mmap_specific {
let index = Index::create_in_dir(tempdir_path, schema).unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
@@ -160,7 +159,7 @@ mod mmap_specific {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let mut index = Index::create_from_tempdir(schema)?;
let mut writer: IndexWriter = index.writer_for_tests()?;
let mut writer = index.writer_for_tests()?;
writer.commit()?;
let reader = index
.reader_builder()
@@ -190,7 +189,7 @@ mod mmap_specific {
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
let reader = read_index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
@@ -209,7 +208,7 @@ fn test_index_on_commit_reload_policy_aux(
.watch(WatchCallback::new(move || {
let _ = sender.send(());
}));
let mut writer: IndexWriter = index.writer_for_tests()?;
let mut writer = index.writer_for_tests()?;
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64))?;
writer.commit().unwrap();
@@ -243,7 +242,7 @@ fn garbage_collect_works_as_intended() -> crate::Result<()> {
let field = schema.get_field("num_likes").unwrap();
let index = Index::create(directory.clone(), schema, IndexSettings::default())?;
let mut writer: IndexWriter = index.writer_with_num_threads(1, 32_000_000).unwrap();
let mut writer = index.writer_with_num_threads(1, 32_000_000).unwrap();
for _seg in 0..8 {
for i in 0u64..1_000u64 {
writer.add_document(doc!(field => i))?;
@@ -307,7 +306,7 @@ fn test_merging_segment_update_docfreq() {
let id_field = schema_builder.add_text_field("id", STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let mut writer = index.writer_for_tests().unwrap();
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..5 {
writer.add_document(doc!(text_field=>"hello")).unwrap();
@@ -318,13 +317,13 @@ fn test_merging_segment_update_docfreq() {
writer
.add_document(doc!(text_field=>"hello", id_field=>"TO_BE_DELETED"))
.unwrap();
writer.add_document(TantivyDocument::default()).unwrap();
writer.add_document(Document::default()).unwrap();
writer.commit().unwrap();
for _ in 0..7 {
writer.add_document(doc!(text_field=>"hello")).unwrap();
}
writer.add_document(TantivyDocument::default()).unwrap();
writer.add_document(TantivyDocument::default()).unwrap();
writer.add_document(Document::default()).unwrap();
writer.add_document(Document::default()).unwrap();
writer.delete_term(Term::from_field_text(id_field, "TO_BE_DELETED"));
writer.commit().unwrap();
@@ -345,132 +344,3 @@ fn test_merging_segment_update_docfreq() {
let term_info = inv_index.get_term_info(&term).unwrap().unwrap();
assert_eq!(term_info.doc_freq, 12);
}
// motivated by https://github.com/quickwit-oss/quickwit/issues/4130
#[test]
fn test_positions_merge_bug_non_text_json_vint() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_json_field("dynamic", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let mut merge_policy = LogMergePolicy::default();
merge_policy.set_min_num_segments(2);
writer.set_merge_policy(Box::new(merge_policy));
// Here a string would work.
let doc_json = r#"{"tenant_id":75}"#;
let vals = serde_json::from_str(doc_json).unwrap();
let mut doc = TantivyDocument::default();
doc.add_object(field, vals);
writer.add_document(doc.clone()).unwrap();
writer.commit().unwrap();
writer.add_document(doc.clone()).unwrap();
writer.commit().unwrap();
writer.wait_merging_threads().unwrap();
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().segment_readers().len(), 1);
}
// Same as above but with bitpacked blocks
#[test]
fn test_positions_merge_bug_non_text_json_bitpacked_block() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_json_field("dynamic", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let mut merge_policy = LogMergePolicy::default();
merge_policy.set_min_num_segments(2);
writer.set_merge_policy(Box::new(merge_policy));
// Here a string would work.
let doc_json = r#"{"tenant_id":75}"#;
let vals = serde_json::from_str(doc_json).unwrap();
let mut doc = TantivyDocument::default();
doc.add_object(field, vals);
for _ in 0..128 {
writer.add_document(doc.clone()).unwrap();
}
writer.commit().unwrap();
writer.add_document(doc.clone()).unwrap();
writer.commit().unwrap();
writer.wait_merging_threads().unwrap();
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().segment_readers().len(), 1);
}
#[test]
fn test_non_text_json_term_freq() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_json_field("dynamic", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
// Here a string would work.
let doc_json = r#"{"tenant_id":75}"#;
let vals = serde_json::from_str(doc_json).unwrap();
let mut doc = TantivyDocument::default();
doc.add_object(field, vals);
writer.add_document(doc.clone()).unwrap();
writer.commit().unwrap();
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().segment_readers().len(), 1);
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inv_idx = segment_reader.inverted_index(field).unwrap();
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
json_term_writer.push_path_segment("tenant_id");
json_term_writer.close_path_and_set_type(Type::U64);
json_term_writer.set_fast_value(75u64);
let postings = inv_idx
.read_postings(
&json_term_writer.term(),
IndexRecordOption::WithFreqsAndPositions,
)
.unwrap()
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.term_freq(), 1u32);
}
#[test]
fn test_non_text_json_term_freq_bitpacked() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_json_field("dynamic", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
// Here a string would work.
let doc_json = r#"{"tenant_id":75}"#;
let vals = serde_json::from_str(doc_json).unwrap();
let mut doc = TantivyDocument::default();
doc.add_object(field, vals);
let num_docs = 132;
for _ in 0..num_docs {
writer.add_document(doc.clone()).unwrap();
}
writer.commit().unwrap();
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().segment_readers().len(), 1);
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inv_idx = segment_reader.inverted_index(field).unwrap();
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
json_term_writer.push_path_segment("tenant_id");
json_term_writer.close_path_and_set_type(Type::U64);
json_term_writer.set_fast_value(75u64);
let mut postings = inv_idx
.read_postings(
&json_term_writer.term(),
IndexRecordOption::WithFreqsAndPositions,
)
.unwrap()
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.term_freq(), 1u32);
for i in 1..num_docs {
assert_eq!(postings.advance(), i);
assert_eq!(postings.term_freq(), 1u32);
}
}

View File

@@ -222,8 +222,8 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// registered (and whose [`WatchHandle`] is still alive) are triggered.
///
/// Internally, tantivy only uses this API to detect new commits to implement the
/// `OnCommitWithDelay` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents
/// the `OnCommitWithDelay` `ReloadPolicy` from working properly.
/// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
/// `OnCommit` `ReloadPolicy` from working properly.
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle>;
}

View File

@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
use crate::directory::error::Incompatibility;
use crate::directory::{AntiCallToken, FileSlice, TerminatingWrite};
use crate::{Version, INDEX_FORMAT_OLDEST_SUPPORTED_VERSION, INDEX_FORMAT_VERSION};
use crate::{Version, INDEX_FORMAT_VERSION};
const FOOTER_MAX_LEN: u32 = 50_000;
@@ -102,11 +102,10 @@ impl Footer {
/// Confirms that the index will be read correctly by this version of tantivy.
/// Has to be called after `extract_footer` to make sure it's not accessing uninitialised memory.
pub fn is_compatible(&self) -> Result<(), Incompatibility> {
const SUPPORTED_INDEX_FORMAT_VERSION_RANGE: std::ops::RangeInclusive<u32> =
INDEX_FORMAT_OLDEST_SUPPORTED_VERSION..=INDEX_FORMAT_VERSION;
let library_version = crate::version();
if !SUPPORTED_INDEX_FORMAT_VERSION_RANGE.contains(&self.version.index_format_version) {
if self.version.index_format_version < 4
|| self.version.index_format_version > INDEX_FORMAT_VERSION
{
return Err(Incompatibility::IndexMismatch {
library_version: library_version.clone(),
index_version: self.version.clone(),

View File

@@ -1,15 +1,13 @@
use std::collections::HashMap;
use std::fmt;
use std::fs::{self, File, OpenOptions};
use std::io::{self, BufWriter, Read, Write};
use std::io::{self, BufWriter, Read, Seek, Write};
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock, Weak};
use common::StableDeref;
use fs4::FileExt;
#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
use memmap2::Mmap;
use serde::{Deserialize, Serialize};
use tempfile::TempDir;
@@ -23,6 +21,8 @@ use crate::directory::{
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
WatchCallback, WatchHandle, WritePtr,
};
#[cfg(unix)]
use crate::Advice;
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
@@ -328,6 +328,12 @@ impl Write for SafeFileWriter {
}
}
impl Seek for SafeFileWriter {
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
self.0.seek(pos)
}
}
impl TerminatingWrite for SafeFileWriter {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
self.0.flush()?;
@@ -533,7 +539,7 @@ mod tests {
use super::*;
use crate::indexer::LogMergePolicy;
use crate::schema::{Schema, SchemaBuilder, TEXT};
use crate::{Index, IndexSettings, IndexWriter, ReloadPolicy};
use crate::{Index, IndexSettings, ReloadPolicy};
#[test]
fn test_open_non_existent_path() {
@@ -645,7 +651,7 @@ mod tests {
let index =
Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_num_segments(3);
index_writer.set_merge_policy(Box::new(log_merge_policy));

View File

@@ -42,9 +42,6 @@ pub struct GarbageCollectionResult {
pub failed_to_delete_files: Vec<PathBuf>,
}
#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
pub use self::managed_directory::ManagedDirectory;
#[cfg(feature = "mmap")]
pub use self::mmap_directory::MmapDirectory;

View File

@@ -1,5 +1,5 @@
use std::collections::HashMap;
use std::io::{self, BufWriter, Cursor, Write};
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock};
use std::{fmt, result};
@@ -48,6 +48,12 @@ impl Drop for VecWriter {
}
}
impl Seek for VecWriter {
fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
self.data.seek(pos)
}
}
impl Write for VecWriter {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.is_flushed = false;

View File

@@ -17,7 +17,7 @@ pub trait DocSet: Send {
///
/// The DocId of the next element is returned.
/// In other words, we should always have:
/// ```compile_fail
/// ```ignore
/// let doc = docset.advance();
/// assert_eq!(doc, docset.doc());
/// ```

View File

@@ -11,7 +11,6 @@ use crate::directory::error::{
Incompatibility, LockError, OpenDirectoryError, OpenReadError, OpenWriteError,
};
use crate::fastfield::FastFieldNotAvailableError;
use crate::schema::document::DeserializeError;
use crate::{query, schema};
/// Represents a `DataCorruption` error.
@@ -107,9 +106,6 @@ pub enum TantivyError {
/// e.g. a data structure is incorrectly initialized.
#[error("Internal error: '{0}'")]
InternalError(String),
#[error("Deserialize error: {0}")]
/// An error occurred while attempting to deserialize a document.
DeserializeError(DeserializeError),
}
impl From<io::Error> for TantivyError {
@@ -180,9 +176,3 @@ impl From<rayon::ThreadPoolBuildError> for TantivyError {
TantivyError::SystemError(error.to_string())
}
}
impl From<DeserializeError> for TantivyError {
fn from(error: DeserializeError) -> TantivyError {
TantivyError::DeserializeError(error)
}
}

View File

@@ -62,9 +62,8 @@ impl FacetReader {
#[cfg(test)]
mod tests {
use crate::schema::document::Value;
use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED};
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED};
use crate::{DocAddress, Document, Index};
#[test]
fn test_facet_only_indexed() {
@@ -72,7 +71,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))
.unwrap();
@@ -86,10 +85,8 @@ mod tests {
let mut facet = Facet::default();
facet_reader.facet_from_ord(0, &mut facet).unwrap();
assert_eq!(facet.to_path_string(), "/a/b");
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))
.unwrap();
let value = doc.get_first(facet_field).and_then(|v| v.as_facet());
let doc = searcher.doc(DocAddress::new(0u32, 0u32)).unwrap();
let value = doc.get_first(facet_field).and_then(Value::as_facet);
assert_eq!(value, None);
}
@@ -99,7 +96,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(facet_field=>Facet::from_text("/parent/child1").unwrap()))
.unwrap();
@@ -145,8 +142,8 @@ mod tests {
let mut facet_ords = Vec::new();
facet_ords.extend(facet_reader.facet_ords(0u32));
assert_eq!(&facet_ords, &[0u64]);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))?;
let value: Option<&Facet> = doc.get_first(facet_field).and_then(|v| v.as_facet());
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::as_facet);
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
Ok(())
}
@@ -159,7 +156,7 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.add_document(Document::default())?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
@@ -179,8 +176,8 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.add_document(Document::default())?;
index_writer.add_document(Document::default())?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();

View File

@@ -90,12 +90,12 @@ mod tests {
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::{
Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, TantivyDocument,
Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder,
TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::time::OffsetDateTime;
use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
use crate::{DateOptions, DateTimePrecision, Index, IndexWriter, SegmentId, SegmentReader};
use crate::{DateOptions, DateTimePrecision, Index, SegmentId, SegmentReader};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();
@@ -131,7 +131,7 @@ mod tests {
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 80);
assert_eq!(file.len(), 93);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let column = fast_field_readers
.u64("field")
@@ -181,7 +181,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 108);
assert_eq!(file.len(), 121);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let col = fast_field_readers
.u64("field")
@@ -214,7 +214,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 81);
assert_eq!(file.len(), 94);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let fast_field_reader = fast_field_readers
.u64("field")
@@ -246,7 +246,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 4476);
assert_eq!(file.len(), 4489);
{
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let col = fast_field_readers
@@ -271,7 +271,7 @@ mod tests {
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
for i in -100i64..10_000i64 {
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
doc.add_i64(i64_field, i);
fast_field_writers.add_document(&doc).unwrap();
}
@@ -279,7 +279,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 252);
assert_eq!(file.len(), 265);
{
let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
@@ -312,7 +312,7 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default();
let doc = Document::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
write.terminate().unwrap();
@@ -345,7 +345,7 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default();
let doc = Document::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
write.terminate().unwrap();
@@ -416,7 +416,7 @@ mod tests {
let date_field = schema_builder.add_date_field("date", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer
.add_document(doc!(date_field => DateTime::from_utc(OffsetDateTime::now_utc())))
@@ -452,7 +452,7 @@ mod tests {
{
// first segment
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer
.add_document(doc!(
@@ -506,7 +506,7 @@ mod tests {
{
// second segment
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(
@@ -537,7 +537,7 @@ mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.merge(&segment_ids).wait().unwrap();
index_writer.wait_merging_threads().unwrap();
}
@@ -662,7 +662,7 @@ mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -773,7 +773,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 84);
assert_eq!(file.len(), 102);
let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
let bool_col = fast_field_readers.bool("field_bool").unwrap();
assert_eq!(bool_col.first(0), Some(true));
@@ -805,7 +805,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 96);
assert_eq!(file.len(), 114);
let readers = FastFieldReaders::open(file, schema).unwrap();
let bool_col = readers.bool("field_bool").unwrap();
for i in 0..25 {
@@ -824,13 +824,13 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(path).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default();
let doc = Document::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 86);
assert_eq!(file.len(), 104);
let fastfield_readers = FastFieldReaders::open(file, schema).unwrap();
let col = fastfield_readers.bool("field_bool").unwrap();
assert_eq!(col.first(0), None);
@@ -846,7 +846,7 @@ mod tests {
assert_eq!(col.get_val(0), true);
}
fn get_index(docs: &[crate::TantivyDocument], schema: &Schema) -> crate::Result<RamDirectory> {
fn get_index(docs: &[crate::Document], schema: &Schema) -> crate::Result<RamDirectory> {
let directory: RamDirectory = RamDirectory::create();
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
@@ -888,7 +888,7 @@ mod tests {
let field = schema_builder.add_date_field("field", date_options);
let schema = schema_builder.build();
let docs: Vec<TantivyDocument> = times.iter().map(|time| doc!(field=>*time)).collect();
let docs: Vec<Document> = times.iter().map(|time| doc!(field=>*time)).collect();
let directory = get_index(&docs[..], &schema).unwrap();
let path = Path::new("test");
@@ -962,15 +962,11 @@ mod tests {
let ip_field = schema_builder.add_u64_field("ip", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let ip_addr = Ipv6Addr::new(1, 2, 3, 4, 5, 1, 2, 3);
index_writer
.add_document(TantivyDocument::default())
.unwrap();
index_writer.add_document(Document::default()).unwrap();
index_writer.add_document(doc!(ip_field=>ip_addr)).unwrap();
index_writer
.add_document(TantivyDocument::default())
.unwrap();
index_writer.add_document(Document::default()).unwrap();
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let fastfields = searcher.segment_reader(0u32).fast_fields();
@@ -1090,7 +1086,7 @@ mod tests {
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"attr.age": 32})))
.unwrap();
@@ -1116,7 +1112,7 @@ mod tests {
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"age": 32})))
.unwrap();
@@ -1143,7 +1139,7 @@ mod tests {
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"attr.age": 32})))
.unwrap();
@@ -1166,7 +1162,7 @@ mod tests {
let field_with_dot = schema_builder.add_i64_field("field.with.dot", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(field_with_dot => 32i64))
.unwrap();
@@ -1188,7 +1184,7 @@ mod tests {
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json_field=> json!({"attr": {"age": 32}}), shadowing_json_field=>json!({"age": 33})))
.unwrap();
@@ -1219,7 +1215,7 @@ mod tests {
let mut index = Index::create_in_ram(schema);
index.set_fast_field_tokenizers(ff_tokenizer_manager);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(text_field => "Test1 test2"))
.unwrap();
@@ -1248,7 +1244,7 @@ mod tests {
let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(log_field => "info"))
.unwrap();
@@ -1281,25 +1277,18 @@ mod tests {
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json_field=> json!({"attr.age": 32}), shadowing_json_field=>json!({"age": 33})))
.unwrap();
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let fast_field_reader = searcher.segment_reader(0u32).fast_fields();
// Supported for now, maybe dropped in the future.
let column = fast_field_reader
.column_opt::<i64>("jsonfield.attr.age")
.unwrap()
.unwrap();
let vals: Vec<i64> = column.values_for_doc(0u32).collect();
assert_eq!(&vals, &[33]);
let column = fast_field_reader
.column_opt::<i64>("jsonfield\\.attr.age")
.unwrap()
.unwrap();
let vals: Vec<i64> = column.values_for_doc(0u32).collect();
assert_eq!(&vals, &[33]);
}
}

View File

@@ -357,7 +357,7 @@ mod tests {
use columnar::ColumnType;
use crate::schema::{JsonObjectOptions, Schema, FAST};
use crate::{Index, IndexWriter, TantivyDocument};
use crate::{Document, Index};
#[test]
fn test_fast_field_reader_resolve_with_dynamic_internal() {
@@ -373,10 +373,8 @@ mod tests {
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(TantivyDocument::default())
.unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(Document::default()).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
@@ -445,7 +443,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(id=> 1u64, json => json!({"foo": 42})))
.unwrap();

View File

@@ -1,12 +1,12 @@
use std::io;
use columnar::{ColumnarWriter, NumericalValue};
use common::JsonPathWriter;
use common::replace_in_place;
use tokenizer_api::Token;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{value_type_to_column_type, Document, FieldType, Schema, Type, Value};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{DateTimePrecision, DocId, TantivyError};
@@ -23,7 +23,7 @@ pub struct FastFieldsWriter {
expand_dots: Vec<bool>,
num_docs: DocId,
// Buffer that we recycle to avoid allocation.
json_path_buffer: JsonPathWriter,
json_path_buffer: String,
}
impl FastFieldsWriter {
@@ -97,7 +97,7 @@ impl FastFieldsWriter {
num_docs: 0u32,
date_precisions,
expand_dots,
json_path_buffer: JsonPathWriter::default(),
json_path_buffer: String::new(),
})
}
@@ -117,121 +117,114 @@ impl FastFieldsWriter {
}
/// Indexes all of the fastfields of a new document.
pub fn add_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
let doc_id = self.num_docs;
for (field, value) in doc.iter_fields_and_values() {
let value_access = value as D::Value<'_>;
for field_value in doc.field_values() {
if let Some(field_name) =
&self.fast_field_names[field_value.field().field_id() as usize]
{
match &field_value.value {
Value::U64(u64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*u64_val),
);
}
Value::I64(i64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*i64_val),
);
}
Value::F64(f64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*f64_val),
);
}
Value::Str(text_val) => {
if let Some(tokenizer) =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize]
{
let mut token_stream = tokenizer.token_stream(text_val);
token_stream.process(&mut |token: &Token| {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
&token.text,
);
})
} else {
self.columnar_writer
.record_str(doc_id, field_name.as_str(), text_val);
}
}
Value::Bytes(bytes_val) => {
self.columnar_writer
.record_bytes(doc_id, field_name.as_str(), bytes_val);
}
Value::PreTokStr(pre_tok) => {
for token in &pre_tok.tokens {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
&token.text,
);
}
}
Value::Bool(bool_val) => {
self.columnar_writer
.record_bool(doc_id, field_name.as_str(), *bool_val);
}
Value::Date(datetime) => {
let date_precision =
self.date_precisions[field_value.field().field_id() as usize];
let truncated_datetime = datetime.truncate(date_precision);
self.columnar_writer.record_datetime(
doc_id,
field_name.as_str(),
truncated_datetime,
);
}
Value::Facet(facet) => {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
facet.encoded_str(),
);
}
Value::JsonObject(json_obj) => {
let expand_dots = self.expand_dots[field_value.field().field_id() as usize];
self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name);
self.add_doc_value(doc_id, field, value_access)?;
let text_analyzer =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
Value::IpAddr(ip_addr) => {
self.columnar_writer
.record_ip_addr(doc_id, field_name.as_str(), *ip_addr);
}
}
}
}
self.num_docs += 1;
Ok(())
}
fn add_doc_value<'a, V: Value<'a>>(
&mut self,
doc_id: DocId,
field: Field,
value: V,
) -> crate::Result<()> {
let field_name = match &self.fast_field_names[field.field_id() as usize] {
None => return Ok(()),
Some(name) => name,
};
match value.as_value() {
ReferenceValue::Leaf(leaf) => match leaf {
ReferenceValueLeaf::Null => {}
ReferenceValueLeaf::Str(val) => {
if let Some(tokenizer) =
&mut self.per_field_tokenizer[field.field_id() as usize]
{
let mut token_stream = tokenizer.token_stream(val);
token_stream.process(&mut |token: &Token| {
self.columnar_writer
.record_str(doc_id, field_name, &token.text);
})
} else {
self.columnar_writer.record_str(doc_id, field_name, val);
}
}
ReferenceValueLeaf::U64(val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name,
NumericalValue::from(val),
);
}
ReferenceValueLeaf::I64(val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name,
NumericalValue::from(val),
);
}
ReferenceValueLeaf::F64(val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name,
NumericalValue::from(val),
);
}
ReferenceValueLeaf::Date(val) => {
let date_precision = self.date_precisions[field.field_id() as usize];
let truncated_datetime = val.truncate(date_precision);
self.columnar_writer
.record_datetime(doc_id, field_name, truncated_datetime);
}
ReferenceValueLeaf::Facet(val) => {
self.columnar_writer
.record_str(doc_id, field_name, val.encoded_str());
}
ReferenceValueLeaf::Bytes(val) => {
self.columnar_writer.record_bytes(doc_id, field_name, val);
}
ReferenceValueLeaf::IpAddr(val) => {
self.columnar_writer.record_ip_addr(doc_id, field_name, val);
}
ReferenceValueLeaf::Bool(val) => {
self.columnar_writer.record_bool(doc_id, field_name, val);
}
ReferenceValueLeaf::PreTokStr(val) => {
for token in &val.tokens {
self.columnar_writer
.record_str(doc_id, field_name, &token.text);
}
}
},
ReferenceValue::Array(val) => {
// TODO: Check this is the correct behaviour we want.
for value in val {
self.add_doc_value(doc_id, field, value)?;
}
}
ReferenceValue::Object(val) => {
let expand_dots = self.expand_dots[field.field_id() as usize];
self.json_path_buffer.clear();
// First field should not be expanded.
self.json_path_buffer.set_expand_dots(false);
self.json_path_buffer.push(field_name);
self.json_path_buffer.set_expand_dots(expand_dots);
let text_analyzer = &mut self.per_field_tokenizer[field.field_id() as usize];
record_json_obj_to_columnar_writer::<V>(
doc_id,
val,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
}
Ok(())
}
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(
@@ -248,33 +241,66 @@ impl FastFieldsWriter {
}
}
fn record_json_obj_to_columnar_writer<'a, V: Value<'a>>(
#[inline]
fn columnar_numerical_value(json_number: &serde_json::Number) -> Option<NumericalValue> {
if let Some(num_i64) = json_number.as_i64() {
return Some(num_i64.into());
}
if let Some(num_u64) = json_number.as_u64() {
return Some(num_u64.into());
}
if let Some(num_f64) = json_number.as_f64() {
return Some(num_f64.into());
}
// This can happen with arbitrary-precision numbers, but we do not handle them.
None
}
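The fallback order matters for large and fractional numbers. A small sketch of that precedence using `serde_json` alone, with hypothetical values: `i64` is tried first, then `u64`, then `f64`.

```rust
fn main() {
    let small: serde_json::Number = serde_json::from_str("75").unwrap();
    assert_eq!(small.as_i64(), Some(75));

    let big: serde_json::Number = serde_json::from_str("18446744073709551615").unwrap();
    assert_eq!(big.as_i64(), None); // does not fit in an i64
    assert_eq!(big.as_u64(), Some(u64::MAX)); // falls back to u64

    let frac: serde_json::Number = serde_json::from_str("0.5").unwrap();
    assert_eq!(frac.as_u64(), None);
    assert_eq!(frac.as_f64(), Some(0.5)); // falls back to f64
}
```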
fn record_json_obj_to_columnar_writer(
doc: DocId,
json_visitor: V::ObjectIter,
json_obj: &serde_json::Map<String, serde_json::Value>,
expand_dots: bool,
remaining_depth_limit: usize,
json_path_buffer: &mut JsonPathWriter,
json_path_buffer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &mut Option<TextAnalyzer>,
) {
for (key, child) in json_visitor {
json_path_buffer.push(key);
for (key, child) in json_obj {
let len_path = json_path_buffer.len();
if !json_path_buffer.is_empty() {
json_path_buffer.push_str(JSON_PATH_SEGMENT_SEP_STR);
}
json_path_buffer.push_str(key);
if expand_dots {
// This might include the separation byte, which is ok because it is not a dot.
let appended_segment = &mut json_path_buffer[len_path..];
// The unsafe block below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
// valid single-byte UTF-8 strings.
// By UTF-8 design, they cannot be part of another code point.
replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, unsafe {
appended_segment.as_bytes_mut()
});
}
record_json_value_to_columnar_writer(
doc,
child,
expand_dots,
remaining_depth_limit,
json_path_buffer,
columnar_writer,
tokenizer,
);
json_path_buffer.pop();
// popping our sub path.
json_path_buffer.truncate(len_path);
}
}
fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
fn record_json_value_to_columnar_writer(
doc: DocId,
json_val: V,
json_val: &serde_json::Value,
expand_dots: bool,
mut remaining_depth_limit: usize,
json_path_writer: &mut JsonPathWriter,
json_path_writer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &mut Option<TextAnalyzer>,
) {
@@ -282,69 +308,34 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
return;
}
remaining_depth_limit -= 1;
match json_val.as_value() {
ReferenceValue::Leaf(leaf) => match leaf {
ReferenceValueLeaf::Null => {} // TODO: Handle null
ReferenceValueLeaf::Str(val) => {
if let Some(text_analyzer) = tokenizer.as_mut() {
let mut token_stream = text_analyzer.token_stream(val);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
})
} else {
columnar_writer.record_str(doc, json_path_writer.as_str(), val);
}
match json_val {
serde_json::Value::Null => {
// TODO handle null
}
serde_json::Value::Bool(bool_val) => {
columnar_writer.record_bool(doc, json_path_writer, *bool_val);
}
serde_json::Value::Number(json_number) => {
if let Some(numerical_value) = columnar_numerical_value(json_number) {
columnar_writer.record_numerical(doc, json_path_writer.as_str(), numerical_value);
}
ReferenceValueLeaf::U64(val) => {
columnar_writer.record_numerical(
doc,
json_path_writer.as_str(),
NumericalValue::from(val),
);
}
serde_json::Value::String(text) => {
if let Some(text_analyzer) = tokenizer.as_mut() {
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
})
} else {
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
}
ReferenceValueLeaf::I64(val) => {
columnar_writer.record_numerical(
doc,
json_path_writer.as_str(),
NumericalValue::from(val),
);
}
ReferenceValueLeaf::F64(val) => {
columnar_writer.record_numerical(
doc,
json_path_writer.as_str(),
NumericalValue::from(val),
);
}
ReferenceValueLeaf::Bool(val) => {
columnar_writer.record_bool(doc, json_path_writer.as_str(), val);
}
ReferenceValueLeaf::Date(val) => {
columnar_writer.record_datetime(doc, json_path_writer.as_str(), val);
}
ReferenceValueLeaf::Facet(_) => {
unimplemented!("Facet support in dynamic fields is not yet implemented")
}
ReferenceValueLeaf::Bytes(_) => {
// TODO: This can be re added once it is added to the JSON Utils section as well.
// columnar_writer.record_bytes(doc, json_path_writer.as_str(), val);
unimplemented!("Bytes support in dynamic fields is not yet implemented")
}
ReferenceValueLeaf::IpAddr(_) => {
unimplemented!("IP address support in dynamic fields is not yet implemented")
}
ReferenceValueLeaf::PreTokStr(_) => {
unimplemented!(
"Pre-tokenized string support in dynamic fields is not yet implemented"
)
}
},
ReferenceValue::Array(elements) => {
for el in elements {
}
serde_json::Value::Array(arr) => {
for el in arr {
record_json_value_to_columnar_writer(
doc,
el,
expand_dots,
remaining_depth_limit,
json_path_writer,
columnar_writer,
@@ -352,10 +343,11 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
);
}
}
ReferenceValue::Object(object) => {
record_json_obj_to_columnar_writer::<V>(
serde_json::Value::Object(json_obj) => {
record_json_obj_to_columnar_writer(
doc,
object,
json_obj,
expand_dots,
remaining_depth_limit,
json_path_writer,
columnar_writer,
@@ -368,7 +360,6 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
#[cfg(test)]
mod tests {
use columnar::{Column, ColumnarReader, ColumnarWriter, StrColumn};
use common::JsonPathWriter;
use super::record_json_value_to_columnar_writer;
use crate::fastfield::writer::JSON_DEPTH_LIMIT;
@@ -379,12 +370,12 @@ mod tests {
expand_dots: bool,
) -> ColumnarReader {
let mut columnar_writer = ColumnarWriter::default();
let mut json_path = JsonPathWriter::default();
json_path.set_expand_dots(expand_dots);
let mut json_path = String::new();
for (doc, json_doc) in json_docs.iter().enumerate() {
record_json_value_to_columnar_writer(
doc as u32,
json_doc,
expand_dots,
JSON_DEPTH_LIMIT,
&mut json_path,
&mut columnar_writer,

View File

@@ -4,7 +4,7 @@ use rand::{thread_rng, Rng};
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::*;
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, IndexWriter, Order, Searcher};
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, Order, Searcher};
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
assert!(searcher.segment_readers().len() < 20);
@@ -12,7 +12,7 @@ fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
for segment_reader in searcher.segment_readers() {
let store_reader = segment_reader.get_store_reader(1)?;
for doc_id in 0..segment_reader.max_doc() {
let _doc: TantivyDocument = store_reader.get(doc_id)?;
let _doc = store_reader.get(doc_id)?;
}
}
Ok(())
@@ -31,8 +31,7 @@ fn test_functional_store() -> crate::Result<()> {
let mut rng = thread_rng();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut index_writer = index.writer_with_num_threads(3, MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut doc_set: Vec<u64> = Vec::new();
@@ -92,8 +91,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
let mut rng = thread_rng();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
@@ -116,7 +114,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = TantivyDocument::new();
let mut doc = Document::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
@@ -168,8 +166,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
let mut rng = thread_rng();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
@@ -192,7 +189,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = TantivyDocument::new();
let mut doc = Document::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);

View File

@@ -158,7 +158,6 @@ mod tests_indexsorting {
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::document::Value;
use crate::schema::{Schema, *};
use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order};
@@ -309,16 +308,16 @@ mod tests_indexsorting {
{
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.doc(DocAddress::new(0, 0))?
.get_first(my_string_field),
None
);
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 3))?
.doc(DocAddress::new(0, 3))?
.get_first(my_string_field)
.unwrap()
.as_str(),
.as_text(),
Some("blublub")
);
}
@@ -338,13 +337,13 @@ mod tests_indexsorting {
{
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.doc(DocAddress::new(0, 0))?
.get_first(my_string_field)
.unwrap()
.as_str(),
.as_text(),
Some("blublub")
);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
let doc = searcher.doc(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(my_string_field), None);
}
// sort by field desc
@@ -361,9 +360,9 @@ mod tests_indexsorting {
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
let doc = searcher.doc(DocAddress::new(0, 4))?;
assert_eq!(
doc.get_first(my_string_field).unwrap().as_str(),
doc.get_first(my_string_field).unwrap().as_text(),
Some("blublub")
);
}

View File

@@ -20,8 +20,7 @@ use crate::indexer::operation::DeleteOperation;
use crate::indexer::stamper::Stamper;
use crate::indexer::{MergePolicy, SegmentEntry, SegmentWriter};
use crate::query::{EnableScoring, Query, TermQuery};
use crate::schema::document::Document;
use crate::schema::{IndexRecordOption, TantivyDocument, Term};
use crate::schema::{Document, IndexRecordOption, Term};
use crate::{FutureResult, Opstamp};
// Size of the margin for the `memory_arena`. A segment is closed when the remaining memory
@@ -51,7 +50,7 @@ fn error_in_index_worker_thread(context: &str) -> TantivyError {
/// indexing queue.
/// Each indexing thread builds its own independent [`Segment`], via
/// a `SegmentWriter` object.
pub struct IndexWriter<D: Document = TantivyDocument> {
pub struct IndexWriter {
// the lock is just used to bind the
// lifetime of the lock with that of the IndexWriter.
_directory_lock: Option<DirectoryLock>,
@@ -63,8 +62,8 @@ pub struct IndexWriter<D: Document = TantivyDocument> {
workers_join_handle: Vec<JoinHandle<crate::Result<()>>>,
index_writer_status: IndexWriterStatus<D>,
operation_sender: AddBatchSender<D>,
index_writer_status: IndexWriterStatus,
operation_sender: AddBatchSender,
segment_updater: SegmentUpdater,
@@ -165,10 +164,10 @@ pub(crate) fn advance_deletes(
Ok(())
}
fn index_documents<D: Document>(
fn index_documents(
memory_budget: usize,
segment: Segment,
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch<D>>,
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch>,
segment_updater: &SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> crate::Result<()> {
@@ -248,7 +247,7 @@ fn apply_deletes(
})
}
impl<D: Document> IndexWriter<D> {
impl IndexWriter {
/// Create a new index writer. Attempts to acquire a lockfile.
///
/// The lockfile should be deleted on drop, but it is possible
@@ -268,7 +267,7 @@ impl<D: Document> IndexWriter<D> {
num_threads: usize,
memory_budget_in_bytes_per_thread: usize,
directory_lock: DirectoryLock,
) -> crate::Result<Self> {
) -> crate::Result<IndexWriter> {
if memory_budget_in_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
let err_msg = format!(
"The memory arena in bytes per thread needs to be at least \
@@ -282,7 +281,7 @@ impl<D: Document> IndexWriter<D> {
);
return Err(TantivyError::InvalidArgument(err_msg));
}
let (document_sender, document_receiver) =
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new();
@@ -294,7 +293,7 @@ impl<D: Document> IndexWriter<D> {
let segment_updater =
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let mut index_writer = Self {
let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock),
memory_budget_in_bytes_per_thread,
@@ -376,7 +375,7 @@ impl<D: Document> IndexWriter<D> {
self.index.new_segment()
}
fn operation_receiver(&self) -> crate::Result<AddBatchReceiver<D>> {
fn operation_receiver(&self) -> crate::Result<AddBatchReceiver> {
self.index_writer_status
.operation_receiver()
.ok_or_else(|| {
@@ -526,7 +525,7 @@ impl<D: Document> IndexWriter<D> {
///
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) {
let (document_sender, document_receiver) =
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
self.operation_sender = document_sender;
self.index_writer_status = IndexWriterStatus::from(document_receiver);
@@ -553,7 +552,7 @@ impl<D: Document> IndexWriter<D> {
.take()
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
let new_index_writer = IndexWriter::new(
let new_index_writer: IndexWriter = IndexWriter::new(
&self.index,
self.num_threads,
self.memory_budget_in_bytes_per_thread,
@@ -599,7 +598,7 @@ impl<D: Document> IndexWriter<D> {
/// It is also possible to add a payload to the `commit`
/// using this API.
/// See [`PreparedCommit::set_payload()`].
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<D>> {
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit> {
// Here, because we join all of the worker threads,
// all of the segment updates for this commit have been
// sent.
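The doc comment above points at `PreparedCommit::set_payload()` for attaching a payload to a commit. A minimal sketch of that flow, assuming an already-opened `index_writer` and an illustrative payload string:

// Prepare the commit, attach an opaque payload, then finalize it.
let mut prepared = index_writer.prepare_commit()?;
prepared.set_payload("checkpoint-42"); // the payload value is an arbitrary example
let opstamp = prepared.commit()?;
println!("committed at opstamp {opstamp}");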
@@ -708,7 +707,7 @@ impl<D: Document> IndexWriter<D> {
/// The opstamp is an increasing `u64` that can
/// be used by the client to align commits with its own
/// document queue.
pub fn add_document(&self, document: D) -> crate::Result<Opstamp> {
pub fn add_document(&self, document: Document) -> crate::Result<Opstamp> {
let opstamp = self.stamper.stamp();
self.send_add_documents_batch(smallvec![AddOperation { opstamp, document }])?;
Ok(opstamp)
@@ -745,7 +744,7 @@ impl<D: Document> IndexWriter<D> {
/// visible to readers only after calling `commit()`.
pub fn run<I>(&self, user_operations: I) -> crate::Result<Opstamp>
where
I: IntoIterator<Item = UserOperation<D>>,
I: IntoIterator<Item = UserOperation>,
I::IntoIter: ExactSizeIterator,
{
let user_operations_it = user_operations.into_iter();
@@ -779,7 +778,7 @@ impl<D: Document> IndexWriter<D> {
Ok(batch_opstamp)
}
fn send_add_documents_batch(&self, add_ops: AddBatch<D>) -> crate::Result<()> {
fn send_add_documents_batch(&self, add_ops: AddBatch) -> crate::Result<()> {
if self.index_writer_status.is_alive() && self.operation_sender.send(add_ops).is_ok() {
Ok(())
} else {
@@ -788,7 +787,7 @@ impl<D: Document> IndexWriter<D> {
}
}
impl<D: Document> Drop for IndexWriter<D> {
impl Drop for IndexWriter {
fn drop(&mut self) {
self.segment_updater.kill();
self.drop_sender();
@@ -815,15 +814,13 @@ mod tests {
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::indexer::NoMergePolicy;
use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
use crate::schema::document::Value;
use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::{
DateTime, DocAddress, Index, IndexSettings, IndexSortByField, IndexWriter, Order,
ReloadPolicy, TantivyDocument, Term,
DateTime, DocAddress, Index, IndexSettings, IndexSortByField, Order, ReloadPolicy, Term,
};
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
@@ -855,7 +852,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(text_field => "hello1"))
.unwrap();
@@ -908,7 +905,7 @@ mod tests {
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b");
let operations = vec![
@@ -946,7 +943,7 @@ mod tests {
fn test_empty_operations_group() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let index_writer: IndexWriter = index.writer_for_tests().unwrap();
let index_writer = index.writer_for_tests().unwrap();
let operations1 = vec![];
let batch_opstamp1 = index_writer.run(operations1).unwrap();
assert_eq!(batch_opstamp1, 0u64);
@@ -959,8 +956,8 @@ mod tests {
fn test_lockfile_stops_duplicates() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer: IndexWriter = index.writer_for_tests().unwrap();
match index.writer_for_tests::<TantivyDocument>() {
let _index_writer = index.writer_for_tests().unwrap();
match index.writer_for_tests() {
Err(TantivyError::LockFailure(LockError::LockBusy, _)) => {}
_ => panic!("Expected a `LockFailure` error"),
}
@@ -970,8 +967,8 @@ mod tests {
fn test_lockfile_already_exists_error_msg() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer: IndexWriter = index.writer_for_tests().unwrap();
match index.writer_for_tests::<TantivyDocument>() {
let _index_writer = index.writer_for_tests().unwrap();
match index.writer_for_tests() {
Err(err) => {
let err_msg = err.to_string();
assert!(err_msg.contains("already an `IndexWriter`"));
@@ -984,7 +981,7 @@ mod tests {
fn test_set_merge_policy() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let index_writer: IndexWriter = index.writer_for_tests().unwrap();
let index_writer = index.writer_for_tests().unwrap();
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, \
@@ -1003,11 +1000,11 @@ mod tests {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
{
let _index_writer: IndexWriter = index.writer_for_tests().unwrap();
let _index_writer = index.writer_for_tests().unwrap();
// the lock should be released when the
// index_writer goes out of scope.
}
let _index_writer_two: IndexWriter = index.writer_for_tests().unwrap();
let _index_writer_two = index.writer_for_tests().unwrap();
}
#[test]
@@ -1059,7 +1056,7 @@ mod tests {
reader.searcher().doc_freq(&term_a).unwrap()
};
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?;
// this should create 1 segment
@@ -1099,7 +1096,7 @@ mod tests {
reader.searcher().doc_freq(&term_a).unwrap()
};
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"))?;
@@ -1385,7 +1382,7 @@ mod tests {
fn test_delete_all_documents_empty_index() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index
let mut index_writer = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap();
let clear = index_writer.delete_all_documents();
@@ -1398,7 +1395,7 @@ mod tests {
fn test_delete_all_documents_index_twice() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index
let mut index_writer = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap();
let clear = index_writer.delete_all_documents();
@@ -1418,7 +1415,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::builder().schema(schema).create_in_ram().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(text_field => "one"))
.unwrap();
@@ -1706,8 +1703,7 @@ mod tests {
let old_reader = index.reader()?;
// Every 3rd doc has only the id field
let id_is_full_doc = |id| id % 3 != 0;
let id_exists = |id| id % 3 != 0; // 0 does not exist
let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3";
// rotate left
@@ -1723,7 +1719,7 @@ mod tests {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id);
if !id_is_full_doc(id) {
if !id_exists(id) {
// every 3rd doc has no ip field
index_writer.add_document(doc!(
id_field=>id,
@@ -1781,7 +1777,7 @@ mod tests {
let num_segments_before_merge = searcher.segment_readers().len();
if force_end_merge {
index_writer.wait_merging_threads()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
@@ -1843,7 +1839,7 @@ mod tests {
let num_docs_with_values = expected_ids_and_num_occurrences
.iter()
.filter(|(id, _id_occurrences)| id_is_full_doc(**id))
.filter(|(id, _id_occurrences)| id_exists(**id))
.map(|(_, id_occurrences)| *id_occurrences as usize)
.sum::<usize>();
@@ -1867,7 +1863,7 @@ mod tests {
if force_end_merge && num_segments_before_merge > 1 && num_segments_after_merge == 1 {
let mut expected_multi_ips: Vec<_> = id_list
.iter()
.filter(|id| id_is_full_doc(**id))
.filter(|id| id_exists(**id))
.flat_map(|id| vec![ip_from_id(*id), ip_from_id(*id)])
.collect();
assert_eq!(num_ips, expected_multi_ips.len() as u32);
@@ -1905,7 +1901,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences
.keys()
.flat_map(|id| {
if !id_is_full_doc(*id) {
if !id_exists(*id) {
None
} else {
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1917,7 +1913,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences
.keys()
.filter_map(|id| {
if !id_is_full_doc(*id) {
if !id_exists(*id) {
None
} else {
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1952,7 +1948,7 @@ mod tests {
let id = id_reader.first(doc).unwrap();
let vals: Vec<u64> = ff_reader.values_for_doc(doc).collect();
if id_is_full_doc(id) {
if id_exists(id) {
assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]);
assert!(expected_ids_and_num_occurrences.contains_key(&vals[0]));
@@ -1962,7 +1958,7 @@ mod tests {
}
let bool_vals: Vec<bool> = bool_ff_reader.values_for_doc(doc).collect();
if id_is_full_doc(id) {
if id_exists(id) {
assert_eq!(bool_vals.len(), 2);
assert_ne!(bool_vals[0], bool_vals[1]);
} else {
@@ -1977,23 +1973,23 @@ mod tests {
.get_store_reader(DOCSTORE_CACHE_CAPACITY)
.unwrap();
// test store iterator
for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) {
for doc in store_reader.iter(segment_reader.alive_bitset()) {
let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id));
}
// test store random access
for doc_id in segment_reader.doc_ids_alive() {
let id = store_reader
.get::<TantivyDocument>(doc_id)
.get(doc_id)
.unwrap()
.get_first(id_field)
.unwrap()
.as_u64()
.unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id));
if id_is_full_doc(id) {
if id_exists(id) {
let id2 = store_reader
.get::<TantivyDocument>(doc_id)
.get(doc_id)
.unwrap()
.get_first(multi_numbers)
.unwrap()
@@ -2001,13 +1997,13 @@ mod tests {
.unwrap();
assert_eq!(id, id2);
let bool = store_reader
.get::<TantivyDocument>(doc_id)
.get(doc_id)
.unwrap()
.get_first(bool_field)
.unwrap()
.as_bool()
.unwrap();
let doc = store_reader.get::<TantivyDocument>(doc_id).unwrap();
let doc = store_reader.get(doc_id).unwrap();
let mut bool2 = doc.get_all(multi_bools);
assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap());
assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap());
@@ -2038,7 +2034,7 @@ mod tests {
let (existing_id, count) = (*id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
assert_eq!(get_num_hits(id_field), count);
if !id_is_full_doc(existing_id) {
if !id_exists(existing_id) {
continue;
}
assert_eq!(get_num_hits(text_field), count);
@@ -2088,7 +2084,7 @@ mod tests {
//
for (existing_id, count) in &expected_ids_and_num_occurrences {
let (existing_id, count) = (*existing_id, *count);
if !id_is_full_doc(existing_id) {
if !id_exists(existing_id) {
continue;
}
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
@@ -2105,84 +2101,34 @@ mod tests {
}
}
// Range query
// assert data is as expected
//
// Take half as a sample
let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect();
sample.sort_by_key(|(k, _num_occurences)| *k);
// sample.truncate(sample.len() / 2);
if !sample.is_empty() {
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
let expected_count = |sample: &[(&u64, &u64)]| {
sample
.iter()
.filter(|(id, _)| id_is_full_doc(**id))
.map(|(_id, num_occurences)| **num_occurences)
.sum::<u64>()
};
fn gen_query_inclusive<T1: ToString, T2: ToString>(
field: &str,
from: T1,
to: T2,
) -> String {
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !id_exists(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
}
};
let ip = ip_from_id(existing_id);
// Query first half
if !left_sample.is_empty() {
let expected_count = expected_count(left_sample);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
// Range query on single value field
let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
let start_range = *left_sample[0].0;
let end_range = *left_sample.last().unwrap().0;
let query = gen_query_inclusive("id_opt", start_range, end_range);
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip, ip);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", "*", ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ips", "*", ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
}
// Query second half
if !right_sample.is_empty() {
let expected_count = expected_count(right_sample);
let start_range = *right_sample[0].0;
let end_range = *right_sample.last().unwrap().0;
// Range query on id opt field
let query =
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", ip1, "*");
assert_eq!(do_search_ip_field(&query), expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ips", ip1, "*");
assert_eq!(do_search_ip_field(&query), expected_count);
}
assert_eq!(do_search_ip_field(&query), count);
}
// ip range query on fast field
//
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !id_is_full_doc(existing_id) {
if !id_exists(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
@@ -2210,7 +2156,7 @@ mod tests {
.first_or_default_col(9999);
for doc_id in segment_reader.doc_ids_alive() {
let id = ff_reader.get_val(doc_id);
if !id_is_full_doc(id) {
if !id_exists(id) {
continue;
}
let facet_ords: Vec<u64> = facet_reader.facet_ords(doc_id).collect();
@@ -2248,12 +2194,6 @@ mod tests {
Ok(index)
}
#[test]
fn test_fast_field_range() {
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
assert!(test_operation_strategy(&ops, false, true).is_ok());
}
#[test]
fn test_sort_index_on_opt_field_regression() {
assert!(test_operation_strategy(
@@ -2603,7 +2543,7 @@ mod tests {
// Merge
{
assert!(index_writer.wait_merging_threads().is_ok());
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
@@ -2645,7 +2585,7 @@ mod tests {
// Merge
{
assert!(index_writer.wait_merging_threads().is_ok());
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");

View File

@@ -2,15 +2,13 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use super::AddBatchReceiver;
use crate::schema::document::Document;
use crate::TantivyDocument;
#[derive(Clone)]
pub(crate) struct IndexWriterStatus<D: Document = TantivyDocument> {
inner: Arc<Inner<D>>,
pub(crate) struct IndexWriterStatus {
inner: Arc<Inner>,
}
impl<D: Document> IndexWriterStatus<D> {
impl IndexWriterStatus {
/// Returns true iff the index writer is alive.
pub fn is_alive(&self) -> bool {
self.inner.as_ref().is_alive()
@@ -18,7 +16,7 @@ impl<D: Document> IndexWriterStatus<D> {
/// Returns a copy of the operation receiver.
/// If the index writer was killed, returns `None`.
pub fn operation_receiver(&self) -> Option<AddBatchReceiver<D>> {
pub fn operation_receiver(&self) -> Option<AddBatchReceiver> {
let rlock = self
.inner
.receive_channel
@@ -29,19 +27,19 @@ impl<D: Document> IndexWriterStatus<D> {
/// Create an index writer bomb.
/// If dropped, the index writer status will be killed.
pub(crate) fn create_bomb(&self) -> IndexWriterBomb<D> {
pub(crate) fn create_bomb(&self) -> IndexWriterBomb {
IndexWriterBomb {
inner: Some(self.inner.clone()),
}
}
}
struct Inner<D: Document> {
struct Inner {
is_alive: AtomicBool,
receive_channel: RwLock<Option<AddBatchReceiver<D>>>,
receive_channel: RwLock<Option<AddBatchReceiver>>,
}
impl<D: Document> Inner<D> {
impl Inner {
fn is_alive(&self) -> bool {
self.is_alive.load(Ordering::Relaxed)
}
@@ -55,8 +53,8 @@ impl<D: Document> Inner<D> {
}
}
impl<D: Document> From<AddBatchReceiver<D>> for IndexWriterStatus<D> {
fn from(receiver: AddBatchReceiver<D>) -> Self {
impl From<AddBatchReceiver> for IndexWriterStatus {
fn from(receiver: AddBatchReceiver) -> Self {
IndexWriterStatus {
inner: Arc::new(Inner {
is_alive: AtomicBool::new(true),
@@ -68,11 +66,11 @@ impl<D: Document> From<AddBatchReceiver<D>> for IndexWriterStatus<D> {
/// If dropped, the index writer will be killed.
/// To prevent this, clients can call `.defuse()`.
pub(crate) struct IndexWriterBomb<D: Document> {
inner: Option<Arc<Inner<D>>>,
pub(crate) struct IndexWriterBomb {
inner: Option<Arc<Inner>>,
}
impl<D: Document> IndexWriterBomb<D> {
impl IndexWriterBomb {
/// Defuses the bomb.
///
/// This is the only way to drop the bomb without killing
@@ -82,7 +80,7 @@ impl<D: Document> IndexWriterBomb<D> {
}
}
impl<D: Document> Drop for IndexWriterBomb<D> {
impl Drop for IndexWriterBomb {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
inner.kill();
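The `IndexWriterBomb` above is an RAII guard: unless `.defuse()` is called, dropping it kills the index writer status. A generic, hedged sketch of the same pattern, independent of tantivy's internals:

/// Runs `on_drop` when dropped, unless `defuse()` was called first.
struct Bomb<F: FnOnce()> {
    on_drop: Option<F>,
}

impl<F: FnOnce()> Bomb<F> {
    fn new(on_drop: F) -> Self {
        Bomb { on_drop: Some(on_drop) }
    }

    /// Drops the guard without triggering the callback.
    fn defuse(mut self) {
        self.on_drop = None;
    }
}

impl<F: FnOnce()> Drop for Bomb<F> {
    fn drop(&mut self) {
        if let Some(callback) = self.on_drop.take() {
            callback(); // e.g. mark some shared status as dead
        }
    }
}

For example, `Bomb::new(|| eprintln!("worker died"))` placed at the top of a worker loop guarantees the callback fires on a panic or early return, while the normal exit path calls `defuse()`.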

View File

@@ -63,13 +63,10 @@ impl MergeOperation {
}
}
/// Returns the opstamp up to which we want to consume the delete queue and reflect its
/// deletes.
pub fn target_opstamp(&self) -> Opstamp {
self.inner.target_opstamp
}
/// Returns the list of segments to be merged.
pub fn segment_ids(&self) -> &[SegmentId] {
&self.inner.segment_ids[..]
}

View File

@@ -552,41 +552,7 @@ impl IndexMerger {
continue;
}
// This should never happen, as we exited early when total_doc_freq == 0.
assert!(!segment_postings_containing_the_term.is_empty());
let has_term_freq = {
let has_term_freq = !segment_postings_containing_the_term[0]
.1
.block_cursor
.freqs()
.is_empty();
for (_, postings) in &segment_postings_containing_the_term[1..] {
// This may look like a strange way to test whether we have term freqs or not.
// With JSON objects, the schema is not sufficient to know whether a term
// has its term frequency encoded or not:
// strings may have term frequencies, while number terms never have one.
//
// Ideally, we should have burnt one bit or two in the `TermInfo`.
// However, we preferred not to change the codec too much and to detect this
// instead by
// - looking at the size of the skip data for bitpacked blocks
// - observing the absence of remaining data after reading the docs for vint
// blocks.
//
// Overall the reliable way to know if we have actual frequencies loaded or not
// is to check whether the actual decoded array is empty or not.
if has_term_freq != !postings.block_cursor.freqs().is_empty() {
return Err(DataCorruption::comment_only(
"Term freqs are inconsistent across segments",
)
.into());
}
}
has_term_freq
};
field_serializer.new_term(term_bytes, total_doc_freq, has_term_freq)?;
field_serializer.new_term(term_bytes, total_doc_freq)?;
// We can now serialize these postings by pushing each document to the
// postings serializer.
@@ -601,13 +567,8 @@ impl IndexMerger {
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
// we make sure to only write the term if
// there is at least one document.
let term_freq = if has_term_freq {
segment_postings.positions(&mut positions_buffer);
segment_postings.term_freq()
} else {
0u32
};
let term_freq = segment_postings.term_freq();
segment_postings.positions(&mut positions_buffer);
// If a doc_id_mapping exists, the doc_ids are reordered rather than just stacked.
// The field serializer expects monotonically increasing
// doc_ids, so we collect and sort them first, before writing.
@@ -792,10 +753,9 @@ mod tests {
use crate::collector::{Count, FacetCollector};
use crate::core::Index;
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::document::Value;
use crate::schema::{
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
TextFieldIndexing, INDEXED, TEXT,
Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term, TextFieldIndexing,
INDEXED, TEXT,
};
use crate::time::OffsetDateTime;
use crate::{
@@ -857,7 +817,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -906,24 +866,30 @@ mod tests {
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
let doc = searcher.doc(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c"));
let doc = searcher.doc(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("a b c"));
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c d"));
let doc = searcher.doc(DocAddress::new(0, 2))?;
assert_eq!(
doc.get_first(text_field).unwrap().as_text(),
Some("a b c d")
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
let doc = searcher.doc(DocAddress::new(0, 3))?;
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c g"));
let doc = searcher.doc(DocAddress::new(0, 4))?;
assert_eq!(
doc.get_first(text_field).unwrap().as_text(),
Some("a b c g")
);
}
{
@@ -1334,10 +1300,10 @@ mod tests {
let reader = index.reader().unwrap();
let mut int_val = 0;
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let index_doc =
|index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| {
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
for facet in doc_facets {
doc.add_facet(facet_field, Facet::from(facet));
}
@@ -1418,7 +1384,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.merge(&segment_ids)
.wait()
@@ -1440,7 +1406,7 @@ mod tests {
// Deleting one term
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term);
@@ -1465,7 +1431,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(int_field => 1u64))?;
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64))?;
@@ -1494,7 +1460,7 @@ mod tests {
let reader = index.reader()?;
{
let mut index_writer = index.writer_for_tests()?;
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone())?;
index_writer.commit()?;
@@ -1537,7 +1503,7 @@ mod tests {
{
let mut index_writer = index.writer_for_tests()?;
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
for &val in int_vals {
doc.add_u64(int_field, val);
}
@@ -1600,7 +1566,7 @@ mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -1647,7 +1613,7 @@ mod tests {
writer.set_merge_policy(Box::new(policy));
for i in 0..100 {
let mut doc = TantivyDocument::new();
let mut doc = Document::new();
doc.add_f64(field, 42.0);
doc.add_f64(multi_field, 0.24);
doc.add_f64(multi_field, 0.27);

View File

@@ -4,15 +4,11 @@ mod tests {
use crate::core::Index;
use crate::fastfield::AliveBitSet;
use crate::query::QueryParser;
use crate::schema::document::Value;
use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
};
use crate::{
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, Postings,
TantivyDocument, Term,
};
use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term};
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
let mut schema_builder = schema::Schema::builder();
@@ -30,7 +26,7 @@ mod tests {
let index = index_builder.create_in_ram().unwrap();
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")))
.unwrap();
@@ -49,7 +45,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
assert!(index_writer.merge(&segment_ids).wait().is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
@@ -137,7 +133,7 @@ mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -276,16 +272,12 @@ mod tests {
} else {
2
};
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
.unwrap();
let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap();
assert_eq!(
doc.get_first(my_text_field).unwrap().as_str(),
doc.get_first(my_text_field).unwrap().as_text(),
Some("blubber")
);
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))
.unwrap();
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000));
}
}
@@ -502,7 +494,7 @@ mod bench_sorted_index_merge {
let index = index_builder.create_in_ram().unwrap();
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, val: u64| {
index_writer.add_document(doc!(int_field=>val)).unwrap();
};
@@ -543,7 +535,7 @@ mod bench_sorted_index_merge {
//);
//(doc_addr.doc_id, reader, u64_reader)
//});
/// add values in order of the new doc_ids
//// add values in order of the new doc_ids
// let mut val = 0;
// for (doc_id, _reader, field_reader) in sorted_doc_ids {
// val = field_reader.get_val(doc_id);

View File

@@ -1,30 +1,23 @@
//! Indexing and merging data.
//!
//! Contains code to create and merge segments.
//! `IndexWriter` is the main entry point for that; it is created via
//! [`Index::writer`](crate::Index::writer).
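As a quick, hedged illustration of that entry point (an in-RAM index; the "title" field and the 50 MB budget are arbitrary choices for the example):

use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    // 50 MB memory budget for indexing (an arbitrary example value).
    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(title => "hello world"))?;
    index_writer.commit()?;
    Ok(())
}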
pub mod delete_queue;
pub(crate) mod delete_queue;
pub(crate) mod path_to_unordered_id;
pub(crate) mod doc_id_mapping;
pub mod doc_id_mapping;
mod doc_opstamp_mapping;
mod flat_map_with_buffer;
pub(crate) mod index_writer;
pub(crate) mod index_writer_status;
pub mod index_writer;
mod index_writer_status;
mod log_merge_policy;
mod merge_operation;
pub(crate) mod merge_policy;
pub(crate) mod merger;
pub mod merge_policy;
pub mod merger;
mod merger_sorted_index_test;
pub(crate) mod operation;
pub(crate) mod prepared_commit;
pub mod operation;
pub mod prepared_commit;
mod segment_entry;
mod segment_manager;
mod segment_register;
pub(crate) mod segment_serializer;
pub(crate) mod segment_updater;
pub(crate) mod segment_writer;
pub mod segment_serializer;
pub mod segment_updater;
mod segment_writer;
mod stamper;
use crossbeam_channel as channel;
@@ -34,10 +27,10 @@ pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation;
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
pub use self::operation::UserOperation;
pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry;
pub(crate) use self::segment_serializer::SegmentSerializer;
pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_updater::{merge_filtered_segments, merge_indices};
pub use self::segment_writer::SegmentWriter;
use crate::indexer::operation::AddOperation;
@@ -51,28 +44,25 @@ pub type DefaultMergePolicy = LogMergePolicy;
// - all docs in the operation will land in the same segment, with contiguous doc_ids.
// - all operations in the group are committed at the same time, making the group
// atomic.
type AddBatch<D> = SmallVec<[AddOperation<D>; 4]>;
type AddBatchSender<D> = channel::Sender<AddBatch<D>>;
type AddBatchReceiver<D> = channel::Receiver<AddBatch<D>>;
type AddBatch = SmallVec<[AddOperation; 4]>;
type AddBatchSender = channel::Sender<AddBatch>;
type AddBatchReceiver = channel::Receiver<AddBatch>;
#[cfg(feature = "mmap")]
#[cfg(test)]
mod tests_mmap {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::AggregationCollector;
use crate::collector::{Count, TopDocs};
use crate::query::{AllQuery, QueryParser};
use crate::schema::{JsonObjectOptions, Schema, Type, FAST, INDEXED, STORED, TEXT};
use crate::{FieldMetadata, Index, IndexWriter, Term};
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{JsonObjectOptions, Schema, TEXT};
use crate::{Index, Term};
#[test]
fn test_advance_delete_bug() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_from_tempdir(schema_builder.build())?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
// there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.delete_term(Term::from_field_text(text_field, "b"));
@@ -89,7 +79,7 @@ mod tests_mmap {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
@@ -113,35 +103,6 @@ mod tests_mmap {
}
}
#[test]
fn test_json_field_number() {
// This test was added specifically to reach cases where a JSON field, with frequencies
// enabled, stores integers, and enough documents contain a single integer so that the
// posting list can be bitpacked.
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
for _ in 0..256 {
let json = serde_json::json!({"somekey": 1u64, "otherkey": -2i64});
index_writer.add_document(doc!(json_field=>json)).unwrap();
let json = serde_json::json!({"somekey": "1str", "otherkey": "2str"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 512);
let parse_query = QueryParser::for_index(&index, Vec::new());
{
let query = parse_query.parse_query(r"json.somekey:1").unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 256);
}
}
#[test]
fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
let mut schema_builder = Schema::builder();
@@ -149,7 +110,7 @@ mod tests_mmap {
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
@@ -172,275 +133,4 @@ mod tests_mmap {
assert_eq!(num_docs, 1);
}
}
#[test]
fn test_json_field_list_fields() {
let mut schema_builder = Schema::builder();
let json_options: JsonObjectOptions = JsonObjectOptions::from(TEXT);
let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "sub": {"a": 1, "b": 2}});
index_writer.add_document(doc!(json_field=>json)).unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": 1, "b": 2}});
index_writer.add_document(doc!(json_field=>json)).unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": "mixed", "b": 2}});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 3);
let reader = &searcher.segment_readers()[0];
let inverted_index = reader.inverted_index(json_field).unwrap();
assert_eq!(
inverted_index.list_encoded_fields().unwrap(),
[
("k8s.container.name".to_string(), Type::Str),
("sub\u{1}a".to_string(), Type::I64),
("sub\u{1}b".to_string(), Type::I64),
("suber\u{1}a".to_string(), Type::I64),
("suber\u{1}a".to_string(), Type::Str),
("suber\u{1}b".to_string(), Type::I64),
("val".to_string(), Type::Str),
]
);
}
#[test]
fn test_json_fields_metadata_expanded_dots_one_segment() {
test_json_fields_metadata(true, true);
}
#[test]
fn test_json_fields_metadata_expanded_dots_multi_segment() {
test_json_fields_metadata(true, false);
}
#[test]
fn test_json_fields_metadata_no_expanded_dots_one_segment() {
test_json_fields_metadata(false, true);
}
#[test]
fn test_json_fields_metadata_no_expanded_dots_multi_segment() {
test_json_fields_metadata(false, false);
}
fn test_json_fields_metadata(expanded_dots: bool, one_segment: bool) {
use pretty_assertions::assert_eq;
let mut schema_builder = Schema::builder();
let json_options: JsonObjectOptions =
JsonObjectOptions::from(TEXT).set_fast(None).set_stored();
let json_options = if expanded_dots {
json_options.set_expand_dots_enabled()
} else {
json_options
};
schema_builder.add_json_field("json.confusing", json_options.clone());
let json_field = schema_builder.add_json_field("json.shadow", json_options.clone());
let json_field2 = schema_builder.add_json_field("json", json_options.clone());
schema_builder.add_json_field("empty_json", json_options);
let number_field = schema_builder.add_u64_field("numbers", FAST);
schema_builder.add_u64_field("empty", FAST | INDEXED | STORED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json =
serde_json::json!({"k8s.container.name": "a", "val": "a", "sub": {"a": 1, "b": 1}});
index_writer.add_document(doc!(json_field=>json)).unwrap();
let json =
serde_json::json!({"k8s.container.name": "a", "val": "a", "suber": {"a": 1, "b": 1}});
if !one_segment {
index_writer.commit().unwrap();
}
index_writer.add_document(doc!(json_field=>json)).unwrap();
let json = serde_json::json!({"k8s.container.name": "a", "k8s.container.name": "a", "val": "a", "suber": {"a": "a", "b": 1}});
index_writer
.add_document(doc!(number_field => 50u64, json_field=>json, json_field2=>json!({"shadow": {"val": "a"}})))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 3);
let fields_metadata = index.fields_metadata().unwrap();
assert_eq!(
fields_metadata,
[
FieldMetadata {
field_name: "empty".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::U64
},
FieldMetadata {
field_name: if expanded_dots {
"json.shadow.k8s.container.name".to_string()
} else {
"json.shadow.k8s\\.container\\.name".to_string()
},
indexed: true,
stored: true,
fast: true,
typ: Type::Str
},
FieldMetadata {
field_name: "json.shadow.sub.a".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::I64
},
FieldMetadata {
field_name: "json.shadow.sub.b".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::I64
},
FieldMetadata {
field_name: "json.shadow.suber.a".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::I64
},
FieldMetadata {
field_name: "json.shadow.suber.a".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::Str
},
FieldMetadata {
field_name: "json.shadow.suber.b".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::I64
},
FieldMetadata {
field_name: "json.shadow.val".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::Str
},
FieldMetadata {
field_name: "numbers".to_string(),
indexed: false,
stored: false,
fast: true,
typ: Type::U64
}
]
);
let query_parser = QueryParser::for_index(&index, vec![]);
// Test if returned field name can be queried
for indexed_field in fields_metadata.iter().filter(|meta| meta.indexed) {
let val = if indexed_field.typ == Type::Str {
"a"
} else {
"1"
};
let query_str = &format!("{}:{}", indexed_field.field_name, val);
let query = query_parser.parse_query(query_str).unwrap();
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
if indexed_field.field_name.contains("empty") || indexed_field.typ == Type::Json {
assert_eq!(count_docs.len(), 0);
} else {
assert!(!count_docs.is_empty(), "{}", indexed_field.field_name);
}
}
// Test if returned field name can be used for aggregation
for fast_field in fields_metadata.iter().filter(|meta| meta.fast) {
let agg_req_str = json!(
{
"termagg": {
"terms": {
"field": fast_field.field_name,
}
}
});
let agg_req: Aggregations = serde_json::from_value(agg_req_str).unwrap();
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res = serde_json::to_value(agg_res).unwrap();
if !fast_field.field_name.contains("empty") && fast_field.typ != Type::Json {
assert!(
!res["termagg"]["buckets"].as_array().unwrap().is_empty(),
"{}",
fast_field.field_name
);
}
}
}
#[test]
fn test_json_field_shadowing_field_name_bug() {
/// This test is only there to demonstrate a bug when addressing a field that gets shadowed.
/// The issue only occurs if the shadowing field name contains a dot.
///
/// It happens independently of the `expand_dots` option, since that option does not
/// affect the field name itself.
use pretty_assertions::assert_eq;
let mut schema_builder = Schema::builder();
let json_options: JsonObjectOptions =
JsonObjectOptions::from(TEXT).set_fast(None).set_stored();
// let json_options = json_options.set_expand_dots_enabled();
let json_field_shadow = schema_builder.add_json_field("json.shadow", json_options.clone());
let json_field = schema_builder.add_json_field("json", json_options.clone());
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(
doc!(json_field_shadow=>json!({"val": "b"}), json_field=>json!({"shadow": {"val": "a"}})),
)
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let fields_and_vals = vec![
// Only way to address it without getting shadowed by the `json.shadow` field
("json.shadow\u{1}val".to_string(), "a"), // Succeeds
//("json.shadow.val".to_string(), "a"), // Fails
("json.shadow.val".to_string(), "b"), // Succeeds
];
let query_parser = QueryParser::for_index(&index, vec![]);
// Test if field name can be queried
for (indexed_field, val) in fields_and_vals.iter() {
let query_str = &format!("{}:{}", indexed_field, val);
let query = query_parser.parse_query(query_str).unwrap();
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
assert!(!count_docs.is_empty(), "{}:{}", indexed_field, val);
}
// Test if field name can be used for aggregation
for (field_name, val) in fields_and_vals.iter() {
let agg_req_str = json!(
{
"termagg": {
"terms": {
"field": field_name,
}
}
});
let agg_req: Aggregations = serde_json::from_value(agg_req_str).unwrap();
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res = serde_json::to_value(agg_res).unwrap();
assert_eq!(
res["termagg"]["buckets"].as_array().unwrap()[0]["key"]
.as_str()
.unwrap(),
*val,
"{}",
field_name
);
}
}
}

View File

@@ -1,6 +1,5 @@
use crate::query::Weight;
use crate::schema::document::Document;
use crate::schema::{TantivyDocument, Term};
use crate::schema::{Document, Term};
use crate::Opstamp;
/// Timestamped Delete operation.
@@ -11,16 +10,16 @@ pub struct DeleteOperation {
/// Timestamped Add operation.
#[derive(Eq, PartialEq, Debug)]
pub struct AddOperation<D: Document = TantivyDocument> {
pub struct AddOperation {
pub opstamp: Opstamp,
pub document: D,
pub document: Document,
}
/// UserOperation is an enum type that encapsulates other operation types.
#[derive(Eq, PartialEq, Debug)]
pub enum UserOperation<D: Document = TantivyDocument> {
pub enum UserOperation {
/// Add operation
Add(D),
Add(Document),
/// Delete operation
Delete(Term),
}
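A hedged sketch of how a caller batches these operations atomically through `IndexWriter::run` (it assumes the public re-export `tantivy::indexer::UserOperation`, plus an existing `index_writer` and a text field `title`):

use tantivy::indexer::UserOperation;
use tantivy::{doc, Term};

// Both operations receive opstamps from the same batch and become visible
// together after the next commit.
let ops = vec![
    UserOperation::Add(doc!(title => "new document")),
    UserOperation::Delete(Term::from_field_text(title, "stale")),
];
let _batch_opstamp = index_writer.run(ops)?;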

View File

@@ -1,92 +0,0 @@
use fnv::FnvHashMap;
/// `OrderedPathId` is represented by an unsigned 32-bit integer type.
/// Ids are assigned so that they reflect the lexical order of the paths.
#[derive(Copy, Default, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash)]
pub struct OrderedPathId(u32);
impl OrderedPathId {
/// Creates a new `OrderedPathId` from the given ordered id.
pub const fn from_ordered_id(field_id: u32) -> OrderedPathId {
OrderedPathId(field_id)
}
/// Returns a u32 uniquely identifying a path within a schema.
pub const fn path_id(self) -> u32 {
self.0
}
}
impl From<u32> for OrderedPathId {
fn from(id: u32) -> Self {
Self(id)
}
}
#[derive(Default)]
pub(crate) struct PathToUnorderedId {
map: FnvHashMap<String, u32>,
}
impl PathToUnorderedId {
#[inline]
pub(crate) fn get_or_allocate_unordered_id(&mut self, path: &str) -> u32 {
if let Some(id) = self.map.get(path) {
return *id;
}
self.insert_new_path(path)
}
#[cold]
fn insert_new_path(&mut self, path: &str) -> u32 {
let next_id = self.map.len() as u32;
self.map.insert(path.to_string(), next_id);
next_id
}
/// Returns ids which reflect the lexical order of the paths.
///
/// The returned vec can be indexed with the unordered id to get the ordered id.
pub(crate) fn unordered_id_to_ordered_id(&self) -> Vec<OrderedPathId> {
let mut sorted_ids: Vec<(&str, &u32)> =
self.map.iter().map(|(k, v)| (k.as_str(), v)).collect();
sorted_ids.sort_unstable_by_key(|(path, _)| *path);
let mut result = vec![OrderedPathId::default(); sorted_ids.len()];
for (ordered, unordered) in sorted_ids.iter().map(|(_k, v)| v).enumerate() {
result[**unordered as usize] = OrderedPathId::from_ordered_id(ordered as u32);
}
result
}
/// Returns the paths so they can be queried by the ordered id (which is the index).
pub(crate) fn ordered_id_to_path(&self) -> Vec<&str> {
let mut paths = self.map.keys().map(String::as_str).collect::<Vec<_>>();
paths.sort_unstable();
paths
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn path_to_unordered_test() {
let mut path_to_id = PathToUnorderedId::default();
let terms = vec!["b", "a", "b", "c"];
let ids = terms
.iter()
.map(|term| path_to_id.get_or_allocate_unordered_id(term))
.collect::<Vec<u32>>();
assert_eq!(ids, vec![0, 1, 0, 2]);
let ordered_ids = ids
.iter()
.map(|id| path_to_id.unordered_id_to_ordered_id()[*id as usize])
.collect::<Vec<OrderedPathId>>();
assert_eq!(ordered_ids, vec![1.into(), 0.into(), 1.into(), 2.into()]);
// Fetch terms
let terms_fetched = ordered_ids
.iter()
.map(|id| path_to_id.ordered_id_to_path()[id.path_id() as usize])
.collect::<Vec<&str>>();
assert_eq!(terms_fetched, terms);
}
}

View File

@@ -1,17 +1,16 @@
use super::IndexWriter;
use crate::schema::document::Document;
use crate::{FutureResult, Opstamp, TantivyDocument};
use crate::{FutureResult, Opstamp};
/// A prepared commit
pub struct PreparedCommit<'a, D: Document = TantivyDocument> {
index_writer: &'a mut IndexWriter<D>,
pub struct PreparedCommit<'a> {
index_writer: &'a mut IndexWriter,
payload: Option<String>,
opstamp: Opstamp,
}
impl<'a, D: Document> PreparedCommit<'a, D> {
pub(crate) fn new(index_writer: &'a mut IndexWriter<D>, opstamp: Opstamp) -> Self {
Self {
impl<'a> PreparedCommit<'a> {
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit<'_> {
PreparedCommit {
index_writer,
payload: None,
opstamp,

View File

@@ -1,5 +1,4 @@
use columnar::MonotonicallyMappableToU64;
use common::JsonPathWriter;
use itertools::Itertools;
use tokenizer_api::BoxTokenStream;
@@ -14,11 +13,10 @@ use crate::postings::{
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::document::{Document, ReferenceValue, Value};
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DocId, Opstamp, SegmentComponent, TantivyError};
use crate::{DocId, Document, Opstamp, SegmentComponent, TantivyError};
/// Computes the initial size of the hash table.
///
@@ -67,7 +65,6 @@ pub struct SegmentWriter {
pub(crate) segment_serializer: SegmentSerializer,
pub(crate) fast_field_writers: FastFieldsWriter,
pub(crate) fieldnorms_writer: FieldNormsWriter,
pub(crate) json_path_writer: JsonPathWriter,
pub(crate) doc_opstamps: Vec<Opstamp>,
per_field_text_analyzers: Vec<TextAnalyzer>,
term_buffer: Term,
@@ -84,7 +81,10 @@ impl SegmentWriter {
/// the flushing behavior as a memory limit.
/// - segment: The segment being written
/// - schema
pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> {
pub fn for_segment(
memory_budget_in_bytes: usize,
segment: Segment,
) -> crate::Result<SegmentWriter> {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone();
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
@@ -113,12 +113,11 @@ impl SegmentWriter {
})
})
.collect::<Result<Vec<_>, _>>()?;
Ok(Self {
Ok(SegmentWriter {
max_doc: 0,
ctx: IndexingContext::new(table_size),
per_field_postings_writers,
fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
json_path_writer: JsonPathWriter::default(),
segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager(
&schema,
@@ -147,7 +146,6 @@ impl SegmentWriter {
.map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
.transpose()?;
remap_and_write(
self.schema,
&self.per_field_postings_writers,
self.ctx,
self.fast_field_writers,
@@ -159,8 +157,6 @@ impl SegmentWriter {
Ok(doc_opstamps)
}
/// Returns an estimation of the current memory usage of the segment writer.
/// If the mem usage exceeds the `memory_budget`, the segment will be serialized.
pub fn mem_usage(&self) -> usize {
self.ctx.mem_usage()
+ self.fieldnorms_writer.mem_usage()
@@ -168,21 +164,18 @@ impl SegmentWriter {
+ self.segment_serializer.mem_usage()
}
fn index_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
fn index_document(&mut self, doc: &Document) -> crate::Result<()> {
let doc_id = self.max_doc;
// TODO: Can this be optimised a bit?
let vals_grouped_by_field = doc
.iter_fields_and_values()
.sorted_by_key(|(field, _)| *field)
.group_by(|(field, _)| *field);
.field_values()
.iter()
.sorted_by_key(|el| el.field())
.group_by(|el| el.field());
for (field, field_values) in &vals_grouped_by_field {
let values = field_values.map(|el| el.1);
let values = field_values.map(|field_value| field_value.value());
let field_entry = self.schema.get_field_entry(field);
let make_schema_error = || {
TantivyError::SchemaError(format!(
crate::TantivyError::SchemaError(format!(
"Expected a {:?} for field {:?}",
field_entry.field_type().value_type(),
field_entry.name()
@@ -200,10 +193,7 @@ impl SegmentWriter {
match field_entry.field_type() {
FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let facet = value.as_facet().ok_or_else(make_schema_error)?;
let facet_str = facet.encoded_str();
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -219,18 +209,19 @@ impl SegmentWriter {
}
FieldType::Str(_) => {
let mut indexing_position = IndexingPosition::default();
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
let mut token_stream = if let Some(text) = value.as_str() {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
} else {
continue;
for value in values {
let mut token_stream = match value {
Value::PreTokStr(tok_str) => {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
}
Value::Str(ref text) => {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
}
_ => {
continue;
}
};
assert!(term_buffer.is_empty());
@@ -249,10 +240,7 @@ impl SegmentWriter {
}
FieldType::U64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
num_vals += 1;
let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
term_buffer.set_u64(u64_val);
@@ -264,13 +252,9 @@ impl SegmentWriter {
}
FieldType::Date(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
for value in values {
num_vals += 1;
let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
let date_val = value.as_date().ok_or_else(make_schema_error)?;
term_buffer
.set_u64(date_val.truncate(DATE_TIME_PRECISION_INDEXED).to_u64());
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
@@ -281,10 +265,7 @@ impl SegmentWriter {
}
FieldType::I64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
num_vals += 1;
let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
term_buffer.set_i64(i64_val);
@@ -296,10 +277,7 @@ impl SegmentWriter {
}
FieldType::F64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
num_vals += 1;
let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
@@ -311,10 +289,7 @@ impl SegmentWriter {
}
FieldType::Bool(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
num_vals += 1;
let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
term_buffer.set_bool(bool_val);
@@ -326,10 +301,7 @@ impl SegmentWriter {
}
FieldType::Bytes(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
num_vals += 1;
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
@@ -342,33 +314,21 @@ impl SegmentWriter {
FieldType::JsonObject(json_options) => {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
let json_values_it = values.map(|value_access| {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
match value {
ReferenceValue::Object(object_iter) => Ok(object_iter),
_ => Err(make_schema_error()),
}
});
index_json_values::<D::Value<'_>>(
let json_values_it =
values.map(|value| value.as_json().ok_or_else(make_schema_error));
index_json_values(
doc_id,
json_values_it,
text_analyzer,
json_options.is_expand_dots_enabled(),
term_buffer,
postings_writer,
&mut self.json_path_writer,
ctx,
)?;
}
FieldType::IpAddr(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
num_vals += 1;
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
term_buffer.set_ip_addr(ip_addr);
@@ -386,10 +346,7 @@ impl SegmentWriter {
/// Indexes a new document
///
/// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document<D: Document>(
&mut self,
add_operation: AddOperation<D>,
) -> crate::Result<()> {
pub fn add_document(&mut self, add_operation: AddOperation) -> crate::Result<()> {
let AddOperation { document, opstamp } = add_operation;
self.doc_opstamps.push(opstamp);
self.fast_field_writers.add_document(&document)?;
@@ -427,7 +384,6 @@ impl SegmentWriter {
///
/// `doc_id_map` is used to map to the new doc_id order.
fn remap_and_write(
schema: Schema,
per_field_postings_writers: &PerFieldPostingsWriter,
ctx: IndexingContext,
fast_field_writers: FastFieldsWriter,
@@ -445,7 +401,6 @@ fn remap_and_write(
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
serialize_postings(
ctx,
schema,
per_field_postings_writers,
fieldnorm_readers,
doc_id_map,
@@ -490,37 +445,32 @@ fn remap_and_write(
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use tempfile::TempDir;
use crate::collector::{Count, TopDocs};
use super::compute_initial_table_size;
use crate::collector::Count;
use crate::core::json_utils::JsonTermWriter;
use crate::directory::RamDirectory;
use crate::postings::TermInfo;
use crate::query::{PhraseQuery, QueryParser};
use crate::schema::document::Value;
use crate::query::PhraseQuery;
use crate::schema::{
Document, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING,
TEXT,
IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING, TEXT,
};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, Postings, TantivyDocument,
Term, TERMINATED,
DateTime, Directory, DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED,
};
#[test]
#[cfg(not(feature = "compare_hash_only"))]
fn test_hashmap_size() {
use super::compute_initial_table_size;
assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 12);
assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 15);
assert_eq!(compute_initial_table_size(15_000_000).unwrap(), 1 << 19);
assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 11);
assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 14);
assert_eq!(compute_initial_table_size(15_000_000).unwrap(), 1 << 18);
assert_eq!(compute_initial_table_size(1_000_000_000).unwrap(), 1 << 19);
assert_eq!(compute_initial_table_size(4_000_000_000).unwrap(), 1 << 19);
}
@@ -530,7 +480,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("title", TEXT | STORED);
let schema = schema_builder.build();
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
let pre_tokenized_text = PreTokenizedString {
text: String::from("A"),
tokens: vec![Token {
@@ -554,48 +504,11 @@ mod tests {
store_writer.close().unwrap();
let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
let doc = reader.get::<TantivyDocument>(0).unwrap();
let doc = reader.get(0).unwrap();
assert_eq!(doc.field_values().len(), 2);
assert_eq!(doc.field_values()[0].value().as_str(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_str(), Some("title"));
}
#[test]
fn test_simple_json_indexing() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer = index.writer_for_tests().unwrap();
writer
.add_document(doc!(json_field=>json!({"my_field": "b"})))
.unwrap();
writer
.add_document(doc!(json_field=>json!({"my_field": "a"})))
.unwrap();
writer
.add_document(doc!(json_field=>json!({"my_field": "b"})))
.unwrap();
writer.commit().unwrap();
let query_parser = QueryParser::for_index(&index, vec![json_field]);
let text_query = query_parser.parse_query("my_field:a").unwrap();
let score_docs: Vec<(_, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(4))
.unwrap();
assert_eq!(score_docs.len(), 1);
let text_query = query_parser.parse_query("my_field:b").unwrap();
let score_docs: Vec<(_, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(4))
.unwrap();
assert_eq!(score_docs.len(), 2);
assert_eq!(doc.field_values()[0].value().as_text(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_text(), Some("title"));
}
#[test]
@@ -626,13 +539,13 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let doc = searcher
.doc::<TantivyDocument>(DocAddress {
.doc(DocAddress {
segment_ord: 0u32,
doc_id: 0u32,
})
.unwrap();
let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
&doc.to_json(&schema),
&schema.to_json(&doc),
)
.unwrap()
.get("json")
@@ -762,10 +675,10 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build();
let mut doc = TantivyDocument::default();
let json_val: BTreeMap<String, crate::schema::OwnedValue> =
let mut doc = Document::default();
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(r#"{"mykey": "repeated token token"}"#).unwrap();
doc.add_object(json_field, json_val);
doc.add_json_object(json_field, json_val);
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc).unwrap();
@@ -889,10 +802,11 @@ mod tests {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let doc = TantivyDocument::parse_json(&schema, r#"{"text": [ "bbb", "aaa", "", "aaa"]}"#)
let doc = schema
.parse_document(r#"{"text": [ "bbb", "aaa", "", "aaa"]}"#)
.unwrap();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
// On debug this did panic on the underflow
index_writer.commit().unwrap();
@@ -917,7 +831,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "roller-coaster".to_string(),
@@ -932,7 +846,7 @@ mod tests {
doc.add_pre_tokenized_text(text, tokens.clone());
doc.add_pre_tokenized_text(text, tokens);
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
@@ -955,7 +869,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life.
@@ -980,7 +894,7 @@ mod tests {
doc.add_pre_tokenized_text(text, tokens);
doc.add_text(text, "hello");
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
@@ -1016,7 +930,7 @@ mod tests {
let schema = index.schema();
let mut index_writer = index.writer(50_000_000).unwrap();
let title = schema.get_field("title").unwrap();
let mut document = TantivyDocument::default();
let mut document = Document::default();
document.add_text(title, "The Old Man and the Sea");
index_writer.add_document(document).unwrap();
let error = index_writer.commit().unwrap_err();
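`SegmentWriter::add_document` is crate-internal; as its doc comment above notes, users go through `IndexWriter::add_document`, which buffers documents into segment writers and flushes them once the memory budget is exceeded. A minimal sketch of that user-facing flow, written against the 0.21-style API this diff moves to (the field name and the 50 MB budget are arbitrary choices for illustration):
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index};

fn index_one_doc() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());
    // The memory budget is split between the indexing threads.
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(title => "The Old Man and the Sea"))?;
    // Documents only become searchable after the commit.
    writer.commit()?;
    Ok(())
}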

View File

@@ -21,7 +21,7 @@
//! # use tantivy::collector::TopDocs;
//! # use tantivy::query::QueryParser;
//! # use tantivy::schema::*;
//! # use tantivy::{doc, DocAddress, Index, IndexWriter, Score};
//! # use tantivy::{doc, DocAddress, Index, Score};
//! #
//! # fn main() {
//! # // Let's create a temporary directory for the
@@ -53,7 +53,7 @@
//!
//! // Here we use a buffer of 100MB that will be split
//! // between indexing threads.
//! let mut index_writer: IndexWriter = index.writer(100_000_000)?;
//! let mut index_writer = index.writer(100_000_000)?;
//!
//! // Let's index one document!
//! index_writer.add_document(doc!(
@@ -89,8 +89,8 @@
//!
//! for (_score, doc_address) in top_docs {
//! // Retrieve the actual content of documents given its `doc_address`.
//! let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
//! println!("{}", retrieved_doc.to_json(&schema));
//! let retrieved_doc = searcher.doc(doc_address)?;
//! println!("{}", schema.to_json(&retrieved_doc));
//! }
//!
//! # Ok(())
@@ -103,48 +103,7 @@
//! the example code (
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
//! [source code](https://github.com/quickwit-oss/tantivy/blob/main/examples/basic_search.rs))
//!
//! # Tantivy Architecture Overview
//!
//! Tantivy is inspired by Lucene, the Architecture is very similar.
//!
//! ## Core Concepts
//!
//! - **[Index]**: A collection of segments. The top level entry point for tantivy users to search
//! and index data.
//!
//! - **[Segment]**: At the heart of Tantivy's indexing structure is the [Segment]. It contains
//! documents and indices and is the atomic unit of indexing and search.
//!
//! - **[Schema](schema)**: A schema is a set of fields in an index. Each field has a specific data
//! type and set of attributes.
//!
//! - **[IndexWriter]**: Responsible for creating and merging segments. It executes the indexing
//! pipeline including tokenization, creating indices, and storing the index in the
//! [Directory](directory).
//!
//! - **Searching**: [Searcher] searches the segments with anything that implements
//!   [Query](query::Query) and merges the results. See the list of [supported
//!   queries](query::Query#implementors). Custom queries are supported by implementing the
//! [Query](query::Query) trait.
//!
//! - **[Directory](directory)**: Abstraction over the storage where the index data is stored.
//!
//! - **[Tokenizer](tokenizer)**: Breaks down text into individual tokens. Users can implement or
//! use provided tokenizers.
//!
//! ## Architecture Flow
//!
//! 1. **Document Addition**: Users create documents according to the defined schema. The documents'
//!    fields are tokenized, processed, and added to the current segment. See
//! [Document](schema::document) for the structure and usage.
//!
//! 2. **Segment Creation**: Once the memory limit threshold is reached or a commit is called, the
//! segment is written to the Directory. Documents are searchable after `commit`.
//!
//! 3. **Merging**: To optimize space and search speed, segments might be merged. This operation is
//! performed in the background. Customize the merge behaviour via
//! [IndexWriter::set_merge_policy].
#[cfg_attr(test, macro_use)]
extern crate serde_json;
#[macro_use]
@@ -178,7 +137,7 @@ pub use crate::future_result::FutureResult;
pub type Result<T> = std::result::Result<T, TantivyError>;
mod core;
pub mod indexer;
mod indexer;
#[allow(unused_doc_comments)]
pub mod error;
@@ -202,7 +161,8 @@ pub mod termdict;
mod reader;
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer};
pub mod snippet;
mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator};
mod docset;
use std::fmt;
@@ -213,34 +173,26 @@ use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
pub use self::docset::{DocSet, TERMINATED};
#[deprecated(
since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use export from snippet module instead"
)]
pub use self::snippet::{Snippet, SnippetGenerator};
#[doc(hidden)]
pub use crate::core::json_utils;
pub use crate::core::{
merge_field_meta_data, Executor, FieldMetadata, Index, IndexBuilder, IndexMeta, IndexSettings,
IndexSortByField, InvertedIndexReader, Order, Searcher, SearcherGeneration, Segment,
SegmentComponent, SegmentId, SegmentMeta, SegmentReader, SingleSegmentIndexWriter,
Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader,
Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta,
SegmentReader, SingleSegmentIndexWriter,
};
pub use crate::directory::Directory;
pub use crate::indexer::IndexWriter;
#[deprecated(
since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use export from indexer module instead"
)]
pub use crate::indexer::{merge_filtered_segments, merge_indices, PreparedCommit};
pub use crate::indexer::operation::UserOperation;
pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, PreparedCommit};
pub use crate::postings::Postings;
#[allow(deprecated)]
pub use crate::schema::DatePrecision;
pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocument, Term};
pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 6;
/// Oldest index format version this tantivy version can read.
const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;
const INDEX_FORMAT_VERSION: u32 = 5;
#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
/// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -390,9 +342,8 @@ pub mod tests {
use crate::docset::{DocSet, TERMINATED};
use crate::merge_policy::NoMergePolicy;
use crate::query::BooleanQuery;
use crate::schema::document::Value;
use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, Postings, ReloadPolicy};
use crate::{DateTime, DocAddress, Index, Postings, ReloadPolicy};
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
@@ -463,7 +414,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?;
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc)?;
@@ -485,7 +436,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"))?;
@@ -512,7 +463,7 @@ pub mod tests {
let title_field = schema_builder.add_text_field("title", TEXT);
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
let index_reader = index.reader()?;
@@ -534,7 +485,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(text_field=>"a b"))?;
@@ -577,7 +528,7 @@ pub mod tests {
.unwrap();
{
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"))?;
// 1
@@ -624,7 +575,7 @@ pub mod tests {
}
{
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"))?;
// 1
@@ -661,7 +612,7 @@ pub mod tests {
}
{
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?;
@@ -711,7 +662,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64))?;
index_writer.commit()?;
let reader = index.reader()?;
@@ -734,7 +685,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val))?;
index_writer.commit()?;
@@ -758,7 +709,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val))?;
index_writer.commit()?;
@@ -782,7 +733,7 @@ pub mod tests {
let absent_field = schema_builder.add_text_field("absent_text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"))?;
assert!(index_writer.commit().is_ok());
let reader = index.reader()?;
@@ -805,7 +756,7 @@ pub mod tests {
.try_into()?;
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"63"))?;
index_writer.add_document(doc!(text_field=>"70"))?;
index_writer.add_document(doc!(text_field=>"34"))?;
@@ -830,7 +781,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af bc bc"))?;
index_writer.commit()?;
}
@@ -862,7 +813,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af b"))?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!(text_field=>"a b c d"))?;
@@ -926,7 +877,7 @@ pub mod tests {
.try_into()?;
assert_eq!(reader.searcher().num_docs(), 0u64);
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af b"))?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!(text_field=>"a b c d"))?;
@@ -1034,13 +985,13 @@ pub mod tests {
text_field => "some other value",
other_text_field => "short");
assert_eq!(document.len(), 3);
let values: Vec<&OwnedValue> = document.get_all(text_field).collect();
let values: Vec<&Value> = document.get_all(text_field).collect();
assert_eq!(values.len(), 2);
assert_eq!(values[0].as_str(), Some("tantivy"));
assert_eq!(values[1].as_str(), Some("some other value"));
let values: Vec<&OwnedValue> = document.get_all(other_text_field).collect();
assert_eq!(values[0].as_text(), Some("tantivy"));
assert_eq!(values[1].as_text(), Some("some other value"));
let values: Vec<&Value> = document.get_all(other_text_field).collect();
assert_eq!(values.len(), 1);
assert_eq!(values[0].as_str(), Some("short"));
assert_eq!(values[0].as_text(), Some("short"));
}
#[test]
@@ -1054,7 +1005,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
{
let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
@@ -1120,7 +1071,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let index_reader = index.reader()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT {
@@ -1173,7 +1124,7 @@ pub mod tests {
let body = builder.add_text_field("body", TEXT | STORED);
let schema = builder.build();
let index = Index::create_in_dir(&index_path, schema)?;
let mut writer: IndexWriter = index.writer(50_000_000)?;
let mut writer = index.writer(50_000_000)?;
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..5000 {
writer.add_document(doc!(body => "foo"))?;

View File

@@ -45,12 +45,12 @@
macro_rules! doc(
() => {
{
($crate::TantivyDocument::default())
($crate::Document::default())
}
}; // avoids a warning due to the useless `mut`.
($($field:expr => $value:expr),*) => {
{
let mut document = $crate::TantivyDocument::default();
let mut document = $crate::Document::default();
$(
document.add_field_value($field, $value);
)*
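A short usage note on the macro above: every `field => value` pair is forwarded to `add_field_value`, so repeating a field produces a multi-valued document, and `doc!()` with no arguments yields an empty one. A minimal sketch in which `title` and `year` are hypothetical field handles assumed to come from a schema built elsewhere:
use tantivy::doc;
use tantivy::schema::Field;
use tantivy::Document;

fn example_docs(title: Field, year: Field) -> (Document, Document) {
    let empty = doc!();
    // Repeating a field yields a multi-valued document.
    let book = doc!(title => "Frankenstein", title => "The Modern Prometheus", year => 1818u64);
    (empty, book)
}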

View File

@@ -92,7 +92,7 @@ impl PositionReader {
// that block is bitpacked.
let bit_width = bit_widths[block_rel_id];
self.block_decoder
.uncompress_block_unsorted(compressed_data, bit_width, false);
.uncompress_block_unsorted(compressed_data, bit_width);
} else {
// that block is vint encoded.
self.block_decoder

View File

@@ -62,9 +62,8 @@ impl<W: io::Write> PositionSerializer<W> {
return;
}
if self.block.len() == COMPRESSION_BLOCK_SIZE {
let (bit_width, block_encoded): (u8, &[u8]) = self
.block_encoder
.compress_block_unsorted(&self.block[..], false);
let (bit_width, block_encoded): (u8, &[u8]) =
self.block_encoder.compress_block_unsorted(&self.block[..]);
self.bit_widths.push(bit_width);
self.positions_buffer.extend(block_encoded);
} else {
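Read together, the reader and serializer hunks above describe one layout: positions are accumulated into blocks of COMPRESSION_BLOCK_SIZE values (128 in tantivy, as the tests elsewhere in this diff use), each full block is bitpacked with its own bit width, and the leftover tail is vint-encoded. A minimal sketch of just the block-splitting rule (plain std Rust; the actual encoding calls are the `BlockEncoder` methods shown further down in this diff):
const COMPRESSION_BLOCK_SIZE: usize = 128;

/// Splits a position buffer into full blocks destined for bitpacking and the
/// vint-encoded remainder, mirroring the branch in the serializer hunk above.
fn split_into_blocks(positions: &[u32]) -> (Vec<&[u32]>, &[u32]) {
    let full_len = positions.len() / COMPRESSION_BLOCK_SIZE * COMPRESSION_BLOCK_SIZE;
    let full_blocks: Vec<&[u32]> = positions[..full_len]
        .chunks_exact(COMPRESSION_BLOCK_SIZE)
        .collect();
    (full_blocks, &positions[full_len..])
}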

View File

@@ -24,13 +24,13 @@ fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
#[derive(Clone)]
pub struct BlockSegmentPostings {
pub(crate) doc_decoder: BlockDecoder,
block_loaded: bool,
loaded_offset: usize,
freq_decoder: BlockDecoder,
freq_reading_option: FreqReadingOption,
block_max_score_cache: Option<Score>,
doc_freq: u32,
data: OwnedBytes,
skip_reader: SkipReader,
pub(crate) skip_reader: SkipReader,
}
fn decode_bitpacked_block(
@@ -40,16 +40,10 @@ fn decode_bitpacked_block(
doc_offset: DocId,
doc_num_bits: u8,
tf_num_bits: u8,
strict_delta: bool,
) {
let num_consumed_bytes =
doc_decoder.uncompress_block_sorted(data, doc_offset, doc_num_bits, strict_delta);
let num_consumed_bytes = doc_decoder.uncompress_block_sorted(data, doc_offset, doc_num_bits);
if let Some(freq_decoder) = freq_decoder_opt {
freq_decoder.uncompress_block_unsorted(
&data[num_consumed_bytes..],
tf_num_bits,
strict_delta,
);
freq_decoder.uncompress_block_unsorted(&data[num_consumed_bytes..], tf_num_bits);
}
}
@@ -63,15 +57,11 @@ fn decode_vint_block(
let num_consumed_bytes =
doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs, TERMINATED);
if let Some(freq_decoder) = freq_decoder_opt {
// if it's a json term with freq, containing less than 256 docs, we can reach here thinking
// we have a freq, despite not really having one.
if data.len() > num_consumed_bytes {
freq_decoder.uncompress_vint_unsorted(
&data[num_consumed_bytes..],
num_vint_docs,
TERMINATED,
);
}
freq_decoder.uncompress_vint_unsorted(
&data[num_consumed_bytes..],
num_vint_docs,
TERMINATED,
);
}
}
@@ -88,46 +78,28 @@ fn split_into_skips_and_postings(
}
impl BlockSegmentPostings {
/// Opens a `BlockSegmentPostings`.
/// `doc_freq` is the number of documents in the posting list.
/// `record_option` represents the amount of data available according to the schema.
/// `requested_option` is the amount of data requested by the user.
/// If, for instance, we do not request term frequencies, this function will not decompress
/// term frequency blocks.
pub(crate) fn open(
doc_freq: u32,
data: FileSlice,
mut record_option: IndexRecordOption,
record_option: IndexRecordOption,
requested_option: IndexRecordOption,
) -> io::Result<BlockSegmentPostings> {
let bytes = data.read_bytes()?;
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?;
let skip_reader = match skip_data_opt {
Some(skip_data) => {
let block_count = doc_freq as usize / COMPRESSION_BLOCK_SIZE;
// 8 is the minimum size of a block with frequency (can be more if pos are stored
// too)
if skip_data.len() < 8 * block_count {
// the field might be encoded with frequency, but this term in particular isn't.
// This can happen for JSON field with term frequencies:
// - text terms are encoded with term freqs.
// - numerical terms are encoded without term freqs.
record_option = IndexRecordOption::Basic;
}
SkipReader::new(skip_data, doc_freq, record_option)
}
None => SkipReader::new(OwnedBytes::empty(), doc_freq, record_option),
};
let freq_reading_option = match (record_option, requested_option) {
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
(_, _) => FreqReadingOption::ReadFreq,
};
let bytes = data.read_bytes()?;
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?;
let skip_reader = match skip_data_opt {
Some(skip_data) => SkipReader::new(skip_data, doc_freq, record_option),
None => SkipReader::new(OwnedBytes::empty(), doc_freq, record_option),
};
let mut block_segment_postings = BlockSegmentPostings {
doc_decoder: BlockDecoder::with_val(TERMINATED),
block_loaded: false,
loaded_offset: usize::MAX,
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option,
block_max_score_cache: None,
@@ -197,7 +169,7 @@ impl BlockSegmentPostings {
split_into_skips_and_postings(doc_freq, postings_data)?;
self.data = postings_data;
self.block_max_score_cache = None;
self.block_loaded = false;
self.loaded_offset = usize::MAX;
if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data, doc_freq);
} else {
@@ -293,23 +265,22 @@ impl BlockSegmentPostings {
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
if self.skip_reader.seek(target_doc) {
self.block_max_score_cache = None;
self.block_loaded = false;
}
}
pub(crate) fn block_is_loaded(&self) -> bool {
self.block_loaded
self.loaded_offset == self.skip_reader.byte_offset()
}
pub(crate) fn load_block(&mut self) {
let offset = self.skip_reader.byte_offset();
if self.block_is_loaded() {
if self.loaded_offset == offset {
return;
}
self.loaded_offset = offset;
match self.skip_reader.block_info() {
BlockInfo::BitPacked {
doc_num_bits,
strict_delta_encoded,
tf_num_bits,
..
} => {
@@ -324,7 +295,6 @@ impl BlockSegmentPostings {
self.skip_reader.last_doc_in_previous_block,
doc_num_bits,
tf_num_bits,
strict_delta_encoded,
);
}
BlockInfo::VInt { num_docs } => {
@@ -348,13 +318,13 @@ impl BlockSegmentPostings {
);
}
}
self.block_loaded = true;
}
/// Advance to the next block.
pub fn advance(&mut self) {
self.skip_reader.advance();
self.block_loaded = false;
self.block_max_score_cache = None;
self.load_block();
}
@@ -363,7 +333,7 @@ impl BlockSegmentPostings {
pub fn empty() -> BlockSegmentPostings {
BlockSegmentPostings {
doc_decoder: BlockDecoder::with_val(TERMINATED),
block_loaded: true,
loaded_offset: 0,
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option: FreqReadingOption::NoFreq,
block_max_score_cache: None,
@@ -372,10 +342,6 @@ impl BlockSegmentPostings {
skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic),
}
}
pub(crate) fn skip_reader(&self) -> &SkipReader {
&self.skip_reader
}
}
#[cfg(test)]

View File

@@ -33,40 +33,14 @@ impl BlockEncoder {
}
pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> (u8, &[u8]) {
// if offset is zero, convert it to None. This is correct as long as we do the same when
// decompressing. It's required in case the block starts with an actual zero.
let offset = if offset == 0u32 { None } else { Some(offset) };
let num_bits = self.bitpacker.num_bits_strictly_sorted(offset, block);
let num_bits = self.bitpacker.num_bits_sorted(offset, block);
let written_size =
self.bitpacker
.compress_strictly_sorted(offset, block, &mut self.output[..], num_bits);
.compress_sorted(offset, block, &mut self.output[..], num_bits);
(num_bits, &self.output[..written_size])
}
/// Compress a single block of unsorted numbers.
///
/// If `minus_one_encoded` is set, each value must be >= 1, and will be encoded in a slightly
/// more compact format. This is useful for some values where 0 isn't a correct value, such
/// as term frequency, but isn't correct for some usages like position lists, where 0 can
/// appear.
pub fn compress_block_unsorted(
&mut self,
block: &[u32],
minus_one_encoded: bool,
) -> (u8, &[u8]) {
debug_assert!(!minus_one_encoded || !block.contains(&0));
let mut block_minus_one = [0; COMPRESSION_BLOCK_SIZE];
let block = if minus_one_encoded {
for (elem_min_one, elem) in block_minus_one.iter_mut().zip(block) {
*elem_min_one = elem - 1;
}
&block_minus_one
} else {
block
};
pub fn compress_block_unsorted(&mut self, block: &[u32]) -> (u8, &[u8]) {
let num_bits = self.bitpacker.num_bits(block);
let written_size = self
.bitpacker
@@ -97,55 +71,21 @@ impl BlockDecoder {
}
}
/// Decompress block of sorted integers.
///
/// `strict_delta` depends on what encoding was used. Older versions of tantivy never use strict
/// deltas; newer versions always use them.
pub fn uncompress_block_sorted(
&mut self,
compressed_data: &[u8],
offset: u32,
num_bits: u8,
strict_delta: bool,
) -> usize {
if strict_delta {
let offset = std::num::NonZeroU32::new(offset).map(std::num::NonZeroU32::get);
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker.decompress_strictly_sorted(
offset,
compressed_data,
&mut self.output,
num_bits,
)
} else {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress_sorted(offset, compressed_data, &mut self.output, num_bits)
}
}
/// Decompress block of unsorted integers.
///
/// `minus_one_encoded` depends on what encoding was used. Older versions of tantivy never use
/// that encoding. Newer versions use it for some structures, but not all. See the corresponding
/// call to `BlockEncoder::compress_block_unsorted`.
pub fn uncompress_block_unsorted(
&mut self,
compressed_data: &[u8],
num_bits: u8,
minus_one_encoded: bool,
) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
let res = self
.bitpacker
.decompress(compressed_data, &mut self.output, num_bits);
if minus_one_encoded {
for val in &mut self.output {
*val += 1;
}
}
res
self.bitpacker
.decompress_sorted(offset, compressed_data, &mut self.output, num_bits)
}
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress(compressed_data, &mut self.output, num_bits)
}
#[inline]
@@ -278,8 +218,7 @@ pub mod tests {
let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 0);
let mut decoder = BlockDecoder::default();
{
let consumed_num_bytes =
decoder.uncompress_block_sorted(compressed_data, 0, num_bits, true);
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0, num_bits);
assert_eq!(consumed_num_bytes, compressed_data.len());
}
for i in 0..128 {
@@ -294,8 +233,7 @@ pub mod tests {
let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 10);
let mut decoder = BlockDecoder::default();
{
let consumed_num_bytes =
decoder.uncompress_block_sorted(compressed_data, 10, num_bits, true);
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10, num_bits);
assert_eq!(consumed_num_bytes, compressed_data.len());
}
for i in 0..128 {
@@ -314,8 +252,7 @@ pub mod tests {
compressed.push(173u8);
let mut decoder = BlockDecoder::default();
{
let consumed_num_bytes =
decoder.uncompress_block_sorted(&compressed, 10, num_bits, true);
let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10, num_bits);
assert_eq!(consumed_num_bytes, compressed.len() - 1);
assert_eq!(compressed[consumed_num_bytes], 173u8);
}
@@ -326,25 +263,21 @@ pub mod tests {
#[test]
fn test_encode_unsorted_block_with_junk() {
for minus_one_encode in [false, true] {
let mut compressed: Vec<u8> = Vec::new();
let n = 128;
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
let mut encoder = BlockEncoder::default();
let (num_bits, compressed_data) =
encoder.compress_block_unsorted(&vals, minus_one_encode);
compressed.extend_from_slice(compressed_data);
compressed.push(173u8);
let mut decoder = BlockDecoder::default();
{
let consumed_num_bytes =
decoder.uncompress_block_unsorted(&compressed, num_bits, minus_one_encode);
assert_eq!(consumed_num_bytes + 1, compressed.len());
assert_eq!(compressed[consumed_num_bytes], 173u8);
}
for i in 0..n {
assert_eq!(vals[i], decoder.output(i));
}
let mut compressed: Vec<u8> = Vec::new();
let n = 128;
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
let mut encoder = BlockEncoder::default();
let (num_bits, compressed_data) = encoder.compress_block_unsorted(&vals);
compressed.extend_from_slice(compressed_data);
compressed.push(173u8);
let mut decoder = BlockDecoder::default();
{
let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed, num_bits);
assert_eq!(consumed_num_bytes + 1, compressed.len());
assert_eq!(compressed[consumed_num_bytes], 173u8);
}
for i in 0..n {
assert_eq!(vals[i], decoder.output(i));
}
}
@@ -411,7 +344,7 @@ mod bench {
let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::default();
b.iter(|| {
decoder.uncompress_block_sorted(compressed, 0u32, num_bits, true);
decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
});
}
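The two encodings removed in this hunk both shave bits by exploiting an invariant: strictly increasing doc ids have consecutive deltas of at least 1, and term frequencies are always at least 1, so the encoder can subtract one before bitpacking and the decoder adds it back. A minimal sketch of the transforms themselves, leaving the bitpacking step aside (plain Rust, not the `BlockEncoder`/`BlockDecoder` API):
// Strictly increasing doc ids: store (delta - 1) for each consecutive pair.
// Handling of the first value / block offset is elided; see the `offset == 0 => None`
// comment in the hunk above.
fn strict_deltas(docs: &[u32]) -> Vec<u32> {
    docs.windows(2).map(|w| w[1] - w[0] - 1).collect()
}

// Term frequencies are always >= 1: store (tf - 1).
fn minus_one_encode(freqs: &[u32]) -> Vec<u32> {
    freqs.iter().map(|&tf| tf - 1).collect()
}

// Decoding reverses the shifts: next_doc = prev_doc + stored + 1, tf = stored + 1.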

View File

@@ -1,7 +1,5 @@
use stacker::{ArenaHashMap, MemoryArena};
use crate::indexer::path_to_unordered_id::PathToUnorderedId;
/// IndexingContext contains all of the transient memory arenas
/// required for building the inverted index.
pub(crate) struct IndexingContext {
@@ -10,7 +8,6 @@ pub(crate) struct IndexingContext {
pub term_index: ArenaHashMap,
/// Arena is a memory arena that stores posting lists / term frequencies / positions.
pub arena: MemoryArena,
pub path_to_unordered_id: PathToUnorderedId,
}
impl IndexingContext {
@@ -20,7 +17,6 @@ impl IndexingContext {
IndexingContext {
arena: MemoryArena::default(),
term_index,
path_to_unordered_id: PathToUnorderedId::default(),
}
}

View File

@@ -3,18 +3,13 @@ use std::io;
use stacker::Addr;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::path_to_unordered_id::OrderedPathId;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::{Field, Type, JSON_END_OF_PATH};
use crate::schema::Type;
use crate::tokenizer::TokenStream;
use crate::{DocId, Term};
/// The `JsonPostingsWriter` is odd in that it relies on a hidden contract:
///
/// `subscribe` is called directly to index non-text tokens, while
/// `index_text` is used to index text.
#[derive(Default)]
pub(crate) struct JsonPostingsWriter<Rec: Recorder> {
str_posting_writer: SpecializedPostingsWriter<Rec>,
@@ -59,24 +54,18 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
term_addrs: &[(Term<&[u8]>, Addr)],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let mut term_buffer = Term::with_capacity(48);
let mut buffer_lender = BufferLender::default();
for (_field, path_id, term, addr) in term_addrs {
term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
term_buffer.append_bytes(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
term_buffer.append_bytes(&[JSON_END_OF_PATH]);
term_buffer.append_bytes(term);
if let Some(json_value) = term_buffer.value().as_json_value_bytes() {
for (term, addr) in term_addrs {
if let Some(json_value) = term.value().as_json_value_bytes() {
let typ = json_value.typ();
if typ == Type::Str {
SpecializedPostingsWriter::<Rec>::serialize_one_term(
term_buffer.serialized_value_bytes(),
term,
*addr,
doc_id_map,
&mut buffer_lender,
@@ -85,7 +74,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
)?;
} else {
SpecializedPostingsWriter::<DocIdRecorder>::serialize_one_term(
term_buffer.serialized_value_bytes(),
term,
*addr,
doc_id_map,
&mut buffer_lender,
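The serialize loop removed in this hunk reassembles each JSON term from three parts: the path string recovered through its ordered path id, the `JSON_END_OF_PATH` marker, and the already-encoded value bytes (which include the value's type). A minimal sketch of that key layout alone, with the separator passed in rather than hard-coded (a placeholder helper, not a tantivy API):
fn json_term_key(path: &str, end_of_path: u8, encoded_value: &[u8]) -> Vec<u8> {
    let mut key = Vec::with_capacity(path.len() + 1 + encoded_value.len());
    key.extend_from_slice(path.as_bytes());
    key.push(end_of_path); // the crate's JSON_END_OF_PATH byte
    key.extend_from_slice(encoded_value);
    key
}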

View File

@@ -52,7 +52,7 @@ pub mod tests {
Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions, INDEXED, TEXT,
};
use crate::tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
use crate::{DocId, HasLen, IndexWriter, Score};
use crate::{DocId, HasLen, Score};
#[test]
pub fn test_position_write() -> crate::Result<()> {
@@ -63,7 +63,7 @@ pub mod tests {
let mut segment = index.new_segment();
let mut posting_serializer = InvertedIndexSerializer::open(&mut segment)?;
let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4, None)?;
field_serializer.new_term("abc".as_bytes(), 12u32, true)?;
field_serializer.new_term("abc".as_bytes(), 12u32)?;
for doc_id in 0u32..120u32 {
let delta_positions = vec![1, 2, 3, 2];
field_serializer.write_doc(doc_id, 4, &delta_positions);
@@ -432,7 +432,7 @@ pub mod tests {
// delete some of the documents
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok());
}
@@ -483,7 +483,7 @@ pub mod tests {
// delete everything else
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok());
}
@@ -568,8 +568,8 @@ mod bench {
use crate::docset::TERMINATED;
use crate::query::Intersection;
use crate::schema::{Field, IndexRecordOption, Schema, TantivyDocument, Term, STRING};
use crate::{tests, DocSet, Index, IndexWriter};
use crate::schema::{Document, Field, IndexRecordOption, Schema, Term, STRING};
use crate::{tests, DocSet, Index};
pub static TERM_A: Lazy<Term> = Lazy::new(|| {
let field = Field::from_field_id(0);
@@ -598,9 +598,9 @@ mod bench {
let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000;
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for _ in 0..posting_list_size {
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
if rng.gen_bool(1f64 / 15f64) {
doc.add_text(text_field, "a");
}

Some files were not shown because too many files have changed in this diff.