Compare commits

..

1 Commit

Author        SHA1        Message  Date
Paul Masurel  3a8a83da80  tracing  2023-10-16 19:23:47 +09:00
176 changed files with 3131 additions and 8990 deletions

View File

@@ -3,6 +3,8 @@ name: Coverage
on: on:
push: push:
branches: [main] branches: [main]
pull_request:
branches: [main]
# Ensures that we cancel running jobs for the same PR / same workflow. # Ensures that we cancel running jobs for the same PR / same workflow.
concurrency: concurrency:
@@ -15,11 +17,11 @@ jobs:
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Install Rust - name: Install Rust
run: rustup toolchain install nightly-2023-09-10 --profile minimal --component llvm-tools-preview run: rustup toolchain install nightly --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2 - uses: Swatinem/rust-cache@v2
- uses: taiki-e/install-action@cargo-llvm-cov - uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage - name: Generate code coverage
run: cargo +nightly-2023-09-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info run: cargo +nightly llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
- name: Upload coverage to Codecov - name: Upload coverage to Codecov
uses: codecov/codecov-action@v3 uses: codecov/codecov-action@v3
continue-on-error: true continue-on-error: true

View File

@@ -39,13 +39,6 @@ jobs:
- name: Check Formatting - name: Check Formatting
run: cargo +nightly fmt --all -- --check run: cargo +nightly fmt --all -- --check
- name: Check Stable Compilation
run: cargo build --all-features
- name: Check Bench Compilation
run: cargo +nightly bench --no-run --profile=dev --all-features
- uses: actions-rs/clippy-check@v1 - uses: actions-rs/clippy-check@v1
with: with:

View File

@@ -1,9 +1,3 @@
Tantivy 0.21.1
================================
#### Bugfixes
- Range queries on fast fields with less values on that field than documents had an invalid end condition, leading to missing results. [#2226](https://github.com/quickwit-oss/tantivy/issues/2226)(@appaquet @PSeitz)
- Increase the minimum memory budget from 3MB to 15MB to avoid single doc segments (API fix). [#2176](https://github.com/quickwit-oss/tantivy/issues/2176)(@PSeitz)
Tantivy 0.21 Tantivy 0.21
================================ ================================
#### Bugfixes #### Bugfixes

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy" name = "tantivy"
version = "0.22.0-dev" version = "0.21.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]
@@ -19,37 +19,40 @@ oneshot = "0.1.5"
base64 = "0.21.0" base64 = "0.21.0"
byteorder = "1.4.3" byteorder = "1.4.3"
crc32fast = "1.3.2" crc32fast = "1.3.2"
tracing = "0.1"
once_cell = "1.10.0" once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] } regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0" aho-corasick = "1.0"
tantivy-fst = "0.5" tantivy-fst = "0.4.0"
memmap2 = { version = "0.9.0", optional = true } memmap2 = { version = "0.7.1", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true } lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false } zstd = { version = "0.12", optional = true, default-features = false }
tempfile = { version = "3.3.0", optional = true } tempfile = { version = "3.3.0", optional = true }
log = "0.4.16" log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] } serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79" serde_json = "1.0.79"
num_cpus = "1.13.1" num_cpus = "1.13.1"
fs4 = { version = "0.7.0", optional = true } fs4 = { version = "0.6.3", optional = true }
levenshtein_automata = "0.2.1" levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] } uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4" crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0" rust-stemmers = "1.2.0"
downcast-rs = "1.2.0" downcast-rs = "1.2.0"
bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker4x"] } bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
census = "0.4.0" census = "0.4.0"
rustc-hash = "1.1.0" rustc-hash = "1.1.0"
thiserror = "1.0.30" thiserror = "1.0.30"
htmlescape = "0.3.1" htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true } fail = { version = "0.5.0", optional = true }
murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] } time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0" smallvec = "1.8.0"
rayon = "1.5.2" rayon = "1.5.2"
lru = "0.12.0" lru = "0.11.0"
fastdivide = "0.4.0" fastdivide = "0.4.0"
itertools = "0.12.0" itertools = "0.11.0"
measure_time = "0.8.2" measure_time = "0.8.2"
async-trait = "0.1.53"
arc-swap = "1.5.0" arc-swap = "1.5.0"
columnar = { version= "0.2", path="./columnar", package ="tantivy-columnar" } columnar = { version= "0.2", path="./columnar", package ="tantivy-columnar" }
@@ -61,7 +64,6 @@ common = { version= "0.6", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" } tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] } sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true } futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"
[target.'cfg(windows)'.dependencies] [target.'cfg(windows)'.dependencies]
winapi = "0.3.9" winapi = "0.3.9"
@@ -73,13 +75,15 @@ matches = "0.1.9"
pretty_assertions = "1.2.1" pretty_assertions = "1.2.1"
proptest = "1.0.0" proptest = "1.0.0"
test-log = "0.2.10" test-log = "0.2.10"
env_logger = "0.10.0"
futures = "0.3.21" futures = "0.3.21"
paste = "1.0.11" paste = "1.0.11"
more-asserts = "0.3.1" more-asserts = "0.3.1"
rand_distr = "0.4.3" rand_distr = "0.4.3"
[target.'cfg(not(windows))'.dev-dependencies] [target.'cfg(not(windows))'.dev-dependencies]
criterion = { version = "0.5", default-features = false } criterion = "0.5"
pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
[dev-dependencies.fail] [dev-dependencies.fail]
version = "0.5.0" version = "0.5.0"
@@ -112,11 +116,6 @@ unstable = [] # useful for benches.
quickwit = ["sstable", "futures-util"] quickwit = ["sstable", "futures-util"]
# Compares only the hash of a string when indexing data.
# Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.
# Uses 64bit ahash.
compare_hash_only = ["stacker/compare_hash_only"]
[workspace] [workspace]
members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"] members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"]

View File

@@ -1,99 +1,14 @@
use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion, Throughput}; use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use tantivy::schema::{TantivyDocument, FAST, INDEXED, STORED, STRING, TEXT}; use pprof::criterion::{Output, PProfProfiler};
use tantivy::{tokenizer, Index, IndexWriter}; use tantivy::schema::{FAST, INDEXED, STORED, STRING, TEXT};
use tantivy::Index;
const HDFS_LOGS: &str = include_str!("hdfs.json"); const HDFS_LOGS: &str = include_str!("hdfs.json");
const GH_LOGS: &str = include_str!("gh.json"); const GH_LOGS: &str = include_str!("gh.json");
const WIKI: &str = include_str!("wiki.json"); const WIKI: &str = include_str!("wiki.json");
fn benchmark( fn get_lines(input: &str) -> Vec<&str> {
b: &mut Bencher, input.trim().split('\n').collect()
input: &str,
schema: tantivy::schema::Schema,
commit: bool,
parse_json: bool,
is_dynamic: bool,
) {
if is_dynamic {
benchmark_dynamic_json(b, input, schema, commit, parse_json)
} else {
_benchmark(b, input, schema, commit, parse_json, |schema, doc_json| {
TantivyDocument::parse_json(&schema, doc_json).unwrap()
})
}
}
fn get_index(schema: tantivy::schema::Schema) -> Index {
let mut index = Index::create_in_ram(schema.clone());
let ff_tokenizer_manager = tokenizer::TokenizerManager::default();
ff_tokenizer_manager.register(
"raw",
tokenizer::TextAnalyzer::builder(tokenizer::RawTokenizer::default())
.filter(tokenizer::RemoveLongFilter::limit(255))
.build(),
);
index.set_fast_field_tokenizers(ff_tokenizer_manager.clone());
index
}
fn _benchmark(
b: &mut Bencher,
input: &str,
schema: tantivy::schema::Schema,
commit: bool,
include_json_parsing: bool,
create_doc: impl Fn(&tantivy::schema::Schema, &str) -> TantivyDocument,
) {
if include_json_parsing {
let lines: Vec<&str> = input.trim().split('\n').collect();
b.iter(|| {
let index = get_index(schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = create_doc(&schema, doc_json);
index_writer.add_document(doc).unwrap();
}
if commit {
index_writer.commit().unwrap();
}
})
} else {
let docs: Vec<_> = input
.trim()
.split('\n')
.map(|doc_json| create_doc(&schema, doc_json))
.collect();
b.iter_batched(
|| docs.clone(),
|docs| {
let index = get_index(schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc).unwrap();
}
if commit {
index_writer.commit().unwrap();
}
},
BatchSize::SmallInput,
)
}
}
fn benchmark_dynamic_json(
b: &mut Bencher,
input: &str,
schema: tantivy::schema::Schema,
commit: bool,
parse_json: bool,
) {
let json_field = schema.get_field("json").unwrap();
_benchmark(b, input, schema, commit, parse_json, |_schema, doc_json| {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
tantivy::doc!(json_field=>json_val)
})
} }
pub fn hdfs_index_benchmark(c: &mut Criterion) { pub fn hdfs_index_benchmark(c: &mut Criterion) {
@@ -104,14 +19,7 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
schema_builder.add_text_field("severity", STRING); schema_builder.add_text_field("severity", STRING);
schema_builder.build() schema_builder.build()
}; };
let schema_only_fast = { let schema_with_store = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_u64_field("timestamp", FAST);
schema_builder.add_text_field("body", FAST);
schema_builder.add_text_field("severity", FAST);
schema_builder.build()
};
let _schema_with_store = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new(); let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_u64_field("timestamp", INDEXED | STORED); schema_builder.add_u64_field("timestamp", INDEXED | STORED);
schema_builder.add_text_field("body", TEXT | STORED); schema_builder.add_text_field("body", TEXT | STORED);
@@ -120,39 +28,74 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
}; };
let dynamic_schema = { let dynamic_schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new(); let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", TEXT | FAST); schema_builder.add_json_field("json", TEXT);
schema_builder.build() schema_builder.build()
}; };
let mut group = c.benchmark_group("index-hdfs"); let mut group = c.benchmark_group("index-hdfs");
group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64)); group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64));
group.sample_size(20); group.sample_size(20);
group.bench_function("index-hdfs-no-commit", |b| {
let benches = [ let lines = get_lines(HDFS_LOGS);
("only-indexed-".to_string(), schema, false), b.iter(|| {
//("stored-".to_string(), _schema_with_store, false), let index = Index::create_in_ram(schema.clone());
("only-fast-".to_string(), schema_only_fast, false), let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
("dynamic-".to_string(), dynamic_schema, true), for doc_json in &lines {
]; let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
for (prefix, schema, is_dynamic) in benches {
for commit in [false, true] {
let suffix = if commit { "with-commit" } else { "no-commit" };
for parse_json in [false] {
// for parse_json in [false, true] {
let suffix = if parse_json {
format!("{}-with-json-parsing", suffix)
} else {
format!("{}", suffix)
};
let bench_name = format!("{}{}", prefix, suffix);
group.bench_function(bench_name, |b| {
benchmark(b, HDFS_LOGS, schema.clone(), commit, parse_json, is_dynamic)
});
} }
} })
} });
group.bench_function("index-hdfs-with-commit", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
} }
pub fn gh_index_benchmark(c: &mut Criterion) { pub fn gh_index_benchmark(c: &mut Criterion) {
@@ -161,24 +104,38 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
schema_builder.add_json_field("json", TEXT | FAST); schema_builder.add_json_field("json", TEXT | FAST);
schema_builder.build() schema_builder.build()
}; };
let dynamic_schema_fast = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", FAST);
schema_builder.build()
};
let mut group = c.benchmark_group("index-gh"); let mut group = c.benchmark_group("index-gh");
group.throughput(Throughput::Bytes(GH_LOGS.len() as u64)); group.throughput(Throughput::Bytes(GH_LOGS.len() as u64));
group.bench_function("index-gh-no-commit", |b| { group.bench_function("index-gh-no-commit", |b| {
benchmark_dynamic_json(b, GH_LOGS, dynamic_schema.clone(), false, false) let lines = get_lines(GH_LOGS);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
})
}); });
group.bench_function("index-gh-fast", |b| { group.bench_function("index-gh-with-commit", |b| {
benchmark_dynamic_json(b, GH_LOGS, dynamic_schema_fast.clone(), false, false) let lines = get_lines(GH_LOGS);
}); b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
group.bench_function("index-gh-fast-with-commit", |b| { let index = Index::create_in_ram(dynamic_schema.clone());
benchmark_dynamic_json(b, GH_LOGS, dynamic_schema_fast.clone(), true, false) let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
}); });
} }
@@ -193,10 +150,33 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
group.throughput(Throughput::Bytes(WIKI.len() as u64)); group.throughput(Throughput::Bytes(WIKI.len() as u64));
group.bench_function("index-wiki-no-commit", |b| { group.bench_function("index-wiki-no-commit", |b| {
benchmark_dynamic_json(b, WIKI, dynamic_schema.clone(), false, false) let lines = get_lines(WIKI);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
})
}); });
group.bench_function("index-wiki-with-commit", |b| { group.bench_function("index-wiki-with-commit", |b| {
benchmark_dynamic_json(b, WIKI, dynamic_schema.clone(), true, false) let lines = get_lines(WIKI);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
}); });
} }
@@ -207,12 +187,12 @@ criterion_group! {
} }
criterion_group! { criterion_group! {
name = gh_benches; name = gh_benches;
config = Criterion::default(); config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = gh_index_benchmark targets = gh_index_benchmark
} }
criterion_group! { criterion_group! {
name = wiki_benches; name = wiki_benches;
config = Criterion::default(); config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = wiki_index_benchmark targets = wiki_index_benchmark
} }
criterion_main!(benches, gh_benches, wiki_benches); criterion_main!(benches, gh_benches, wiki_benches);
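
One side of this diff folds the per-schema benchmark functions into a single parametrized helper that, when JSON parsing is excluded from the measurement, uses criterion's iter_batched so that cloning the pre-parsed documents happens in the setup closure rather than in the timed routine. Below is a minimal, self-contained sketch of that pattern; the document stand-ins are hypothetical and only the criterion API is taken from the crate.

use criterion::{criterion_group, criterion_main, BatchSize, Criterion};

fn bench_indexing_without_parsing(c: &mut Criterion) {
    // Stand-in for documents that were parsed ahead of time.
    let docs: Vec<String> = (0..1_000).map(|i| format!("doc-{}", i)).collect();
    c.bench_function("index-docs-no-parse", |b| {
        b.iter_batched(
            || docs.clone(), // setup: cloning is not measured
            |docs| {
                // Timed routine: stand-in for feeding documents to an IndexWriter.
                let mut total_bytes = 0usize;
                for doc in docs {
                    total_bytes += doc.len();
                }
                total_bytes
            },
            BatchSize::SmallInput,
        )
    });
}

criterion_group!(no_parse_benches, bench_indexing_without_parsing);
criterion_main!(no_parse_benches);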

View File

@@ -15,7 +15,7 @@ homepage = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker1x"] } bitpacking = {version="0.8", default-features=false, features = ["bitpacker1x"]}
[dev-dependencies] [dev-dependencies]
rand = "0.8" rand = "0.8"

View File

@@ -367,7 +367,7 @@ mod test {
let mut output: Vec<u32> = Vec::new(); let mut output: Vec<u32> = Vec::new();
for len in [0, 1, 2, 32, 33, 34, 64] { for len in [0, 1, 2, 32, 33, 34, 64] {
for start_idx in 0u32..32u32 { for start_idx in 0u32..32u32 {
output.resize(len, 0); output.resize(len as usize, 0);
bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output); bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
for i in 0..len { for i in 0..len {
let expected = (start_idx + i as u32) & mask; let expected = (start_idx + i as u32) & mask;

View File

@@ -9,7 +9,8 @@ description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"] categories = ["database-implementations", "data-structures", "compression"]
[dependencies] [dependencies]
itertools = "0.12.0" itertools = "0.11.0"
fnv = "1.0.7"
fastdivide = "0.4.0" fastdivide = "0.4.0"
stacker = { version= "0.2", path = "../stacker", package="tantivy-stacker"} stacker = { version= "0.2", path = "../stacker", package="tantivy-stacker"}

View File

@@ -8,6 +8,7 @@ license = "MIT"
columnar = {path="../", package="tantivy-columnar"} columnar = {path="../", package="tantivy-columnar"}
serde_json = "1" serde_json = "1"
serde_json_borrow = {git="https://github.com/PSeitz/serde_json_borrow/"} serde_json_borrow = {git="https://github.com/PSeitz/serde_json_borrow/"}
serde = "1"
[workspace] [workspace]
members = [] members = []

View File

@@ -1,8 +1,3 @@
//! # `column_index`
//!
//! `column_index` provides rank and select operations to associate positions when not all
//! documents have exactly one element.
mod merge; mod merge;
mod multivalued_index; mod multivalued_index;
mod optional_index; mod optional_index;
@@ -46,10 +41,10 @@ impl ColumnIndex {
pub fn is_multivalue(&self) -> bool { pub fn is_multivalue(&self) -> bool {
matches!(self, ColumnIndex::Multivalued(_)) matches!(self, ColumnIndex::Multivalued(_))
} }
/// Returns the cardinality of the column index. // Returns the cardinality of the column index.
/// //
/// By convention, if the column contains no docs, we consider that it is // By convention, if the column contains no docs, we consider that it is
/// full. // full.
#[inline] #[inline]
pub fn get_cardinality(&self) -> Cardinality { pub fn get_cardinality(&self) -> Cardinality {
match self { match self {

View File

@@ -215,12 +215,12 @@ mod bench {
let vals: Vec<RowId> = (0..TOTAL_NUM_VALUES) let vals: Vec<RowId> = (0..TOTAL_NUM_VALUES)
.map(|_| rng.gen_bool(fill_ratio)) .map(|_| rng.gen_bool(fill_ratio))
.enumerate() .enumerate()
.filter(|(_pos, val)| *val) .filter(|(pos, val)| *val)
.map(|(pos, _)| pos as RowId) .map(|(pos, _)| pos as RowId)
.collect(); .collect();
serialize_optional_index(&&vals[..], TOTAL_NUM_VALUES, &mut out).unwrap(); serialize_optional_index(&&vals[..], TOTAL_NUM_VALUES, &mut out).unwrap();
let codec = open_optional_index(OwnedBytes::new(out)).unwrap();
open_optional_index(OwnedBytes::new(out)).unwrap() codec
} }
fn random_range_iterator( fn random_range_iterator(
@@ -242,7 +242,7 @@ mod bench {
} }
fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> { fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> {
let ratio = percent / 100.0; let ratio = percent as f32 / 100.0;
let step_size = (1f32 / ratio) as u32; let step_size = (1f32 / ratio) as u32;
let deviation = step_size - 1; let deviation = step_size - 1;
random_range_iterator(0, num_values, step_size, deviation) random_range_iterator(0, num_values, step_size, deviation)

View File

@@ -30,7 +30,6 @@ impl<'a> SerializableColumnIndex<'a> {
} }
} }
/// Serialize a column index.
pub fn serialize_column_index( pub fn serialize_column_index(
column_index: SerializableColumnIndex, column_index: SerializableColumnIndex,
output: &mut impl Write, output: &mut impl Write,
@@ -52,7 +51,6 @@ pub fn serialize_column_index(
Ok(column_index_num_bytes) Ok(column_index_num_bytes)
} }
/// Open a serialized column index.
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> { pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
if bytes.is_empty() { if bytes.is_empty() {
return Err(io::Error::new( return Err(io::Error::new(

View File

@@ -269,8 +269,7 @@ impl StrOrBytesColumnWriter {
dictionaries: &mut [DictionaryBuilder], dictionaries: &mut [DictionaryBuilder],
arena: &mut MemoryArena, arena: &mut MemoryArena,
) { ) {
let unordered_id = let unordered_id = dictionaries[self.dictionary_id as usize].get_or_allocate_id(bytes);
dictionaries[self.dictionary_id as usize].get_or_allocate_id(bytes, arena);
self.column_writer.record(doc, unordered_id, arena); self.column_writer.record(doc, unordered_id, arena);
} }

View File

@@ -338,7 +338,7 @@ impl ColumnarWriter {
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
.numerical_field_hash_map .numerical_field_hash_map
.iter() .iter()
.map(|(column_name, addr)| { .map(|(column_name, addr, _)| {
let numerical_column_writer: NumericalColumnWriter = let numerical_column_writer: NumericalColumnWriter =
self.numerical_field_hash_map.read(addr); self.numerical_field_hash_map.read(addr);
let column_type = numerical_column_writer.numerical_type().into(); let column_type = numerical_column_writer.numerical_type().into();
@@ -348,27 +348,27 @@ impl ColumnarWriter {
columns.extend( columns.extend(
self.bytes_field_hash_map self.bytes_field_hash_map
.iter() .iter()
.map(|(term, addr)| (term, ColumnType::Bytes, addr)), .map(|(term, addr, _)| (term, ColumnType::Bytes, addr)),
); );
columns.extend( columns.extend(
self.str_field_hash_map self.str_field_hash_map
.iter() .iter()
.map(|(column_name, addr)| (column_name, ColumnType::Str, addr)), .map(|(column_name, addr, _)| (column_name, ColumnType::Str, addr)),
); );
columns.extend( columns.extend(
self.bool_field_hash_map self.bool_field_hash_map
.iter() .iter()
.map(|(column_name, addr)| (column_name, ColumnType::Bool, addr)), .map(|(column_name, addr, _)| (column_name, ColumnType::Bool, addr)),
); );
columns.extend( columns.extend(
self.ip_addr_field_hash_map self.ip_addr_field_hash_map
.iter() .iter()
.map(|(column_name, addr)| (column_name, ColumnType::IpAddr, addr)), .map(|(column_name, addr, _)| (column_name, ColumnType::IpAddr, addr)),
); );
columns.extend( columns.extend(
self.datetime_field_hash_map self.datetime_field_hash_map
.iter() .iter()
.map(|(column_name, addr)| (column_name, ColumnType::DateTime, addr)), .map(|(column_name, addr, _)| (column_name, ColumnType::DateTime, addr)),
); );
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type)); columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
@@ -437,7 +437,6 @@ impl ColumnarWriter {
&mut symbol_byte_buffer, &mut symbol_byte_buffer,
), ),
buffers, buffers,
&self.arena,
&mut column_serializer, &mut column_serializer,
)?; )?;
column_serializer.finalize()?; column_serializer.finalize()?;
@@ -491,7 +490,6 @@ impl ColumnarWriter {
// Serialize [Dictionary, Column, dictionary num bytes U32::LE] // Serialize [Dictionary, Column, dictionary num bytes U32::LE]
// Column: [Column Index, Column Values, column index num bytes U32::LE] // Column: [Column Index, Column Values, column index num bytes U32::LE]
#[allow(clippy::too_many_arguments)]
fn serialize_bytes_or_str_column( fn serialize_bytes_or_str_column(
cardinality: Cardinality, cardinality: Cardinality,
num_docs: RowId, num_docs: RowId,
@@ -499,7 +497,6 @@ fn serialize_bytes_or_str_column(
dictionary_builder: &DictionaryBuilder, dictionary_builder: &DictionaryBuilder,
operation_it: impl Iterator<Item = ColumnOperation<UnorderedId>>, operation_it: impl Iterator<Item = ColumnOperation<UnorderedId>>,
buffers: &mut SpareBuffers, buffers: &mut SpareBuffers,
arena: &MemoryArena,
wrt: impl io::Write, wrt: impl io::Write,
) -> io::Result<()> { ) -> io::Result<()> {
let SpareBuffers { let SpareBuffers {
@@ -508,8 +505,7 @@ fn serialize_bytes_or_str_column(
.. ..
} = buffers; } = buffers;
let mut counting_writer = CountingWriter::wrap(wrt); let mut counting_writer = CountingWriter::wrap(wrt);
let term_id_mapping: TermIdMapping = let term_id_mapping: TermIdMapping = dictionary_builder.serialize(&mut counting_writer)?;
dictionary_builder.serialize(arena, &mut counting_writer)?;
let dictionary_num_bytes: u32 = counting_writer.written_bytes() as u32; let dictionary_num_bytes: u32 = counting_writer.written_bytes() as u32;
let mut wrt = counting_writer.finish(); let mut wrt = counting_writer.finish();
let operation_iterator = operation_it.map(|symbol: ColumnOperation<UnorderedId>| { let operation_iterator = operation_it.map(|symbol: ColumnOperation<UnorderedId>| {

View File

@@ -1,7 +1,7 @@
use std::io; use std::io;
use fnv::FnvHashMap;
use sstable::SSTable; use sstable::SSTable;
use stacker::{MemoryArena, SharedArenaHashMap};
pub(crate) struct TermIdMapping { pub(crate) struct TermIdMapping {
unordered_to_ord: Vec<OrderedId>, unordered_to_ord: Vec<OrderedId>,
@@ -31,38 +31,29 @@ pub struct OrderedId(pub u32);
/// mapping. /// mapping.
#[derive(Default)] #[derive(Default)]
pub(crate) struct DictionaryBuilder { pub(crate) struct DictionaryBuilder {
dict: SharedArenaHashMap, dict: FnvHashMap<Vec<u8>, UnorderedId>,
memory_consumption: usize,
} }
impl DictionaryBuilder { impl DictionaryBuilder {
/// Get or allocate an unordered id. /// Get or allocate an unordered id.
/// (This ID is simply an auto-incremented id.) /// (This ID is simply an auto-incremented id.)
pub fn get_or_allocate_id(&mut self, term: &[u8], arena: &mut MemoryArena) -> UnorderedId { pub fn get_or_allocate_id(&mut self, term: &[u8]) -> UnorderedId {
let next_id = self.dict.len() as u32; if let Some(term_id) = self.dict.get(term) {
let unordered_id = self return *term_id;
.dict }
.mutate_or_create(term, arena, |unordered_id: Option<u32>| { let new_id = UnorderedId(self.dict.len() as u32);
if let Some(unordered_id) = unordered_id { self.dict.insert(term.to_vec(), new_id);
unordered_id self.memory_consumption += term.len();
} else { self.memory_consumption += 40; // Term Metadata + HashMap overhead
next_id new_id
}
});
UnorderedId(unordered_id)
} }
/// Serialize the dictionary into an fst, and returns the /// Serialize the dictionary into an fst, and returns the
/// `UnorderedId -> TermOrdinal` map. /// `UnorderedId -> TermOrdinal` map.
pub fn serialize<'a, W: io::Write + 'a>( pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result<TermIdMapping> {
&self, let mut terms: Vec<(&[u8], UnorderedId)> =
arena: &MemoryArena, self.dict.iter().map(|(k, v)| (k.as_slice(), *v)).collect();
wrt: &mut W,
) -> io::Result<TermIdMapping> {
let mut terms: Vec<(&[u8], UnorderedId)> = self
.dict
.iter(arena)
.map(|(k, v)| (k, arena.read(v)))
.collect();
terms.sort_unstable_by_key(|(key, _)| *key); terms.sort_unstable_by_key(|(key, _)| *key);
// TODO Remove the allocation. // TODO Remove the allocation.
let mut unordered_to_ord: Vec<OrderedId> = vec![OrderedId(0u32); terms.len()]; let mut unordered_to_ord: Vec<OrderedId> = vec![OrderedId(0u32); terms.len()];
@@ -77,7 +68,7 @@ impl DictionaryBuilder {
} }
pub(crate) fn mem_usage(&self) -> usize { pub(crate) fn mem_usage(&self) -> usize {
self.dict.mem_usage() self.memory_consumption
} }
} }
@@ -87,13 +78,12 @@ mod tests {
#[test] #[test]
fn test_dictionary_builder() { fn test_dictionary_builder() {
let mut arena = MemoryArena::default();
let mut dictionary_builder = DictionaryBuilder::default(); let mut dictionary_builder = DictionaryBuilder::default();
let hello_uid = dictionary_builder.get_or_allocate_id(b"hello", &mut arena); let hello_uid = dictionary_builder.get_or_allocate_id(b"hello");
let happy_uid = dictionary_builder.get_or_allocate_id(b"happy", &mut arena); let happy_uid = dictionary_builder.get_or_allocate_id(b"happy");
let tax_uid = dictionary_builder.get_or_allocate_id(b"tax", &mut arena); let tax_uid = dictionary_builder.get_or_allocate_id(b"tax");
let mut buffer = Vec::new(); let mut buffer = Vec::new();
let id_mapping = dictionary_builder.serialize(&arena, &mut buffer).unwrap(); let id_mapping = dictionary_builder.serialize(&mut buffer).unwrap();
assert_eq!(id_mapping.to_ord(hello_uid), OrderedId(1)); assert_eq!(id_mapping.to_ord(hello_uid), OrderedId(1));
assert_eq!(id_mapping.to_ord(happy_uid), OrderedId(0)); assert_eq!(id_mapping.to_ord(happy_uid), OrderedId(0));
assert_eq!(id_mapping.to_ord(tax_uid), OrderedId(2)); assert_eq!(id_mapping.to_ord(tax_uid), OrderedId(2));
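
For reference, one side of this hunk backs the DictionaryBuilder with an FnvHashMap while the other uses the arena-backed SharedArenaHashMap; both honor the same contract, namely that each distinct term receives an auto-incremented unordered id. A minimal sketch of just that contract, using a plain std HashMap as a simplification rather than either actual implementation:

use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct UnorderedId(u32);

#[derive(Default)]
struct SimpleDictionaryBuilder {
    dict: HashMap<Vec<u8>, UnorderedId>,
}

impl SimpleDictionaryBuilder {
    /// Returns the id already assigned to `term`, or assigns the next
    /// auto-incremented id to it.
    fn get_or_allocate_id(&mut self, term: &[u8]) -> UnorderedId {
        let next_id = UnorderedId(self.dict.len() as u32);
        *self.dict.entry(term.to_vec()).or_insert(next_id)
    }
}

fn main() {
    let mut builder = SimpleDictionaryBuilder::default();
    let hello = builder.get_or_allocate_id(b"hello");
    assert_eq!(builder.get_or_allocate_id(b"hello"), hello); // ids are stable
    assert_eq!(builder.get_or_allocate_id(b"happy"), UnorderedId(1));
}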

View File

@@ -1,22 +1,3 @@
//! # Tantivy-Columnar
//!
//! `tantivy-columnar`provides a columnar storage for tantivy.
//! The crate allows for efficient read operations on specific columns rather than entire records.
//!
//! ## Overview
//!
//! - **columnar**: Reading, writing, and merging multiple columns:
//! - **[ColumnarWriter]**: Makes it possible to create a new columnar.
//! - **[ColumnarReader]**: The ColumnarReader makes it possible to access a set of columns
//! associated to field names.
//! - **[merge_columnar]**: Contains the functionalities to merge multiple ColumnarReader or
//! segments into a single one.
//!
//! - **column**: A single column, which contains
//! - [column_index]: Resolves the rows for a document id. Manages the cardinality of the
//! column.
//! - [column_values]: Stores the values of a column in a dense format.
#![cfg_attr(all(feature = "unstable", test), feature(test))] #![cfg_attr(all(feature = "unstable", test), feature(test))]
#[cfg(test)] #[cfg(test)]
@@ -31,7 +12,7 @@ use std::io;
mod block_accessor; mod block_accessor;
mod column; mod column;
pub mod column_index; mod column_index;
pub mod column_values; pub mod column_values;
mod columnar; mod columnar;
mod dictionary; mod dictionary;

View File

@@ -26,7 +26,7 @@ fn test_dataframe_writer_str() {
assert_eq!(columnar.num_columns(), 1); assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap(); let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1); assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 73); assert_eq!(cols[0].num_bytes(), 87);
} }
#[test] #[test]
@@ -40,7 +40,7 @@ fn test_dataframe_writer_bytes() {
assert_eq!(columnar.num_columns(), 1); assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap(); let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1); assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 73); assert_eq!(cols[0].num_bytes(), 87);
} }
#[test] #[test]
@@ -330,9 +330,9 @@ fn bytes_strategy() -> impl Strategy<Value = &'static [u8]> {
// A random column value // A random column value
fn column_value_strategy() -> impl Strategy<Value = ColumnValue> { fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
prop_oneof![ prop_oneof![
10 => string_strategy().prop_map(ColumnValue::Str), 10 => string_strategy().prop_map(|s| ColumnValue::Str(s)),
1 => bytes_strategy().prop_map(ColumnValue::Bytes), 1 => bytes_strategy().prop_map(|b| ColumnValue::Bytes(b)),
40 => num_strategy().prop_map(ColumnValue::Numerical), 40 => num_strategy().prop_map(|n| ColumnValue::Numerical(n)),
1 => (1u16..3u16).prop_map(|ip_addr_byte| ColumnValue::IpAddr(Ipv6Addr::new( 1 => (1u16..3u16).prop_map(|ip_addr_byte| ColumnValue::IpAddr(Ipv6Addr::new(
127, 127,
0, 0,
@@ -343,7 +343,7 @@ fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
0, 0,
ip_addr_byte ip_addr_byte
))), ))),
1 => any::<bool>().prop_map(ColumnValue::Bool), 1 => any::<bool>().prop_map(|b| ColumnValue::Bool(b)),
1 => (0_679_723_993i64..1_679_723_995i64) 1 => (0_679_723_993i64..1_679_723_995i64)
.prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) }) .prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) })
] ]
@@ -419,8 +419,8 @@ fn build_columnar_with_mapping(
columnar_writer columnar_writer
.serialize(num_docs, old_to_new_row_ids_opt, &mut buffer) .serialize(num_docs, old_to_new_row_ids_opt, &mut buffer)
.unwrap(); .unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap();
ColumnarReader::open(buffer).unwrap() columnar_reader
} }
fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader { fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
@@ -746,7 +746,7 @@ proptest! {
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into(); let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap(); crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap(); let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().flatten().cloned().collect(); let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]); let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar); assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
} }
@@ -772,7 +772,7 @@ fn test_columnar_merging_empty_columnar() {
.unwrap(); .unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap(); let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect(); columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]); let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar); assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
} }
@@ -809,7 +809,7 @@ fn test_columnar_merging_number_columns() {
.unwrap(); .unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap(); let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect(); columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]); let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar); assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
} }

View File

@@ -1,14 +1,11 @@
#![allow(deprecated)] #![allow(deprecated)]
use std::fmt; use std::fmt;
use std::io::{Read, Write};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use time::format_description::well_known::Rfc3339; use time::format_description::well_known::Rfc3339;
use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset}; use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
use crate::BinarySerializable;
/// Precision with which datetimes are truncated when stored in fast fields. This setting is only /// Precision with which datetimes are truncated when stored in fast fields. This setting is only
/// relevant for fast fields. In the docstore, datetimes are always saved with nanosecond precision. /// relevant for fast fields. In the docstore, datetimes are always saved with nanosecond precision.
#[derive( #[derive(
@@ -167,15 +164,3 @@ impl fmt::Debug for DateTime {
f.write_str(&utc_rfc3339) f.write_str(&utc_rfc3339)
} }
} }
impl BinarySerializable for DateTime {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
let timestamp_micros = self.into_timestamp_micros();
<i64 as BinarySerializable>::serialize(&timestamp_micros, writer)
}
fn deserialize<R: Read>(reader: &mut R) -> std::io::Result<Self> {
let timestamp_micros = <i64 as BinarySerializable>::deserialize(reader)?;
Ok(Self::from_timestamp_micros(timestamp_micros))
}
}
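
The BinarySerializable impl for DateTime shown in this hunk serializes the value as its microsecond timestamp. The round trip below mirrors that idea with plain std I/O and an explicit little-endian encoding; it is an illustrative stand-in, not tantivy's actual wire format.

use std::io::{self, Read, Write};

fn write_timestamp_micros<W: Write>(ts_micros: i64, writer: &mut W) -> io::Result<()> {
    writer.write_all(&ts_micros.to_le_bytes())
}

fn read_timestamp_micros<R: Read>(reader: &mut R) -> io::Result<i64> {
    let mut buf = [0u8; 8];
    reader.read_exact(&mut buf)?;
    Ok(i64::from_le_bytes(buf))
}

fn main() -> io::Result<()> {
    let mut buf = Vec::new();
    write_timestamp_micros(1_697_443_427_000_000, &mut buf)?;
    let mut cursor = &buf[..];
    assert_eq!(read_timestamp_micros(&mut cursor)?, 1_697_443_427_000_000);
    Ok(())
}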

View File

@@ -1,112 +0,0 @@
use crate::replace_in_place;
/// Separates the different segments of a json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
pub const JSON_PATH_SEGMENT_SEP_STR: &str =
unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) };
/// Create a new JsonPathWriter, that creates flattened json paths for tantivy.
#[derive(Clone, Debug, Default)]
pub struct JsonPathWriter {
path: String,
indices: Vec<usize>,
expand_dots: bool,
}
impl JsonPathWriter {
pub fn new() -> Self {
JsonPathWriter {
path: String::new(),
indices: Vec::new(),
expand_dots: false,
}
}
/// When expand_dots is enabled, json object like
/// `{"k8s.node.id": 5}` is processed as if it was
/// `{"k8s": {"node": {"id": 5}}}`.
/// This option has the merit of allowing users to
/// write queries like `k8s.node.id:5`.
/// On the other, enabling that feature can lead to
/// ambiguity.
#[inline]
pub fn set_expand_dots(&mut self, expand_dots: bool) {
self.expand_dots = expand_dots;
}
/// Push a new segment to the path.
#[inline]
pub fn push(&mut self, segment: &str) {
let len_path = self.path.len();
self.indices.push(len_path);
if !self.path.is_empty() {
self.path.push_str(JSON_PATH_SEGMENT_SEP_STR);
}
self.path.push_str(segment);
if self.expand_dots {
// This might include the separation byte, which is ok because it is not a dot.
let appended_segment = &mut self.path[len_path..];
// The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
// valid single byte ut8 strings.
// By utf-8 design, they cannot be part of another codepoint.
unsafe {
replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment.as_bytes_mut())
};
}
}
/// Remove the last segment. Does nothing if the path is empty.
#[inline]
pub fn pop(&mut self) {
if let Some(last_idx) = self.indices.pop() {
self.path.truncate(last_idx);
}
}
/// Clear the path.
#[inline]
pub fn clear(&mut self) {
self.path.clear();
self.indices.clear();
}
/// Get the current path.
#[inline]
pub fn as_str(&self) -> &str {
&self.path
}
}
impl From<JsonPathWriter> for String {
#[inline]
fn from(value: JsonPathWriter) -> Self {
value.path
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn json_path_writer_test() {
let mut writer = JsonPathWriter::new();
writer.push("root");
assert_eq!(writer.as_str(), "root");
writer.push("child");
assert_eq!(writer.as_str(), "root\u{1}child");
writer.pop();
assert_eq!(writer.as_str(), "root");
writer.push("k8s.node.id");
assert_eq!(writer.as_str(), "root\u{1}k8s.node.id");
writer.set_expand_dots(true);
writer.pop();
writer.push("k8s.node.id");
assert_eq!(writer.as_str(), "root\u{1}k8s\u{1}node\u{1}id");
}
}

View File

@@ -9,7 +9,6 @@ mod byte_count;
mod datetime; mod datetime;
pub mod file_slice; pub mod file_slice;
mod group_by; mod group_by;
mod json_path_writer;
mod serialize; mod serialize;
mod vint; mod vint;
mod writer; mod writer;
@@ -19,7 +18,6 @@ pub use byte_count::ByteCount;
pub use datetime::DatePrecision; pub use datetime::DatePrecision;
pub use datetime::{DateTime, DateTimePrecision}; pub use datetime::{DateTime, DateTimePrecision};
pub use group_by::GroupByIteratorExtended; pub use group_by::GroupByIteratorExtended;
pub use json_path_writer::JsonPathWriter;
pub use ownedbytes::{OwnedBytes, StableDeref}; pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize}; pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{ pub use vint::{
@@ -118,7 +116,6 @@ pub fn u64_to_f64(val: u64) -> f64 {
/// ///
/// This function assumes that the needle is rarely contained in the bytes string /// This function assumes that the needle is rarely contained in the bytes string
/// and offers a fast path if the needle is not present. /// and offers a fast path if the needle is not present.
#[inline]
pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) { pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
if !bytes.contains(&needle) { if !bytes.contains(&needle) {
return; return;
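
The doc comment above describes a fast path: replace_in_place assumes the needle is rarely present, so a single scan usually suffices and no write happens. A minimal stand-in for that pattern (not the tantivy-common implementation itself):

fn replace_in_place_sketch(needle: u8, replacement: u8, bytes: &mut [u8]) {
    if !bytes.contains(&needle) {
        // Fast path: needle absent, nothing is written.
        return;
    }
    for byte in bytes.iter_mut() {
        if *byte == needle {
            *byte = replacement;
        }
    }
}

fn main() {
    let mut path = b"k8s.node.id".to_vec();
    replace_in_place_sketch(b'.', 1u8, &mut path);
    assert_eq!(path, b"k8s\x01node\x01id");
}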

View File

@@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::io::{Read, Write}; use std::io::{Read, Write};
use std::{fmt, io}; use std::{fmt, io};
@@ -250,43 +249,6 @@ impl BinarySerializable for String {
} }
} }
impl<'a> BinarySerializable for Cow<'a, str> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
Ok(Cow::Owned(result))
}
}
impl<'a> BinarySerializable for Cow<'a, [u8]> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
for it in self.iter() {
it.serialize(writer)?;
}
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
let num_items = VInt::deserialize(reader)?.val();
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = u8::deserialize(reader)?;
items.push(item);
}
Ok(Cow::Owned(items))
}
}
#[cfg(test)] #[cfg(test)]
pub mod test { pub mod test {
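
The Cow<'_, str> impl shown in this hunk length-prefixes the UTF-8 bytes with a VInt and reads them back through Read::take. A simplified round trip of the same pattern, using a fixed-width u64 length prefix instead of tantivy's VInt encoding (illustrative only):

use std::io::{self, Read, Write};

fn write_str<W: Write>(s: &str, writer: &mut W) -> io::Result<()> {
    writer.write_all(&(s.len() as u64).to_le_bytes())?;
    writer.write_all(s.as_bytes())
}

fn read_string<R: Read>(reader: &mut R) -> io::Result<String> {
    let mut len_buf = [0u8; 8];
    reader.read_exact(&mut len_buf)?;
    let len = u64::from_le_bytes(len_buf);
    let mut out = String::with_capacity(len as usize);
    reader.take(len).read_to_string(&mut out)?;
    Ok(out)
}

fn main() -> io::Result<()> {
    let mut buf = Vec::new();
    write_str("tantivy", &mut buf)?;
    let mut cursor = &buf[..];
    assert_eq!(read_string(&mut cursor)?, "tantivy");
    Ok(())
}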

View File

@@ -12,7 +12,7 @@ use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::AggregationCollector; use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery; use tantivy::query::AllQuery;
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST}; use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST};
use tantivy::{Index, IndexWriter, TantivyDocument}; use tantivy::Index;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// # Create Schema // # Create Schema
@@ -132,10 +132,10 @@ fn main() -> tantivy::Result<()> {
let stream = Deserializer::from_str(data).into_iter::<Value>(); let stream = Deserializer::from_str(data).into_iter::<Value>();
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
let mut num_indexed = 0; let mut num_indexed = 0;
for value in stream { for value in stream {
let doc = TantivyDocument::parse_json(&schema, &serde_json::to_string(&value.unwrap())?)?; let doc = schema.parse_document(&serde_json::to_string(&value.unwrap())?)?;
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
num_indexed += 1; num_indexed += 1;
if num_indexed > 4 { if num_indexed > 4 {

View File

@@ -15,7 +15,7 @@
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy}; use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir; use tempfile::TempDir;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
@@ -75,7 +75,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`. // Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase // Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty. // throughput, but 50 MB is already plenty.
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents! // Let's index our documents!
// We first need a handle on the title and the body field. // We first need a handle on the title and the body field.
@@ -87,7 +87,7 @@ fn main() -> tantivy::Result<()> {
let title = schema.get_field("title").unwrap(); let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap(); let body = schema.get_field("body").unwrap();
let mut old_man_doc = TantivyDocument::default(); let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea"); old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text( old_man_doc.add_text(
body, body,
@@ -164,7 +164,7 @@ fn main() -> tantivy::Result<()> {
// will reload the index automatically after each commit. // will reload the index automatically after each commit.
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay) .reload_policy(ReloadPolicy::OnCommit)
.try_into()?; .try_into()?;
// We now need to acquire a searcher. // We now need to acquire a searcher.
@@ -217,8 +217,8 @@ fn main() -> tantivy::Result<()> {
// the document returned will only contain // the document returned will only contain
// a title. // a title.
for (_score, doc_address) in top_docs { for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema)); println!("{}", schema.to_json(&retrieved_doc));
} }
// We can also get an explanation to understand // We can also get an explanation to understand

View File

@@ -13,7 +13,7 @@ use columnar::Column;
use tantivy::collector::{Collector, SegmentCollector}; use tantivy::collector::{Collector, SegmentCollector};
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT}; use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, IndexWriter, Score, SegmentReader}; use tantivy::{doc, Index, Score, SegmentReader};
#[derive(Default)] #[derive(Default)]
struct Stats { struct Stats {
@@ -142,7 +142,7 @@ fn main() -> tantivy::Result<()> {
// this example. // this example.
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
product_name => "Super Broom 2000", product_name => "Super Broom 2000",
product_description => "While it is ok for short distance travel, this broom \ product_description => "While it is ok for short distance travel, this broom \

View File

@@ -6,7 +6,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer; use tantivy::tokenizer::NgramTokenizer;
use tantivy::{doc, Index, IndexWriter}; use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// # Defining the schema // # Defining the schema
@@ -62,7 +62,7 @@ fn main() -> tantivy::Result<()> {
// //
// Here we use a buffer of 50MB per thread. Using a bigger // Here we use a buffer of 50MB per thread. Using a bigger
// memory arena for the indexer can increase its throughput. // memory arena for the indexer can increase its throughput.
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "The Old Man and the Sea", title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \ body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
@@ -103,8 +103,8 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (_, doc_address) in top_docs { for (_, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema)); println!("{}", schema.to_json(&retrieved_doc));
} }
Ok(()) Ok(())

View File

@@ -4,8 +4,8 @@
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Document, OwnedValue, Schema, INDEXED, STORED, STRING}; use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument}; use tantivy::Index;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// # Defining the schema // # Defining the schema
@@ -22,18 +22,16 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents // # Indexing documents
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// The dates are passed as string in the RFC3339 format // The dates are passed as string in the RFC3339 format
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"occurred_at": "2022-06-22T12:53:50.53Z", "occurred_at": "2022-06-22T12:53:50.53Z",
"event": "pull-request" "event": "pull-request"
}"#, }"#,
)?; )?;
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"occurred_at": "2022-06-22T13:00:00.22Z", "occurred_at": "2022-06-22T13:00:00.22Z",
"event": "comment" "event": "comment"
@@ -60,13 +58,13 @@ fn main() -> tantivy::Result<()> {
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?; let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
assert_eq!(count_docs.len(), 1); assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs { for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
assert!(matches!( assert!(matches!(
retrieved_doc.get_first(occurred_at), retrieved_doc.get_first(occurred_at),
Some(OwnedValue::Date(_)) Some(Value::Date(_))
)); ));
assert_eq!( assert_eq!(
retrieved_doc.to_json(&schema), schema.to_json(&retrieved_doc),
r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"# r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
); );
} }

View File

@@ -11,7 +11,7 @@
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::TermQuery; use tantivy::query::TermQuery;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, Index, IndexReader, IndexWriter}; use tantivy::{doc, Index, IndexReader};
// A simple helper function to fetch a single document // A simple helper function to fetch a single document
// given its id from our index. // given its id from our index.
@@ -19,7 +19,7 @@ use tantivy::{doc, Index, IndexReader, IndexWriter};
fn extract_doc_given_isbn( fn extract_doc_given_isbn(
reader: &IndexReader, reader: &IndexReader,
isbn_term: &Term, isbn_term: &Term,
) -> tantivy::Result<Option<TantivyDocument>> { ) -> tantivy::Result<Option<Document>> {
let searcher = reader.searcher(); let searcher = reader.searcher();
// This is the simplest query you can think of. // This is the simplest query you can think of.
@@ -69,10 +69,10 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// Let's add a couple of documents, for the sake of the example. // Let's add a couple of documents, for the sake of the example.
let mut old_man_doc = TantivyDocument::default(); let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea"); old_man_doc.add_text(title, "The Old Man and the Sea");
index_writer.add_document(doc!( index_writer.add_document(doc!(
isbn => "978-0099908401", isbn => "978-0099908401",
@@ -94,7 +94,7 @@ fn main() -> tantivy::Result<()> {
// Oops our frankenstein doc seems misspelled // Oops our frankenstein doc seems misspelled
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap(); let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!( assert_eq!(
frankenstein_doc_misspelled.to_json(&schema), schema.to_json(&frankenstein_doc_misspelled),
r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#, r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
); );
@@ -136,7 +136,7 @@ fn main() -> tantivy::Result<()> {
// No more typo! // No more typo!
let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap(); let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!( assert_eq!(
frankenstein_new_doc.to_json(&schema), schema.to_json(&frankenstein_new_doc),
r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#, r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
); );
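
The same example also shows that building a document field by field is unchanged apart from the type name. A short sketch assuming the `TantivyDocument::default()` and `add_text` calls visible above; the ISBN is taken from the example:

```rust
use tantivy::schema::{Schema, STORED, STRING, TEXT};
use tantivy::{Index, IndexWriter, TantivyDocument};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let isbn = schema_builder.add_text_field("isbn", STRING | STORED);
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer: IndexWriter = index.writer(50_000_000)?;
    // Field-by-field construction is the same on both sides; only the concrete
    // type name differs (`TantivyDocument` vs `Document`).
    let mut old_man_doc = TantivyDocument::default();
    old_man_doc.add_text(isbn, "978-0099908401");
    old_man_doc.add_text(title, "The Old Man and the Sea");
    index_writer.add_document(old_man_doc)?;
    index_writer.commit()?;
    Ok(())
}
```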

View File

@@ -17,7 +17,7 @@
use tantivy::collector::FacetCollector; use tantivy::collector::FacetCollector;
use tantivy::query::{AllQuery, TermQuery}; use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter}; use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the sake of this example // Let's create a temporary directory for the sake of this example
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(30_000_000)?; let mut index_writer = index.writer(30_000_000)?;
// For convenience, tantivy also comes with a macro to // For convenience, tantivy also comes with a macro to
// reduce the boilerplate above. // reduce the boilerplate above.
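
A change repeated across nearly every example and test in this compare is the explicit `IndexWriter` annotation on `index.writer(...)`, presumably needed because the writer is generic over the document type on that side. A minimal sketch of the annotated form (schema and memory budget are arbitrary):

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // The annotation pins the writer to the default document type; without it,
    // type inference alone may no longer be enough on the generic-writer side.
    let mut index_writer: IndexWriter = index.writer(30_000_000)?;
    index_writer.add_document(doc!(title => "Of Mice and Men"))?;
    index_writer.commit()?;
    Ok(())
}
```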

View File

@@ -12,7 +12,7 @@ use std::collections::HashSet;
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::BooleanQuery; use tantivy::query::BooleanQuery;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, DocId, Index, IndexWriter, Score, SegmentReader}; use tantivy::{doc, DocId, Index, Score, SegmentReader};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(30_000_000)?; let mut index_writer = index.writer(30_000_000)?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Fried egg", title => "Fried egg",
@@ -91,10 +91,11 @@ fn main() -> tantivy::Result<()> {
         .iter()
         .map(|(_, doc_id)| {
             searcher
-                .doc::<TantivyDocument>(*doc_id)
+                .doc(*doc_id)
                 .unwrap()
                 .get_first(title)
-                .and_then(|v| v.as_str())
+                .unwrap()
+                .as_text()
                 .unwrap()
                 .to_owned()
         })
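
This hunk is one of several that swap the stored-value accessor: `get_first(..)` combined with `and_then(|v| v.as_str())` on one side versus `unwrap().as_text()` on the other. A sketch of the `as_str` style, restricted to calls that appear in this compare:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::AllQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, TantivyDocument};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer: IndexWriter = index.writer(30_000_000)?;
    index_writer.add_document(doc!(title => "Fried egg"))?;
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    for (_score, doc_address) in searcher.search(&AllQuery, &TopDocs::with_limit(1))? {
        let doc = searcher.doc::<TantivyDocument>(doc_address)?;
        // `and_then` keeps the extraction fallible instead of chaining unwraps.
        let text = doc.get_first(title).and_then(|v| v.as_str());
        assert_eq!(text, Some("Fried egg"));
    }
    Ok(())
}
```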

View File

@@ -14,7 +14,7 @@
use tantivy::collector::{Count, TopDocs}; use tantivy::collector::{Count, TopDocs};
use tantivy::query::FuzzyTermQuery; use tantivy::query::FuzzyTermQuery;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy}; use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir; use tempfile::TempDir;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
@@ -66,7 +66,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`. // Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase // Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty. // throughput, but 50 MB is already plenty.
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents! // Let's index our documents!
// We first need a handle on the title and the body field. // We first need a handle on the title and the body field.
@@ -123,7 +123,7 @@ fn main() -> tantivy::Result<()> {
// will reload the index automatically after each commit. // will reload the index automatically after each commit.
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay) .reload_policy(ReloadPolicy::OnCommit)
.try_into()?; .try_into()?;
// We now need to acquire a searcher. // We now need to acquire a searcher.
@@ -151,10 +151,10 @@ fn main() -> tantivy::Result<()> {
     assert_eq!(count, 3);
     assert_eq!(top_docs.len(), 3);
     for (score, doc_address) in top_docs {
+        let retrieved_doc = searcher.doc(doc_address)?;
         // Note that the score is not lower for the fuzzy hit.
         // There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563
-        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
-        println!("score {score:?} doc {}", retrieved_doc.to_json(&schema));
+        println!("score {score:?} doc {}", schema.to_json(&retrieved_doc));
         // score 1.0 doc {"title":["The Diary of Muadib"]}
         //
         // score 1.0 doc {"title":["The Diary of a Young Girl"]}
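
Another recurring rename is the reader reload policy, `ReloadPolicy::OnCommitWithDelay` versus `ReloadPolicy::OnCommit`. A minimal reader-setup sketch using the `OnCommitWithDelay` spelling shown above:

```rust
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{Index, ReloadPolicy};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    // Spelled `OnCommitWithDelay` on one side of this compare and `OnCommit`
    // on the other; both reload the searcher shortly after each commit.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommitWithDelay)
        .try_into()?;
    let _searcher = reader.searcher();
    Ok(())
}
```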

View File

@@ -21,7 +21,7 @@ fn main() -> tantivy::Result<()> {
}"#; }"#;
// We can parse our document // We can parse our document
let _mice_and_men_doc = TantivyDocument::parse_json(&schema, mice_and_men_doc_json)?; let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?;
// Multi-valued field are allowed, they are // Multi-valued field are allowed, they are
// expressed in JSON by an array. // expressed in JSON by an array.
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
"title": ["Frankenstein", "The Modern Prometheus"], "title": ["Frankenstein", "The Modern Prometheus"],
"year": 1818 "year": 1818
}"#; }"#;
let _frankenstein_doc = TantivyDocument::parse_json(&schema, frankenstein_json)?; let _frankenstein_doc = schema.parse_document(frankenstein_json)?;
// Note that the schema is saved in your index directory. // Note that the schema is saved in your index directory.
// //
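
This hunk moves JSON document parsing between the document type and the schema: `TantivyDocument::parse_json(&schema, json)` versus `schema.parse_document(json)`. A sketch of the former, with a small hypothetical schema:

```rust
use tantivy::schema::{Schema, INDEXED, STORED, TEXT};
use tantivy::TantivyDocument;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_u64_field("year", INDEXED | STORED);
    let schema = schema_builder.build();

    // Multi-valued fields are expressed in JSON as arrays.
    let frankenstein_json = r#"{
        "title": ["Frankenstein", "The Modern Prometheus"],
        "year": 1818
    }"#;

    // Parsing hangs off the document type and takes the schema explicitly;
    // the other side of this compare spells it `schema.parse_document(json)`.
    let _frankenstein_doc = TantivyDocument::parse_json(&schema, frankenstein_json)?;
    Ok(())
}
```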

View File

@@ -5,7 +5,7 @@
use tantivy::collector::Count; use tantivy::collector::Count;
use tantivy::query::RangeQuery; use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED}; use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, IndexWriter, Result}; use tantivy::{doc, Index, Result};
fn main() -> Result<()> { fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field // For the sake of simplicity, this schema will only have 1 field
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let reader = index.reader()?; let reader = index.reader()?;
{ {
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 6_000_000)?; let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
for year in 1950u64..2019u64 { for year in 1950u64..2019u64 {
index_writer.add_document(doc!(year_field => year))?; index_writer.add_document(doc!(year_field => year))?;
} }

View File

@@ -6,7 +6,7 @@
use tantivy::collector::{Count, TopDocs}; use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING}; use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument}; use tantivy::Index;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// # Defining the schema // # Defining the schema
@@ -22,22 +22,20 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents // # Indexing documents
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// ### IPv4 // ### IPv4
// Adding documents that contain an IPv4 address. Notice that the IP addresses are passed as // Adding documents that contain an IPv4 address. Notice that the IP addresses are passed as
// `String`. Since the field is of type ip, we parse the IP address from the string and store it // `String`. Since the field is of type ip, we parse the IP address from the string and store it
// internally as IPv6. // internally as IPv6.
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"ip": "192.168.0.33", "ip": "192.168.0.33",
"event_type": "login" "event_type": "login"
}"#, }"#,
)?; )?;
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"ip": "192.168.0.80", "ip": "192.168.0.80",
"event_type": "checkout" "event_type": "checkout"
@@ -46,8 +44,7 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
// ### IPv6 // ### IPv6
// Adding a document that contains an IPv6 address. // Adding a document that contains an IPv6 address.
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334", "ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
"event_type": "checkout" "event_type": "checkout"

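The comments in the IP example above explain that addresses are supplied as strings and held internally as IPv6. A condensed sketch of that flow; `add_ip_addr_field` and its options are assumptions taken from the surrounding example, not from this hunk:

```rust
use tantivy::schema::{Schema, FAST, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    // The field definitions are not part of this hunk; `add_ip_addr_field`
    // is assumed from the surrounding example.
    schema_builder.add_ip_addr_field("ip", STORED | FAST);
    schema_builder.add_text_field("event_type", STRING | STORED);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema.clone());
    let mut index_writer: IndexWriter = index.writer(50_000_000)?;

    // IPv4 and IPv6 addresses are both passed as strings; they are parsed and
    // held internally as IPv6 (IPv4 gets mapped), as the comments above note.
    for json in [
        r#"{ "ip": "192.168.0.33", "event_type": "login" }"#,
        r#"{ "ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334", "event_type": "checkout" }"#,
    ] {
        let doc = TantivyDocument::parse_json(&schema, json)?;
        index_writer.add_document(doc)?;
    }
    index_writer.commit()?;
    Ok(())
}
```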
View File

@@ -10,7 +10,7 @@
// --- // ---
// Importing tantivy... // Importing tantivy...
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, DocSet, Index, IndexWriter, Postings, TERMINATED}; use tantivy::{doc, DocSet, Index, Postings, TERMINATED};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// We first create a schema for the sake of the // We first create a schema for the sake of the
@@ -24,7 +24,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?; let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?; index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
index_writer.add_document(doc!(title => "Of Mice and Men"))?; index_writer.add_document(doc!(title => "Of Mice and Men"))?;
index_writer.add_document(doc!(title => "The modern Promotheus"))?; index_writer.add_document(doc!(title => "The modern Promotheus"))?;

View File

@@ -7,7 +7,7 @@
use tantivy::collector::{Count, TopDocs}; use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT}; use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
use tantivy::{Index, IndexWriter, TantivyDocument}; use tantivy::Index;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// # Defining the schema // # Defining the schema
@@ -20,9 +20,8 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents // # Indexing documents
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"timestamp": "2022-02-22T23:20:50.53Z", "timestamp": "2022-02-22T23:20:50.53Z",
"event_type": "click", "event_type": "click",
@@ -34,8 +33,7 @@ fn main() -> tantivy::Result<()> {
}"#, }"#,
)?; )?;
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"timestamp": "2022-02-22T23:20:51.53Z", "timestamp": "2022-02-22T23:20:51.53Z",
"event_type": "click", "event_type": "click",

View File

@@ -1,7 +1,7 @@
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy, Result}; use tantivy::{doc, Index, ReloadPolicy, Result};
use tempfile::TempDir; use tempfile::TempDir;
fn main() -> Result<()> { fn main() -> Result<()> {
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
let index = Index::create_in_dir(&index_path, schema)?; let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "The Old Man and the Sea", title => "The Old Man and the Sea",
@@ -51,7 +51,7 @@ fn main() -> Result<()> {
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay) .reload_policy(ReloadPolicy::OnCommit)
.try_into()?; .try_into()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -67,12 +67,8 @@ fn main() -> Result<()> {
     let mut titles = top_docs
         .into_iter()
         .map(|(_score, doc_address)| {
-            let doc = searcher.doc::<TantivyDocument>(doc_address)?;
-            let title = doc
-                .get_first(title)
-                .and_then(|v| v.as_str())
-                .unwrap()
-                .to_owned();
+            let doc = searcher.doc(doc_address)?;
+            let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
             Ok(title)
         })
         .collect::<Result<Vec<_>>>()?;

View File

@@ -13,7 +13,7 @@ use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery; use tantivy::query::TermQuery;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer}; use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
use tantivy::{doc, Index, IndexWriter, ReloadPolicy}; use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir; use tempfile::TempDir;
fn pre_tokenize_text(text: &str) -> Vec<Token> { fn pre_tokenize_text(text: &str) -> Vec<Token> {
@@ -38,7 +38,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_dir(&index_path, schema.clone())?; let index = Index::create_in_dir(&index_path, schema.clone())?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// We can create a document manually, by setting the fields // We can create a document manually, by setting the fields
// one by one in a Document object. // one by one in a Document object.
@@ -83,7 +83,7 @@ fn main() -> tantivy::Result<()> {
}] }]
}"#; }"#;
let short_man_doc = TantivyDocument::parse_json(&schema, short_man_json)?; let short_man_doc = schema.parse_document(short_man_json)?;
index_writer.add_document(short_man_doc)?; index_writer.add_document(short_man_doc)?;
@@ -94,7 +94,7 @@ fn main() -> tantivy::Result<()> {
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay) .reload_policy(ReloadPolicy::OnCommit)
.try_into()?; .try_into()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -115,8 +115,8 @@ fn main() -> tantivy::Result<()> {
// Note that the tokens are not stored along with the original text // Note that the tokens are not stored along with the original text
// in the document store // in the document store
for (_score, doc_address) in top_docs { for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema)); println!("Document: {}", schema.to_json(&retrieved_doc));
} }
// In contrary to the previous query, when we search for the "man" term we // In contrary to the previous query, when we search for the "man" term we

View File

@@ -10,8 +10,7 @@
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::snippet::{Snippet, SnippetGenerator}; use tantivy::{doc, Index, Snippet, SnippetGenerator};
use tantivy::{doc, Index, IndexWriter};
use tempfile::TempDir; use tempfile::TempDir;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
@@ -28,7 +27,7 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents // # Indexing documents
let index = Index::create_in_dir(&index_path, schema)?; let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// we'll only need one doc for this example. // we'll only need one doc for this example.
index_writer.add_document(doc!( index_writer.add_document(doc!(
@@ -55,10 +54,13 @@ fn main() -> tantivy::Result<()> {
     let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
     for (score, doc_address) in top_docs {
-        let doc = searcher.doc::<TantivyDocument>(doc_address)?;
+        let doc = searcher.doc(doc_address)?;
         let snippet = snippet_generator.snippet_from_doc(&doc);
         println!("Document score {score}:");
-        println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());
+        println!(
+            "title: {}",
+            doc.get_first(title).unwrap().as_text().unwrap()
+        );
         println!("snippet: {}", snippet.to_html());
         println!("custom highlighting: {}", highlight(snippet));
     }
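
The snippet example mostly reflects `SnippetGenerator` living under `tantivy::snippet` on one side of the compare instead of at the crate root. A condensed sketch of producing an HTML snippet with the module-path spelling; the indexed text is invented:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::snippet::SnippetGenerator;
use tantivy::{doc, Index, IndexWriter, TantivyDocument};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let body = schema_builder.add_text_field("body", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer: IndexWriter = index.writer(50_000_000)?;
    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside bank."
    ))?;
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    let query = query_parser.parse_query("river")?;
    let top_docs = searcher.search(&query, &TopDocs::with_limit(1))?;

    // `SnippetGenerator` sits under `tantivy::snippet` in this spelling.
    let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
    for (_score, doc_address) in top_docs {
        let doc = searcher.doc::<TantivyDocument>(doc_address)?;
        let snippet = snippet_generator.snippet_from_doc(&doc);
        println!("snippet: {}", snippet.to_html());
    }
    Ok(())
}
```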

View File

@@ -15,7 +15,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::tokenizer::*; use tantivy::tokenizer::*;
use tantivy::{doc, Index, IndexWriter}; use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// this example assumes you understand the content in `basic_search` // this example assumes you understand the content in `basic_search`
@@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> {
index.tokenizers().register("stoppy", tokenizer); index.tokenizers().register("stoppy", tokenizer);
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
let title = schema.get_field("title").unwrap(); let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap(); let body = schema.get_field("body").unwrap();
@@ -105,9 +105,9 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (score, doc_address) in top_docs { for (score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
println!("\n==\nDocument score {score}:"); println!("\n==\nDocument score {score}:");
println!("{}", retrieved_doc.to_json(&schema)); println!("{}", schema.to_json(&retrieved_doc));
} }
Ok(()) Ok(())

View File

@@ -6,8 +6,8 @@ use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::{Schema, FAST, TEXT};
 use tantivy::{
-    doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration, SegmentId,
-    SegmentReader, Warmer,
+    doc, DocAddress, DocId, Index, Opstamp, Searcher, SearcherGeneration, SegmentId, SegmentReader,
+    Warmer,
 };
 // This example shows how warmers can be used to
@@ -143,7 +143,7 @@ fn main() -> tantivy::Result<()> {
const SNEAKERS: ProductId = 23222; const SNEAKERS: ProductId = 23222;
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?; let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?; writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?; writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?; writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;

View File

@@ -185,7 +185,7 @@ fn term_or_phrase(inp: &str) -> IResult<&str, UserInputLeaf> {
fn term_or_phrase_infallible(inp: &str) -> JResult<&str, Option<UserInputLeaf>> { fn term_or_phrase_infallible(inp: &str) -> JResult<&str, Option<UserInputLeaf>> {
map( map(
// ~* for slop/prefix, ) inside group or ast tree, ^ if boost // ~* for slop/prefix, ) inside group or ast tree, ^ if boost
tuple_infallible((simple_term_infallible(")^"), slop_or_prefix_val)), tuple_infallible((simple_term_infallible("*)^"), slop_or_prefix_val)),
|((delimiter_phrase, (slop, prefix)), errors)| { |((delimiter_phrase, (slop, prefix)), errors)| {
let leaf = if let Some((delimiter, phrase)) = delimiter_phrase { let leaf = if let Some((delimiter, phrase)) = delimiter_phrase {
Some( Some(
@@ -1113,9 +1113,6 @@ mod test {
test_parse_query_to_ast_helper("'www-form-encoded'", "'www-form-encoded'"); test_parse_query_to_ast_helper("'www-form-encoded'", "'www-form-encoded'");
test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded"); test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded");
test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded"); test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded");
test_parse_query_to_ast_helper("mr james bo?d", "(*mr *james *bo?d)");
test_parse_query_to_ast_helper("mr james bo*", "(*mr *james *bo*)");
test_parse_query_to_ast_helper("mr james b*d", "(*mr *james *b*d)");
} }
#[test] #[test]

View File

@@ -48,7 +48,7 @@ mod bench {
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone()); let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype); let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?; let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"]; let few_terms_data = vec!["INFO", "ERROR", "WARN", "DEBUG"];
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap(); let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
@@ -85,7 +85,7 @@ mod bench {
if cardinality == Cardinality::Sparse { if cardinality == Cardinality::Sparse {
doc_with_value /= 20; doc_with_value /= 20;
} }
let _val_max = 1_000_000.0; let val_max = 1_000_000.0;
for _ in 0..doc_with_value { for _ in 0..doc_with_value {
let val: f64 = rng.gen_range(0.0..1_000_000.0); let val: f64 = rng.gen_range(0.0..1_000_000.0);
let json = if rng.gen_bool(0.1) { let json = if rng.gen_bool(0.1) {

View File

@@ -73,9 +73,9 @@ impl AggregationLimits {
/// Create a new ResourceLimitGuard, that will release the memory when dropped. /// Create a new ResourceLimitGuard, that will release the memory when dropped.
pub fn new_guard(&self) -> ResourceLimitGuard { pub fn new_guard(&self) -> ResourceLimitGuard {
ResourceLimitGuard { ResourceLimitGuard {
// The counter which is shared between the aggregations for one request. /// The counter which is shared between the aggregations for one request.
memory_consumption: Arc::clone(&self.memory_consumption), memory_consumption: Arc::clone(&self.memory_consumption),
// The memory_limit in bytes /// The memory_limit in bytes
memory_limit: self.memory_limit, memory_limit: self.memory_limit,
allocated_with_the_guard: 0, allocated_with_the_guard: 0,
} }
@@ -134,142 +134,3 @@ impl Drop for ResourceLimitGuard {
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed); .fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
} }
} }
#[cfg(test)]
mod tests {
use crate::aggregation::tests::exec_request_with_query;
// https://github.com/quickwit-oss/quickwit/issues/3837
#[test]
fn test_agg_limits_with_empty_merge() {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::bucket::tests::get_test_index_from_docs;
let docs = vec![
vec![r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb", "text2": "bbb" }"#],
vec![r#"{ "text": "aaa", "text2": "bbb" }"#],
];
let index = get_test_index_from_docs(false, &docs).unwrap();
{
let elasticsearch_compatible_json = json!(
{
"1": {
"terms": {"field": "text2", "min_doc_count": 0},
"aggs": {
"2":{
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": "2015-01-01T00:00:00Z",
"max": "2015-01-10T00:00:00Z"
}
}
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request_with_query(agg_req, &index, Some(("text", "bbb"))).unwrap();
let expected_res = json!({
"1": {
"buckets": [
{
"2": {
"buckets": [
{ "doc_count": 0, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" },
{ "doc_count": 1, "key": 1420156800000.0, "key_as_string": "2015-01-02T00:00:00Z" },
{ "doc_count": 0, "key": 1420243200000.0, "key_as_string": "2015-01-03T00:00:00Z" },
{ "doc_count": 0, "key": 1420329600000.0, "key_as_string": "2015-01-04T00:00:00Z" },
{ "doc_count": 0, "key": 1420416000000.0, "key_as_string": "2015-01-05T00:00:00Z" },
{ "doc_count": 0, "key": 1420502400000.0, "key_as_string": "2015-01-06T00:00:00Z" },
{ "doc_count": 0, "key": 1420588800000.0, "key_as_string": "2015-01-07T00:00:00Z" },
{ "doc_count": 0, "key": 1420675200000.0, "key_as_string": "2015-01-08T00:00:00Z" },
{ "doc_count": 0, "key": 1420761600000.0, "key_as_string": "2015-01-09T00:00:00Z" },
{ "doc_count": 0, "key": 1420848000000.0, "key_as_string": "2015-01-10T00:00:00Z" }
]
},
"doc_count": 1,
"key": "bbb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
});
assert_eq!(res, expected_res);
}
}
// https://github.com/quickwit-oss/quickwit/issues/3837
#[test]
fn test_agg_limits_with_empty_data() {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::bucket::tests::get_test_index_from_docs;
let docs = vec![vec![r#"{ "text": "aaa", "text2": "bbb" }"#]];
let index = get_test_index_from_docs(false, &docs).unwrap();
{
// Empty result since there is no doc with dates
let elasticsearch_compatible_json = json!(
{
"1": {
"terms": {"field": "text2", "min_doc_count": 0},
"aggs": {
"2":{
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": "2015-01-01T00:00:00Z",
"max": "2015-01-10T00:00:00Z"
}
}
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request_with_query(agg_req, &index, Some(("text", "bbb"))).unwrap();
let expected_res = json!({
"1": {
"buckets": [
{
"2": {
"buckets": [
{ "doc_count": 0, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" },
{ "doc_count": 0, "key": 1420156800000.0, "key_as_string": "2015-01-02T00:00:00Z" },
{ "doc_count": 0, "key": 1420243200000.0, "key_as_string": "2015-01-03T00:00:00Z" },
{ "doc_count": 0, "key": 1420329600000.0, "key_as_string": "2015-01-04T00:00:00Z" },
{ "doc_count": 0, "key": 1420416000000.0, "key_as_string": "2015-01-05T00:00:00Z" },
{ "doc_count": 0, "key": 1420502400000.0, "key_as_string": "2015-01-06T00:00:00Z" },
{ "doc_count": 0, "key": 1420588800000.0, "key_as_string": "2015-01-07T00:00:00Z" },
{ "doc_count": 0, "key": 1420675200000.0, "key_as_string": "2015-01-08T00:00:00Z" },
{ "doc_count": 0, "key": 1420761600000.0, "key_as_string": "2015-01-09T00:00:00Z" },
{ "doc_count": 0, "key": 1420848000000.0, "key_as_string": "2015-01-10T00:00:00Z" }
]
},
"doc_count": 0,
"key": "bbb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
});
assert_eq!(res, expected_res);
}
}
}

View File

@@ -103,8 +103,7 @@ impl AggregationWithAccessor {
             field: field_name, ..
         }) => {
             let (accessor, column_type) =
-                // Only DateTime is supported for DateHistogram
-                get_ff_reader(reader, field_name, Some(&[ColumnType::DateTime]))?;
+                get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
             add_agg_with_accessor(accessor, column_type, &mut res)?;
         }
         Terms(TermsAggregation {

View File

@@ -9,7 +9,7 @@ use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_v
use crate::aggregation::DistributedAggregationCollector; use crate::aggregation::DistributedAggregationCollector;
use crate::query::{AllQuery, TermQuery}; use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST}; use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, IndexWriter, Term}; use crate::{Index, Term};
fn get_avg_req(field_name: &str) -> Aggregation { fn get_avg_req(field_name: &str) -> Aggregation {
serde_json::from_value(json!({ serde_json::from_value(json!({
@@ -586,7 +586,7 @@ fn test_aggregation_on_json_object() {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(json => json!({"color": "red"}))) .add_document(doc!(json => json!({"color": "red"})))
.unwrap(); .unwrap();
@@ -624,72 +624,13 @@ fn test_aggregation_on_json_object() {
); );
} }
#[test]
fn test_aggregation_on_nested_json_object() {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json.blub", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"color.dot": "red", "color": {"nested":"red"} })))
.unwrap();
index_writer
.add_document(doc!(json => json!({"color.dot": "blue", "color": {"nested":"blue"} })))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let agg: Aggregations = serde_json::from_value(json!({
"jsonagg1": {
"terms": {
"field": "json\\.blub.color\\.dot",
}
},
"jsonagg2": {
"terms": {
"field": "json\\.blub.color.nested",
}
}
}))
.unwrap();
let aggregation_collector = get_collector(agg);
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
assert_eq!(
&aggregation_res_json,
&serde_json::json!({
"jsonagg1": {
"buckets": [
{"doc_count": 1, "key": "blue"},
{"doc_count": 1, "key": "red"}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
},
"jsonagg2": {
"buckets": [
{"doc_count": 1, "key": "blue"},
{"doc_count": 1, "key": "red"}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
})
);
}
#[test] #[test]
fn test_aggregation_on_json_object_empty_columns() { fn test_aggregation_on_json_object_empty_columns() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Empty column when accessing color // => Empty column when accessing color
index_writer index_writer
.add_document(doc!(json => json!({"price": 10.0}))) .add_document(doc!(json => json!({"price": 10.0})))
@@ -807,7 +748,7 @@ fn test_aggregation_on_json_object_mixed_types() {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0}))) .add_document(doc!(json => json!({"mixed_type": 10.0})))

View File

@@ -132,7 +132,6 @@ impl DateHistogramAggregationReq {
hard_bounds: self.hard_bounds, hard_bounds: self.hard_bounds,
extended_bounds: self.extended_bounds, extended_bounds: self.extended_bounds,
keyed: self.keyed, keyed: self.keyed,
is_normalized_to_ns: false,
}) })
} }
@@ -244,15 +243,15 @@ fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
} }
#[cfg(test)] #[cfg(test)]
pub mod tests { mod tests {
use pretty_assertions::assert_eq; use pretty_assertions::assert_eq;
use super::*; use super::*;
use crate::aggregation::agg_req::Aggregations; use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request; use crate::aggregation::tests::exec_request;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::schema::{Schema, FAST, STRING}; use crate::schema::{Schema, FAST};
use crate::{Index, IndexWriter, TantivyDocument}; use crate::Index;
#[test] #[test]
fn test_parse_into_millisecs() { fn test_parse_into_millisecs() {
@@ -307,8 +306,7 @@ pub mod tests {
) -> crate::Result<Index> { ) -> crate::Result<Index> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
schema_builder.add_date_field("date", FAST); schema_builder.add_date_field("date", FAST);
schema_builder.add_text_field("text", FAST | STRING); schema_builder.add_text_field("text", FAST);
schema_builder.add_text_field("text2", FAST | STRING);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
{ {
@@ -316,7 +314,7 @@ pub mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
for values in segment_and_docs { for values in segment_and_docs {
for doc_str in values { for doc_str in values {
let doc = TantivyDocument::parse_json(&schema, doc_str)?; let doc = schema.parse_document(doc_str)?;
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
} }
// writing the segment // writing the segment
@@ -328,7 +326,7 @@ pub mod tests {
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
if segment_ids.len() > 1 { if segment_ids.len() > 1 {
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?; index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads()?;
} }

View File

@@ -122,14 +122,11 @@ pub struct HistogramAggregation {
/// Whether to return the buckets as a hash map /// Whether to return the buckets as a hash map
#[serde(default)] #[serde(default)]
pub keyed: bool, pub keyed: bool,
/// Whether the values are normalized to ns for date time values. Defaults to false.
#[serde(default)]
pub is_normalized_to_ns: bool,
} }
impl HistogramAggregation { impl HistogramAggregation {
pub(crate) fn normalize_date_time(&mut self) { pub(crate) fn normalize(&mut self, column_type: ColumnType) {
if !self.is_normalized_to_ns { if column_type.is_date_time() {
// values are provided in ms, but the fastfield is in nano seconds // values are provided in ms, but the fastfield is in nano seconds
self.interval *= 1_000_000.0; self.interval *= 1_000_000.0;
self.offset = self.offset.map(|off| off * 1_000_000.0); self.offset = self.offset.map(|off| off * 1_000_000.0);
@@ -141,7 +138,6 @@ impl HistogramAggregation {
min: bounds.min * 1_000_000.0, min: bounds.min * 1_000_000.0,
max: bounds.max * 1_000_000.0, max: bounds.max * 1_000_000.0,
}); });
self.is_normalized_to_ns = true;
} }
} }
@@ -374,7 +370,7 @@ impl SegmentHistogramCollector {
Ok(IntermediateBucketResult::Histogram { Ok(IntermediateBucketResult::Histogram {
buckets, buckets,
is_date_agg: self.column_type == ColumnType::DateTime, column_type: Some(self.column_type),
}) })
} }
@@ -385,9 +381,7 @@ impl SegmentHistogramCollector {
         accessor_idx: usize,
     ) -> crate::Result<Self> {
         req.validate()?;
-        if field_type == ColumnType::DateTime {
-            req.normalize_date_time();
-        }
+        req.normalize(field_type);
         let sub_aggregation_blueprint = if sub_aggregation.is_empty() {
             None
@@ -445,7 +439,6 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
// memory check upfront // memory check upfront
let (_, first_bucket_num, last_bucket_num) = let (_, first_bucket_num, last_bucket_num) =
generate_bucket_pos_with_opt_minmax(histogram_req, min_max); generate_bucket_pos_with_opt_minmax(histogram_req, min_max);
// It's based on user input, so we need to account for overflows // It's based on user input, so we need to account for overflows
let added_buckets = ((last_bucket_num.saturating_sub(first_bucket_num)).max(0) as u64) let added_buckets = ((last_bucket_num.saturating_sub(first_bucket_num)).max(0) as u64)
.saturating_sub(buckets.len() as u64); .saturating_sub(buckets.len() as u64);
@@ -489,7 +482,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
// Convert to BucketEntry // Convert to BucketEntry
pub(crate) fn intermediate_histogram_buckets_to_final_buckets( pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
buckets: Vec<IntermediateHistogramBucketEntry>, buckets: Vec<IntermediateHistogramBucketEntry>,
is_date_agg: bool, column_type: Option<ColumnType>,
histogram_req: &HistogramAggregation, histogram_req: &HistogramAggregation,
sub_aggregation: &Aggregations, sub_aggregation: &Aggregations,
limits: &AggregationLimits, limits: &AggregationLimits,
@@ -498,8 +491,8 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
// The request used in the the call to final is not yet be normalized. // The request used in the the call to final is not yet be normalized.
// Normalization is changing the precision from milliseconds to nanoseconds. // Normalization is changing the precision from milliseconds to nanoseconds.
let mut histogram_req = histogram_req.clone(); let mut histogram_req = histogram_req.clone();
if is_date_agg { if let Some(column_type) = column_type {
histogram_req.normalize_date_time(); histogram_req.normalize(column_type);
} }
let mut buckets = if histogram_req.min_doc_count() == 0 { let mut buckets = if histogram_req.min_doc_count() == 0 {
// With min_doc_count != 0, we may need to add buckets, so that there are no // With min_doc_count != 0, we may need to add buckets, so that there are no
@@ -523,7 +516,7 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
// If we have a date type on the histogram buckets, we add the `key_as_string` field as rfc339 // If we have a date type on the histogram buckets, we add the `key_as_string` field as rfc339
// and normalize from nanoseconds to milliseconds // and normalize from nanoseconds to milliseconds
if is_date_agg { if column_type == Some(ColumnType::DateTime) {
for bucket in buckets.iter_mut() { for bucket in buckets.iter_mut() {
if let crate::aggregation::Key::F64(ref mut val) = bucket.key { if let crate::aggregation::Key::F64(ref mut val) = bucket.key {
let key_as_string = format_date(*val as i64)?; let key_as_string = format_date(*val as i64)?;
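
The comment retained on both sides ("values are provided in ms, but the fastfield is in nano seconds") is why interval, offset, and bounds are multiplied by 1_000_000 for date columns. A tiny standalone sketch of that conversion; `normalize_date_histogram` is a hypothetical helper mirroring the arithmetic above, not a library API:

```rust
/// Milliseconds-to-nanoseconds factor applied when a histogram runs on a date column.
const MS_TO_NS: f64 = 1_000_000.0;

/// Hypothetical helper mirroring the normalization in the hunk above:
/// user-facing interval/offset are in ms, the DateTime fast field is in ns.
fn normalize_date_histogram(interval_ms: f64, offset_ms: Option<f64>) -> (f64, Option<f64>) {
    (interval_ms * MS_TO_NS, offset_ms.map(|off| off * MS_TO_NS))
}

fn main() {
    // "1d" parsed into milliseconds (what `parse_into_milliseconds` produces upstream).
    let one_day_ms = 24.0 * 60.0 * 60.0 * 1000.0;
    let (interval_ns, _offset_ns) = normalize_date_histogram(one_day_ms, None);
    assert_eq!(interval_ns, 86_400_000_000_000.0); // 8.64e13 ns per daily bucket
}
```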

View File

@@ -601,7 +601,7 @@ mod tests {
use crate::aggregation::AggregationLimits; use crate::aggregation::AggregationLimits;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::schema::{Schema, FAST, STRING}; use crate::schema::{Schema, FAST, STRING};
use crate::{Index, IndexWriter}; use crate::Index;
#[test] #[test]
fn terms_aggregation_test_single_segment() -> crate::Result<()> { fn terms_aggregation_test_single_segment() -> crate::Result<()> {
@@ -1473,7 +1473,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json // => Segment with empty json
index_writer.add_document(doc!()).unwrap(); index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();

View File

@@ -117,7 +117,7 @@ mod tests {
use crate::aggregation::agg_req::Aggregations; use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query; use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST}; use crate::schema::{Schema, FAST};
use crate::{Index, IndexWriter}; use crate::Index;
#[test] #[test]
fn terms_aggregation_missing_mixed_type_mult_seg_sub_agg() -> crate::Result<()> { fn terms_aggregation_missing_mixed_type_mult_seg_sub_agg() -> crate::Result<()> {
@@ -126,7 +126,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST); let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer index_writer
.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0}))) .add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))
@@ -186,7 +186,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST); let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))?; index_writer.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))?;
index_writer.add_document(doc!(score => 5.0))?; index_writer.add_document(doc!(score => 5.0))?;
@@ -231,7 +231,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST); let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?; index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap(); index_writer.commit().unwrap();
@@ -278,7 +278,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST); let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?; index_writer.add_document(doc!(score => 5.0))?;
index_writer.add_document(doc!(score => 5.0))?; index_writer.add_document(doc!(score => 5.0))?;
@@ -323,7 +323,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0}))) .add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -385,7 +385,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0}))) .add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -427,7 +427,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0}))) .add_document(doc!(json => json!({"mixed_type": 10.0})))

View File

@@ -172,16 +172,10 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
         Range(_) => IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(
             Default::default(),
         )),
-        Histogram(_) => {
+        Histogram(_) | DateHistogram(_) => {
             IntermediateAggregationResult::Bucket(IntermediateBucketResult::Histogram {
                 buckets: Vec::new(),
-                is_date_agg: false,
-            })
-        }
-        DateHistogram(_) => {
-            IntermediateAggregationResult::Bucket(IntermediateBucketResult::Histogram {
-                buckets: Vec::new(),
-                is_date_agg: true,
+                column_type: None,
             })
         }
         Average(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Average(
@@ -349,8 +343,8 @@ pub enum IntermediateBucketResult {
/// This is the histogram entry for a bucket, which contains a key, count, and optionally /// This is the histogram entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations. /// sub_aggregations.
Histogram { Histogram {
/// The column_type of the underlying `Column` is DateTime /// The column_type of the underlying `Column`
is_date_agg: bool, column_type: Option<ColumnType>,
/// The buckets /// The buckets
buckets: Vec<IntermediateHistogramBucketEntry>, buckets: Vec<IntermediateHistogramBucketEntry>,
}, },
@@ -405,7 +399,7 @@ impl IntermediateBucketResult {
Ok(BucketResult::Range { buckets }) Ok(BucketResult::Range { buckets })
} }
IntermediateBucketResult::Histogram { IntermediateBucketResult::Histogram {
is_date_agg, column_type,
buckets, buckets,
} => { } => {
let histogram_req = &req let histogram_req = &req
@@ -414,7 +408,7 @@ impl IntermediateBucketResult {
.expect("unexpected aggregation, expected histogram aggregation"); .expect("unexpected aggregation, expected histogram aggregation");
let buckets = intermediate_histogram_buckets_to_final_buckets( let buckets = intermediate_histogram_buckets_to_final_buckets(
buckets, buckets,
is_date_agg, column_type,
histogram_req, histogram_req,
req.sub_aggregation(), req.sub_aggregation(),
limits, limits,
@@ -463,11 +457,11 @@ impl IntermediateBucketResult {
( (
IntermediateBucketResult::Histogram { IntermediateBucketResult::Histogram {
buckets: buckets_left, buckets: buckets_left,
is_date_agg: _, ..
}, },
IntermediateBucketResult::Histogram { IntermediateBucketResult::Histogram {
buckets: buckets_right, buckets: buckets_right,
is_date_agg: _, ..
}, },
) => { ) => {
let buckets: Result<Vec<IntermediateHistogramBucketEntry>, TantivyError> = let buckets: Result<Vec<IntermediateHistogramBucketEntry>, TantivyError> =

View File

@@ -71,7 +71,7 @@ mod tests {
use crate::aggregation::agg_req::Aggregations; use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query; use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST}; use crate::schema::{Schema, FAST};
use crate::{Index, IndexWriter}; use crate::Index;
#[test] #[test]
fn test_max_agg_with_missing() -> crate::Result<()> { fn test_max_agg_with_missing() -> crate::Result<()> {
@@ -79,7 +79,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json // => Segment with empty json
index_writer.add_document(doc!()).unwrap(); index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();

View File

@@ -88,7 +88,7 @@ mod tests {
use crate::aggregation::AggregationCollector; use crate::aggregation::AggregationCollector;
use crate::query::AllQuery; use crate::query::AllQuery;
use crate::schema::{NumericOptions, Schema}; use crate::schema::{NumericOptions, Schema};
use crate::{Index, IndexWriter}; use crate::Index;
#[test] #[test]
fn test_metric_aggregations() { fn test_metric_aggregations() {
@@ -96,7 +96,7 @@ mod tests {
let field_options = NumericOptions::default().set_fast(); let field_options = NumericOptions::default().set_fast();
let field = schema_builder.add_f64_field("price", field_options); let field = schema_builder.add_f64_field("price", field_options);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for i in 0..3 { for i in 0..3 {
index_writer index_writer

View File

@@ -300,7 +300,7 @@ mod tests {
use crate::aggregation::AggregationCollector; use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery}; use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST}; use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, IndexWriter, Term}; use crate::{Index, Term};
#[test] #[test]
fn test_aggregation_stats_empty_index() -> crate::Result<()> { fn test_aggregation_stats_empty_index() -> crate::Result<()> {
@@ -494,7 +494,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json // => Segment with empty json
index_writer.add_document(doc!()).unwrap(); index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
@@ -541,7 +541,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json // => Segment with empty json
index_writer.add_document(doc!()).unwrap(); index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();

View File

@@ -319,7 +319,7 @@ mod tests {
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, TermQuery}; use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING}; use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::{Index, IndexWriter, Term}; use crate::{Index, Term};
pub fn get_test_index_with_num_docs( pub fn get_test_index_with_num_docs(
merge_segments: bool, merge_segments: bool,
@@ -451,7 +451,7 @@ mod tests {
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
if segment_ids.len() > 1 { if segment_ids.len() > 1 {
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?; index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads()?;
} }
@@ -565,7 +565,7 @@ mod tests {
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?; index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads()?;
} }

View File

@@ -495,8 +495,8 @@ mod tests {
use crate::collector::Count; use crate::collector::Count;
use crate::core::Index; use crate::core::Index;
use crate::query::{AllQuery, QueryParser, TermQuery}; use crate::query::{AllQuery, QueryParser, TermQuery};
use crate::schema::{Facet, FacetOptions, IndexRecordOption, Schema, TantivyDocument}; use crate::schema::{Document, Facet, FacetOptions, IndexRecordOption, Schema};
use crate::{IndexWriter, Term}; use crate::Term;
fn test_collapse_mapping_aux( fn test_collapse_mapping_aux(
facet_terms: &[&str], facet_terms: &[&str],
@@ -559,7 +559,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(facet_field=>Facet::from("/facet/a"))) .add_document(doc!(facet_field=>Facet::from("/facet/a")))
.unwrap(); .unwrap();
@@ -588,7 +588,7 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let num_facets: usize = 3 * 4 * 5; let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets) let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| { .map(|mut n| {
@@ -601,7 +601,7 @@ mod tests {
}) })
.collect(); .collect();
for i in 0..num_facets * 10 { for i in 0..num_facets * 10 {
let mut doc = TantivyDocument::new(); let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone()); doc.add_facet(facet_field, facets[i % num_facets].clone());
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc).unwrap();
} }
@@ -732,25 +732,24 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let uniform = Uniform::new_inclusive(1, 100_000); let uniform = Uniform::new_inclusive(1, 100_000);
let mut docs: Vec<TantivyDocument> = let mut docs: Vec<Document> = vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)] .into_iter()
.into_iter() .flat_map(|(c, count)| {
.flat_map(|(c, count)| { let facet = Facet::from(&format!("/facet/{}", c));
let facet = Facet::from(&format!("/facet/{}", c)); let doc = doc!(facet_field => facet);
let doc = doc!(facet_field => facet); iter::repeat(doc).take(count)
iter::repeat(doc).take(count) })
}) .map(|mut doc| {
.map(|mut doc| { doc.add_facet(
doc.add_facet( facet_field,
facet_field, &format!("/facet/{}", thread_rng().sample(uniform)),
&format!("/facet/{}", thread_rng().sample(uniform)), );
); doc
doc })
}) .collect();
.collect();
docs[..].shuffle(&mut thread_rng()); docs[..].shuffle(&mut thread_rng());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs { for doc in docs {
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc).unwrap();
} }
@@ -781,7 +780,7 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)] let docs: Vec<Document> = vec![("b", 2), ("a", 2), ("c", 4)]
.into_iter() .into_iter()
.flat_map(|(c, count)| { .flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c)); let facet = Facet::from(&format!("/facet/{}", c));
@@ -829,7 +828,7 @@ mod bench {
use crate::collector::FacetCollector; use crate::collector::FacetCollector;
use crate::query::AllQuery; use crate::query::AllQuery;
use crate::schema::{Facet, Schema, INDEXED}; use crate::schema::{Facet, Schema, INDEXED};
use crate::{Index, IndexWriter}; use crate::Index;
#[bench] #[bench]
fn bench_facet_collector(b: &mut Bencher) { fn bench_facet_collector(b: &mut Bencher) {
@@ -848,7 +847,7 @@ mod bench {
// 40425 docs // 40425 docs
docs[..].shuffle(&mut thread_rng()); docs[..].shuffle(&mut thread_rng());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs { for doc in docs {
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc).unwrap();
} }

View File

@@ -12,7 +12,8 @@ use std::marker::PhantomData;
use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType}; use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType};
use crate::collector::{Collector, SegmentCollector}; use crate::collector::{Collector, SegmentCollector};
use crate::{DocId, Score, SegmentReader}; use crate::schema::Field;
use crate::{DocId, Score, SegmentReader, TantivyError};
/// The `FilterCollector` filters docs using a fast field value and a predicate. /// The `FilterCollector` filters docs using a fast field value and a predicate.
/// ///
@@ -49,13 +50,13 @@ use crate::{DocId, Score, SegmentReader};
/// ///
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?; /// let query = query_parser.parse_query("diary")?;
/// let no_filter_collector = FilterCollector::new("price".to_string(), |value: u64| value > 20_120u64, TopDocs::with_limit(2)); /// let no_filter_collector = FilterCollector::new(price, |value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector)?; /// let top_docs = searcher.search(&query, &no_filter_collector)?;
/// ///
/// assert_eq!(top_docs.len(), 1); /// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1)); /// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
/// ///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new("price".to_string(), |value| value < 5u64, TopDocs::with_limit(2)); /// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, |value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?; /// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
/// ///
/// assert_eq!(filtered_top_docs.len(), 0); /// assert_eq!(filtered_top_docs.len(), 0);
@@ -69,7 +70,7 @@ use crate::{DocId, Score, SegmentReader};
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue> pub struct FilterCollector<TCollector, TPredicate, TPredicateValue>
where TPredicate: 'static + Clone where TPredicate: 'static + Clone
{ {
field: String, field: Field,
collector: TCollector, collector: TCollector,
predicate: TPredicate, predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>, t_predicate_value: PhantomData<TPredicateValue>,
@@ -82,7 +83,7 @@ where
TPredicate: Fn(TPredicateValue) -> bool + Send + Sync + Clone, TPredicate: Fn(TPredicateValue) -> bool + Send + Sync + Clone,
{ {
/// Create a new `FilterCollector`. /// Create a new `FilterCollector`.
pub fn new(field: String, predicate: TPredicate, collector: TCollector) -> Self { pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self { Self {
field, field,
predicate, predicate,
@@ -109,7 +110,18 @@ where
segment_local_id: u32, segment_local_id: u32,
segment_reader: &SegmentReader, segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> { ) -> crate::Result<Self::Child> {
let column_opt = segment_reader.fast_fields().column_opt(&self.field)?; let schema = segment_reader.schema();
let field_entry = schema.get_field_entry(self.field);
if !field_entry.is_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
)));
}
let column_opt = segment_reader
.fast_fields()
.column_opt(field_entry.name())?;
let segment_collector = self let segment_collector = self
.collector .collector
@@ -217,7 +229,7 @@ where
/// ///
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?; /// let query = query_parser.parse_query("diary")?;
/// let filter_collector = BytesFilterCollector::new("barcode".to_string(), |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2)); /// let filter_collector = BytesFilterCollector::new(barcode, |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &filter_collector)?; /// let top_docs = searcher.search(&query, &filter_collector)?;
/// ///
/// assert_eq!(top_docs.len(), 1); /// assert_eq!(top_docs.len(), 1);
@@ -228,7 +240,7 @@ where
pub struct BytesFilterCollector<TCollector, TPredicate> pub struct BytesFilterCollector<TCollector, TPredicate>
where TPredicate: 'static + Clone where TPredicate: 'static + Clone
{ {
field: String, field: Field,
collector: TCollector, collector: TCollector,
predicate: TPredicate, predicate: TPredicate,
} }
@@ -239,7 +251,7 @@ where
TPredicate: Fn(&[u8]) -> bool + Send + Sync + Clone, TPredicate: Fn(&[u8]) -> bool + Send + Sync + Clone,
{ {
/// Create a new `BytesFilterCollector`. /// Create a new `BytesFilterCollector`.
pub fn new(field: String, predicate: TPredicate, collector: TCollector) -> Self { pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self { Self {
field, field,
predicate, predicate,
@@ -262,7 +274,10 @@ where
segment_local_id: u32, segment_local_id: u32,
segment_reader: &SegmentReader, segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> { ) -> crate::Result<Self::Child> {
let column_opt = segment_reader.fast_fields().bytes(&self.field)?; let schema = segment_reader.schema();
let field_name = schema.get_field_name(self.field);
let column_opt = segment_reader.fast_fields().bytes(field_name)?;
let segment_collector = self let segment_collector = self
.collector .collector
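For reference, a minimal, self-contained sketch of the string-field-name variant of `FilterCollector` shown on the left-hand side of the hunks above; the titles, prices and the 50 MB writer budget are illustration values, not taken from this diff:

use tantivy::collector::{FilterCollector, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let price = schema_builder.add_u64_field("price", FAST);
    let index = Index::create_in_ram(schema_builder.build());

    // The type annotation keeps this compiling against both sides of the diff
    // (generic and non-generic IndexWriter).
    let mut writer: IndexWriter = index.writer(50_000_000)?;
    writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64))?;
    writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64))?;
    writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;

    // Keep only matches whose fast field `price` is strictly greater than 20_120.
    let collector = FilterCollector::new(
        "price".to_string(),
        |value: u64| value > 20_120u64,
        TopDocs::with_limit(2),
    );
    let top_docs = searcher.search(&query, &collector)?;
    assert_eq!(top_docs.len(), 1);
    Ok(())
}

Only the second document matches both the query and the predicate, which is the same behaviour the doc-test in this hunk asserts.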

View File

@@ -97,7 +97,7 @@ pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
mod top_collector; mod top_collector;
mod top_score_collector; mod top_score_collector;
pub use self::top_score_collector::{TopDocs, TopNComputer}; pub use self::top_score_collector::TopDocs;
mod custom_score_top_collector; mod custom_score_top_collector;
pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer}; pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};

View File

@@ -7,9 +7,7 @@ use crate::query::{AllQuery, QueryParser};
use crate::schema::{Schema, FAST, TEXT}; use crate::schema::{Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339; use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime; use crate::time::OffsetDateTime;
use crate::{ use crate::{doc, DateTime, DocAddress, DocId, Document, Index, Score, Searcher, SegmentOrdinal};
doc, DateTime, DocAddress, DocId, Index, Score, Searcher, SegmentOrdinal, TantivyDocument,
};
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector { pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
compute_score: true, compute_score: true,
@@ -42,7 +40,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
let query_parser = QueryParser::for_index(&index, vec![title]); let query_parser = QueryParser::for_index(&index, vec![title]);
let query = query_parser.parse_query("diary")?; let query = query_parser.parse_query("diary")?;
let filter_some_collector = FilterCollector::new( let filter_some_collector = FilterCollector::new(
"price".to_string(), price,
&|value: u64| value > 20_120u64, &|value: u64| value > 20_120u64,
TopDocs::with_limit(2), TopDocs::with_limit(2),
); );
@@ -51,11 +49,8 @@ pub fn test_filter_collector() -> crate::Result<()> {
assert_eq!(top_docs.len(), 1); assert_eq!(top_docs.len(), 1);
assert_eq!(top_docs[0].1, DocAddress::new(0, 1)); assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new( let filter_all_collector: FilterCollector<_, _, u64> =
"price".to_string(), FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
&|value| value < 5u64,
TopDocs::with_limit(2),
);
let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap(); let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
assert_eq!(filtered_top_docs.len(), 0); assert_eq!(filtered_top_docs.len(), 0);
@@ -66,8 +61,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
> 0 > 0
} }
let filter_dates_collector = let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
FilterCollector::new("date".to_string(), &date_filter, TopDocs::with_limit(5));
let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?; let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?;
assert_eq!(filtered_date_docs.len(), 2); assert_eq!(filtered_date_docs.len(), 2);
@@ -286,8 +280,8 @@ fn make_test_searcher() -> crate::Result<Searcher> {
let schema = Schema::builder().build(); let schema = Schema::builder().build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(TantivyDocument::default())?; index_writer.add_document(Document::default())?;
index_writer.add_document(TantivyDocument::default())?; index_writer.add_document(Document::default())?;
index_writer.commit()?; index_writer.commit()?;
Ok(index.reader()?.searcher()) Ok(index.reader()?.searcher())
} }

View File

@@ -1,7 +1,7 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::marker::PhantomData; use std::marker::PhantomData;
use super::top_score_collector::TopNComputer;
use crate::{DocAddress, DocId, SegmentOrdinal, SegmentReader}; use crate::{DocAddress, DocId, SegmentOrdinal, SegmentReader};
/// Contains a feature (field, score, etc.) of a document along with the document address. /// Contains a feature (field, score, etc.) of a document along with the document address.
@@ -20,14 +20,6 @@ pub(crate) struct ComparableDoc<T, D> {
pub feature: T, pub feature: T,
pub doc: D, pub doc: D,
} }
impl<T: std::fmt::Debug, D: std::fmt::Debug> std::fmt::Debug for ComparableDoc<T, D> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ComparableDoc")
.field("feature", &self.feature)
.field("doc", &self.doc)
.finish()
}
}
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> { impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> { fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
@@ -99,13 +91,18 @@ where T: PartialOrd + Clone
if self.limit == 0 { if self.limit == 0 {
return Ok(Vec::new()); return Ok(Vec::new());
} }
let mut top_collector = TopNComputer::new(self.limit + self.offset); let mut top_collector = BinaryHeap::new();
for child_fruit in children { for child_fruit in children {
for (feature, doc) in child_fruit { for (feature, doc) in child_fruit {
top_collector.push(ComparableDoc { feature, doc }); if top_collector.len() < (self.limit + self.offset) {
top_collector.push(ComparableDoc { feature, doc });
} else if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
}
}
} }
} }
Ok(top_collector Ok(top_collector
.into_sorted_vec() .into_sorted_vec()
.into_iter() .into_iter()
@@ -114,7 +111,7 @@ where T: PartialOrd + Clone
.collect()) .collect())
} }
pub(crate) fn for_segment<F: PartialOrd + Clone>( pub(crate) fn for_segment<F: PartialOrd>(
&self, &self,
segment_id: SegmentOrdinal, segment_id: SegmentOrdinal,
_: &SegmentReader, _: &SegmentReader,
@@ -139,18 +136,20 @@ where T: PartialOrd + Clone
/// The Top Collector keeps track of the K documents /// The Top Collector keeps track of the K documents
/// sorted by type `T`. /// sorted by type `T`.
/// ///
/// The implementation is based on repeatedly truncating on the median once K * 2 documents have been collected. /// The implementation is based on a `BinaryHeap`.
/// The theoretical complexity for collecting the top `K` out of `n` documents /// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n + K)`. /// is `O(n log K)`.
pub(crate) struct TopSegmentCollector<T> { pub(crate) struct TopSegmentCollector<T> {
topn_computer: TopNComputer<T, DocId>, limit: usize,
heap: BinaryHeap<ComparableDoc<T, DocId>>,
segment_ord: u32, segment_ord: u32,
} }
impl<T: PartialOrd + Clone> TopSegmentCollector<T> { impl<T: PartialOrd> TopSegmentCollector<T> {
fn new(segment_ord: SegmentOrdinal, limit: usize) -> TopSegmentCollector<T> { fn new(segment_ord: SegmentOrdinal, limit: usize) -> TopSegmentCollector<T> {
TopSegmentCollector { TopSegmentCollector {
topn_computer: TopNComputer::new(limit), limit,
heap: BinaryHeap::with_capacity(limit),
segment_ord, segment_ord,
} }
} }
@@ -159,7 +158,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
impl<T: PartialOrd + Clone> TopSegmentCollector<T> { impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
pub fn harvest(self) -> Vec<(T, DocAddress)> { pub fn harvest(self) -> Vec<(T, DocAddress)> {
let segment_ord = self.segment_ord; let segment_ord = self.segment_ord;
self.topn_computer self.heap
.into_sorted_vec() .into_sorted_vec()
.into_iter() .into_iter()
.map(|comparable_doc| { .map(|comparable_doc| {
@@ -174,13 +173,33 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
.collect() .collect()
} }
/// Return true if at least `limit` documents have been collected.
#[inline]
pub(crate) fn at_capacity(&self) -> bool {
self.heap.len() >= self.limit
}
/// Collects a document scored by the given feature /// Collects a document scored by the given feature
/// ///
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it /// It collects documents until it has reached the max capacity. Once it reaches capacity, it
/// will compare the lowest scoring item with the given one and keep whichever is greater. /// will compare the lowest scoring item with the given one and keep whichever is greater.
#[inline] #[inline]
pub fn collect(&mut self, doc: DocId, feature: T) { pub fn collect(&mut self, doc: DocId, feature: T) {
self.topn_computer.push(ComparableDoc { feature, doc }); if self.at_capacity() {
// A limit of 0 is forbidden, so the heap is guaranteed to be non-empty here.
if let Some(limit_feature) = self.heap.peek().map(|head| head.feature.clone()) {
if limit_feature < feature {
if let Some(mut head) = self.heap.peek_mut() {
head.feature = feature;
head.doc = doc;
}
}
}
} else {
// we have not reached capacity yet, so we can just push the
// element.
self.heap.push(ComparableDoc { feature, doc });
}
} }
} }
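The collect logic above ("compare the lowest scoring item with the given one and keep whichever is greater") is the classic min-heap top-K pattern. Below is a standalone sketch using std's `BinaryHeap` with `Reverse` rather than the crate's `ComparableDoc`; the scores and `k` are made up:

use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Keep the `k` largest scores seen so far. `Reverse` turns the max-heap into a
/// min-heap, so `peek_mut` always exposes the weakest of the current top `k`.
fn collect_top_k(scores: impl IntoIterator<Item = u32>, k: usize) -> Vec<u32> {
    let mut heap: BinaryHeap<Reverse<u32>> = BinaryHeap::with_capacity(k);
    for score in scores {
        if heap.len() < k {
            heap.push(Reverse(score));
        } else if let Some(mut weakest) = heap.peek_mut() {
            // Replace the weakest element if the new score beats it.
            if weakest.0 < score {
                *weakest = Reverse(score);
            }
        }
    }
    let mut top: Vec<u32> = heap.into_iter().map(|Reverse(s)| s).collect();
    top.sort_unstable_by(|a, b| b.cmp(a));
    top
}

For example, `collect_top_k([3, 9, 1, 7], 2)` returns `[9, 7]`.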

View File

@@ -1,3 +1,4 @@
use std::collections::BinaryHeap;
use std::fmt; use std::fmt;
use std::marker::PhantomData; use std::marker::PhantomData;
use std::sync::Arc; use std::sync::Arc;
@@ -85,15 +86,12 @@ where
/// The `TopDocs` collector keeps track of the top `K` documents /// The `TopDocs` collector keeps track of the top `K` documents
/// sorted by their score. /// sorted by their score.
/// ///
/// The implementation is based on repeatedly truncating on the median once K * 2 documents /// The implementation is based on a `BinaryHeap`.
/// have been collected, using pattern-defeating QuickSort. /// The theoretical complexity for collecting the top `K` out of `n` documents
/// The theoretical complexity for collecting the top `K` out of `N` documents /// is `O(n log K)`.
/// is `O(N + K)`.
/// ///
/// This collector does not guarantee a stable sorting in case of a tie on the /// This collector guarantees a stable sorting in case of a tie on the
/// document score; for stable sorting, `PartialOrd` needs to resolve ties on other /// document score. As such, it is suitable to implement pagination.
/// fields, such as the doc id, in case of score equality.
/// Only then is it suitable for pagination.
/// ///
/// ```rust /// ```rust
/// use tantivy::collector::TopDocs; /// use tantivy::collector::TopDocs;
@@ -663,35 +661,50 @@ impl Collector for TopDocs {
reader: &SegmentReader, reader: &SegmentReader,
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> { ) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let heap_len = self.0.limit + self.0.offset; let heap_len = self.0.limit + self.0.offset;
let mut top_n = TopNComputer::new(heap_len); let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> = BinaryHeap::with_capacity(heap_len);
if let Some(alive_bitset) = reader.alive_bitset() { if let Some(alive_bitset) = reader.alive_bitset() {
let mut threshold = Score::MIN; let mut threshold = Score::MIN;
top_n.threshold = Some(threshold); weight.for_each_pruning(threshold, reader, &mut |doc, score| {
weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
if alive_bitset.is_deleted(doc) { if alive_bitset.is_deleted(doc) {
return threshold; return threshold;
} }
let doc = ComparableDoc { let heap_item = ComparableDoc {
feature: score, feature: score,
doc, doc,
}; };
top_n.push(doc); if heap.len() < heap_len {
threshold = top_n.threshold.unwrap_or(Score::MIN); heap.push(heap_item);
if heap.len() == heap_len {
threshold = heap.peek().map(|el| el.feature).unwrap_or(Score::MIN);
}
return threshold;
}
*heap.peek_mut().unwrap() = heap_item;
threshold = heap.peek().map(|el| el.feature).unwrap_or(Score::MIN);
threshold threshold
})?; })?;
} else { } else {
weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| { weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
let doc = ComparableDoc { let heap_item = ComparableDoc {
feature: score, feature: score,
doc, doc,
}; };
top_n.push(doc); if heap.len() < heap_len {
top_n.threshold.unwrap_or(Score::MIN) heap.push(heap_item);
// TODO the threshold is suboptimal for heap.len == heap_len
if heap.len() == heap_len {
return heap.peek().map(|el| el.feature).unwrap_or(Score::MIN);
} else {
return Score::MIN;
}
}
*heap.peek_mut().unwrap() = heap_item;
heap.peek().map(|el| el.feature).unwrap_or(Score::MIN)
})?; })?;
} }
let fruit = top_n let fruit = heap
.into_sorted_vec() .into_sorted_vec()
.into_iter() .into_iter()
.map(|cid| { .map(|cid| {
@@ -723,81 +736,9 @@ impl SegmentCollector for TopScoreSegmentCollector {
} }
} }
/// Fast TopN Computation
///
/// For TopN == 0, it will be relatively expensive.
pub struct TopNComputer<Score, DocId> {
buffer: Vec<ComparableDoc<Score, DocId>>,
top_n: usize,
pub(crate) threshold: Option<Score>,
}
impl<Score, DocId> TopNComputer<Score, DocId>
where
Score: PartialOrd + Clone,
DocId: Ord + Clone,
{
/// Create a new `TopNComputer`.
/// Internally it will allocate a buffer of size `2 * top_n`.
pub fn new(top_n: usize) -> Self {
let vec_cap = top_n.max(1) * 2;
TopNComputer {
buffer: Vec::with_capacity(vec_cap),
top_n,
threshold: None,
}
}
#[inline]
pub(crate) fn push(&mut self, doc: ComparableDoc<Score, DocId>) {
if let Some(last_median) = self.threshold.clone() {
if doc.feature < last_median {
return;
}
}
if self.buffer.len() == self.buffer.capacity() {
let median = self.truncate_top_n();
self.threshold = Some(median);
}
// This is faster since it avoids inlining the buffer-resizing code of vec.push()
// (this is in the hot path)
// TODO: Replace with `push_within_capacity` when it's stabilized
let uninit = self.buffer.spare_capacity_mut();
// This cannot panic, because truncate_top_n above removes at least one element, since
// the min capacity is 2.
uninit[0].write(doc);
// This is safe because the write above would already have panicked if there were no spare capacity.
unsafe {
self.buffer.set_len(self.buffer.len() + 1);
}
}
#[inline(never)]
fn truncate_top_n(&mut self) -> Score {
// Use select_nth_unstable to find the top nth score
let (_, median_el, _) = self.buffer.select_nth_unstable(self.top_n);
let median_score = median_el.feature.clone();
// Remove all elements below the top_n
self.buffer.truncate(self.top_n);
median_score
}
pub(crate) fn into_sorted_vec(mut self) -> Vec<ComparableDoc<Score, DocId>> {
if self.buffer.len() > self.top_n {
self.truncate_top_n();
}
self.buffer.sort_unstable();
self.buffer
}
}
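A simplified, self-contained sketch of the truncate-on-median strategy used by the removed `TopNComputer` above, written over plain `u32` scores; it is not the crate's code (no `ComparableDoc`, no unsafe buffer writes), only the algorithm:

/// Keep the `k` largest scores using a buffer of size `2 * k`. When the buffer
/// fills up, partition it with `select_nth_unstable_by` so the `k` best scores
/// sit at the front, remember the cut-off as a threshold, and drop the rest.
fn top_k_by_truncation(scores: impl IntoIterator<Item = u32>, k: usize) -> Vec<u32> {
    let cap = k.max(1) * 2;
    let mut buffer: Vec<u32> = Vec::with_capacity(cap);
    let mut threshold: Option<u32> = None;
    for score in scores {
        // Scores below the last cut-off can never make it into the top k.
        if let Some(t) = threshold {
            if score < t {
                continue;
            }
        }
        if buffer.len() == cap {
            // Descending partition: buffer[..k] now holds the k largest scores.
            buffer.select_nth_unstable_by(k, |a, b| b.cmp(a));
            threshold = Some(buffer[k]);
            buffer.truncate(k);
        }
        buffer.push(score);
    }
    buffer.sort_unstable_by(|a, b| b.cmp(a));
    buffer.truncate(k);
    buffer
}

The buffer never grows beyond `2 * k`, each truncation costs roughly `O(k)` on average via `select_nth_unstable_by`, and the remembered threshold lets later scores that cannot make the cut be skipped outright; that is where the `O(N + K)` bound quoted in the doc comments comes from.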
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::{TopDocs, TopNComputer}; use super::TopDocs;
use crate::collector::top_collector::ComparableDoc;
use crate::collector::Collector; use crate::collector::Collector;
use crate::query::{AllQuery, Query, QueryParser}; use crate::query::{AllQuery, Query, QueryParser};
use crate::schema::{Field, Schema, FAST, STORED, TEXT}; use crate::schema::{Field, Schema, FAST, STORED, TEXT};
@@ -826,78 +767,6 @@ mod tests {
} }
} }
#[test]
fn test_empty_topn_computer() {
let mut computer: TopNComputer<u32, u32> = TopNComputer::new(0);
computer.push(ComparableDoc {
feature: 1u32,
doc: 1u32,
});
computer.push(ComparableDoc {
feature: 1u32,
doc: 2u32,
});
computer.push(ComparableDoc {
feature: 1u32,
doc: 3u32,
});
assert!(computer.into_sorted_vec().is_empty());
}
#[test]
fn test_topn_computer() {
let mut computer: TopNComputer<u32, u32> = TopNComputer::new(2);
computer.push(ComparableDoc {
feature: 1u32,
doc: 1u32,
});
computer.push(ComparableDoc {
feature: 2u32,
doc: 2u32,
});
computer.push(ComparableDoc {
feature: 3u32,
doc: 3u32,
});
computer.push(ComparableDoc {
feature: 2u32,
doc: 4u32,
});
computer.push(ComparableDoc {
feature: 1u32,
doc: 5u32,
});
assert_eq!(
computer.into_sorted_vec(),
&[
ComparableDoc {
feature: 3u32,
doc: 3u32,
},
ComparableDoc {
feature: 2u32,
doc: 2u32,
}
]
);
}
#[test]
fn test_topn_computer_no_panic() {
for top_n in 0..10 {
let mut computer: TopNComputer<u32, u32> = TopNComputer::new(top_n);
for _ in 0..1 + top_n * 2 {
computer.push(ComparableDoc {
feature: 1u32,
doc: 1u32,
});
}
let _vals = computer.into_sorted_vec();
}
}
#[test] #[test]
fn test_top_collector_not_at_capacity_without_offset() -> crate::Result<()> { fn test_top_collector_not_at_capacity_without_offset() -> crate::Result<()> {
let index = make_index()?; let index = make_index()?;
@@ -983,25 +852,20 @@ mod tests {
// using AllQuery to get a constant score // using AllQuery to get a constant score
let searcher = index.reader().unwrap().searcher(); let searcher = index.reader().unwrap().searcher();
let page_0 = searcher.search(&AllQuery, &TopDocs::with_limit(1)).unwrap();
let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap(); let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap();
let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap(); let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap();
// precondition for the test to be meaningful: we did get documents // precondition for the test to be meaningful: we did get documents
// with the same score // with the same score
assert!(page_0.iter().all(|result| result.0 == page_1[0].0));
assert!(page_1.iter().all(|result| result.0 == page_1[0].0)); assert!(page_1.iter().all(|result| result.0 == page_1[0].0));
assert!(page_2.iter().all(|result| result.0 == page_2[0].0)); assert!(page_2.iter().all(|result| result.0 == page_2[0].0));
// sanity check since we're relying on make_index() // sanity check since we're relying on make_index()
assert_eq!(page_0.len(), 1);
assert_eq!(page_1.len(), 2); assert_eq!(page_1.len(), 2);
assert_eq!(page_2.len(), 3); assert_eq!(page_2.len(), 3);
assert_eq!(page_1, &page_2[..page_1.len()]); assert_eq!(page_1, &page_2[..page_1.len()]);
assert_eq!(page_0, &page_2[..page_0.len()]);
} }
#[test] #[test]

View File

@@ -18,12 +18,10 @@ use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_L
use crate::error::{DataCorruption, TantivyError}; use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN}; use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas; use crate::indexer::segment_updater::save_metas;
use crate::indexer::IndexWriter;
use crate::reader::{IndexReader, IndexReaderBuilder}; use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::document::Document;
use crate::schema::{Field, FieldType, Schema}; use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager}; use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{merge_field_meta_data, FieldMetadata, SegmentReader}; use crate::IndexWriter;
fn load_metas( fn load_metas(
directory: &dyn Directory, directory: &dyn Directory,
@@ -186,11 +184,11 @@ impl IndexBuilder {
/// ///
/// It expects an originally empty directory, and will not run any GC operation. /// It expects an originally empty directory, and will not run any GC operation.
#[doc(hidden)] #[doc(hidden)]
pub fn single_segment_index_writer<D: Document>( pub fn single_segment_index_writer(
self, self,
dir: impl Into<Box<dyn Directory>>, dir: impl Into<Box<dyn Directory>>,
mem_budget: usize, mem_budget: usize,
) -> crate::Result<SingleSegmentIndexWriter<D>> { ) -> crate::Result<SingleSegmentIndexWriter> {
let index = self.create(dir)?; let index = self.create(dir)?;
let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?; let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
Ok(index_simple_writer) Ok(index_simple_writer)
@@ -490,28 +488,6 @@ impl Index {
self.inventory.all() self.inventory.all()
} }
/// Returns the list of fields that have been indexed in the Index.
/// The field list includes the fields defined in the schema as well as the fields
/// that have been indexed as a part of a JSON field.
/// The returned field name is the full field name, including the name of the JSON field.
///
/// The returned field names can be used in queries.
///
/// Notice: If your data contains JSON fields this is **very expensive**, as it requires
/// browsing through the inverted index term dictionary and the columnar field dictionary.
///
/// Disclaimer: Some fields may not be listed here. For instance, if the schema contains a json
/// field that is neither indexed nor a fast field but is stored, it is possible for the field
/// to not be listed.
pub fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
let segments = self.searchable_segments()?;
let fields_metadata: Vec<Vec<FieldMetadata>> = segments
.into_iter()
.map(|segment| SegmentReader::open(&segment)?.fields_metadata())
.collect::<Result<_, _>>()?;
Ok(merge_field_meta_data(fields_metadata, &self.schema()))
}
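A short usage sketch for the `fields_metadata()` API removed in this hunk; the helper name and the output format are assumptions, while the struct fields come from the `FieldMetadata` definition further down in this diff:

use tantivy::Index;

fn print_field_metadata(index: &Index) -> tantivy::Result<()> {
    // Expect this to be slow on indexes with large JSON fields: it walks the
    // term dictionary and the columnar dictionary of every searchable segment.
    for field in index.fields_metadata()? {
        println!(
            "{:<30} type={:?} indexed={} stored={} fast={}",
            field.field_name, field.typ, field.indexed, field.stored, field.fast
        );
    }
    Ok(())
}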
/// Creates a new segment_meta (Advanced user only). /// Creates a new segment_meta (Advanced user only).
/// ///
/// As long as the `SegmentMeta` lives, the files associated with the /// As long as the `SegmentMeta` lives, the files associated with the
@@ -555,11 +531,11 @@ impl Index {
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`. /// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
/// If the memory arena per thread is too small or too big, returns /// If the memory arena per thread is too small or too big, returns
/// `TantivyError::InvalidArgument` /// `TantivyError::InvalidArgument`
pub fn writer_with_num_threads<D: Document>( pub fn writer_with_num_threads(
&self, &self,
num_threads: usize, num_threads: usize,
overall_memory_budget_in_bytes: usize, overall_memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter<D>> { ) -> crate::Result<IndexWriter> {
let directory_lock = self let directory_lock = self
.directory .directory
.acquire_lock(&INDEX_WRITER_LOCK) .acquire_lock(&INDEX_WRITER_LOCK)
@@ -588,8 +564,8 @@ impl Index {
/// That index writer simply has a single thread and a memory budget of 15 MB. /// That index writer simply has a single thread and a memory budget of 15 MB.
/// Using a single thread gives us a deterministic allocation of DocId. /// Using a single thread gives us a deterministic allocation of DocId.
#[cfg(test)] #[cfg(test)]
pub fn writer_for_tests<D: Document>(&self) -> crate::Result<IndexWriter<D>> { pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
self.writer_with_num_threads(1, MEMORY_BUDGET_NUM_BYTES_MIN) self.writer_with_num_threads(1, 15_000_000)
} }
/// Creates a multithreaded writer /// Creates a multithreaded writer
@@ -603,10 +579,7 @@ impl Index {
/// If the lockfile already exists, returns `Error::FileAlreadyExists`. /// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// If the memory arena per thread is too small or too big, returns /// If the memory arena per thread is too small or too big, returns
/// `TantivyError::InvalidArgument` /// `TantivyError::InvalidArgument`
pub fn writer<D: Document>( pub fn writer(&self, memory_budget_in_bytes: usize) -> crate::Result<IndexWriter> {
&self,
memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter<D>> {
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD); let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads; let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads;
if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN { if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
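For the generic `writer_with_num_threads` signature on the left of this hunk, a minimal sketch of choosing an explicit thread count and budget; the 4 threads and 240 MB are assumed illustration values, picked so that the per-thread share stays above the per-thread minimum:

use tantivy::{Index, IndexWriter, TantivyDocument};

fn make_writer(index: &Index) -> tantivy::Result<IndexWriter<TantivyDocument>> {
    // 240 MB split across 4 indexing threads leaves 60 MB per thread.
    index.writer_with_num_threads(4, 240_000_000)
}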

View File

@@ -1,12 +1,11 @@
use std::io; use std::io;
use common::BinarySerializable; use common::BinarySerializable;
use fnv::FnvHashSet;
use crate::directory::FileSlice; use crate::directory::FileSlice;
use crate::positions::PositionReader; use crate::positions::PositionReader;
use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo}; use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo};
use crate::schema::{IndexRecordOption, Term, Type, JSON_END_OF_PATH}; use crate::schema::{IndexRecordOption, Term};
use crate::termdict::TermDictionary; use crate::termdict::TermDictionary;
/// The inverted index reader is in charge of accessing /// The inverted index reader is in charge of accessing
@@ -70,28 +69,6 @@ impl InvertedIndexReader {
&self.termdict &self.termdict
} }
/// Return the fields and types encoded in the dictionary in lexicographic order.
/// Only valid on JSON fields.
///
/// Notice: This requires a full scan and is therefore **very expensive**.
/// TODO: Move to sstable to use the index.
pub fn list_encoded_fields(&self) -> io::Result<Vec<(String, Type)>> {
let mut stream = self.termdict.stream()?;
let mut fields = Vec::new();
let mut fields_set = FnvHashSet::default();
while let Some((term, _term_info)) = stream.next() {
if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) {
if !fields_set.contains(&term[..index + 2]) {
fields_set.insert(term[..index + 2].to_vec());
let typ = Type::from_code(term[index + 1]).unwrap();
fields.push((String::from_utf8_lossy(&term[..index]).to_string(), typ));
}
}
}
Ok(fields)
}
/// Resets the block segment to another position of the postings /// Resets the block segment to another position of the postings
/// file. /// file.
/// ///

View File

@@ -1,11 +1,11 @@
use columnar::MonotonicallyMappableToU64; use columnar::MonotonicallyMappableToU64;
use common::{replace_in_place, JsonPathWriter}; use common::replace_in_place;
use murmurhash32::murmurhash2;
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use crate::fastfield::FastValue; use crate::fastfield::FastValue;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value}; use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::term::JSON_PATH_SEGMENT_SEP;
use crate::schema::{Field, Type, DATE_TIME_PRECISION_INDEXED}; use crate::schema::{Field, Type, DATE_TIME_PRECISION_INDEXED};
use crate::time::format_description::well_known::Rfc3339; use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset}; use crate::time::{OffsetDateTime, UtcOffset};
@@ -57,41 +57,31 @@ struct IndexingPositionsPerPath {
} }
impl IndexingPositionsPerPath { impl IndexingPositionsPerPath {
fn get_position_from_id(&mut self, id: u32) -> &mut IndexingPosition { fn get_position(&mut self, term: &Term) -> &mut IndexingPosition {
self.positions_per_path.entry(id).or_default() self.positions_per_path
.entry(murmurhash2(term.serialized_term()))
.or_default()
} }
} }
/// Convert JSON_PATH_SEGMENT_SEP to a dot. pub(crate) fn index_json_values<'a>(
pub fn json_path_sep_to_dot(path: &mut str) {
// This is safe since we are replacing an ASCII character with another ASCII character.
unsafe {
replace_in_place(JSON_PATH_SEGMENT_SEP, b'.', path.as_bytes_mut());
}
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn index_json_values<'a, V: Value<'a>>(
doc: DocId, doc: DocId,
json_visitors: impl Iterator<Item = crate::Result<V::ObjectIter>>, json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
text_analyzer: &mut TextAnalyzer, text_analyzer: &mut TextAnalyzer,
expand_dots_enabled: bool, expand_dots_enabled: bool,
term_buffer: &mut Term, term_buffer: &mut Term,
postings_writer: &mut dyn PostingsWriter, postings_writer: &mut dyn PostingsWriter,
json_path_writer: &mut JsonPathWriter,
ctx: &mut IndexingContext, ctx: &mut IndexingContext,
) -> crate::Result<()> { ) -> crate::Result<()> {
json_path_writer.clear(); let mut json_term_writer = JsonTermWriter::wrap(term_buffer, expand_dots_enabled);
json_path_writer.set_expand_dots(expand_dots_enabled);
let mut positions_per_path: IndexingPositionsPerPath = Default::default(); let mut positions_per_path: IndexingPositionsPerPath = Default::default();
for json_visitor_res in json_visitors { for json_value_res in json_values {
let json_visitor = json_visitor_res?; let json_value = json_value_res?;
index_json_object::<V>( index_json_object(
doc, doc,
json_visitor, json_value,
text_analyzer, text_analyzer,
term_buffer, &mut json_term_writer,
json_path_writer,
postings_writer, postings_writer,
ctx, ctx,
&mut positions_per_path, &mut positions_per_path,
@@ -100,154 +90,93 @@ pub(crate) fn index_json_values<'a, V: Value<'a>>(
Ok(()) Ok(())
} }
#[allow(clippy::too_many_arguments)] fn index_json_object(
fn index_json_object<'a, V: Value<'a>>(
doc: DocId, doc: DocId,
json_visitor: V::ObjectIter, json_value: &serde_json::Map<String, serde_json::Value>,
text_analyzer: &mut TextAnalyzer, text_analyzer: &mut TextAnalyzer,
term_buffer: &mut Term, json_term_writer: &mut JsonTermWriter,
json_path_writer: &mut JsonPathWriter,
postings_writer: &mut dyn PostingsWriter, postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext, ctx: &mut IndexingContext,
positions_per_path: &mut IndexingPositionsPerPath, positions_per_path: &mut IndexingPositionsPerPath,
) { ) {
for (json_path_segment, json_value_visitor) in json_visitor { for (json_path_segment, json_value) in json_value {
json_path_writer.push(json_path_segment); json_term_writer.push_path_segment(json_path_segment);
index_json_value( index_json_value(
doc, doc,
json_value_visitor, json_value,
text_analyzer, text_analyzer,
term_buffer, json_term_writer,
json_path_writer,
postings_writer, postings_writer,
ctx, ctx,
positions_per_path, positions_per_path,
); );
json_path_writer.pop(); json_term_writer.pop_path_segment();
} }
} }
#[allow(clippy::too_many_arguments)] fn index_json_value(
fn index_json_value<'a, V: Value<'a>>(
doc: DocId, doc: DocId,
json_value: V, json_value: &serde_json::Value,
text_analyzer: &mut TextAnalyzer, text_analyzer: &mut TextAnalyzer,
term_buffer: &mut Term, json_term_writer: &mut JsonTermWriter,
json_path_writer: &mut JsonPathWriter,
postings_writer: &mut dyn PostingsWriter, postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext, ctx: &mut IndexingContext,
positions_per_path: &mut IndexingPositionsPerPath, positions_per_path: &mut IndexingPositionsPerPath,
) { ) {
let set_path_id = |term_buffer: &mut Term, unordered_id: u32| { match json_value {
term_buffer.truncate_value_bytes(0); serde_json::Value::Null => {}
term_buffer.append_bytes(&unordered_id.to_be_bytes()); serde_json::Value::Bool(val_bool) => {
}; json_term_writer.set_fast_value(*val_bool);
let set_type = |term_buffer: &mut Term, typ: Type| { postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
term_buffer.append_bytes(&[typ.to_code()]); }
}; serde_json::Value::Number(number) => {
if let Some(number_i64) = number.as_i64() {
match json_value.as_value() { json_term_writer.set_fast_value(number_i64);
ReferenceValue::Leaf(leaf) => match leaf { } else if let Some(number_u64) = number.as_u64() {
ReferenceValueLeaf::Null => {} json_term_writer.set_fast_value(number_u64);
ReferenceValueLeaf::Str(val) => { } else if let Some(number_f64) = number.as_f64() {
let mut token_stream = text_analyzer.token_stream(val); json_term_writer.set_fast_value(number_f64);
let unordered_id = ctx }
.path_to_unordered_id postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
.get_or_allocate_unordered_id(json_path_writer.as_str()); }
serde_json::Value::String(text) => match infer_type_from_str(text) {
// TODO: make sure the chain position works out. TextOrDateTime::Text(text) => {
set_path_id(term_buffer, unordered_id); let mut token_stream = text_analyzer.token_stream(text);
set_type(term_buffer, Type::Str); // TODO make sure the chain position works out.
let indexing_position = positions_per_path.get_position_from_id(unordered_id); json_term_writer.close_path_and_set_type(Type::Str);
let indexing_position = positions_per_path.get_position(json_term_writer.term());
postings_writer.index_text( postings_writer.index_text(
doc, doc,
&mut *token_stream, &mut *token_stream,
term_buffer, json_term_writer.term_buffer,
ctx, ctx,
indexing_position, indexing_position,
); );
} }
ReferenceValueLeaf::U64(val) => { TextOrDateTime::DateTime(dt) => {
set_path_id( json_term_writer.set_fast_value(DateTime::from_utc(dt));
term_buffer, postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::I64(val) => {
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::F64(val) => {
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::Bool(val) => {
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::Date(val) => {
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::PreTokStr(_) => {
unimplemented!(
"Pre-tokenized string support in dynamic fields is not yet implemented"
)
}
ReferenceValueLeaf::Bytes(_) => {
unimplemented!("Bytes support in dynamic fields is not yet implemented")
}
ReferenceValueLeaf::Facet(_) => {
unimplemented!("Facet support in dynamic fields is not yet implemented")
}
ReferenceValueLeaf::IpAddr(_) => {
unimplemented!("IP address support in dynamic fields is not yet implemented")
} }
}, },
ReferenceValue::Array(elements) => { serde_json::Value::Array(arr) => {
for val in elements { for val in arr {
index_json_value( index_json_value(
doc, doc,
val, val,
text_analyzer, text_analyzer,
term_buffer, json_term_writer,
json_path_writer,
postings_writer, postings_writer,
ctx, ctx,
positions_per_path, positions_per_path,
); );
} }
} }
ReferenceValue::Object(object) => { serde_json::Value::Object(map) => {
index_json_object::<V>( index_json_object(
doc, doc,
object, map,
text_analyzer, text_analyzer,
term_buffer, json_term_writer,
json_path_writer,
postings_writer, postings_writer,
ctx, ctx,
positions_per_path, positions_per_path,
@@ -256,6 +185,21 @@ fn index_json_value<'a, V: Value<'a>>(
} }
} }
enum TextOrDateTime<'a> {
Text(&'a str),
DateTime(OffsetDateTime),
}
fn infer_type_from_str(text: &str) -> TextOrDateTime {
match OffsetDateTime::parse(text, &Rfc3339) {
Ok(dt) => {
let dt_utc = dt.to_offset(UtcOffset::UTC);
TextOrDateTime::DateTime(dt_utc)
}
Err(_) => TextOrDateTime::Text(text),
}
}
// Tries to infer a JSON type from a string. // Tries to infer a JSON type from a string.
pub fn convert_to_fast_value_and_get_term( pub fn convert_to_fast_value_and_get_term(
json_term_writer: &mut JsonTermWriter, json_term_writer: &mut JsonTermWriter,
@@ -328,7 +272,7 @@ pub struct JsonTermWriter<'a> {
/// In other words, /// In other words,
/// - `k8s.node` ends up as `["k8s", "node"]`. /// - `k8s.node` ends up as `["k8s", "node"]`.
/// - `k8s\.node` ends up as `["k8s.node"]`. /// - `k8s\.node` ends up as `["k8s.node"]`.
pub fn split_json_path(json_path: &str) -> Vec<String> { fn split_json_path(json_path: &str) -> Vec<String> {
let mut escaped_state: bool = false; let mut escaped_state: bool = false;
let mut json_path_segments = Vec::new(); let mut json_path_segments = Vec::new();
let mut buffer = String::new(); let mut buffer = String::new();
@@ -368,13 +312,17 @@ pub(crate) fn encode_column_name(
json_path: &str, json_path: &str,
expand_dots_enabled: bool, expand_dots_enabled: bool,
) -> String { ) -> String {
let mut path = JsonPathWriter::default(); let mut column_key: String = String::with_capacity(field_name.len() + json_path.len() + 1);
path.push(field_name); column_key.push_str(field_name);
path.set_expand_dots(expand_dots_enabled); for mut segment in split_json_path(json_path) {
for segment in split_json_path(json_path) { column_key.push_str(JSON_PATH_SEGMENT_SEP_STR);
path.push(&segment); if expand_dots_enabled {
// We need to replace `.` by JSON_PATH_SEGMENT_SEP.
unsafe { replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, segment.as_bytes_mut()) };
}
column_key.push_str(&segment);
} }
path.into() column_key
} }
impl<'a> JsonTermWriter<'a> { impl<'a> JsonTermWriter<'a> {
@@ -414,7 +362,6 @@ impl<'a> JsonTermWriter<'a> {
self.term_buffer.append_bytes(&[typ.to_code()]); self.term_buffer.append_bytes(&[typ.to_code()]);
} }
// TODO: Remove this function and use JsonPathWriter instead.
pub fn push_path_segment(&mut self, segment: &str) { pub fn push_path_segment(&mut self, segment: &str) {
// the path stack should never be empty. // the path stack should never be empty.
self.trim_to_end_of_path(); self.trim_to_end_of_path();
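Because the escaping rule of `split_json_path` is only described by example above (`k8s.node` versus `k8s\.node`), here is a standalone sketch of that documented behaviour; it is not the crate's implementation:

fn split_json_path_sketch(json_path: &str) -> Vec<String> {
    let mut segments = Vec::new();
    let mut buffer = String::new();
    let mut escaped = false;
    for ch in json_path.chars() {
        if escaped {
            // A backslash escapes the next character, so `\.` keeps a literal dot.
            buffer.push(ch);
            escaped = false;
        } else if ch == '\\' {
            escaped = true;
        } else if ch == '.' {
            // An unescaped dot closes the current segment.
            segments.push(std::mem::take(&mut buffer));
        } else {
            buffer.push(ch);
        }
    }
    segments.push(buffer);
    segments
}

#[test]
fn split_json_path_sketch_matches_doc_examples() {
    assert_eq!(split_json_path_sketch("k8s.node"), vec!["k8s", "node"]);
    assert_eq!(split_json_path_sketch("k8s\\.node"), vec!["k8s.node"]);
}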

View File

@@ -25,7 +25,7 @@ pub use self::searcher::{Searcher, SearcherGeneration};
pub use self::segment::Segment; pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent; pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId; pub use self::segment_id::SegmentId;
pub use self::segment_reader::{merge_field_meta_data, FieldMetadata, SegmentReader}; pub use self::segment_reader::SegmentReader;
pub use self::single_segment_index_writer::SingleSegmentIndexWriter; pub use self::single_segment_index_writer::SingleSegmentIndexWriter;
/// The meta file contains all the information about the list of segments and the schema /// The meta file contains all the information about the list of segments and the schema

View File

@@ -5,8 +5,7 @@ use std::{fmt, io};
use crate::collector::Collector; use crate::collector::Collector;
use crate::core::{Executor, SegmentReader}; use crate::core::{Executor, SegmentReader};
use crate::query::{Bm25StatisticsProvider, EnableScoring, Query}; use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
use crate::schema::document::DocumentDeserialize; use crate::schema::{Document, Schema, Term};
use crate::schema::{Schema, Term};
use crate::space_usage::SearcherSpaceUsage; use crate::space_usage::SearcherSpaceUsage;
use crate::store::{CacheStats, StoreReader}; use crate::store::{CacheStats, StoreReader};
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject}; use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject};
@@ -84,7 +83,7 @@ impl Searcher {
/// ///
/// The searcher uses the segment ordinal to route the /// The searcher uses the segment ordinal to route the
/// request to the right `Segment`. /// request to the right `Segment`.
pub fn doc<D: DocumentDeserialize>(&self, doc_address: DocAddress) -> crate::Result<D> { pub fn doc(&self, doc_address: DocAddress) -> crate::Result<Document> {
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize]; let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get(doc_address.doc_id) store_reader.get(doc_address.doc_id)
} }
@@ -104,10 +103,7 @@ impl Searcher {
/// Fetches a document in an asynchronous manner. /// Fetches a document in an asynchronous manner.
#[cfg(feature = "quickwit")] #[cfg(feature = "quickwit")]
pub async fn doc_async<D: DocumentDeserialize>( pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result<Document> {
&self,
doc_address: DocAddress,
) -> crate::Result<D> {
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize]; let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get_async(doc_address.doc_id).await store_reader.get_async(doc_address.doc_id).await
} }
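For the generic `doc` signature on the left of the hunk above, a minimal sketch of fetching a stored document; it assumes `TantivyDocument`, the crate's default document type, implements `DocumentDeserialize`:

use tantivy::{DocAddress, Searcher, TantivyDocument};

fn fetch_doc(searcher: &Searcher, addr: DocAddress) -> tantivy::Result<TantivyDocument> {
    // The caller chooses the concrete type to deserialize into.
    searcher.doc::<TantivyDocument>(addr)
}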

View File

@@ -1,17 +1,12 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::ops::BitOrAssign;
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
use std::{fmt, io}; use std::{fmt, io};
use fnv::FnvHashMap;
use itertools::Itertools;
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId}; use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice}; use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders}; use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::json_utils::json_path_sep_to_dot;
use crate::schema::{Field, IndexRecordOption, Schema, Type}; use crate::schema::{Field, IndexRecordOption, Schema, Type};
use crate::space_usage::SegmentSpaceUsage; use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader; use crate::store::StoreReader;
@@ -285,103 +280,6 @@ impl SegmentReader {
Ok(inv_idx_reader) Ok(inv_idx_reader)
} }
/// Returns the list of fields that have been indexed in the segment.
/// The field list includes the fields defined in the schema as well as the fields
/// that have been indexed as a part of a JSON field.
/// The returned field name is the full field name, including the name of the JSON field.
///
/// The returned field names can be used in queries.
///
/// Notice: If your data contains JSON fields this is **very expensive**, as it requires
/// browsing through the inverted index term dictionary and the columnar field dictionary.
///
/// Disclaimer: Some fields may not be listed here. For instance, if the schema contains a json
/// field that is neither indexed nor a fast field but is stored, it is possible for the field
/// to not be listed.
pub fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
let mut indexed_fields: Vec<FieldMetadata> = Vec::new();
let mut map_to_canonical = FnvHashMap::default();
for (field, field_entry) in self.schema().fields() {
let field_name = field_entry.name().to_string();
let is_indexed = field_entry.is_indexed();
if is_indexed {
let is_json = field_entry.field_type().value_type() == Type::Json;
if is_json {
let inv_index = self.inverted_index(field)?;
let encoded_fields_in_index = inv_index.list_encoded_fields()?;
let mut build_path = |field_name: &str, mut json_path: String| {
// In this case we need to map the potential fast field to the field name
// accepted by the query parser.
let create_canonical =
!field_entry.is_expand_dots_enabled() && json_path.contains('.');
if create_canonical {
// Without expand dots enabled dots need to be escaped.
let escaped_json_path = json_path.replace('.', "\\.");
let full_path = format!("{}.{}", field_name, escaped_json_path);
let full_path_unescaped = format!("{}.{}", field_name, &json_path);
map_to_canonical.insert(full_path_unescaped, full_path.to_string());
full_path
} else {
// With expand dots enabled, we can use '.' instead of '\u{1}'.
json_path_sep_to_dot(&mut json_path);
format!("{}.{}", field_name, json_path)
}
};
indexed_fields.extend(
encoded_fields_in_index
.into_iter()
.map(|(name, typ)| (build_path(&field_name, name), typ))
.map(|(field_name, typ)| FieldMetadata {
indexed: true,
stored: false,
field_name,
fast: false,
typ,
}),
);
} else {
indexed_fields.push(FieldMetadata {
indexed: true,
stored: false,
field_name: field_name.to_string(),
fast: false,
typ: field_entry.field_type().value_type(),
});
}
}
}
let mut fast_fields: Vec<FieldMetadata> = self
.fast_fields()
.columnar()
.iter_columns()?
.map(|(mut field_name, handle)| {
json_path_sep_to_dot(&mut field_name);
// map to canonical path, to avoid similar but different entries.
// Eventually we should just accept '.'-separated paths for all cases.
let field_name = map_to_canonical
.get(&field_name)
.unwrap_or(&field_name)
.to_string();
FieldMetadata {
indexed: false,
stored: false,
field_name,
fast: true,
typ: Type::from(handle.column_type()),
}
})
.collect();
// Since the type is encoded differently in the fast field and in the inverted index,
// the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
// If we are sure that the order is the same, we can remove this sort.
indexed_fields.sort_unstable();
fast_fields.sort_unstable();
let merged = merge_field_meta_data(vec![indexed_fields, fast_fields], &self.schema);
Ok(merged)
}
/// Returns the segment id /// Returns the segment id
pub fn segment_id(&self) -> SegmentId { pub fn segment_id(&self) -> SegmentId {
self.segment_id self.segment_id
@@ -432,65 +330,6 @@ impl SegmentReader {
} }
} }
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
/// FieldMetadata
pub struct FieldMetadata {
/// The field name
// Notice: Don't reorder the declaration of 1.field_name 2.typ, as it is used for ordering by
// field_name then typ.
pub field_name: String,
/// The field type
// Notice: Don't reorder the declaration of 1.field_name 2.typ, as it is used for ordering by
// field_name then typ.
pub typ: Type,
/// Is the field indexed for search
pub indexed: bool,
/// Is the field stored in the doc store
pub stored: bool,
/// Is the field stored in the columnar storage
pub fast: bool,
}
impl BitOrAssign for FieldMetadata {
fn bitor_assign(&mut self, rhs: Self) {
assert!(self.field_name == rhs.field_name);
assert!(self.typ == rhs.typ);
self.indexed |= rhs.indexed;
self.stored |= rhs.stored;
self.fast |= rhs.fast;
}
}
// Maybe too slow for the high cardinality case
fn is_field_stored(field_name: &str, schema: &Schema) -> bool {
schema
.find_field(field_name)
.map(|(field, _path)| schema.get_field_entry(field).is_stored())
.unwrap_or(false)
}
/// Helper to merge the field metadata from multiple segments.
pub fn merge_field_meta_data(
field_metadatas: Vec<Vec<FieldMetadata>>,
schema: &Schema,
) -> Vec<FieldMetadata> {
let mut merged_field_metadata = Vec::new();
for (_key, mut group) in &field_metadatas
.into_iter()
.kmerge_by(|left, right| left < right)
// TODO: Remove allocation
.group_by(|el| (el.field_name.to_string(), el.typ))
{
let mut merged: FieldMetadata = group.next().unwrap();
for el in group {
merged |= el;
}
// Currently is_field_stored may be too slow for the high cardinality case
merged.stored = is_field_stored(&merged.field_name, schema);
merged_field_metadata.push(merged);
}
merged_field_metadata
}
fn intersect_alive_bitset( fn intersect_alive_bitset(
left_opt: Option<AliveBitSet>, left_opt: Option<AliveBitSet>,
right_opt: Option<AliveBitSet>, right_opt: Option<AliveBitSet>,
@@ -514,127 +353,9 @@ impl fmt::Debug for SegmentReader {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*;
use crate::core::Index; use crate::core::Index;
use crate::schema::{Schema, SchemaBuilder, Term, STORED, TEXT}; use crate::schema::{Schema, Term, STORED, TEXT};
use crate::{DocId, FieldMetadata, IndexWriter}; use crate::DocId;
#[test]
fn test_merge_field_meta_data_same() {
let schema = SchemaBuilder::new().build();
let field_metadata1 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: true,
};
let field_metadata2 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: true,
};
let res = merge_field_meta_data(
vec![vec![field_metadata1.clone()], vec![field_metadata2]],
&schema,
);
assert_eq!(res, vec![field_metadata1]);
}
#[test]
fn test_merge_field_meta_data_different() {
let schema = SchemaBuilder::new().build();
let field_metadata1 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: false,
stored: false,
fast: true,
};
let field_metadata2 = FieldMetadata {
field_name: "b".to_string(),
typ: crate::schema::Type::Str,
indexed: false,
stored: false,
fast: true,
};
let field_metadata3 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: false,
};
let res = merge_field_meta_data(
vec![
vec![field_metadata1.clone(), field_metadata2.clone()],
vec![field_metadata3],
],
&schema,
);
let field_metadata_expected1 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: true,
};
assert_eq!(res, vec![field_metadata_expected1, field_metadata2.clone()]);
}
#[test]
fn test_merge_field_meta_data_merge() {
use pretty_assertions::assert_eq;
let get_meta_data = |name: &str, typ: Type| FieldMetadata {
field_name: name.to_string(),
typ,
indexed: false,
stored: false,
fast: true,
};
let schema = SchemaBuilder::new().build();
let mut metas = vec![get_meta_data("d", Type::Str), get_meta_data("e", Type::U64)];
metas.sort();
let res = merge_field_meta_data(vec![vec![get_meta_data("e", Type::Str)], metas], &schema);
assert_eq!(
res,
vec![
get_meta_data("d", Type::Str),
get_meta_data("e", Type::Str),
get_meta_data("e", Type::U64),
]
);
}
#[test]
fn test_merge_field_meta_data_bitxor() {
let field_metadata1 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: false,
stored: false,
fast: true,
};
let field_metadata2 = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: false,
};
let field_metadata_expected = FieldMetadata {
field_name: "a".to_string(),
typ: crate::schema::Type::Str,
indexed: true,
stored: false,
fast: true,
};
let mut res1 = field_metadata1.clone();
res1 |= field_metadata2.clone();
let mut res2 = field_metadata2.clone();
res2 |= field_metadata1;
assert_eq!(res1, field_metadata_expected);
assert_eq!(res2, field_metadata_expected);
}
#[test] #[test]
fn test_num_alive() -> crate::Result<()> { fn test_num_alive() -> crate::Result<()> {
@@ -645,7 +366,7 @@ mod test {
let name = schema.get_field("name").unwrap(); let name = schema.get_field("name").unwrap();
{ {
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?; index_writer.add_document(doc!(name => "tantivy"))?;
index_writer.add_document(doc!(name => "horse"))?; index_writer.add_document(doc!(name => "horse"))?;
index_writer.add_document(doc!(name => "jockey"))?; index_writer.add_document(doc!(name => "jockey"))?;
@@ -671,7 +392,7 @@ mod test {
let name = schema.get_field("name").unwrap(); let name = schema.get_field("name").unwrap();
{ {
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?; index_writer.add_document(doc!(name => "tantivy"))?;
index_writer.add_document(doc!(name => "horse"))?; index_writer.add_document(doc!(name => "horse"))?;
index_writer.add_document(doc!(name => "jockey"))?; index_writer.add_document(doc!(name => "jockey"))?;
@@ -681,7 +402,7 @@ mod test {
} }
{ {
let mut index_writer2: IndexWriter = index.writer(50_000_000)?; let mut index_writer2 = index.writer(50_000_000)?;
index_writer2.delete_term(Term::from_field_text(name, "horse")); index_writer2.delete_term(Term::from_field_text(name, "horse"));
index_writer2.delete_term(Term::from_field_text(name, "cap")); index_writer2.delete_term(Term::from_field_text(name, "cap"));

View File

@@ -1,20 +1,16 @@
use std::marker::PhantomData;
use crate::indexer::operation::AddOperation; use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas; use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter; use crate::indexer::SegmentWriter;
use crate::schema::document::Document; use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
use crate::{Directory, Index, IndexMeta, Opstamp, Segment, TantivyDocument};
#[doc(hidden)] #[doc(hidden)]
pub struct SingleSegmentIndexWriter<D: Document = TantivyDocument> { pub struct SingleSegmentIndexWriter {
segment_writer: SegmentWriter, segment_writer: SegmentWriter,
segment: Segment, segment: Segment,
opstamp: Opstamp, opstamp: Opstamp,
_phantom: PhantomData<D>,
} }
impl<D: Document> SingleSegmentIndexWriter<D> { impl SingleSegmentIndexWriter {
pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> { pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
let segment = index.new_segment(); let segment = index.new_segment();
let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?; let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
@@ -22,7 +18,6 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
segment_writer, segment_writer,
segment, segment,
opstamp: 0, opstamp: 0,
_phantom: PhantomData,
}) })
} }
@@ -30,7 +25,7 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
self.segment_writer.mem_usage() self.segment_writer.mem_usage()
} }
pub fn add_document(&mut self, document: D) -> crate::Result<()> { pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
let opstamp = self.opstamp; let opstamp = self.opstamp;
self.opstamp += 1; self.opstamp += 1;
self.segment_writer self.segment_writer

View File

@@ -1,13 +1,12 @@
use crate::collector::Count; use crate::collector::Count;
use crate::directory::{RamDirectory, WatchCallback}; use crate::directory::{RamDirectory, WatchCallback};
use crate::indexer::{LogMergePolicy, NoMergePolicy}; use crate::indexer::NoMergePolicy;
use crate::json_utils::JsonTermWriter;
use crate::query::TermQuery; use crate::query::TermQuery;
use crate::schema::{Field, IndexRecordOption, Schema, Type, INDEXED, STRING, TEXT}; use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
use crate::tokenizer::TokenizerManager; use crate::tokenizer::TokenizerManager;
use crate::{ use crate::{
Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, Postings, Directory, Document, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, SegmentId,
ReloadPolicy, SegmentId, TantivyDocument, Term, Term,
}; };
#[test] #[test]
@@ -122,7 +121,7 @@ fn test_index_on_commit_reload_policy() -> crate::Result<()> {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay) .reload_policy(ReloadPolicy::OnCommit)
.try_into() .try_into()
.unwrap(); .unwrap();
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
@@ -148,7 +147,7 @@ mod mmap_specific {
let index = Index::create_in_dir(tempdir_path, schema).unwrap(); let index = Index::create_in_dir(tempdir_path, schema).unwrap();
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay) .reload_policy(ReloadPolicy::OnCommit)
.try_into() .try_into()
.unwrap(); .unwrap();
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
@@ -160,7 +159,7 @@ mod mmap_specific {
let schema = throw_away_schema(); let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap(); let field = schema.get_field("num_likes").unwrap();
let mut index = Index::create_from_tempdir(schema)?; let mut index = Index::create_from_tempdir(schema)?;
let mut writer: IndexWriter = index.writer_for_tests()?; let mut writer = index.writer_for_tests()?;
writer.commit()?; writer.commit()?;
let reader = index let reader = index
.reader_builder() .reader_builder()
@@ -190,7 +189,7 @@ mod mmap_specific {
let read_index = Index::open_in_dir(&tempdir_path).unwrap(); let read_index = Index::open_in_dir(&tempdir_path).unwrap();
let reader = read_index let reader = read_index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay) .reload_policy(ReloadPolicy::OnCommit)
.try_into() .try_into()
.unwrap(); .unwrap();
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
@@ -209,7 +208,7 @@ fn test_index_on_commit_reload_policy_aux(
.watch(WatchCallback::new(move || { .watch(WatchCallback::new(move || {
let _ = sender.send(()); let _ = sender.send(());
})); }));
let mut writer: IndexWriter = index.writer_for_tests()?; let mut writer = index.writer_for_tests()?;
assert_eq!(reader.searcher().num_docs(), 0); assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64))?; writer.add_document(doc!(field=>1u64))?;
writer.commit().unwrap(); writer.commit().unwrap();
@@ -243,7 +242,7 @@ fn garbage_collect_works_as_intended() -> crate::Result<()> {
let field = schema.get_field("num_likes").unwrap(); let field = schema.get_field("num_likes").unwrap();
let index = Index::create(directory.clone(), schema, IndexSettings::default())?; let index = Index::create(directory.clone(), schema, IndexSettings::default())?;
let mut writer: IndexWriter = index.writer_with_num_threads(1, 32_000_000).unwrap(); let mut writer = index.writer_with_num_threads(1, 32_000_000).unwrap();
for _seg in 0..8 { for _seg in 0..8 {
for i in 0u64..1_000u64 { for i in 0u64..1_000u64 {
writer.add_document(doc!(field => i))?; writer.add_document(doc!(field => i))?;
@@ -307,7 +306,7 @@ fn test_merging_segment_update_docfreq() {
let id_field = schema_builder.add_text_field("id", STRING); let id_field = schema_builder.add_text_field("id", STRING);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_for_tests().unwrap(); let mut writer = index.writer_for_tests().unwrap();
writer.set_merge_policy(Box::new(NoMergePolicy)); writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..5 { for _ in 0..5 {
writer.add_document(doc!(text_field=>"hello")).unwrap(); writer.add_document(doc!(text_field=>"hello")).unwrap();
@@ -318,13 +317,13 @@ fn test_merging_segment_update_docfreq() {
writer writer
.add_document(doc!(text_field=>"hello", id_field=>"TO_BE_DELETED")) .add_document(doc!(text_field=>"hello", id_field=>"TO_BE_DELETED"))
.unwrap(); .unwrap();
writer.add_document(TantivyDocument::default()).unwrap(); writer.add_document(Document::default()).unwrap();
writer.commit().unwrap(); writer.commit().unwrap();
for _ in 0..7 { for _ in 0..7 {
writer.add_document(doc!(text_field=>"hello")).unwrap(); writer.add_document(doc!(text_field=>"hello")).unwrap();
} }
writer.add_document(TantivyDocument::default()).unwrap(); writer.add_document(Document::default()).unwrap();
writer.add_document(TantivyDocument::default()).unwrap(); writer.add_document(Document::default()).unwrap();
writer.delete_term(Term::from_field_text(id_field, "TO_BE_DELETED")); writer.delete_term(Term::from_field_text(id_field, "TO_BE_DELETED"));
writer.commit().unwrap(); writer.commit().unwrap();
@@ -345,132 +344,3 @@ fn test_merging_segment_update_docfreq() {
let term_info = inv_index.get_term_info(&term).unwrap().unwrap(); let term_info = inv_index.get_term_info(&term).unwrap().unwrap();
assert_eq!(term_info.doc_freq, 12); assert_eq!(term_info.doc_freq, 12);
} }
// motivated by https://github.com/quickwit-oss/quickwit/issues/4130
#[test]
fn test_positions_merge_bug_non_text_json_vint() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_json_field("dynamic", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let mut merge_policy = LogMergePolicy::default();
merge_policy.set_min_num_segments(2);
writer.set_merge_policy(Box::new(merge_policy));
// A plain string value would not trigger the bug; it takes a non-text JSON value.
let doc_json = r#"{"tenant_id":75}"#;
let vals = serde_json::from_str(doc_json).unwrap();
let mut doc = TantivyDocument::default();
doc.add_object(field, vals);
writer.add_document(doc.clone()).unwrap();
writer.commit().unwrap();
writer.add_document(doc.clone()).unwrap();
writer.commit().unwrap();
writer.wait_merging_threads().unwrap();
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().segment_readers().len(), 1);
}
// Same as above but with bitpacked blocks
#[test]
fn test_positions_merge_bug_non_text_json_bitpacked_block() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_json_field("dynamic", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let mut merge_policy = LogMergePolicy::default();
merge_policy.set_min_num_segments(2);
writer.set_merge_policy(Box::new(merge_policy));
// A plain string value would not trigger the bug; it takes a non-text JSON value.
let doc_json = r#"{"tenant_id":75}"#;
let vals = serde_json::from_str(doc_json).unwrap();
let mut doc = TantivyDocument::default();
doc.add_object(field, vals);
for _ in 0..128 {
writer.add_document(doc.clone()).unwrap();
}
writer.commit().unwrap();
writer.add_document(doc.clone()).unwrap();
writer.commit().unwrap();
writer.wait_merging_threads().unwrap();
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().segment_readers().len(), 1);
}
#[test]
fn test_non_text_json_term_freq() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_json_field("dynamic", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
// Here a string would work.
let doc_json = r#"{"tenant_id":75}"#;
let vals = serde_json::from_str(doc_json).unwrap();
let mut doc = TantivyDocument::default();
doc.add_object(field, vals);
writer.add_document(doc.clone()).unwrap();
writer.commit().unwrap();
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().segment_readers().len(), 1);
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inv_idx = segment_reader.inverted_index(field).unwrap();
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
json_term_writer.push_path_segment("tenant_id");
json_term_writer.close_path_and_set_type(Type::U64);
json_term_writer.set_fast_value(75u64);
let postings = inv_idx
.read_postings(
&json_term_writer.term(),
IndexRecordOption::WithFreqsAndPositions,
)
.unwrap()
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.term_freq(), 1u32);
}
#[test]
fn test_non_text_json_term_freq_bitpacked() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_json_field("dynamic", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
// Here a string would work.
let doc_json = r#"{"tenant_id":75}"#;
let vals = serde_json::from_str(doc_json).unwrap();
let mut doc = TantivyDocument::default();
doc.add_object(field, vals);
let num_docs = 132;
for _ in 0..num_docs {
writer.add_document(doc.clone()).unwrap();
}
writer.commit().unwrap();
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().segment_readers().len(), 1);
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inv_idx = segment_reader.inverted_index(field).unwrap();
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
json_term_writer.push_path_segment("tenant_id");
json_term_writer.close_path_and_set_type(Type::U64);
json_term_writer.set_fast_value(75u64);
let mut postings = inv_idx
.read_postings(
&json_term_writer.term(),
IndexRecordOption::WithFreqsAndPositions,
)
.unwrap()
.unwrap();
assert_eq!(postings.doc(), 0);
assert_eq!(postings.term_freq(), 1u32);
for i in 1..num_docs {
assert_eq!(postings.advance(), i);
assert_eq!(postings.term_freq(), 1u32);
}
}

View File

@@ -222,8 +222,8 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// registered (and whose [`WatchHandle`] is still alive) are triggered. /// registered (and whose [`WatchHandle`] is still alive) are triggered.
/// ///
/// Internally, tantivy only uses this API to detect new commits to implement the /// Internally, tantivy only uses this API to detect new commits to implement the
/// `OnCommitWithDelay` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents /// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
/// the `OnCommitWithDelay` `ReloadPolicy` from working properly. /// `OnCommit` `ReloadPolicy` from working properly.
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle>; fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle>;
} }
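For context, this is the reload path the comment refers to. A hedged sketch of a reader opting into the watch-driven policy (variant name as it appears on the left-hand side of this diff; the right-hand side still calls it `OnCommit`):

```rust
use tantivy::{Index, IndexReader, ReloadPolicy};

// Sketch: build a reader that relies on Directory::watch to pick up new commits.
fn auto_reloading_reader(index: &Index) -> tantivy::Result<IndexReader> {
    index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommitWithDelay)
        .try_into()
}
```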

View File

@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
use crate::directory::error::Incompatibility; use crate::directory::error::Incompatibility;
use crate::directory::{AntiCallToken, FileSlice, TerminatingWrite}; use crate::directory::{AntiCallToken, FileSlice, TerminatingWrite};
use crate::{Version, INDEX_FORMAT_OLDEST_SUPPORTED_VERSION, INDEX_FORMAT_VERSION}; use crate::{Version, INDEX_FORMAT_VERSION};
const FOOTER_MAX_LEN: u32 = 50_000; const FOOTER_MAX_LEN: u32 = 50_000;
@@ -102,11 +102,10 @@ impl Footer {
/// Confirms that the index will be read correctly by this version of tantivy /// Confirms that the index will be read correctly by this version of tantivy
/// Has to be called after `extract_footer` to make sure it's not accessing uninitialised memory /// Has to be called after `extract_footer` to make sure it's not accessing uninitialised memory
pub fn is_compatible(&self) -> Result<(), Incompatibility> { pub fn is_compatible(&self) -> Result<(), Incompatibility> {
const SUPPORTED_INDEX_FORMAT_VERSION_RANGE: std::ops::RangeInclusive<u32> =
INDEX_FORMAT_OLDEST_SUPPORTED_VERSION..=INDEX_FORMAT_VERSION;
let library_version = crate::version(); let library_version = crate::version();
if !SUPPORTED_INDEX_FORMAT_VERSION_RANGE.contains(&self.version.index_format_version) { if self.version.index_format_version < 4
|| self.version.index_format_version > INDEX_FORMAT_VERSION
{
return Err(Incompatibility::IndexMismatch { return Err(Incompatibility::IndexMismatch {
library_version: library_version.clone(), library_version: library_version.clone(),
index_version: self.version.clone(), index_version: self.version.clone(),

View File

@@ -1,15 +1,13 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::fmt; use std::fmt;
use std::fs::{self, File, OpenOptions}; use std::fs::{self, File, OpenOptions};
use std::io::{self, BufWriter, Read, Write}; use std::io::{self, BufWriter, Read, Seek, Write};
use std::ops::Deref; use std::ops::Deref;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock, Weak}; use std::sync::{Arc, RwLock, Weak};
use common::StableDeref; use common::StableDeref;
use fs4::FileExt; use fs4::FileExt;
#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
use memmap2::Mmap; use memmap2::Mmap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tempfile::TempDir; use tempfile::TempDir;
@@ -23,6 +21,8 @@ use crate::directory::{
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite, AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
WatchCallback, WatchHandle, WritePtr, WatchCallback, WatchHandle, WritePtr,
}; };
#[cfg(unix)]
use crate::Advice;
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>; pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>; pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
@@ -328,6 +328,12 @@ impl Write for SafeFileWriter {
} }
} }
impl Seek for SafeFileWriter {
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
self.0.seek(pos)
}
}
impl TerminatingWrite for SafeFileWriter { impl TerminatingWrite for SafeFileWriter {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> { fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
self.0.flush()?; self.0.flush()?;
@@ -533,7 +539,7 @@ mod tests {
use super::*; use super::*;
use crate::indexer::LogMergePolicy; use crate::indexer::LogMergePolicy;
use crate::schema::{Schema, SchemaBuilder, TEXT}; use crate::schema::{Schema, SchemaBuilder, TEXT};
use crate::{Index, IndexSettings, IndexWriter, ReloadPolicy}; use crate::{Index, IndexSettings, ReloadPolicy};
#[test] #[test]
fn test_open_non_existent_path() { fn test_open_non_existent_path() {
@@ -645,7 +651,7 @@ mod tests {
let index = let index =
Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap(); Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let mut log_merge_policy = LogMergePolicy::default(); let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_num_segments(3); log_merge_policy.set_min_num_segments(3);
index_writer.set_merge_policy(Box::new(log_merge_policy)); index_writer.set_merge_policy(Box::new(log_merge_policy));

View File

@@ -42,9 +42,6 @@ pub struct GarbageCollectionResult {
pub failed_to_delete_files: Vec<PathBuf>, pub failed_to_delete_files: Vec<PathBuf>,
} }
#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
pub use self::managed_directory::ManagedDirectory; pub use self::managed_directory::ManagedDirectory;
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
pub use self::mmap_directory::MmapDirectory; pub use self::mmap_directory::MmapDirectory;

View File

@@ -1,5 +1,5 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::io::{self, BufWriter, Cursor, Write}; use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
use std::{fmt, result}; use std::{fmt, result};
@@ -48,6 +48,12 @@ impl Drop for VecWriter {
} }
} }
impl Seek for VecWriter {
fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
self.data.seek(pos)
}
}
impl Write for VecWriter { impl Write for VecWriter {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> { fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.is_flushed = false; self.is_flushed = false;

View File

@@ -17,7 +17,7 @@ pub trait DocSet: Send {
/// ///
/// The DocId of the next element is returned. /// The DocId of the next element is returned.
/// In other words, we should always have: /// In other words, we should always have:
/// ```compile_fail /// ```ignore
/// let doc = docset.advance(); /// let doc = docset.advance();
/// assert_eq!(doc, docset.doc()); /// assert_eq!(doc, docset.doc());
/// ``` /// ```

View File

@@ -11,7 +11,6 @@ use crate::directory::error::{
Incompatibility, LockError, OpenDirectoryError, OpenReadError, OpenWriteError, Incompatibility, LockError, OpenDirectoryError, OpenReadError, OpenWriteError,
}; };
use crate::fastfield::FastFieldNotAvailableError; use crate::fastfield::FastFieldNotAvailableError;
use crate::schema::document::DeserializeError;
use crate::{query, schema}; use crate::{query, schema};
/// Represents a `DataCorruption` error. /// Represents a `DataCorruption` error.
@@ -107,9 +106,6 @@ pub enum TantivyError {
/// e.g. a data structure is incorrectly initialized. /// e.g. a data structure is incorrectly initialized.
#[error("Internal error: '{0}'")] #[error("Internal error: '{0}'")]
InternalError(String), InternalError(String),
#[error("Deserialize error: {0}")]
/// An error occurred while attempting to deserialize a document.
DeserializeError(DeserializeError),
} }
impl From<io::Error> for TantivyError { impl From<io::Error> for TantivyError {
@@ -180,9 +176,3 @@ impl From<rayon::ThreadPoolBuildError> for TantivyError {
TantivyError::SystemError(error.to_string()) TantivyError::SystemError(error.to_string())
} }
} }
impl From<DeserializeError> for TantivyError {
fn from(error: DeserializeError) -> TantivyError {
TantivyError::DeserializeError(error)
}
}

View File

@@ -62,9 +62,8 @@ impl FacetReader {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::schema::document::Value; use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED};
use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED}; use crate::{DocAddress, Document, Index};
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
#[test] #[test]
fn test_facet_only_indexed() { fn test_facet_only_indexed() {
@@ -72,7 +71,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap())) .add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))
.unwrap(); .unwrap();
@@ -86,10 +85,8 @@ mod tests {
let mut facet = Facet::default(); let mut facet = Facet::default();
facet_reader.facet_from_ord(0, &mut facet).unwrap(); facet_reader.facet_from_ord(0, &mut facet).unwrap();
assert_eq!(facet.to_path_string(), "/a/b"); assert_eq!(facet.to_path_string(), "/a/b");
let doc = searcher let doc = searcher.doc(DocAddress::new(0u32, 0u32)).unwrap();
.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32)) let value = doc.get_first(facet_field).and_then(Value::as_facet);
.unwrap();
let value = doc.get_first(facet_field).and_then(|v| v.as_facet());
assert_eq!(value, None); assert_eq!(value, None);
} }
@@ -99,7 +96,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(facet_field=>Facet::from_text("/parent/child1").unwrap())) .add_document(doc!(facet_field=>Facet::from_text("/parent/child1").unwrap()))
.unwrap(); .unwrap();
@@ -145,8 +142,8 @@ mod tests {
let mut facet_ords = Vec::new(); let mut facet_ords = Vec::new();
facet_ords.extend(facet_reader.facet_ords(0u32)); facet_ords.extend(facet_reader.facet_ords(0u32));
assert_eq!(&facet_ords, &[0u64]); assert_eq!(&facet_ords, &[0u64]);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))?; let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value: Option<&Facet> = doc.get_first(facet_field).and_then(|v| v.as_facet()); let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::as_facet);
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref()); assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
Ok(()) Ok(())
} }
@@ -159,7 +156,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?; index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
index_writer.add_document(TantivyDocument::default())?; index_writer.add_document(Document::default())?;
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap(); let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
@@ -179,8 +176,8 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(TantivyDocument::default())?; index_writer.add_document(Document::default())?;
index_writer.add_document(TantivyDocument::default())?; index_writer.add_document(Document::default())?;
index_writer.commit()?; index_writer.commit()?;
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap(); let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();

View File

@@ -90,12 +90,12 @@ mod tests {
use crate::directory::{Directory, RamDirectory, WritePtr}; use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy; use crate::merge_policy::NoMergePolicy;
use crate::schema::{ use crate::schema::{
Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, TantivyDocument, Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder,
TextOptions, FAST, INDEXED, STORED, STRING, TEXT, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
}; };
use crate::time::OffsetDateTime; use crate::time::OffsetDateTime;
use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager}; use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
use crate::{DateOptions, DateTimePrecision, Index, IndexWriter, SegmentId, SegmentReader}; use crate::{DateOptions, DateTimePrecision, Index, SegmentId, SegmentReader};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| { pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -131,7 +131,7 @@ mod tests {
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 80); assert_eq!(file.len(), 93);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap(); let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let column = fast_field_readers let column = fast_field_readers
.u64("field") .u64("field")
@@ -181,7 +181,7 @@ mod tests {
write.terminate().unwrap(); write.terminate().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 108); assert_eq!(file.len(), 121);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap(); let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let col = fast_field_readers let col = fast_field_readers
.u64("field") .u64("field")
@@ -214,7 +214,7 @@ mod tests {
write.terminate().unwrap(); write.terminate().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 81); assert_eq!(file.len(), 94);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap(); let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let fast_field_reader = fast_field_readers let fast_field_reader = fast_field_readers
.u64("field") .u64("field")
@@ -246,7 +246,7 @@ mod tests {
write.terminate().unwrap(); write.terminate().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 4476); assert_eq!(file.len(), 4489);
{ {
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap(); let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let col = fast_field_readers let col = fast_field_readers
@@ -271,7 +271,7 @@ mod tests {
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
for i in -100i64..10_000i64 { for i in -100i64..10_000i64 {
let mut doc = TantivyDocument::default(); let mut doc = Document::default();
doc.add_i64(i64_field, i); doc.add_i64(i64_field, i);
fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.add_document(&doc).unwrap();
} }
@@ -279,7 +279,7 @@ mod tests {
write.terminate().unwrap(); write.terminate().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 252); assert_eq!(file.len(), 265);
{ {
let fast_field_readers = FastFieldReaders::open(file, schema).unwrap(); let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
@@ -312,7 +312,7 @@ mod tests {
{ {
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default(); let doc = Document::default();
fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap(); fast_field_writers.serialize(&mut write, None).unwrap();
write.terminate().unwrap(); write.terminate().unwrap();
@@ -345,7 +345,7 @@ mod tests {
{ {
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default(); let doc = Document::default();
fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap(); fast_field_writers.serialize(&mut write, None).unwrap();
write.terminate().unwrap(); write.terminate().unwrap();
@@ -416,7 +416,7 @@ mod tests {
let date_field = schema_builder.add_date_field("date", FAST); let date_field = schema_builder.add_date_field("date", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer index_writer
.add_document(doc!(date_field => DateTime::from_utc(OffsetDateTime::now_utc()))) .add_document(doc!(date_field => DateTime::from_utc(OffsetDateTime::now_utc())))
@@ -452,7 +452,7 @@ mod tests {
{ {
// first segment // first segment
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer index_writer
.add_document(doc!( .add_document(doc!(
@@ -506,7 +506,7 @@ mod tests {
{ {
// second segment // second segment
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!( .add_document(doc!(
@@ -537,7 +537,7 @@ mod tests {
// Merging the segments // Merging the segments
{ {
let segment_ids = index.searchable_segment_ids().unwrap(); let segment_ids = index.searchable_segment_ids().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.merge(&segment_ids).wait().unwrap(); index_writer.merge(&segment_ids).wait().unwrap();
index_writer.wait_merging_threads().unwrap(); index_writer.wait_merging_threads().unwrap();
} }
@@ -662,7 +662,7 @@ mod tests {
// Merging the segments // Merging the segments
{ {
let segment_ids = index.searchable_segment_ids()?; let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?; index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads()?;
} }
@@ -773,7 +773,7 @@ mod tests {
write.terminate().unwrap(); write.terminate().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 84); assert_eq!(file.len(), 102);
let fast_field_readers = FastFieldReaders::open(file, schema).unwrap(); let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
let bool_col = fast_field_readers.bool("field_bool").unwrap(); let bool_col = fast_field_readers.bool("field_bool").unwrap();
assert_eq!(bool_col.first(0), Some(true)); assert_eq!(bool_col.first(0), Some(true));
@@ -805,7 +805,7 @@ mod tests {
write.terminate().unwrap(); write.terminate().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 96); assert_eq!(file.len(), 114);
let readers = FastFieldReaders::open(file, schema).unwrap(); let readers = FastFieldReaders::open(file, schema).unwrap();
let bool_col = readers.bool("field_bool").unwrap(); let bool_col = readers.bool("field_bool").unwrap();
for i in 0..25 { for i in 0..25 {
@@ -824,13 +824,13 @@ mod tests {
{ {
let mut write: WritePtr = directory.open_write(path).unwrap(); let mut write: WritePtr = directory.open_write(path).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default(); let doc = Document::default();
fast_field_writers.add_document(&doc).unwrap(); fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap(); fast_field_writers.serialize(&mut write, None).unwrap();
write.terminate().unwrap(); write.terminate().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 86); assert_eq!(file.len(), 104);
let fastfield_readers = FastFieldReaders::open(file, schema).unwrap(); let fastfield_readers = FastFieldReaders::open(file, schema).unwrap();
let col = fastfield_readers.bool("field_bool").unwrap(); let col = fastfield_readers.bool("field_bool").unwrap();
assert_eq!(col.first(0), None); assert_eq!(col.first(0), None);
@@ -846,7 +846,7 @@ mod tests {
assert_eq!(col.get_val(0), true); assert_eq!(col.get_val(0), true);
} }
fn get_index(docs: &[crate::TantivyDocument], schema: &Schema) -> crate::Result<RamDirectory> { fn get_index(docs: &[crate::Document], schema: &Schema) -> crate::Result<RamDirectory> {
let directory: RamDirectory = RamDirectory::create(); let directory: RamDirectory = RamDirectory::create();
{ {
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
@@ -888,7 +888,7 @@ mod tests {
let field = schema_builder.add_date_field("field", date_options); let field = schema_builder.add_date_field("field", date_options);
let schema = schema_builder.build(); let schema = schema_builder.build();
let docs: Vec<TantivyDocument> = times.iter().map(|time| doc!(field=>*time)).collect(); let docs: Vec<Document> = times.iter().map(|time| doc!(field=>*time)).collect();
let directory = get_index(&docs[..], &schema).unwrap(); let directory = get_index(&docs[..], &schema).unwrap();
let path = Path::new("test"); let path = Path::new("test");
@@ -962,15 +962,11 @@ mod tests {
let ip_field = schema_builder.add_u64_field("ip", FAST); let ip_field = schema_builder.add_u64_field("ip", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let ip_addr = Ipv6Addr::new(1, 2, 3, 4, 5, 1, 2, 3); let ip_addr = Ipv6Addr::new(1, 2, 3, 4, 5, 1, 2, 3);
index_writer index_writer.add_document(Document::default()).unwrap();
.add_document(TantivyDocument::default())
.unwrap();
index_writer.add_document(doc!(ip_field=>ip_addr)).unwrap(); index_writer.add_document(doc!(ip_field=>ip_addr)).unwrap();
index_writer index_writer.add_document(Document::default()).unwrap();
.add_document(TantivyDocument::default())
.unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher(); let searcher = index.reader().unwrap().searcher();
let fastfields = searcher.segment_reader(0u32).fast_fields(); let fastfields = searcher.segment_reader(0u32).fast_fields();
@@ -1090,7 +1086,7 @@ mod tests {
let json = schema_builder.add_json_field("json", json_option); let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(json => json!({"attr.age": 32}))) .add_document(doc!(json => json!({"attr.age": 32})))
.unwrap(); .unwrap();
@@ -1116,7 +1112,7 @@ mod tests {
let json = schema_builder.add_json_field("json", json_option); let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(json => json!({"age": 32}))) .add_document(doc!(json => json!({"age": 32})))
.unwrap(); .unwrap();
@@ -1143,7 +1139,7 @@ mod tests {
let json = schema_builder.add_json_field("json", json_option); let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(json => json!({"attr.age": 32}))) .add_document(doc!(json => json!({"attr.age": 32})))
.unwrap(); .unwrap();
@@ -1166,7 +1162,7 @@ mod tests {
let field_with_dot = schema_builder.add_i64_field("field.with.dot", FAST); let field_with_dot = schema_builder.add_i64_field("field.with.dot", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(field_with_dot => 32i64)) .add_document(doc!(field_with_dot => 32i64))
.unwrap(); .unwrap();
@@ -1188,7 +1184,7 @@ mod tests {
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", FAST); let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(json_field=> json!({"attr": {"age": 32}}), shadowing_json_field=>json!({"age": 33}))) .add_document(doc!(json_field=> json!({"attr": {"age": 32}}), shadowing_json_field=>json!({"age": 33})))
.unwrap(); .unwrap();
@@ -1219,7 +1215,7 @@ mod tests {
let mut index = Index::create_in_ram(schema); let mut index = Index::create_in_ram(schema);
index.set_fast_field_tokenizers(ff_tokenizer_manager); index.set_fast_field_tokenizers(ff_tokenizer_manager);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(text_field => "Test1 test2")) .add_document(doc!(text_field => "Test1 test2"))
.unwrap(); .unwrap();
@@ -1248,7 +1244,7 @@ mod tests {
let log_field = schema_builder.add_text_field("log_level", text_fieldtype); let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(log_field => "info")) .add_document(doc!(log_field => "info"))
.unwrap(); .unwrap();
@@ -1281,25 +1277,18 @@ mod tests {
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option); let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(json_field=> json!({"attr.age": 32}), shadowing_json_field=>json!({"age": 33}))) .add_document(doc!(json_field=> json!({"attr.age": 32}), shadowing_json_field=>json!({"age": 33})))
.unwrap(); .unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher(); let searcher = index.reader().unwrap().searcher();
let fast_field_reader = searcher.segment_reader(0u32).fast_fields(); let fast_field_reader = searcher.segment_reader(0u32).fast_fields();
// Supported for now, maybe dropped in the future.
let column = fast_field_reader let column = fast_field_reader
.column_opt::<i64>("jsonfield.attr.age") .column_opt::<i64>("jsonfield.attr.age")
.unwrap() .unwrap()
.unwrap(); .unwrap();
let vals: Vec<i64> = column.values_for_doc(0u32).collect(); let vals: Vec<i64> = column.values_for_doc(0u32).collect();
assert_eq!(&vals, &[33]); assert_eq!(&vals, &[33]);
let column = fast_field_reader
.column_opt::<i64>("jsonfield\\.attr.age")
.unwrap()
.unwrap();
let vals: Vec<i64> = column.values_for_doc(0u32).collect();
assert_eq!(&vals, &[33]);
} }
} }

View File

@@ -357,7 +357,7 @@ mod tests {
use columnar::ColumnType; use columnar::ColumnType;
use crate::schema::{JsonObjectOptions, Schema, FAST}; use crate::schema::{JsonObjectOptions, Schema, FAST};
use crate::{Index, IndexWriter, TantivyDocument}; use crate::{Document, Index};
#[test] #[test]
fn test_fast_field_reader_resolve_with_dynamic_internal() { fn test_fast_field_reader_resolve_with_dynamic_internal() {
@@ -373,10 +373,8 @@ mod tests {
let dynamic_field = schema_builder.add_json_field("_dyna", FAST); let dynamic_field = schema_builder.add_json_field("_dyna", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer.add_document(Document::default()).unwrap();
.add_document(TantivyDocument::default())
.unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -445,7 +443,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(id=> 1u64, json => json!({"foo": 42}))) .add_document(doc!(id=> 1u64, json => json!({"foo": 42})))
.unwrap(); .unwrap();

View File

@@ -1,12 +1,12 @@
use std::io; use std::io;
use columnar::{ColumnarWriter, NumericalValue}; use columnar::{ColumnarWriter, NumericalValue};
use common::JsonPathWriter; use common::replace_in_place;
use tokenizer_api::Token; use tokenizer_api::Token;
use crate::indexer::doc_id_mapping::DocIdMapping; use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value}; use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type}; use crate::schema::{value_type_to_column_type, Document, FieldType, Schema, Type, Value};
use crate::tokenizer::{TextAnalyzer, TokenizerManager}; use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{DateTimePrecision, DocId, TantivyError}; use crate::{DateTimePrecision, DocId, TantivyError};
@@ -23,7 +23,7 @@ pub struct FastFieldsWriter {
expand_dots: Vec<bool>, expand_dots: Vec<bool>,
num_docs: DocId, num_docs: DocId,
// Buffer that we recycle to avoid allocation. // Buffer that we recycle to avoid allocation.
json_path_buffer: JsonPathWriter, json_path_buffer: String,
} }
impl FastFieldsWriter { impl FastFieldsWriter {
@@ -97,7 +97,7 @@ impl FastFieldsWriter {
num_docs: 0u32, num_docs: 0u32,
date_precisions, date_precisions,
expand_dots, expand_dots,
json_path_buffer: JsonPathWriter::default(), json_path_buffer: String::new(),
}) })
} }
@@ -117,121 +117,114 @@ impl FastFieldsWriter {
} }
/// Indexes all of the fastfields of a new document. /// Indexes all of the fastfields of a new document.
pub fn add_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> { pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
let doc_id = self.num_docs; let doc_id = self.num_docs;
for (field, value) in doc.iter_fields_and_values() { for field_value in doc.field_values() {
let value_access = value as D::Value<'_>; if let Some(field_name) =
&self.fast_field_names[field_value.field().field_id() as usize]
{
match &field_value.value {
Value::U64(u64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*u64_val),
);
}
Value::I64(i64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*i64_val),
);
}
Value::F64(f64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*f64_val),
);
}
Value::Str(text_val) => {
if let Some(tokenizer) =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize]
{
let mut token_stream = tokenizer.token_stream(text_val);
token_stream.process(&mut |token: &Token| {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
&token.text,
);
})
} else {
self.columnar_writer
.record_str(doc_id, field_name.as_str(), text_val);
}
}
Value::Bytes(bytes_val) => {
self.columnar_writer
.record_bytes(doc_id, field_name.as_str(), bytes_val);
}
Value::PreTokStr(pre_tok) => {
for token in &pre_tok.tokens {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
&token.text,
);
}
}
Value::Bool(bool_val) => {
self.columnar_writer
.record_bool(doc_id, field_name.as_str(), *bool_val);
}
Value::Date(datetime) => {
let date_precision =
self.date_precisions[field_value.field().field_id() as usize];
let truncated_datetime = datetime.truncate(date_precision);
self.columnar_writer.record_datetime(
doc_id,
field_name.as_str(),
truncated_datetime,
);
}
Value::Facet(facet) => {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
facet.encoded_str(),
);
}
Value::JsonObject(json_obj) => {
let expand_dots = self.expand_dots[field_value.field().field_id() as usize];
self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name);
self.add_doc_value(doc_id, field, value_access)?; let text_analyzer =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
Value::IpAddr(ip_addr) => {
self.columnar_writer
.record_ip_addr(doc_id, field_name.as_str(), *ip_addr);
}
}
}
} }
self.num_docs += 1; self.num_docs += 1;
Ok(()) Ok(())
} }
fn add_doc_value<'a, V: Value<'a>>(
&mut self,
doc_id: DocId,
field: Field,
value: V,
) -> crate::Result<()> {
let field_name = match &self.fast_field_names[field.field_id() as usize] {
None => return Ok(()),
Some(name) => name,
};
match value.as_value() {
ReferenceValue::Leaf(leaf) => match leaf {
ReferenceValueLeaf::Null => {}
ReferenceValueLeaf::Str(val) => {
if let Some(tokenizer) =
&mut self.per_field_tokenizer[field.field_id() as usize]
{
let mut token_stream = tokenizer.token_stream(val);
token_stream.process(&mut |token: &Token| {
self.columnar_writer
.record_str(doc_id, field_name, &token.text);
})
} else {
self.columnar_writer.record_str(doc_id, field_name, val);
}
}
ReferenceValueLeaf::U64(val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name,
NumericalValue::from(val),
);
}
ReferenceValueLeaf::I64(val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name,
NumericalValue::from(val),
);
}
ReferenceValueLeaf::F64(val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name,
NumericalValue::from(val),
);
}
ReferenceValueLeaf::Date(val) => {
let date_precision = self.date_precisions[field.field_id() as usize];
let truncated_datetime = val.truncate(date_precision);
self.columnar_writer
.record_datetime(doc_id, field_name, truncated_datetime);
}
ReferenceValueLeaf::Facet(val) => {
self.columnar_writer
.record_str(doc_id, field_name, val.encoded_str());
}
ReferenceValueLeaf::Bytes(val) => {
self.columnar_writer.record_bytes(doc_id, field_name, val);
}
ReferenceValueLeaf::IpAddr(val) => {
self.columnar_writer.record_ip_addr(doc_id, field_name, val);
}
ReferenceValueLeaf::Bool(val) => {
self.columnar_writer.record_bool(doc_id, field_name, val);
}
ReferenceValueLeaf::PreTokStr(val) => {
for token in &val.tokens {
self.columnar_writer
.record_str(doc_id, field_name, &token.text);
}
}
},
ReferenceValue::Array(val) => {
// TODO: Check this is the correct behaviour we want.
for value in val {
self.add_doc_value(doc_id, field, value)?;
}
}
ReferenceValue::Object(val) => {
let expand_dots = self.expand_dots[field.field_id() as usize];
self.json_path_buffer.clear();
// First field should not be expanded.
self.json_path_buffer.set_expand_dots(false);
self.json_path_buffer.push(field_name);
self.json_path_buffer.set_expand_dots(expand_dots);
let text_analyzer = &mut self.per_field_tokenizer[field.field_id() as usize];
record_json_obj_to_columnar_writer::<V>(
doc_id,
val,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
}
Ok(())
}
/// Serializes all of the `FastFieldWriter`s by pushing them in /// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer. /// order to the fast field serializer.
pub fn serialize( pub fn serialize(
@@ -248,33 +241,66 @@ impl FastFieldsWriter {
} }
} }
fn record_json_obj_to_columnar_writer<'a, V: Value<'a>>( #[inline]
fn columnar_numerical_value(json_number: &serde_json::Number) -> Option<NumericalValue> {
if let Some(num_i64) = json_number.as_i64() {
return Some(num_i64.into());
}
if let Some(num_u64) = json_number.as_u64() {
return Some(num_u64.into());
}
if let Some(num_f64) = json_number.as_f64() {
return Some(num_f64.into());
}
// This can happen with arbitrary-precision numbers, which we do not handle.
None
}
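The probing order above matters: integers stay integral and only fall back to `f64` when they fit neither `i64` nor `u64`. A standalone illustration using plain `serde_json`, independent of the diff:

```rust
use serde_json::Number;

// Mirror of the i64 -> u64 -> f64 probing order used above.
fn classify(n: &Number) -> &'static str {
    if n.as_i64().is_some() {
        "i64"
    } else if n.as_u64().is_some() {
        "u64"
    } else if n.as_f64().is_some() {
        "f64"
    } else {
        "arbitrary precision (unhandled)"
    }
}

fn main() {
    assert_eq!(classify(&Number::from(42u32)), "i64"); // fits i64 first
    assert_eq!(classify(&Number::from(u64::MAX)), "u64"); // larger than i64::MAX
    assert_eq!(classify(&Number::from_f64(0.5).unwrap()), "f64");
}
```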
fn record_json_obj_to_columnar_writer(
doc: DocId, doc: DocId,
json_visitor: V::ObjectIter, json_obj: &serde_json::Map<String, serde_json::Value>,
expand_dots: bool,
remaining_depth_limit: usize, remaining_depth_limit: usize,
json_path_buffer: &mut JsonPathWriter, json_path_buffer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter, columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &mut Option<TextAnalyzer>, tokenizer: &mut Option<TextAnalyzer>,
) { ) {
for (key, child) in json_visitor { for (key, child) in json_obj {
json_path_buffer.push(key); let len_path = json_path_buffer.len();
if !json_path_buffer.is_empty() {
json_path_buffer.push_str(JSON_PATH_SEGMENT_SEP_STR);
}
json_path_buffer.push_str(key);
if expand_dots {
// This might include the separation byte, which is ok because it is not a dot.
let appended_segment = &mut json_path_buffer[len_path..];
// The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
// valid single-byte UTF-8 strings.
// By utf-8 design, they cannot be part of another codepoint.
replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, unsafe {
appended_segment.as_bytes_mut()
});
}
record_json_value_to_columnar_writer( record_json_value_to_columnar_writer(
doc, doc,
child, child,
expand_dots,
remaining_depth_limit, remaining_depth_limit,
json_path_buffer, json_path_buffer,
columnar_writer, columnar_writer,
tokenizer, tokenizer,
); );
json_path_buffer.pop(); // popping our sub path.
json_path_buffer.truncate(len_path);
} }
} }
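The `expand_dots` branch rewrites `.` bytes to the JSON path separator directly inside the path buffer; the unsafe `as_bytes_mut` call is sound only because both bytes are single-byte ASCII, so the buffer stays valid UTF-8 afterwards. A self-contained sketch of the same trick, assuming `1` as the separator byte (the actual constant is tantivy's `JSON_PATH_SEGMENT_SEP`):

```rust
/// Replace every `needle` byte with `replacement` inside the string buffer.
/// This keeps the buffer valid UTF-8 as long as both bytes are ASCII, because
/// a single ASCII byte can never be part of a multi-byte code point.
fn replace_ascii_in_place(needle: u8, replacement: u8, text: &mut String) {
    assert!(needle.is_ascii() && replacement.is_ascii());
    // SAFETY: we only ever swap one ASCII byte for another ASCII byte.
    for byte in unsafe { text.as_bytes_mut() } {
        if *byte == needle {
            *byte = replacement;
        }
    }
}

fn main() {
    const SEP: u8 = 1; // stand-in for the separator byte; the real constant lives in tantivy
    let mut path = String::from("metadata.user.name");
    replace_ascii_in_place(b'.', SEP, &mut path);
    assert_eq!(path.as_bytes(), b"metadata\x01user\x01name");
}
```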
fn record_json_value_to_columnar_writer<'a, V: Value<'a>>( fn record_json_value_to_columnar_writer(
doc: DocId, doc: DocId,
json_val: V, json_val: &serde_json::Value,
expand_dots: bool,
mut remaining_depth_limit: usize, mut remaining_depth_limit: usize,
json_path_writer: &mut JsonPathWriter, json_path_writer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter, columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &mut Option<TextAnalyzer>, tokenizer: &mut Option<TextAnalyzer>,
) { ) {
@@ -282,69 +308,34 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
return; return;
} }
remaining_depth_limit -= 1; remaining_depth_limit -= 1;
match json_val {
match json_val.as_value() { serde_json::Value::Null => {
ReferenceValue::Leaf(leaf) => match leaf { // TODO handle null
ReferenceValueLeaf::Null => {} // TODO: Handle null }
ReferenceValueLeaf::Str(val) => { serde_json::Value::Bool(bool_val) => {
if let Some(text_analyzer) = tokenizer.as_mut() { columnar_writer.record_bool(doc, json_path_writer, *bool_val);
let mut token_stream = text_analyzer.token_stream(val); }
token_stream.process(&mut |token| { serde_json::Value::Number(json_number) => {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text); if let Some(numerical_value) = columnar_numerical_value(json_number) {
}) columnar_writer.record_numerical(doc, json_path_writer.as_str(), numerical_value);
} else {
columnar_writer.record_str(doc, json_path_writer.as_str(), val);
}
} }
ReferenceValueLeaf::U64(val) => { }
columnar_writer.record_numerical( serde_json::Value::String(text) => {
doc, if let Some(text_analyzer) = tokenizer.as_mut() {
json_path_writer.as_str(), let mut token_stream = text_analyzer.token_stream(text);
NumericalValue::from(val), token_stream.process(&mut |token| {
); columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
})
} else {
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
} }
ReferenceValueLeaf::I64(val) => { }
columnar_writer.record_numerical( serde_json::Value::Array(arr) => {
doc, for el in arr {
json_path_writer.as_str(),
NumericalValue::from(val),
);
}
ReferenceValueLeaf::F64(val) => {
columnar_writer.record_numerical(
doc,
json_path_writer.as_str(),
NumericalValue::from(val),
);
}
ReferenceValueLeaf::Bool(val) => {
columnar_writer.record_bool(doc, json_path_writer.as_str(), val);
}
ReferenceValueLeaf::Date(val) => {
columnar_writer.record_datetime(doc, json_path_writer.as_str(), val);
}
ReferenceValueLeaf::Facet(_) => {
unimplemented!("Facet support in dynamic fields is not yet implemented")
}
ReferenceValueLeaf::Bytes(_) => {
// TODO: This can be re-added once it is added to the JSON Utils section as well.
// columnar_writer.record_bytes(doc, json_path_writer.as_str(), val);
unimplemented!("Bytes support in dynamic fields is not yet implemented")
}
ReferenceValueLeaf::IpAddr(_) => {
unimplemented!("IP address support in dynamic fields is not yet implemented")
}
ReferenceValueLeaf::PreTokStr(_) => {
unimplemented!(
"Pre-tokenized string support in dynamic fields is not yet implemented"
)
}
},
ReferenceValue::Array(elements) => {
for el in elements {
record_json_value_to_columnar_writer( record_json_value_to_columnar_writer(
doc, doc,
el, el,
expand_dots,
remaining_depth_limit, remaining_depth_limit,
json_path_writer, json_path_writer,
columnar_writer, columnar_writer,
@@ -352,10 +343,11 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
); );
} }
} }
ReferenceValue::Object(object) => { serde_json::Value::Object(json_obj) => {
record_json_obj_to_columnar_writer::<V>( record_json_obj_to_columnar_writer(
doc, doc,
object, json_obj,
expand_dots,
remaining_depth_limit, remaining_depth_limit,
json_path_writer, json_path_writer,
columnar_writer, columnar_writer,
@@ -368,7 +360,6 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use columnar::{Column, ColumnarReader, ColumnarWriter, StrColumn}; use columnar::{Column, ColumnarReader, ColumnarWriter, StrColumn};
use common::JsonPathWriter;
use super::record_json_value_to_columnar_writer; use super::record_json_value_to_columnar_writer;
use crate::fastfield::writer::JSON_DEPTH_LIMIT; use crate::fastfield::writer::JSON_DEPTH_LIMIT;
@@ -379,12 +370,12 @@ mod tests {
expand_dots: bool, expand_dots: bool,
) -> ColumnarReader { ) -> ColumnarReader {
let mut columnar_writer = ColumnarWriter::default(); let mut columnar_writer = ColumnarWriter::default();
let mut json_path = JsonPathWriter::default(); let mut json_path = String::new();
json_path.set_expand_dots(expand_dots);
for (doc, json_doc) in json_docs.iter().enumerate() { for (doc, json_doc) in json_docs.iter().enumerate() {
record_json_value_to_columnar_writer( record_json_value_to_columnar_writer(
doc as u32, doc as u32,
json_doc, json_doc,
expand_dots,
JSON_DEPTH_LIMIT, JSON_DEPTH_LIMIT,
&mut json_path, &mut json_path,
&mut columnar_writer, &mut columnar_writer,
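Both variants of `record_json_value_to_columnar_writer` (JsonPathWriter-based and String-based) guard recursion with a depth budget, `JSON_DEPTH_LIMIT`: the function bails out when the budget reaches zero and passes the decremented budget to every nested array element or object field. A compact sketch of that traversal pattern over `serde_json` values, counting leaves instead of writing columns:

```rust
use serde_json::Value; // serde_json = "1"

/// Count leaf values, ignoring anything nested deeper than `remaining_depth`.
/// Mirrors the early-return / decrement pattern used by the columnar writer.
fn count_leaves(value: &Value, mut remaining_depth: usize) -> usize {
    if remaining_depth == 0 {
        return 0; // depth budget exhausted: deeper values are silently dropped
    }
    remaining_depth -= 1;
    match value {
        Value::Array(items) => items
            .iter()
            .map(|item| count_leaves(item, remaining_depth))
            .sum(),
        Value::Object(fields) => fields
            .values()
            .map(|child| count_leaves(child, remaining_depth))
            .sum(),
        _ => 1, // null, bool, number or string
    }
}

fn main() {
    let doc: Value = serde_json::json!({"a": {"b": {"c": 1}}, "d": [1, 2, 3]});
    assert_eq!(count_leaves(&doc, 16), 4); // "c" plus the three array items
    assert_eq!(count_leaves(&doc, 3), 3);  // "c" sits too deep for a budget of 3
}
```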

View File

@@ -4,7 +4,7 @@ use rand::{thread_rng, Rng};
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::*; use crate::schema::*;
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, IndexWriter, Order, Searcher}; use crate::{doc, schema, Index, IndexSettings, IndexSortByField, Order, Searcher};
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> { fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
assert!(searcher.segment_readers().len() < 20); assert!(searcher.segment_readers().len() < 20);
@@ -12,7 +12,7 @@ fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
for segment_reader in searcher.segment_readers() { for segment_reader in searcher.segment_readers() {
let store_reader = segment_reader.get_store_reader(1)?; let store_reader = segment_reader.get_store_reader(1)?;
for doc_id in 0..segment_reader.max_doc() { for doc_id in 0..segment_reader.max_doc() {
let _doc: TantivyDocument = store_reader.get(doc_id)?; let _doc = store_reader.get(doc_id)?;
} }
} }
Ok(()) Ok(())
@@ -31,8 +31,7 @@ fn test_functional_store() -> crate::Result<()> {
let mut rng = thread_rng(); let mut rng = thread_rng();
let mut index_writer: IndexWriter = let mut index_writer = index.writer_with_num_threads(3, MEMORY_BUDGET_NUM_BYTES_MIN)?;
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut doc_set: Vec<u64> = Vec::new(); let mut doc_set: Vec<u64> = Vec::new();
@@ -92,8 +91,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
let mut rng = thread_rng(); let mut rng = thread_rng();
let mut index_writer: IndexWriter = let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut committed_docs: HashSet<u64> = HashSet::new(); let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new(); let mut uncommitted_docs: HashSet<u64> = HashSet::new();
@@ -116,7 +114,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
index_writer.delete_term(doc_id_term); index_writer.delete_term(doc_id_term);
} else { } else {
uncommitted_docs.insert(random_val); uncommitted_docs.insert(random_val);
let mut doc = TantivyDocument::new(); let mut doc = Document::new();
doc.add_u64(id_field, random_val); doc.add_u64(id_field, random_val);
for i in 1u64..10u64 { for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i); doc.add_u64(multiples_field, random_val * i);
@@ -168,8 +166,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
let mut rng = thread_rng(); let mut rng = thread_rng();
let mut index_writer: IndexWriter = let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut committed_docs: HashSet<u64> = HashSet::new(); let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new(); let mut uncommitted_docs: HashSet<u64> = HashSet::new();
@@ -192,7 +189,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
index_writer.delete_term(doc_id_term); index_writer.delete_term(doc_id_term);
} else { } else {
uncommitted_docs.insert(random_val); uncommitted_docs.insert(random_val);
let mut doc = TantivyDocument::new(); let mut doc = Document::new();
doc.add_u64(id_field, random_val); doc.add_u64(id_field, random_val);
for i in 1u64..10u64 { for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i); doc.add_u64(multiples_field, random_val * i);

View File

@@ -158,7 +158,6 @@ mod tests_indexsorting {
use crate::indexer::doc_id_mapping::DocIdMapping; use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::document::Value;
use crate::schema::{Schema, *}; use crate::schema::{Schema, *};
use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order}; use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order};
@@ -309,16 +308,16 @@ mod tests_indexsorting {
{ {
assert_eq!( assert_eq!(
searcher searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))? .doc(DocAddress::new(0, 0))?
.get_first(my_string_field), .get_first(my_string_field),
None None
); );
assert_eq!( assert_eq!(
searcher searcher
.doc::<TantivyDocument>(DocAddress::new(0, 3))? .doc(DocAddress::new(0, 3))?
.get_first(my_string_field) .get_first(my_string_field)
.unwrap() .unwrap()
.as_str(), .as_text(),
Some("blublub") Some("blublub")
); );
} }
@@ -338,13 +337,13 @@ mod tests_indexsorting {
{ {
assert_eq!( assert_eq!(
searcher searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))? .doc(DocAddress::new(0, 0))?
.get_first(my_string_field) .get_first(my_string_field)
.unwrap() .unwrap()
.as_str(), .as_text(),
Some("blublub") Some("blublub")
); );
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?; let doc = searcher.doc(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(my_string_field), None); assert_eq!(doc.get_first(my_string_field), None);
} }
// sort by field desc // sort by field desc
@@ -361,9 +360,9 @@ mod tests_indexsorting {
let my_string_field = index.schema().get_field("string_field").unwrap(); let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?; let doc = searcher.doc(DocAddress::new(0, 4))?;
assert_eq!( assert_eq!(
doc.get_first(my_string_field).unwrap().as_str(), doc.get_first(my_string_field).unwrap().as_text(),
Some("blublub") Some("blublub")
); );
} }

View File

@@ -20,8 +20,7 @@ use crate::indexer::operation::DeleteOperation;
use crate::indexer::stamper::Stamper; use crate::indexer::stamper::Stamper;
use crate::indexer::{MergePolicy, SegmentEntry, SegmentWriter}; use crate::indexer::{MergePolicy, SegmentEntry, SegmentWriter};
use crate::query::{EnableScoring, Query, TermQuery}; use crate::query::{EnableScoring, Query, TermQuery};
use crate::schema::document::Document; use crate::schema::{Document, IndexRecordOption, Term};
use crate::schema::{IndexRecordOption, TantivyDocument, Term};
use crate::{FutureResult, Opstamp}; use crate::{FutureResult, Opstamp};
// Size of the margin for the `memory_arena`. A segment is closed when the remaining memory // Size of the margin for the `memory_arena`. A segment is closed when the remaining memory
@@ -51,7 +50,7 @@ fn error_in_index_worker_thread(context: &str) -> TantivyError {
/// indexing queue. /// indexing queue.
/// Each indexing thread builds its own independent [`Segment`], via /// Each indexing thread builds its own independent [`Segment`], via
/// a `SegmentWriter` object. /// a `SegmentWriter` object.
pub struct IndexWriter<D: Document = TantivyDocument> { pub struct IndexWriter {
// the lock is just used to bind the // the lock is just used to bind the
// lifetime of the lock with that of the IndexWriter. // lifetime of the lock with that of the IndexWriter.
_directory_lock: Option<DirectoryLock>, _directory_lock: Option<DirectoryLock>,
@@ -63,8 +62,8 @@ pub struct IndexWriter<D: Document = TantivyDocument> {
workers_join_handle: Vec<JoinHandle<crate::Result<()>>>, workers_join_handle: Vec<JoinHandle<crate::Result<()>>>,
index_writer_status: IndexWriterStatus<D>, index_writer_status: IndexWriterStatus,
operation_sender: AddBatchSender<D>, operation_sender: AddBatchSender,
segment_updater: SegmentUpdater, segment_updater: SegmentUpdater,
@@ -165,10 +164,10 @@ pub(crate) fn advance_deletes(
Ok(()) Ok(())
} }
fn index_documents<D: Document>( fn index_documents(
memory_budget: usize, memory_budget: usize,
segment: Segment, segment: Segment,
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch<D>>, grouped_document_iterator: &mut dyn Iterator<Item = AddBatch>,
segment_updater: &SegmentUpdater, segment_updater: &SegmentUpdater,
mut delete_cursor: DeleteCursor, mut delete_cursor: DeleteCursor,
) -> crate::Result<()> { ) -> crate::Result<()> {
@@ -248,7 +247,7 @@ fn apply_deletes(
}) })
} }
impl<D: Document> IndexWriter<D> { impl IndexWriter {
/// Create a new index writer. Attempts to acquire a lockfile. /// Create a new index writer. Attempts to acquire a lockfile.
/// ///
/// The lockfile should be deleted on drop, but it is possible /// The lockfile should be deleted on drop, but it is possible
@@ -268,7 +267,7 @@ impl<D: Document> IndexWriter<D> {
num_threads: usize, num_threads: usize,
memory_budget_in_bytes_per_thread: usize, memory_budget_in_bytes_per_thread: usize,
directory_lock: DirectoryLock, directory_lock: DirectoryLock,
) -> crate::Result<Self> { ) -> crate::Result<IndexWriter> {
if memory_budget_in_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN { if memory_budget_in_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
let err_msg = format!( let err_msg = format!(
"The memory arena in bytes per thread needs to be at least \ "The memory arena in bytes per thread needs to be at least \
@@ -282,7 +281,7 @@ impl<D: Document> IndexWriter<D> {
); );
return Err(TantivyError::InvalidArgument(err_msg)); return Err(TantivyError::InvalidArgument(err_msg));
} }
let (document_sender, document_receiver) = let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS); crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new(); let delete_queue = DeleteQueue::new();
@@ -294,7 +293,7 @@ impl<D: Document> IndexWriter<D> {
let segment_updater = let segment_updater =
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?; SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let mut index_writer = Self { let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock), _directory_lock: Some(directory_lock),
memory_budget_in_bytes_per_thread, memory_budget_in_bytes_per_thread,
@@ -376,7 +375,7 @@ impl<D: Document> IndexWriter<D> {
self.index.new_segment() self.index.new_segment()
} }
fn operation_receiver(&self) -> crate::Result<AddBatchReceiver<D>> { fn operation_receiver(&self) -> crate::Result<AddBatchReceiver> {
self.index_writer_status self.index_writer_status
.operation_receiver() .operation_receiver()
.ok_or_else(|| { .ok_or_else(|| {
@@ -526,7 +525,7 @@ impl<D: Document> IndexWriter<D> {
/// ///
/// Returns the former segment_ready channel. /// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) { fn recreate_document_channel(&mut self) {
let (document_sender, document_receiver) = let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS); crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
self.operation_sender = document_sender; self.operation_sender = document_sender;
self.index_writer_status = IndexWriterStatus::from(document_receiver); self.index_writer_status = IndexWriterStatus::from(document_receiver);
@@ -553,7 +552,7 @@ impl<D: Document> IndexWriter<D> {
.take() .take()
.expect("The IndexWriter does not have any lock. This is a bug, please report."); .expect("The IndexWriter does not have any lock. This is a bug, please report.");
let new_index_writer = IndexWriter::new( let new_index_writer: IndexWriter = IndexWriter::new(
&self.index, &self.index,
self.num_threads, self.num_threads,
self.memory_budget_in_bytes_per_thread, self.memory_budget_in_bytes_per_thread,
@@ -599,7 +598,7 @@ impl<D: Document> IndexWriter<D> {
/// It is also possible to add a payload to the `commit` /// It is also possible to add a payload to the `commit`
/// using this API. /// using this API.
/// See [`PreparedCommit::set_payload()`]. /// See [`PreparedCommit::set_payload()`].
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<D>> { pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit> {
// Here, because we join all of the worker threads, // Here, because we join all of the worker threads,
// all of the segment update for this commit have been // all of the segment update for this commit have been
// sent. // sent.
@@ -708,7 +707,7 @@ impl<D: Document> IndexWriter<D> {
/// The opstamp is an increasing `u64` that can /// The opstamp is an increasing `u64` that can
/// be used by the client to align commits with its own /// be used by the client to align commits with its own
/// document queue. /// document queue.
pub fn add_document(&self, document: D) -> crate::Result<Opstamp> { pub fn add_document(&self, document: Document) -> crate::Result<Opstamp> {
let opstamp = self.stamper.stamp(); let opstamp = self.stamper.stamp();
self.send_add_documents_batch(smallvec![AddOperation { opstamp, document }])?; self.send_add_documents_batch(smallvec![AddOperation { opstamp, document }])?;
Ok(opstamp) Ok(opstamp)
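`add_document` only stamps the operation and queues it on the indexing channel; nothing becomes visible to readers until `commit()` completes. A minimal usage sketch against the non-generic `IndexWriter` API variant in this diff (in-RAM index, `doc!` macro; the 50 MB figure is just an arbitrary budget above the enforced minimum):

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // 50 MB memory budget for indexing, well above the enforced minimum.
    let mut index_writer = index.writer(50_000_000)?;

    // Each add_document call returns a monotonically increasing opstamp.
    let first = index_writer.add_document(doc!(title => "hello"))?;
    let second = index_writer.add_document(doc!(title => "world"))?;
    assert!(second > first);

    // Documents only become visible to readers once the commit completes.
    index_writer.commit()?;
    let reader = index.reader()?;
    assert_eq!(reader.searcher().num_docs(), 2);
    Ok(())
}
```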
@@ -745,7 +744,7 @@ impl<D: Document> IndexWriter<D> {
/// visible to readers only after calling `commit()`. /// visible to readers only after calling `commit()`.
pub fn run<I>(&self, user_operations: I) -> crate::Result<Opstamp> pub fn run<I>(&self, user_operations: I) -> crate::Result<Opstamp>
where where
I: IntoIterator<Item = UserOperation<D>>, I: IntoIterator<Item = UserOperation>,
I::IntoIter: ExactSizeIterator, I::IntoIter: ExactSizeIterator,
{ {
let user_operations_it = user_operations.into_iter(); let user_operations_it = user_operations.into_iter();
@@ -779,7 +778,7 @@ impl<D: Document> IndexWriter<D> {
Ok(batch_opstamp) Ok(batch_opstamp)
} }
fn send_add_documents_batch(&self, add_ops: AddBatch<D>) -> crate::Result<()> { fn send_add_documents_batch(&self, add_ops: AddBatch) -> crate::Result<()> {
if self.index_writer_status.is_alive() && self.operation_sender.send(add_ops).is_ok() { if self.index_writer_status.is_alive() && self.operation_sender.send(add_ops).is_ok() {
Ok(()) Ok(())
} else { } else {
@@ -788,7 +787,7 @@ impl<D: Document> IndexWriter<D> {
} }
} }
impl<D: Document> Drop for IndexWriter<D> { impl Drop for IndexWriter {
fn drop(&mut self) { fn drop(&mut self) {
self.segment_updater.kill(); self.segment_updater.kill();
self.drop_sender(); self.drop_sender();
@@ -815,15 +814,13 @@ mod tests {
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery}; use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
use crate::schema::document::Value;
use crate::schema::{ use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema, self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT, TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
}; };
use crate::store::DOCSTORE_CACHE_CAPACITY; use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::{ use crate::{
DateTime, DocAddress, Index, IndexSettings, IndexSortByField, IndexWriter, Order, DateTime, DocAddress, Index, IndexSettings, IndexSortByField, Order, ReloadPolicy, Term,
ReloadPolicy, TantivyDocument, Term,
}; };
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \ const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
@@ -855,7 +852,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(text_field => "hello1")) .add_document(doc!(text_field => "hello1"))
.unwrap(); .unwrap();
@@ -908,7 +905,7 @@ mod tests {
.reload_policy(ReloadPolicy::Manual) .reload_policy(ReloadPolicy::Manual)
.try_into() .try_into()
.unwrap(); .unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let a_term = Term::from_field_text(text_field, "a"); let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b"); let b_term = Term::from_field_text(text_field, "b");
let operations = vec![ let operations = vec![
@@ -946,7 +943,7 @@ mod tests {
fn test_empty_operations_group() { fn test_empty_operations_group() {
let schema_builder = schema::Schema::builder(); let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let index_writer: IndexWriter = index.writer_for_tests().unwrap(); let index_writer = index.writer_for_tests().unwrap();
let operations1 = vec![]; let operations1 = vec![];
let batch_opstamp1 = index_writer.run(operations1).unwrap(); let batch_opstamp1 = index_writer.run(operations1).unwrap();
assert_eq!(batch_opstamp1, 0u64); assert_eq!(batch_opstamp1, 0u64);
@@ -959,8 +956,8 @@ mod tests {
fn test_lockfile_stops_duplicates() { fn test_lockfile_stops_duplicates() {
let schema_builder = schema::Schema::builder(); let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let _index_writer: IndexWriter = index.writer_for_tests().unwrap(); let _index_writer = index.writer_for_tests().unwrap();
match index.writer_for_tests::<TantivyDocument>() { match index.writer_for_tests() {
Err(TantivyError::LockFailure(LockError::LockBusy, _)) => {} Err(TantivyError::LockFailure(LockError::LockBusy, _)) => {}
_ => panic!("Expected a `LockFailure` error"), _ => panic!("Expected a `LockFailure` error"),
} }
@@ -970,8 +967,8 @@ mod tests {
fn test_lockfile_already_exists_error_msg() { fn test_lockfile_already_exists_error_msg() {
let schema_builder = schema::Schema::builder(); let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let _index_writer: IndexWriter = index.writer_for_tests().unwrap(); let _index_writer = index.writer_for_tests().unwrap();
match index.writer_for_tests::<TantivyDocument>() { match index.writer_for_tests() {
Err(err) => { Err(err) => {
let err_msg = err.to_string(); let err_msg = err.to_string();
assert!(err_msg.contains("already an `IndexWriter`")); assert!(err_msg.contains("already an `IndexWriter`"));
@@ -984,7 +981,7 @@ mod tests {
fn test_set_merge_policy() { fn test_set_merge_policy() {
let schema_builder = schema::Schema::builder(); let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let index_writer: IndexWriter = index.writer_for_tests().unwrap(); let index_writer = index.writer_for_tests().unwrap();
assert_eq!( assert_eq!(
format!("{:?}", index_writer.get_merge_policy()), format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, \ "LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, \
@@ -1003,11 +1000,11 @@ mod tests {
let schema_builder = schema::Schema::builder(); let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
{ {
let _index_writer: IndexWriter = index.writer_for_tests().unwrap(); let _index_writer = index.writer_for_tests().unwrap();
// the lock should be released when the // the lock should be released when the
// index_writer leaves the scope. // index_writer leaves the scope.
} }
let _index_writer_two: IndexWriter = index.writer_for_tests().unwrap(); let _index_writer_two = index.writer_for_tests().unwrap();
} }
#[test] #[test]
@@ -1059,7 +1056,7 @@ mod tests {
reader.searcher().doc_freq(&term_a).unwrap() reader.searcher().doc_freq(&term_a).unwrap()
}; };
// writing the segment // writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?; index_writer.commit()?;
// this should create 1 segment // this should create 1 segment
@@ -1099,7 +1096,7 @@ mod tests {
reader.searcher().doc_freq(&term_a).unwrap() reader.searcher().doc_freq(&term_a).unwrap()
}; };
// writing the segment // writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?; index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"))?;
@@ -1385,7 +1382,7 @@ mod tests {
fn test_delete_all_documents_empty_index() { fn test_delete_all_documents_empty_index() {
let schema_builder = schema::Schema::builder(); let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index let mut index_writer = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4) .writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap(); .unwrap();
let clear = index_writer.delete_all_documents(); let clear = index_writer.delete_all_documents();
@@ -1398,7 +1395,7 @@ mod tests {
fn test_delete_all_documents_index_twice() { fn test_delete_all_documents_index_twice() {
let schema_builder = schema::Schema::builder(); let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index let mut index_writer = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4) .writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap(); .unwrap();
let clear = index_writer.delete_all_documents(); let clear = index_writer.delete_all_documents();
@@ -1418,7 +1415,7 @@ mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::builder().schema(schema).create_in_ram().unwrap(); let index = Index::builder().schema(schema).create_in_ram().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(text_field => "one")) .add_document(doc!(text_field => "one"))
.unwrap(); .unwrap();
@@ -1706,8 +1703,7 @@ mod tests {
let old_reader = index.reader()?; let old_reader = index.reader()?;
// Every 3rd doc has only the id field let id_exists = |id| id % 3 != 0; // 0 does not exist
let id_is_full_doc = |id| id % 3 != 0;
let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3"; let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3";
// rotate left // rotate left
@@ -1723,7 +1719,7 @@ mod tests {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string())); let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id); let ip = ip_from_id(id);
if !id_is_full_doc(id) { if !id_exists(id) {
// every 3rd doc has no ip field // every 3rd doc has no ip field
index_writer.add_document(doc!( index_writer.add_document(doc!(
id_field=>id, id_field=>id,
@@ -1781,7 +1777,7 @@ mod tests {
let num_segments_before_merge = searcher.segment_readers().len(); let num_segments_before_merge = searcher.segment_readers().len();
if force_end_merge { if force_end_merge {
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
@@ -1843,7 +1839,7 @@ mod tests {
let num_docs_with_values = expected_ids_and_num_occurrences let num_docs_with_values = expected_ids_and_num_occurrences
.iter() .iter()
.filter(|(id, _id_occurrences)| id_is_full_doc(**id)) .filter(|(id, _id_occurrences)| id_exists(**id))
.map(|(_, id_occurrences)| *id_occurrences as usize) .map(|(_, id_occurrences)| *id_occurrences as usize)
.sum::<usize>(); .sum::<usize>();
@@ -1867,7 +1863,7 @@ mod tests {
if force_end_merge && num_segments_before_merge > 1 && num_segments_after_merge == 1 { if force_end_merge && num_segments_before_merge > 1 && num_segments_after_merge == 1 {
let mut expected_multi_ips: Vec<_> = id_list let mut expected_multi_ips: Vec<_> = id_list
.iter() .iter()
.filter(|id| id_is_full_doc(**id)) .filter(|id| id_exists(**id))
.flat_map(|id| vec![ip_from_id(*id), ip_from_id(*id)]) .flat_map(|id| vec![ip_from_id(*id), ip_from_id(*id)])
.collect(); .collect();
assert_eq!(num_ips, expected_multi_ips.len() as u32); assert_eq!(num_ips, expected_multi_ips.len() as u32);
@@ -1905,7 +1901,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences let expected_ips = expected_ids_and_num_occurrences
.keys() .keys()
.flat_map(|id| { .flat_map(|id| {
if !id_is_full_doc(*id) { if !id_exists(*id) {
None None
} else { } else {
Some(Ipv6Addr::from_u128(*id as u128)) Some(Ipv6Addr::from_u128(*id as u128))
@@ -1917,7 +1913,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences let expected_ips = expected_ids_and_num_occurrences
.keys() .keys()
.filter_map(|id| { .filter_map(|id| {
if !id_is_full_doc(*id) { if !id_exists(*id) {
None None
} else { } else {
Some(Ipv6Addr::from_u128(*id as u128)) Some(Ipv6Addr::from_u128(*id as u128))
@@ -1952,7 +1948,7 @@ mod tests {
let id = id_reader.first(doc).unwrap(); let id = id_reader.first(doc).unwrap();
let vals: Vec<u64> = ff_reader.values_for_doc(doc).collect(); let vals: Vec<u64> = ff_reader.values_for_doc(doc).collect();
if id_is_full_doc(id) { if id_exists(id) {
assert_eq!(vals.len(), 2); assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]); assert_eq!(vals[0], vals[1]);
assert!(expected_ids_and_num_occurrences.contains_key(&vals[0])); assert!(expected_ids_and_num_occurrences.contains_key(&vals[0]));
@@ -1962,7 +1958,7 @@ mod tests {
} }
let bool_vals: Vec<bool> = bool_ff_reader.values_for_doc(doc).collect(); let bool_vals: Vec<bool> = bool_ff_reader.values_for_doc(doc).collect();
if id_is_full_doc(id) { if id_exists(id) {
assert_eq!(bool_vals.len(), 2); assert_eq!(bool_vals.len(), 2);
assert_ne!(bool_vals[0], bool_vals[1]); assert_ne!(bool_vals[0], bool_vals[1]);
} else { } else {
@@ -1977,23 +1973,23 @@ mod tests {
.get_store_reader(DOCSTORE_CACHE_CAPACITY) .get_store_reader(DOCSTORE_CACHE_CAPACITY)
.unwrap(); .unwrap();
// test store iterator // test store iterator
for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) { for doc in store_reader.iter(segment_reader.alive_bitset()) {
let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap(); let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id)); assert!(expected_ids_and_num_occurrences.contains_key(&id));
} }
// test store random access // test store random access
for doc_id in segment_reader.doc_ids_alive() { for doc_id in segment_reader.doc_ids_alive() {
let id = store_reader let id = store_reader
.get::<TantivyDocument>(doc_id) .get(doc_id)
.unwrap() .unwrap()
.get_first(id_field) .get_first(id_field)
.unwrap() .unwrap()
.as_u64() .as_u64()
.unwrap(); .unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id)); assert!(expected_ids_and_num_occurrences.contains_key(&id));
if id_is_full_doc(id) { if id_exists(id) {
let id2 = store_reader let id2 = store_reader
.get::<TantivyDocument>(doc_id) .get(doc_id)
.unwrap() .unwrap()
.get_first(multi_numbers) .get_first(multi_numbers)
.unwrap() .unwrap()
@@ -2001,13 +1997,13 @@ mod tests {
.unwrap(); .unwrap();
assert_eq!(id, id2); assert_eq!(id, id2);
let bool = store_reader let bool = store_reader
.get::<TantivyDocument>(doc_id) .get(doc_id)
.unwrap() .unwrap()
.get_first(bool_field) .get_first(bool_field)
.unwrap() .unwrap()
.as_bool() .as_bool()
.unwrap(); .unwrap();
let doc = store_reader.get::<TantivyDocument>(doc_id).unwrap(); let doc = store_reader.get(doc_id).unwrap();
let mut bool2 = doc.get_all(multi_bools); let mut bool2 = doc.get_all(multi_bools);
assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap()); assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap());
assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap()); assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap());
@@ -2038,7 +2034,7 @@ mod tests {
let (existing_id, count) = (*id, *count); let (existing_id, count) = (*id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64; let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
assert_eq!(get_num_hits(id_field), count); assert_eq!(get_num_hits(id_field), count);
if !id_is_full_doc(existing_id) { if !id_exists(existing_id) {
continue; continue;
} }
assert_eq!(get_num_hits(text_field), count); assert_eq!(get_num_hits(text_field), count);
@@ -2088,7 +2084,7 @@ mod tests {
// //
for (existing_id, count) in &expected_ids_and_num_occurrences { for (existing_id, count) in &expected_ids_and_num_occurrences {
let (existing_id, count) = (*existing_id, *count); let (existing_id, count) = (*existing_id, *count);
if !id_is_full_doc(existing_id) { if !id_exists(existing_id) {
continue; continue;
} }
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
@@ -2105,84 +2101,34 @@ mod tests {
} }
} }
// Range query // assert data is like expected
// //
// Take half as sample for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect(); let (existing_id, count) = (*existing_id, *count);
sample.sort_by_key(|(k, _num_occurences)| *k); if !id_exists(existing_id) {
// sample.truncate(sample.len() / 2); continue;
if !sample.is_empty() { }
let (left_sample, right_sample) = sample.split_at(sample.len() / 2); let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
let expected_count = |sample: &[(&u64, &u64)]| {
sample
.iter()
.filter(|(id, _)| id_is_full_doc(**id))
.map(|(_id, num_occurences)| **num_occurences)
.sum::<u64>()
};
fn gen_query_inclusive<T1: ToString, T2: ToString>(
field: &str,
from: T1,
to: T2,
) -> String {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string()) format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
} };
let ip = ip_from_id(existing_id);
// Query first half let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
if !left_sample.is_empty() { // Range query on single value field
let expected_count = expected_count(left_sample); let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
let start_range = *left_sample[0].0; // Range query on multi value field
let end_range = *left_sample.last().unwrap().0; let query = gen_query_inclusive("ips", ip, ip);
let query = gen_query_inclusive("id_opt", start_range, end_range);
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
// Range query on ip field assert_eq!(do_search_ip_field(&query), count);
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", "*", ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ips", "*", ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
}
// Query second half
if !right_sample.is_empty() {
let expected_count = expected_count(right_sample);
let start_range = *right_sample[0].0;
let end_range = *right_sample.last().unwrap().0;
// Range query on id opt field
let query =
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", ip1, "*");
assert_eq!(do_search_ip_field(&query), expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ips", ip1, "*");
assert_eq!(do_search_ip_field(&query), expected_count);
}
} }
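The test queries built by `gen_query_inclusive` rely on the query parser's inclusive range syntax, `field:[lower TO upper]`, where `*` leaves a bound open. A standalone sketch of just the string construction, independent of the surrounding index setup:

```rust
/// Build an inclusive range query string in tantivy's query-parser syntax.
fn gen_query_inclusive(field: &str, from: impl ToString, to: impl ToString) -> String {
    format!("{}:[{} TO {}]", field, from.to_string(), to.to_string())
}

fn main() {
    assert_eq!(gen_query_inclusive("id_opt", 10u64, 20u64), "id_opt:[10 TO 20]");
    // "*" leaves one side of the range open.
    assert_eq!(gen_query_inclusive("ip", "*", "::1"), "ip:[* TO ::1]");
}
```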
// ip range query on fast field // ip range query on fast field
// //
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) { for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count); let (existing_id, count) = (*existing_id, *count);
if !id_is_full_doc(existing_id) { if !id_exists(existing_id) {
continue; continue;
} }
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| { let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
@@ -2210,7 +2156,7 @@ mod tests {
.first_or_default_col(9999); .first_or_default_col(9999);
for doc_id in segment_reader.doc_ids_alive() { for doc_id in segment_reader.doc_ids_alive() {
let id = ff_reader.get_val(doc_id); let id = ff_reader.get_val(doc_id);
if !id_is_full_doc(id) { if !id_exists(id) {
continue; continue;
} }
let facet_ords: Vec<u64> = facet_reader.facet_ords(doc_id).collect(); let facet_ords: Vec<u64> = facet_reader.facet_ords(doc_id).collect();
@@ -2248,12 +2194,6 @@ mod tests {
Ok(index) Ok(index)
} }
#[test]
fn test_fast_field_range() {
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
assert!(test_operation_strategy(&ops, false, true).is_ok());
}
#[test] #[test]
fn test_sort_index_on_opt_field_regression() { fn test_sort_index_on_opt_field_regression() {
assert!(test_operation_strategy( assert!(test_operation_strategy(
@@ -2603,7 +2543,7 @@ mod tests {
// Merge // Merge
{ {
assert!(index_writer.wait_merging_threads().is_ok()); assert!(index_writer.wait_merging_threads().is_ok());
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
@@ -2645,7 +2585,7 @@ mod tests {
// Merge // Merge
{ {
assert!(index_writer.wait_merging_threads().is_ok()); assert!(index_writer.wait_merging_threads().is_ok());
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");

View File

@@ -2,15 +2,13 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
use super::AddBatchReceiver; use super::AddBatchReceiver;
use crate::schema::document::Document;
use crate::TantivyDocument;
#[derive(Clone)] #[derive(Clone)]
pub(crate) struct IndexWriterStatus<D: Document = TantivyDocument> { pub(crate) struct IndexWriterStatus {
inner: Arc<Inner<D>>, inner: Arc<Inner>,
} }
impl<D: Document> IndexWriterStatus<D> { impl IndexWriterStatus {
/// Returns true iff the index writer is alive. /// Returns true iff the index writer is alive.
pub fn is_alive(&self) -> bool { pub fn is_alive(&self) -> bool {
self.inner.as_ref().is_alive() self.inner.as_ref().is_alive()
@@ -18,7 +16,7 @@ impl<D: Document> IndexWriterStatus<D> {
/// Returns a copy of the operation receiver. /// Returns a copy of the operation receiver.
/// If the index writer was killed, returns `None`. /// If the index writer was killed, returns `None`.
pub fn operation_receiver(&self) -> Option<AddBatchReceiver<D>> { pub fn operation_receiver(&self) -> Option<AddBatchReceiver> {
let rlock = self let rlock = self
.inner .inner
.receive_channel .receive_channel
@@ -29,19 +27,19 @@ impl<D: Document> IndexWriterStatus<D> {
/// Create an index writer bomb. /// Create an index writer bomb.
/// If dropped, the index writer status will be killed. /// If dropped, the index writer status will be killed.
pub(crate) fn create_bomb(&self) -> IndexWriterBomb<D> { pub(crate) fn create_bomb(&self) -> IndexWriterBomb {
IndexWriterBomb { IndexWriterBomb {
inner: Some(self.inner.clone()), inner: Some(self.inner.clone()),
} }
} }
} }
struct Inner<D: Document> { struct Inner {
is_alive: AtomicBool, is_alive: AtomicBool,
receive_channel: RwLock<Option<AddBatchReceiver<D>>>, receive_channel: RwLock<Option<AddBatchReceiver>>,
} }
impl<D: Document> Inner<D> { impl Inner {
fn is_alive(&self) -> bool { fn is_alive(&self) -> bool {
self.is_alive.load(Ordering::Relaxed) self.is_alive.load(Ordering::Relaxed)
} }
@@ -55,8 +53,8 @@ impl<D: Document> Inner<D> {
} }
} }
impl<D: Document> From<AddBatchReceiver<D>> for IndexWriterStatus<D> { impl From<AddBatchReceiver> for IndexWriterStatus {
fn from(receiver: AddBatchReceiver<D>) -> Self { fn from(receiver: AddBatchReceiver) -> Self {
IndexWriterStatus { IndexWriterStatus {
inner: Arc::new(Inner { inner: Arc::new(Inner {
is_alive: AtomicBool::new(true), is_alive: AtomicBool::new(true),
@@ -68,11 +66,11 @@ impl<D: Document> From<AddBatchReceiver<D>> for IndexWriterStatus<D> {
/// If dropped, the index writer will be killed. /// If dropped, the index writer will be killed.
/// To prevent this, clients can call `.defuse()`. /// To prevent this, clients can call `.defuse()`.
pub(crate) struct IndexWriterBomb<D: Document> { pub(crate) struct IndexWriterBomb {
inner: Option<Arc<Inner<D>>>, inner: Option<Arc<Inner>>,
} }
impl<D: Document> IndexWriterBomb<D> { impl IndexWriterBomb {
/// Defuses the bomb. /// Defuses the bomb.
/// ///
/// This is the only way to drop the bomb without killing /// This is the only way to drop the bomb without killing
@@ -82,7 +80,7 @@ impl<D: Document> IndexWriterBomb<D> {
} }
} }
impl<D: Document> Drop for IndexWriterBomb<D> { impl Drop for IndexWriterBomb {
fn drop(&mut self) { fn drop(&mut self) {
if let Some(inner) = self.inner.take() { if let Some(inner) = self.inner.take() {
inner.kill(); inner.kill();
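`IndexWriterBomb` is a drop-guard: dropping it without calling `defuse()` kills the shared writer status, which is what shuts the writer down when a worker thread panics or returns early. A self-contained sketch of that pattern, using generic names (`Bomb`, `alive`) rather than tantivy's types:

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

struct Bomb {
    alive: Option<Arc<AtomicBool>>,
}

impl Bomb {
    /// Disarm the bomb so that dropping it leaves the flag untouched.
    fn defuse(mut self) {
        self.alive = None;
    }
}

impl Drop for Bomb {
    fn drop(&mut self) {
        // Only fires if `defuse` was never called (e.g. on panic or early return).
        if let Some(alive) = self.alive.take() {
            alive.store(false, Ordering::Relaxed);
        }
    }
}

fn main() {
    let alive = Arc::new(AtomicBool::new(true));

    // Dropped without defusing: the flag is cleared.
    drop(Bomb { alive: Some(alive.clone()) });
    assert!(!alive.load(Ordering::Relaxed));

    // Defused: the flag survives the drop.
    alive.store(true, Ordering::Relaxed);
    Bomb { alive: Some(alive.clone()) }.defuse();
    assert!(alive.load(Ordering::Relaxed));
}
```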

View File

@@ -63,13 +63,10 @@ impl MergeOperation {
} }
} }
/// Returns the opstamp up to which we want to consume the delete queue and apply its
/// deletes.
pub fn target_opstamp(&self) -> Opstamp { pub fn target_opstamp(&self) -> Opstamp {
self.inner.target_opstamp self.inner.target_opstamp
} }
/// Returns the list of segments to be merged.
pub fn segment_ids(&self) -> &[SegmentId] { pub fn segment_ids(&self) -> &[SegmentId] {
&self.inner.segment_ids[..] &self.inner.segment_ids[..]
} }

View File

@@ -552,41 +552,7 @@ impl IndexMerger {
continue; continue;
} }
// This should never happen as we early exited for total_doc_freq == 0. field_serializer.new_term(term_bytes, total_doc_freq)?;
assert!(!segment_postings_containing_the_term.is_empty());
let has_term_freq = {
let has_term_freq = !segment_postings_containing_the_term[0]
.1
.block_cursor
.freqs()
.is_empty();
for (_, postings) in &segment_postings_containing_the_term[1..] {
// This may look like a strange way to test whether we have term freqs or not.
// With JSON object, the schema is not sufficient to know whether a term
// has its term frequency encoded or not:
// strings may have term frequencies, while number terms never have one.
//
// Ideally, we should have burnt one bit or two in the `TermInfo`.
// However, we preferred not changing the codec too much and detect this
// instead by
// - looking at the size of the skip data for bitpacked blocks
// - observing the absence of remaining data after reading the docs for vint
// blocks.
//
// Overall the reliable way to know if we have actual frequencies loaded or not
// is to check whether the actual decoded array is empty or not.
if has_term_freq != !postings.block_cursor.freqs().is_empty() {
return Err(DataCorruption::comment_only(
"Term freqs are inconsistent across segments",
)
.into());
}
}
has_term_freq
};
field_serializer.new_term(term_bytes, total_doc_freq, has_term_freq)?;
// We can now serialize this postings, by pushing each document to the // We can now serialize this postings, by pushing each document to the
// postings serializer. // postings serializer.
@@ -601,13 +567,8 @@ impl IndexMerger {
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] { if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
// we make sure to only write the term if // we make sure to only write the term if
// there is at least one document. // there is at least one document.
let term_freq = if has_term_freq { let term_freq = segment_postings.term_freq();
segment_postings.positions(&mut positions_buffer); segment_postings.positions(&mut positions_buffer);
segment_postings.term_freq()
} else {
0u32
};
// if doc_id_mapping exists, the doc_ids are reordered, they are // if doc_id_mapping exists, the doc_ids are reordered, they are
// not just stacked. The field serializer expects monotonically increasing // not just stacked. The field serializer expects monotonically increasing
// doc_ids, so we collect and sort them first, before writing. // doc_ids, so we collect and sort them first, before writing.
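The `has_term_freq` block above decides whether a merged term carries frequencies by checking whether each segment's decoded freq block is empty, and refuses to merge when segments disagree. A stripped-down sketch of that consistency check, with plain slices standing in for the per-segment postings:

```rust
/// Decide whether a term has frequencies, erroring out if the segments disagree.
/// Each inner slice stands for the decoded freq block of one segment's postings;
/// the caller guarantees there is at least one segment.
fn resolve_has_term_freq(freq_blocks: &[&[u32]]) -> Result<bool, String> {
    let has_term_freq = !freq_blocks[0].is_empty();
    for block in &freq_blocks[1..] {
        if has_term_freq != !block.is_empty() {
            return Err("Term freqs are inconsistent across segments".to_string());
        }
    }
    Ok(has_term_freq)
}

fn main() {
    // All segments agree: frequencies present everywhere.
    assert_eq!(resolve_has_term_freq(&[&[2, 1], &[3]]), Ok(true));
    // All segments agree: no frequencies anywhere (e.g. a numeric JSON term).
    assert_eq!(resolve_has_term_freq(&[&[], &[]]), Ok(false));
    // Mixed segments are treated as data corruption.
    assert!(resolve_has_term_freq(&[&[2], &[]]).is_err());
}
```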
@@ -792,10 +753,9 @@ mod tests {
use crate::collector::{Count, FacetCollector}; use crate::collector::{Count, FacetCollector};
use crate::core::Index; use crate::core::Index;
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery}; use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::document::Value;
use crate::schema::{ use crate::schema::{
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term, Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term, TextFieldIndexing,
TextFieldIndexing, INDEXED, TEXT, INDEXED, TEXT,
}; };
use crate::time::OffsetDateTime; use crate::time::OffsetDateTime;
use crate::{ use crate::{
@@ -857,7 +817,7 @@ mod tests {
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?; index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads()?;
} }
@@ -906,24 +866,30 @@ mod tests {
); );
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?; let doc = searcher.doc(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b")); assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?; let doc = searcher.doc(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c")); assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("a b c"));
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?; let doc = searcher.doc(DocAddress::new(0, 2))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c d")); assert_eq!(
doc.get_first(text_field).unwrap().as_text(),
Some("a b c d")
);
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?; let doc = searcher.doc(DocAddress::new(0, 3))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b")); assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
} }
{ {
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?; let doc = searcher.doc(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c g")); assert_eq!(
doc.get_first(text_field).unwrap().as_text(),
Some("a b c g")
);
} }
{ {
@@ -1334,10 +1300,10 @@ mod tests {
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
let mut int_val = 0; let mut int_val = 0;
{ {
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = let index_doc =
|index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| { |index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| {
let mut doc = TantivyDocument::default(); let mut doc = Document::default();
for facet in doc_facets { for facet in doc_facets {
doc.add_facet(facet_field, Facet::from(facet)); doc.add_facet(facet_field, Facet::from(facet));
} }
@@ -1418,7 +1384,7 @@ mod tests {
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.merge(&segment_ids) .merge(&segment_ids)
.wait() .wait()
@@ -1440,7 +1406,7 @@ mod tests {
// Deleting one term // Deleting one term
{ {
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]); let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet); let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term); index_writer.delete_term(facet_term);
@@ -1465,7 +1431,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED); let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(int_field => 1u64))?; index_writer.add_document(doc!(int_field => 1u64))?;
index_writer.commit().expect("commit failed"); index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64))?; index_writer.add_document(doc!(int_field => 1u64))?;
@@ -1494,7 +1460,7 @@ mod tests {
let reader = index.reader()?; let reader = index.reader()?;
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
let mut doc = TantivyDocument::default(); let mut doc = Document::default();
doc.add_u64(int_field, 1); doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone())?; index_writer.add_document(doc.clone())?;
index_writer.commit()?; index_writer.commit()?;
@@ -1537,7 +1503,7 @@ mod tests {
{ {
let mut index_writer = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| { let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = TantivyDocument::default(); let mut doc = Document::default();
for &val in int_vals { for &val in int_vals {
doc.add_u64(int_field, val); doc.add_u64(int_field, val);
} }
@@ -1600,7 +1566,7 @@ mod tests {
// Merging the segments // Merging the segments
{ {
let segment_ids = index.searchable_segment_ids()?; let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?; index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads()?;
} }
@@ -1647,7 +1613,7 @@ mod tests {
writer.set_merge_policy(Box::new(policy)); writer.set_merge_policy(Box::new(policy));
for i in 0..100 { for i in 0..100 {
let mut doc = TantivyDocument::new(); let mut doc = Document::new();
doc.add_f64(field, 42.0); doc.add_f64(field, 42.0);
doc.add_f64(multi_field, 0.24); doc.add_f64(multi_field, 0.24);
doc.add_f64(multi_field, 0.27); doc.add_f64(multi_field, 0.27);

View File

@@ -4,15 +4,11 @@ mod tests {
use crate::core::Index; use crate::core::Index;
use crate::fastfield::AliveBitSet; use crate::fastfield::AliveBitSet;
use crate::query::QueryParser; use crate::query::QueryParser;
use crate::schema::document::Value;
use crate::schema::{ use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions, self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions, TextFieldIndexing, TextOptions,
}; };
use crate::{ use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term};
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, Postings,
TantivyDocument, Term,
};
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index { fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
@@ -30,7 +26,7 @@ mod tests {
let index = index_builder.create_in_ram().unwrap(); let index = index_builder.create_in_ram().unwrap();
{ {
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer index_writer
.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime"))) .add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")))
.unwrap(); .unwrap();
@@ -49,7 +45,7 @@ mod tests {
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
assert!(index_writer.merge(&segment_ids).wait().is_ok()); assert!(index_writer.merge(&segment_ids).wait().is_ok());
assert!(index_writer.wait_merging_threads().is_ok()); assert!(index_writer.wait_merging_threads().is_ok());
} }
@@ -137,7 +133,7 @@ mod tests {
// Merging the segments // Merging the segments
{ {
let segment_ids = index.searchable_segment_ids()?; let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?; index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads()?;
} }
@@ -276,16 +272,12 @@ mod tests {
} else { } else {
2 2
}; };
let doc = searcher let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap();
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
.unwrap();
assert_eq!( assert_eq!(
doc.get_first(my_text_field).unwrap().as_str(), doc.get_first(my_text_field).unwrap().as_text(),
Some("blubber") Some("blubber")
); );
let doc = searcher let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
.doc::<TantivyDocument>(DocAddress::new(0, 0))
.unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000)); assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000));
} }
} }
@@ -502,7 +494,7 @@ mod bench_sorted_index_merge {
let index = index_builder.create_in_ram().unwrap(); let index = index_builder.create_in_ram().unwrap();
{ {
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, val: u64| { let index_doc = |index_writer: &mut IndexWriter, val: u64| {
index_writer.add_document(doc!(int_field=>val)).unwrap(); index_writer.add_document(doc!(int_field=>val)).unwrap();
}; };
@@ -543,7 +535,7 @@ mod bench_sorted_index_merge {
//); //);
//(doc_addr.doc_id, reader, u64_reader) //(doc_addr.doc_id, reader, u64_reader)
//}); //});
/// add values in order of the new doc_ids //// add values in order of the new doc_ids
// let mut val = 0; // let mut val = 0;
// for (doc_id, _reader, field_reader) in sorted_doc_ids { // for (doc_id, _reader, field_reader) in sorted_doc_ids {
// val = field_reader.get_val(doc_id); // val = field_reader.get_val(doc_id);

View File

@@ -1,30 +1,23 @@
//! Indexing and merging data. pub mod delete_queue;
//!
//! Contains code to create and merge segments.
//! `IndexWriter` is the main entry point for that, which is created from
//! [`Index::writer`](crate::Index::writer).
pub(crate) mod delete_queue; pub mod doc_id_mapping;
pub(crate) mod path_to_unordered_id;
pub(crate) mod doc_id_mapping;
mod doc_opstamp_mapping; mod doc_opstamp_mapping;
mod flat_map_with_buffer; mod flat_map_with_buffer;
pub(crate) mod index_writer; pub mod index_writer;
pub(crate) mod index_writer_status; mod index_writer_status;
mod log_merge_policy; mod log_merge_policy;
mod merge_operation; mod merge_operation;
pub(crate) mod merge_policy; pub mod merge_policy;
pub(crate) mod merger; pub mod merger;
mod merger_sorted_index_test; mod merger_sorted_index_test;
pub(crate) mod operation; pub mod operation;
pub(crate) mod prepared_commit; pub mod prepared_commit;
mod segment_entry; mod segment_entry;
mod segment_manager; mod segment_manager;
mod segment_register; mod segment_register;
pub(crate) mod segment_serializer; pub mod segment_serializer;
pub(crate) mod segment_updater; pub mod segment_updater;
pub(crate) mod segment_writer; mod segment_writer;
mod stamper; mod stamper;
use crossbeam_channel as channel; use crossbeam_channel as channel;
@@ -34,10 +27,10 @@ pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy; pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation; pub use self::merge_operation::MergeOperation;
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy}; pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
pub use self::operation::UserOperation;
pub use self::prepared_commit::PreparedCommit; pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry; pub use self::segment_entry::SegmentEntry;
pub(crate) use self::segment_serializer::SegmentSerializer; pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_updater::{merge_filtered_segments, merge_indices}; pub use self::segment_updater::{merge_filtered_segments, merge_indices};
pub use self::segment_writer::SegmentWriter; pub use self::segment_writer::SegmentWriter;
use crate::indexer::operation::AddOperation; use crate::indexer::operation::AddOperation;
@@ -51,28 +44,25 @@ pub type DefaultMergePolicy = LogMergePolicy;
// - all docs in the operation will land on the same segment and get contiguous doc_ids. // - all docs in the operation will land on the same segment and get contiguous doc_ids.
// - all operations in the group are committed at the same time, making the group // - all operations in the group are committed at the same time, making the group
// atomic. // atomic.
type AddBatch<D> = SmallVec<[AddOperation<D>; 4]>; type AddBatch = SmallVec<[AddOperation; 4]>;
type AddBatchSender<D> = channel::Sender<AddBatch<D>>; type AddBatchSender = channel::Sender<AddBatch>;
type AddBatchReceiver<D> = channel::Receiver<AddBatch<D>>; type AddBatchReceiver = channel::Receiver<AddBatch>;
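For context, the grouping guarantee described above is what a caller gets when handing a whole batch of `UserOperation`s to `IndexWriter::run`. A minimal sketch, assuming the 0.21-style crate-root re-export `tantivy::UserOperation` (on the 0.22-dev side it lives under `tantivy::indexer`) and an in-RAM index:

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, Term, UserOperation};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer(50_000_000)?;

    // The whole batch is stamped together: the added documents end up in the
    // same segment with contiguous doc ids, and the group is committed atomically.
    let batch = vec![
        UserOperation::Delete(Term::from_field_text(text, "stale")),
        UserOperation::Add(doc!(text => "fresh document")),
        UserOperation::Add(doc!(text => "another fresh document")),
    ];
    index_writer.run(batch)?;
    index_writer.commit()?;
    Ok(())
}
```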
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
#[cfg(test)] #[cfg(test)]
mod tests_mmap { mod tests_mmap {
use crate::aggregation::agg_req::Aggregations; use crate::collector::Count;
use crate::aggregation::agg_result::AggregationResults; use crate::query::QueryParser;
use crate::aggregation::AggregationCollector; use crate::schema::{JsonObjectOptions, Schema, TEXT};
use crate::collector::{Count, TopDocs}; use crate::{Index, Term};
use crate::query::{AllQuery, QueryParser};
use crate::schema::{JsonObjectOptions, Schema, Type, FAST, INDEXED, STORED, TEXT};
use crate::{FieldMetadata, Index, IndexWriter, Term};
#[test] #[test]
fn test_advance_delete_bug() -> crate::Result<()> { fn test_advance_delete_bug() -> crate::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_from_tempdir(schema_builder.build())?; let index = Index::create_from_tempdir(schema_builder.build())?;
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
// there must be one deleted document in the segment // there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"))?; index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.delete_term(Term::from_field_text(text_field, "b")); index_writer.delete_term(Term::from_field_text(text_field, "b"));
@@ -89,7 +79,7 @@ mod tests_mmap {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT); let json_field = schema_builder.add_json_field("json", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"}); let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap(); index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
@@ -113,35 +103,6 @@ mod tests_mmap {
} }
} }
#[test]
fn test_json_field_number() {
// this test was added specifically to reach the cases where json fields, with
// frequencies enabled, store integers, with enough documents containing a single integer
// for the posting list to be bitpacked.
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
for _ in 0..256 {
let json = serde_json::json!({"somekey": 1u64, "otherkey": -2i64});
index_writer.add_document(doc!(json_field=>json)).unwrap();
let json = serde_json::json!({"somekey": "1str", "otherkey": "2str"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 512);
let parse_query = QueryParser::for_index(&index, Vec::new());
{
let query = parse_query.parse_query(r"json.somekey:1").unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 256);
}
}
#[test] #[test]
fn test_json_field_expand_dots_enabled_dot_escape_not_required() { fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -149,7 +110,7 @@ mod tests_mmap {
JsonObjectOptions::from(TEXT).set_expand_dots_enabled(); JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("json", json_options); let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"}); let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap(); index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
@@ -172,275 +133,4 @@ mod tests_mmap {
assert_eq!(num_docs, 1); assert_eq!(num_docs, 1);
} }
} }
#[test]
fn test_json_field_list_fields() {
let mut schema_builder = Schema::builder();
let json_options: JsonObjectOptions = JsonObjectOptions::from(TEXT);
let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "sub": {"a": 1, "b": 2}});
index_writer.add_document(doc!(json_field=>json)).unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": 1, "b": 2}});
index_writer.add_document(doc!(json_field=>json)).unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": "mixed", "b": 2}});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 3);
let reader = &searcher.segment_readers()[0];
let inverted_index = reader.inverted_index(json_field).unwrap();
assert_eq!(
inverted_index.list_encoded_fields().unwrap(),
[
("k8s.container.name".to_string(), Type::Str),
("sub\u{1}a".to_string(), Type::I64),
("sub\u{1}b".to_string(), Type::I64),
("suber\u{1}a".to_string(), Type::I64),
("suber\u{1}a".to_string(), Type::Str),
("suber\u{1}b".to_string(), Type::I64),
("val".to_string(), Type::Str),
]
);
}
#[test]
fn test_json_fields_metadata_expanded_dots_one_segment() {
test_json_fields_metadata(true, true);
}
#[test]
fn test_json_fields_metadata_expanded_dots_multi_segment() {
test_json_fields_metadata(true, false);
}
#[test]
fn test_json_fields_metadata_no_expanded_dots_one_segment() {
test_json_fields_metadata(false, true);
}
#[test]
fn test_json_fields_metadata_no_expanded_dots_multi_segment() {
test_json_fields_metadata(false, false);
}
fn test_json_fields_metadata(expanded_dots: bool, one_segment: bool) {
use pretty_assertions::assert_eq;
let mut schema_builder = Schema::builder();
let json_options: JsonObjectOptions =
JsonObjectOptions::from(TEXT).set_fast(None).set_stored();
let json_options = if expanded_dots {
json_options.set_expand_dots_enabled()
} else {
json_options
};
schema_builder.add_json_field("json.confusing", json_options.clone());
let json_field = schema_builder.add_json_field("json.shadow", json_options.clone());
let json_field2 = schema_builder.add_json_field("json", json_options.clone());
schema_builder.add_json_field("empty_json", json_options);
let number_field = schema_builder.add_u64_field("numbers", FAST);
schema_builder.add_u64_field("empty", FAST | INDEXED | STORED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json =
serde_json::json!({"k8s.container.name": "a", "val": "a", "sub": {"a": 1, "b": 1}});
index_writer.add_document(doc!(json_field=>json)).unwrap();
let json =
serde_json::json!({"k8s.container.name": "a", "val": "a", "suber": {"a": 1, "b": 1}});
if !one_segment {
index_writer.commit().unwrap();
}
index_writer.add_document(doc!(json_field=>json)).unwrap();
let json = serde_json::json!({"k8s.container.name": "a", "k8s.container.name": "a", "val": "a", "suber": {"a": "a", "b": 1}});
index_writer
.add_document(doc!(number_field => 50u64, json_field=>json, json_field2=>json!({"shadow": {"val": "a"}})))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 3);
let fields_metadata = index.fields_metadata().unwrap();
assert_eq!(
fields_metadata,
[
FieldMetadata {
field_name: "empty".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::U64
},
FieldMetadata {
field_name: if expanded_dots {
"json.shadow.k8s.container.name".to_string()
} else {
"json.shadow.k8s\\.container\\.name".to_string()
},
indexed: true,
stored: true,
fast: true,
typ: Type::Str
},
FieldMetadata {
field_name: "json.shadow.sub.a".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::I64
},
FieldMetadata {
field_name: "json.shadow.sub.b".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::I64
},
FieldMetadata {
field_name: "json.shadow.suber.a".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::I64
},
FieldMetadata {
field_name: "json.shadow.suber.a".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::Str
},
FieldMetadata {
field_name: "json.shadow.suber.b".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::I64
},
FieldMetadata {
field_name: "json.shadow.val".to_string(),
indexed: true,
stored: true,
fast: true,
typ: Type::Str
},
FieldMetadata {
field_name: "numbers".to_string(),
indexed: false,
stored: false,
fast: true,
typ: Type::U64
}
]
);
let query_parser = QueryParser::for_index(&index, vec![]);
// Test if returned field name can be queried
for indexed_field in fields_metadata.iter().filter(|meta| meta.indexed) {
let val = if indexed_field.typ == Type::Str {
"a"
} else {
"1"
};
let query_str = &format!("{}:{}", indexed_field.field_name, val);
let query = query_parser.parse_query(query_str).unwrap();
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
if indexed_field.field_name.contains("empty") || indexed_field.typ == Type::Json {
assert_eq!(count_docs.len(), 0);
} else {
assert!(!count_docs.is_empty(), "{}", indexed_field.field_name);
}
}
// Test if returned field name can be used for aggregation
for fast_field in fields_metadata.iter().filter(|meta| meta.fast) {
let agg_req_str = json!(
{
"termagg": {
"terms": {
"field": fast_field.field_name,
}
}
});
let agg_req: Aggregations = serde_json::from_value(agg_req_str).unwrap();
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res = serde_json::to_value(agg_res).unwrap();
if !fast_field.field_name.contains("empty") && fast_field.typ != Type::Json {
assert!(
!res["termagg"]["buckets"].as_array().unwrap().is_empty(),
"{}",
fast_field.field_name
);
}
}
}
#[test]
fn test_json_field_shadowing_field_name_bug() {
/// This test is only there to demonstrate a bug when addressing a field that gets shadowed.
/// The issue only occurs if the shadowing field name contains a dot.
///
/// It happens independently of the `expand_dots` option, since that option does not
/// affect the field name itself.
use pretty_assertions::assert_eq;
let mut schema_builder = Schema::builder();
let json_options: JsonObjectOptions =
JsonObjectOptions::from(TEXT).set_fast(None).set_stored();
// let json_options = json_options.set_expand_dots_enabled();
let json_field_shadow = schema_builder.add_json_field("json.shadow", json_options.clone());
let json_field = schema_builder.add_json_field("json", json_options.clone());
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(
doc!(json_field_shadow=>json!({"val": "b"}), json_field=>json!({"shadow": {"val": "a"}})),
)
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let fields_and_vals = vec![
// The only way to address the field, otherwise it gets shadowed by the `json.shadow` field
("json.shadow\u{1}val".to_string(), "a"), // Succeeds
//("json.shadow.val".to_string(), "a"), // Fails
("json.shadow.val".to_string(), "b"), // Succeeds
];
let query_parser = QueryParser::for_index(&index, vec![]);
// Test if field name can be queried
for (indexed_field, val) in fields_and_vals.iter() {
let query_str = &format!("{}:{}", indexed_field, val);
let query = query_parser.parse_query(query_str).unwrap();
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
assert!(!count_docs.is_empty(), "{}:{}", indexed_field, val);
}
// Test if field name can be used for aggregation
for (field_name, val) in fields_and_vals.iter() {
let agg_req_str = json!(
{
"termagg": {
"terms": {
"field": field_name,
}
}
});
let agg_req: Aggregations = serde_json::from_value(agg_req_str).unwrap();
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res = serde_json::to_value(agg_res).unwrap();
assert_eq!(
res["termagg"]["buckets"].as_array().unwrap()[0]["key"]
.as_str()
.unwrap(),
*val,
"{}",
field_name
);
}
}
} }

View File

@@ -1,6 +1,5 @@
use crate::query::Weight; use crate::query::Weight;
use crate::schema::document::Document; use crate::schema::{Document, Term};
use crate::schema::{TantivyDocument, Term};
use crate::Opstamp; use crate::Opstamp;
/// Timestamped Delete operation. /// Timestamped Delete operation.
@@ -11,16 +10,16 @@ pub struct DeleteOperation {
/// Timestamped Add operation. /// Timestamped Add operation.
#[derive(Eq, PartialEq, Debug)] #[derive(Eq, PartialEq, Debug)]
pub struct AddOperation<D: Document = TantivyDocument> { pub struct AddOperation {
pub opstamp: Opstamp, pub opstamp: Opstamp,
pub document: D, pub document: Document,
} }
/// UserOperation is an enum type that encapsulates other operation types. /// UserOperation is an enum type that encapsulates other operation types.
#[derive(Eq, PartialEq, Debug)] #[derive(Eq, PartialEq, Debug)]
pub enum UserOperation<D: Document = TantivyDocument> { pub enum UserOperation {
/// Add operation /// Add operation
Add(D), Add(Document),
/// Delete operation /// Delete operation
Delete(Term), Delete(Term),
} }

View File

@@ -1,92 +0,0 @@
use fnv::FnvHashMap;
/// An `OrderedPathId` is represented by an unsigned 32-bit integer.
/// Its value reflects the lexicographic order of the paths it identifies.
#[derive(Copy, Default, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash)]
pub struct OrderedPathId(u32);
impl OrderedPathId {
/// Create a new field object for the given PathId.
pub const fn from_ordered_id(field_id: u32) -> OrderedPathId {
OrderedPathId(field_id)
}
/// Returns a u32 identifying uniquely a path within a schema.
pub const fn path_id(self) -> u32 {
self.0
}
}
impl From<u32> for OrderedPathId {
fn from(id: u32) -> Self {
Self(id)
}
}
#[derive(Default)]
pub(crate) struct PathToUnorderedId {
map: FnvHashMap<String, u32>,
}
impl PathToUnorderedId {
#[inline]
pub(crate) fn get_or_allocate_unordered_id(&mut self, path: &str) -> u32 {
if let Some(id) = self.map.get(path) {
return *id;
}
self.insert_new_path(path)
}
#[cold]
fn insert_new_path(&mut self, path: &str) -> u32 {
let next_id = self.map.len() as u32;
self.map.insert(path.to_string(), next_id);
next_id
}
/// Returns ids which reflect the lexical order of the paths.
///
/// The returned vec can be indexed with the unordered id to get the ordered id.
pub(crate) fn unordered_id_to_ordered_id(&self) -> Vec<OrderedPathId> {
let mut sorted_ids: Vec<(&str, &u32)> =
self.map.iter().map(|(k, v)| (k.as_str(), v)).collect();
sorted_ids.sort_unstable_by_key(|(path, _)| *path);
let mut result = vec![OrderedPathId::default(); sorted_ids.len()];
for (ordered, unordered) in sorted_ids.iter().map(|(_k, v)| v).enumerate() {
result[**unordered as usize] = OrderedPathId::from_ordered_id(ordered as u32);
}
result
}
/// Returns the paths so they can be queried by the ordered id (which is the index).
pub(crate) fn ordered_id_to_path(&self) -> Vec<&str> {
let mut paths = self.map.keys().map(String::as_str).collect::<Vec<_>>();
paths.sort_unstable();
paths
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn path_to_unordered_test() {
let mut path_to_id = PathToUnorderedId::default();
let terms = vec!["b", "a", "b", "c"];
let ids = terms
.iter()
.map(|term| path_to_id.get_or_allocate_unordered_id(term))
.collect::<Vec<u32>>();
assert_eq!(ids, vec![0, 1, 0, 2]);
let ordered_ids = ids
.iter()
.map(|id| path_to_id.unordered_id_to_ordered_id()[*id as usize])
.collect::<Vec<OrderedPathId>>();
assert_eq!(ordered_ids, vec![1.into(), 0.into(), 1.into(), 2.into()]);
// Fetch terms
let terms_fetched = ordered_ids
.iter()
.map(|id| path_to_id.ordered_id_to_path()[id.path_id() as usize])
.collect::<Vec<&str>>();
assert_eq!(terms_fetched, terms);
}
}

View File

@@ -1,17 +1,16 @@
use super::IndexWriter; use super::IndexWriter;
use crate::schema::document::Document; use crate::{FutureResult, Opstamp};
use crate::{FutureResult, Opstamp, TantivyDocument};
/// A prepared commit /// A prepared commit
pub struct PreparedCommit<'a, D: Document = TantivyDocument> { pub struct PreparedCommit<'a> {
index_writer: &'a mut IndexWriter<D>, index_writer: &'a mut IndexWriter,
payload: Option<String>, payload: Option<String>,
opstamp: Opstamp, opstamp: Opstamp,
} }
impl<'a, D: Document> PreparedCommit<'a, D> { impl<'a> PreparedCommit<'a> {
pub(crate) fn new(index_writer: &'a mut IndexWriter<D>, opstamp: Opstamp) -> Self { pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit<'_> {
Self { PreparedCommit {
index_writer, index_writer,
payload: None, payload: None,
opstamp, opstamp,

View File

@@ -1,5 +1,4 @@
use columnar::MonotonicallyMappableToU64; use columnar::MonotonicallyMappableToU64;
use common::JsonPathWriter;
use itertools::Itertools; use itertools::Itertools;
use tokenizer_api::BoxTokenStream; use tokenizer_api::BoxTokenStream;
@@ -14,11 +13,10 @@ use crate::postings::{
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition, compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter, PerFieldPostingsWriter, PostingsWriter,
}; };
use crate::schema::document::{Document, ReferenceValue, Value}; use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED};
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter}; use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer}; use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DocId, Opstamp, SegmentComponent, TantivyError}; use crate::{DocId, Document, Opstamp, SegmentComponent, TantivyError};
/// Computes the initial size of the hash table. /// Computes the initial size of the hash table.
/// ///
@@ -67,7 +65,6 @@ pub struct SegmentWriter {
pub(crate) segment_serializer: SegmentSerializer, pub(crate) segment_serializer: SegmentSerializer,
pub(crate) fast_field_writers: FastFieldsWriter, pub(crate) fast_field_writers: FastFieldsWriter,
pub(crate) fieldnorms_writer: FieldNormsWriter, pub(crate) fieldnorms_writer: FieldNormsWriter,
pub(crate) json_path_writer: JsonPathWriter,
pub(crate) doc_opstamps: Vec<Opstamp>, pub(crate) doc_opstamps: Vec<Opstamp>,
per_field_text_analyzers: Vec<TextAnalyzer>, per_field_text_analyzers: Vec<TextAnalyzer>,
term_buffer: Term, term_buffer: Term,
@@ -84,7 +81,10 @@ impl SegmentWriter {
/// the flushing behavior as a memory limit. /// the flushing behavior as a memory limit.
/// - segment: The segment being written /// - segment: The segment being written
/// - schema /// - schema
pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> { pub fn for_segment(
memory_budget_in_bytes: usize,
segment: Segment,
) -> crate::Result<SegmentWriter> {
let schema = segment.schema(); let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone(); let tokenizer_manager = segment.index().tokenizers().clone();
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone(); let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
@@ -113,12 +113,11 @@ impl SegmentWriter {
}) })
}) })
.collect::<Result<Vec<_>, _>>()?; .collect::<Result<Vec<_>, _>>()?;
Ok(Self { Ok(SegmentWriter {
max_doc: 0, max_doc: 0,
ctx: IndexingContext::new(table_size), ctx: IndexingContext::new(table_size),
per_field_postings_writers, per_field_postings_writers,
fieldnorms_writer: FieldNormsWriter::for_schema(&schema), fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
json_path_writer: JsonPathWriter::default(),
segment_serializer, segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager( fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager(
&schema, &schema,
@@ -147,7 +146,6 @@ impl SegmentWriter {
.map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self)) .map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
.transpose()?; .transpose()?;
remap_and_write( remap_and_write(
self.schema,
&self.per_field_postings_writers, &self.per_field_postings_writers,
self.ctx, self.ctx,
self.fast_field_writers, self.fast_field_writers,
@@ -159,8 +157,6 @@ impl SegmentWriter {
Ok(doc_opstamps) Ok(doc_opstamps)
} }
/// Returns an estimation of the current memory usage of the segment writer.
/// If the memory usage exceeds the `memory_budget`, the segment will be serialized.
pub fn mem_usage(&self) -> usize { pub fn mem_usage(&self) -> usize {
self.ctx.mem_usage() self.ctx.mem_usage()
+ self.fieldnorms_writer.mem_usage() + self.fieldnorms_writer.mem_usage()
@@ -168,21 +164,18 @@ impl SegmentWriter {
+ self.segment_serializer.mem_usage() + self.segment_serializer.mem_usage()
} }
fn index_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> { fn index_document(&mut self, doc: &Document) -> crate::Result<()> {
let doc_id = self.max_doc; let doc_id = self.max_doc;
// TODO: Can this be optimised a bit?
let vals_grouped_by_field = doc let vals_grouped_by_field = doc
.iter_fields_and_values() .field_values()
.sorted_by_key(|(field, _)| *field) .iter()
.group_by(|(field, _)| *field); .sorted_by_key(|el| el.field())
.group_by(|el| el.field());
for (field, field_values) in &vals_grouped_by_field { for (field, field_values) in &vals_grouped_by_field {
let values = field_values.map(|el| el.1); let values = field_values.map(|field_value| field_value.value());
let field_entry = self.schema.get_field_entry(field); let field_entry = self.schema.get_field_entry(field);
let make_schema_error = || { let make_schema_error = || {
TantivyError::SchemaError(format!( crate::TantivyError::SchemaError(format!(
"Expected a {:?} for field {:?}", "Expected a {:?} for field {:?}",
field_entry.field_type().value_type(), field_entry.field_type().value_type(),
field_entry.name() field_entry.name()
@@ -200,10 +193,7 @@ impl SegmentWriter {
match field_entry.field_type() { match field_entry.field_type() {
FieldType::Facet(_) => { FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
for value_access in values { for value in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
let facet = value.as_facet().ok_or_else(make_schema_error)?; let facet = value.as_facet().ok_or_else(make_schema_error)?;
let facet_str = facet.encoded_str(); let facet_str = facet.encoded_str();
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str); let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -219,18 +209,19 @@ impl SegmentWriter {
} }
FieldType::Str(_) => { FieldType::Str(_) => {
let mut indexing_position = IndexingPosition::default(); let mut indexing_position = IndexingPosition::default();
for value_access in values { for value in values {
// Used to help with linting and type checking. let mut token_stream = match value {
let value = value_access as D::Value<'_>; Value::PreTokStr(tok_str) => {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
let mut token_stream = if let Some(text) = value.as_str() { }
let text_analyzer = Value::Str(ref text) => {
&mut self.per_field_text_analyzers[field.field_id() as usize]; let text_analyzer =
text_analyzer.token_stream(text) &mut self.per_field_text_analyzers[field.field_id() as usize];
} else if let Some(tok_str) = value.as_pre_tokenized_text() { text_analyzer.token_stream(text)
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone())) }
} else { _ => {
continue; continue;
}
}; };
assert!(term_buffer.is_empty()); assert!(term_buffer.is_empty());
@@ -249,10 +240,7 @@ impl SegmentWriter {
} }
FieldType::U64(_) => { FieldType::U64(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
num_vals += 1; num_vals += 1;
let u64_val = value.as_u64().ok_or_else(make_schema_error)?; let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
term_buffer.set_u64(u64_val); term_buffer.set_u64(u64_val);
@@ -264,13 +252,9 @@ impl SegmentWriter {
} }
FieldType::Date(_) => { FieldType::Date(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
num_vals += 1; num_vals += 1;
let date_val = value.as_datetime().ok_or_else(make_schema_error)?; let date_val = value.as_date().ok_or_else(make_schema_error)?;
term_buffer term_buffer
.set_u64(date_val.truncate(DATE_TIME_PRECISION_INDEXED).to_u64()); .set_u64(date_val.truncate(DATE_TIME_PRECISION_INDEXED).to_u64());
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
@@ -281,10 +265,7 @@ impl SegmentWriter {
} }
FieldType::I64(_) => { FieldType::I64(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
num_vals += 1; num_vals += 1;
let i64_val = value.as_i64().ok_or_else(make_schema_error)?; let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
term_buffer.set_i64(i64_val); term_buffer.set_i64(i64_val);
@@ -296,10 +277,7 @@ impl SegmentWriter {
} }
FieldType::F64(_) => { FieldType::F64(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
num_vals += 1; num_vals += 1;
let f64_val = value.as_f64().ok_or_else(make_schema_error)?; let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val); term_buffer.set_f64(f64_val);
@@ -311,10 +289,7 @@ impl SegmentWriter {
} }
FieldType::Bool(_) => { FieldType::Bool(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
num_vals += 1; num_vals += 1;
let bool_val = value.as_bool().ok_or_else(make_schema_error)?; let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
term_buffer.set_bool(bool_val); term_buffer.set_bool(bool_val);
@@ -326,10 +301,7 @@ impl SegmentWriter {
} }
FieldType::Bytes(_) => { FieldType::Bytes(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
num_vals += 1; num_vals += 1;
let bytes = value.as_bytes().ok_or_else(make_schema_error)?; let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes); term_buffer.set_bytes(bytes);
@@ -342,33 +314,21 @@ impl SegmentWriter {
FieldType::JsonObject(json_options) => { FieldType::JsonObject(json_options) => {
let text_analyzer = let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize]; &mut self.per_field_text_analyzers[field.field_id() as usize];
let json_values_it = values.map(|value_access| { let json_values_it =
// Used to help with linting and type checking. values.map(|value| value.as_json().ok_or_else(make_schema_error));
let value_access = value_access as D::Value<'_>; index_json_values(
let value = value_access.as_value();
match value {
ReferenceValue::Object(object_iter) => Ok(object_iter),
_ => Err(make_schema_error()),
}
});
index_json_values::<D::Value<'_>>(
doc_id, doc_id,
json_values_it, json_values_it,
text_analyzer, text_analyzer,
json_options.is_expand_dots_enabled(), json_options.is_expand_dots_enabled(),
term_buffer, term_buffer,
postings_writer, postings_writer,
&mut self.json_path_writer,
ctx, ctx,
)?; )?;
} }
FieldType::IpAddr(_) => { FieldType::IpAddr(_) => {
let mut num_vals = 0; let mut num_vals = 0;
for value_access in values { for value in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
num_vals += 1; num_vals += 1;
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?; let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
term_buffer.set_ip_addr(ip_addr); term_buffer.set_ip_addr(ip_addr);
@@ -386,10 +346,7 @@ impl SegmentWriter {
/// Indexes a new document /// Indexes a new document
/// ///
/// As a user, you should rather use `IndexWriter`'s add_document. /// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document<D: Document>( pub fn add_document(&mut self, add_operation: AddOperation) -> crate::Result<()> {
&mut self,
add_operation: AddOperation<D>,
) -> crate::Result<()> {
let AddOperation { document, opstamp } = add_operation; let AddOperation { document, opstamp } = add_operation;
self.doc_opstamps.push(opstamp); self.doc_opstamps.push(opstamp);
self.fast_field_writers.add_document(&document)?; self.fast_field_writers.add_document(&document)?;
@@ -427,7 +384,6 @@ impl SegmentWriter {
/// ///
/// `doc_id_map` is used to map to the new doc_id order. /// `doc_id_map` is used to map to the new doc_id order.
fn remap_and_write( fn remap_and_write(
schema: Schema,
per_field_postings_writers: &PerFieldPostingsWriter, per_field_postings_writers: &PerFieldPostingsWriter,
ctx: IndexingContext, ctx: IndexingContext,
fast_field_writers: FastFieldsWriter, fast_field_writers: FastFieldsWriter,
@@ -445,7 +401,6 @@ fn remap_and_write(
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
serialize_postings( serialize_postings(
ctx, ctx,
schema,
per_field_postings_writers, per_field_postings_writers,
fieldnorm_readers, fieldnorm_readers,
doc_id_map, doc_id_map,
@@ -490,37 +445,32 @@ fn remap_and_write(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::collections::BTreeMap;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use tempfile::TempDir; use tempfile::TempDir;
use crate::collector::{Count, TopDocs}; use super::compute_initial_table_size;
use crate::collector::Count;
use crate::core::json_utils::JsonTermWriter; use crate::core::json_utils::JsonTermWriter;
use crate::directory::RamDirectory; use crate::directory::RamDirectory;
use crate::postings::TermInfo; use crate::postings::TermInfo;
use crate::query::{PhraseQuery, QueryParser}; use crate::query::PhraseQuery;
use crate::schema::document::Value;
use crate::schema::{ use crate::schema::{
Document, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING, TEXT,
TEXT,
}; };
use crate::store::{Compressor, StoreReader, StoreWriter}; use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339; use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime; use crate::time::OffsetDateTime;
use crate::tokenizer::{PreTokenizedString, Token}; use crate::tokenizer::{PreTokenizedString, Token};
use crate::{ use crate::{
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, Postings, TantivyDocument, DateTime, Directory, DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED,
Term, TERMINATED,
}; };
#[test] #[test]
#[cfg(not(feature = "compare_hash_only"))]
fn test_hashmap_size() { fn test_hashmap_size() {
use super::compute_initial_table_size; assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 11);
assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 12); assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 14);
assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 15); assert_eq!(compute_initial_table_size(15_000_000).unwrap(), 1 << 18);
assert_eq!(compute_initial_table_size(15_000_000).unwrap(), 1 << 19);
assert_eq!(compute_initial_table_size(1_000_000_000).unwrap(), 1 << 19); assert_eq!(compute_initial_table_size(1_000_000_000).unwrap(), 1 << 19);
assert_eq!(compute_initial_table_size(4_000_000_000).unwrap(), 1 << 19); assert_eq!(compute_initial_table_size(4_000_000_000).unwrap(), 1 << 19);
} }
@@ -530,7 +480,7 @@ mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("title", TEXT | STORED); let text_field = schema_builder.add_text_field("title", TEXT | STORED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let mut doc = TantivyDocument::default(); let mut doc = Document::default();
let pre_tokenized_text = PreTokenizedString { let pre_tokenized_text = PreTokenizedString {
text: String::from("A"), text: String::from("A"),
tokens: vec![Token { tokens: vec![Token {
@@ -554,48 +504,11 @@ mod tests {
store_writer.close().unwrap(); store_writer.close().unwrap();
let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap(); let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
let doc = reader.get::<TantivyDocument>(0).unwrap(); let doc = reader.get(0).unwrap();
assert_eq!(doc.field_values().len(), 2); assert_eq!(doc.field_values().len(), 2);
assert_eq!(doc.field_values()[0].value().as_str(), Some("A")); assert_eq!(doc.field_values()[0].value().as_text(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_str(), Some("title")); assert_eq!(doc.field_values()[1].value().as_text(), Some("title"));
}
#[test]
fn test_simple_json_indexing() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer = index.writer_for_tests().unwrap();
writer
.add_document(doc!(json_field=>json!({"my_field": "b"})))
.unwrap();
writer
.add_document(doc!(json_field=>json!({"my_field": "a"})))
.unwrap();
writer
.add_document(doc!(json_field=>json!({"my_field": "b"})))
.unwrap();
writer.commit().unwrap();
let query_parser = QueryParser::for_index(&index, vec![json_field]);
let text_query = query_parser.parse_query("my_field:a").unwrap();
let score_docs: Vec<(_, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(4))
.unwrap();
assert_eq!(score_docs.len(), 1);
let text_query = query_parser.parse_query("my_field:b").unwrap();
let score_docs: Vec<(_, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(4))
.unwrap();
assert_eq!(score_docs.len(), 2);
} }
#[test] #[test]
@@ -626,13 +539,13 @@ mod tests {
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
let searcher = reader.searcher(); let searcher = reader.searcher();
let doc = searcher let doc = searcher
.doc::<TantivyDocument>(DocAddress { .doc(DocAddress {
segment_ord: 0u32, segment_ord: 0u32,
doc_id: 0u32, doc_id: 0u32,
}) })
.unwrap(); .unwrap();
let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>( let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
&doc.to_json(&schema), &schema.to_json(&doc),
) )
.unwrap() .unwrap()
.get("json") .get("json")
@@ -762,10 +675,10 @@ mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT); let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let mut doc = TantivyDocument::default(); let mut doc = Document::default();
let json_val: BTreeMap<String, crate::schema::OwnedValue> = let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(r#"{"mykey": "repeated token token"}"#).unwrap(); serde_json::from_str(r#"{"mykey": "repeated token token"}"#).unwrap();
doc.add_object(json_field, json_val); doc.add_json_object(json_field, json_val);
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap(); let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc).unwrap(); writer.add_document(doc).unwrap();
@@ -889,10 +802,11 @@ mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT); let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let doc = TantivyDocument::parse_json(&schema, r#"{"text": [ "bbb", "aaa", "", "aaa"]}"#) let doc = schema
.parse_document(r#"{"text": [ "bbb", "aaa", "", "aaa"]}"#)
.unwrap(); .unwrap();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc).unwrap();
// On debug this did panic on the underflow // On debug this did panic on the underflow
index_writer.commit().unwrap(); index_writer.commit().unwrap();
@@ -917,7 +831,7 @@ mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT); let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let mut doc = TantivyDocument::default(); let mut doc = Document::default();
// This is a bit of a contrived example. // This is a bit of a contrived example.
let tokens = PreTokenizedString { let tokens = PreTokenizedString {
text: "roller-coaster".to_string(), text: "roller-coaster".to_string(),
@@ -932,7 +846,7 @@ mod tests {
doc.add_pre_tokenized_text(text, tokens.clone()); doc.add_pre_tokenized_text(text, tokens.clone());
doc.add_pre_tokenized_text(text, tokens); doc.add_pre_tokenized_text(text, tokens);
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
@@ -955,7 +869,7 @@ mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT); let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let mut doc = TantivyDocument::default(); let mut doc = Document::default();
// This is a bit of a contrived example. // This is a bit of a contrived example.
let tokens = PreTokenizedString { let tokens = PreTokenizedString {
text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life. text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life.
@@ -980,7 +894,7 @@ mod tests {
doc.add_pre_tokenized_text(text, tokens); doc.add_pre_tokenized_text(text, tokens);
doc.add_text(text, "hello"); doc.add_text(text, "hello");
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap(); index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
let reader = index.reader().unwrap(); let reader = index.reader().unwrap();
@@ -1016,7 +930,7 @@ mod tests {
let schema = index.schema(); let schema = index.schema();
let mut index_writer = index.writer(50_000_000).unwrap(); let mut index_writer = index.writer(50_000_000).unwrap();
let title = schema.get_field("title").unwrap(); let title = schema.get_field("title").unwrap();
let mut document = TantivyDocument::default(); let mut document = Document::default();
document.add_text(title, "The Old Man and the Sea"); document.add_text(title, "The Old Man and the Sea");
index_writer.add_document(document).unwrap(); index_writer.add_document(document).unwrap();
let error = index_writer.commit().unwrap_err(); let error = index_writer.commit().unwrap_err();

View File

@@ -21,7 +21,7 @@
//! # use tantivy::collector::TopDocs; //! # use tantivy::collector::TopDocs;
//! # use tantivy::query::QueryParser; //! # use tantivy::query::QueryParser;
//! # use tantivy::schema::*; //! # use tantivy::schema::*;
//! # use tantivy::{doc, DocAddress, Index, IndexWriter, Score}; //! # use tantivy::{doc, DocAddress, Index, Score};
//! # //! #
//! # fn main() { //! # fn main() {
//! # // Let's create a temporary directory for the //! # // Let's create a temporary directory for the
@@ -53,7 +53,7 @@
//! //!
//! // Here we use a buffer of 100MB that will be split //! // Here we use a buffer of 100MB that will be split
//! // between indexing threads. //! // between indexing threads.
//! let mut index_writer: IndexWriter = index.writer(100_000_000)?; //! let mut index_writer = index.writer(100_000_000)?;
//! //!
//! // Let's index one document! //! // Let's index one document!
//! index_writer.add_document(doc!( //! index_writer.add_document(doc!(
@@ -89,8 +89,8 @@
//! //!
//! for (_score, doc_address) in top_docs { //! for (_score, doc_address) in top_docs {
//! // Retrieve the actual content of documents given its `doc_address`. //! // Retrieve the actual content of documents given its `doc_address`.
//! let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?; //! let retrieved_doc = searcher.doc(doc_address)?;
//! println!("{}", retrieved_doc.to_json(&schema)); //! println!("{}", schema.to_json(&retrieved_doc));
//! } //! }
//! //!
//! # Ok(()) //! # Ok(())
@@ -103,48 +103,7 @@
//! the example code ( //! the example code (
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) / //! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
//! [source code](https://github.com/quickwit-oss/tantivy/blob/main/examples/basic_search.rs)) //! [source code](https://github.com/quickwit-oss/tantivy/blob/main/examples/basic_search.rs))
//!
//! # Tantivy Architecture Overview
//!
//! Tantivy is inspired by Lucene, the Architecture is very similar.
//!
//! ## Core Concepts
//!
//! - **[Index]**: A collection of segments. The top level entry point for tantivy users to search
//! and index data.
//!
//! - **[Segment]**: At the heart of Tantivy's indexing structure is the [Segment]. It contains
//! documents and indices and is the atomic unit of indexing and search.
//!
//! - **[Schema](schema)**: A schema is a set of fields in an index. Each field has a specific data
//! type and set of attributes.
//!
//! - **[IndexWriter]**: Responsible for creating and merging segments. It executes the indexing
//! pipeline including tokenization, creating indices, and storing the index in the
//! [Directory](directory).
//!
//! - **Searching**: [Searcher] searches the segments with anything that implements
//! [Query](query::Query) and merges the results. See the list of [supported
//! queries](query::Query#implementors). Custom queries are supported by implementing the
//! [Query](query::Query) trait.
//!
//! - **[Directory](directory)**: Abstraction over the storage where the index data is stored.
//!
//! - **[Tokenizer](tokenizer)**: Breaks down text into individual tokens. Users can implement or
//! use provided tokenizers.
//!
//! ## Architecture Flow
//!
//! 1. **Document Addition**: Users create documents according to the defined schema. The document's
//! fields are tokenized, processed, and added to the current segment. See
//! [Document](schema::document) for the structure and usage.
//!
//! 2. **Segment Creation**: Once the memory limit threshold is reached or a commit is called, the
//! segment is written to the Directory. Documents are searchable after `commit`.
//!
//! 3. **Merging**: To optimize space and search speed, segments might be merged. This operation is
//! performed in the background. Customize the merge behaviour via
//! [IndexWriter::set_merge_policy].
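A short sketch of the flow described above (assuming the 0.22-dev paths shown in this changeset, i.e. `tantivy::indexer::LogMergePolicy` and the explicit `IndexWriter` annotation; on 0.21 the equivalent re-exports differ slightly):

```rust
use tantivy::indexer::LogMergePolicy;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // 1. Documents are added through the writer (tokenization happens here).
    let mut writer: IndexWriter = index.writer(50_000_000)?;
    // 3. Background merging is steered through the merge policy.
    writer.set_merge_policy(Box::new(LogMergePolicy::default()));
    writer.add_document(doc!(title => "The Old Man and the Sea"))?;
    // 2. The segment becomes searchable once it is committed.
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    assert_eq!(searcher.num_docs(), 1);
    Ok(())
}
```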
#[cfg_attr(test, macro_use)] #[cfg_attr(test, macro_use)]
extern crate serde_json; extern crate serde_json;
#[macro_use] #[macro_use]
@@ -178,7 +137,7 @@ pub use crate::future_result::FutureResult;
pub type Result<T> = std::result::Result<T, TantivyError>; pub type Result<T> = std::result::Result<T, TantivyError>;
mod core; mod core;
pub mod indexer; mod indexer;
#[allow(unused_doc_comments)] #[allow(unused_doc_comments)]
pub mod error; pub mod error;
@@ -202,7 +161,8 @@ pub mod termdict;
mod reader; mod reader;
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer}; pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer};
pub mod snippet; mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator};
mod docset; mod docset;
use std::fmt; use std::fmt;
@@ -213,34 +173,26 @@ use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
pub use self::docset::{DocSet, TERMINATED}; pub use self::docset::{DocSet, TERMINATED};
#[deprecated(
since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use export from snippet module instead"
)]
pub use self::snippet::{Snippet, SnippetGenerator};
#[doc(hidden)] #[doc(hidden)]
pub use crate::core::json_utils; pub use crate::core::json_utils;
pub use crate::core::{ pub use crate::core::{
merge_field_meta_data, Executor, FieldMetadata, Index, IndexBuilder, IndexMeta, IndexSettings, Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader,
IndexSortByField, InvertedIndexReader, Order, Searcher, SearcherGeneration, Segment, Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta,
SegmentComponent, SegmentId, SegmentMeta, SegmentReader, SingleSegmentIndexWriter, SegmentReader, SingleSegmentIndexWriter,
}; };
pub use crate::directory::Directory; pub use crate::directory::Directory;
pub use crate::indexer::IndexWriter; pub use crate::indexer::operation::UserOperation;
#[deprecated( pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, PreparedCommit};
since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use export from indexer module instead"
)]
pub use crate::indexer::{merge_filtered_segments, merge_indices, PreparedCommit};
pub use crate::postings::Postings; pub use crate::postings::Postings;
#[allow(deprecated)] #[allow(deprecated)]
pub use crate::schema::DatePrecision; pub use crate::schema::DatePrecision;
pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocument, Term}; pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
/// Index format version. /// Index format version.
const INDEX_FORMAT_VERSION: u32 = 6; const INDEX_FORMAT_VERSION: u32 = 5;
/// Oldest index format version this tantivy version can read.
const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4; #[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
/// Structure version for the index. /// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)] #[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -390,9 +342,8 @@ pub mod tests {
use crate::docset::{DocSet, TERMINATED}; use crate::docset::{DocSet, TERMINATED};
use crate::merge_policy::NoMergePolicy; use crate::merge_policy::NoMergePolicy;
use crate::query::BooleanQuery; use crate::query::BooleanQuery;
use crate::schema::document::Value;
use crate::schema::*; use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, Postings, ReloadPolicy}; use crate::{DateTime, DocAddress, Index, Postings, ReloadPolicy};
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() { pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
@@ -463,7 +414,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?; let index = Index::create_from_tempdir(schema)?;
// writing the segment // writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
{ {
let doc = doc!(text_field=>"af b"); let doc = doc!(text_field=>"af b");
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
@@ -485,7 +436,7 @@ pub mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?; index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"))?;
@@ -512,7 +463,7 @@ pub mod tests {
let title_field = schema_builder.add_text_field("title", TEXT); let title_field = schema_builder.add_text_field("title", TEXT);
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?; index_writer.commit()?;
let index_reader = index.reader()?; let index_reader = index.reader()?;
@@ -534,7 +485,7 @@ pub mod tests {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!())?; index_writer.add_document(doc!())?;
index_writer.add_document(doc!(text_field=>"a b"))?; index_writer.add_document(doc!(text_field=>"a b"))?;
@@ -577,7 +528,7 @@ pub mod tests {
.unwrap(); .unwrap();
{ {
// writing the segment // writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
// 0 // 0
index_writer.add_document(doc!(text_field=>"a b"))?; index_writer.add_document(doc!(text_field=>"a b"))?;
// 1 // 1
@@ -624,7 +575,7 @@ pub mod tests {
} }
{ {
// writing the segment // writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
// 0 // 0
index_writer.add_document(doc!(text_field=>"a b"))?; index_writer.add_document(doc!(text_field=>"a b"))?;
// 1 // 1
@@ -661,7 +612,7 @@ pub mod tests {
} }
{ {
// writing the segment // writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"))?; index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?; index_writer.rollback()?;
@@ -711,7 +662,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64))?; index_writer.add_document(doc!(field=>1u64))?;
index_writer.commit()?; index_writer.commit()?;
let reader = index.reader()?; let reader = index.reader()?;
@@ -734,7 +685,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
let negative_val = -1i64; let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val))?; index_writer.add_document(doc!(value_field => negative_val))?;
index_writer.commit()?; index_writer.commit()?;
@@ -758,7 +709,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
let val = std::f64::consts::PI; let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val))?; index_writer.add_document(doc!(value_field => val))?;
index_writer.commit()?; index_writer.commit()?;
@@ -782,7 +733,7 @@ pub mod tests {
let absent_field = schema_builder.add_text_field("absent_text", TEXT); let absent_field = schema_builder.add_text_field("absent_text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"))?; index_writer.add_document(doc!(text_field=>"a"))?;
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
let reader = index.reader()?; let reader = index.reader()?;
@@ -805,7 +756,7 @@ pub mod tests {
.try_into()?; .try_into()?;
// writing the segment // writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"63"))?; index_writer.add_document(doc!(text_field=>"63"))?;
index_writer.add_document(doc!(text_field=>"70"))?; index_writer.add_document(doc!(text_field=>"70"))?;
index_writer.add_document(doc!(text_field=>"34"))?; index_writer.add_document(doc!(text_field=>"34"))?;
@@ -830,7 +781,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af bc bc"))?; index_writer.add_document(doc!(text_field=>"af af af bc bc"))?;
index_writer.commit()?; index_writer.commit()?;
} }
@@ -862,7 +813,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let reader = index.reader()?; let reader = index.reader()?;
// writing the segment // writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af b"))?; index_writer.add_document(doc!(text_field=>"af af af b"))?;
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!(text_field=>"a b c d"))?; index_writer.add_document(doc!(text_field=>"a b c d"))?;
@@ -926,7 +877,7 @@ pub mod tests {
.try_into()?; .try_into()?;
assert_eq!(reader.searcher().num_docs(), 0u64); assert_eq!(reader.searcher().num_docs(), 0u64);
// writing the segment // writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af b"))?; index_writer.add_document(doc!(text_field=>"af b"))?;
index_writer.add_document(doc!(text_field=>"a b c"))?; index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!(text_field=>"a b c d"))?; index_writer.add_document(doc!(text_field=>"a b c d"))?;
@@ -1034,13 +985,13 @@ pub mod tests {
text_field => "some other value", text_field => "some other value",
other_text_field => "short"); other_text_field => "short");
assert_eq!(document.len(), 3); assert_eq!(document.len(), 3);
let values: Vec<&OwnedValue> = document.get_all(text_field).collect(); let values: Vec<&Value> = document.get_all(text_field).collect();
assert_eq!(values.len(), 2); assert_eq!(values.len(), 2);
assert_eq!(values[0].as_str(), Some("tantivy")); assert_eq!(values[0].as_text(), Some("tantivy"));
assert_eq!(values[1].as_str(), Some("some other value")); assert_eq!(values[1].as_text(), Some("some other value"));
let values: Vec<&OwnedValue> = document.get_all(other_text_field).collect(); let values: Vec<&Value> = document.get_all(other_text_field).collect();
assert_eq!(values.len(), 1); assert_eq!(values.len(), 1);
assert_eq!(values[0].as_str(), Some("short")); assert_eq!(values[0].as_text(), Some("short"));
} }
#[test] #[test]
@@ -1054,7 +1005,7 @@ pub mod tests {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
{ {
let document = let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64); doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
@@ -1120,7 +1071,7 @@ pub mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let index_reader = index.reader()?; let index_reader = index.reader()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT { for doc_id in 0u64..DOC_COUNT {
@@ -1173,7 +1124,7 @@ pub mod tests {
let body = builder.add_text_field("body", TEXT | STORED); let body = builder.add_text_field("body", TEXT | STORED);
let schema = builder.build(); let schema = builder.build();
let index = Index::create_in_dir(&index_path, schema)?; let index = Index::create_in_dir(&index_path, schema)?;
let mut writer: IndexWriter = index.writer(50_000_000)?; let mut writer = index.writer(50_000_000)?;
writer.set_merge_policy(Box::new(NoMergePolicy)); writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..5000 { for _ in 0..5000 {
writer.add_document(doc!(body => "foo"))?; writer.add_document(doc!(body => "foo"))?;

View File

@@ -45,12 +45,12 @@
macro_rules! doc( macro_rules! doc(
() => { () => {
{ {
($crate::TantivyDocument::default()) ($crate::Document::default())
} }
}; // avoids a warning due to the useless `mut`. }; // avoids a warning due to the useless `mut`.
($($field:expr => $value:expr),*) => { ($($field:expr => $value:expr),*) => {
{ {
let mut document = $crate::TantivyDocument::default(); let mut document = $crate::Document::default();
$( $(
document.add_field_value($field, $value); document.add_field_value($field, $value);
)* )*
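The doc! macro in this hunk just builds a default document and forwards each field => value pair to add_field_value. A minimal usage sketch against tantivy's public API (the field names and literal values below are invented for illustration, and tantivy itself is assumed as a dependency):

use tantivy::doc;
use tantivy::schema::{Schema, STORED, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let body = schema_builder.add_text_field("body", TEXT);
    let _schema = schema_builder.build();

    // Expands to a default document plus one add_field_value call per pair.
    let document = doc!(title => "Of Mice and Men", body => "A few miles south of Soledad");
    assert_eq!(document.len(), 2);
}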

View File

@@ -92,7 +92,7 @@ impl PositionReader {
// that block is bitpacked. // that block is bitpacked.
let bit_width = bit_widths[block_rel_id]; let bit_width = bit_widths[block_rel_id];
self.block_decoder self.block_decoder
.uncompress_block_unsorted(compressed_data, bit_width, false); .uncompress_block_unsorted(compressed_data, bit_width);
} else { } else {
// that block is vint encoded. // that block is vint encoded.
self.block_decoder self.block_decoder

View File

@@ -62,9 +62,8 @@ impl<W: io::Write> PositionSerializer<W> {
return; return;
} }
if self.block.len() == COMPRESSION_BLOCK_SIZE { if self.block.len() == COMPRESSION_BLOCK_SIZE {
let (bit_width, block_encoded): (u8, &[u8]) = self let (bit_width, block_encoded): (u8, &[u8]) =
.block_encoder self.block_encoder.compress_block_unsorted(&self.block[..]);
.compress_block_unsorted(&self.block[..], false);
self.bit_widths.push(bit_width); self.bit_widths.push(bit_width);
self.positions_buffer.extend(block_encoded); self.positions_buffer.extend(block_encoded);
} else { } else {

View File

@@ -24,13 +24,13 @@ fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
#[derive(Clone)] #[derive(Clone)]
pub struct BlockSegmentPostings { pub struct BlockSegmentPostings {
pub(crate) doc_decoder: BlockDecoder, pub(crate) doc_decoder: BlockDecoder,
block_loaded: bool, loaded_offset: usize,
freq_decoder: BlockDecoder, freq_decoder: BlockDecoder,
freq_reading_option: FreqReadingOption, freq_reading_option: FreqReadingOption,
block_max_score_cache: Option<Score>, block_max_score_cache: Option<Score>,
doc_freq: u32, doc_freq: u32,
data: OwnedBytes, data: OwnedBytes,
skip_reader: SkipReader, pub(crate) skip_reader: SkipReader,
} }
fn decode_bitpacked_block( fn decode_bitpacked_block(
@@ -40,16 +40,10 @@ fn decode_bitpacked_block(
doc_offset: DocId, doc_offset: DocId,
doc_num_bits: u8, doc_num_bits: u8,
tf_num_bits: u8, tf_num_bits: u8,
strict_delta: bool,
) { ) {
let num_consumed_bytes = let num_consumed_bytes = doc_decoder.uncompress_block_sorted(data, doc_offset, doc_num_bits);
doc_decoder.uncompress_block_sorted(data, doc_offset, doc_num_bits, strict_delta);
if let Some(freq_decoder) = freq_decoder_opt { if let Some(freq_decoder) = freq_decoder_opt {
freq_decoder.uncompress_block_unsorted( freq_decoder.uncompress_block_unsorted(&data[num_consumed_bytes..], tf_num_bits);
&data[num_consumed_bytes..],
tf_num_bits,
strict_delta,
);
} }
} }
@@ -63,15 +57,11 @@ fn decode_vint_block(
let num_consumed_bytes = let num_consumed_bytes =
doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs, TERMINATED); doc_decoder.uncompress_vint_sorted(data, doc_offset, num_vint_docs, TERMINATED);
if let Some(freq_decoder) = freq_decoder_opt { if let Some(freq_decoder) = freq_decoder_opt {
// if it's a json term with freq, containing less than 256 docs, we can reach here thinking freq_decoder.uncompress_vint_unsorted(
// we have a freq, despite not really having one. &data[num_consumed_bytes..],
if data.len() > num_consumed_bytes { num_vint_docs,
freq_decoder.uncompress_vint_unsorted( TERMINATED,
&data[num_consumed_bytes..], );
num_vint_docs,
TERMINATED,
);
}
} }
} }
@@ -88,46 +78,28 @@ fn split_into_skips_and_postings(
} }
impl BlockSegmentPostings { impl BlockSegmentPostings {
/// Opens a `BlockSegmentPostings`.
/// `doc_freq` is the number of documents in the posting list.
/// `record_option` represents the amount of data available according to the schema.
/// `requested_option` is the amount of data requested by the user.
    /// If, for instance, we do not request term frequencies, this function will not decompress
/// term frequency blocks.
pub(crate) fn open( pub(crate) fn open(
doc_freq: u32, doc_freq: u32,
data: FileSlice, data: FileSlice,
mut record_option: IndexRecordOption, record_option: IndexRecordOption,
requested_option: IndexRecordOption, requested_option: IndexRecordOption,
) -> io::Result<BlockSegmentPostings> { ) -> io::Result<BlockSegmentPostings> {
let bytes = data.read_bytes()?;
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?;
let skip_reader = match skip_data_opt {
Some(skip_data) => {
let block_count = doc_freq as usize / COMPRESSION_BLOCK_SIZE;
// 8 is the minimum size of a block with frequency (can be more if pos are stored
// too)
if skip_data.len() < 8 * block_count {
// the field might be encoded with frequency, but this term in particular isn't.
                    // This can happen for a JSON field with term frequencies:
// - text terms are encoded with term freqs.
// - numerical terms are encoded without term freqs.
record_option = IndexRecordOption::Basic;
}
SkipReader::new(skip_data, doc_freq, record_option)
}
None => SkipReader::new(OwnedBytes::empty(), doc_freq, record_option),
};
let freq_reading_option = match (record_option, requested_option) { let freq_reading_option = match (record_option, requested_option) {
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq, (IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq, (_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
(_, _) => FreqReadingOption::ReadFreq, (_, _) => FreqReadingOption::ReadFreq,
}; };
let bytes = data.read_bytes()?;
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, bytes)?;
let skip_reader = match skip_data_opt {
Some(skip_data) => SkipReader::new(skip_data, doc_freq, record_option),
None => SkipReader::new(OwnedBytes::empty(), doc_freq, record_option),
};
let mut block_segment_postings = BlockSegmentPostings { let mut block_segment_postings = BlockSegmentPostings {
doc_decoder: BlockDecoder::with_val(TERMINATED), doc_decoder: BlockDecoder::with_val(TERMINATED),
block_loaded: false, loaded_offset: usize::MAX,
freq_decoder: BlockDecoder::with_val(1), freq_decoder: BlockDecoder::with_val(1),
freq_reading_option, freq_reading_option,
block_max_score_cache: None, block_max_score_cache: None,
@@ -197,7 +169,7 @@ impl BlockSegmentPostings {
split_into_skips_and_postings(doc_freq, postings_data)?; split_into_skips_and_postings(doc_freq, postings_data)?;
self.data = postings_data; self.data = postings_data;
self.block_max_score_cache = None; self.block_max_score_cache = None;
self.block_loaded = false; self.loaded_offset = usize::MAX;
if let Some(skip_data) = skip_data_opt { if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data, doc_freq); self.skip_reader.reset(skip_data, doc_freq);
} else { } else {
@@ -293,23 +265,22 @@ impl BlockSegmentPostings {
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) { pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
if self.skip_reader.seek(target_doc) { if self.skip_reader.seek(target_doc) {
self.block_max_score_cache = None; self.block_max_score_cache = None;
self.block_loaded = false;
} }
} }
pub(crate) fn block_is_loaded(&self) -> bool { pub(crate) fn block_is_loaded(&self) -> bool {
self.block_loaded self.loaded_offset == self.skip_reader.byte_offset()
} }
pub(crate) fn load_block(&mut self) { pub(crate) fn load_block(&mut self) {
let offset = self.skip_reader.byte_offset(); let offset = self.skip_reader.byte_offset();
if self.block_is_loaded() { if self.loaded_offset == offset {
return; return;
} }
self.loaded_offset = offset;
match self.skip_reader.block_info() { match self.skip_reader.block_info() {
BlockInfo::BitPacked { BlockInfo::BitPacked {
doc_num_bits, doc_num_bits,
strict_delta_encoded,
tf_num_bits, tf_num_bits,
.. ..
} => { } => {
@@ -324,7 +295,6 @@ impl BlockSegmentPostings {
self.skip_reader.last_doc_in_previous_block, self.skip_reader.last_doc_in_previous_block,
doc_num_bits, doc_num_bits,
tf_num_bits, tf_num_bits,
strict_delta_encoded,
); );
} }
BlockInfo::VInt { num_docs } => { BlockInfo::VInt { num_docs } => {
@@ -348,13 +318,13 @@ impl BlockSegmentPostings {
); );
} }
} }
self.block_loaded = true;
} }
/// Advance to the next block. /// Advance to the next block.
///
/// Returns false if and only if there is no remaining block.
pub fn advance(&mut self) { pub fn advance(&mut self) {
self.skip_reader.advance(); self.skip_reader.advance();
self.block_loaded = false;
self.block_max_score_cache = None; self.block_max_score_cache = None;
self.load_block(); self.load_block();
} }
@@ -363,7 +333,7 @@ impl BlockSegmentPostings {
pub fn empty() -> BlockSegmentPostings { pub fn empty() -> BlockSegmentPostings {
BlockSegmentPostings { BlockSegmentPostings {
doc_decoder: BlockDecoder::with_val(TERMINATED), doc_decoder: BlockDecoder::with_val(TERMINATED),
block_loaded: true, loaded_offset: 0,
freq_decoder: BlockDecoder::with_val(1), freq_decoder: BlockDecoder::with_val(1),
freq_reading_option: FreqReadingOption::NoFreq, freq_reading_option: FreqReadingOption::NoFreq,
block_max_score_cache: None, block_max_score_cache: None,
@@ -372,10 +342,6 @@ impl BlockSegmentPostings {
skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic), skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic),
} }
} }
pub(crate) fn skip_reader(&self) -> &SkipReader {
&self.skip_reader
}
} }
#[cfg(test)] #[cfg(test)]
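The open hunk above reduces the frequency-decoding decision to a match on what the schema stored (record_option) versus what the caller requested (requested_option); the added skip-data length check only downgrades record_option for JSON terms that happen to have been written without frequencies. A self-contained sketch of just that decision table, with local enums standing in for tantivy's types:

#[derive(Clone, Copy)]
enum IndexRecordOption {
    Basic,
    WithFreqs,
    WithFreqsAndPositions,
}

#[derive(Debug, PartialEq)]
enum FreqReadingOption {
    NoFreq,   // the index never stored term frequencies
    SkipFreq, // stored, but the caller did not ask for them
    ReadFreq, // stored and requested: decode them
}

fn freq_reading(record: IndexRecordOption, requested: IndexRecordOption) -> FreqReadingOption {
    match (record, requested) {
        (IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
        (_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
        (_, _) => FreqReadingOption::ReadFreq,
    }
}

fn main() {
    assert_eq!(
        freq_reading(IndexRecordOption::WithFreqs, IndexRecordOption::Basic),
        FreqReadingOption::SkipFreq
    );
    assert_eq!(
        freq_reading(IndexRecordOption::WithFreqs, IndexRecordOption::WithFreqsAndPositions),
        FreqReadingOption::ReadFreq
    );
}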

View File

@@ -33,40 +33,14 @@ impl BlockEncoder {
} }
pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> (u8, &[u8]) { pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> (u8, &[u8]) {
// if offset is zero, convert it to None. This is correct as long as we do the same when let num_bits = self.bitpacker.num_bits_sorted(offset, block);
// decompressing. It's required in case the block starts with an actual zero.
let offset = if offset == 0u32 { None } else { Some(offset) };
let num_bits = self.bitpacker.num_bits_strictly_sorted(offset, block);
let written_size = let written_size =
self.bitpacker self.bitpacker
.compress_strictly_sorted(offset, block, &mut self.output[..], num_bits); .compress_sorted(offset, block, &mut self.output[..], num_bits);
(num_bits, &self.output[..written_size]) (num_bits, &self.output[..written_size])
} }
/// Compress a single block of unsorted numbers. pub fn compress_block_unsorted(&mut self, block: &[u32]) -> (u8, &[u8]) {
///
    /// If `minus_one_encoded` is set, each value must be >= 1, and will be encoded in a slightly
/// more compact format. This is useful for some values where 0 isn't a correct value, such
/// as term frequency, but isn't correct for some usages like position lists, where 0 can
/// appear.
pub fn compress_block_unsorted(
&mut self,
block: &[u32],
minus_one_encoded: bool,
) -> (u8, &[u8]) {
debug_assert!(!minus_one_encoded || !block.contains(&0));
let mut block_minus_one = [0; COMPRESSION_BLOCK_SIZE];
let block = if minus_one_encoded {
for (elem_min_one, elem) in block_minus_one.iter_mut().zip(block) {
*elem_min_one = elem - 1;
}
&block_minus_one
} else {
block
};
let num_bits = self.bitpacker.num_bits(block); let num_bits = self.bitpacker.num_bits(block);
let written_size = self let written_size = self
.bitpacker .bitpacker
@@ -97,55 +71,21 @@ impl BlockDecoder {
} }
} }
/// Decompress block of sorted integers.
///
    /// `strict_delta` depends on what encoding was used. Older versions of tantivy never use strict
/// deltas, newer versions always use them.
pub fn uncompress_block_sorted( pub fn uncompress_block_sorted(
&mut self, &mut self,
compressed_data: &[u8], compressed_data: &[u8],
offset: u32, offset: u32,
num_bits: u8, num_bits: u8,
strict_delta: bool,
) -> usize {
if strict_delta {
let offset = std::num::NonZeroU32::new(offset).map(std::num::NonZeroU32::get);
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker.decompress_strictly_sorted(
offset,
compressed_data,
&mut self.output,
num_bits,
)
} else {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress_sorted(offset, compressed_data, &mut self.output, num_bits)
}
}
/// Decompress block of unsorted integers.
///
    /// `minus_one_encoded` depends on what encoding was used. Older versions of tantivy never use
    /// that encoding. Newer versions use it for some structures, but not all. See the corresponding
/// call to `BlockEncoder::compress_block_unsorted`.
pub fn uncompress_block_unsorted(
&mut self,
compressed_data: &[u8],
num_bits: u8,
minus_one_encoded: bool,
) -> usize { ) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE; self.output_len = COMPRESSION_BLOCK_SIZE;
let res = self self.bitpacker
.bitpacker .decompress_sorted(offset, compressed_data, &mut self.output, num_bits)
.decompress(compressed_data, &mut self.output, num_bits); }
if minus_one_encoded {
for val in &mut self.output { pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
*val += 1; self.output_len = COMPRESSION_BLOCK_SIZE;
} self.bitpacker
} .decompress(compressed_data, &mut self.output, num_bits)
res
} }
#[inline] #[inline]
@@ -278,8 +218,7 @@ pub mod tests {
let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 0); let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 0);
let mut decoder = BlockDecoder::default(); let mut decoder = BlockDecoder::default();
{ {
let consumed_num_bytes = let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0, num_bits);
decoder.uncompress_block_sorted(compressed_data, 0, num_bits, true);
assert_eq!(consumed_num_bytes, compressed_data.len()); assert_eq!(consumed_num_bytes, compressed_data.len());
} }
for i in 0..128 { for i in 0..128 {
@@ -294,8 +233,7 @@ pub mod tests {
let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 10); let (num_bits, compressed_data) = encoder.compress_block_sorted(&vals, 10);
let mut decoder = BlockDecoder::default(); let mut decoder = BlockDecoder::default();
{ {
let consumed_num_bytes = let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10, num_bits);
decoder.uncompress_block_sorted(compressed_data, 10, num_bits, true);
assert_eq!(consumed_num_bytes, compressed_data.len()); assert_eq!(consumed_num_bytes, compressed_data.len());
} }
for i in 0..128 { for i in 0..128 {
@@ -314,8 +252,7 @@ pub mod tests {
compressed.push(173u8); compressed.push(173u8);
let mut decoder = BlockDecoder::default(); let mut decoder = BlockDecoder::default();
{ {
let consumed_num_bytes = let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10, num_bits);
decoder.uncompress_block_sorted(&compressed, 10, num_bits, true);
assert_eq!(consumed_num_bytes, compressed.len() - 1); assert_eq!(consumed_num_bytes, compressed.len() - 1);
assert_eq!(compressed[consumed_num_bytes], 173u8); assert_eq!(compressed[consumed_num_bytes], 173u8);
} }
@@ -326,25 +263,21 @@ pub mod tests {
#[test] #[test]
fn test_encode_unsorted_block_with_junk() { fn test_encode_unsorted_block_with_junk() {
for minus_one_encode in [false, true] { let mut compressed: Vec<u8> = Vec::new();
let mut compressed: Vec<u8> = Vec::new(); let n = 128;
let n = 128; let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect(); let mut encoder = BlockEncoder::default();
let mut encoder = BlockEncoder::default(); let (num_bits, compressed_data) = encoder.compress_block_unsorted(&vals);
let (num_bits, compressed_data) = compressed.extend_from_slice(compressed_data);
encoder.compress_block_unsorted(&vals, minus_one_encode); compressed.push(173u8);
compressed.extend_from_slice(compressed_data); let mut decoder = BlockDecoder::default();
compressed.push(173u8); {
let mut decoder = BlockDecoder::default(); let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed, num_bits);
{ assert_eq!(consumed_num_bytes + 1, compressed.len());
let consumed_num_bytes = assert_eq!(compressed[consumed_num_bytes], 173u8);
decoder.uncompress_block_unsorted(&compressed, num_bits, minus_one_encode); }
assert_eq!(consumed_num_bytes + 1, compressed.len()); for i in 0..n {
assert_eq!(compressed[consumed_num_bytes], 173u8); assert_eq!(vals[i], decoder.output(i));
}
for i in 0..n {
assert_eq!(vals[i], decoder.output(i));
}
} }
} }
@@ -411,7 +344,7 @@ mod bench {
let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32); let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::default(); let mut decoder = BlockDecoder::default();
b.iter(|| { b.iter(|| {
decoder.uncompress_block_sorted(compressed, 0u32, num_bits, true); decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
}); });
} }
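The idea behind the two encodings this file switches to is roughly this: doc ids inside a block are strictly increasing, so each gap is at least 1 and compress_strictly_sorted can encode gap - 1 per value, with a zero offset mapped to None so a block that really starts at doc 0 still round-trips. Term frequencies are never 0, so they get the same minus-one treatment, while positions keep the plain encoding because 0 is a legal position there. A minimal illustration of both ideas, assuming plain Vec<u32> output rather than the real bitpacked bytes:

// Strict deltas: every gap is at least 1, so gap - 1 is stored.
// `offset` is the last doc id of the previous block, or None for the first block.
fn strict_delta_encode(offset: Option<u32>, docs: &[u32]) -> Vec<u32> {
    let mut prev: i64 = offset.map(i64::from).unwrap_or(-1);
    docs.iter()
        .map(|&doc| {
            let delta = (i64::from(doc) - prev - 1) as u32;
            prev = i64::from(doc);
            delta
        })
        .collect()
}

fn strict_delta_decode(offset: Option<u32>, deltas: &[u32]) -> Vec<u32> {
    let mut prev: i64 = offset.map(i64::from).unwrap_or(-1);
    deltas
        .iter()
        .map(|&delta| {
            let doc = (prev + 1 + i64::from(delta)) as u32;
            prev = i64::from(doc);
            doc
        })
        .collect()
}

// Minus-one encoding: valid for term frequencies (always >= 1), not for positions.
fn tf_minus_one_encode(freqs: &[u32]) -> Vec<u32> {
    freqs.iter().map(|&tf| tf - 1).collect()
}

fn main() {
    let docs = [0u32, 1, 5, 9];
    let deltas = strict_delta_encode(None, &docs);
    assert_eq!(deltas, vec![0, 0, 3, 3]);
    assert_eq!(strict_delta_decode(None, &deltas), docs.to_vec());
    assert_eq!(tf_minus_one_encode(&[1, 3, 2]), vec![0, 2, 1]);
}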

View File

@@ -1,7 +1,5 @@
use stacker::{ArenaHashMap, MemoryArena}; use stacker::{ArenaHashMap, MemoryArena};
use crate::indexer::path_to_unordered_id::PathToUnorderedId;
/// IndexingContext contains all of the transient memory arenas /// IndexingContext contains all of the transient memory arenas
/// required for building the inverted index. /// required for building the inverted index.
pub(crate) struct IndexingContext { pub(crate) struct IndexingContext {
@@ -10,7 +8,6 @@ pub(crate) struct IndexingContext {
pub term_index: ArenaHashMap, pub term_index: ArenaHashMap,
/// Arena is a memory arena that stores posting lists / term frequencies / positions. /// Arena is a memory arena that stores posting lists / term frequencies / positions.
pub arena: MemoryArena, pub arena: MemoryArena,
pub path_to_unordered_id: PathToUnorderedId,
} }
impl IndexingContext { impl IndexingContext {
@@ -20,7 +17,6 @@ impl IndexingContext {
IndexingContext { IndexingContext {
arena: MemoryArena::default(), arena: MemoryArena::default(),
term_index, term_index,
path_to_unordered_id: PathToUnorderedId::default(),
} }
} }

View File

@@ -3,18 +3,13 @@ use std::io;
use stacker::Addr; use stacker::Addr;
use crate::indexer::doc_id_mapping::DocIdMapping; use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::path_to_unordered_id::OrderedPathId;
use crate::postings::postings_writer::SpecializedPostingsWriter; use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder}; use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter}; use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::{Field, Type, JSON_END_OF_PATH}; use crate::schema::Type;
use crate::tokenizer::TokenStream; use crate::tokenizer::TokenStream;
use crate::{DocId, Term}; use crate::{DocId, Term};
/// The `JsonPostingsWriter` is odd in that it relies on a hidden contract:
///
/// `subscribe` is called directly to index non-text tokens, while
/// `index_text` is used to index text.
#[derive(Default)] #[derive(Default)]
pub(crate) struct JsonPostingsWriter<Rec: Recorder> { pub(crate) struct JsonPostingsWriter<Rec: Recorder> {
str_posting_writer: SpecializedPostingsWriter<Rec>, str_posting_writer: SpecializedPostingsWriter<Rec>,
@@ -59,24 +54,18 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
/// The actual serialization format is handled by the `PostingsSerializer`. /// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize( fn serialize(
&self, &self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)], term_addrs: &[(Term<&[u8]>, Addr)],
ordered_id_to_path: &[&str],
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext, ctx: &IndexingContext,
serializer: &mut FieldSerializer, serializer: &mut FieldSerializer,
) -> io::Result<()> { ) -> io::Result<()> {
let mut term_buffer = Term::with_capacity(48);
let mut buffer_lender = BufferLender::default(); let mut buffer_lender = BufferLender::default();
for (_field, path_id, term, addr) in term_addrs { for (term, addr) in term_addrs {
term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0)); if let Some(json_value) = term.value().as_json_value_bytes() {
term_buffer.append_bytes(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
term_buffer.append_bytes(&[JSON_END_OF_PATH]);
term_buffer.append_bytes(term);
if let Some(json_value) = term_buffer.value().as_json_value_bytes() {
let typ = json_value.typ(); let typ = json_value.typ();
if typ == Type::Str { if typ == Type::Str {
SpecializedPostingsWriter::<Rec>::serialize_one_term( SpecializedPostingsWriter::<Rec>::serialize_one_term(
term_buffer.serialized_value_bytes(), term,
*addr, *addr,
doc_id_map, doc_id_map,
&mut buffer_lender, &mut buffer_lender,
@@ -85,7 +74,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
)?; )?;
} else { } else {
SpecializedPostingsWriter::<DocIdRecorder>::serialize_one_term( SpecializedPostingsWriter::<DocIdRecorder>::serialize_one_term(
term_buffer.serialized_value_bytes(), term,
*addr, *addr,
doc_id_map, doc_id_map,
&mut buffer_lender, &mut buffer_lender,
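In the new serialize, each term arrives as a (Field, OrderedPathId, value bytes, Addr) tuple; the writer resolves the interned path id back to its string through ordered_id_to_path and rebuilds the full JSON term as path bytes, an end-of-path marker, then the already-encoded typed value bytes. A rough, self-contained sketch of that layout (the concrete JSON_END_OF_PATH byte is tantivy-internal, so the zero byte below is only a stand-in):

// Stand-in for tantivy's JSON_END_OF_PATH separator; the real value is internal to the crate.
const JSON_END_OF_PATH_STAND_IN: u8 = 0;

// Rebuild a JSON term value from an interned path id and its already-encoded typed value.
fn json_term_value(ordered_id_to_path: &[&str], path_id: usize, typed_value: &[u8]) -> Vec<u8> {
    let path = ordered_id_to_path[path_id];
    let mut buf = Vec::with_capacity(path.len() + 1 + typed_value.len());
    buf.extend_from_slice(path.as_bytes());
    buf.push(JSON_END_OF_PATH_STAND_IN);
    buf.extend_from_slice(typed_value);
    buf
}

fn main() {
    let ordered_id_to_path = ["attributes.color", "attributes.size"];
    let term = json_term_value(&ordered_id_to_path, 0, b"red");
    assert_eq!(term, b"attributes.color\0red".to_vec());
}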

View File

@@ -52,7 +52,7 @@ pub mod tests {
Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions, INDEXED, TEXT, Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions, INDEXED, TEXT,
}; };
use crate::tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN}; use crate::tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
use crate::{DocId, HasLen, IndexWriter, Score}; use crate::{DocId, HasLen, Score};
#[test] #[test]
pub fn test_position_write() -> crate::Result<()> { pub fn test_position_write() -> crate::Result<()> {
@@ -63,7 +63,7 @@ pub mod tests {
let mut segment = index.new_segment(); let mut segment = index.new_segment();
let mut posting_serializer = InvertedIndexSerializer::open(&mut segment)?; let mut posting_serializer = InvertedIndexSerializer::open(&mut segment)?;
let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4, None)?; let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4, None)?;
field_serializer.new_term("abc".as_bytes(), 12u32, true)?; field_serializer.new_term("abc".as_bytes(), 12u32)?;
for doc_id in 0u32..120u32 { for doc_id in 0u32..120u32 {
let delta_positions = vec![1, 2, 3, 2]; let delta_positions = vec![1, 2, 3, 2];
field_serializer.write_doc(doc_id, 4, &delta_positions); field_serializer.write_doc(doc_id, 4, &delta_positions);
@@ -432,7 +432,7 @@ pub mod tests {
// delete some of the documents // delete some of the documents
{ {
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.delete_term(term_0); index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
@@ -483,7 +483,7 @@ pub mod tests {
// delete everything else // delete everything else
{ {
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.delete_term(term_1); index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
@@ -568,8 +568,8 @@ mod bench {
use crate::docset::TERMINATED; use crate::docset::TERMINATED;
use crate::query::Intersection; use crate::query::Intersection;
use crate::schema::{Field, IndexRecordOption, Schema, TantivyDocument, Term, STRING}; use crate::schema::{Document, Field, IndexRecordOption, Schema, Term, STRING};
use crate::{tests, DocSet, Index, IndexWriter}; use crate::{tests, DocSet, Index};
pub static TERM_A: Lazy<Term> = Lazy::new(|| { pub static TERM_A: Lazy<Term> = Lazy::new(|| {
let field = Field::from_field_id(0); let field = Field::from_field_id(0);
@@ -598,9 +598,9 @@ mod bench {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000; let posting_list_size = 1_000_000;
{ {
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for _ in 0..posting_list_size { for _ in 0..posting_list_size {
let mut doc = TantivyDocument::default(); let mut doc = Document::default();
if rng.gen_bool(1f64 / 15f64) { if rng.gen_bool(1f64 / 15f64) {
doc.add_text(text_field, "a"); doc.add_text(text_field, "a");
} }

Some files were not shown because too many files have changed in this diff.