Compare commits


3 Commits

Author SHA1 Message Date
Pascal Seitz
722b6c5205 bump version 2023-10-25 20:41:07 +08:00
Pascal Seitz
0f2211ca44 increase min memory to 15MB for indexing
With tantivy 0.20 the minimum memory consumption per SegmentWriter increased to
12MB. 7MB of that goes to the different fast field collector types (they could be
created lazily). Increase the minimum memory from 3MB to 15MB.

Change memory variable naming from arena to budget.

closes #2156
2023-10-25 20:37:47 +08:00
PSeitz
21aabf961c Fix range query (#2226)
Fix range query end check in advance
Rename vars to reduce ambiguity
add tests

Fixes #2225
2023-10-25 20:37:36 +08:00
128 changed files with 2276 additions and 5904 deletions
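Context for the "increase min memory to 15MB for indexing" commit above: a minimal indexing sketch against the 0.21-style API that appears in the example diffs below. The single-field schema is illustrative; the point is only that each indexing thread now needs a memory budget of at least 15MB.

use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    // Illustrative single-field schema.
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // One indexing thread with a 15MB budget, the new minimum per
    // SegmentWriter; the old 3MB floor would now be rejected when the
    // writer is created.
    let mut index_writer = index.writer_with_num_threads(1, 15_000_000)?;
    index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
    index_writer.commit()?;
    Ok(())
}

Note that the budget passed to writer / writer_with_num_threads is an overall budget split across indexing threads, so with more threads it has to scale accordingly.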
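For the range query fix (#2226, fixing #2225): a small sketch of the kind of bounded range search involved, mirroring the integer range example touched further down in this diff. Going through the query parser and the exact counts here are illustrative assumptions, not taken from the fix itself.

use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let year_field = schema_builder.add_u64_field("year", INDEXED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer_with_num_threads(1, 15_000_000)?;
    for year in 1950u64..2019u64 {
        index_writer.add_document(doc!(year_field => year))?;
    }
    index_writer.commit()?;

    // An inclusive bounded range over the indexed u64 field; the fix concerns
    // how the range query's docset checks its end bound while advancing.
    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![]).parse_query("year:[1960 TO 1969]")?;
    assert_eq!(searcher.search(&query, &Count)?, 10);
    Ok(())
}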

View File

@@ -15,13 +15,13 @@ jobs:
coverage:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install Rust
run: rustup toolchain install nightly-2023-09-10 --profile minimal --component llvm-tools-preview
run: rustup toolchain install nightly --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2
- uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage
run: cargo +nightly-2023-09-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
run: cargo +nightly llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
continue-on-error: true

View File

@@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install stable
uses: actions-rs/toolchain@v1
with:

View File

@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install nightly
uses: actions-rs/toolchain@v1
@@ -60,7 +60,7 @@ jobs:
name: test-${{ matrix.features.label}}
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install stable
uses: actions-rs/toolchain@v1

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.21.0"
version = "0.21.1"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -19,20 +19,19 @@ oneshot = "0.1.5"
base64 = "0.21.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
tracing = "0.1"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"
tantivy-fst = "0.4.0"
memmap2 = { version = "0.9.0", optional = true }
memmap2 = { version = "0.7.1", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false }
zstd = { version = "0.12", optional = true, default-features = false }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79"
num_cpus = "1.13.1"
fs4 = { version = "0.7.0", optional = true }
fs4 = { version = "0.6.3", optional = true }
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
@@ -48,7 +47,7 @@ murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.12.0"
lru = "0.11.0"
fastdivide = "0.4.0"
itertools = "0.11.0"
measure_time = "0.8.2"
@@ -64,7 +63,6 @@ common = { version= "0.6", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"

View File

@@ -1,7 +1,7 @@
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use pprof::criterion::{Output, PProfProfiler};
use tantivy::schema::{TantivyDocument, FAST, INDEXED, STORED, STRING, TEXT};
use tantivy::{Index, IndexWriter};
use tantivy::schema::{FAST, INDEXED, STORED, STRING, TEXT};
use tantivy::Index;
const HDFS_LOGS: &str = include_str!("hdfs.json");
const GH_LOGS: &str = include_str!("gh.json");
@@ -39,9 +39,9 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
})
@@ -50,10 +50,9 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
@@ -63,9 +62,9 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
})
@@ -74,10 +73,9 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
@@ -88,8 +86,7 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
@@ -116,7 +113,7 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
@@ -130,8 +127,7 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
@@ -158,7 +154,7 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
@@ -172,8 +168,7 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();

View File

@@ -1,8 +1,3 @@
//! # `column_index`
//!
//! `column_index` provides rank and select operations to associate positions when not all
//! documents have exactly one element.
mod merge;
mod multivalued_index;
mod optional_index;
@@ -46,10 +41,10 @@ impl ColumnIndex {
pub fn is_multivalue(&self) -> bool {
matches!(self, ColumnIndex::Multivalued(_))
}
/// Returns the cardinality of the column index.
///
/// By convention, if the column contains no docs, we consider that it is
/// full.
// Returns the cardinality of the column index.
//
// By convention, if the column contains no docs, we consider that it is
// full.
#[inline]
pub fn get_cardinality(&self) -> Cardinality {
match self {

View File

@@ -30,7 +30,6 @@ impl<'a> SerializableColumnIndex<'a> {
}
}
/// Serialize a column index.
pub fn serialize_column_index(
column_index: SerializableColumnIndex,
output: &mut impl Write,
@@ -52,7 +51,6 @@ pub fn serialize_column_index(
Ok(column_index_num_bytes)
}
/// Open a serialized column index.
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
if bytes.is_empty() {
return Err(io::Error::new(

View File

@@ -12,7 +12,7 @@ use std::io;
mod block_accessor;
mod column;
pub mod column_index;
mod column_index;
pub mod column_values;
mod columnar;
mod dictionary;

View File

@@ -1,14 +1,11 @@
#![allow(deprecated)]
use std::fmt;
use std::io::{Read, Write};
use serde::{Deserialize, Serialize};
use time::format_description::well_known::Rfc3339;
use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
use crate::BinarySerializable;
/// Precision with which datetimes are truncated when stored in fast fields. This setting is only
/// relevant for fast fields. In the docstore, datetimes are always saved with nanosecond precision.
#[derive(
@@ -167,15 +164,3 @@ impl fmt::Debug for DateTime {
f.write_str(&utc_rfc3339)
}
}
impl BinarySerializable for DateTime {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
let timestamp_micros = self.into_timestamp_micros();
<i64 as BinarySerializable>::serialize(&timestamp_micros, writer)
}
fn deserialize<R: Read>(reader: &mut R) -> std::io::Result<Self> {
let timestamp_micros = <i64 as BinarySerializable>::deserialize(reader)?;
Ok(Self::from_timestamp_micros(timestamp_micros))
}
}

View File

@@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::io::{Read, Write};
use std::{fmt, io};
@@ -250,43 +249,6 @@ impl BinarySerializable for String {
}
}
impl<'a> BinarySerializable for Cow<'a, str> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
Ok(Cow::Owned(result))
}
}
impl<'a> BinarySerializable for Cow<'a, [u8]> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
for it in self.iter() {
it.serialize(writer)?;
}
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
let num_items = VInt::deserialize(reader)?.val();
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = u8::deserialize(reader)?;
items.push(item);
}
Ok(Cow::Owned(items))
}
}
#[cfg(test)]
pub mod test {

View File

@@ -12,7 +12,7 @@ use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Create Schema
@@ -132,10 +132,10 @@ fn main() -> tantivy::Result<()> {
let stream = Deserializer::from_str(data).into_iter::<Value>();
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
let mut num_indexed = 0;
for value in stream {
let doc = TantivyDocument::parse_json(&schema, &serde_json::to_string(&value.unwrap())?)?;
let doc = schema.parse_document(&serde_json::to_string(&value.unwrap())?)?;
index_writer.add_document(doc)?;
num_indexed += 1;
if num_indexed > 4 {

View File

@@ -15,7 +15,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -75,7 +75,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
@@ -87,7 +87,7 @@ fn main() -> tantivy::Result<()> {
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let mut old_man_doc = TantivyDocument::default();
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(
body,
@@ -217,8 +217,8 @@ fn main() -> tantivy::Result<()> {
// the document returned will only contain
// a title.
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
// We can also get an explanation to understand

View File

@@ -13,7 +13,7 @@ use columnar::Column;
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, IndexWriter, Score, SegmentReader};
use tantivy::{doc, Index, Score, SegmentReader};
#[derive(Default)]
struct Stats {
@@ -142,7 +142,7 @@ fn main() -> tantivy::Result<()> {
// this example.
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
product_name => "Super Broom 2000",
product_description => "While it is ok for short distance travel, this broom \

View File

@@ -6,7 +6,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -62,7 +62,7 @@ fn main() -> tantivy::Result<()> {
//
// Here we use a buffer of 50MB per thread. Using a bigger
// memory arena for the indexer can increase its throughput.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
@@ -103,8 +103,8 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (_, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())

View File

@@ -4,8 +4,8 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Document, OwnedValue, Schema, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -22,18 +22,16 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// The dates are passed as string in the RFC3339 format
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"occurred_at": "2022-06-22T12:53:50.53Z",
"event": "pull-request"
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"occurred_at": "2022-06-22T13:00:00.22Z",
"event": "comment"
@@ -60,13 +58,13 @@ fn main() -> tantivy::Result<()> {
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
let retrieved_doc = searcher.doc(doc_address)?;
assert!(matches!(
retrieved_doc.get_first(occurred_at),
Some(OwnedValue::Date(_))
Some(Value::Date(_))
));
assert_eq!(
retrieved_doc.to_json(&schema),
schema.to_json(&retrieved_doc),
r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
);
}

View File

@@ -11,7 +11,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexReader, IndexWriter};
use tantivy::{doc, Index, IndexReader};
// A simple helper function to fetch a single document
// given its id from our index.
@@ -19,7 +19,7 @@ use tantivy::{doc, Index, IndexReader, IndexWriter};
fn extract_doc_given_isbn(
reader: &IndexReader,
isbn_term: &Term,
) -> tantivy::Result<Option<TantivyDocument>> {
) -> tantivy::Result<Option<Document>> {
let searcher = reader.searcher();
// This is the simplest query you can think of.
@@ -69,10 +69,10 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// Let's add a couple of documents, for the sake of the example.
let mut old_man_doc = TantivyDocument::default();
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
index_writer.add_document(doc!(
isbn => "978-0099908401",
@@ -94,7 +94,7 @@ fn main() -> tantivy::Result<()> {
// Oops our frankenstein doc seems misspelled
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!(
frankenstein_doc_misspelled.to_json(&schema),
schema.to_json(&frankenstein_doc_misspelled),
r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
);
@@ -136,7 +136,7 @@ fn main() -> tantivy::Result<()> {
// No more typo!
let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!(
frankenstein_new_doc.to_json(&schema),
schema.to_json(&frankenstein_new_doc),
r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
);

View File

@@ -17,7 +17,7 @@
use tantivy::collector::FacetCollector;
use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the sake of this example
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(30_000_000)?;
let mut index_writer = index.writer(30_000_000)?;
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.

View File

@@ -12,7 +12,7 @@ use std::collections::HashSet;
use tantivy::collector::TopDocs;
use tantivy::query::BooleanQuery;
use tantivy::schema::*;
use tantivy::{doc, DocId, Index, IndexWriter, Score, SegmentReader};
use tantivy::{doc, DocId, Index, Score, SegmentReader};
fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(30_000_000)?;
let mut index_writer = index.writer(30_000_000)?;
index_writer.add_document(doc!(
title => "Fried egg",
@@ -91,10 +91,11 @@ fn main() -> tantivy::Result<()> {
.iter()
.map(|(_, doc_id)| {
searcher
.doc::<TantivyDocument>(*doc_id)
.doc(*doc_id)
.unwrap()
.get_first(title)
.and_then(|v| v.as_str())
.unwrap()
.as_text()
.unwrap()
.to_owned()
})

View File

@@ -14,7 +14,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -66,7 +66,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
@@ -151,10 +151,10 @@ fn main() -> tantivy::Result<()> {
assert_eq!(count, 3);
assert_eq!(top_docs.len(), 3);
for (score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
// Note that the score is not lower for the fuzzy hit.
// There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("score {score:?} doc {}", retrieved_doc.to_json(&schema));
println!("score {score:?} doc {}", schema.to_json(&retrieved_doc));
// score 1.0 doc {"title":["The Diary of Muadib"]}
//
// score 1.0 doc {"title":["The Diary of a Young Girl"]}

View File

@@ -21,7 +21,7 @@ fn main() -> tantivy::Result<()> {
}"#;
// We can parse our document
let _mice_and_men_doc = TantivyDocument::parse_json(&schema, mice_and_men_doc_json)?;
let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?;
// Multi-valued field are allowed, they are
// expressed in JSON by an array.
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
"title": ["Frankenstein", "The Modern Prometheus"],
"year": 1818
}"#;
let _frankenstein_doc = TantivyDocument::parse_json(&schema, frankenstein_json)?;
let _frankenstein_doc = schema.parse_document(frankenstein_json)?;
// Note that the schema is saved in your index directory.
//

View File

@@ -5,7 +5,7 @@
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, IndexWriter, Result};
use tantivy::{doc, Index, Result};
fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
{
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 6_000_000)?;
let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
for year in 1950u64..2019u64 {
index_writer.add_document(doc!(year_field => year))?;
}

View File

@@ -6,7 +6,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -22,22 +22,20 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// ### IPv4
// Adding documents that contain an IPv4 address. Notice that the IP addresses are passed as
// `String`. Since the field is of type ip, we parse the IP address from the string and store it
// internally as IPv6.
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"ip": "192.168.0.33",
"event_type": "login"
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"ip": "192.168.0.80",
"event_type": "checkout"
@@ -46,8 +44,7 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc)?;
// ### IPv6
// Adding a document that contains an IPv6 address.
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
"event_type": "checkout"

View File

@@ -10,7 +10,7 @@
// ---
// Importing tantivy...
use tantivy::schema::*;
use tantivy::{doc, DocSet, Index, IndexWriter, Postings, TERMINATED};
use tantivy::{doc, DocSet, Index, Postings, TERMINATED};
fn main() -> tantivy::Result<()> {
// We first create a schema for the sake of the
@@ -24,7 +24,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
index_writer.add_document(doc!(title => "Of Mice and Men"))?;
index_writer.add_document(doc!(title => "The modern Promotheus"))?;

View File

@@ -7,7 +7,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -20,9 +20,8 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let doc = TantivyDocument::parse_json(
&schema,
let mut index_writer = index.writer(50_000_000)?;
let doc = schema.parse_document(
r#"{
"timestamp": "2022-02-22T23:20:50.53Z",
"event_type": "click",
@@ -34,8 +33,7 @@ fn main() -> tantivy::Result<()> {
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"timestamp": "2022-02-22T23:20:51.53Z",
"event_type": "click",

View File

@@ -1,7 +1,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy, Result};
use tantivy::{doc, Index, ReloadPolicy, Result};
use tempfile::TempDir;
fn main() -> Result<()> {
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
@@ -67,12 +67,8 @@ fn main() -> Result<()> {
let mut titles = top_docs
.into_iter()
.map(|(_score, doc_address)| {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let title = doc
.get_first(title)
.and_then(|v| v.as_str())
.unwrap()
.to_owned();
let doc = searcher.doc(doc_address)?;
let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
Ok(title)
})
.collect::<Result<Vec<_>>>()?;

View File

@@ -13,7 +13,7 @@ use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn pre_tokenize_text(text: &str) -> Vec<Token> {
@@ -38,7 +38,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_dir(&index_path, schema.clone())?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// We can create a document manually, by setting the fields
// one by one in a Document object.
@@ -83,7 +83,7 @@ fn main() -> tantivy::Result<()> {
}]
}"#;
let short_man_doc = TantivyDocument::parse_json(&schema, short_man_json)?;
let short_man_doc = schema.parse_document(short_man_json)?;
index_writer.add_document(short_man_doc)?;
@@ -115,8 +115,8 @@ fn main() -> tantivy::Result<()> {
// Note that the tokens are not stored along with the original text
// in the document store
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("Document: {}", schema.to_json(&retrieved_doc));
}
// In contrary to the previous query, when we search for the "man" term we

View File

@@ -10,7 +10,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, Snippet, SnippetGenerator};
use tantivy::{doc, Index, Snippet, SnippetGenerator};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -27,7 +27,7 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// we'll only need one doc for this example.
index_writer.add_document(doc!(
@@ -54,10 +54,13 @@ fn main() -> tantivy::Result<()> {
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
for (score, doc_address) in top_docs {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let doc = searcher.doc(doc_address)?;
let snippet = snippet_generator.snippet_from_doc(&doc);
println!("Document score {score}:");
println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());
println!(
"title: {}",
doc.get_first(title).unwrap().as_text().unwrap()
);
println!("snippet: {}", snippet.to_html());
println!("custom highlighting: {}", highlight(snippet));
}

View File

@@ -15,7 +15,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// this example assumes you understand the content in `basic_search`
@@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> {
index.tokenizers().register("stoppy", tokenizer);
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
@@ -105,9 +105,9 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
let retrieved_doc = searcher.doc(doc_address)?;
println!("\n==\nDocument score {score}:");
println!("{}", retrieved_doc.to_json(&schema));
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())

View File

@@ -6,8 +6,8 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{
doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration, SegmentId,
SegmentReader, Warmer,
doc, DocAddress, DocId, Index, Opstamp, Searcher, SearcherGeneration, SegmentId, SegmentReader,
Warmer,
};
// This example shows how warmers can be used to
@@ -143,7 +143,7 @@ fn main() -> tantivy::Result<()> {
const SNEAKERS: ProductId = 23222;
let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;

View File

@@ -24,7 +24,7 @@ const SPECIAL_CHARS: &[char] = &[
/// consume a field name followed by colon. Return the field name with escape sequence
/// already interpreted
fn field_name(inp: &str) -> IResult<&str, String> {
fn field_name(i: &str) -> IResult<&str, String> {
let simple_char = none_of(SPECIAL_CHARS);
let first_char = verify(none_of(SPECIAL_CHARS), |c| *c != '-');
let escape_sequence = || preceded(char('\\'), one_of(SPECIAL_CHARS));
@@ -38,12 +38,12 @@ fn field_name(inp: &str) -> IResult<&str, String> {
char(':'),
),
|(first_char, next)| once(first_char).chain(next).collect(),
)(inp)
)(i)
}
/// Consume a word outside of any context.
// TODO should support escape sequences
fn word(inp: &str) -> IResult<&str, &str> {
fn word(i: &str) -> IResult<&str, &str> {
map_res(
recognize(tuple((
satisfy(|c| {
@@ -55,14 +55,14 @@ fn word(inp: &str) -> IResult<&str, &str> {
})),
))),
|s| match s {
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(i, ErrorKind::Tag)),
_ => Ok(s),
},
)(inp)
)(i)
}
fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
|inp| {
|i| {
opt_i_err(
preceded(
space0,
@@ -71,29 +71,29 @@ fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&st
}))),
),
"expected word",
)(inp)
)(i)
}
}
/// Consume a word inside a Range context. More values are allowed as they are
/// not ambiguous in this context.
fn relaxed_word(inp: &str) -> IResult<&str, &str> {
fn relaxed_word(i: &str) -> IResult<&str, &str> {
recognize(tuple((
satisfy(|c| !c.is_whitespace() && !['`', '{', '}', '"', '[', ']', '(', ')'].contains(&c)),
many0(satisfy(|c: char| {
!c.is_whitespace() && !['{', '}', '"', '[', ']', '(', ')'].contains(&c)
})),
)))(inp)
)))(i)
}
fn negative_number(inp: &str) -> IResult<&str, &str> {
fn negative_number(i: &str) -> IResult<&str, &str> {
recognize(preceded(
char('-'),
tuple((digit1, opt(tuple((char('.'), digit1))))),
))(inp)
))(i)
}
fn simple_term(inp: &str) -> IResult<&str, (Delimiter, String)> {
fn simple_term(i: &str) -> IResult<&str, (Delimiter, String)> {
let escaped_string = |delimiter| {
// we need this because none_of can't accept an owned array of char.
let not_delimiter = verify(anychar, move |parsed| *parsed != delimiter);
@@ -123,13 +123,13 @@ fn simple_term(inp: &str) -> IResult<&str, (Delimiter, String)> {
simple_quotes,
double_quotes,
text_no_delimiter,
))(inp)
))(i)
}
fn simple_term_infallible(
delimiter: &str,
) -> impl Fn(&str) -> JResult<&str, Option<(Delimiter, String)>> + '_ {
|inp| {
|i| {
let escaped_string = |delimiter| {
// we need this because none_of can't accept an owned array of char.
let not_delimiter = verify(anychar, move |parsed| *parsed != delimiter);
@@ -162,11 +162,11 @@ fn simple_term_infallible(
map(word_infallible(delimiter), |(text, errors)| {
(text.map(|text| (Delimiter::None, text.to_string())), errors)
}),
)(inp)
)(i)
}
}
fn term_or_phrase(inp: &str) -> IResult<&str, UserInputLeaf> {
fn term_or_phrase(i: &str) -> IResult<&str, UserInputLeaf> {
map(
tuple((simple_term, fallible(slop_or_prefix_val))),
|((delimiter, phrase), (slop, prefix))| {
@@ -179,10 +179,10 @@ fn term_or_phrase(inp: &str) -> IResult<&str, UserInputLeaf> {
}
.into()
},
)(inp)
)(i)
}
fn term_or_phrase_infallible(inp: &str) -> JResult<&str, Option<UserInputLeaf>> {
fn term_or_phrase_infallible(i: &str) -> JResult<&str, Option<UserInputLeaf>> {
map(
// ~* for slop/prefix, ) inside group or ast tree, ^ if boost
tuple_infallible((simple_term_infallible("*)^"), slop_or_prefix_val)),
@@ -214,10 +214,10 @@ fn term_or_phrase_infallible(inp: &str) -> JResult<&str, Option<UserInputLeaf>>
};
(leaf, errors)
},
)(inp)
)(i)
}
fn term_group(inp: &str) -> IResult<&str, UserInputAst> {
fn term_group(i: &str) -> IResult<&str, UserInputAst> {
let occur_symbol = alt((
value(Occur::MustNot, char('-')),
value(Occur::Must, char('+')),
@@ -240,12 +240,12 @@ fn term_group(inp: &str) -> IResult<&str, UserInputAst> {
.collect(),
)
},
)(inp)
)(i)
}
// this is a precondition for term_group_infallible. Without it, term_group_infallible can fail
// with a panic. It does not consume its input.
fn term_group_precond(inp: &str) -> IResult<&str, (), ()> {
fn term_group_precond(i: &str) -> IResult<&str, (), ()> {
value(
(),
peek(tuple((
@@ -253,13 +253,13 @@ fn term_group_precond(inp: &str) -> IResult<&str, (), ()> {
space0,
char('('), // when we are here, we know it can't be anything but a term group
))),
)(inp)
)(i)
.map_err(|e| e.map(|_| ()))
}
fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> {
let (mut inp, (field_name, _, _, _)) =
tuple((field_name, space0, char('('), space0))(inp).expect("precondition failed");
fn term_group_infallible(i: &str) -> JResult<&str, UserInputAst> {
let (mut i, (field_name, _, _, _)) =
tuple((field_name, space0, char('('), space0))(i).expect("precondition failed");
let mut terms = Vec::new();
let mut errs = Vec::new();
@@ -270,19 +270,19 @@ fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> {
first_round = false;
Vec::new()
} else {
let (rest, (_, err)) = space1_infallible(inp)?;
inp = rest;
let (rest, (_, err)) = space1_infallible(i)?;
i = rest;
err
};
if inp.is_empty() {
if i.is_empty() {
errs.push(LenientErrorInternal {
pos: inp.len(),
pos: i.len(),
message: "missing )".to_string(),
});
break Ok((inp, (UserInputAst::Clause(terms), errs)));
break Ok((i, (UserInputAst::Clause(terms), errs)));
}
if let Some(inp) = inp.strip_prefix(')') {
break Ok((inp, (UserInputAst::Clause(terms), errs)));
if let Some(i) = i.strip_prefix(')') {
break Ok((i, (UserInputAst::Clause(terms), errs)));
}
// only append missing space error if we did not reach the end of group
errs.append(&mut space_error);
@@ -291,57 +291,26 @@ fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> {
// first byte is not `)` or ' '. If it did not, we would end up looping.
let (rest, ((occur, leaf), mut err)) =
tuple_infallible((occur_symbol, term_or_phrase_infallible))(inp)?;
tuple_infallible((occur_symbol, term_or_phrase_infallible))(i)?;
errs.append(&mut err);
if let Some(leaf) = leaf {
terms.push((occur, leaf.set_field(Some(field_name.clone())).into()));
}
inp = rest;
i = rest;
}
}
fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
value(
UserInputLeaf::Exists {
field: String::new(),
},
tuple((space0, char('*'))),
)(inp)
}
fn exists_precond(inp: &str) -> IResult<&str, (), ()> {
value(
(),
peek(tuple((
field_name,
space0,
char('*'), // when we are here, we know it can't be anything but a exists
))),
)(inp)
.map_err(|e| e.map(|_| ()))
}
fn exists_infallible(inp: &str) -> JResult<&str, UserInputAst> {
let (inp, (field_name, _, _)) =
tuple((field_name, space0, char('*')))(inp).expect("precondition failed");
let exists = UserInputLeaf::Exists { field: field_name }.into();
Ok((inp, (exists, Vec::new())))
}
fn literal(inp: &str) -> IResult<&str, UserInputAst> {
// * alone is already parsed by our caller, so if `exists` succeed, we can be confident
// something (a field name) got parsed before
fn literal(i: &str) -> IResult<&str, UserInputAst> {
alt((
map(
tuple((opt(field_name), alt((range, set, exists, term_or_phrase)))),
tuple((opt(field_name), alt((range, set, term_or_phrase)))),
|(field_name, leaf): (Option<String>, UserInputLeaf)| leaf.set_field(field_name).into(),
),
term_group,
))(inp)
))(i)
}
fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
fn literal_no_group_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
map(
tuple_infallible((
opt_i(field_name),
@@ -368,7 +337,7 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
&& field_name.is_none()
{
errors.push(LenientErrorInternal {
pos: inp.len(),
pos: i.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
@@ -377,7 +346,7 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
&& field_name.is_none()
{
errors.push(LenientErrorInternal {
pos: inp.len(),
pos: i.len(),
message: "parsed keyword NOT as term. It should be quoted".to_string(),
});
}
@@ -386,40 +355,34 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
errors,
)
},
)(inp)
)(i)
}
fn literal_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
fn literal_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
alt_infallible(
(
(
term_group_precond,
map(term_group_infallible, |(group, errs)| (Some(group), errs)),
),
(
exists_precond,
map(exists_infallible, |(exists, errs)| (Some(exists), errs)),
),
),
((
term_group_precond,
map(term_group_infallible, |(group, errs)| (Some(group), errs)),
),),
literal_no_group_infallible,
)(inp)
)(i)
}
fn slop_or_prefix_val(inp: &str) -> JResult<&str, (u32, bool)> {
fn slop_or_prefix_val(i: &str) -> JResult<&str, (u32, bool)> {
map(
opt_i(alt((
value((0, true), char('*')),
map(preceded(char('~'), u32), |slop| (slop, false)),
))),
|(slop_or_prefix_opt, err)| (slop_or_prefix_opt.unwrap_or_default(), err),
)(inp)
)(i)
}
/// Function that parses a range out of a Stream
/// Supports ranges like:
/// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10
/// [a TO *], [a TO c], [abc TO bcd}
fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
fn range(i: &str) -> IResult<&str, UserInputLeaf> {
let range_term_val = || {
map(
alt((negative_number, relaxed_word, tag("*"))),
@@ -479,10 +442,10 @@ fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
lower,
upper,
},
)(inp)
)(i)
}
fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
fn range_infallible(i: &str) -> JResult<&str, UserInputLeaf> {
let lower_to_upper = map(
tuple_infallible((
opt_i(anychar),
@@ -590,10 +553,10 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
errors,
)
},
)(inp)
)(i)
}
fn set(inp: &str) -> IResult<&str, UserInputLeaf> {
fn set(i: &str) -> IResult<&str, UserInputLeaf> {
map(
preceded(
tuple((space0, tag("IN"), space1)),
@@ -607,10 +570,10 @@ fn set(inp: &str) -> IResult<&str, UserInputLeaf> {
field: None,
elements,
},
)(inp)
)(i)
}
fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
fn set_infallible(mut i: &str) -> JResult<&str, UserInputLeaf> {
// `IN [` has already been parsed when we enter, we only need to parse simple terms until we
// find a `]`
let mut elements = Vec::new();
@@ -621,41 +584,41 @@ fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
first_round = false;
Vec::new()
} else {
let (rest, (_, err)) = space1_infallible(inp)?;
inp = rest;
let (rest, (_, err)) = space1_infallible(i)?;
i = rest;
err
};
if inp.is_empty() {
if i.is_empty() {
// TODO push error about missing ]
//
errs.push(LenientErrorInternal {
pos: inp.len(),
pos: i.len(),
message: "missing ]".to_string(),
});
let res = UserInputLeaf::Set {
field: None,
elements,
};
return Ok((inp, (res, errs)));
return Ok((i, (res, errs)));
}
if let Some(inp) = inp.strip_prefix(']') {
if let Some(i) = i.strip_prefix(']') {
let res = UserInputLeaf::Set {
field: None,
elements,
};
return Ok((inp, (res, errs)));
return Ok((i, (res, errs)));
}
errs.append(&mut space_error);
// TODO
// here we do the assumption term_or_phrase_infallible always consume something if the
// first byte is not `)` or ' '. If it did not, we would end up looping.
let (rest, (delim_term, mut err)) = simple_term_infallible("]")(inp)?;
let (rest, (delim_term, mut err)) = simple_term_infallible("]")(i)?;
errs.append(&mut err);
if let Some((_, term)) = delim_term {
elements.push(term);
}
inp = rest;
i = rest;
}
}
@@ -663,16 +626,16 @@ fn negate(expr: UserInputAst) -> UserInputAst {
expr.unary(Occur::MustNot)
}
fn leaf(inp: &str) -> IResult<&str, UserInputAst> {
fn leaf(i: &str) -> IResult<&str, UserInputAst> {
alt((
delimited(char('('), ast, char(')')),
map(char('*'), |_| UserInputAst::from(UserInputLeaf::All)),
map(preceded(tuple((tag("NOT"), space1)), leaf), negate),
literal,
))(inp)
))(i)
}
fn leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
fn leaf_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
alt_infallible(
(
(
@@ -702,23 +665,23 @@ fn leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
),
),
literal_infallible,
)(inp)
)(i)
}
fn positive_float_number(inp: &str) -> IResult<&str, f64> {
fn positive_float_number(i: &str) -> IResult<&str, f64> {
map(
recognize(tuple((digit1, opt(tuple((char('.'), digit1)))))),
// TODO this is actually dangerous if the number is actually not representable as a f64
// (too big for instance)
|float_str: &str| float_str.parse::<f64>().unwrap(),
)(inp)
)(i)
}
fn boost(inp: &str) -> JResult<&str, Option<f64>> {
opt_i(preceded(char('^'), positive_float_number))(inp)
fn boost(i: &str) -> JResult<&str, Option<f64>> {
opt_i(preceded(char('^'), positive_float_number))(i)
}
fn boosted_leaf(inp: &str) -> IResult<&str, UserInputAst> {
fn boosted_leaf(i: &str) -> IResult<&str, UserInputAst> {
map(
tuple((leaf, fallible(boost))),
|(leaf, boost_opt)| match boost_opt {
@@ -727,10 +690,10 @@ fn boosted_leaf(inp: &str) -> IResult<&str, UserInputAst> {
}
_ => leaf,
},
)(inp)
)(i)
}
fn boosted_leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
fn boosted_leaf_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
map(
tuple_infallible((leaf_infallible, boost)),
|((leaf, boost_opt), error)| match boost_opt {
@@ -740,30 +703,30 @@ fn boosted_leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
),
_ => (leaf, error),
},
)(inp)
)(i)
}
fn occur_symbol(inp: &str) -> JResult<&str, Option<Occur>> {
fn occur_symbol(i: &str) -> JResult<&str, Option<Occur>> {
opt_i(alt((
value(Occur::MustNot, char('-')),
value(Occur::Must, char('+')),
)))(inp)
)))(i)
}
fn occur_leaf(inp: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
tuple((fallible(occur_symbol), boosted_leaf))(inp)
fn occur_leaf(i: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
tuple((fallible(occur_symbol), boosted_leaf))(i)
}
#[allow(clippy::type_complexity)]
fn operand_occur_leaf_infallible(
inp: &str,
i: &str,
) -> JResult<&str, (Option<BinaryOperand>, Option<Occur>, Option<UserInputAst>)> {
// TODO maybe this should support multiple chained AND/OR, and "fuse" them?
tuple_infallible((
delimited_infallible(nothing, opt_i(binary_operand), space0_infallible),
occur_symbol,
boosted_leaf_infallible,
))(inp)
))(i)
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -772,11 +735,11 @@ enum BinaryOperand {
And,
}
fn binary_operand(inp: &str) -> IResult<&str, BinaryOperand> {
fn binary_operand(i: &str) -> IResult<&str, BinaryOperand> {
alt((
value(BinaryOperand::And, tag("AND ")),
value(BinaryOperand::Or, tag("OR ")),
))(inp)
))(i)
}
fn aggregate_binary_expressions(
@@ -917,14 +880,14 @@ fn aggregate_infallible_expressions(
}
}
fn operand_leaf(inp: &str) -> IResult<&str, (BinaryOperand, UserInputAst)> {
fn operand_leaf(i: &str) -> IResult<&str, (BinaryOperand, UserInputAst)> {
tuple((
terminated(binary_operand, space0),
terminated(boosted_leaf, space0),
))(inp)
))(i)
}
fn ast(inp: &str) -> IResult<&str, UserInputAst> {
fn ast(i: &str) -> IResult<&str, UserInputAst> {
let boolean_expr = map(
separated_pair(boosted_leaf, space1, many1(operand_leaf)),
|(left, right)| aggregate_binary_expressions(left, right),
@@ -945,10 +908,10 @@ fn ast(inp: &str) -> IResult<&str, UserInputAst> {
space0,
alt((boolean_expr, whitespace_separated_leaves)),
space0,
)(inp)
)(i)
}
fn ast_infallible(inp: &str) -> JResult<&str, UserInputAst> {
fn ast_infallible(i: &str) -> JResult<&str, UserInputAst> {
// ast() parse either `term AND term OR term` or `+term term -term`
// both are locally ambiguous, and as we allow error, it's hard to permit backtracking.
// Instead, we allow a mix of both syntaxes, trying to make sense of what a user meant.
@@ -965,13 +928,13 @@ fn ast_infallible(inp: &str) -> JResult<&str, UserInputAst> {
},
);
delimited_infallible(space0_infallible, expression, space0_infallible)(inp)
delimited_infallible(space0_infallible, expression, space0_infallible)(i)
}
pub fn parse_to_ast(inp: &str) -> IResult<&str, UserInputAst> {
pub fn parse_to_ast(i: &str) -> IResult<&str, UserInputAst> {
map(delimited(space0, opt(ast), eof), |opt_ast| {
rewrite_ast(opt_ast.unwrap_or_else(UserInputAst::empty_query))
})(inp)
})(i)
}
pub fn parse_to_ast_lenient(query_str: &str) -> (UserInputAst, Vec<LenientError>) {
@@ -1575,17 +1538,6 @@ mod test {
test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*");
}
#[test]
fn test_exist_query() {
test_parse_query_to_ast_helper("a:*", "\"a\":*");
test_parse_query_to_ast_helper("a: *", "\"a\":*");
// an exist followed by default term being b
test_is_parse_err("a:*b", "(*\"a\":* *b)");
// this is a term query (not a phrase prefix)
test_parse_query_to_ast_helper("a:b*", "\"a\":b*");
}
#[test]
fn test_not_queries_are_consistent() {
test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");

View File

@@ -16,9 +16,6 @@ pub enum UserInputLeaf {
field: Option<String>,
elements: Vec<String>,
},
Exists {
field: String,
},
}
impl UserInputLeaf {
@@ -39,9 +36,6 @@ impl UserInputLeaf {
upper,
},
UserInputLeaf::Set { field: _, elements } => UserInputLeaf::Set { field, elements },
UserInputLeaf::Exists { field: _ } => UserInputLeaf::Exists {
field: field.expect("Exist query without a field isn't allowed"),
},
}
}
}
@@ -80,9 +74,6 @@ impl Debug for UserInputLeaf {
write!(formatter, "]")
}
UserInputLeaf::All => write!(formatter, "*"),
UserInputLeaf::Exists { field } => {
write!(formatter, "\"{field}\":*")
}
}
}
}

View File

@@ -134,142 +134,3 @@ impl Drop for ResourceLimitGuard {
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
}
}
#[cfg(test)]
mod tests {
use crate::aggregation::tests::exec_request_with_query;
// https://github.com/quickwit-oss/quickwit/issues/3837
#[test]
fn test_agg_limits_with_empty_merge() {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::bucket::tests::get_test_index_from_docs;
let docs = vec![
vec![r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb", "text2": "bbb" }"#],
vec![r#"{ "text": "aaa", "text2": "bbb" }"#],
];
let index = get_test_index_from_docs(false, &docs).unwrap();
{
let elasticsearch_compatible_json = json!(
{
"1": {
"terms": {"field": "text2", "min_doc_count": 0},
"aggs": {
"2":{
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": "2015-01-01T00:00:00Z",
"max": "2015-01-10T00:00:00Z"
}
}
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request_with_query(agg_req, &index, Some(("text", "bbb"))).unwrap();
let expected_res = json!({
"1": {
"buckets": [
{
"2": {
"buckets": [
{ "doc_count": 0, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" },
{ "doc_count": 1, "key": 1420156800000.0, "key_as_string": "2015-01-02T00:00:00Z" },
{ "doc_count": 0, "key": 1420243200000.0, "key_as_string": "2015-01-03T00:00:00Z" },
{ "doc_count": 0, "key": 1420329600000.0, "key_as_string": "2015-01-04T00:00:00Z" },
{ "doc_count": 0, "key": 1420416000000.0, "key_as_string": "2015-01-05T00:00:00Z" },
{ "doc_count": 0, "key": 1420502400000.0, "key_as_string": "2015-01-06T00:00:00Z" },
{ "doc_count": 0, "key": 1420588800000.0, "key_as_string": "2015-01-07T00:00:00Z" },
{ "doc_count": 0, "key": 1420675200000.0, "key_as_string": "2015-01-08T00:00:00Z" },
{ "doc_count": 0, "key": 1420761600000.0, "key_as_string": "2015-01-09T00:00:00Z" },
{ "doc_count": 0, "key": 1420848000000.0, "key_as_string": "2015-01-10T00:00:00Z" }
]
},
"doc_count": 1,
"key": "bbb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
});
assert_eq!(res, expected_res);
}
}
// https://github.com/quickwit-oss/quickwit/issues/3837
#[test]
fn test_agg_limits_with_empty_data() {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::bucket::tests::get_test_index_from_docs;
let docs = vec![vec![r#"{ "text": "aaa", "text2": "bbb" }"#]];
let index = get_test_index_from_docs(false, &docs).unwrap();
{
// Empty result since there is no doc with dates
let elasticsearch_compatible_json = json!(
{
"1": {
"terms": {"field": "text2", "min_doc_count": 0},
"aggs": {
"2":{
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": "2015-01-01T00:00:00Z",
"max": "2015-01-10T00:00:00Z"
}
}
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request_with_query(agg_req, &index, Some(("text", "bbb"))).unwrap();
let expected_res = json!({
"1": {
"buckets": [
{
"2": {
"buckets": [
{ "doc_count": 0, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" },
{ "doc_count": 0, "key": 1420156800000.0, "key_as_string": "2015-01-02T00:00:00Z" },
{ "doc_count": 0, "key": 1420243200000.0, "key_as_string": "2015-01-03T00:00:00Z" },
{ "doc_count": 0, "key": 1420329600000.0, "key_as_string": "2015-01-04T00:00:00Z" },
{ "doc_count": 0, "key": 1420416000000.0, "key_as_string": "2015-01-05T00:00:00Z" },
{ "doc_count": 0, "key": 1420502400000.0, "key_as_string": "2015-01-06T00:00:00Z" },
{ "doc_count": 0, "key": 1420588800000.0, "key_as_string": "2015-01-07T00:00:00Z" },
{ "doc_count": 0, "key": 1420675200000.0, "key_as_string": "2015-01-08T00:00:00Z" },
{ "doc_count": 0, "key": 1420761600000.0, "key_as_string": "2015-01-09T00:00:00Z" },
{ "doc_count": 0, "key": 1420848000000.0, "key_as_string": "2015-01-10T00:00:00Z" }
]
},
"doc_count": 0,
"key": "bbb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
});
assert_eq!(res, expected_res);
}
}
}

View File

@@ -103,8 +103,7 @@ impl AggregationWithAccessor {
field: field_name, ..
}) => {
let (accessor, column_type) =
// Only DateTime is supported for DateHistogram
get_ff_reader(reader, field_name, Some(&[ColumnType::DateTime]))?;
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
}
Terms(TermsAggregation {
@@ -118,10 +117,10 @@ impl AggregationWithAccessor {
ColumnType::U64,
ColumnType::F64,
ColumnType::Str,
ColumnType::DateTime,
// ColumnType::Bytes Unsupported
// ColumnType::Bool Unsupported
// ColumnType::IpAddr Unsupported
// ColumnType::DateTime Unsupported
];
// In case the column is empty we want the shim column to match the missing type
@@ -146,18 +145,7 @@ impl AggregationWithAccessor {
.map(|m| matches!(m, Key::Str(_)))
.unwrap_or(false);
// Actually we could convert the text to a number and have the fast path, if it is
// provided in Rfc3339 format. But this use case is probably common
// enough to justify the effort.
let text_on_date_col = column_and_types.len() == 1
&& column_and_types[0].1 == ColumnType::DateTime
&& missing
.as_ref()
.map(|m| matches!(m, Key::Str(_)))
.unwrap_or(false);
let use_special_missing_agg =
missing_and_more_than_one_col || text_on_non_text_col || text_on_date_col;
let use_special_missing_agg = missing_and_more_than_one_col || text_on_non_text_col;
if use_special_missing_agg {
let column_and_types =
get_all_ff_reader_or_empty(reader, field_name, None, fallback_type)?;

View File

@@ -9,7 +9,7 @@ use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_v
use crate::aggregation::DistributedAggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, IndexWriter, Term};
use crate::{Index, Term};
fn get_avg_req(field_name: &str) -> Aggregation {
serde_json::from_value(json!({
@@ -586,7 +586,7 @@ fn test_aggregation_on_json_object() {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"color": "red"})))
.unwrap();
@@ -630,7 +630,7 @@ fn test_aggregation_on_json_object_empty_columns() {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Empty column when accessing color
index_writer
.add_document(doc!(json => json!({"price": 10.0})))
@@ -748,7 +748,7 @@ fn test_aggregation_on_json_object_mixed_types() {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))

View File

@@ -132,7 +132,6 @@ impl DateHistogramAggregationReq {
hard_bounds: self.hard_bounds,
extended_bounds: self.extended_bounds,
keyed: self.keyed,
is_normalized_to_ns: false,
})
}
@@ -244,15 +243,15 @@ fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
}
#[cfg(test)]
pub mod tests {
mod tests {
use pretty_assertions::assert_eq;
use super::*;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request;
use crate::indexer::NoMergePolicy;
use crate::schema::{Schema, FAST, STRING};
use crate::{Index, IndexWriter, TantivyDocument};
use crate::schema::{Schema, FAST};
use crate::Index;
#[test]
fn test_parse_into_millisecs() {
@@ -307,8 +306,7 @@ pub mod tests {
) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
schema_builder.add_date_field("date", FAST);
schema_builder.add_text_field("text", FAST | STRING);
schema_builder.add_text_field("text2", FAST | STRING);
schema_builder.add_text_field("text", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
@@ -316,7 +314,7 @@ pub mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for values in segment_and_docs {
for doc_str in values {
let doc = TantivyDocument::parse_json(&schema, doc_str)?;
let doc = schema.parse_document(doc_str)?;
index_writer.add_document(doc)?;
}
// writing the segment
@@ -328,7 +326,7 @@ pub mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() > 1 {
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}

View File

@@ -122,14 +122,11 @@ pub struct HistogramAggregation {
/// Whether to return the buckets as a hash map
#[serde(default)]
pub keyed: bool,
/// Whether the values are normalized to ns for date time values. Defaults to false.
#[serde(default)]
pub is_normalized_to_ns: bool,
}
impl HistogramAggregation {
pub(crate) fn normalize_date_time(&mut self) {
if !self.is_normalized_to_ns {
pub(crate) fn normalize(&mut self, column_type: ColumnType) {
if column_type.is_date_time() {
// values are provided in ms, but the fastfield is in nanoseconds
self.interval *= 1_000_000.0;
self.offset = self.offset.map(|off| off * 1_000_000.0);
@@ -141,7 +138,6 @@ impl HistogramAggregation {
min: bounds.min * 1_000_000.0,
max: bounds.max * 1_000_000.0,
});
self.is_normalized_to_ns = true;
}
}
@@ -374,7 +370,7 @@ impl SegmentHistogramCollector {
Ok(IntermediateBucketResult::Histogram {
buckets,
is_date_agg: self.column_type == ColumnType::DateTime,
column_type: Some(self.column_type),
})
}
@@ -385,9 +381,7 @@ impl SegmentHistogramCollector {
accessor_idx: usize,
) -> crate::Result<Self> {
req.validate()?;
if field_type == ColumnType::DateTime {
req.normalize_date_time();
}
req.normalize(field_type);
let sub_aggregation_blueprint = if sub_aggregation.is_empty() {
None
@@ -445,7 +439,6 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
// memory check upfront
let (_, first_bucket_num, last_bucket_num) =
generate_bucket_pos_with_opt_minmax(histogram_req, min_max);
// It's based on user input, so we need to account for overflows
let added_buckets = ((last_bucket_num.saturating_sub(first_bucket_num)).max(0) as u64)
.saturating_sub(buckets.len() as u64);
@@ -489,7 +482,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
// Convert to BucketEntry
pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
buckets: Vec<IntermediateHistogramBucketEntry>,
is_date_agg: bool,
column_type: Option<ColumnType>,
histogram_req: &HistogramAggregation,
sub_aggregation: &Aggregations,
limits: &AggregationLimits,
@@ -498,8 +491,8 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
// The request used in the call to final is not yet normalized.
// Normalization is changing the precision from milliseconds to nanoseconds.
let mut histogram_req = histogram_req.clone();
if is_date_agg {
histogram_req.normalize_date_time();
if let Some(column_type) = column_type {
histogram_req.normalize(column_type);
}
let mut buckets = if histogram_req.min_doc_count() == 0 {
// With min_doc_count != 0, we may need to add buckets, so that there are no
@@ -523,7 +516,7 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
// If we have a date type on the histogram buckets, we add the `key_as_string` field as Rfc3339
// and normalize from nanoseconds to milliseconds
if is_date_agg {
if column_type == Some(ColumnType::DateTime) {
for bucket in buckets.iter_mut() {
if let crate::aggregation::Key::F64(ref mut val) = bucket.key {
let key_as_string = format_date(*val as i64)?;

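The date normalization above is a plain unit conversion: the request arrives in milliseconds while the date fast field stores nanoseconds. A standalone sketch of that arithmetic, using a stand-in struct rather than the real HistogramAggregation:

struct HistogramReq {
    interval: f64,
    offset: Option<f64>,
}

fn normalize_to_ns(req: &mut HistogramReq) {
    // 1 ms = 1_000_000 ns, so a one-day interval of 86_400_000 ms becomes 8.64e13 ns.
    req.interval *= 1_000_000.0;
    req.offset = req.offset.map(|off| off * 1_000_000.0);
}

fn main() {
    let mut req = HistogramReq { interval: 86_400_000.0, offset: Some(0.0) };
    normalize_to_ns(&mut req);
    assert_eq!(req.interval, 86_400_000_000_000.0);
    assert_eq!(req.offset, Some(0.0));
}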
View File

@@ -1,6 +1,6 @@
use std::fmt::Debug;
use columnar::{BytesColumn, ColumnType, MonotonicallyMappableToU64, StrColumn};
use columnar::{BytesColumn, ColumnType, StrColumn};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
@@ -16,7 +16,7 @@ use crate::aggregation::intermediate_agg_result::{
use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector,
};
use crate::aggregation::{f64_from_fastfield_u64, format_date, Key};
use crate::aggregation::{f64_from_fastfield_u64, Key};
use crate::error::DataCorruption;
use crate::TantivyError;
@@ -531,13 +531,6 @@ impl SegmentTermCollector {
});
}
}
} else if self.field_type == ColumnType::DateTime {
for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
let val = i64::from_u64(val);
let date = format_date(val)?;
dict.insert(IntermediateKey::Str(date), intermediate_entry);
}
} else {
for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
@@ -590,9 +583,6 @@ pub(crate) fn cut_off_buckets<T: GetDocCount + Debug>(
#[cfg(test)]
mod tests {
use common::DateTime;
use time::{Date, Month};
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::{
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
@@ -601,7 +591,7 @@ mod tests {
use crate::aggregation::AggregationLimits;
use crate::indexer::NoMergePolicy;
use crate::schema::{Schema, FAST, STRING};
use crate::{Index, IndexWriter};
use crate::Index;
#[test]
fn terms_aggregation_test_single_segment() -> crate::Result<()> {
@@ -1473,7 +1463,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();
@@ -1823,75 +1813,4 @@ mod tests {
Ok(())
}
#[test]
fn terms_aggregation_date() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_date": {
"terms": {
"field": "date_field"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// date_field field
assert_eq!(res["my_date"]["buckets"][0]["key"], "1982-09-17T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][0]["doc_count"], 2);
assert_eq!(res["my_date"]["buckets"][1]["key"], "1983-09-27T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_date"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
#[test]
fn terms_aggregation_date_missing() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!())?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_date": {
"terms": {
"field": "date_field",
"missing": "1982-09-17T00:00:00Z"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// date_field field
assert_eq!(res["my_date"]["buckets"][0]["key"], "1982-09-17T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][0]["doc_count"], 3);
assert_eq!(res["my_date"]["buckets"][1]["key"], "1983-09-27T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_date"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
}

View File

@@ -117,7 +117,7 @@ mod tests {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST};
use crate::{Index, IndexWriter};
use crate::Index;
#[test]
fn terms_aggregation_missing_mixed_type_mult_seg_sub_agg() -> crate::Result<()> {
@@ -126,7 +126,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))
@@ -186,7 +186,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))?;
index_writer.add_document(doc!(score => 5.0))?;
@@ -231,7 +231,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
@@ -278,7 +278,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.add_document(doc!(score => 5.0))?;
@@ -323,7 +323,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -385,7 +385,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -427,7 +427,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))

View File

@@ -172,16 +172,10 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
Range(_) => IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(
Default::default(),
)),
Histogram(_) => {
Histogram(_) | DateHistogram(_) => {
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Histogram {
buckets: Vec::new(),
is_date_agg: false,
})
}
DateHistogram(_) => {
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Histogram {
buckets: Vec::new(),
is_date_agg: true,
column_type: None,
})
}
Average(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Average(
@@ -349,8 +343,8 @@ pub enum IntermediateBucketResult {
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
Histogram {
/// Whether the column_type of the underlying `Column` is DateTime
is_date_agg: bool,
/// The column_type of the underlying `Column`
column_type: Option<ColumnType>,
/// The buckets
buckets: Vec<IntermediateHistogramBucketEntry>,
},
@@ -405,7 +399,7 @@ impl IntermediateBucketResult {
Ok(BucketResult::Range { buckets })
}
IntermediateBucketResult::Histogram {
is_date_agg,
column_type,
buckets,
} => {
let histogram_req = &req
@@ -414,7 +408,7 @@ impl IntermediateBucketResult {
.expect("unexpected aggregation, expected histogram aggregation");
let buckets = intermediate_histogram_buckets_to_final_buckets(
buckets,
is_date_agg,
column_type,
histogram_req,
req.sub_aggregation(),
limits,
@@ -463,11 +457,11 @@ impl IntermediateBucketResult {
(
IntermediateBucketResult::Histogram {
buckets: buckets_left,
is_date_agg: _,
..
},
IntermediateBucketResult::Histogram {
buckets: buckets_right,
is_date_agg: _,
..
},
) => {
let buckets: Result<Vec<IntermediateHistogramBucketEntry>, TantivyError> =

View File

@@ -71,7 +71,7 @@ mod tests {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST};
use crate::{Index, IndexWriter};
use crate::Index;
#[test]
fn test_max_agg_with_missing() -> crate::Result<()> {
@@ -79,7 +79,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();

View File

@@ -88,7 +88,7 @@ mod tests {
use crate::aggregation::AggregationCollector;
use crate::query::AllQuery;
use crate::schema::{NumericOptions, Schema};
use crate::{Index, IndexWriter};
use crate::Index;
#[test]
fn test_metric_aggregations() {
@@ -96,7 +96,7 @@ mod tests {
let field_options = NumericOptions::default().set_fast();
let field = schema_builder.add_f64_field("price", field_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for i in 0..3 {
index_writer

View File

@@ -300,7 +300,7 @@ mod tests {
use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, IndexWriter, Term};
use crate::{Index, Term};
#[test]
fn test_aggregation_stats_empty_index() -> crate::Result<()> {
@@ -494,7 +494,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();
@@ -541,7 +541,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json
index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap();

View File

@@ -319,7 +319,7 @@ mod tests {
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::{Index, IndexWriter, Term};
use crate::{Index, Term};
pub fn get_test_index_with_num_docs(
merge_segments: bool,
@@ -451,7 +451,7 @@ mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() > 1 {
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -565,7 +565,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}

View File

@@ -495,8 +495,8 @@ mod tests {
use crate::collector::Count;
use crate::core::Index;
use crate::query::{AllQuery, QueryParser, TermQuery};
use crate::schema::{Facet, FacetOptions, IndexRecordOption, Schema, TantivyDocument};
use crate::{IndexWriter, Term};
use crate::schema::{Document, Facet, FacetOptions, IndexRecordOption, Schema};
use crate::Term;
fn test_collapse_mapping_aux(
facet_terms: &[&str],
@@ -559,7 +559,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(facet_field=>Facet::from("/facet/a")))
.unwrap();
@@ -588,7 +588,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| {
@@ -601,7 +601,7 @@ mod tests {
})
.collect();
for i in 0..num_facets * 10 {
let mut doc = TantivyDocument::new();
let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone());
index_writer.add_document(doc).unwrap();
}
@@ -732,25 +732,24 @@ mod tests {
let index = Index::create_in_ram(schema);
let uniform = Uniform::new_inclusive(1, 100_000);
let mut docs: Vec<TantivyDocument> =
vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(uniform)),
);
doc
})
.collect();
let mut docs: Vec<Document> = vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(uniform)),
);
doc
})
.collect();
docs[..].shuffle(&mut thread_rng());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc).unwrap();
}
@@ -781,7 +780,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let docs: Vec<TantivyDocument> = vec![("b", 2), ("a", 2), ("c", 4)]
let docs: Vec<Document> = vec![("b", 2), ("a", 2), ("c", 4)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
@@ -829,7 +828,7 @@ mod bench {
use crate::collector::FacetCollector;
use crate::query::AllQuery;
use crate::schema::{Facet, Schema, INDEXED};
use crate::{Index, IndexWriter};
use crate::Index;
#[bench]
fn bench_facet_collector(b: &mut Bencher) {
@@ -848,7 +847,7 @@ mod bench {
// 40425 docs
docs[..].shuffle(&mut thread_rng());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc).unwrap();
}

View File

@@ -12,7 +12,8 @@ use std::marker::PhantomData;
use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType};
use crate::collector::{Collector, SegmentCollector};
use crate::{DocId, Score, SegmentReader};
use crate::schema::Field;
use crate::{DocId, Score, SegmentReader, TantivyError};
/// The `FilterCollector` filters docs using a fast field value and a predicate.
///
@@ -49,13 +50,13 @@ use crate::{DocId, Score, SegmentReader};
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// let no_filter_collector = FilterCollector::new("price".to_string(), |value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let no_filter_collector = FilterCollector::new(price, |value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new("price".to_string(), |value| value < 5u64, TopDocs::with_limit(2));
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, |value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
///
/// assert_eq!(filtered_top_docs.len(), 0);
@@ -69,7 +70,7 @@ use crate::{DocId, Score, SegmentReader};
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue>
where TPredicate: 'static + Clone
{
field: String,
field: Field,
collector: TCollector,
predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,
@@ -82,7 +83,7 @@ where
TPredicate: Fn(TPredicateValue) -> bool + Send + Sync + Clone,
{
/// Create a new `FilterCollector`.
pub fn new(field: String, predicate: TPredicate, collector: TCollector) -> Self {
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self {
field,
predicate,
@@ -109,7 +110,18 @@ where
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let column_opt = segment_reader.fast_fields().column_opt(&self.field)?;
let schema = segment_reader.schema();
let field_entry = schema.get_field_entry(self.field);
if !field_entry.is_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
)));
}
let column_opt = segment_reader
.fast_fields()
.column_opt(field_entry.name())?;
let segment_collector = self
.collector
@@ -217,7 +229,7 @@ where
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// let filter_collector = BytesFilterCollector::new("barcode".to_string(), |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
/// let filter_collector = BytesFilterCollector::new(barcode, |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
@@ -228,7 +240,7 @@ where
pub struct BytesFilterCollector<TCollector, TPredicate>
where TPredicate: 'static + Clone
{
field: String,
field: Field,
collector: TCollector,
predicate: TPredicate,
}
@@ -239,7 +251,7 @@ where
TPredicate: Fn(&[u8]) -> bool + Send + Sync + Clone,
{
/// Create a new `BytesFilterCollector`.
pub fn new(field: String, predicate: TPredicate, collector: TCollector) -> Self {
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self {
field,
predicate,
@@ -262,7 +274,10 @@ where
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let column_opt = segment_reader.fast_fields().bytes(&self.field)?;
let schema = segment_reader.schema();
let field_name = schema.get_field_name(self.field);
let column_opt = segment_reader.fast_fields().bytes(field_name)?;
let segment_collector = self
.collector

View File

@@ -97,7 +97,7 @@ pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
mod top_collector;
mod top_score_collector;
pub use self::top_score_collector::{TopDocs, TopNComputer};
pub use self::top_score_collector::TopDocs;
mod custom_score_top_collector;
pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};

View File

@@ -7,9 +7,7 @@ use crate::query::{AllQuery, QueryParser};
use crate::schema::{Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{
doc, DateTime, DocAddress, DocId, Index, Score, Searcher, SegmentOrdinal, TantivyDocument,
};
use crate::{doc, DateTime, DocAddress, DocId, Document, Index, Score, Searcher, SegmentOrdinal};
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
compute_score: true,
@@ -42,7 +40,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
let query_parser = QueryParser::for_index(&index, vec![title]);
let query = query_parser.parse_query("diary")?;
let filter_some_collector = FilterCollector::new(
"price".to_string(),
price,
&|value: u64| value > 20_120u64,
TopDocs::with_limit(2),
);
@@ -51,11 +49,8 @@ pub fn test_filter_collector() -> crate::Result<()> {
assert_eq!(top_docs.len(), 1);
assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(
"price".to_string(),
&|value| value < 5u64,
TopDocs::with_limit(2),
);
let filter_all_collector: FilterCollector<_, _, u64> =
FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
assert_eq!(filtered_top_docs.len(), 0);
@@ -66,8 +61,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
> 0
}
let filter_dates_collector =
FilterCollector::new("date".to_string(), &date_filter, TopDocs::with_limit(5));
let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?;
assert_eq!(filtered_date_docs.len(), 2);
@@ -286,8 +280,8 @@ fn make_test_searcher() -> crate::Result<Searcher> {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.add_document(Document::default())?;
index_writer.add_document(Document::default())?;
index_writer.commit()?;
Ok(index.reader()?.searcher())
}

View File

@@ -1,7 +1,7 @@
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::marker::PhantomData;
use super::top_score_collector::TopNComputer;
use crate::{DocAddress, DocId, SegmentOrdinal, SegmentReader};
/// Contains a feature (field, score, etc.) of a document along with the document address.
@@ -20,14 +20,6 @@ pub(crate) struct ComparableDoc<T, D> {
pub feature: T,
pub doc: D,
}
impl<T: std::fmt::Debug, D: std::fmt::Debug> std::fmt::Debug for ComparableDoc<T, D> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ComparableDoc")
.field("feature", &self.feature)
.field("doc", &self.doc)
.finish()
}
}
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
@@ -99,13 +91,18 @@ where T: PartialOrd + Clone
if self.limit == 0 {
return Ok(Vec::new());
}
let mut top_collector = TopNComputer::new(self.limit + self.offset);
let mut top_collector = BinaryHeap::new();
for child_fruit in children {
for (feature, doc) in child_fruit {
top_collector.push(ComparableDoc { feature, doc });
if top_collector.len() < (self.limit + self.offset) {
top_collector.push(ComparableDoc { feature, doc });
} else if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
}
}
}
}
Ok(top_collector
.into_sorted_vec()
.into_iter()
@@ -114,7 +111,7 @@ where T: PartialOrd + Clone
.collect())
}
pub(crate) fn for_segment<F: PartialOrd + Clone>(
pub(crate) fn for_segment<F: PartialOrd>(
&self,
segment_id: SegmentOrdinal,
_: &SegmentReader,
@@ -139,18 +136,20 @@ where T: PartialOrd + Clone
/// The Top Collector keeps track of the K documents
/// sorted by type `T`.
///
/// The implementation is based on repeatedly truncating on the median after K * 2 documents
/// The implementation is based on a `BinaryHeap`.
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n + K)`.
/// is `O(n log K)`.
pub(crate) struct TopSegmentCollector<T> {
topn_computer: TopNComputer<T, DocId>,
limit: usize,
heap: BinaryHeap<ComparableDoc<T, DocId>>,
segment_ord: u32,
}
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
impl<T: PartialOrd> TopSegmentCollector<T> {
fn new(segment_ord: SegmentOrdinal, limit: usize) -> TopSegmentCollector<T> {
TopSegmentCollector {
topn_computer: TopNComputer::new(limit),
limit,
heap: BinaryHeap::with_capacity(limit),
segment_ord,
}
}
@@ -159,7 +158,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
pub fn harvest(self) -> Vec<(T, DocAddress)> {
let segment_ord = self.segment_ord;
self.topn_computer
self.heap
.into_sorted_vec()
.into_iter()
.map(|comparable_doc| {
@@ -174,13 +173,33 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
.collect()
}
/// Return true if more documents have been collected than the limit.
#[inline]
pub(crate) fn at_capacity(&self) -> bool {
self.heap.len() >= self.limit
}
/// Collects a document scored by the given feature
///
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it
/// will compare the lowest scoring item with the given one and keep whichever is greater.
#[inline]
pub fn collect(&mut self, doc: DocId, feature: T) {
self.topn_computer.push(ComparableDoc { feature, doc });
if self.at_capacity() {
// It's ok to unwrap as long as a limit of 0 is forbidden.
if let Some(limit_feature) = self.heap.peek().map(|head| head.feature.clone()) {
if limit_feature < feature {
if let Some(mut head) = self.heap.peek_mut() {
head.feature = feature;
head.doc = doc;
}
}
}
} else {
// we have not reached capacity yet, so we can just push the
// element.
self.heap.push(ComparableDoc { feature, doc });
}
}
}
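The heap-based merge above follows the usual bounded top-K pattern: fill a min-at-top heap to the limit, then replace its weakest entry through peek_mut whenever a better candidate arrives. A self-contained sketch of that pattern over plain scores, using only std rather than the tantivy ComparableDoc types:

use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn top_k(scores: impl IntoIterator<Item = u32>, k: usize) -> Vec<u32> {
    // A max-heap over Reverse(score) behaves as a min-heap over score, so the
    // weakest kept score is always at the top and cheap to inspect or replace.
    let mut heap: BinaryHeap<Reverse<u32>> = BinaryHeap::with_capacity(k);
    for score in scores {
        if heap.len() < k {
            heap.push(Reverse(score));
        } else if let Some(mut head) = heap.peek_mut() {
            if head.0 < score {
                *head = Reverse(score);
            }
        }
    }
    let mut out: Vec<u32> = heap.into_iter().map(|Reverse(s)| s).collect();
    out.sort_unstable_by(|a, b| b.cmp(a));
    out
}

fn main() {
    assert_eq!(top_k([5, 1, 9, 3, 7], 3), vec![9, 7, 5]);
}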

View File

@@ -1,3 +1,4 @@
use std::collections::BinaryHeap;
use std::fmt;
use std::marker::PhantomData;
use std::sync::Arc;
@@ -85,15 +86,12 @@ where
/// The `TopDocs` collector keeps track of the top `K` documents
/// sorted by their score.
///
/// The implementation is based on repeatedly truncating on the median after K * 2 documents
/// with pattern-defeating quicksort.
/// The theoretical complexity for collecting the top `K` out of `N` documents
/// is `O(N + K)`.
/// The implementation is based on a `BinaryHeap`.
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
///
/// This collector does not guarantee stable sorting in case of a tie on the
/// document score; for stable sorting, `PartialOrd` needs to resolve ties on
/// other fields, like the doc id, in case of score equality.
/// Only then is it suitable for pagination.
/// This collector guarantees a stable sorting in case of a tie on the
/// document score. As such, it is suitable to implement pagination.
///
/// ```rust
/// use tantivy::collector::TopDocs;
@@ -663,35 +661,50 @@ impl Collector for TopDocs {
reader: &SegmentReader,
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let heap_len = self.0.limit + self.0.offset;
let mut top_n = TopNComputer::new(heap_len);
let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> = BinaryHeap::with_capacity(heap_len);
if let Some(alive_bitset) = reader.alive_bitset() {
let mut threshold = Score::MIN;
top_n.threshold = Some(threshold);
weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
weight.for_each_pruning(threshold, reader, &mut |doc, score| {
if alive_bitset.is_deleted(doc) {
return threshold;
}
let doc = ComparableDoc {
let heap_item = ComparableDoc {
feature: score,
doc,
};
top_n.push(doc);
threshold = top_n.threshold.unwrap_or(Score::MIN);
if heap.len() < heap_len {
heap.push(heap_item);
if heap.len() == heap_len {
threshold = heap.peek().map(|el| el.feature).unwrap_or(Score::MIN);
}
return threshold;
}
*heap.peek_mut().unwrap() = heap_item;
threshold = heap.peek().map(|el| el.feature).unwrap_or(Score::MIN);
threshold
})?;
} else {
weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
let doc = ComparableDoc {
let heap_item = ComparableDoc {
feature: score,
doc,
};
top_n.push(doc);
top_n.threshold.unwrap_or(Score::MIN)
if heap.len() < heap_len {
heap.push(heap_item);
// TODO the threshold is suboptimal for heap.len == heap_len
if heap.len() == heap_len {
return heap.peek().map(|el| el.feature).unwrap_or(Score::MIN);
} else {
return Score::MIN;
}
}
*heap.peek_mut().unwrap() = heap_item;
heap.peek().map(|el| el.feature).unwrap_or(Score::MIN)
})?;
}
let fruit = top_n
let fruit = heap
.into_sorted_vec()
.into_iter()
.map(|cid| {
@@ -723,81 +736,9 @@ impl SegmentCollector for TopScoreSegmentCollector {
}
}
/// Fast TopN Computation
///
/// For TopN == 0, it will be relatively expensive.
pub struct TopNComputer<Score, DocId> {
buffer: Vec<ComparableDoc<Score, DocId>>,
top_n: usize,
pub(crate) threshold: Option<Score>,
}
impl<Score, DocId> TopNComputer<Score, DocId>
where
Score: PartialOrd + Clone,
DocId: Ord + Clone,
{
/// Create a new `TopNComputer`.
/// Internally it will allocate a buffer of size `2 * top_n`.
pub fn new(top_n: usize) -> Self {
let vec_cap = top_n.max(1) * 2;
TopNComputer {
buffer: Vec::with_capacity(vec_cap),
top_n,
threshold: None,
}
}
#[inline]
pub(crate) fn push(&mut self, doc: ComparableDoc<Score, DocId>) {
if let Some(last_median) = self.threshold.clone() {
if doc.feature < last_median {
return;
}
}
if self.buffer.len() == self.buffer.capacity() {
let median = self.truncate_top_n();
self.threshold = Some(median);
}
// This is faster since it avoids inlining the buffer-resizing code from vec.push()
// (this is in the hot path)
// TODO: Replace with `push_within_capacity` when it's stabilized
let uninit = self.buffer.spare_capacity_mut();
// This cannot panic, because truncate_top_n will remove at least one element, since
// the min capacity is 2.
uninit[0].write(doc);
// This is safe because the write above would have panicked if there were no spare capacity
unsafe {
self.buffer.set_len(self.buffer.len() + 1);
}
}
#[inline(never)]
fn truncate_top_n(&mut self) -> Score {
// Use select_nth_unstable to find the top nth score
let (_, median_el, _) = self.buffer.select_nth_unstable(self.top_n);
let median_score = median_el.feature.clone();
// Remove all elements below the top_n
self.buffer.truncate(self.top_n);
median_score
}
pub(crate) fn into_sorted_vec(mut self) -> Vec<ComparableDoc<Score, DocId>> {
if self.buffer.len() > self.top_n {
self.truncate_top_n();
}
self.buffer.sort_unstable();
self.buffer
}
}
#[cfg(test)]
mod tests {
use super::{TopDocs, TopNComputer};
use crate::collector::top_collector::ComparableDoc;
use super::TopDocs;
use crate::collector::Collector;
use crate::query::{AllQuery, Query, QueryParser};
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
@@ -826,78 +767,6 @@ mod tests {
}
}
#[test]
fn test_empty_topn_computer() {
let mut computer: TopNComputer<u32, u32> = TopNComputer::new(0);
computer.push(ComparableDoc {
feature: 1u32,
doc: 1u32,
});
computer.push(ComparableDoc {
feature: 1u32,
doc: 2u32,
});
computer.push(ComparableDoc {
feature: 1u32,
doc: 3u32,
});
assert!(computer.into_sorted_vec().is_empty());
}
#[test]
fn test_topn_computer() {
let mut computer: TopNComputer<u32, u32> = TopNComputer::new(2);
computer.push(ComparableDoc {
feature: 1u32,
doc: 1u32,
});
computer.push(ComparableDoc {
feature: 2u32,
doc: 2u32,
});
computer.push(ComparableDoc {
feature: 3u32,
doc: 3u32,
});
computer.push(ComparableDoc {
feature: 2u32,
doc: 4u32,
});
computer.push(ComparableDoc {
feature: 1u32,
doc: 5u32,
});
assert_eq!(
computer.into_sorted_vec(),
&[
ComparableDoc {
feature: 3u32,
doc: 3u32,
},
ComparableDoc {
feature: 2u32,
doc: 2u32,
}
]
);
}
#[test]
fn test_topn_computer_no_panic() {
for top_n in 0..10 {
let mut computer: TopNComputer<u32, u32> = TopNComputer::new(top_n);
for _ in 0..1 + top_n * 2 {
computer.push(ComparableDoc {
feature: 1u32,
doc: 1u32,
});
}
let _vals = computer.into_sorted_vec();
}
}
#[test]
fn test_top_collector_not_at_capacity_without_offset() -> crate::Result<()> {
let index = make_index()?;
@@ -983,25 +852,20 @@ mod tests {
// using AllQuery to get a constant score
let searcher = index.reader().unwrap().searcher();
let page_0 = searcher.search(&AllQuery, &TopDocs::with_limit(1)).unwrap();
let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap();
let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap();
// precondition for the test to be meaningful: we did get documents
// with the same score
assert!(page_0.iter().all(|result| result.0 == page_1[0].0));
assert!(page_1.iter().all(|result| result.0 == page_1[0].0));
assert!(page_2.iter().all(|result| result.0 == page_2[0].0));
// sanity check since we're relying on make_index()
assert_eq!(page_0.len(), 1);
assert_eq!(page_1.len(), 2);
assert_eq!(page_2.len(), 3);
assert_eq!(page_1, &page_2[..page_1.len()]);
assert_eq!(page_0, &page_2[..page_0.len()]);
}
#[test]
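The removed TopNComputer above replaces the heap with a buffer of capacity 2 * K: once the buffer is full, select_nth_unstable partitions the K best entries to the front in linear time, the buffer is cut back to K, and the K-th value becomes a threshold that lets later, weaker candidates be rejected without touching the buffer, which is where the O(N + K) bound comes from. A simplified sketch of that idea over plain scores; the real code also carries doc ids and writes into spare capacity:

fn top_k(scores: impl IntoIterator<Item = u32>, k: usize) -> Vec<u32> {
    let mut buffer: Vec<u32> = Vec::with_capacity(k.max(1) * 2);
    let mut threshold: Option<u32> = None;
    for score in scores {
        if let Some(t) = threshold {
            if score < t {
                continue; // provably not in the top k
            }
        }
        if buffer.len() == buffer.capacity() {
            // Partition so that buffer[..k] holds the k largest seen so far;
            // the element landing at index k becomes the new threshold.
            let (_, kth, _) = buffer.select_nth_unstable_by(k, |a, b| b.cmp(a));
            threshold = Some(*kth);
            buffer.truncate(k);
        }
        buffer.push(score);
    }
    buffer.sort_unstable_by(|a, b| b.cmp(a));
    buffer.truncate(k);
    buffer
}

fn main() {
    assert_eq!(top_k([5, 1, 9, 3, 7, 8, 2], 3), vec![9, 8, 7]);
}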

View File

@@ -19,7 +19,6 @@ use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::document::Document;
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::IndexWriter;
@@ -185,11 +184,11 @@ impl IndexBuilder {
///
/// It expects an originally empty directory, and will not run any GC operation.
#[doc(hidden)]
pub fn single_segment_index_writer<D: Document>(
pub fn single_segment_index_writer(
self,
dir: impl Into<Box<dyn Directory>>,
mem_budget: usize,
) -> crate::Result<SingleSegmentIndexWriter<D>> {
) -> crate::Result<SingleSegmentIndexWriter> {
let index = self.create(dir)?;
let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
Ok(index_simple_writer)
@@ -532,11 +531,11 @@ impl Index {
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
/// If the memory arena per thread is too small or too big, returns
/// `TantivyError::InvalidArgument`
pub fn writer_with_num_threads<D: Document>(
pub fn writer_with_num_threads(
&self,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter<D>> {
) -> crate::Result<IndexWriter> {
let directory_lock = self
.directory
.acquire_lock(&INDEX_WRITER_LOCK)
@@ -565,7 +564,7 @@ impl Index {
/// That index writer only simply has a single thread and a memory budget of 15 MB.
/// Using a single thread gives us a deterministic allocation of DocId.
#[cfg(test)]
pub fn writer_for_tests<D: Document>(&self) -> crate::Result<IndexWriter<D>> {
pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
self.writer_with_num_threads(1, 15_000_000)
}
@@ -580,10 +579,7 @@ impl Index {
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// If the memory arena per thread is too small or too big, returns
/// `TantivyError::InvalidArgument`
pub fn writer<D: Document>(
&self,
memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter<D>> {
pub fn writer(&self, memory_budget_in_bytes: usize) -> crate::Result<IndexWriter> {
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads;
if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {

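The hunk above ends on the per-thread budget check; the surrounding arithmetic is the total budget divided by the thread count. The branch body is not shown here, so the fallback below is illustrative only: the 15_000_000 minimum matches the budget used by writer_for_tests elsewhere in this diff, and the cap of 8 threads is an assumption.

const MEMORY_BUDGET_NUM_BYTES_MIN: usize = 15_000_000; // assumed per-thread minimum
const MAX_NUM_THREAD: usize = 8; // assumed cap on indexing threads

fn num_indexing_threads(memory_budget_in_bytes: usize, available_cpus: usize) -> usize {
    let num_threads = available_cpus.min(MAX_NUM_THREAD).max(1);
    if memory_budget_in_bytes / num_threads < MEMORY_BUDGET_NUM_BYTES_MIN {
        // Not enough budget for that many threads: drop to however many threads
        // can each receive the minimum, keeping at least one.
        (memory_budget_in_bytes / MEMORY_BUDGET_NUM_BYTES_MIN).max(1)
    } else {
        num_threads
    }
}

fn main() {
    // 60 MB over 8 CPUs would leave 7.5 MB per thread, below the assumed minimum,
    // so only 4 indexing threads would be used.
    assert_eq!(num_indexing_threads(60_000_000, 8), 4);
    assert_eq!(num_indexing_threads(200_000_000, 8), 8);
}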
View File

@@ -1,12 +1,11 @@
use std::io;
use common::BinarySerializable;
use fnv::FnvHashSet;
use crate::directory::FileSlice;
use crate::positions::PositionReader;
use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo};
use crate::schema::{IndexRecordOption, Term, Type, JSON_END_OF_PATH};
use crate::schema::{IndexRecordOption, Term};
use crate::termdict::TermDictionary;
/// The inverted index reader is in charge of accessing
@@ -70,28 +69,6 @@ impl InvertedIndexReader {
&self.termdict
}
/// Return the fields and types encoded in the dictionary in lexicographic order.
/// Only valid on JSON fields.
///
/// Notice: This requires a full scan and is therefore **very expensive**.
/// TODO: Move to sstable to use the index.
pub fn list_fields(&self) -> io::Result<Vec<(String, Type)>> {
let mut stream = self.termdict.stream()?;
let mut fields = Vec::new();
let mut fields_set = FnvHashSet::default();
while let Some((term, _term_info)) = stream.next() {
if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) {
if !fields_set.contains(&term[..index + 2]) {
fields_set.insert(term[..index + 2].to_vec());
let typ = Type::from_code(term[index + 1]).unwrap();
fields.push((String::from_utf8_lossy(&term[..index]).to_string(), typ));
}
}
}
Ok(fields)
}
/// Resets the block segment to another position of the postings
/// file.
///

View File

@@ -5,7 +5,6 @@ use rustc_hash::FxHashMap;
use crate::fastfield::FastValue;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::document::{ReferenceValue, Value};
use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{Field, Type, DATE_TIME_PRECISION_INDEXED};
use crate::time::format_description::well_known::Rfc3339;
@@ -65,9 +64,9 @@ impl IndexingPositionsPerPath {
}
}
pub(crate) fn index_json_values<'a, V: Value<'a>>(
pub(crate) fn index_json_values<'a>(
doc: DocId,
json_visitors: impl Iterator<Item = crate::Result<V::ObjectIter>>,
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
text_analyzer: &mut TextAnalyzer,
expand_dots_enabled: bool,
term_buffer: &mut Term,
@@ -76,11 +75,11 @@ pub(crate) fn index_json_values<'a, V: Value<'a>>(
) -> crate::Result<()> {
let mut json_term_writer = JsonTermWriter::wrap(term_buffer, expand_dots_enabled);
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
for json_visitor_res in json_visitors {
let json_visitor = json_visitor_res?;
index_json_object::<V>(
for json_value_res in json_values {
let json_value = json_value_res?;
index_json_object(
doc,
json_visitor,
json_value,
text_analyzer,
&mut json_term_writer,
postings_writer,
@@ -91,20 +90,20 @@ pub(crate) fn index_json_values<'a, V: Value<'a>>(
Ok(())
}
fn index_json_object<'a, V: Value<'a>>(
fn index_json_object(
doc: DocId,
json_visitor: V::ObjectIter,
json_value: &serde_json::Map<String, serde_json::Value>,
text_analyzer: &mut TextAnalyzer,
json_term_writer: &mut JsonTermWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
positions_per_path: &mut IndexingPositionsPerPath,
) {
for (json_path_segment, json_value_visitor) in json_visitor {
for (json_path_segment, json_value) in json_value {
json_term_writer.push_path_segment(json_path_segment);
index_json_value(
doc,
json_value_visitor,
json_value,
text_analyzer,
json_term_writer,
postings_writer,
@@ -115,65 +114,52 @@ fn index_json_object<'a, V: Value<'a>>(
}
}
fn index_json_value<'a, V: Value<'a>>(
fn index_json_value(
doc: DocId,
json_value: V,
json_value: &serde_json::Value,
text_analyzer: &mut TextAnalyzer,
json_term_writer: &mut JsonTermWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
positions_per_path: &mut IndexingPositionsPerPath,
) {
match json_value.as_value() {
ReferenceValue::Null => {}
ReferenceValue::Str(val) => {
let mut token_stream = text_analyzer.token_stream(val);
// TODO: make sure the chain position works out.
json_term_writer.close_path_and_set_type(Type::Str);
let indexing_position = positions_per_path.get_position(json_term_writer.term());
postings_writer.index_text(
doc,
&mut *token_stream,
json_term_writer.term_buffer,
ctx,
indexing_position,
);
}
ReferenceValue::U64(val) => {
json_term_writer.set_fast_value(val);
match json_value {
serde_json::Value::Null => {}
serde_json::Value::Bool(val_bool) => {
json_term_writer.set_fast_value(*val_bool);
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
ReferenceValue::I64(val) => {
json_term_writer.set_fast_value(val);
serde_json::Value::Number(number) => {
if let Some(number_i64) = number.as_i64() {
json_term_writer.set_fast_value(number_i64);
} else if let Some(number_u64) = number.as_u64() {
json_term_writer.set_fast_value(number_u64);
} else if let Some(number_f64) = number.as_f64() {
json_term_writer.set_fast_value(number_f64);
}
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
ReferenceValue::F64(val) => {
json_term_writer.set_fast_value(val);
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
ReferenceValue::Bool(val) => {
json_term_writer.set_fast_value(val);
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
ReferenceValue::Facet(_) => {
unimplemented!("Facet support in dynamic fields is not yet implemented")
}
ReferenceValue::IpAddr(_) => {
unimplemented!("IP address support in dynamic fields is not yet implemented")
}
ReferenceValue::Date(val) => {
json_term_writer.set_fast_value(val);
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
ReferenceValue::PreTokStr(_) => {
unimplemented!("Pre-tokenized string support in dynamic fields is not yet implemented")
}
ReferenceValue::Bytes(_) => {
unimplemented!("Bytes support in dynamic fields is not yet implemented")
}
ReferenceValue::Array(elements) => {
for val in elements {
serde_json::Value::String(text) => match infer_type_from_str(text) {
TextOrDateTime::Text(text) => {
let mut token_stream = text_analyzer.token_stream(text);
// TODO make sure the chain position works out.
json_term_writer.close_path_and_set_type(Type::Str);
let indexing_position = positions_per_path.get_position(json_term_writer.term());
postings_writer.index_text(
doc,
&mut *token_stream,
json_term_writer.term_buffer,
ctx,
indexing_position,
);
}
TextOrDateTime::DateTime(dt) => {
json_term_writer.set_fast_value(DateTime::from_utc(dt));
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
},
serde_json::Value::Array(arr) => {
for val in arr {
index_json_value(
doc,
val,
@@ -185,10 +171,10 @@ fn index_json_value<'a, V: Value<'a>>(
);
}
}
ReferenceValue::Object(object) => {
index_json_object::<V>(
serde_json::Value::Object(map) => {
index_json_object(
doc,
object,
map,
text_analyzer,
json_term_writer,
postings_writer,
@@ -199,6 +185,21 @@ fn index_json_value<'a, V: Value<'a>>(
}
}
enum TextOrDateTime<'a> {
Text(&'a str),
DateTime(OffsetDateTime),
}
fn infer_type_from_str(text: &str) -> TextOrDateTime {
match OffsetDateTime::parse(text, &Rfc3339) {
Ok(dt) => {
let dt_utc = dt.to_offset(UtcOffset::UTC);
TextOrDateTime::DateTime(dt_utc)
}
Err(_) => TextOrDateTime::Text(text),
}
}
// Tries to infer a JSON type from a string.
pub fn convert_to_fast_value_and_get_term(
json_term_writer: &mut JsonTermWriter,

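A compact, self-contained version of the string inference introduced above, mainly to show which JSON strings end up indexed as dates. It assumes the time crate with parsing enabled; the sample inputs are illustrative:

use time::format_description::well_known::Rfc3339;
use time::{OffsetDateTime, UtcOffset};

enum TextOrDateTime<'a> {
    Text(&'a str),
    DateTime(OffsetDateTime),
}

fn infer_type_from_str(text: &str) -> TextOrDateTime<'_> {
    // Only strict RFC 3339 strings are treated as dates; everything else stays text.
    match OffsetDateTime::parse(text, &Rfc3339) {
        Ok(dt) => TextOrDateTime::DateTime(dt.to_offset(UtcOffset::UTC)),
        Err(_) => TextOrDateTime::Text(text),
    }
}

fn main() {
    assert!(matches!(
        infer_type_from_str("2015-01-01T00:00:00Z"),
        TextOrDateTime::DateTime(_)
    ));
    assert!(matches!(infer_type_from_str("red"), TextOrDateTime::Text(_)));
    // A bare date without a time component is not RFC 3339, so it stays text.
    assert!(matches!(infer_type_from_str("2015-01-01"), TextOrDateTime::Text(_)));
}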
View File

@@ -5,8 +5,7 @@ use std::{fmt, io};
use crate::collector::Collector;
use crate::core::{Executor, SegmentReader};
use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
use crate::schema::document::{Document, DocumentDeserialize};
use crate::schema::{Schema, Term};
use crate::schema::{Document, Schema, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::{CacheStats, StoreReader};
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject};
@@ -84,7 +83,7 @@ impl Searcher {
///
/// The searcher uses the segment ordinal to route the
/// request to the right `Segment`.
pub fn doc<D: DocumentDeserialize>(&self, doc_address: DocAddress) -> crate::Result<D> {
pub fn doc(&self, doc_address: DocAddress) -> crate::Result<Document> {
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get(doc_address.doc_id)
}
@@ -104,10 +103,7 @@ impl Searcher {
/// Fetches a document in an asynchronous manner.
#[cfg(feature = "quickwit")]
pub async fn doc_async<D: DocumentDeserialize>(
&self,
doc_address: DocAddress,
) -> crate::Result<D> {
pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result<Document> {
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get_async(doc_address.doc_id).await
}

View File

@@ -355,7 +355,7 @@ impl fmt::Debug for SegmentReader {
mod test {
use crate::core::Index;
use crate::schema::{Schema, Term, STORED, TEXT};
use crate::{DocId, IndexWriter};
use crate::DocId;
#[test]
fn test_num_alive() -> crate::Result<()> {
@@ -366,7 +366,7 @@ mod test {
let name = schema.get_field("name").unwrap();
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?;
index_writer.add_document(doc!(name => "horse"))?;
index_writer.add_document(doc!(name => "jockey"))?;
@@ -392,7 +392,7 @@ mod test {
let name = schema.get_field("name").unwrap();
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?;
index_writer.add_document(doc!(name => "horse"))?;
index_writer.add_document(doc!(name => "jockey"))?;
@@ -402,7 +402,7 @@ mod test {
}
{
let mut index_writer2: IndexWriter = index.writer(50_000_000)?;
let mut index_writer2 = index.writer(50_000_000)?;
index_writer2.delete_term(Term::from_field_text(name, "horse"));
index_writer2.delete_term(Term::from_field_text(name, "cap"));

View File

@@ -1,20 +1,16 @@
use std::marker::PhantomData;
use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter;
use crate::schema::document::Document;
use crate::{Directory, Index, IndexMeta, Opstamp, Segment, TantivyDocument};
use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
#[doc(hidden)]
pub struct SingleSegmentIndexWriter<D: Document = TantivyDocument> {
pub struct SingleSegmentIndexWriter {
segment_writer: SegmentWriter,
segment: Segment,
opstamp: Opstamp,
_phantom: PhantomData<D>,
}
impl<D: Document> SingleSegmentIndexWriter<D> {
impl SingleSegmentIndexWriter {
pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
let segment = index.new_segment();
let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
@@ -22,7 +18,6 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
segment_writer,
segment,
opstamp: 0,
_phantom: PhantomData,
})
}
@@ -30,7 +25,7 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
self.segment_writer.mem_usage()
}
pub fn add_document(&mut self, document: D) -> crate::Result<()> {
pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
let opstamp = self.opstamp;
self.opstamp += 1;
self.segment_writer

View File

@@ -5,8 +5,8 @@ use crate::query::TermQuery;
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
use crate::tokenizer::TokenizerManager;
use crate::{
Directory, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy,
SegmentId, TantivyDocument, Term,
Directory, Document, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, SegmentId,
Term,
};
#[test]
@@ -159,7 +159,7 @@ mod mmap_specific {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let mut index = Index::create_from_tempdir(schema)?;
let mut writer: IndexWriter = index.writer_for_tests()?;
let mut writer = index.writer_for_tests()?;
writer.commit()?;
let reader = index
.reader_builder()
@@ -208,7 +208,7 @@ fn test_index_on_commit_reload_policy_aux(
.watch(WatchCallback::new(move || {
let _ = sender.send(());
}));
let mut writer: IndexWriter = index.writer_for_tests()?;
let mut writer = index.writer_for_tests()?;
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64))?;
writer.commit().unwrap();
@@ -242,7 +242,7 @@ fn garbage_collect_works_as_intended() -> crate::Result<()> {
let field = schema.get_field("num_likes").unwrap();
let index = Index::create(directory.clone(), schema, IndexSettings::default())?;
let mut writer: IndexWriter = index.writer_with_num_threads(1, 32_000_000).unwrap();
let mut writer = index.writer_with_num_threads(1, 32_000_000).unwrap();
for _seg in 0..8 {
for i in 0u64..1_000u64 {
writer.add_document(doc!(field => i))?;
@@ -306,7 +306,7 @@ fn test_merging_segment_update_docfreq() {
let id_field = schema_builder.add_text_field("id", STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let mut writer = index.writer_for_tests().unwrap();
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..5 {
writer.add_document(doc!(text_field=>"hello")).unwrap();
@@ -317,13 +317,13 @@ fn test_merging_segment_update_docfreq() {
writer
.add_document(doc!(text_field=>"hello", id_field=>"TO_BE_DELETED"))
.unwrap();
writer.add_document(TantivyDocument::default()).unwrap();
writer.add_document(Document::default()).unwrap();
writer.commit().unwrap();
for _ in 0..7 {
writer.add_document(doc!(text_field=>"hello")).unwrap();
}
writer.add_document(TantivyDocument::default()).unwrap();
writer.add_document(TantivyDocument::default()).unwrap();
writer.add_document(Document::default()).unwrap();
writer.add_document(Document::default()).unwrap();
writer.delete_term(Term::from_field_text(id_field, "TO_BE_DELETED"));
writer.commit().unwrap();

View File

@@ -1,7 +1,7 @@
use std::collections::HashMap;
use std::fmt;
use std::fs::{self, File, OpenOptions};
use std::io::{self, BufWriter, Read, Write};
use std::io::{self, BufWriter, Read, Seek, Write};
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock, Weak};
@@ -328,6 +328,12 @@ impl Write for SafeFileWriter {
}
}
impl Seek for SafeFileWriter {
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
self.0.seek(pos)
}
}
impl TerminatingWrite for SafeFileWriter {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
self.0.flush()?;
@@ -533,7 +539,7 @@ mod tests {
use super::*;
use crate::indexer::LogMergePolicy;
use crate::schema::{Schema, SchemaBuilder, TEXT};
use crate::{Index, IndexSettings, IndexWriter, ReloadPolicy};
use crate::{Index, IndexSettings, ReloadPolicy};
#[test]
fn test_open_non_existent_path() {
@@ -645,7 +651,7 @@ mod tests {
let index =
Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_num_segments(3);
index_writer.set_merge_policy(Box::new(log_merge_policy));

View File

@@ -1,5 +1,5 @@
use std::collections::HashMap;
use std::io::{self, BufWriter, Cursor, Write};
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock};
use std::{fmt, result};
@@ -48,6 +48,12 @@ impl Drop for VecWriter {
}
}
impl Seek for VecWriter {
fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
self.data.seek(pos)
}
}
impl Write for VecWriter {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.is_flushed = false;

View File

@@ -11,7 +11,6 @@ use crate::directory::error::{
Incompatibility, LockError, OpenDirectoryError, OpenReadError, OpenWriteError,
};
use crate::fastfield::FastFieldNotAvailableError;
use crate::schema::document::DeserializeError;
use crate::{query, schema};
/// Represents a `DataCorruption` error.
@@ -107,9 +106,6 @@ pub enum TantivyError {
/// e.g. a data structure is incorrectly initialized.
#[error("Internal error: '{0}'")]
InternalError(String),
#[error("Deserialize error: {0}")]
/// An error occurred while attempting to deserialize a document.
DeserializeError(DeserializeError),
}
impl From<io::Error> for TantivyError {
@@ -180,9 +176,3 @@ impl From<rayon::ThreadPoolBuildError> for TantivyError {
TantivyError::SystemError(error.to_string())
}
}
impl From<DeserializeError> for TantivyError {
fn from(error: DeserializeError) -> TantivyError {
TantivyError::DeserializeError(error)
}
}

View File

@@ -62,9 +62,8 @@ impl FacetReader {
#[cfg(test)]
mod tests {
use crate::schema::document::Value;
use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED};
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED};
use crate::{DocAddress, Document, Index};
#[test]
fn test_facet_only_indexed() {
@@ -72,7 +71,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))
.unwrap();
@@ -86,10 +85,8 @@ mod tests {
let mut facet = Facet::default();
facet_reader.facet_from_ord(0, &mut facet).unwrap();
assert_eq!(facet.to_path_string(), "/a/b");
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))
.unwrap();
let value = doc.get_first(facet_field).and_then(|v| v.as_facet());
let doc = searcher.doc(DocAddress::new(0u32, 0u32)).unwrap();
let value = doc.get_first(facet_field).and_then(Value::as_facet);
assert_eq!(value, None);
}
@@ -99,7 +96,7 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(facet_field=>Facet::from_text("/parent/child1").unwrap()))
.unwrap();
@@ -145,8 +142,8 @@ mod tests {
let mut facet_ords = Vec::new();
facet_ords.extend(facet_reader.facet_ords(0u32));
assert_eq!(&facet_ords, &[0u64]);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0u32, 0u32))?;
let value: Option<&Facet> = doc.get_first(facet_field).and_then(|v| v.as_facet());
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::as_facet);
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
Ok(())
}
@@ -159,7 +156,7 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.add_document(Document::default())?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
@@ -179,8 +176,8 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.add_document(TantivyDocument::default())?;
index_writer.add_document(Document::default())?;
index_writer.add_document(Document::default())?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();

View File

@@ -90,12 +90,12 @@ mod tests {
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::{
Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, TantivyDocument,
Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder,
TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::time::OffsetDateTime;
use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
use crate::{DateOptions, DateTimePrecision, Index, IndexWriter, SegmentId, SegmentReader};
use crate::{DateOptions, DateTimePrecision, Index, SegmentId, SegmentReader};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();
@@ -271,7 +271,7 @@ mod tests {
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
for i in -100i64..10_000i64 {
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
doc.add_i64(i64_field, i);
fast_field_writers.add_document(&doc).unwrap();
}
@@ -312,7 +312,7 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default();
let doc = Document::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
write.terminate().unwrap();
@@ -345,7 +345,7 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default();
let doc = Document::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
write.terminate().unwrap();
@@ -416,7 +416,7 @@ mod tests {
let date_field = schema_builder.add_date_field("date", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer
.add_document(doc!(date_field => DateTime::from_utc(OffsetDateTime::now_utc())))
@@ -452,7 +452,7 @@ mod tests {
{
// first segment
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer
.add_document(doc!(
@@ -506,7 +506,7 @@ mod tests {
{
// second segment
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(
@@ -537,7 +537,7 @@ mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.merge(&segment_ids).wait().unwrap();
index_writer.wait_merging_threads().unwrap();
}
@@ -662,7 +662,7 @@ mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -824,7 +824,7 @@ mod tests {
{
let mut write: WritePtr = directory.open_write(path).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default();
let doc = Document::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
write.terminate().unwrap();
@@ -846,7 +846,7 @@ mod tests {
assert_eq!(col.get_val(0), true);
}
fn get_index(docs: &[crate::TantivyDocument], schema: &Schema) -> crate::Result<RamDirectory> {
fn get_index(docs: &[crate::Document], schema: &Schema) -> crate::Result<RamDirectory> {
let directory: RamDirectory = RamDirectory::create();
{
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
@@ -888,7 +888,7 @@ mod tests {
let field = schema_builder.add_date_field("field", date_options);
let schema = schema_builder.build();
let docs: Vec<TantivyDocument> = times.iter().map(|time| doc!(field=>*time)).collect();
let docs: Vec<Document> = times.iter().map(|time| doc!(field=>*time)).collect();
let directory = get_index(&docs[..], &schema).unwrap();
let path = Path::new("test");
@@ -962,15 +962,11 @@ mod tests {
let ip_field = schema_builder.add_u64_field("ip", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let ip_addr = Ipv6Addr::new(1, 2, 3, 4, 5, 1, 2, 3);
index_writer
.add_document(TantivyDocument::default())
.unwrap();
index_writer.add_document(Document::default()).unwrap();
index_writer.add_document(doc!(ip_field=>ip_addr)).unwrap();
index_writer
.add_document(TantivyDocument::default())
.unwrap();
index_writer.add_document(Document::default()).unwrap();
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let fastfields = searcher.segment_reader(0u32).fast_fields();
@@ -1090,7 +1086,7 @@ mod tests {
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"attr.age": 32})))
.unwrap();
@@ -1116,7 +1112,7 @@ mod tests {
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"age": 32})))
.unwrap();
@@ -1143,7 +1139,7 @@ mod tests {
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"attr.age": 32})))
.unwrap();
@@ -1166,7 +1162,7 @@ mod tests {
let field_with_dot = schema_builder.add_i64_field("field.with.dot", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(field_with_dot => 32i64))
.unwrap();
@@ -1188,7 +1184,7 @@ mod tests {
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json_field=> json!({"attr": {"age": 32}}), shadowing_json_field=>json!({"age": 33})))
.unwrap();
@@ -1219,7 +1215,7 @@ mod tests {
let mut index = Index::create_in_ram(schema);
index.set_fast_field_tokenizers(ff_tokenizer_manager);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(text_field => "Test1 test2"))
.unwrap();
@@ -1248,7 +1244,7 @@ mod tests {
let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(log_field => "info"))
.unwrap();
@@ -1281,7 +1277,7 @@ mod tests {
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json_field=> json!({"attr.age": 32}), shadowing_json_field=>json!({"age": 33})))
.unwrap();

View File

@@ -234,22 +234,6 @@ impl FastFieldReaders {
Ok(dynamic_column_handle_opt)
}
/// Returns all `dynamic_column_handle`s.
pub fn dynamic_column_handles(
&self,
field_name: &str,
) -> crate::Result<Vec<DynamicColumnHandle>> {
let Some(resolved_field_name) = self.resolve_field(field_name)? else {
return Ok(Vec::new());
};
let dynamic_column_handles = self
.columnar
.read_columns(&resolved_field_name)?
.into_iter()
.collect();
Ok(dynamic_column_handles)
}
#[doc(hidden)]
pub async fn list_dynamic_column_handles(
&self,
@@ -354,10 +338,8 @@ impl FastFieldReaders {
#[cfg(test)]
mod tests {
use columnar::ColumnType;
use crate::schema::{JsonObjectOptions, Schema, FAST};
use crate::{Index, IndexWriter, TantivyDocument};
use crate::{Document, Index};
#[test]
fn test_fast_field_reader_resolve_with_dynamic_internal() {
@@ -373,10 +355,8 @@ mod tests {
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(TantivyDocument::default())
.unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(Document::default()).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
@@ -437,45 +417,4 @@ mod tests {
Some("_dyna\u{1}notinschema\u{1}attr\u{1}color".to_string())
);
}
#[test]
fn test_fast_field_reader_dynamic_column_handles() {
let mut schema_builder = Schema::builder();
let id = schema_builder.add_u64_field("id", FAST);
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(id=> 1u64, json => json!({"foo": 42})))
.unwrap();
index_writer
.add_document(doc!(id=> 2u64, json => json!({"foo": true})))
.unwrap();
index_writer
.add_document(doc!(id=> 3u64, json => json!({"foo": "bar"})))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let reader = searcher.segment_reader(0u32);
let fast_fields = reader.fast_fields();
let id_columns = fast_fields.dynamic_column_handles("id").unwrap();
assert_eq!(id_columns.len(), 1);
assert_eq!(id_columns.first().unwrap().column_type(), ColumnType::U64);
let foo_columns = fast_fields.dynamic_column_handles("json.foo").unwrap();
assert_eq!(foo_columns.len(), 3);
assert!(foo_columns
.iter()
.any(|column| column.column_type() == ColumnType::I64));
assert!(foo_columns
.iter()
.any(|column| column.column_type() == ColumnType::Bool));
assert!(foo_columns
.iter()
.any(|column| column.column_type() == ColumnType::Str));
println!("*** {:?}", fast_fields.columnar().list_columns());
}
}

View File

@@ -5,9 +5,8 @@ use common::replace_in_place;
use tokenizer_api::Token;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::document::{Document, ReferenceValue, Value};
use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
use crate::schema::{value_type_to_column_type, Document, FieldType, Schema, Type, Value};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{DateTimePrecision, DocId, TantivyError};
@@ -118,115 +117,114 @@ impl FastFieldsWriter {
}
/// Indexes all of the fastfields of a new document.
pub fn add_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
let doc_id = self.num_docs;
for (field, value) in doc.iter_fields_and_values() {
let value_access = value as D::Value<'_>;
for field_value in doc.field_values() {
if let Some(field_name) =
&self.fast_field_names[field_value.field().field_id() as usize]
{
match &field_value.value {
Value::U64(u64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*u64_val),
);
}
Value::I64(i64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*i64_val),
);
}
Value::F64(f64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*f64_val),
);
}
Value::Str(text_val) => {
if let Some(tokenizer) =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize]
{
let mut token_stream = tokenizer.token_stream(text_val);
token_stream.process(&mut |token: &Token| {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
&token.text,
);
})
} else {
self.columnar_writer
.record_str(doc_id, field_name.as_str(), text_val);
}
}
Value::Bytes(bytes_val) => {
self.columnar_writer
.record_bytes(doc_id, field_name.as_str(), bytes_val);
}
Value::PreTokStr(pre_tok) => {
for token in &pre_tok.tokens {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
&token.text,
);
}
}
Value::Bool(bool_val) => {
self.columnar_writer
.record_bool(doc_id, field_name.as_str(), *bool_val);
}
Value::Date(datetime) => {
let date_precision =
self.date_precisions[field_value.field().field_id() as usize];
let truncated_datetime = datetime.truncate(date_precision);
self.columnar_writer.record_datetime(
doc_id,
field_name.as_str(),
truncated_datetime,
);
}
Value::Facet(facet) => {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
facet.encoded_str(),
);
}
Value::JsonObject(json_obj) => {
let expand_dots = self.expand_dots[field_value.field().field_id() as usize];
self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name);
self.add_doc_value(doc_id, field, value_access)?;
let text_analyzer =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
Value::IpAddr(ip_addr) => {
self.columnar_writer
.record_ip_addr(doc_id, field_name.as_str(), *ip_addr);
}
}
}
}
self.num_docs += 1;
Ok(())
}
fn add_doc_value<'a, V: Value<'a>>(
&mut self,
doc_id: DocId,
field: Field,
value: V,
) -> crate::Result<()> {
let field_name = match &self.fast_field_names[field.field_id() as usize] {
None => return Ok(()),
Some(name) => name,
};
match value.as_value() {
ReferenceValue::Null => {}
ReferenceValue::Str(val) => {
if let Some(tokenizer) = &mut self.per_field_tokenizer[field.field_id() as usize] {
let mut token_stream = tokenizer.token_stream(val);
token_stream.process(&mut |token: &Token| {
self.columnar_writer
.record_str(doc_id, field_name, &token.text);
})
} else {
self.columnar_writer.record_str(doc_id, field_name, val);
}
}
ReferenceValue::U64(val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name,
NumericalValue::from(val),
);
}
ReferenceValue::I64(val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name,
NumericalValue::from(val),
);
}
ReferenceValue::F64(val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name,
NumericalValue::from(val),
);
}
ReferenceValue::Date(val) => {
let date_precision = self.date_precisions[field.field_id() as usize];
let truncated_datetime = val.truncate(date_precision);
self.columnar_writer
.record_datetime(doc_id, field_name, truncated_datetime);
}
ReferenceValue::Facet(val) => {
self.columnar_writer
.record_str(doc_id, field_name, val.encoded_str());
}
ReferenceValue::Bytes(val) => {
self.columnar_writer.record_bytes(doc_id, field_name, val);
}
ReferenceValue::IpAddr(val) => {
self.columnar_writer.record_ip_addr(doc_id, field_name, val);
}
ReferenceValue::Bool(val) => {
self.columnar_writer.record_bool(doc_id, field_name, val);
}
ReferenceValue::PreTokStr(val) => {
for token in &val.tokens {
self.columnar_writer
.record_str(doc_id, field_name, &token.text);
}
}
ReferenceValue::Array(val) => {
// TODO: Check this is the correct behaviour we want.
for value in val {
self.add_doc_value(doc_id, field, value)?;
}
}
ReferenceValue::Object(val) => {
let expand_dots = self.expand_dots[field.field_id() as usize];
self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name);
let text_analyzer = &mut self.per_field_tokenizer[field.field_id() as usize];
record_json_obj_to_columnar_writer::<V>(
doc_id,
val,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
}
Ok(())
}
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(
@@ -243,16 +241,31 @@ impl FastFieldsWriter {
}
}
fn record_json_obj_to_columnar_writer<'a, V: Value<'a>>(
#[inline]
fn columnar_numerical_value(json_number: &serde_json::Number) -> Option<NumericalValue> {
if let Some(num_i64) = json_number.as_i64() {
return Some(num_i64.into());
}
if let Some(num_u64) = json_number.as_u64() {
return Some(num_u64.into());
}
if let Some(num_f64) = json_number.as_f64() {
return Some(num_f64.into());
}
// This can happen with arbitrary precision.... but we do not handle it.
None
}
fn record_json_obj_to_columnar_writer(
doc: DocId,
json_visitor: V::ObjectIter,
json_obj: &serde_json::Map<String, serde_json::Value>,
expand_dots: bool,
remaining_depth_limit: usize,
json_path_buffer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &mut Option<TextAnalyzer>,
) {
for (key, child) in json_visitor {
for (key, child) in json_obj {
let len_path = json_path_buffer.len();
if !json_path_buffer.is_empty() {
json_path_buffer.push_str(JSON_PATH_SEGMENT_SEP_STR);
@@ -282,9 +295,9 @@ fn record_json_obj_to_columnar_writer<'a, V: Value<'a>>(
}
}
fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
fn record_json_value_to_columnar_writer(
doc: DocId,
json_val: V,
json_val: &serde_json::Value,
expand_dots: bool,
mut remaining_depth_limit: usize,
json_path_writer: &mut String,
@@ -295,62 +308,30 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
return;
}
remaining_depth_limit -= 1;
match json_val.as_value() {
ReferenceValue::Null => {} // TODO: Handle null
ReferenceValue::Str(val) => {
match json_val {
serde_json::Value::Null => {
// TODO handle null
}
serde_json::Value::Bool(bool_val) => {
columnar_writer.record_bool(doc, json_path_writer, *bool_val);
}
serde_json::Value::Number(json_number) => {
if let Some(numerical_value) = columnar_numerical_value(json_number) {
columnar_writer.record_numerical(doc, json_path_writer.as_str(), numerical_value);
}
}
serde_json::Value::String(text) => {
if let Some(text_analyzer) = tokenizer.as_mut() {
let mut token_stream = text_analyzer.token_stream(val);
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
})
} else {
columnar_writer.record_str(doc, json_path_writer.as_str(), val);
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
}
}
ReferenceValue::U64(val) => {
columnar_writer.record_numerical(
doc,
json_path_writer.as_str(),
NumericalValue::from(val),
);
}
ReferenceValue::I64(val) => {
columnar_writer.record_numerical(
doc,
json_path_writer.as_str(),
NumericalValue::from(val),
);
}
ReferenceValue::F64(val) => {
columnar_writer.record_numerical(
doc,
json_path_writer.as_str(),
NumericalValue::from(val),
);
}
ReferenceValue::Bool(val) => {
columnar_writer.record_bool(doc, json_path_writer, val);
}
ReferenceValue::Date(val) => {
columnar_writer.record_datetime(doc, json_path_writer.as_str(), val);
}
ReferenceValue::Facet(_) => {
unimplemented!("Facet support in dynamic fields is not yet implemented")
}
ReferenceValue::Bytes(_) => {
// TODO: This can be re added once it is added to the JSON Utils section as well.
// columnar_writer.record_bytes(doc, json_path_writer.as_str(), val);
unimplemented!("Bytes support in dynamic fields is not yet implemented")
}
ReferenceValue::IpAddr(_) => {
unimplemented!("IP address support in dynamic fields is not yet implemented")
}
ReferenceValue::PreTokStr(_) => {
unimplemented!("Pre-tokenized string support in dynamic fields is not yet implemented")
}
ReferenceValue::Array(elements) => {
for el in elements {
serde_json::Value::Array(arr) => {
for el in arr {
record_json_value_to_columnar_writer(
doc,
el,
@@ -362,10 +343,10 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
);
}
}
ReferenceValue::Object(object) => {
record_json_obj_to_columnar_writer::<V>(
serde_json::Value::Object(json_obj) => {
record_json_obj_to_columnar_writer(
doc,
object,
json_obj,
expand_dots,
remaining_depth_limit,
json_path_writer,

View File

@@ -4,7 +4,7 @@ use rand::{thread_rng, Rng};
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::*;
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, IndexWriter, Order, Searcher};
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, Order, Searcher};
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
assert!(searcher.segment_readers().len() < 20);
@@ -12,7 +12,7 @@ fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
for segment_reader in searcher.segment_readers() {
let store_reader = segment_reader.get_store_reader(1)?;
for doc_id in 0..segment_reader.max_doc() {
let _doc: TantivyDocument = store_reader.get(doc_id)?;
let _doc = store_reader.get(doc_id)?;
}
}
Ok(())
@@ -31,8 +31,7 @@ fn test_functional_store() -> crate::Result<()> {
let mut rng = thread_rng();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(3, MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut index_writer = index.writer_with_num_threads(3, MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut doc_set: Vec<u64> = Vec::new();
@@ -92,7 +91,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
let mut rng = thread_rng();
let mut index_writer: IndexWriter = index.writer_with_num_threads(3, 120_000_000)?;
let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
@@ -115,7 +114,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = TantivyDocument::new();
let mut doc = Document::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
@@ -167,7 +166,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
let mut rng = thread_rng();
let mut index_writer: IndexWriter = index.writer_with_num_threads(3, 120_000_000)?;
let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
@@ -190,7 +189,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = TantivyDocument::new();
let mut doc = Document::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);

View File

@@ -158,7 +158,6 @@ mod tests_indexsorting {
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::document::Value;
use crate::schema::{Schema, *};
use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order};
@@ -309,16 +308,16 @@ mod tests_indexsorting {
{
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.doc(DocAddress::new(0, 0))?
.get_first(my_string_field),
None
);
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 3))?
.doc(DocAddress::new(0, 3))?
.get_first(my_string_field)
.unwrap()
.as_str(),
.as_text(),
Some("blublub")
);
}
@@ -338,13 +337,13 @@ mod tests_indexsorting {
{
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.doc(DocAddress::new(0, 0))?
.get_first(my_string_field)
.unwrap()
.as_str(),
.as_text(),
Some("blublub")
);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
let doc = searcher.doc(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(my_string_field), None);
}
// sort by field desc
@@ -361,9 +360,9 @@ mod tests_indexsorting {
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
let doc = searcher.doc(DocAddress::new(0, 4))?;
assert_eq!(
doc.get_first(my_string_field).unwrap().as_str(),
doc.get_first(my_string_field).unwrap().as_text(),
Some("blublub")
);
}

View File

@@ -20,8 +20,7 @@ use crate::indexer::operation::DeleteOperation;
use crate::indexer::stamper::Stamper;
use crate::indexer::{MergePolicy, SegmentEntry, SegmentWriter};
use crate::query::{EnableScoring, Query, TermQuery};
use crate::schema::document::Document;
use crate::schema::{IndexRecordOption, TantivyDocument, Term};
use crate::schema::{Document, IndexRecordOption, Term};
use crate::{FutureResult, Opstamp};
// Size of the margin for the `memory_arena`. A segment is closed when the remaining memory
@@ -51,7 +50,7 @@ fn error_in_index_worker_thread(context: &str) -> TantivyError {
/// indexing queue.
/// Each indexing thread builds its own independent [`Segment`], via
/// a `SegmentWriter` object.
pub struct IndexWriter<D: Document = TantivyDocument> {
pub struct IndexWriter {
// the lock is just used to bind the
// lifetime of the lock with that of the IndexWriter.
_directory_lock: Option<DirectoryLock>,
@@ -63,8 +62,8 @@ pub struct IndexWriter<D: Document = TantivyDocument> {
workers_join_handle: Vec<JoinHandle<crate::Result<()>>>,
index_writer_status: IndexWriterStatus<D>,
operation_sender: AddBatchSender<D>,
index_writer_status: IndexWriterStatus,
operation_sender: AddBatchSender,
segment_updater: SegmentUpdater,
@@ -165,10 +164,10 @@ pub(crate) fn advance_deletes(
Ok(())
}
fn index_documents<D: Document>(
fn index_documents(
memory_budget: usize,
segment: Segment,
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch<D>>,
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch>,
segment_updater: &SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> crate::Result<()> {
@@ -248,7 +247,7 @@ fn apply_deletes(
})
}
impl<D: Document> IndexWriter<D> {
impl IndexWriter {
/// Create a new index writer. Attempts to acquire a lockfile.
///
/// The lockfile should be deleted on drop, but it is possible
@@ -268,7 +267,7 @@ impl<D: Document> IndexWriter<D> {
num_threads: usize,
memory_budget_in_bytes_per_thread: usize,
directory_lock: DirectoryLock,
) -> crate::Result<Self> {
) -> crate::Result<IndexWriter> {
if memory_budget_in_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
let err_msg = format!(
"The memory arena in bytes per thread needs to be at least \
@@ -282,7 +281,7 @@ impl<D: Document> IndexWriter<D> {
);
return Err(TantivyError::InvalidArgument(err_msg));
}
let (document_sender, document_receiver) =
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new();
@@ -294,7 +293,7 @@ impl<D: Document> IndexWriter<D> {
let segment_updater =
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let mut index_writer = Self {
let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock),
memory_budget_in_bytes_per_thread,
@@ -376,7 +375,7 @@ impl<D: Document> IndexWriter<D> {
self.index.new_segment()
}
fn operation_receiver(&self) -> crate::Result<AddBatchReceiver<D>> {
fn operation_receiver(&self) -> crate::Result<AddBatchReceiver> {
self.index_writer_status
.operation_receiver()
.ok_or_else(|| {
@@ -526,7 +525,7 @@ impl<D: Document> IndexWriter<D> {
///
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) {
let (document_sender, document_receiver) =
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
self.operation_sender = document_sender;
self.index_writer_status = IndexWriterStatus::from(document_receiver);
@@ -553,7 +552,7 @@ impl<D: Document> IndexWriter<D> {
.take()
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
let new_index_writer = IndexWriter::new(
let new_index_writer: IndexWriter = IndexWriter::new(
&self.index,
self.num_threads,
self.memory_budget_in_bytes_per_thread,
@@ -599,7 +598,7 @@ impl<D: Document> IndexWriter<D> {
/// It is also possible to add a payload to the `commit`
/// using this API.
/// See [`PreparedCommit::set_payload()`].
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<D>> {
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit> {
// Here, because we join all of the worker threads,
// all of the segment update for this commit have been
// sent.
@@ -708,7 +707,7 @@ impl<D: Document> IndexWriter<D> {
/// The opstamp is an increasing `u64` that can
/// be used by the client to align commits with its own
/// document queue.
pub fn add_document(&self, document: D) -> crate::Result<Opstamp> {
pub fn add_document(&self, document: Document) -> crate::Result<Opstamp> {
let opstamp = self.stamper.stamp();
self.send_add_documents_batch(smallvec![AddOperation { opstamp, document }])?;
Ok(opstamp)
@@ -745,7 +744,7 @@ impl<D: Document> IndexWriter<D> {
/// visible to readers only after calling `commit()`.
pub fn run<I>(&self, user_operations: I) -> crate::Result<Opstamp>
where
I: IntoIterator<Item = UserOperation<D>>,
I: IntoIterator<Item = UserOperation>,
I::IntoIter: ExactSizeIterator,
{
let user_operations_it = user_operations.into_iter();
@@ -779,7 +778,7 @@ impl<D: Document> IndexWriter<D> {
Ok(batch_opstamp)
}
fn send_add_documents_batch(&self, add_ops: AddBatch<D>) -> crate::Result<()> {
fn send_add_documents_batch(&self, add_ops: AddBatch) -> crate::Result<()> {
if self.index_writer_status.is_alive() && self.operation_sender.send(add_ops).is_ok() {
Ok(())
} else {
@@ -788,7 +787,7 @@ impl<D: Document> IndexWriter<D> {
}
}
impl<D: Document> Drop for IndexWriter<D> {
impl Drop for IndexWriter {
fn drop(&mut self) {
self.segment_updater.kill();
self.drop_sender();
@@ -815,15 +814,13 @@ mod tests {
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::indexer::NoMergePolicy;
use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
use crate::schema::document::Value;
use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::{
DateTime, DocAddress, Index, IndexSettings, IndexSortByField, IndexWriter, Order,
ReloadPolicy, TantivyDocument, Term,
DateTime, DocAddress, Index, IndexSettings, IndexSortByField, Order, ReloadPolicy, Term,
};
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
@@ -855,7 +852,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(text_field => "hello1"))
.unwrap();
@@ -908,7 +905,7 @@ mod tests {
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b");
let operations = vec![
@@ -946,7 +943,7 @@ mod tests {
fn test_empty_operations_group() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let index_writer: IndexWriter = index.writer_for_tests().unwrap();
let index_writer = index.writer_for_tests().unwrap();
let operations1 = vec![];
let batch_opstamp1 = index_writer.run(operations1).unwrap();
assert_eq!(batch_opstamp1, 0u64);
@@ -959,8 +956,8 @@ mod tests {
fn test_lockfile_stops_duplicates() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer: IndexWriter = index.writer_for_tests().unwrap();
match index.writer_for_tests::<TantivyDocument>() {
let _index_writer = index.writer_for_tests().unwrap();
match index.writer_for_tests() {
Err(TantivyError::LockFailure(LockError::LockBusy, _)) => {}
_ => panic!("Expected a `LockFailure` error"),
}
@@ -970,8 +967,8 @@ mod tests {
fn test_lockfile_already_exists_error_msg() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer: IndexWriter = index.writer_for_tests().unwrap();
match index.writer_for_tests::<TantivyDocument>() {
let _index_writer = index.writer_for_tests().unwrap();
match index.writer_for_tests() {
Err(err) => {
let err_msg = err.to_string();
assert!(err_msg.contains("already an `IndexWriter`"));
@@ -984,7 +981,7 @@ mod tests {
fn test_set_merge_policy() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let index_writer: IndexWriter = index.writer_for_tests().unwrap();
let index_writer = index.writer_for_tests().unwrap();
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, \
@@ -1003,11 +1000,11 @@ mod tests {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
{
let _index_writer: IndexWriter = index.writer_for_tests().unwrap();
let _index_writer = index.writer_for_tests().unwrap();
// the lock should be released when the
// index_writer leaves the scope.
}
let _index_writer_two: IndexWriter = index.writer_for_tests().unwrap();
let _index_writer_two = index.writer_for_tests().unwrap();
}
#[test]
@@ -1059,7 +1056,7 @@ mod tests {
reader.searcher().doc_freq(&term_a).unwrap()
};
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?;
// this should create 1 segment
@@ -1099,7 +1096,7 @@ mod tests {
reader.searcher().doc_freq(&term_a).unwrap()
};
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"))?;
@@ -1385,7 +1382,7 @@ mod tests {
fn test_delete_all_documents_empty_index() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index
let mut index_writer = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap();
let clear = index_writer.delete_all_documents();
@@ -1398,7 +1395,7 @@ mod tests {
fn test_delete_all_documents_index_twice() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index
let mut index_writer = index
.writer_with_num_threads(4, MEMORY_BUDGET_NUM_BYTES_MIN * 4)
.unwrap();
let clear = index_writer.delete_all_documents();
@@ -1418,7 +1415,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::builder().schema(schema).create_in_ram().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(text_field => "one"))
.unwrap();
@@ -1706,7 +1703,8 @@ mod tests {
let old_reader = index.reader()?;
let id_exists = |id| id % 3 != 0; // 0 does not exist
// Every 3rd doc has only id field
let id_is_full_doc = |id| id % 3 != 0;
let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3";
// rotate left
@@ -1722,7 +1720,7 @@ mod tests {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip = ip_from_id(id);
if !id_exists(id) {
if !id_is_full_doc(id) {
// every 3rd doc has no ip field
index_writer.add_document(doc!(
id_field=>id,
@@ -1780,7 +1778,7 @@ mod tests {
let num_segments_before_merge = searcher.segment_readers().len();
if force_end_merge {
index_writer.wait_merging_threads()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
@@ -1842,7 +1840,7 @@ mod tests {
let num_docs_with_values = expected_ids_and_num_occurrences
.iter()
.filter(|(id, _id_occurrences)| id_exists(**id))
.filter(|(id, _id_occurrences)| id_is_full_doc(**id))
.map(|(_, id_occurrences)| *id_occurrences as usize)
.sum::<usize>();
@@ -1866,7 +1864,7 @@ mod tests {
if force_end_merge && num_segments_before_merge > 1 && num_segments_after_merge == 1 {
let mut expected_multi_ips: Vec<_> = id_list
.iter()
.filter(|id| id_exists(**id))
.filter(|id| id_is_full_doc(**id))
.flat_map(|id| vec![ip_from_id(*id), ip_from_id(*id)])
.collect();
assert_eq!(num_ips, expected_multi_ips.len() as u32);
@@ -1904,7 +1902,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences
.keys()
.flat_map(|id| {
if !id_exists(*id) {
if !id_is_full_doc(*id) {
None
} else {
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1916,7 +1914,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences
.keys()
.filter_map(|id| {
if !id_exists(*id) {
if !id_is_full_doc(*id) {
None
} else {
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1951,7 +1949,7 @@ mod tests {
let id = id_reader.first(doc).unwrap();
let vals: Vec<u64> = ff_reader.values_for_doc(doc).collect();
if id_exists(id) {
if id_is_full_doc(id) {
assert_eq!(vals.len(), 2);
assert_eq!(vals[0], vals[1]);
assert!(expected_ids_and_num_occurrences.contains_key(&vals[0]));
@@ -1961,7 +1959,7 @@ mod tests {
}
let bool_vals: Vec<bool> = bool_ff_reader.values_for_doc(doc).collect();
if id_exists(id) {
if id_is_full_doc(id) {
assert_eq!(bool_vals.len(), 2);
assert_ne!(bool_vals[0], bool_vals[1]);
} else {
@@ -1976,23 +1974,23 @@ mod tests {
.get_store_reader(DOCSTORE_CACHE_CAPACITY)
.unwrap();
// test store iterator
for doc in store_reader.iter::<TantivyDocument>(segment_reader.alive_bitset()) {
for doc in store_reader.iter(segment_reader.alive_bitset()) {
let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id));
}
// test store random access
for doc_id in segment_reader.doc_ids_alive() {
let id = store_reader
.get::<TantivyDocument>(doc_id)
.get(doc_id)
.unwrap()
.get_first(id_field)
.unwrap()
.as_u64()
.unwrap();
assert!(expected_ids_and_num_occurrences.contains_key(&id));
if id_exists(id) {
if id_is_full_doc(id) {
let id2 = store_reader
.get::<TantivyDocument>(doc_id)
.get(doc_id)
.unwrap()
.get_first(multi_numbers)
.unwrap()
@@ -2000,13 +1998,13 @@ mod tests {
.unwrap();
assert_eq!(id, id2);
let bool = store_reader
.get::<TantivyDocument>(doc_id)
.get(doc_id)
.unwrap()
.get_first(bool_field)
.unwrap()
.as_bool()
.unwrap();
let doc = store_reader.get::<TantivyDocument>(doc_id).unwrap();
let doc = store_reader.get(doc_id).unwrap();
let mut bool2 = doc.get_all(multi_bools);
assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap());
assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap());
@@ -2037,7 +2035,7 @@ mod tests {
let (existing_id, count) = (*id, *count);
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
assert_eq!(get_num_hits(id_field), count);
if !id_exists(existing_id) {
if !id_is_full_doc(existing_id) {
continue;
}
assert_eq!(get_num_hits(text_field), count);
@@ -2087,7 +2085,7 @@ mod tests {
//
for (existing_id, count) in &expected_ids_and_num_occurrences {
let (existing_id, count) = (*existing_id, *count);
if !id_exists(existing_id) {
if !id_is_full_doc(existing_id) {
continue;
}
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
@@ -2104,34 +2102,84 @@ mod tests {
}
}
// assert data is like expected
// Range query
//
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !id_exists(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
// Take half as sample
let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect();
sample.sort_by_key(|(k, _num_occurences)| *k);
// sample.truncate(sample.len() / 2);
if !sample.is_empty() {
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
let expected_count = |sample: &[(&u64, &u64)]| {
sample
.iter()
.filter(|(id, _)| id_is_full_doc(**id))
.map(|(_id, num_occurences)| **num_occurences)
.sum::<u64>()
};
let ip = ip_from_id(existing_id);
fn gen_query_inclusive<T1: ToString, T2: ToString>(
field: &str,
from: T1,
to: T2,
) -> String {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
}
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
// Range query on single value field
let query = gen_query_inclusive("ip", ip, ip);
assert_eq!(do_search_ip_field(&query), count);
// Query first half
if !left_sample.is_empty() {
let expected_count = expected_count(left_sample);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip, ip);
let start_range = *left_sample[0].0;
let end_range = *left_sample.last().unwrap().0;
let query = gen_query_inclusive("id_opt", start_range, end_range);
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
assert_eq!(do_search_ip_field(&query), count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", "*", ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ips", "*", ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
}
// Query second half
if !right_sample.is_empty() {
let expected_count = expected_count(right_sample);
let start_range = *right_sample[0].0;
let end_range = *right_sample.last().unwrap().0;
// Range query on id opt field
let query =
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
// Range query on ip field
let ip1 = ip_from_id(start_range);
let ip2 = ip_from_id(end_range);
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let query = gen_query_inclusive("ip", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ip", ip1, "*");
assert_eq!(do_search_ip_field(&query), expected_count);
// Range query on multi value field
let query = gen_query_inclusive("ips", ip1, ip2);
assert_eq!(do_search_ip_field(&query), expected_count);
let query = gen_query_inclusive("ips", ip1, "*");
assert_eq!(do_search_ip_field(&query), expected_count);
}
}
// ip range query on fast field
//
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
let (existing_id, count) = (*existing_id, *count);
if !id_exists(existing_id) {
if !id_is_full_doc(existing_id) {
continue;
}
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
@@ -2159,7 +2207,7 @@ mod tests {
.first_or_default_col(9999);
for doc_id in segment_reader.doc_ids_alive() {
let id = ff_reader.get_val(doc_id);
if !id_exists(id) {
if !id_is_full_doc(id) {
continue;
}
let facet_ords: Vec<u64> = facet_reader.facet_ords(doc_id).collect();
@@ -2197,6 +2245,12 @@ mod tests {
Ok(index)
}
#[test]
fn test_fast_field_range() {
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
assert!(test_operation_strategy(&ops, false, true).is_ok());
}
#[test]
fn test_sort_index_on_opt_field_regression() {
assert!(test_operation_strategy(
@@ -2546,7 +2600,7 @@ mod tests {
// Merge
{
assert!(index_writer.wait_merging_threads().is_ok());
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
@@ -2588,7 +2642,7 @@ mod tests {
// Merge
{
assert!(index_writer.wait_merging_threads().is_ok());
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");

View File

@@ -2,15 +2,13 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use super::AddBatchReceiver;
use crate::schema::document::Document;
use crate::TantivyDocument;
#[derive(Clone)]
pub(crate) struct IndexWriterStatus<D: Document = TantivyDocument> {
inner: Arc<Inner<D>>,
pub(crate) struct IndexWriterStatus {
inner: Arc<Inner>,
}
impl<D: Document> IndexWriterStatus<D> {
impl IndexWriterStatus {
/// Returns true iff the index writer is alive.
pub fn is_alive(&self) -> bool {
self.inner.as_ref().is_alive()
@@ -18,7 +16,7 @@ impl<D: Document> IndexWriterStatus<D> {
/// Returns a copy of the operation receiver.
/// If the index writer was killed, returns `None`.
pub fn operation_receiver(&self) -> Option<AddBatchReceiver<D>> {
pub fn operation_receiver(&self) -> Option<AddBatchReceiver> {
let rlock = self
.inner
.receive_channel
@@ -29,19 +27,19 @@ impl<D: Document> IndexWriterStatus<D> {
/// Create an index writer bomb.
/// If dropped, the index writer status will be killed.
pub(crate) fn create_bomb(&self) -> IndexWriterBomb<D> {
pub(crate) fn create_bomb(&self) -> IndexWriterBomb {
IndexWriterBomb {
inner: Some(self.inner.clone()),
}
}
}
struct Inner<D: Document> {
struct Inner {
is_alive: AtomicBool,
receive_channel: RwLock<Option<AddBatchReceiver<D>>>,
receive_channel: RwLock<Option<AddBatchReceiver>>,
}
impl<D: Document> Inner<D> {
impl Inner {
fn is_alive(&self) -> bool {
self.is_alive.load(Ordering::Relaxed)
}
@@ -55,8 +53,8 @@ impl<D: Document> Inner<D> {
}
}
impl<D: Document> From<AddBatchReceiver<D>> for IndexWriterStatus<D> {
fn from(receiver: AddBatchReceiver<D>) -> Self {
impl From<AddBatchReceiver> for IndexWriterStatus {
fn from(receiver: AddBatchReceiver) -> Self {
IndexWriterStatus {
inner: Arc::new(Inner {
is_alive: AtomicBool::new(true),
@@ -68,11 +66,11 @@ impl<D: Document> From<AddBatchReceiver<D>> for IndexWriterStatus<D> {
/// If dropped, the index writer will be killed.
/// To prevent this, clients can call `.defuse()`.
pub(crate) struct IndexWriterBomb<D: Document> {
inner: Option<Arc<Inner<D>>>,
pub(crate) struct IndexWriterBomb {
inner: Option<Arc<Inner>>,
}
impl<D: Document> IndexWriterBomb<D> {
impl IndexWriterBomb {
/// Defuses the bomb.
///
/// This is the only way to drop the bomb without killing
@@ -82,7 +80,7 @@ impl<D: Document> IndexWriterBomb<D> {
}
}
impl<D: Document> Drop for IndexWriterBomb<D> {
impl Drop for IndexWriterBomb {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
inner.kill();

View File

@@ -753,10 +753,9 @@ mod tests {
use crate::collector::{Count, FacetCollector};
use crate::core::Index;
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::document::Value;
use crate::schema::{
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
TextFieldIndexing, INDEXED, TEXT,
Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term, TextFieldIndexing,
INDEXED, TEXT,
};
use crate::time::OffsetDateTime;
use crate::{
@@ -818,7 +817,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -867,24 +866,30 @@ mod tests {
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
let doc = searcher.doc(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c"));
let doc = searcher.doc(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("a b c"));
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c d"));
let doc = searcher.doc(DocAddress::new(0, 2))?;
assert_eq!(
doc.get_first(text_field).unwrap().as_text(),
Some("a b c d")
);
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 3))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b"));
let doc = searcher.doc(DocAddress::new(0, 3))?;
assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
}
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c g"));
let doc = searcher.doc(DocAddress::new(0, 4))?;
assert_eq!(
doc.get_first(text_field).unwrap().as_text(),
Some("a b c g")
);
}
{
@@ -1295,10 +1300,10 @@ mod tests {
let reader = index.reader().unwrap();
let mut int_val = 0;
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let index_doc =
|index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| {
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
for facet in doc_facets {
doc.add_facet(facet_field, Facet::from(facet));
}
@@ -1379,7 +1384,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.merge(&segment_ids)
.wait()
@@ -1401,7 +1406,7 @@ mod tests {
// Deleting one term
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term);
@@ -1426,7 +1431,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(int_field => 1u64))?;
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64))?;
@@ -1455,7 +1460,7 @@ mod tests {
let reader = index.reader()?;
{
let mut index_writer = index.writer_for_tests()?;
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone())?;
index_writer.commit()?;
@@ -1498,7 +1503,7 @@ mod tests {
{
let mut index_writer = index.writer_for_tests()?;
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
for &val in int_vals {
doc.add_u64(int_field, val);
}
@@ -1561,7 +1566,7 @@ mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -1608,7 +1613,7 @@ mod tests {
writer.set_merge_policy(Box::new(policy));
for i in 0..100 {
let mut doc = TantivyDocument::new();
let mut doc = Document::new();
doc.add_f64(field, 42.0);
doc.add_f64(multi_field, 0.24);
doc.add_f64(multi_field, 0.27);
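
The hunks above switch these tests to the plain `searcher.doc(...)` call returning a `Document`, with `as_text()` for string values. A minimal sketch of that retrieval flow outside the test harness might look as follows; the field name, document content, and the 15MB writer budget are illustrative assumptions, not taken from the diff.

use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, DocAddress, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    // Store the field so its value can be read back from the doc store.
    let text_field = schema_builder.add_text_field("text", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(15_000_000)?;
    index_writer.add_document(doc!(text_field => "af b"))?;
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    // With a single segment and a single document, it lives at (segment 0, doc 0).
    let doc = searcher.doc(DocAddress::new(0, 0))?;
    assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b"));
    Ok(())
}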

View File

@@ -4,15 +4,11 @@ mod tests {
use crate::core::Index;
use crate::fastfield::AliveBitSet;
use crate::query::QueryParser;
use crate::schema::document::Value;
use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
};
use crate::{
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, Postings,
TantivyDocument, Term,
};
use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term};
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
let mut schema_builder = schema::Schema::builder();
@@ -30,7 +26,7 @@ mod tests {
let index = index_builder.create_in_ram().unwrap();
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")))
.unwrap();
@@ -49,7 +45,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
assert!(index_writer.merge(&segment_ids).wait().is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
@@ -137,7 +133,7 @@ mod tests {
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -276,16 +272,12 @@ mod tests {
} else {
2
};
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
.unwrap();
let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap();
assert_eq!(
doc.get_first(my_text_field).unwrap().as_str(),
doc.get_first(my_text_field).unwrap().as_text(),
Some("blubber")
);
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))
.unwrap();
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000));
}
}
@@ -502,7 +494,7 @@ mod bench_sorted_index_merge {
let index = index_builder.create_in_ram().unwrap();
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, val: u64| {
index_writer.add_document(doc!(int_field=>val)).unwrap();
};

View File

@@ -44,9 +44,9 @@ pub type DefaultMergePolicy = LogMergePolicy;
// - all docs in the operation will happen on the same segment and continuous doc_ids.
// - all operations in the group are committed at the same time, making the group
// atomic.
type AddBatch<D> = SmallVec<[AddOperation<D>; 4]>;
type AddBatchSender<D> = channel::Sender<AddBatch<D>>;
type AddBatchReceiver<D> = channel::Receiver<AddBatch<D>>;
type AddBatch = SmallVec<[AddOperation; 4]>;
type AddBatchSender = channel::Sender<AddBatch>;
type AddBatchReceiver = channel::Receiver<AddBatch>;
#[cfg(feature = "mmap")]
#[cfg(test)]
@@ -54,15 +54,15 @@ mod tests_mmap {
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{JsonObjectOptions, Schema, Type, TEXT};
use crate::{Index, IndexWriter, Term};
use crate::schema::{JsonObjectOptions, Schema, TEXT};
use crate::{Index, Term};
#[test]
fn test_advance_delete_bug() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_from_tempdir(schema_builder.build())?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
// there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.delete_term(Term::from_field_text(text_field, "b"));
@@ -79,7 +79,7 @@ mod tests_mmap {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
@@ -110,7 +110,7 @@ mod tests_mmap {
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
@@ -133,40 +133,4 @@ mod tests_mmap {
assert_eq!(num_docs, 1);
}
}
#[test]
fn test_json_field_list_fields() {
let mut schema_builder = Schema::builder();
let json_options: JsonObjectOptions =
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "sub": {"a": 1, "b": 2}});
index_writer.add_document(doc!(json_field=>json)).unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": 1, "b": 2}});
index_writer.add_document(doc!(json_field=>json)).unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": "mixed", "b": 2}});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 3);
let reader = &searcher.segment_readers()[0];
let inverted_index = reader.inverted_index(json_field).unwrap();
assert_eq!(
inverted_index.list_fields().unwrap(),
[
("k8s\u{1}container\u{1}name".to_string(), Type::Str),
("sub\u{1}a".to_string(), Type::I64),
("sub\u{1}b".to_string(), Type::I64),
("suber\u{1}a".to_string(), Type::I64),
("suber\u{1}a".to_string(), Type::Str),
("suber\u{1}b".to_string(), Type::I64),
("val".to_string(), Type::Str),
]
);
}
}
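
The `tests_mmap` changes above exercise JSON fields with `set_expand_dots_enabled()`. A rough sketch of that usage; the JSON payload and the dotted query string are assumptions chosen to mirror the tests, and `serde_json` is assumed to be available as a dependency.

use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::schema::{JsonObjectOptions, Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    // With expand_dots enabled, "k8s.container.name" is indexed as a nested path.
    let json_options: JsonObjectOptions = JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
    let json_field = schema_builder.add_json_field("json", json_options);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(15_000_000)?;
    let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
    index_writer.add_document(doc!(json_field => json))?;
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query_parser = QueryParser::for_index(&index, vec![json_field]);
    let query = query_parser.parse_query("json.k8s.container.name:prometheus")?;
    assert_eq!(searcher.search(&query, &Count)?, 1);
    Ok(())
}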

View File

@@ -1,6 +1,5 @@
use crate::query::Weight;
use crate::schema::document::Document;
use crate::schema::{TantivyDocument, Term};
use crate::schema::{Document, Term};
use crate::Opstamp;
/// Timestamped Delete operation.
@@ -11,16 +10,16 @@ pub struct DeleteOperation {
/// Timestamped Add operation.
#[derive(Eq, PartialEq, Debug)]
pub struct AddOperation<D: Document = TantivyDocument> {
pub struct AddOperation {
pub opstamp: Opstamp,
pub document: D,
pub document: Document,
}
/// UserOperation is an enum type that encapsulates other operation types.
#[derive(Eq, PartialEq, Debug)]
pub enum UserOperation<D: Document = TantivyDocument> {
pub enum UserOperation {
/// Add operation
Add(D),
Add(Document),
/// Delete operation
Delete(Term),
}
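
These (no longer generic) operation types are what `IndexWriter::run` consumes: every operation in one batch lands on the same segment with contiguous opstamps and becomes visible together at the next commit. A hedged sketch, assuming `UserOperation` is re-exported under `tantivy::indexer` and using made-up field names:

use tantivy::indexer::UserOperation;
use tantivy::schema::{Schema, STRING};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let id_field = schema_builder.add_text_field("id", STRING);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer(15_000_000)?;

    // One atomic group: the delete and the add are applied with contiguous
    // opstamps and are committed together.
    let operations = vec![
        UserOperation::Delete(Term::from_field_text(id_field, "doc-0")),
        UserOperation::Add(doc!(id_field => "doc-1")),
    ];
    index_writer.run(operations)?;
    index_writer.commit()?;
    Ok(())
}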

View File

@@ -1,17 +1,16 @@
use super::IndexWriter;
use crate::schema::document::Document;
use crate::{FutureResult, Opstamp, TantivyDocument};
use crate::{FutureResult, Opstamp};
/// A prepared commit
pub struct PreparedCommit<'a, D: Document = TantivyDocument> {
index_writer: &'a mut IndexWriter<D>,
pub struct PreparedCommit<'a> {
index_writer: &'a mut IndexWriter,
payload: Option<String>,
opstamp: Opstamp,
}
impl<'a, D: Document> PreparedCommit<'a, D> {
pub(crate) fn new(index_writer: &'a mut IndexWriter<D>, opstamp: Opstamp) -> Self {
Self {
impl<'a> PreparedCommit<'a> {
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit<'_> {
PreparedCommit {
index_writer,
payload: None,
opstamp,
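
For reference, the two-phase flow these signatures back: `prepare_commit()` hands out a `PreparedCommit`, a payload can be attached, and the commit is then finalized. A sketch under the assumption that the payload string is arbitrary user metadata:

use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer(15_000_000)?;
    index_writer.add_document(doc!(text => "hello"))?;

    // Prepare first, optionally attach a payload, then finalize the commit.
    let mut prepared = index_writer.prepare_commit()?;
    prepared.set_payload("checkpoint-1");
    prepared.commit()?;
    Ok(())
}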

View File

@@ -596,15 +596,10 @@ impl SegmentUpdater {
);
{
if let Some(after_merge_segment_entry) = after_merge_segment_entry.as_mut() {
// Deletes and commits could have happened as we were merging.
// We need to make sure we are up to date with deletes before accepting the
// segment.
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.load_meta().opstamp;
if delete_operation.opstamp < committed_opstamp {
// We are not up to date! Let's create a new tombstone file for our
// freshly created split.
let index = &segment_updater.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
if let Err(advance_deletes_err) = advance_deletes(

View File

@@ -13,11 +13,10 @@ use crate::postings::{
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::document::{Document, ReferenceValue, Value};
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DocId, Opstamp, SegmentComponent, TantivyError};
use crate::{DocId, Document, Opstamp, SegmentComponent, TantivyError};
/// Computes the initial size of the hash table.
///
@@ -82,7 +81,10 @@ impl SegmentWriter {
/// the flushing behavior as a memory limit.
/// - segment: The segment being written
/// - schema
pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> {
pub fn for_segment(
memory_budget_in_bytes: usize,
segment: Segment,
) -> crate::Result<SegmentWriter> {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone();
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
@@ -111,7 +113,7 @@ impl SegmentWriter {
})
})
.collect::<Result<Vec<_>, _>>()?;
Ok(Self {
Ok(SegmentWriter {
max_doc: 0,
ctx: IndexingContext::new(table_size),
per_field_postings_writers,
@@ -162,21 +164,18 @@ impl SegmentWriter {
+ self.segment_serializer.mem_usage()
}
fn index_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
fn index_document(&mut self, doc: &Document) -> crate::Result<()> {
let doc_id = self.max_doc;
// TODO: Can this be optimised a bit?
let vals_grouped_by_field = doc
.iter_fields_and_values()
.sorted_by_key(|(field, _)| *field)
.group_by(|(field, _)| *field);
.field_values()
.iter()
.sorted_by_key(|el| el.field())
.group_by(|el| el.field());
for (field, field_values) in &vals_grouped_by_field {
let values = field_values.map(|el| el.1);
let values = field_values.map(|field_value| field_value.value());
let field_entry = self.schema.get_field_entry(field);
let make_schema_error = || {
TantivyError::SchemaError(format!(
crate::TantivyError::SchemaError(format!(
"Expected a {:?} for field {:?}",
field_entry.field_type().value_type(),
field_entry.name()
@@ -194,10 +193,7 @@ impl SegmentWriter {
match field_entry.field_type() {
FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let facet = value.as_facet().ok_or_else(make_schema_error)?;
let facet_str = facet.encoded_str();
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -213,18 +209,19 @@ impl SegmentWriter {
}
FieldType::Str(_) => {
let mut indexing_position = IndexingPosition::default();
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
let mut token_stream = if let Some(text) = value.as_str() {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
} else {
continue;
for value in values {
let mut token_stream = match value {
Value::PreTokStr(tok_str) => {
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
}
Value::Str(ref text) => {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
}
_ => {
continue;
}
};
assert!(term_buffer.is_empty());
@@ -243,10 +240,7 @@ impl SegmentWriter {
}
FieldType::U64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
num_vals += 1;
let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
term_buffer.set_u64(u64_val);
@@ -258,13 +252,9 @@ impl SegmentWriter {
}
FieldType::Date(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
for value in values {
num_vals += 1;
let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
let date_val = value.as_date().ok_or_else(make_schema_error)?;
term_buffer
.set_u64(date_val.truncate(DATE_TIME_PRECISION_INDEXED).to_u64());
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
@@ -275,10 +265,7 @@ impl SegmentWriter {
}
FieldType::I64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
num_vals += 1;
let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
term_buffer.set_i64(i64_val);
@@ -290,10 +277,7 @@ impl SegmentWriter {
}
FieldType::F64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
num_vals += 1;
let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
@@ -305,10 +289,7 @@ impl SegmentWriter {
}
FieldType::Bool(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
num_vals += 1;
let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
term_buffer.set_bool(bool_val);
@@ -320,10 +301,7 @@ impl SegmentWriter {
}
FieldType::Bytes(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
num_vals += 1;
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
@@ -336,17 +314,9 @@ impl SegmentWriter {
FieldType::JsonObject(json_options) => {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
let json_values_it = values.map(|value_access| {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
match value {
ReferenceValue::Object(object_iter) => Ok(object_iter),
_ => Err(make_schema_error()),
}
});
index_json_values::<D::Value<'_>>(
let json_values_it =
values.map(|value| value.as_json().ok_or_else(make_schema_error));
index_json_values(
doc_id,
json_values_it,
text_analyzer,
@@ -358,10 +328,7 @@ impl SegmentWriter {
}
FieldType::IpAddr(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
num_vals += 1;
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
term_buffer.set_ip_addr(ip_addr);
@@ -379,10 +346,7 @@ impl SegmentWriter {
/// Indexes a new document
///
/// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document<D: Document>(
&mut self,
add_operation: AddOperation<D>,
) -> crate::Result<()> {
pub fn add_document(&mut self, add_operation: AddOperation) -> crate::Result<()> {
let AddOperation { document, opstamp } = add_operation;
self.doc_opstamps.push(opstamp);
self.fast_field_writers.add_document(&document)?;
@@ -481,7 +445,6 @@ fn remap_and_write(
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use tempfile::TempDir;
@@ -492,18 +455,15 @@ mod tests {
use crate::directory::RamDirectory;
use crate::postings::TermInfo;
use crate::query::PhraseQuery;
use crate::schema::document::Value;
use crate::schema::{
Document, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING,
TEXT,
IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING, TEXT,
};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, Postings, TantivyDocument,
Term, TERMINATED,
DateTime, Directory, DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED,
};
#[test]
@@ -520,7 +480,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("title", TEXT | STORED);
let schema = schema_builder.build();
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
let pre_tokenized_text = PreTokenizedString {
text: String::from("A"),
tokens: vec![Token {
@@ -544,11 +504,11 @@ mod tests {
store_writer.close().unwrap();
let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
let doc = reader.get::<TantivyDocument>(0).unwrap();
let doc = reader.get(0).unwrap();
assert_eq!(doc.field_values().len(), 2);
assert_eq!(doc.field_values()[0].value().as_str(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_str(), Some("title"));
assert_eq!(doc.field_values()[0].value().as_text(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_text(), Some("title"));
}
#[test]
@@ -579,13 +539,13 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let doc = searcher
.doc::<TantivyDocument>(DocAddress {
.doc(DocAddress {
segment_ord: 0u32,
doc_id: 0u32,
})
.unwrap();
let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
&doc.to_json(&schema),
&schema.to_json(&doc),
)
.unwrap()
.get("json")
@@ -715,10 +675,10 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build();
let mut doc = TantivyDocument::default();
let json_val: BTreeMap<String, crate::schema::OwnedValue> =
let mut doc = Document::default();
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(r#"{"mykey": "repeated token token"}"#).unwrap();
doc.add_object(json_field, json_val);
doc.add_json_object(json_field, json_val);
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc).unwrap();
@@ -842,10 +802,11 @@ mod tests {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let doc = TantivyDocument::parse_json(&schema, r#"{"text": [ "bbb", "aaa", "", "aaa"]}"#)
let doc = schema
.parse_document(r#"{"text": [ "bbb", "aaa", "", "aaa"]}"#)
.unwrap();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
// On debug this did panic on the underflow
index_writer.commit().unwrap();
@@ -870,7 +831,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "roller-coaster".to_string(),
@@ -885,7 +846,7 @@ mod tests {
doc.add_pre_tokenized_text(text, tokens.clone());
doc.add_pre_tokenized_text(text, tokens);
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
@@ -908,7 +869,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
// This is a bit of a contrived example.
let tokens = PreTokenizedString {
text: "contrived-example".to_string(), //< I can't think of a use case where this corner case happens in real life.
@@ -933,7 +894,7 @@ mod tests {
doc.add_pre_tokenized_text(text, tokens);
doc.add_text(text, "hello");
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
@@ -969,7 +930,7 @@ mod tests {
let schema = index.schema();
let mut index_writer = index.writer(50_000_000).unwrap();
let title = schema.get_field("title").unwrap();
let mut document = TantivyDocument::default();
let mut document = Document::default();
document.add_text(title, "The Old Man and the Sea");
index_writer.add_document(document).unwrap();
let error = index_writer.commit().unwrap_err();
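
Several of the test changes above go through `schema.parse_document(...)` and `schema.to_json(&doc)`. A small round-trip sketch of that pair; the field name and JSON payload are invented for illustration.

use tantivy::schema::{Schema, STORED, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    // Parse a JSON document against the schema...
    let doc = schema
        .parse_document(r#"{"title": ["bbb", "aaa"]}"#)
        .unwrap();
    assert_eq!(doc.get_all(title).count(), 2);

    // ...and serialize it back to JSON using the same schema.
    let json = schema.to_json(&doc);
    assert!(json.contains("bbb"));
}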

View File

@@ -21,7 +21,7 @@
//! # use tantivy::collector::TopDocs;
//! # use tantivy::query::QueryParser;
//! # use tantivy::schema::*;
//! # use tantivy::{doc, DocAddress, Index, IndexWriter, Score};
//! # use tantivy::{doc, DocAddress, Index, Score};
//! #
//! # fn main() {
//! # // Let's create a temporary directory for the
@@ -53,7 +53,7 @@
//!
//! // Here we use a buffer of 100MB that will be split
//! // between indexing threads.
//! let mut index_writer: IndexWriter = index.writer(100_000_000)?;
//! let mut index_writer = index.writer(100_000_000)?;
//!
//! // Let's index one documents!
//! index_writer.add_document(doc!(
@@ -89,8 +89,8 @@
//!
//! for (_score, doc_address) in top_docs {
//! // Retrieve the actual content of documents given its `doc_address`.
//! let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
//! println!("{}", retrieved_doc.to_json(&schema));
//! let retrieved_doc = searcher.doc(doc_address)?;
//! println!("{}", schema.to_json(&retrieved_doc));
//! }
//!
//! # Ok(())
@@ -186,7 +186,7 @@ pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, Pr
pub use crate::postings::Postings;
#[allow(deprecated)]
pub use crate::schema::DatePrecision;
pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocument, Term};
pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5;
@@ -342,9 +342,8 @@ pub mod tests {
use crate::docset::{DocSet, TERMINATED};
use crate::merge_policy::NoMergePolicy;
use crate::query::BooleanQuery;
use crate::schema::document::Value;
use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, Postings, ReloadPolicy};
use crate::{DateTime, DocAddress, Index, Postings, ReloadPolicy};
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
@@ -415,7 +414,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?;
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc)?;
@@ -437,7 +436,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"))?;
@@ -464,7 +463,7 @@ pub mod tests {
let title_field = schema_builder.add_text_field("title", TEXT);
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
let index_reader = index.reader()?;
@@ -486,7 +485,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(text_field=>"a b"))?;
@@ -529,7 +528,7 @@ pub mod tests {
.unwrap();
{
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"))?;
// 1
@@ -576,7 +575,7 @@ pub mod tests {
}
{
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"))?;
// 1
@@ -613,7 +612,7 @@ pub mod tests {
}
{
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?;
@@ -663,7 +662,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64))?;
index_writer.commit()?;
let reader = index.reader()?;
@@ -686,7 +685,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val))?;
index_writer.commit()?;
@@ -710,7 +709,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val))?;
index_writer.commit()?;
@@ -734,7 +733,7 @@ pub mod tests {
let absent_field = schema_builder.add_text_field("absent_text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"))?;
assert!(index_writer.commit().is_ok());
let reader = index.reader()?;
@@ -757,7 +756,7 @@ pub mod tests {
.try_into()?;
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"63"))?;
index_writer.add_document(doc!(text_field=>"70"))?;
index_writer.add_document(doc!(text_field=>"34"))?;
@@ -782,7 +781,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af bc bc"))?;
index_writer.commit()?;
}
@@ -814,7 +813,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af b"))?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!(text_field=>"a b c d"))?;
@@ -878,7 +877,7 @@ pub mod tests {
.try_into()?;
assert_eq!(reader.searcher().num_docs(), 0u64);
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af b"))?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!(text_field=>"a b c d"))?;
@@ -986,13 +985,13 @@ pub mod tests {
text_field => "some other value",
other_text_field => "short");
assert_eq!(document.len(), 3);
let values: Vec<&OwnedValue> = document.get_all(text_field).collect();
let values: Vec<&Value> = document.get_all(text_field).collect();
assert_eq!(values.len(), 2);
assert_eq!(values[0].as_str(), Some("tantivy"));
assert_eq!(values[1].as_str(), Some("some other value"));
let values: Vec<&OwnedValue> = document.get_all(other_text_field).collect();
assert_eq!(values[0].as_text(), Some("tantivy"));
assert_eq!(values[1].as_text(), Some("some other value"));
let values: Vec<&Value> = document.get_all(other_text_field).collect();
assert_eq!(values.len(), 1);
assert_eq!(values[0].as_str(), Some("short"));
assert_eq!(values[0].as_text(), Some("short"));
}
#[test]
@@ -1006,7 +1005,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
{
let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
@@ -1072,7 +1071,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let index_reader = index.reader()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT {
@@ -1125,7 +1124,7 @@ pub mod tests {
let body = builder.add_text_field("body", TEXT | STORED);
let schema = builder.build();
let index = Index::create_in_dir(&index_path, schema)?;
let mut writer: IndexWriter = index.writer(50_000_000)?;
let mut writer = index.writer(50_000_000)?;
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..5000 {
writer.add_document(doc!(body => "foo"))?;

View File

@@ -45,12 +45,12 @@
macro_rules! doc(
() => {
{
($crate::TantivyDocument::default())
($crate::Document::default())
}
}; // avoids a warning due to the useless `mut`.
($($field:expr => $value:expr),*) => {
{
let mut document = $crate::TantivyDocument::default();
let mut document = $crate::Document::default();
$(
document.add_field_value($field, $value);
)*
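
In other words, the macro is shorthand for building a `Document::default()` and calling `add_field_value` once per `field => value` pair. A quick sketch of the two equivalent forms; the field name and value are illustrative.

use tantivy::doc;
use tantivy::schema::{Document, Schema, STORED, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let _schema = schema_builder.build();

    // Macro form...
    let doc_a = doc!(title => "The Old Man and the Sea");

    // ...and its expansion, as defined above.
    let mut doc_b = Document::default();
    doc_b.add_field_value(title, "The Old Man and the Sea");

    assert_eq!(doc_a, doc_b);
}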

View File

@@ -52,7 +52,7 @@ pub mod tests {
Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions, INDEXED, TEXT,
};
use crate::tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
use crate::{DocId, HasLen, IndexWriter, Score};
use crate::{DocId, HasLen, Score};
#[test]
pub fn test_position_write() -> crate::Result<()> {
@@ -432,7 +432,7 @@ pub mod tests {
// delete some of the documents
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok());
}
@@ -483,7 +483,7 @@ pub mod tests {
// delete everything else
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok());
}
@@ -568,8 +568,8 @@ mod bench {
use crate::docset::TERMINATED;
use crate::query::Intersection;
use crate::schema::{Field, IndexRecordOption, Schema, TantivyDocument, Term, STRING};
use crate::{tests, DocSet, Index, IndexWriter};
use crate::schema::{Document, Field, IndexRecordOption, Schema, Term, STRING};
use crate::{tests, DocSet, Index};
pub static TERM_A: Lazy<Term> = Lazy::new(|| {
let field = Field::from_field_id(0);
@@ -598,9 +598,9 @@ mod bench {
let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000;
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for _ in 0..posting_list_size {
let mut doc = TantivyDocument::default();
let mut doc = Document::default();
if rng.gen_bool(1f64 / 15f64) {
doc.add_text(text_field, "a");
}

View File

@@ -99,14 +99,14 @@ mod tests {
use crate::docset::{DocSet, BUFFER_LEN, TERMINATED};
use crate::query::{AllScorer, EnableScoring, Query};
use crate::schema::{Schema, TEXT};
use crate::{Index, IndexWriter};
use crate::Index;
fn create_test_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>"aaa"))?;
index_writer.add_document(doc!(field=>"bbb"))?;
index_writer.commit()?;

View File

@@ -4,7 +4,6 @@ use std::sync::Arc;
use common::BitSet;
use tantivy_fst::Automaton;
use super::phrase_prefix_query::prefix_end;
use crate::core::SegmentReader;
use crate::query::{BitSetDocSet, ConstScorer, Explanation, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption};
@@ -15,10 +14,6 @@ use crate::{DocId, Score, TantivyError};
pub struct AutomatonWeight<A> {
field: Field,
automaton: Arc<A>,
// For JSON fields, the term dictionary includes terms from all paths.
// We apply additional filtering based on the given JSON path, when searching within the term
// dictionary. This prevents terms from unrelated paths from matching the search criteria.
json_path_bytes: Option<Box<[u8]>>,
}
impl<A> AutomatonWeight<A>
@@ -31,20 +26,6 @@ where
AutomatonWeight {
field,
automaton: automaton.into(),
json_path_bytes: None,
}
}
/// Creates a new AutomatonWeight for a JSON path
pub fn new_for_json_path<IntoArcA: Into<Arc<A>>>(
field: Field,
automaton: IntoArcA,
json_path_bytes: &[u8],
) -> AutomatonWeight<A> {
AutomatonWeight {
field,
automaton: automaton.into(),
json_path_bytes: Some(json_path_bytes.to_vec().into_boxed_slice()),
}
}
@@ -53,15 +34,7 @@ where
term_dict: &'a TermDictionary,
) -> io::Result<TermStreamer<'a, &'a A>> {
let automaton: &A = &self.automaton;
let mut term_stream_builder = term_dict.search(automaton);
if let Some(json_path_bytes) = &self.json_path_bytes {
term_stream_builder = term_stream_builder.ge(json_path_bytes);
if let Some(end) = prefix_end(json_path_bytes) {
term_stream_builder = term_stream_builder.lt(&end);
}
}
let term_stream_builder = term_dict.search(automaton);
term_stream_builder.into_stream()
}
}
@@ -117,13 +90,13 @@ mod tests {
use crate::docset::TERMINATED;
use crate::query::Weight;
use crate::schema::{Schema, STRING};
use crate::{Index, IndexWriter};
use crate::Index;
fn create_index() -> crate::Result<Index> {
let mut schema = Schema::builder();
let title = schema.add_text_field("title", STRING);
let index = Index::create_in_ram(schema.build());
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(title=>"abc"))?;
index_writer.add_document(doc!(title=>"bcd"))?;
index_writer.add_document(doc!(title=>"abcd"))?;

View File

@@ -24,7 +24,6 @@ use crate::schema::{IndexRecordOption, Term};
/// use tantivy::schema::{IndexRecordOption, Schema, TEXT};
/// use tantivy::Term;
/// use tantivy::Index;
/// use tantivy::IndexWriter;
///
/// fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
@@ -33,7 +32,7 @@ use crate::schema::{IndexRecordOption, Term};
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// let mut index_writer = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;

View File

@@ -19,7 +19,7 @@ mod tests {
TermQuery,
};
use crate::schema::*;
use crate::{assert_nearly_equals, DocAddress, DocId, Index, IndexWriter, Score};
use crate::{assert_nearly_equals, DocAddress, DocId, Index, Score};
fn aux_test_helper() -> crate::Result<(Index, Field)> {
let mut schema_builder = Schema::builder();
@@ -28,7 +28,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field => "a b c"))?;
index_writer.add_document(doc!(text_field => "a c"))?;
index_writer.add_document(doc!(text_field => "b c"))?;
@@ -224,7 +224,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field => "a b c"))?;
index_writer.add_document(doc!(text_field => "a c"))?;
index_writer.add_document(doc!(text_field => "b c"))?;
@@ -297,7 +297,7 @@ mod tests {
let text = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text=>"a"))?;
index_writer.add_document(doc!(text=>"b"))?;
index_writer.commit()?;

View File

@@ -136,14 +136,14 @@ mod tests {
use super::BoostQuery;
use crate::query::{AllQuery, Query};
use crate::schema::Schema;
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
use crate::{DocAddress, Document, Index};
#[test]
fn test_boost_query_explain() -> crate::Result<()> {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(TantivyDocument::new())?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::new())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
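
For context on what the explain test above covers: `BoostQuery` wraps an inner query and multiplies its scores by a constant. A hedged sketch combining it with `AllQuery`, whose scorer emits 1.0 for every document; the schema and memory budget are illustrative.

use tantivy::collector::TopDocs;
use tantivy::query::{AllQuery, BoostQuery};
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer(15_000_000)?;
    index_writer.add_document(doc!(text => "hello"))?;
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    // AllQuery scores every document 1.0; the boost multiplies that by 2.0.
    let query = BoostQuery::new(Box::new(AllQuery), 2.0);
    let top_docs = searcher.search(&query, &TopDocs::with_limit(1))?;
    assert_eq!(top_docs[0].0, 2.0);
    Ok(())
}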

View File

@@ -143,14 +143,14 @@ mod tests {
use super::ConstScoreQuery;
use crate::query::{AllQuery, Query};
use crate::schema::Schema;
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
use crate::{DocAddress, Document, Index};
#[test]
fn test_const_score_query_explain() -> crate::Result<()> {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(TantivyDocument::new())?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::new())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();

View File

@@ -15,7 +15,6 @@ use crate::{Score, Term};
/// use tantivy::schema::{IndexRecordOption, Schema, TEXT};
/// use tantivy::Term;
/// use tantivy::Index;
/// use tantivy::IndexWriter;
///
/// fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
@@ -24,7 +23,7 @@ use crate::{Score, Term};
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// let mut index_writer = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of Girl",
/// ))?;

View File

@@ -1,345 +0,0 @@
use core::fmt::Debug;
use columnar::{ColumnIndex, DynamicColumn};
use super::{ConstScorer, EmptyScorer};
use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED};
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, Score, TantivyError};
/// Query that matches all documents with a non-null value in the specified field.
///
/// All of the matched documents get the score 1.0.
#[derive(Clone, Debug)]
pub struct ExistsQuery {
field_name: String,
}
impl ExistsQuery {
/// Creates a new `ExistsQuery` from the given field.
///
/// This query matches all documents with at least one non-null value in the specified field.
/// This constructor never fails, but executing the search with this query will return an
/// error if the specified field doesn't exist or is not a fast field.
pub fn new_exists_query(field: String) -> ExistsQuery {
ExistsQuery { field_name: field }
}
}
impl Query for ExistsQuery {
fn weight(&self, enable_scoring: EnableScoring) -> crate::Result<Box<dyn Weight>> {
let schema = enable_scoring.schema();
let Some((field, _path)) = schema.find_field(&self.field_name) else {
return Err(TantivyError::FieldNotFound(self.field_name.clone()));
};
let field_type = schema.get_field_entry(field).field_type();
if !field_type.is_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {} is not a fast field.",
self.field_name
)));
}
Ok(Box::new(ExistsWeight {
field_name: self.field_name.clone(),
}))
}
}
/// Weight associated with the `ExistsQuery` query.
pub struct ExistsWeight {
field_name: String,
}
impl Weight for ExistsWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let fast_field_reader = reader.fast_fields();
let dynamic_columns: crate::Result<Vec<DynamicColumn>> = fast_field_reader
.dynamic_column_handles(&self.field_name)?
.into_iter()
.map(|handle| handle.open().map_err(|io_error| io_error.into()))
.collect();
let mut non_empty_columns = Vec::new();
for column in dynamic_columns? {
if !matches!(column.column_index(), ColumnIndex::Empty { .. }) {
non_empty_columns.push(column)
}
}
// TODO: we can optimize more here since in most cases we will have only one index
if !non_empty_columns.is_empty() {
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
Ok(Box::new(ConstScorer::new(docset, boost)))
} else {
Ok(Box::new(EmptyScorer))
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
Ok(Explanation::new("ExistsQuery", 1.0))
}
}
pub(crate) struct ExistsDocSet {
columns: Vec<DynamicColumn>,
doc: DocId,
max_doc: DocId,
}
impl ExistsDocSet {
pub(crate) fn new(columns: Vec<DynamicColumn>, max_doc: DocId) -> Self {
let mut set = Self {
columns,
doc: 0u32,
max_doc,
};
set.find_next();
set
}
fn find_next(&mut self) -> DocId {
while self.doc < self.max_doc {
if self
.columns
.iter()
.any(|col| col.column_index().has_value(self.doc))
{
return self.doc;
}
self.doc += 1;
}
self.doc = TERMINATED;
TERMINATED
}
}
impl DocSet for ExistsDocSet {
fn advance(&mut self) -> DocId {
self.seek(self.doc + 1)
}
fn size_hint(&self) -> u32 {
0
}
fn doc(&self) -> DocId {
self.doc
}
#[inline(always)]
fn seek(&mut self, target: DocId) -> DocId {
self.doc = target;
self.find_next()
}
}
#[cfg(test)]
mod tests {
use std::net::Ipv6Addr;
use std::ops::Bound;
use common::DateTime;
use time::OffsetDateTime;
use crate::collector::Count;
use crate::query::exist_query::ExistsQuery;
use crate::query::{BooleanQuery, RangeQuery};
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
use crate::{doc, Index, Searcher};
#[test]
fn test_exists_query_simple() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let all_field = schema_builder.add_u64_field("all", INDEXED | FAST);
let even_field = schema_builder.add_u64_field("even", INDEXED | FAST);
let odd_field = schema_builder.add_text_field("odd", STRING | FAST);
let multi_field = schema_builder.add_text_field("multi", FAST);
let _never_field = schema_builder.add_u64_field("never", INDEXED | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
if i % 10 == 0 {
index_writer.add_document(doc!(all_field => i, even_field => i, multi_field => i.to_string(), multi_field => (i + 1).to_string()))?;
} else {
index_writer.add_document(doc!(all_field => i, even_field => i))?;
}
} else {
index_writer.add_document(doc!(all_field => i, odd_field => i.to_string()))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "all")?, 100);
assert_eq!(count_existing_fields(&searcher, "odd")?, 50);
assert_eq!(count_existing_fields(&searcher, "even")?, 50);
assert_eq!(count_existing_fields(&searcher, "multi")?, 10);
assert_eq!(count_existing_fields(&searcher, "never")?, 0);
// exercise seek
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(50),
Bound::Unbounded,
)),
Box::new(ExistsQuery::new_exists_query("even".to_string())),
]);
assert_eq!(searcher.search(&query, &Count)?, 25);
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(0),
Bound::Excluded(50),
)),
Box::new(ExistsQuery::new_exists_query("odd".to_string())),
]);
assert_eq!(searcher.search(&query, &Count)?, 25);
Ok(())
}
#[test]
fn test_exists_query_json() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
index_writer.add_document(doc!(json => json!({"all": i, "even": true})))?;
} else {
index_writer
.add_document(doc!(json => json!({"all": i.to_string(), "odd": true})))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "json.all")?, 100);
assert_eq!(count_existing_fields(&searcher, "json.even")?, 50);
assert_eq!(count_existing_fields(&searcher, "json.odd")?, 50);
// Handling of non-existing fields:
assert_eq!(count_existing_fields(&searcher, "json.absent")?, 0);
assert_eq!(
searcher
.search(
&ExistsQuery::new_exists_query("does_not_exists.absent".to_string()),
&Count
)
.unwrap_err()
.to_string(),
"The field does not exist: 'does_not_exists.absent'"
);
Ok(())
}
#[test]
fn test_exists_query_misc_supported_types() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let bool = schema_builder.add_bool_field("bool", FAST);
let bytes = schema_builder.add_bytes_field("bytes", FAST);
let date = schema_builder.add_date_field("date", FAST);
let f64 = schema_builder.add_f64_field("f64", FAST);
let ip_addr = schema_builder.add_ip_addr_field("ip_addr", FAST);
let facet = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let now = OffsetDateTime::now_utc().unix_timestamp();
for i in 0u8..100u8 {
if i % 2 == 0 {
let date_val = DateTime::from_utc(OffsetDateTime::from_unix_timestamp(
now + i as i64 * 100,
)?);
index_writer.add_document(
doc!(bool => i % 3 == 0, bytes => vec![i, i + 1, i + 2], date => date_val),
)?;
} else {
let ip_addr_v6 = Ipv6Addr::new(0, 0, 0, 0, 0, 0xffff, 0xc00a, i.into());
index_writer
.add_document(doc!(f64 => i as f64 * 0.5, ip_addr => ip_addr_v6, facet => Facet::from("/facet/foo"), facet => Facet::from("/facet/bar")))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "bool")?, 50);
assert_eq!(count_existing_fields(&searcher, "bytes")?, 50);
assert_eq!(count_existing_fields(&searcher, "date")?, 50);
assert_eq!(count_existing_fields(&searcher, "f64")?, 50);
assert_eq!(count_existing_fields(&searcher, "ip_addr")?, 50);
assert_eq!(count_existing_fields(&searcher, "facet")?, 50);
Ok(())
}
#[test]
fn test_exists_query_unsupported_types() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let not_fast = schema_builder.add_text_field("not_fast", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
not_fast => "slow",
))?;
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(
searcher
.search(
&ExistsQuery::new_exists_query("not_fast".to_string()),
&Count
)
.unwrap_err()
.to_string(),
"Schema error: 'Field not_fast is not a fast field.'"
);
assert_eq!(
searcher
.search(
&ExistsQuery::new_exists_query("does_not_exists".to_string()),
&Count
)
.unwrap_err()
.to_string(),
"The field does not exist: 'does_not_exists'"
);
Ok(())
}
fn count_existing_fields(searcher: &Searcher, field: &str) -> crate::Result<usize> {
let query = ExistsQuery::new_exists_query(field.to_string());
searcher.search(&query, &Count)
}
}

View File

@@ -3,7 +3,7 @@ use once_cell::sync::OnceCell;
use tantivy_fst::Automaton;
use crate::query::{AutomatonWeight, EnableScoring, Query, Weight};
use crate::schema::{Term, Type};
use crate::schema::Term;
use crate::TantivyError::InvalidArgument;
pub(crate) struct DfaWrapper(pub DFA);
@@ -38,7 +38,7 @@ impl Automaton for DfaWrapper {
/// use tantivy::collector::{Count, TopDocs};
/// use tantivy::query::FuzzyTermQuery;
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, Index, IndexWriter, Term};
/// use tantivy::{doc, Index, Term};
///
/// fn example() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
@@ -46,7 +46,7 @@ impl Automaton for DfaWrapper {
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// let mut index_writer = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;
@@ -132,46 +132,18 @@ impl FuzzyTermQuery {
});
let term_value = self.term.value();
let term_text = if term_value.typ() == Type::Json {
if let Some(json_path_type) = term_value.json_path_type() {
if json_path_type != Type::Str {
return Err(InvalidArgument(format!(
"The fuzzy term query requires a string path type for a json term. Found \
{:?}",
json_path_type
)));
}
}
std::str::from_utf8(self.term.serialized_value_bytes()).map_err(|_| {
InvalidArgument(
"Failed to convert json term value bytes to utf8 string.".to_string(),
)
})?
} else {
term_value.as_str().ok_or_else(|| {
InvalidArgument("The fuzzy term query requires a string term.".to_string())
})?
};
let term_text = term_value.as_str().ok_or_else(|| {
InvalidArgument("The fuzzy term query requires a string term.".to_string())
})?;
let automaton = if self.prefix {
automaton_builder.build_prefix_dfa(term_text)
} else {
automaton_builder.build_dfa(term_text)
};
if let Some((json_path_bytes, _)) = term_value.as_json() {
Ok(AutomatonWeight::new_for_json_path(
self.term.field(),
DfaWrapper(automaton),
json_path_bytes,
))
} else {
Ok(AutomatonWeight::new(
self.term.field(),
DfaWrapper(automaton),
))
}
Ok(AutomatonWeight::new(
self.term.field(),
DfaWrapper(automaton),
))
}
}
@@ -185,90 +157,8 @@ impl Query for FuzzyTermQuery {
mod test {
use super::FuzzyTermQuery;
use crate::collector::{Count, TopDocs};
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{Schema, STORED, TEXT};
use crate::{assert_nearly_equals, Index, IndexWriter, TantivyDocument, Term};
#[test]
pub fn test_fuzzy_json_path() -> crate::Result<()> {
// # Defining the schema
let mut schema_builder = Schema::builder();
let attributes = schema_builder.add_json_field("attributes", TEXT | STORED);
let schema = schema_builder.build();
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"attributes": {
"a": "japan"
}
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"attributes": {
"aa": "japan"
}
}"#,
)?;
index_writer.add_document(doc)?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
// # Fuzzy search
let query_parser = QueryParser::for_index(&index, vec![attributes]);
let get_json_path_term = |query: &str| -> crate::Result<Term> {
let query = query_parser.parse_query(query)?;
let mut terms = Vec::new();
query.query_terms(&mut |term, _| {
terms.push(term.clone());
});
Ok(terms[0].clone())
};
// shall not match the first document due to json path mismatch
{
let term = get_json_path_term("attributes.aa:japan")?;
let fuzzy_query = FuzzyTermQuery::new(term, 2, true);
let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
assert_eq!(top_docs[0].1.doc_id, 1, "Expected the second document");
}
// shall match the first document because Levenshtein distance is 1 (substitute 'o' with
// 'a')
{
let term = get_json_path_term("attributes.a:japon")?;
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
assert_eq!(top_docs[0].1.doc_id, 0, "Expected the first document");
}
// shall not match because non-prefix Levenshtein distance is more than 1 (add 'a' and 'n')
{
let term = get_json_path_term("attributes.a:jap")?;
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
assert_eq!(top_docs.len(), 0, "Expected no document");
}
Ok(())
}
use crate::schema::{Schema, TEXT};
use crate::{assert_nearly_equals, Index, Term};
#[test]
pub fn test_fuzzy_term() -> crate::Result<()> {
@@ -277,7 +167,7 @@ mod test {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
country_field => "japan",
))?;
@@ -326,7 +216,7 @@ mod test {
let country_field = schema_builder.add_text_field("country", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(country_field => "japan"))?;
index_writer.commit()?;
let reader = index.reader()?;
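
As a reminder of what the remaining (string-term) code path does, a sketch of a distance-1 fuzzy match; the field name, indexed value, and misspelled query term are assumptions mirroring the tests above.

use tantivy::collector::{Count, TopDocs};
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let country_field = schema_builder.add_text_field("country", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer(15_000_000)?;
    index_writer.add_document(doc!(country_field => "japan"))?;
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    // "japon" is one substitution away from the indexed "japan",
    // so a maximum edit distance of 1 matches it.
    let term = Term::from_field_text(country_field, "japon");
    let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
    let (count, top_docs) = searcher.search(&fuzzy_query, &(Count, TopDocs::with_limit(2)))?;
    assert_eq!(count, 1);
    assert_eq!(top_docs.len(), 1);
    Ok(())
}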

View File

@@ -8,7 +8,6 @@ mod const_score_query;
mod disjunction_max_query;
mod empty_query;
mod exclude;
mod exist_query;
mod explanation;
mod fuzzy_query;
mod intersection;
@@ -42,7 +41,6 @@ pub use self::const_score_query::{ConstScoreQuery, ConstScorer};
pub use self::disjunction_max_query::DisjunctionMaxQuery;
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
pub use self::exclude::Exclude;
pub use self::exist_query::ExistsQuery;
pub use self::explanation::Explanation;
#[cfg(test)]
pub(crate) use self::fuzzy_query::DfaWrapper;

View File

@@ -1,14 +1,11 @@
use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashMap};
use tokenizer_api::Token;
use crate::query::bm25::idf;
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
use crate::schema::document::{Document, Value};
use crate::schema::{Field, FieldType, IndexRecordOption, Term};
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
use crate::{DocAddress, Result, Searcher, TantivyDocument, TantivyError};
use crate::{DocAddress, Result, Searcher, TantivyError};
#[derive(Debug, PartialEq)]
struct ScoreTerm {
@@ -93,10 +90,10 @@ impl MoreLikeThis {
}
/// Creates a [`BooleanQuery`] using a set of field values.
pub fn query_with_document_fields<'a, V: Value<'a>>(
pub fn query_with_document_fields(
&self,
searcher: &Searcher,
doc_fields: &[(Field, Vec<V>)],
doc_fields: &[(Field, Vec<Value>)],
) -> Result<BooleanQuery> {
let score_terms = self.retrieve_terms_from_doc_fields(searcher, doc_fields)?;
let query = self.create_query(score_terms);
@@ -129,18 +126,26 @@ impl MoreLikeThis {
searcher: &Searcher,
doc_address: DocAddress,
) -> Result<Vec<ScoreTerm>> {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let field_to_values = doc.get_sorted_field_values();
let doc = searcher.doc(doc_address)?;
let field_to_values = doc
.get_sorted_field_values()
.iter()
.map(|(field, values)| {
(
*field,
values.iter().map(|v| (**v).clone()).collect::<Vec<Value>>(),
)
})
.collect::<Vec<_>>();
self.retrieve_terms_from_doc_fields(searcher, &field_to_values)
}
/// Finds terms for a more-like-this query.
/// field_to_field_values is a mapping from field to possible values of that field.
fn retrieve_terms_from_doc_fields<'a, V: Value<'a>>(
fn retrieve_terms_from_doc_fields(
&self,
searcher: &Searcher,
field_to_values: &[(Field, Vec<V>)],
field_to_values: &[(Field, Vec<Value>)],
) -> Result<Vec<ScoreTerm>> {
if field_to_values.is_empty() {
return Err(TantivyError::InvalidArgument(
@@ -159,11 +164,11 @@ impl MoreLikeThis {
/// Computes the frequency of values for a field while updating the term frequencies
/// Note: A FieldValue can be made up of multiple terms.
/// We are interested in extracting terms within FieldValue
fn add_term_frequencies<'a, V: Value<'a>>(
fn add_term_frequencies(
&self,
searcher: &Searcher,
field: Field,
values: &[V],
values: &[Value],
term_frequencies: &mut HashMap<Term, usize>,
) -> Result<()> {
let schema = searcher.schema();
@@ -179,10 +184,11 @@ impl MoreLikeThis {
FieldType::Facet(_) => {
let facets: Vec<&str> = values
.iter()
.map(|value| {
value.as_facet().map(|f| f.encoded_str()).ok_or_else(|| {
TantivyError::InvalidArgument("invalid field value".to_string())
})
.map(|value| match value {
Value::Facet(ref facet) => Ok(facet.encoded_str()),
_ => Err(TantivyError::InvalidArgument(
"invalid field value".to_string(),
)),
})
.collect::<Result<Vec<_>>>()?;
for fake_str in facets {
@@ -197,31 +203,35 @@ impl MoreLikeThis {
}
}
FieldType::Str(text_options) => {
let mut tokenizer_opt = text_options
.get_indexing_options()
.map(|options| options.tokenizer())
.and_then(|tokenizer_name| tokenizer_manager.get(tokenizer_name));
let sink = &mut |token: &Token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
};
// TODO: Validate that these changes align with the HEAD branch.
for value in values {
if let Some(text) = value.as_str() {
let tokenizer = match &mut tokenizer_opt {
None => continue,
Some(tokenizer) => tokenizer,
};
let mut token_stream = tokenizer.token_stream(text);
token_stream.process(sink);
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
token_stream.process(sink);
match value {
Value::PreTokStr(tok_str) => {
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
Value::Str(ref text) => {
if let Some(mut tokenizer) = text_options
.get_indexing_options()
.map(|text_indexing_options| {
text_indexing_options.tokenizer().to_string()
})
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
{
let mut token_stream = tokenizer.token_stream(text);
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);
*term_frequencies.entry(term).or_insert(0) += 1;
}
});
}
}
_ => (),
}
}
}
@@ -238,7 +248,7 @@ impl MoreLikeThis {
}
FieldType::Date(_) => {
for value in values {
let timestamp = value.as_datetime().ok_or_else(|| {
let timestamp = value.as_date().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
let term = Term::from_field_date(field, timestamp);

View File

@@ -1,8 +1,6 @@
use std::fmt::Debug;
use super::MoreLikeThis;
use crate::query::{EnableScoring, Query, Weight};
use crate::schema::{Field, OwnedValue};
use crate::schema::{Field, Value};
use crate::DocAddress;
/// A query that matches all of the documents similar to a document
@@ -30,10 +28,10 @@ pub struct MoreLikeThisQuery {
target: TargetDocument,
}
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, PartialEq, Clone)]
enum TargetDocument {
DocumentAddress(DocAddress),
DocumentFields(Vec<(Field, Vec<OwnedValue>)>),
DocumentAdress(DocAddress),
DocumentFields(Vec<(Field, Vec<Value>)>),
}
impl MoreLikeThisQuery {
@@ -53,20 +51,14 @@ impl Query for MoreLikeThisQuery {
}
};
match &self.target {
TargetDocument::DocumentAddress(doc_address) => self
TargetDocument::DocumentAdress(doc_address) => self
.mlt
.query_with_document(searcher, *doc_address)?
.weight(enable_scoring),
TargetDocument::DocumentFields(doc_fields) => {
let values = doc_fields
.iter()
.map(|(field, values)| (*field, values.iter().collect::<Vec<&OwnedValue>>()))
.collect::<Vec<_>>();
self.mlt
.query_with_document_fields(searcher, &values)?
.weight(enable_scoring)
}
TargetDocument::DocumentFields(doc_fields) => self
.mlt
.query_with_document_fields(searcher, doc_fields)?
.weight(enable_scoring),
}
}
}
@@ -164,7 +156,7 @@ impl MoreLikeThisQueryBuilder {
pub fn with_document(self, doc_address: DocAddress) -> MoreLikeThisQuery {
MoreLikeThisQuery {
mlt: self.mlt,
target: TargetDocument::DocumentAddress(doc_address),
target: TargetDocument::DocumentAdress(doc_address),
}
}
@@ -175,10 +167,7 @@ impl MoreLikeThisQueryBuilder {
/// that will be used to compose the resulting query.
/// This interface is meant to be used when you want to provide your own set of fields
/// not necessarily from a specific document.
pub fn with_document_fields(
self,
doc_fields: Vec<(Field, Vec<OwnedValue>)>,
) -> MoreLikeThisQuery {
pub fn with_document_fields(self, doc_fields: Vec<(Field, Vec<Value>)>) -> MoreLikeThisQuery {
MoreLikeThisQuery {
mlt: self.mlt,
target: TargetDocument::DocumentFields(doc_fields),
@@ -191,7 +180,7 @@ mod tests {
use super::{MoreLikeThisQuery, TargetDocument};
use crate::collector::TopDocs;
use crate::schema::{Schema, STORED, TEXT};
use crate::{DocAddress, Index, IndexWriter};
use crate::{DocAddress, Index};
fn create_test_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -199,7 +188,7 @@ mod tests {
let body = schema_builder.add_text_field("body", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(title => "aaa", body => "the old man and the sea"))?;
index_writer.add_document(doc!(title => "bbb", body => "an old man sailing on the sea"))?;
index_writer.add_document(doc!(title => "ccc", body=> "send this message to alice"))?;
@@ -247,7 +236,7 @@ mod tests {
);
assert_eq!(
query.target,
TargetDocument::DocumentAddress(DocAddress::new(1, 2))
TargetDocument::DocumentAdress(DocAddress::new(1, 2))
);
}
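For reference, this query is normally driven through its builder; a minimal hedged sketch follows (`with_document` appears in the diff above, the threshold setters are assumed from the crate's MoreLikeThisQueryBuilder, and all values are purely illustrative):

use tantivy::query::MoreLikeThisQuery;
use tantivy::DocAddress;

// Hedged sketch: build a more-like-this query from a document that is
// already in the index. Thresholds are illustrative, not recommendations.
fn build_mlt_query() -> MoreLikeThisQuery {
    MoreLikeThisQuery::builder()
        .with_min_doc_frequency(1)
        .with_min_term_frequency(1)
        .with_min_word_length(2)
        .with_boost_factor(1.0)
        .with_document(DocAddress::new(0, 1))
}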

View File

@@ -6,7 +6,7 @@ pub use phrase_prefix_query::PhrasePrefixQuery;
pub use phrase_prefix_scorer::PhrasePrefixScorer;
pub use phrase_prefix_weight::PhrasePrefixWeight;
pub(crate) fn prefix_end(prefix_start: &[u8]) -> Option<Vec<u8>> {
fn prefix_end(prefix_start: &[u8]) -> Option<Vec<u8>> {
let mut res = prefix_start.to_owned();
while !res.is_empty() {
let end = res.len() - 1;
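For context, `prefix_end` computes the exclusive upper bound of the byte range that covers every key starting with the given prefix. A minimal self-contained sketch of how such a helper works (the body below is reconstructed and may differ from the crate's exact implementation):

fn prefix_end(prefix_start: &[u8]) -> Option<Vec<u8>> {
    let mut res = prefix_start.to_owned();
    while !res.is_empty() {
        let end = res.len() - 1;
        if res[end] == u8::MAX {
            // 0xFF cannot be incremented: drop it and try the previous byte.
            res.pop();
        } else {
            // Increment the last byte to obtain the smallest string greater
            // than every string prefixed by `prefix_start`.
            res[end] += 1;
            return Some(res);
        }
    }
    // The prefix was all 0xFF bytes: there is no finite exclusive upper bound.
    None
}

fn main() {
    // The half-open range [b"ab", b"ac") contains every key starting with "ab".
    assert_eq!(prefix_end(b"ab"), Some(b"ac".to_vec()));
    assert_eq!(prefix_end(&[0xFF, 0xFF]), None);
}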

View File

@@ -161,7 +161,7 @@ mod tests {
use crate::docset::TERMINATED;
use crate::query::{EnableScoring, PhrasePrefixQuery, Query};
use crate::schema::{Schema, TEXT};
use crate::{DocSet, IndexWriter, Term};
use crate::{DocSet, Term};
pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -169,7 +169,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
for &text in texts {
let doc = doc!(text_field=>text);
index_writer.add_document(doc)?;

View File

@@ -17,7 +17,7 @@ pub mod tests {
use crate::core::Index;
use crate::query::{EnableScoring, QueryParser, Weight};
use crate::schema::{Schema, Term, TEXT};
use crate::{assert_nearly_equals, DocAddress, DocId, IndexWriter, TERMINATED};
use crate::{assert_nearly_equals, DocAddress, DocId, TERMINATED};
pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -25,7 +25,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
for &text in texts {
let doc = doc!(text_field=>text);
index_writer.add_document(doc)?;
@@ -135,7 +135,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
}
@@ -278,7 +278,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.add_document(doc!(text_field=>"b a"))?;
@@ -310,7 +310,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c d e f g h"))?;
index_writer.commit()?;
}
@@ -348,7 +348,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(json_field=>json!({
"text": "elliot smith the happy who"
})))?;

View File

@@ -847,12 +847,6 @@ impl QueryParser {
}));
(Some(logical_ast), errors)
}
UserInputLeaf::Exists { .. } => (
None,
vec![QueryParserError::UnsupportedQuery(
"Range query need to target a specific field.".to_string(),
)],
),
}
}
}

View File

@@ -31,8 +31,8 @@ impl VecCursor {
self.current_pos = 0;
&mut self.docs
}
fn last_value(&self) -> Option<u32> {
self.docs.iter().last().cloned()
fn last_doc(&self) -> Option<u32> {
self.docs.last().cloned()
}
fn is_empty(&self) -> bool {
self.current().is_none()
@@ -112,15 +112,15 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
finished_to_end = true;
}
let last_value = self.loaded_docs.last_value();
let last_doc = self.loaded_docs.last_doc();
let doc_buffer: &mut Vec<DocId> = self.loaded_docs.get_cleared_data();
self.column.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
doc_buffer,
);
if let Some(last_value) = last_value {
while self.loaded_docs.current() == Some(last_value) {
if let Some(last_doc) = last_doc {
while self.loaded_docs.current() == Some(last_doc) {
self.loaded_docs.next();
}
}
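To illustrate the boundary handling above: consecutive block fetches can both report the document sitting at the cut point (a multi-valued document may straddle two fetch windows), so the freshly loaded block skips leading documents equal to the last one already emitted. A small standalone sketch of that dedup step (illustrative only, not the crate's code):

// Illustrative only: drop leading docs of the new block that were already
// emitted at the end of the previous block.
fn skip_already_emitted(new_block: &[u32], last_emitted: Option<u32>) -> Vec<u32> {
    match last_emitted {
        Some(last) => new_block.iter().copied().skip_while(|&doc| doc == last).collect(),
        None => new_block.to_vec(),
    }
}

fn main() {
    assert_eq!(skip_already_emitted(&[7, 7, 8, 9], Some(7)), vec![8, 9]);
    assert_eq!(skip_already_emitted(&[7, 8, 9], None), vec![7, 8, 9]);
}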
@@ -136,7 +136,7 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
if let Some(docid) = self.loaded_docs.next() {
return docid;
}
if self.next_fetch_start >= self.column.values.num_vals() {
if self.next_fetch_start >= self.column.num_docs() {
return TERMINATED;
}
self.fetch_block();
@@ -177,3 +177,54 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
0 // heuristic possible by checking number of hits when fetching a block
}
}
#[cfg(test)]
mod tests {
use crate::collector::Count;
use crate::directory::RamDirectory;
use crate::query::RangeQuery;
use crate::{schema, Document, IndexBuilder};
#[test]
fn range_query_fast_optional_field_minimum() {
let mut schema_builder = schema::SchemaBuilder::new();
let id_field = schema_builder.add_text_field("id", schema::STRING);
let score_field = schema_builder.add_u64_field("score", schema::FAST | schema::INDEXED);
let dir = RamDirectory::default();
let index = IndexBuilder::new()
.schema(schema_builder.build())
.open_or_create(dir)
.unwrap();
{
let mut writer = index.writer(15_000_000).unwrap();
let count = 1000;
for i in 0..count {
let mut doc = Document::new();
doc.add_text(id_field, format!("doc{i}"));
let nb_scores = i % 2; // 0 or 1 scores
for _ in 0..nb_scores {
doc.add_u64(score_field, 80);
}
writer.add_document(doc).unwrap();
}
writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query = RangeQuery::new_u64_bounds(
"score".to_string(),
std::ops::Bound::Included(70),
std::ops::Bound::Unbounded,
);
let count = searcher.search(&query, &Count).unwrap();
assert_eq!(count, 500);
}
}

View File

@@ -41,14 +41,14 @@ use crate::{DateTime, DocId, Score};
/// use tantivy::collector::Count;
/// use tantivy::query::RangeQuery;
/// use tantivy::schema::{Schema, INDEXED};
/// use tantivy::{doc, Index, IndexWriter};
/// use tantivy::{doc, Index};
/// # fn test() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let year_field = schema_builder.add_u64_field("year", INDEXED);
/// let schema = schema_builder.build();
///
/// let index = Index::create_in_ram(schema);
/// let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 20_000_000)?;
/// let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
/// for year in 1950u64..2017u64 {
/// let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
/// for _ in 0..num_docs_within_year {
@@ -474,10 +474,8 @@ mod tests {
use crate::collector::{Count, TopDocs};
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{
Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT,
};
use crate::{doc, Index, IndexWriter};
use crate::schema::{Document, Field, IntoIpv6Addr, Schema, FAST, INDEXED, STORED, TEXT};
use crate::{doc, Index};
#[test]
fn test_range_query_simple() -> crate::Result<()> {
@@ -554,7 +552,7 @@ mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for i in 1..100 {
let mut doc = TantivyDocument::new();
let mut doc = Document::new();
for j in 1..100 {
if i % j == 0 {
doc.add_i64(int_field, j as i64);
@@ -619,7 +617,7 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 60_000_000).unwrap();
let mut docs = Vec::new();
for i in 1..100 {
let mut doc = TantivyDocument::new();
let mut doc = Document::new();
for j in 1..100 {
if i % j == 0 {
doc.add_f64(float_field, j as f64);
@@ -724,7 +722,7 @@ mod tests {
let ip_addr_2 = IpAddr::from_str("127.0.0.20").unwrap().into_ipv6_addr();
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for _ in 0..1_000 {
index_writer
.add_document(doc!(

View File

@@ -88,7 +88,7 @@ pub mod tests {
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
use crate::{Index, IndexWriter};
use crate::Index;
#[derive(Clone, Debug)]
pub struct Doc {
@@ -158,7 +158,7 @@ pub mod tests {
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let mut writer = index.writer_for_tests().unwrap();
let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
.into_iter()
.map(Ipv6Addr::from_u128)

View File

@@ -76,14 +76,12 @@ impl Weight for FastFieldRangeWeight {
else {
return Ok(Box::new(EmptyScorer));
};
#[allow(clippy::reversed_empty_ranges)]
let value_range = bound_to_value_range(
&self.lower_bound,
&self.upper_bound,
column.min_value(),
column.max_value(),
)
.unwrap_or(1..=0); // empty range
);
if value_range.is_empty() {
return Ok(Box::new(EmptyScorer));
}
@@ -104,17 +102,15 @@ impl Weight for FastFieldRangeWeight {
}
}
// Returns None, if the range cannot be converted to a inclusive range (which equals to a empty
// range).
fn bound_to_value_range<T: MonotonicallyMappableToU64>(
lower_bound: &Bound<T>,
upper_bound: &Bound<T>,
min_value: T,
max_value: T,
) -> Option<RangeInclusive<T>> {
) -> RangeInclusive<T> {
let mut start_value = match lower_bound {
Bound::Included(val) => *val,
Bound::Excluded(val) => T::from_u64(val.to_u64().checked_add(1)?),
Bound::Excluded(val) => T::from_u64(val.to_u64() + 1),
Bound::Unbounded => min_value,
};
if start_value.partial_cmp(&min_value) == Some(std::cmp::Ordering::Less) {
@@ -122,10 +118,10 @@ fn bound_to_value_range<T: MonotonicallyMappableToU64>(
}
let end_value = match upper_bound {
Bound::Included(val) => *val,
Bound::Excluded(val) => T::from_u64(val.to_u64().checked_sub(1)?),
Bound::Excluded(val) => T::from_u64(val.to_u64() - 1),
Bound::Unbounded => max_value,
};
Some(start_value..=end_value)
start_value..=end_value
}
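The switch between `checked_add`/`checked_sub` and plain `+ 1`/`- 1` above is the interesting part: an excluded bound sitting at the numeric limit cannot be tightened, and the checked variant turns that case into an empty range instead of overflowing. A minimal sketch of the idea, using a hypothetical helper (not the crate's API):

use std::ops::Bound;

// Hypothetical helper mirroring the checked variant shown in the diff.
// Returns None when the excluded bound cannot be converted to an inclusive
// one; callers can then treat the whole range as empty.
fn inclusive_lower(bound: &Bound<u64>, min_value: u64) -> Option<u64> {
    match bound {
        Bound::Included(val) => Some(*val),
        Bound::Excluded(val) => val.checked_add(1),
        Bound::Unbounded => Some(min_value),
    }
}

fn main() {
    assert_eq!(inclusive_lower(&Bound::Excluded(41), 0), Some(42));
    // With an unchecked `val + 1` this case would panic in debug builds
    // (and wrap around in release builds), producing a bogus range.
    assert_eq!(inclusive_lower(&Bound::Excluded(u64::MAX), 0), None);
}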
#[cfg(test)]
@@ -141,7 +137,7 @@ pub mod tests {
use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight;
use crate::query::{QueryParser, Weight};
use crate::schema::{NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING};
use crate::{Index, IndexWriter, TERMINATED};
use crate::{Index, TERMINATED};
#[derive(Clone, Debug)]
pub struct Doc {
@@ -209,7 +205,7 @@ pub mod tests {
let field = schema_builder.add_u64_field("test_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc!(field=>52_000u64)).unwrap();
writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
@@ -299,9 +295,6 @@ pub mod tests {
let gen_query_inclusive = |field: &str, range: RangeInclusive<u64>| {
format!("{}:[{} TO {}]", field, range.start(), range.end())
};
let gen_query_exclusive = |field: &str, range: RangeInclusive<u64>| {
format!("{}:{{{} TO {}}}", field, range.start(), range.end())
};
let test_sample = |sample_docs: Vec<Doc>| {
let mut ids: Vec<u64> = sample_docs.iter().map(|doc| doc.id).collect();
@@ -317,20 +310,6 @@ pub mod tests {
let query = gen_query_inclusive("ids", ids[0]..=ids[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Exclusive range
let expected_num_hits = docs
.iter()
.filter(|doc| {
(ids[0].saturating_add(1)..=ids[1].saturating_sub(1)).contains(&doc.id)
})
.count();
let query = gen_query_exclusive("id", ids[0]..=ids[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = gen_query_exclusive("ids", ids[0]..=ids[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search
let id_filter = sample_docs[0].id_name.to_string();
let expected_num_hits = docs

View File

@@ -18,7 +18,7 @@ use crate::schema::Field;
/// use tantivy::collector::Count;
/// use tantivy::query::RegexQuery;
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, Index, IndexWriter, Term};
/// use tantivy::{doc, Index, Term};
///
/// # fn test() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
@@ -26,7 +26,7 @@ use crate::schema::Field;
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer: IndexWriter = index.writer(15_000_000)?;
/// let mut index_writer = index.writer(15_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ))?;
@@ -95,7 +95,7 @@ mod test {
use super::RegexQuery;
use crate::collector::TopDocs;
use crate::schema::{Field, Schema, TEXT};
use crate::{assert_nearly_equals, Index, IndexReader, IndexWriter};
use crate::{assert_nearly_equals, Index, IndexReader};
fn build_test_index() -> crate::Result<(IndexReader, Field)> {
let mut schema_builder = Schema::builder();
@@ -103,7 +103,7 @@ mod test {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
country_field => "japan",
))?;

View File

@@ -116,7 +116,7 @@ mod tests {
use crate::collector::TopDocs;
use crate::query::{QueryParser, TermSetQuery};
use crate::schema::{Schema, TEXT};
use crate::{assert_nearly_equals, Index, IndexWriter, Term};
use crate::{assert_nearly_equals, Index, Term};
#[test]
pub fn test_term_set_query() -> crate::Result<()> {
@@ -126,7 +126,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
field1 => "doc1",
field2 => "val1",
@@ -233,7 +233,7 @@ mod tests {
schema_builder.add_text_field("field", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
let field = schema.get_field("field").unwrap();
index_writer.add_document(doc!(
field => "val1",

View File

@@ -14,7 +14,7 @@ mod tests {
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::{EnableScoring, Query, QueryParser, Scorer, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
use crate::{assert_nearly_equals, DocAddress, Index, IndexWriter, Term, TERMINATED};
use crate::{assert_nearly_equals, DocAddress, Index, Term, TERMINATED};
#[test]
pub fn test_term_query_no_freq() -> crate::Result<()> {
@@ -24,7 +24,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
let doc = doc!(text_field => "a");
index_writer.add_document(doc)?;
index_writer.commit()?;
@@ -50,7 +50,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
for _ in 0..COMPRESSION_BLOCK_SIZE {
let doc = doc!(text_field => "a");
index_writer.add_document(doc)?;
@@ -86,7 +86,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
left_field => "left1 left2 left2 left2f2 left2f2 left3 abcde abcde abcde abcde abcde abcde abcde abcde abcde abcewde abcde abcde",
right_field => "right1 right2",
@@ -133,7 +133,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.add_document(doc!(text_field=>"a c"))?;
index_writer.delete_term(Term::from_field_text(text_field, "b"));
@@ -151,7 +151,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.commit()?;
@@ -185,7 +185,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"a"))?;

Some files were not shown because too many files have changed in this diff.