Compare commits

1 Commit

Author: Pascal Seitz
SHA1: bb57e63522
Date: 2023-12-13 15:52:41 +08:00

Store List of Fields in Segment

Fields may be encoded in the columnar storage or in the inverted index
for JSON fields.
Add a new Segment file that contains the list of fields (schema +
encoded).
36 changed files with 640 additions and 441 deletions
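For orientation, the new `.fieldlist` segment file (see `src/field_list/mod.rs` later in this diff) stores a format-version byte followed by a zstd-compressed payload: a `u32` field count and one record per field. Below is a minimal sketch of that per-field record, based on `write_field`/`FieldConfig` in the diff; the concrete type-code value is a placeholder.

```rust
// Hedged sketch of the per-field record written by `write_field` in
// src/field_list/mod.rs: a 2-byte config (type code + indexed/stored/fast
// flag bits) followed by a length-prefixed field name.
fn decode_flags(flags: u8) -> (bool, bool, bool) {
    // bit 2: indexed, bit 1: stored, bit 0: fast
    ((flags & 0b100) != 0, (flags & 0b010) != 0, (flags & 0b001) != 0)
}

fn main() {
    let type_code = 0u8; // placeholder for Type::to_code() of the field's value type
    let name = b"id";
    let mut record = vec![type_code, 0b101]; // indexed + fast, not stored
    record.extend_from_slice(&(name.len() as u16).to_le_bytes());
    record.extend_from_slice(name);

    let (indexed, stored, fast) = decode_flags(record[1]);
    let name_len = u16::from_le_bytes([record[2], record[3]]) as usize;
    let field_name = std::str::from_utf8(&record[4..4 + name_len]).unwrap();
    println!("{field_name}: indexed={indexed} stored={stored} fast={fast}");
}
```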

View File

@@ -25,7 +25,7 @@ aho-corasick = "1.0"
tantivy-fst = "0.5" tantivy-fst = "0.5"
memmap2 = { version = "0.9.0", optional = true } memmap2 = { version = "0.9.0", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true } lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false } zstd = { version = "0.13", default-features = false }
tempfile = { version = "3.3.0", optional = true } tempfile = { version = "3.3.0", optional = true }
log = "0.4.16" log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] } serde = { version = "1.0.136", features = ["derive"] }
@@ -38,7 +38,7 @@ crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0" rust-stemmers = "1.2.0"
downcast-rs = "1.2.0" downcast-rs = "1.2.0"
bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker4x"] } bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker4x"] }
census = "0.4.2" census = "0.4.0"
rustc-hash = "1.1.0" rustc-hash = "1.1.0"
thiserror = "1.0.30" thiserror = "1.0.30"
htmlescape = "0.3.1" htmlescape = "0.3.1"
@@ -105,7 +105,7 @@ mmap = ["fs4", "tempfile", "memmap2"]
stopwords = [] stopwords = []
lz4-compression = ["lz4_flex"] lz4-compression = ["lz4_flex"]
zstd-compression = ["zstd"] zstd-compression = []
failpoints = ["fail", "fail/failpoints"] failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches. unstable = [] # useful for benches.

View File

@@ -5,18 +5,19 @@
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy) [![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy)
<img src="https://tantivy-search.github.io/logo/tantivy-logo.png" alt="Tantivy, the fastest full-text search engine library written in Rust" height="250"> ![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)
## Fast full-text search engine library written in Rust **Tantivy** is a **full-text search engine library** written in Rust.
**If you are looking for an alternative to Elasticsearch or Apache Solr, check out [Quickwit](https://github.com/quickwit-oss/quickwit), our distributed search engine built on top of Tantivy.** It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
Tantivy is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not to build such a search engine.
an off-the-shelf search engine server, but rather a crate that can be used to build such a search engine.
Tantivy is, in fact, strongly inspired by Lucene's design. Tantivy is, in fact, strongly inspired by Lucene's design.
## Benchmark If you are looking for an alternative to Elasticsearch or Apache Solr, check out [Quickwit](https://github.com/quickwit-oss/quickwit), our search engine built on top of Tantivy.
# Benchmark
The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns
performance for different types of queries/collections. performance for different types of queries/collections.
@@ -27,7 +28,7 @@ Your mileage WILL vary depending on the nature of queries and their load.
Details about the benchmark can be found at this [repository](https://github.com/quickwit-oss/search-benchmark-game). Details about the benchmark can be found at this [repository](https://github.com/quickwit-oss/search-benchmark-game).
## Features # Features
- Full-text search - Full-text search
- Configurable tokenizer (stemming available for 17 Latin languages) with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder)) - Configurable tokenizer (stemming available for 17 Latin languages) with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
@@ -53,11 +54,11 @@ Details about the benchmark can be found at this [repository](https://github.com
- Searcher Warmer API - Searcher Warmer API
- Cheesy logo with a horse - Cheesy logo with a horse
### Non-features ## Non-features
Distributed search is out of the scope of Tantivy, but if you are looking for this feature, check out [Quickwit](https://github.com/quickwit-oss/quickwit/). Distributed search is out of the scope of Tantivy, but if you are looking for this feature, check out [Quickwit](https://github.com/quickwit-oss/quickwit/).
## Getting started # Getting started
Tantivy works on stable Rust and supports Linux, macOS, and Windows. Tantivy works on stable Rust and supports Linux, macOS, and Windows.
@@ -67,7 +68,7 @@ index documents, and search via the CLI or a small server with a REST API.
It walks you through getting a Wikipedia search engine up and running in a few minutes. It walks you through getting a Wikipedia search engine up and running in a few minutes.
- [Reference doc for the last released version](https://docs.rs/tantivy/) - [Reference doc for the last released version](https://docs.rs/tantivy/)
## How can I support this project? # How can I support this project?
There are many ways to support this project. There are many ways to support this project.
@@ -78,16 +79,16 @@ There are many ways to support this project.
- Contribute code (you can join [our Discord server](https://discord.gg/MT27AG5EVE)) - Contribute code (you can join [our Discord server](https://discord.gg/MT27AG5EVE))
- Talk about Tantivy around you - Talk about Tantivy around you
## Contributing code # Contributing code
We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR. We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.
Feel free to update CHANGELOG.md with your contribution. Feel free to update CHANGELOG.md with your contribution.
### Tokenizer ## Tokenizer
When implementing a tokenizer for tantivy depend on the `tantivy-tokenizer-api` crate. When implementing a tokenizer for tantivy depend on the `tantivy-tokenizer-api` crate.
### Clone and build locally ## Clone and build locally
Tantivy compiles on stable Rust. Tantivy compiles on stable Rust.
To check out and run tests, you can simply run: To check out and run tests, you can simply run:
@@ -98,7 +99,7 @@ cd tantivy
cargo test cargo test
``` ```
## Companies Using Tantivy # Companies Using Tantivy
<p align="left"> <p align="left">
<img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" />&nbsp; <img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" />&nbsp;
@@ -110,7 +111,7 @@ cargo test
<img align="center" src="doc/assets/images/element-dark-theme.png#gh-dark-mode-only" alt="Element.io" height="25" width="auto" /> <img align="center" src="doc/assets/images/element-dark-theme.png#gh-dark-mode-only" alt="Element.io" height="25" width="auto" />
</p> </p>
## FAQ # FAQ
### Can I use Tantivy in other languages? ### Can I use Tantivy in other languages?

View File

@@ -126,18 +126,18 @@ impl ColumnIndex {
} }
} }
pub fn docid_range_to_rowids(&self, doc_id_range: Range<DocId>) -> Range<RowId> { pub fn docid_range_to_rowids(&self, doc_id: Range<DocId>) -> Range<RowId> {
match self { match self {
ColumnIndex::Empty { .. } => 0..0, ColumnIndex::Empty { .. } => 0..0,
ColumnIndex::Full => doc_id_range, ColumnIndex::Full => doc_id,
ColumnIndex::Optional(optional_index) => { ColumnIndex::Optional(optional_index) => {
let row_start = optional_index.rank(doc_id_range.start); let row_start = optional_index.rank(doc_id.start);
let row_end = optional_index.rank(doc_id_range.end); let row_end = optional_index.rank(doc_id.end);
row_start..row_end row_start..row_end
} }
ColumnIndex::Multivalued(multivalued_index) => { ColumnIndex::Multivalued(multivalued_index) => {
let end_docid = doc_id_range.end.min(multivalued_index.num_docs() - 1) + 1; let end_docid = doc_id.end.min(multivalued_index.num_docs() - 1) + 1;
let start_docid = doc_id_range.start.min(end_docid); let start_docid = doc_id.start.min(end_docid);
let row_start = multivalued_index.start_index_column.get_val(start_docid); let row_start = multivalued_index.start_index_column.get_val(start_docid);
let row_end = multivalued_index.start_index_column.get_val(end_docid); let row_end = multivalued_index.start_index_column.get_val(end_docid);
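The `Optional` branch above maps a doc-id range to a row-id range through `rank`, i.e. the number of non-null documents before a given doc id. A toy, linear-scan sketch of that mapping (the real `OptionalIndex` is block-based):

```rust
// Toy stand-in for the Optional branch: rows exist only for docs that have
// a value, so a doc-id range maps to a row-id range via rank().
fn rank(non_null_docs: &[u32], doc_id: u32) -> u32 {
    // number of non-null docs strictly before `doc_id` (input assumed sorted)
    non_null_docs.iter().take_while(|&&d| d < doc_id).count() as u32
}

fn docid_range_to_rowids(non_null_docs: &[u32], docs: std::ops::Range<u32>) -> std::ops::Range<u32> {
    rank(non_null_docs, docs.start)..rank(non_null_docs, docs.end)
}

fn main() {
    let non_null_docs = [1, 4, 5, 9];
    assert_eq!(docid_range_to_rowids(&non_null_docs, 0..6), 0..3);
    assert_eq!(docid_range_to_rowids(&non_null_docs, 5..10), 2..4);
}
```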

View File

@@ -21,6 +21,8 @@ const DENSE_BLOCK_THRESHOLD: u32 =
const ELEMENTS_PER_BLOCK: u32 = u16::MAX as u32 + 1; const ELEMENTS_PER_BLOCK: u32 = u16::MAX as u32 + 1;
const BLOCK_SIZE: RowId = 1 << 16;
#[derive(Copy, Clone, Debug)] #[derive(Copy, Clone, Debug)]
struct BlockMeta { struct BlockMeta {
non_null_rows_before_block: u32, non_null_rows_before_block: u32,
@@ -107,8 +109,8 @@ struct RowAddr {
#[inline(always)] #[inline(always)]
fn row_addr_from_row_id(row_id: RowId) -> RowAddr { fn row_addr_from_row_id(row_id: RowId) -> RowAddr {
RowAddr { RowAddr {
block_id: (row_id / ELEMENTS_PER_BLOCK) as u16, block_id: (row_id / BLOCK_SIZE) as u16,
in_block_row_id: (row_id % ELEMENTS_PER_BLOCK) as u16, in_block_row_id: (row_id % BLOCK_SIZE) as u16,
} }
} }
@@ -183,13 +185,8 @@ impl Set<RowId> for OptionalIndex {
} }
} }
/// Any value doc_id is allowed.
/// In particular, doc_id = num_rows.
#[inline] #[inline]
fn rank(&self, doc_id: DocId) -> RowId { fn rank(&self, doc_id: DocId) -> RowId {
if doc_id >= self.num_docs() {
return self.num_non_nulls();
}
let RowAddr { let RowAddr {
block_id, block_id,
in_block_row_id, in_block_row_id,
@@ -203,15 +200,13 @@ impl Set<RowId> for OptionalIndex {
block_meta.non_null_rows_before_block + block_offset_row_id block_meta.non_null_rows_before_block + block_offset_row_id
} }
/// Any value doc_id is allowed.
/// In particular, doc_id = num_rows.
#[inline] #[inline]
fn rank_if_exists(&self, doc_id: DocId) -> Option<RowId> { fn rank_if_exists(&self, doc_id: DocId) -> Option<RowId> {
let RowAddr { let RowAddr {
block_id, block_id,
in_block_row_id, in_block_row_id,
} = row_addr_from_row_id(doc_id); } = row_addr_from_row_id(doc_id);
let block_meta = *self.block_metas.get(block_id as usize)?; let block_meta = self.block_metas[block_id as usize];
let block = self.block(block_meta); let block = self.block(block_meta);
let block_offset_row_id = match block { let block_offset_row_id = match block {
Block::Dense(dense_block) => dense_block.rank_if_exists(in_block_row_id), Block::Dense(dense_block) => dense_block.rank_if_exists(in_block_row_id),
@@ -496,7 +491,7 @@ fn deserialize_optional_index_block_metadatas(
non_null_rows_before_block += num_non_null_rows; non_null_rows_before_block += num_non_null_rows;
} }
block_metas.resize( block_metas.resize(
((num_rows + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK) as usize, ((num_rows + BLOCK_SIZE - 1) / BLOCK_SIZE) as usize,
BlockMeta { BlockMeta {
non_null_rows_before_block, non_null_rows_before_block,
start_byte_offset, start_byte_offset,
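The new `BLOCK_SIZE` constant above equals `ELEMENTS_PER_BLOCK` (both are `1 << 16`), so block addressing splits a row id into a block id (high bits) and an in-block offset (low 16 bits). A standalone sketch of that arithmetic:

```rust
// Standalone sketch of row_addr_from_row_id above: rows are grouped in
// blocks of 1 << 16 elements, so the high bits select the block and the
// low 16 bits select the position inside it.
const BLOCK_SIZE: u32 = 1 << 16;

fn row_addr(row_id: u32) -> (u16, u16) {
    ((row_id / BLOCK_SIZE) as u16, (row_id % BLOCK_SIZE) as u16)
}

fn main() {
    assert_eq!(row_addr(0), (0, 0));
    assert_eq!(row_addr(65_535), (0, 65_535));
    assert_eq!(row_addr(65_536), (1, 0));
    assert_eq!(row_addr(200_000), (3, 3392));
}
```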

View File

@@ -39,8 +39,7 @@ pub trait Set<T> {
/// ///
/// # Panics /// # Panics
/// ///
/// May panic if rank is greater or equal to the number of /// May panic if rank is greater than the number of elements in the Set.
/// elements in the Set.
fn select(&self, rank: T) -> T; fn select(&self, rank: T) -> T;
/// Creates a brand new select cursor. /// Creates a brand new select cursor.

View File

@@ -3,30 +3,6 @@ use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest}; use proptest::{prop_oneof, proptest};
use super::*; use super::*;
use crate::{ColumnarReader, ColumnarWriter, DynamicColumnHandle};
#[test]
fn test_optional_index_bug_2293() {
// tests for panic in docid_range_to_rowids for docid == num_docs
test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK - 1);
test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK);
test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK + 1);
}
fn test_optional_index_with_num_docs(num_docs: u32) {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(100, "score", 80i64);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(num_docs, None, &mut buffer)
.unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("score").unwrap();
assert_eq!(cols.len(), 1);
let col = cols[0].open().unwrap();
col.column_index().docid_range_to_rowids(0..num_docs);
}
#[test] #[test]
fn test_dense_block_threshold() { fn test_dense_block_threshold() {
@@ -59,7 +35,7 @@ proptest! {
#[test] #[test]
fn test_with_random_sets_simple() { fn test_with_random_sets_simple() {
let vals = 10..ELEMENTS_PER_BLOCK * 2; let vals = 10..BLOCK_SIZE * 2;
let mut out: Vec<u8> = Vec::new(); let mut out: Vec<u8> = Vec::new();
serialize_optional_index(&vals, 100, &mut out).unwrap(); serialize_optional_index(&vals, 100, &mut out).unwrap();
let null_index = open_optional_index(OwnedBytes::new(out)).unwrap(); let null_index = open_optional_index(OwnedBytes::new(out)).unwrap();
@@ -195,7 +171,7 @@ fn test_optional_index_rank() {
test_optional_index_rank_aux(&[0u32, 1u32]); test_optional_index_rank_aux(&[0u32, 1u32]);
let mut block = Vec::new(); let mut block = Vec::new();
block.push(3u32); block.push(3u32);
block.extend((0..ELEMENTS_PER_BLOCK).map(|i| i + ELEMENTS_PER_BLOCK + 1)); block.extend((0..BLOCK_SIZE).map(|i| i + BLOCK_SIZE + 1));
test_optional_index_rank_aux(&block); test_optional_index_rank_aux(&block);
} }
@@ -209,8 +185,8 @@ fn test_optional_index_iter_empty_one() {
fn test_optional_index_iter_dense_block() { fn test_optional_index_iter_dense_block() {
let mut block = Vec::new(); let mut block = Vec::new();
block.push(3u32); block.push(3u32);
block.extend((0..ELEMENTS_PER_BLOCK).map(|i| i + ELEMENTS_PER_BLOCK + 1)); block.extend((0..BLOCK_SIZE).map(|i| i + BLOCK_SIZE + 1));
test_optional_index_iter_aux(&block, 3 * ELEMENTS_PER_BLOCK); test_optional_index_iter_aux(&block, 3 * BLOCK_SIZE);
} }
#[test] #[test]

View File

@@ -101,7 +101,7 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
row_id_hits: &mut Vec<RowId>, row_id_hits: &mut Vec<RowId>,
) { ) {
let row_id_range = row_id_range.start..row_id_range.end.min(self.num_vals()); let row_id_range = row_id_range.start..row_id_range.end.min(self.num_vals());
for idx in row_id_range { for idx in row_id_range.start..row_id_range.end {
let val = self.get_val(idx); let val = self.get_val(idx);
if value_range.contains(&val) { if value_range.contains(&val) {
row_id_hits.push(idx); row_id_hits.push(idx);

View File

@@ -58,7 +58,7 @@ impl ColumnType {
self == &ColumnType::DateTime self == &ColumnType::DateTime
} }
pub(crate) fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> { pub fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
COLUMN_TYPES.get(code as usize).copied().ok_or(InvalidData) COLUMN_TYPES.get(code as usize).copied().ok_or(InvalidData)
} }
} }

View File

@@ -333,7 +333,7 @@ impl ColumnarWriter {
num_docs: RowId, num_docs: RowId,
old_to_new_row_ids: Option<&[RowId]>, old_to_new_row_ids: Option<&[RowId]>,
wrt: &mut dyn io::Write, wrt: &mut dyn io::Write,
) -> io::Result<()> { ) -> io::Result<Vec<(String, ColumnType)>> {
let mut serializer = ColumnarSerializer::new(wrt); let mut serializer = ColumnarSerializer::new(wrt);
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
.numerical_field_hash_map .numerical_field_hash_map
@@ -374,7 +374,9 @@ impl ColumnarWriter {
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries); let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
let mut symbol_byte_buffer: Vec<u8> = Vec::new(); let mut symbol_byte_buffer: Vec<u8> = Vec::new();
for (column_name, column_type, addr) in columns { for (column_name, column_type, addr) in columns.iter() {
let column_type = *column_type;
let addr = *addr;
match column_type { match column_type {
ColumnType::Bool => { ColumnType::Bool => {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr); let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
@@ -485,7 +487,15 @@ impl ColumnarWriter {
}; };
} }
serializer.finalize(num_docs)?; serializer.finalize(num_docs)?;
Ok(()) Ok(columns
.into_iter()
.map(|(column_name, column_type, _)| {
(
String::from_utf8_lossy(column_name).to_string(),
column_type,
)
})
.collect())
} }
} }

View File

@@ -6,7 +6,7 @@ use ownedbytes::OwnedBytes;
use crate::ByteCount; use crate::ByteCount;
#[derive(Clone, Copy, Eq, PartialEq)] #[derive(Clone, Copy, Eq, PartialEq, Hash)]
pub struct TinySet(u64); pub struct TinySet(u64);
impl fmt::Debug for TinySet { impl fmt::Debug for TinySet {
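`TinySet` gains `Hash` here because, later in this diff, `serialize_segment_fields` deduplicates `(Field, OrderedPathId, TinySet)` triples in a `HashSet`. A simplified stand-in bitset illustrating that usage, assuming `TinySet`-style copy-and-insert semantics:

```rust
// Sketch only: `Bits` is a simplified stand-in for TinySet, used as part of
// a HashSet key the way serialize_segment_fields uses (field, path, type-set).
use std::collections::HashSet;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)]
struct Bits(u64);

impl Bits {
    fn insert(self, el: u32) -> Bits {
        Bits(self.0 | 1u64 << el)
    }
}

fn main() {
    let mut seen: HashSet<(u32, Bits)> = HashSet::new();
    let types = Bits::default().insert(0).insert(3);
    assert!(seen.insert((7, types)));
    assert!(!seen.insert((7, types))); // duplicate triple is dropped
}
```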

View File

@@ -81,8 +81,8 @@ where
T: InputTakeAtPosition + Clone, T: InputTakeAtPosition + Clone,
<T as InputTakeAtPosition>::Item: AsChar + Clone, <T as InputTakeAtPosition>::Item: AsChar + Clone,
{ {
opt_i(nom::character::complete::multispace0)(input) opt_i(nom::character::complete::space0)(input)
.map(|(left, (spaces, errors))| (left, (spaces.expect("multispace0 can't fail"), errors))) .map(|(left, (spaces, errors))| (left, (spaces.expect("space0 can't fail"), errors)))
} }
pub(crate) fn space1_infallible<T>(input: T) -> JResult<T, Option<T>> pub(crate) fn space1_infallible<T>(input: T) -> JResult<T, Option<T>>
@@ -90,7 +90,7 @@ where
T: InputTakeAtPosition + Clone + InputLength, T: InputTakeAtPosition + Clone + InputLength,
<T as InputTakeAtPosition>::Item: AsChar + Clone, <T as InputTakeAtPosition>::Item: AsChar + Clone,
{ {
opt_i(nom::character::complete::multispace1)(input).map(|(left, (spaces, mut errors))| { opt_i(nom::character::complete::space1)(input).map(|(left, (spaces, mut errors))| {
if spaces.is_none() { if spaces.is_none() {
errors.push(LenientErrorInternal { errors.push(LenientErrorInternal {
pos: left.input_len(), pos: left.input_len(),

View File

@@ -3,7 +3,7 @@ use std::iter::once;
use nom::branch::alt; use nom::branch::alt;
use nom::bytes::complete::tag; use nom::bytes::complete::tag;
use nom::character::complete::{ use nom::character::complete::{
anychar, char, digit1, multispace0, multispace1, none_of, one_of, satisfy, u32, anychar, char, digit1, none_of, one_of, satisfy, space0, space1, u32,
}; };
use nom::combinator::{eof, map, map_res, opt, peek, recognize, value, verify}; use nom::combinator::{eof, map, map_res, opt, peek, recognize, value, verify};
use nom::error::{Error, ErrorKind}; use nom::error::{Error, ErrorKind};
@@ -65,7 +65,7 @@ fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&st
|inp| { |inp| {
opt_i_err( opt_i_err(
preceded( preceded(
multispace0, space0,
recognize(many1(satisfy(|c| { recognize(many1(satisfy(|c| {
!c.is_whitespace() && !delimiter.contains(c) !c.is_whitespace() && !delimiter.contains(c)
}))), }))),
@@ -225,10 +225,10 @@ fn term_group(inp: &str) -> IResult<&str, UserInputAst> {
map( map(
tuple(( tuple((
terminated(field_name, multispace0), terminated(field_name, space0),
delimited( delimited(
tuple((char('('), multispace0)), tuple((char('('), space0)),
separated_list0(multispace1, tuple((opt(occur_symbol), term_or_phrase))), separated_list0(space1, tuple((opt(occur_symbol), term_or_phrase))),
char(')'), char(')'),
), ),
)), )),
@@ -250,7 +250,7 @@ fn term_group_precond(inp: &str) -> IResult<&str, (), ()> {
(), (),
peek(tuple(( peek(tuple((
field_name, field_name,
multispace0, space0,
char('('), // when we are here, we know it can't be anything but a term group char('('), // when we are here, we know it can't be anything but a term group
))), ))),
)(inp) )(inp)
@@ -259,7 +259,7 @@ fn term_group_precond(inp: &str) -> IResult<&str, (), ()> {
fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> { fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> {
let (mut inp, (field_name, _, _, _)) = let (mut inp, (field_name, _, _, _)) =
tuple((field_name, multispace0, char('('), multispace0))(inp).expect("precondition failed"); tuple((field_name, space0, char('('), space0))(inp).expect("precondition failed");
let mut terms = Vec::new(); let mut terms = Vec::new();
let mut errs = Vec::new(); let mut errs = Vec::new();
@@ -305,7 +305,7 @@ fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
UserInputLeaf::Exists { UserInputLeaf::Exists {
field: String::new(), field: String::new(),
}, },
tuple((multispace0, char('*'))), tuple((space0, char('*'))),
)(inp) )(inp)
} }
@@ -314,7 +314,7 @@ fn exists_precond(inp: &str) -> IResult<&str, (), ()> {
(), (),
peek(tuple(( peek(tuple((
field_name, field_name,
multispace0, space0,
char('*'), // when we are here, we know it can't be anything but a exists char('*'), // when we are here, we know it can't be anything but a exists
))), ))),
)(inp) )(inp)
@@ -323,7 +323,7 @@ fn exists_precond(inp: &str) -> IResult<&str, (), ()> {
fn exists_infallible(inp: &str) -> JResult<&str, UserInputAst> { fn exists_infallible(inp: &str) -> JResult<&str, UserInputAst> {
let (inp, (field_name, _, _)) = let (inp, (field_name, _, _)) =
tuple((field_name, multispace0, char('*')))(inp).expect("precondition failed"); tuple((field_name, space0, char('*')))(inp).expect("precondition failed");
let exists = UserInputLeaf::Exists { field: field_name }.into(); let exists = UserInputLeaf::Exists { field: field_name }.into();
Ok((inp, (exists, Vec::new()))) Ok((inp, (exists, Vec::new())))
@@ -349,7 +349,7 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
alt_infallible( alt_infallible(
( (
( (
value((), tuple((tag("IN"), multispace0, char('[')))), value((), tuple((tag("IN"), space0, char('[')))),
map(set_infallible, |(set, errs)| (Some(set), errs)), map(set_infallible, |(set, errs)| (Some(set), errs)),
), ),
( (
@@ -430,8 +430,8 @@ fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
// check for unbounded range in the form of <5, <=10, >5, >=5 // check for unbounded range in the form of <5, <=10, >5, >=5
let elastic_unbounded_range = map( let elastic_unbounded_range = map(
tuple(( tuple((
preceded(multispace0, alt((tag(">="), tag("<="), tag("<"), tag(">")))), preceded(space0, alt((tag(">="), tag("<="), tag("<"), tag(">")))),
preceded(multispace0, range_term_val()), preceded(space0, range_term_val()),
)), )),
|(comparison_sign, bound)| match comparison_sign { |(comparison_sign, bound)| match comparison_sign {
">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded), ">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded),
@@ -444,7 +444,7 @@ fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
); );
let lower_bound = map( let lower_bound = map(
separated_pair(one_of("{["), multispace0, range_term_val()), separated_pair(one_of("{["), space0, range_term_val()),
|(boundary_char, lower_bound)| { |(boundary_char, lower_bound)| {
if lower_bound == "*" { if lower_bound == "*" {
UserInputBound::Unbounded UserInputBound::Unbounded
@@ -457,7 +457,7 @@ fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
); );
let upper_bound = map( let upper_bound = map(
separated_pair(range_term_val(), multispace0, one_of("}]")), separated_pair(range_term_val(), space0, one_of("}]")),
|(upper_bound, boundary_char)| { |(upper_bound, boundary_char)| {
if upper_bound == "*" { if upper_bound == "*" {
UserInputBound::Unbounded UserInputBound::Unbounded
@@ -469,11 +469,8 @@ fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
}, },
); );
let lower_to_upper = separated_pair( let lower_to_upper =
lower_bound, separated_pair(lower_bound, tuple((space1, tag("TO"), space1)), upper_bound);
tuple((multispace1, tag("TO"), multispace1)),
upper_bound,
);
map( map(
alt((elastic_unbounded_range, lower_to_upper)), alt((elastic_unbounded_range, lower_to_upper)),
@@ -493,16 +490,13 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
word_infallible("]}"), word_infallible("]}"),
space1_infallible, space1_infallible,
opt_i_err( opt_i_err(
terminated(tag("TO"), alt((value((), multispace1), value((), eof)))), terminated(tag("TO"), alt((value((), space1), value((), eof)))),
"missing keyword TO", "missing keyword TO",
), ),
word_infallible("]}"), word_infallible("]}"),
opt_i_err(one_of("]}"), "missing range delimiter"), opt_i_err(one_of("]}"), "missing range delimiter"),
)), )),
|( |((lower_bound_kind, _space0, lower, _space1, to, upper, upper_bound_kind), errs)| {
(lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind),
errs,
)| {
let lower_bound = match (lower_bound_kind, lower) { let lower_bound = match (lower_bound_kind, lower) {
(_, Some("*")) => UserInputBound::Unbounded, (_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded, (_, None) => UserInputBound::Unbounded,
@@ -602,10 +596,10 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
fn set(inp: &str) -> IResult<&str, UserInputLeaf> { fn set(inp: &str) -> IResult<&str, UserInputLeaf> {
map( map(
preceded( preceded(
tuple((multispace0, tag("IN"), multispace1)), tuple((space0, tag("IN"), space1)),
delimited( delimited(
tuple((char('['), multispace0)), tuple((char('['), space0)),
separated_list0(multispace1, map(simple_term, |(_, term)| term)), separated_list0(space1, map(simple_term, |(_, term)| term)),
char(']'), char(']'),
), ),
), ),
@@ -673,7 +667,7 @@ fn leaf(inp: &str) -> IResult<&str, UserInputAst> {
alt(( alt((
delimited(char('('), ast, char(')')), delimited(char('('), ast, char(')')),
map(char('*'), |_| UserInputAst::from(UserInputLeaf::All)), map(char('*'), |_| UserInputAst::from(UserInputLeaf::All)),
map(preceded(tuple((tag("NOT"), multispace1)), leaf), negate), map(preceded(tuple((tag("NOT"), space1)), leaf), negate),
literal, literal,
))(inp) ))(inp)
} }
@@ -925,17 +919,17 @@ fn aggregate_infallible_expressions(
fn operand_leaf(inp: &str) -> IResult<&str, (BinaryOperand, UserInputAst)> { fn operand_leaf(inp: &str) -> IResult<&str, (BinaryOperand, UserInputAst)> {
tuple(( tuple((
terminated(binary_operand, multispace0), terminated(binary_operand, space0),
terminated(boosted_leaf, multispace0), terminated(boosted_leaf, space0),
))(inp) ))(inp)
} }
fn ast(inp: &str) -> IResult<&str, UserInputAst> { fn ast(inp: &str) -> IResult<&str, UserInputAst> {
let boolean_expr = map( let boolean_expr = map(
separated_pair(boosted_leaf, multispace1, many1(operand_leaf)), separated_pair(boosted_leaf, space1, many1(operand_leaf)),
|(left, right)| aggregate_binary_expressions(left, right), |(left, right)| aggregate_binary_expressions(left, right),
); );
let whitespace_separated_leaves = map(separated_list1(multispace1, occur_leaf), |subqueries| { let whitespace_separated_leaves = map(separated_list1(space1, occur_leaf), |subqueries| {
if subqueries.len() == 1 { if subqueries.len() == 1 {
let (occur_opt, ast) = subqueries.into_iter().next().unwrap(); let (occur_opt, ast) = subqueries.into_iter().next().unwrap();
match occur_opt.unwrap_or(Occur::Should) { match occur_opt.unwrap_or(Occur::Should) {
@@ -948,9 +942,9 @@ fn ast(inp: &str) -> IResult<&str, UserInputAst> {
}); });
delimited( delimited(
multispace0, space0,
alt((boolean_expr, whitespace_separated_leaves)), alt((boolean_expr, whitespace_separated_leaves)),
multispace0, space0,
)(inp) )(inp)
} }
@@ -975,7 +969,7 @@ fn ast_infallible(inp: &str) -> JResult<&str, UserInputAst> {
} }
pub fn parse_to_ast(inp: &str) -> IResult<&str, UserInputAst> { pub fn parse_to_ast(inp: &str) -> IResult<&str, UserInputAst> {
map(delimited(multispace0, opt(ast), eof), |opt_ast| { map(delimited(space0, opt(ast), eof), |opt_ast| {
rewrite_ast(opt_ast.unwrap_or_else(UserInputAst::empty_query)) rewrite_ast(opt_ast.unwrap_or_else(UserInputAst::empty_query))
})(inp) })(inp)
} }
@@ -1151,7 +1145,6 @@ mod test {
#[test] #[test]
fn test_parse_query_to_ast_binary_op() { fn test_parse_query_to_ast_binary_op() {
test_parse_query_to_ast_helper("a AND b", "(+a +b)"); test_parse_query_to_ast_helper("a AND b", "(+a +b)");
test_parse_query_to_ast_helper("a\nAND b", "(+a +b)");
test_parse_query_to_ast_helper("a OR b", "(?a ?b)"); test_parse_query_to_ast_helper("a OR b", "(?a ?b)");
test_parse_query_to_ast_helper("a OR b AND c", "(?a ?(+b +c))"); test_parse_query_to_ast_helper("a OR b AND c", "(?a ?(+b +c))");
test_parse_query_to_ast_helper("a AND b AND c", "(+a +b +c)"); test_parse_query_to_ast_helper("a AND b AND c", "(+a +b +c)");

View File

@@ -596,13 +596,10 @@ mod tests {
use super::*; use super::*;
use crate::aggregation::agg_req::Aggregations; use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::tests::{ use crate::aggregation::tests::{
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit, exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
get_test_index_2_segments, get_test_index_from_values, get_test_index_with_num_docs, get_test_index_2_segments, get_test_index_from_values, get_test_index_with_num_docs,
}; };
use crate::aggregation::AggregationCollector;
use crate::query::AllQuery;
#[test] #[test]
fn histogram_test_crooked_values() -> crate::Result<()> { fn histogram_test_crooked_values() -> crate::Result<()> {
@@ -1354,35 +1351,6 @@ mod tests {
}) })
); );
Ok(())
}
#[test]
fn test_aggregation_histogram_empty_index() -> crate::Result<()> {
// test index without segments
let values = vec![];
let index = get_test_index_from_values(false, &values)?;
let agg_req_1: Aggregations = serde_json::from_value(json!({
"myhisto": {
"histogram": {
"field": "score",
"interval": 10.0
},
}
}))
.unwrap();
let collector = AggregationCollector::from_aggs(agg_req_1, Default::default());
let reader = index.reader()?;
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
// Make sure the result structure is correct
assert_eq!(res["myhisto"]["buckets"].as_array().unwrap().len(), 0);
Ok(()) Ok(())
} }
} }

View File

@@ -309,7 +309,7 @@ impl TopDocs {
/// ///
/// To comfortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to /// To comfortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
/// the [.order_by_fast_field(...)](TopDocs::order_by_fast_field) method. /// the [.order_by_fast_field(...)](TopDocs::order_by_fast_field) method.
pub fn order_by_u64_field( fn order_by_u64_field(
self, self,
field: impl ToString, field: impl ToString,
order: Order, order: Order,

View File

@@ -142,6 +142,7 @@ impl SegmentMeta {
SegmentComponent::FastFields => ".fast".to_string(), SegmentComponent::FastFields => ".fast".to_string(),
SegmentComponent::FieldNorms => ".fieldnorm".to_string(), SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)), SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
SegmentComponent::FieldList => ".fieldlist".to_string(),
}); });
PathBuf::from(path) PathBuf::from(path)
} }

View File

@@ -70,7 +70,7 @@ impl InvertedIndexReader {
&self.termdict &self.termdict
} }
/// Return the fields and types encoded in the dictionary in lexicographic oder. /// Return the fields and types encoded in the dictionary in lexicographic order.
/// Only valid on JSON fields. /// Only valid on JSON fields.
/// ///
/// Notice: This requires a full scan and therefore **very expensive**. /// Notice: This requires a full scan and therefore **very expensive**.

View File

@@ -1,4 +1,4 @@
use columnar::MonotonicallyMappableToU64; use columnar::{ColumnType, MonotonicallyMappableToU64};
use common::{replace_in_place, JsonPathWriter}; use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
@@ -153,7 +153,7 @@ fn index_json_value<'a, V: Value<'a>>(
let mut token_stream = text_analyzer.token_stream(val); let mut token_stream = text_analyzer.token_stream(val);
let unordered_id = ctx let unordered_id = ctx
.path_to_unordered_id .path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()); .get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::Str);
// TODO: make sure the chain position works out. // TODO: make sure the chain position works out.
set_path_id(term_buffer, unordered_id); set_path_id(term_buffer, unordered_id);
@@ -171,7 +171,7 @@ fn index_json_value<'a, V: Value<'a>>(
set_path_id( set_path_id(
term_buffer, term_buffer,
ctx.path_to_unordered_id ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()), .get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::U64),
); );
term_buffer.append_type_and_fast_value(val); term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx); postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
@@ -180,7 +180,7 @@ fn index_json_value<'a, V: Value<'a>>(
set_path_id( set_path_id(
term_buffer, term_buffer,
ctx.path_to_unordered_id ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()), .get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::I64),
); );
term_buffer.append_type_and_fast_value(val); term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx); postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
@@ -189,7 +189,7 @@ fn index_json_value<'a, V: Value<'a>>(
set_path_id( set_path_id(
term_buffer, term_buffer,
ctx.path_to_unordered_id ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()), .get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::F64),
); );
term_buffer.append_type_and_fast_value(val); term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx); postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
@@ -198,7 +198,7 @@ fn index_json_value<'a, V: Value<'a>>(
set_path_id( set_path_id(
term_buffer, term_buffer,
ctx.path_to_unordered_id ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()), .get_or_allocate_unordered_id(json_path_writer.as_str(), ColumnType::Bool),
); );
term_buffer.append_type_and_fast_value(val); term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx); postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
@@ -206,8 +206,10 @@ fn index_json_value<'a, V: Value<'a>>(
ReferenceValueLeaf::Date(val) => { ReferenceValueLeaf::Date(val) => {
set_path_id( set_path_id(
term_buffer, term_buffer,
ctx.path_to_unordered_id ctx.path_to_unordered_id.get_or_allocate_unordered_id(
.get_or_allocate_unordered_id(json_path_writer.as_str()), json_path_writer.as_str(),
ColumnType::DateTime,
),
); );
term_buffer.append_type_and_fast_value(val); term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx); postings_writer.subscribe(doc, 0u32, term_buffer, ctx);

View File

@@ -27,12 +27,14 @@ pub enum SegmentComponent {
/// Bitset describing which document of the segment is alive. /// Bitset describing which document of the segment is alive.
/// (It was representing deleted docs but changed to represent alive docs from v0.17) /// (It was representing deleted docs but changed to represent alive docs from v0.17)
Delete, Delete,
/// Field list describing the fields in the segment.
FieldList,
} }
impl SegmentComponent { impl SegmentComponent {
/// Iterates through the components. /// Iterates through the components.
pub fn iterator() -> slice::Iter<'static, SegmentComponent> { pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [ static SEGMENT_COMPONENTS: [SegmentComponent; 9] = [
SegmentComponent::Postings, SegmentComponent::Postings,
SegmentComponent::Positions, SegmentComponent::Positions,
SegmentComponent::FastFields, SegmentComponent::FastFields,
@@ -41,6 +43,7 @@ impl SegmentComponent {
SegmentComponent::Store, SegmentComponent::Store,
SegmentComponent::TempStore, SegmentComponent::TempStore,
SegmentComponent::Delete, SegmentComponent::Delete,
SegmentComponent::FieldList,
]; ];
SEGMENT_COMPONENTS.iter() SEGMENT_COMPONENTS.iter()
} }

View File

@@ -3,15 +3,14 @@ use std::ops::BitOrAssign;
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
use std::{fmt, io}; use std::{fmt, io};
use fnv::FnvHashMap;
use itertools::Itertools; use itertools::Itertools;
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId}; use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice}; use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders}; use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
use crate::field_list::read_split_fields;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::json_utils::json_path_sep_to_dot;
use crate::schema::{Field, IndexRecordOption, Schema, Type}; use crate::schema::{Field, IndexRecordOption, Schema, Type};
use crate::space_usage::SegmentSpaceUsage; use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader; use crate::store::StoreReader;
@@ -44,6 +43,7 @@ pub struct SegmentReader {
fast_fields_readers: FastFieldReaders, fast_fields_readers: FastFieldReaders,
fieldnorm_readers: FieldNormReaders, fieldnorm_readers: FieldNormReaders,
list_fields_file: Option<FileSlice>, // Optional field list file for backwards compatibility
store_file: FileSlice, store_file: FileSlice,
alive_bitset_opt: Option<AliveBitSet>, alive_bitset_opt: Option<AliveBitSet>,
schema: Schema, schema: Schema,
@@ -153,6 +153,7 @@ impl SegmentReader {
let termdict_composite = CompositeFile::open(&termdict_file)?; let termdict_composite = CompositeFile::open(&termdict_file)?;
let store_file = segment.open_read(SegmentComponent::Store)?; let store_file = segment.open_read(SegmentComponent::Store)?;
let list_fields_file = segment.open_read(SegmentComponent::FieldList).ok();
crate::fail_point!("SegmentReader::open#middle"); crate::fail_point!("SegmentReader::open#middle");
@@ -201,6 +202,7 @@ impl SegmentReader {
segment_id: segment.id(), segment_id: segment.id(),
delete_opstamp: segment.meta().delete_opstamp(), delete_opstamp: segment.meta().delete_opstamp(),
store_file, store_file,
list_fields_file,
alive_bitset_opt, alive_bitset_opt,
positions_composite, positions_composite,
schema, schema,
@@ -299,87 +301,25 @@ impl SegmentReader {
/// field that is not indexed nor a fast field but is stored, it is possible for the field /// field that is not indexed nor a fast field but is stored, it is possible for the field
/// to not be listed. /// to not be listed.
pub fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> { pub fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
let mut indexed_fields: Vec<FieldMetadata> = Vec::new(); if let Some(list_fields_file) = self.list_fields_file.as_ref() {
let mut map_to_canonical = FnvHashMap::default(); let file = list_fields_file.read_bytes()?;
for (field, field_entry) in self.schema().fields() { let fields_metadata =
let field_name = field_entry.name().to_string(); read_split_fields(file)?.collect::<io::Result<Vec<FieldMetadata>>>();
let is_indexed = field_entry.is_indexed(); fields_metadata.map_err(|e| e.into())
} else {
if is_indexed { // Schema fallback
let is_json = field_entry.field_type().value_type() == Type::Json; Ok(self
if is_json { .schema()
let inv_index = self.inverted_index(field)?; .fields()
let encoded_fields_in_index = inv_index.list_encoded_fields()?; .map(|(_field, entry)| FieldMetadata {
let mut build_path = |field_name: &str, mut json_path: String| { field_name: entry.name().to_string(),
// In this case we need to map the potential fast field to the field name typ: entry.field_type().value_type(),
// accepted by the query parser. indexed: entry.is_indexed(),
let create_canonical = stored: entry.is_stored(),
!field_entry.is_expand_dots_enabled() && json_path.contains('.'); fast: entry.is_fast(),
if create_canonical { })
// Without expand dots enabled dots need to be escaped. .collect())
let escaped_json_path = json_path.replace('.', "\\.");
let full_path = format!("{}.{}", field_name, escaped_json_path);
let full_path_unescaped = format!("{}.{}", field_name, &json_path);
map_to_canonical.insert(full_path_unescaped, full_path.to_string());
full_path
} else {
// With expand dots enabled, we can use '.' instead of '\u{1}'.
json_path_sep_to_dot(&mut json_path);
format!("{}.{}", field_name, json_path)
}
};
indexed_fields.extend(
encoded_fields_in_index
.into_iter()
.map(|(name, typ)| (build_path(&field_name, name), typ))
.map(|(field_name, typ)| FieldMetadata {
indexed: true,
stored: false,
field_name,
fast: false,
typ,
}),
);
} else {
indexed_fields.push(FieldMetadata {
indexed: true,
stored: false,
field_name: field_name.to_string(),
fast: false,
typ: field_entry.field_type().value_type(),
});
}
}
} }
let mut fast_fields: Vec<FieldMetadata> = self
.fast_fields()
.columnar()
.iter_columns()?
.map(|(mut field_name, handle)| {
json_path_sep_to_dot(&mut field_name);
// map to canonical path, to avoid similar but different entries.
// Eventually we should just accept '.' seperated for all cases.
let field_name = map_to_canonical
.get(&field_name)
.unwrap_or(&field_name)
.to_string();
FieldMetadata {
indexed: false,
stored: false,
field_name,
fast: true,
typ: Type::from(handle.column_type()),
}
})
.collect();
// Since the type is encoded differently in the fast field and in the inverted index,
// the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
// If we are sure that the order is the same, we can remove this sort.
indexed_fields.sort_unstable();
fast_fields.sort_unstable();
let merged = merge_field_meta_data(vec![indexed_fields, fast_fields], &self.schema);
Ok(merged)
} }
/// Returns the segment id /// Returns the segment id

View File

@@ -238,13 +238,17 @@ impl FastFieldsWriter {
mut self, mut self,
wrt: &mut dyn io::Write, wrt: &mut dyn io::Write,
doc_id_map_opt: Option<&DocIdMapping>, doc_id_map_opt: Option<&DocIdMapping>,
) -> io::Result<()> { ) -> io::Result<Vec<(String, Type)>> {
let num_docs = self.num_docs; let num_docs = self.num_docs;
let old_to_new_row_ids = let old_to_new_row_ids =
doc_id_map_opt.map(|doc_id_mapping| doc_id_mapping.old_to_new_ids()); doc_id_map_opt.map(|doc_id_mapping| doc_id_mapping.old_to_new_ids());
self.columnar_writer let columns = self
.columnar_writer
.serialize(num_docs, old_to_new_row_ids, wrt)?; .serialize(num_docs, old_to_new_row_ids, wrt)?;
Ok(()) Ok(columns
.into_iter()
.map(|(field_name, column)| (field_name.to_string(), column.into()))
.collect())
} }
} }

src/field_list/mod.rs (new file, 369 lines)
View File

@@ -0,0 +1,369 @@
//! The list of fields that are stored in a `tantivy` `Index`.
use std::collections::HashSet;
use std::io::{self, ErrorKind, Read};
use columnar::ColumnType;
use common::TinySet;
use fnv::FnvHashMap;
use crate::indexer::path_to_unordered_id::OrderedPathId;
use crate::json_utils::json_path_sep_to_dot;
use crate::postings::IndexingContext;
use crate::schema::{Field, Schema, Type};
use crate::{merge_field_meta_data, FieldMetadata, Term};
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub(crate) struct FieldConfig {
pub typ: Type,
pub indexed: bool,
pub stored: bool,
pub fast: bool,
}
impl FieldConfig {
fn serialize(&self) -> [u8; 2] {
let typ = self.typ.to_code();
let flags = (self.indexed as u8) << 2 | (self.stored as u8) << 1 | (self.fast as u8);
[typ, flags]
}
fn deserialize_from(data: [u8; 2]) -> io::Result<FieldConfig> {
let typ = Type::from_code(data[0]).ok_or_else(|| {
io::Error::new(
ErrorKind::InvalidData,
format!("could not deserialize type {}", data[0]),
)
})?;
let data = data[1];
let indexed = (data & 0b100) != 0;
let stored = (data & 0b010) != 0;
let fast = (data & 0b001) != 0;
Ok(FieldConfig {
typ,
indexed,
stored,
fast,
})
}
}
/// Serializes the split fields.
pub(crate) fn serialize_segment_fields(
ctx: IndexingContext,
wrt: &mut dyn io::Write,
schema: &Schema,
unordered_id_to_ordered_id: &[(OrderedPathId, TinySet)],
mut columns: Vec<(String, Type)>,
) -> crate::Result<()> {
let mut field_list_set: HashSet<(Field, OrderedPathId, TinySet)> = HashSet::default();
let mut encoded_fields = Vec::new();
let mut map_to_canonical = FnvHashMap::default();
// Replace unordered ids by ordered ids to be able to sort
let ordered_id_to_path = ctx.path_to_unordered_id.ordered_id_to_path();
for (key, _addr) in ctx.term_index.iter() {
let field = Term::wrap(key).field();
let field_entry = schema.get_field_entry(field);
if field_entry.field_type().value_type() == Type::Json {
let byte_range_unordered_id = 5..5 + 4;
let unordered_id =
u32::from_be_bytes(key[byte_range_unordered_id.clone()].try_into().unwrap());
let (path_id, typ_code_bitvec) = unordered_id_to_ordered_id[unordered_id as usize];
if !field_list_set.contains(&(field, path_id, typ_code_bitvec)) {
field_list_set.insert((field, path_id, typ_code_bitvec));
let mut build_path = |field_name: &str, mut json_path: String| {
// In this case we need to map the potential fast field to the field name
// accepted by the query parser.
let create_canonical =
!field_entry.is_expand_dots_enabled() && json_path.contains('.');
if create_canonical {
// Without expand dots enabled dots need to be escaped.
let escaped_json_path = json_path.replace('.', "\\.");
let full_path = format!("{}.{}", field_name, escaped_json_path);
let full_path_unescaped = format!("{}.{}", field_name, &json_path);
map_to_canonical.insert(full_path_unescaped, full_path.to_string());
full_path
} else {
// With expand dots enabled, we can use '.' instead of '\u{1}'.
json_path_sep_to_dot(&mut json_path);
format!("{}.{}", field_name, json_path)
}
};
let path = build_path(
field_entry.name(),
ordered_id_to_path[path_id.path_id() as usize].to_string(), /* String::from_utf8(key[5..].to_vec()).unwrap(), */
);
encoded_fields.push((path, typ_code_bitvec));
}
}
}
let mut indexed_fields: Vec<FieldMetadata> = Vec::new();
for (_field, field_entry) in schema.fields() {
let field_name = field_entry.name().to_string();
let is_indexed = field_entry.is_indexed();
let is_json = field_entry.field_type().value_type() == Type::Json;
if is_indexed && !is_json {
indexed_fields.push(FieldMetadata {
indexed: true,
stored: false,
field_name: field_name.to_string(),
fast: false,
typ: field_entry.field_type().value_type(),
});
}
}
for (field_name, field_type_set) in encoded_fields {
for field_type in field_type_set {
let column_type = ColumnType::try_from_code(field_type as u8).unwrap();
indexed_fields.push(FieldMetadata {
indexed: true,
stored: false,
field_name: field_name.to_string(),
fast: false,
typ: Type::from(column_type),
});
}
}
let mut fast_fields: Vec<FieldMetadata> = columns
.iter_mut()
.map(|(field_name, typ)| {
json_path_sep_to_dot(field_name);
// map to canonical path, to avoid similar but different entries.
// Eventually we should just accept '.' seperated for all cases.
let field_name = map_to_canonical
.get(field_name)
.unwrap_or(field_name)
.to_string();
FieldMetadata {
indexed: false,
stored: false,
field_name,
fast: true,
typ: *typ,
}
})
.collect();
// Since the type is encoded differently in the fast field and in the inverted index,
// the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
// If we are sure that the order is the same, we can remove this sort.
indexed_fields.sort_unstable();
fast_fields.sort_unstable();
let merged = merge_field_meta_data(vec![indexed_fields, fast_fields], schema);
let out = serialize_split_fields(&merged);
wrt.write_all(&out)?;
Ok(())
}
/// Serializes the Split fields.
///
/// `fields_metadata` has to be sorted.
pub fn serialize_split_fields(fields_metadata: &[FieldMetadata]) -> Vec<u8> {
// ensure that fields_metadata is strictly sorted.
debug_assert!(fields_metadata.windows(2).all(|w| w[0] < w[1]));
let mut payload = Vec::new();
// Write Num Fields
let length = fields_metadata.len() as u32;
payload.extend_from_slice(&length.to_le_bytes());
for field_metadata in fields_metadata {
write_field(field_metadata, &mut payload);
}
let compression_level = 3;
let payload_compressed = zstd::stream::encode_all(&mut &payload[..], compression_level)
.expect("zstd encoding failed");
let mut out = Vec::new();
// Write Header -- Format Version
let format_version = 1u8;
out.push(format_version);
// Write Payload
out.extend_from_slice(&payload_compressed);
out
}
fn write_field(field_metadata: &FieldMetadata, out: &mut Vec<u8>) {
let field_config = FieldConfig {
typ: field_metadata.typ,
indexed: field_metadata.indexed,
stored: field_metadata.stored,
fast: field_metadata.fast,
};
// Write Config 2 bytes
out.extend_from_slice(&field_config.serialize());
let str_length = field_metadata.field_name.len() as u16;
// Write String length 2 bytes
out.extend_from_slice(&str_length.to_le_bytes());
out.extend_from_slice(field_metadata.field_name.as_bytes());
}
/// Reads a fixed number of bytes into an array and returns the array.
fn read_exact_array<R: Read, const N: usize>(reader: &mut R) -> io::Result<[u8; N]> {
let mut buffer = [0u8; N];
reader.read_exact(&mut buffer)?;
Ok(buffer)
}
/// Reads the Split fields from a zstd compressed stream of bytes
pub fn read_split_fields<R: Read>(
mut reader: R,
) -> io::Result<impl Iterator<Item = io::Result<FieldMetadata>>> {
let format_version = read_exact_array::<_, 1>(&mut reader)?[0];
assert_eq!(format_version, 1);
let reader = zstd::Decoder::new(reader)?;
read_split_fields_from_zstd(reader)
}
fn read_field<R: Read>(reader: &mut R) -> io::Result<FieldMetadata> {
// Read FieldConfig (2 bytes)
let config_bytes = read_exact_array::<_, 2>(reader)?;
let field_config = FieldConfig::deserialize_from(config_bytes)?; // Assuming this returns a Result
// Read field name length and the field name
let name_len = u16::from_le_bytes(read_exact_array::<_, 2>(reader)?) as usize;
let mut data = vec![0; name_len];
reader.read_exact(&mut data)?;
let field_name = String::from_utf8(data).map_err(|err| {
io::Error::new(
ErrorKind::InvalidData,
format!(
"Encountered invalid utf8 when deserializing field name: {}",
err
),
)
})?;
Ok(FieldMetadata {
field_name,
typ: field_config.typ,
indexed: field_config.indexed,
stored: field_config.stored,
fast: field_config.fast,
})
}
/// Reads the Split fields from a stream of bytes
fn read_split_fields_from_zstd<R: Read>(
mut reader: R,
) -> io::Result<impl Iterator<Item = io::Result<FieldMetadata>>> {
let mut num_fields = u32::from_le_bytes(read_exact_array::<_, 4>(&mut reader)?);
Ok(std::iter::from_fn(move || {
if num_fields == 0 {
return None;
}
num_fields -= 1;
Some(read_field(&mut reader))
}))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn field_config_deser_test() {
let field_config = FieldConfig {
typ: Type::Str,
indexed: true,
stored: false,
fast: true,
};
let serialized = field_config.serialize();
let deserialized = FieldConfig::deserialize_from(serialized).unwrap();
assert_eq!(field_config, deserialized);
}
#[test]
fn write_read_field_test() {
for typ in Type::iter_values() {
let field_metadata = FieldMetadata {
field_name: "test".to_string(),
typ,
indexed: true,
stored: true,
fast: true,
};
let mut out = Vec::new();
write_field(&field_metadata, &mut out);
let deserialized = read_field(&mut &out[..]).unwrap();
assert_eq!(field_metadata, deserialized);
}
let field_metadata = FieldMetadata {
field_name: "test".to_string(),
typ: Type::Str,
indexed: false,
stored: true,
fast: true,
};
let mut out = Vec::new();
write_field(&field_metadata, &mut out);
let deserialized = read_field(&mut &out[..]).unwrap();
assert_eq!(field_metadata, deserialized);
let field_metadata = FieldMetadata {
field_name: "test".to_string(),
typ: Type::Str,
indexed: false,
stored: false,
fast: true,
};
let mut out = Vec::new();
write_field(&field_metadata, &mut out);
let deserialized = read_field(&mut &out[..]).unwrap();
assert_eq!(field_metadata, deserialized);
let field_metadata = FieldMetadata {
field_name: "test".to_string(),
typ: Type::Str,
indexed: true,
stored: false,
fast: false,
};
let mut out = Vec::new();
write_field(&field_metadata, &mut out);
let deserialized = read_field(&mut &out[..]).unwrap();
assert_eq!(field_metadata, deserialized);
}
#[test]
fn write_split_fields_test() {
let fields_metadata = vec![
FieldMetadata {
field_name: "test".to_string(),
typ: Type::Str,
indexed: true,
stored: true,
fast: true,
},
FieldMetadata {
field_name: "test2".to_string(),
typ: Type::Str,
indexed: true,
stored: false,
fast: false,
},
FieldMetadata {
field_name: "test3".to_string(),
typ: Type::U64,
indexed: true,
stored: false,
fast: true,
},
];
let out = serialize_split_fields(&fields_metadata);
let deserialized: Vec<FieldMetadata> = read_split_fields(&mut &out[..])
.unwrap()
.map(|el| el.unwrap())
.collect();
assert_eq!(fields_metadata, deserialized);
}
}

View File

@@ -1651,7 +1651,6 @@ mod tests {
force_end_merge: bool, force_end_merge: bool,
) -> crate::Result<Index> { ) -> crate::Result<Index> {
let mut schema_builder = schema::Schema::builder(); let mut schema_builder = schema::Schema::builder();
let json_field = schema_builder.add_json_field("json", FAST | TEXT | STORED);
let ip_field = schema_builder.add_ip_addr_field("ip", FAST | INDEXED | STORED); let ip_field = schema_builder.add_ip_addr_field("ip", FAST | INDEXED | STORED);
let ips_field = schema_builder let ips_field = schema_builder
.add_ip_addr_field("ips", IpAddrOptions::default().set_fast().set_indexed()); .add_ip_addr_field("ips", IpAddrOptions::default().set_fast().set_indexed());
@@ -1730,9 +1729,7 @@ mod tests {
id_field=>id, id_field=>id,
))?; ))?;
} else { } else {
let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string()});
index_writer.add_document(doc!(id_field=>id, index_writer.add_document(doc!(id_field=>id,
json_field=>json,
bytes_field => id.to_le_bytes().as_slice(), bytes_field => id.to_le_bytes().as_slice(),
id_opt_field => id, id_opt_field => id,
ip_field => ip, ip_field => ip,

View File

@@ -1,3 +1,4 @@
use std::io::Write;
use std::sync::Arc; use std::sync::Arc;
use columnar::{ use columnar::{
@@ -13,6 +14,7 @@ use crate::directory::WritePtr;
use crate::docset::{DocSet, TERMINATED}; use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::{AliveBitSet, FastFieldNotAvailableError}; use crate::fastfield::{AliveBitSet, FastFieldNotAvailableError};
use crate::field_list::serialize_split_fields;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping}; use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
use crate::indexer::SegmentSerializer; use crate::indexer::SegmentSerializer;
@@ -21,8 +23,8 @@ use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
use crate::store::StoreWriter; use crate::store::StoreWriter;
use crate::termdict::{TermMerger, TermOrdinal}; use crate::termdict::{TermMerger, TermOrdinal};
use crate::{ use crate::{
DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, merge_field_meta_data, DocAddress, DocId, FieldMetadata, IndexSettings, IndexSortByField,
SegmentComponent, SegmentOrdinal, InvertedIndexReader, Order, SegmentComponent, SegmentOrdinal,
}; };
/// Segment's max doc must be `< MAX_DOC_LIMIT`. /// Segment's max doc must be `< MAX_DOC_LIMIT`.
@@ -255,6 +257,19 @@ impl IndexMerger {
Ok(()) Ok(())
} }
fn write_field_list(&self, list_field_wrt: &mut WritePtr) -> crate::Result<()> {
let field_metadatas: Vec<Vec<FieldMetadata>> = self
.readers
.iter()
.map(|reader| reader.fields_metadata())
.collect::<crate::Result<Vec<_>>>()?;
let merged = merge_field_meta_data(field_metadatas, &self.schema);
let out = serialize_split_fields(&merged);
list_field_wrt.write_all(&out)?;
Ok(())
}
fn write_fast_fields( fn write_fast_fields(
&self, &self,
fast_field_wrt: &mut WritePtr, fast_field_wrt: &mut WritePtr,
@@ -605,10 +620,6 @@ impl IndexMerger {
segment_postings.positions(&mut positions_buffer); segment_postings.positions(&mut positions_buffer);
segment_postings.term_freq() segment_postings.term_freq()
} else { } else {
// The positions_buffer may contain positions from the previous term
// Existence of positions depend on the value type in JSON fields.
// https://github.com/quickwit-oss/tantivy/issues/2283
positions_buffer.clear();
0u32 0u32
}; };
@@ -777,6 +788,7 @@ impl IndexMerger {
self.write_storable_fields(serializer.get_store_writer(), &doc_id_mapping)?; self.write_storable_fields(serializer.get_store_writer(), &doc_id_mapping)?;
debug!("write-fastfields"); debug!("write-fastfields");
self.write_fast_fields(serializer.get_fast_field_write(), doc_id_mapping)?; self.write_fast_fields(serializer.get_fast_field_write(), doc_id_mapping)?;
self.write_field_list(serializer.get_field_list_write())?;
debug!("close-serializer"); debug!("close-serializer");
serializer.close()?; serializer.close()?;
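The merge above funnels every source segment's field list through `merge_field_meta_data` before a single `.fieldlist` is written for the merged segment. The real helper also receives the schema; the sketch below only illustrates the assumed core semantics, deduplicating entries by (field_name, typ) and OR-ing the flags, and is not the actual implementation.

// Hypothetical sketch of the merge semantics, for illustration only.
fn merge_field_meta_data_sketch(per_segment: Vec<Vec<FieldMetadata>>) -> Vec<FieldMetadata> {
    let mut merged: Vec<FieldMetadata> = Vec::new();
    for meta in per_segment.into_iter().flatten() {
        if let Some(existing) = merged
            .iter_mut()
            .find(|m| m.field_name == meta.field_name && m.typ == meta.typ)
        {
            // A field is indexed/stored/fast in the merged segment if any source segment says so.
            existing.indexed |= meta.indexed;
            existing.stored |= meta.stored;
            existing.fast |= meta.fast;
        } else {
            merged.push(meta);
        }
    }
    // Keep the on-disk list deterministic.
    merged.sort_by(|a, b| a.field_name.cmp(&b.field_name));
    merged
}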

View File

@@ -1,3 +1,5 @@
use columnar::ColumnType;
use common::TinySet;
use fnv::FnvHashMap; use fnv::FnvHashMap;
/// `Field` is represented by an unsigned 32-bit integer type. /// `Field` is represented by an unsigned 32-bit integer type.
@@ -24,34 +26,44 @@ impl From<u32> for OrderedPathId {
#[derive(Default)] #[derive(Default)]
pub(crate) struct PathToUnorderedId { pub(crate) struct PathToUnorderedId {
map: FnvHashMap<String, u32>, /// TinySet contains the type codes of the columns in the path.
map: FnvHashMap<String, (u32, TinySet)>,
} }
impl PathToUnorderedId { impl PathToUnorderedId {
#[inline] #[inline]
pub(crate) fn get_or_allocate_unordered_id(&mut self, path: &str) -> u32 { pub(crate) fn get_or_allocate_unordered_id(&mut self, path: &str, typ: ColumnType) -> u32 {
if let Some(id) = self.map.get(path) { let code_bit = typ.to_code();
if let Some((id, all_codes)) = self.map.get_mut(path) {
*all_codes = all_codes.insert(code_bit as u32);
return *id; return *id;
} }
self.insert_new_path(path) self.insert_new_path(path, code_bit)
} }
#[cold] #[cold]
fn insert_new_path(&mut self, path: &str) -> u32 { fn insert_new_path(&mut self, path: &str, typ_code: u8) -> u32 {
let next_id = self.map.len() as u32; let next_id = self.map.len() as u32;
self.map.insert(path.to_string(), next_id); self.map.insert(
path.to_string(),
(next_id, TinySet::singleton(typ_code as u32)),
);
next_id next_id
} }
/// Returns ids which reflect the lexical order of the paths. /// Returns ids which reflect the lexical order of the paths.
/// ///
/// The returned vec can be indexed with the unordered id to get the ordered id. /// The returned vec can be indexed with the unordered id to get the ordered id.
pub(crate) fn unordered_id_to_ordered_id(&self) -> Vec<OrderedPathId> { pub(crate) fn unordered_id_to_ordered_id(&self) -> Vec<(OrderedPathId, TinySet)> {
let mut sorted_ids: Vec<(&str, &u32)> = let mut sorted_ids: Vec<(&str, (u32, TinySet))> = self
self.map.iter().map(|(k, v)| (k.as_str(), v)).collect(); .map
.iter()
.map(|(k, (id, typ_code))| (k.as_str(), (*id, *typ_code)))
.collect();
sorted_ids.sort_unstable_by_key(|(path, _)| *path); sorted_ids.sort_unstable_by_key(|(path, _)| *path);
let mut result = vec![OrderedPathId::default(); sorted_ids.len()]; let mut result = vec![(OrderedPathId::default(), TinySet::empty()); sorted_ids.len()];
for (ordered, unordered) in sorted_ids.iter().map(|(_k, v)| v).enumerate() { for (ordered, (unordered, typ_code)) in sorted_ids.iter().map(|(_k, v)| v).enumerate() {
result[**unordered as usize] = OrderedPathId::from_ordered_id(ordered as u32); result[*unordered as usize] =
(OrderedPathId::from_ordered_id(ordered as u32), *typ_code);
} }
result result
} }
@@ -74,12 +86,12 @@ mod tests {
let terms = vec!["b", "a", "b", "c"]; let terms = vec!["b", "a", "b", "c"];
let ids = terms let ids = terms
.iter() .iter()
.map(|term| path_to_id.get_or_allocate_unordered_id(term)) .map(|term| path_to_id.get_or_allocate_unordered_id(term, ColumnType::Str))
.collect::<Vec<u32>>(); .collect::<Vec<u32>>();
assert_eq!(ids, vec![0, 1, 0, 2]); assert_eq!(ids, vec![0, 1, 0, 2]);
let ordered_ids = ids let ordered_ids = ids
.iter() .iter()
.map(|id| path_to_id.unordered_id_to_ordered_id()[*id as usize]) .map(|id| path_to_id.unordered_id_to_ordered_id()[*id as usize].0)
.collect::<Vec<OrderedPathId>>(); .collect::<Vec<OrderedPathId>>();
assert_eq!(ordered_ids, vec![1.into(), 0.into(), 1.into(), 2.into()]); assert_eq!(ordered_ids, vec![1.into(), 0.into(), 1.into(), 2.into()]);
// Fetch terms // Fetch terms
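The `TinySet` carried next to each unordered path id packs one bit per `ColumnType` code, so a JSON path indexed both as text and as a number ends up with two bits set. A minimal sketch of that accumulation, assuming only the `TinySet::empty`/`insert` and `ColumnType::to_code` calls already used above:

use columnar::ColumnType;
use common::TinySet;

// Collect the set of column type codes observed for one JSON path.
fn collect_type_codes(types_seen: &[ColumnType]) -> TinySet {
    let mut codes = TinySet::empty();
    for typ in types_seen {
        // One bit per type code; inserting a duplicate is a no-op.
        codes = codes.insert(typ.to_code() as u32);
    }
    codes
}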

View File

@@ -12,6 +12,7 @@ pub struct SegmentSerializer {
segment: Segment, segment: Segment,
pub(crate) store_writer: StoreWriter, pub(crate) store_writer: StoreWriter,
fast_field_write: WritePtr, fast_field_write: WritePtr,
field_list_write: WritePtr,
fieldnorms_serializer: Option<FieldNormsSerializer>, fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer, postings_serializer: InvertedIndexSerializer,
} }
@@ -49,6 +50,7 @@ impl SegmentSerializer {
}; };
let fast_field_write = segment.open_write(SegmentComponent::FastFields)?; let fast_field_write = segment.open_write(SegmentComponent::FastFields)?;
let field_list_write = segment.open_write(SegmentComponent::FieldList)?;
let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?; let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?; let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
@@ -58,6 +60,7 @@ impl SegmentSerializer {
segment, segment,
store_writer, store_writer,
fast_field_write, fast_field_write,
field_list_write,
fieldnorms_serializer: Some(fieldnorms_serializer), fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer, postings_serializer,
}) })
@@ -81,6 +84,11 @@ impl SegmentSerializer {
&mut self.postings_serializer &mut self.postings_serializer
} }
/// Accessor to the field list `WritePtr`.
pub fn get_field_list_write(&mut self) -> &mut WritePtr {
&mut self.field_list_write
}
/// Accessor to the `FastFieldSerializer`. /// Accessor to the `FastFieldSerializer`.
pub fn get_fast_field_write(&mut self) -> &mut WritePtr { pub fn get_fast_field_write(&mut self) -> &mut WritePtr {
&mut self.fast_field_write &mut self.fast_field_write
@@ -104,6 +112,7 @@ impl SegmentSerializer {
fieldnorms_serializer.close()?; fieldnorms_serializer.close()?;
} }
self.fast_field_write.terminate()?; self.fast_field_write.terminate()?;
self.field_list_write.terminate()?;
self.postings_serializer.close()?; self.postings_serializer.close()?;
self.store_writer.close()?; self.store_writer.close()?;
Ok(()) Ok(())

View File

@@ -8,6 +8,7 @@ use super::operation::AddOperation;
use crate::core::json_utils::index_json_values; use crate::core::json_utils::index_json_values;
use crate::core::Segment; use crate::core::Segment;
use crate::fastfield::FastFieldsWriter; use crate::fastfield::FastFieldsWriter;
use crate::field_list::serialize_segment_fields;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter}; use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::indexer::segment_serializer::SegmentSerializer; use crate::indexer::segment_serializer::SegmentSerializer;
use crate::postings::{ use crate::postings::{
@@ -443,16 +444,29 @@ fn remap_and_write(
.segment() .segment()
.open_read(SegmentComponent::FieldNorms)?; .open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let unordered_id_to_ordered_id = ctx.path_to_unordered_id.unordered_id_to_ordered_id();
serialize_postings( serialize_postings(
ctx, &ctx,
schema, schema.clone(),
per_field_postings_writers, per_field_postings_writers,
fieldnorm_readers, fieldnorm_readers,
doc_id_map, doc_id_map,
&unordered_id_to_ordered_id,
serializer.get_postings_serializer(), serializer.get_postings_serializer(),
)?; )?;
debug!("fastfield-serialize"); debug!("fastfield-serialize");
fast_field_writers.serialize(serializer.get_fast_field_write(), doc_id_map)?; let columns = fast_field_writers.serialize(serializer.get_fast_field_write(), doc_id_map)?;
let field_list_serializer = serializer.get_field_list_write();
serialize_segment_fields(
ctx,
field_list_serializer,
&schema,
&unordered_id_to_ordered_id,
columns,
)?;
// finalize temp docstore and create version, which reflects the doc_id_map // finalize temp docstore and create version, which reflects the doc_id_map
if let Some(doc_id_map) = doc_id_map { if let Some(doc_id_map) = doc_id_map {
@@ -879,31 +893,6 @@ mod tests {
assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 0); assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 0);
} }
#[test]
fn test_json_term_with_numeric_merge_panic_regression_bug_2283() {
// https://github.com/quickwit-oss/tantivy/issues/2283
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
let doc = json!({"field": "a"});
writer.add_document(doc!(json=>doc)).unwrap();
writer.commit().unwrap();
let doc = json!({"field": "a", "id": 1});
writer.add_document(doc!(json=>doc.clone())).unwrap();
writer.commit().unwrap();
// Force Merge
writer.wait_merging_threads().unwrap();
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer.merge(&segment_ids).wait().unwrap();
assert!(index_writer.wait_merging_threads().is_ok());
}
#[test] #[test]
fn test_bug_regression_1629_position_when_array_with_a_field_value_that_does_not_contain_any_token( fn test_bug_regression_1629_position_when_array_with_a_field_value_that_does_not_contain_any_token(
) { ) {

View File

@@ -188,6 +188,7 @@ pub mod aggregation;
pub mod collector; pub mod collector;
pub mod directory; pub mod directory;
pub mod fastfield; pub mod fastfield;
pub mod field_list;
pub mod fieldnorm; pub mod fieldnorm;
pub mod positions; pub mod positions;
pub mod postings; pub mod postings;
@@ -238,7 +239,9 @@ pub use crate::schema::DatePrecision;
pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocument, Term}; pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocument, Term};
/// Index format version. /// Index format version.
const INDEX_FORMAT_VERSION: u32 = 6; ///
/// Version 7: Add `.fieldlist` file containing the list of fields in a segment.
const INDEX_FORMAT_VERSION: u32 = 7;
/// Oldest index format version this tantivy version can read. /// Oldest index format version this tantivy version can read.
const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4; const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;
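For context, these two constants bound the window of index formats a given tantivy build will open. A minimal sketch of that check; the helper name `is_index_format_supported` is illustrative rather than the actual API:

fn is_index_format_supported(version_on_disk: u32) -> bool {
    // Reject indices older than the oldest supported format or newer than the current one.
    (INDEX_FORMAT_OLDEST_SUPPORTED_VERSION..=INDEX_FORMAT_VERSION).contains(&version_on_disk)
}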

View File

@@ -2,6 +2,7 @@ use std::io;
use std::marker::PhantomData; use std::marker::PhantomData;
use std::ops::Range; use std::ops::Range;
use common::TinySet;
use stacker::Addr; use stacker::Addr;
use crate::fieldnorm::FieldNormReaders; use crate::fieldnorm::FieldNormReaders;
@@ -46,37 +47,38 @@ fn make_field_partition(
/// It pushes all terms, one field at a time, towards the /// It pushes all terms, one field at a time, towards the
/// postings serializer. /// postings serializer.
pub(crate) fn serialize_postings( pub(crate) fn serialize_postings(
ctx: IndexingContext, ctx: &IndexingContext,
schema: Schema, schema: Schema,
per_field_postings_writers: &PerFieldPostingsWriter, per_field_postings_writers: &PerFieldPostingsWriter,
fieldnorm_readers: FieldNormReaders, fieldnorm_readers: FieldNormReaders,
doc_id_map: Option<&DocIdMapping>, doc_id_map: Option<&DocIdMapping>,
unordered_id_to_ordered_id: &[(OrderedPathId, TinySet)],
serializer: &mut InvertedIndexSerializer, serializer: &mut InvertedIndexSerializer,
) -> crate::Result<()> { ) -> crate::Result<()> {
// Replace unordered ids by ordered ids to be able to sort // Replace unordered ids by ordered ids to be able to sort
let unordered_id_to_ordered_id: Vec<OrderedPathId> = let ordered_id_to_path = ctx.path_to_unordered_id.ordered_id_to_path();
ctx.path_to_unordered_id.unordered_id_to_ordered_id();
let mut term_offsets: Vec<(Field, OrderedPathId, &[u8], Addr)> = let mut term_offsets: Vec<(Field, OrderedPathId, &[u8], Addr)> =
Vec::with_capacity(ctx.term_index.len()); Vec::with_capacity(ctx.term_index.len());
term_offsets.extend(ctx.term_index.iter().map(|(key, addr)| { for (key, addr) in ctx.term_index.iter() {
let field = Term::wrap(key).field(); let field = Term::wrap(key).field();
if schema.get_field_entry(field).field_type().value_type() == Type::Json { let field_entry = schema.get_field_entry(field);
let byte_range_path = 5..5 + 4; if field_entry.field_type().value_type() == Type::Json {
let unordered_id = u32::from_be_bytes(key[byte_range_path.clone()].try_into().unwrap()); let byte_range_unordered_id = 5..5 + 4;
let path_id = unordered_id_to_ordered_id[unordered_id as usize]; let unordered_id =
(field, path_id, &key[byte_range_path.end..], addr) u32::from_be_bytes(key[byte_range_unordered_id.clone()].try_into().unwrap());
let (path_id, _typ_code_bitvec) = unordered_id_to_ordered_id[unordered_id as usize];
term_offsets.push((field, path_id, &key[byte_range_unordered_id.end..], addr));
} else { } else {
(field, 0.into(), &key[5..], addr) term_offsets.push((field, 0.into(), &key[5..], addr));
} }
})); }
// Sort by field, path, and term // Sort by field, path, and term
term_offsets.sort_unstable_by( term_offsets.sort_unstable_by(
|(field1, path_id1, bytes1, _), (field2, path_id2, bytes2, _)| { |(field1, path_id1, bytes1, _), (field2, path_id2, bytes2, _)| {
(field1, path_id1, bytes1).cmp(&(field2, path_id2, bytes2)) (field1, path_id1, bytes1).cmp(&(field2, path_id2, bytes2))
}, },
); );
let ordered_id_to_path = ctx.path_to_unordered_id.ordered_id_to_path();
let field_offsets = make_field_partition(&term_offsets); let field_offsets = make_field_partition(&term_offsets);
for (field, byte_offsets) in field_offsets { for (field, byte_offsets) in field_offsets {
let postings_writer = per_field_postings_writers.get_for_field(field); let postings_writer = per_field_postings_writers.get_for_field(field);
@@ -87,7 +89,7 @@ pub(crate) fn serialize_postings(
&term_offsets[byte_offsets], &term_offsets[byte_offsets],
&ordered_id_to_path, &ordered_id_to_path,
doc_id_map, doc_id_map,
&ctx, ctx,
&mut field_serializer, &mut field_serializer,
)?; )?;
field_serializer.close()?; field_serializer.close()?;

View File

@@ -1,4 +1,4 @@
use std::borrow::Cow; use core::fmt::Debug;
use columnar::{ColumnIndex, DynamicColumn}; use columnar::{ColumnIndex, DynamicColumn};
@@ -14,7 +14,7 @@ use crate::{DocId, Score, TantivyError};
/// All of the matched documents get the score 1.0. /// All of the matched documents get the score 1.0.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct ExistsQuery { pub struct ExistsQuery {
field: Cow<'static, str>, field_name: String,
} }
impl ExistsQuery { impl ExistsQuery {
@@ -23,42 +23,40 @@ impl ExistsQuery {
/// This query matches all documents with at least one non-null value in the specified field. /// This query matches all documents with at least one non-null value in the specified field.
/// This constructor never fails, but executing the search with this query will return an /// This constructor never fails, but executing the search with this query will return an
/// error if the specified field doesn't exist or is not a fast field. /// error if the specified field doesn't exist or is not a fast field.
pub fn new_exists_query<F: Into<Cow<'static, str>>>(field: F) -> ExistsQuery { pub fn new_exists_query(field: String) -> ExistsQuery {
ExistsQuery { ExistsQuery { field_name: field }
field: field.into(),
}
} }
} }
impl Query for ExistsQuery { impl Query for ExistsQuery {
fn weight(&self, enable_scoring: EnableScoring) -> crate::Result<Box<dyn Weight>> { fn weight(&self, enable_scoring: EnableScoring) -> crate::Result<Box<dyn Weight>> {
let schema = enable_scoring.schema(); let schema = enable_scoring.schema();
let Some((field, _path)) = schema.find_field(&self.field) else { let Some((field, _path)) = schema.find_field(&self.field_name) else {
return Err(TantivyError::FieldNotFound(self.field.to_string())); return Err(TantivyError::FieldNotFound(self.field_name.clone()));
}; };
let field_type = schema.get_field_entry(field).field_type(); let field_type = schema.get_field_entry(field).field_type();
if !field_type.is_fast() { if !field_type.is_fast() {
return Err(TantivyError::SchemaError(format!( return Err(TantivyError::SchemaError(format!(
"Field {} is not a fast field.", "Field {} is not a fast field.",
self.field self.field_name
))); )));
} }
Ok(Box::new(ExistsWeight { Ok(Box::new(ExistsWeight {
field: self.field.clone(), field_name: self.field_name.clone(),
})) }))
} }
} }
/// Weight associated with the `ExistsQuery` query. /// Weight associated with the `ExistsQuery` query.
pub struct ExistsWeight { pub struct ExistsWeight {
field: Cow<'static, str>, field_name: String,
} }
impl Weight for ExistsWeight { impl Weight for ExistsWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let fast_field_reader = reader.fast_fields(); let fast_field_reader = reader.fast_fields();
let dynamic_columns: crate::Result<Vec<DynamicColumn>> = fast_field_reader let dynamic_columns: crate::Result<Vec<DynamicColumn>> = fast_field_reader
.dynamic_column_handles(&self.field)? .dynamic_column_handles(&self.field_name)?
.into_iter() .into_iter()
.map(|handle| handle.open().map_err(|io_error| io_error.into())) .map(|handle| handle.open().map_err(|io_error| io_error.into()))
.collect(); .collect();
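With the signature change above, callers now pass an owned `String` for the field name. A small usage sketch follows, assuming the usual public `tantivy::` paths; the helper name is illustrative, and the fast-field check still happens at search time.

use tantivy::collector::Count;
use tantivy::query::ExistsQuery;
use tantivy::Index;

// Count documents that have at least one value in the given fast field.
fn count_docs_with_field(index: &Index, field_name: &str) -> tantivy::Result<usize> {
    let searcher = index.reader()?.searcher();
    let query = ExistsQuery::new_exists_query(field_name.to_string());
    // Fails here if the field does not exist or is not a fast field.
    searcher.search(&query, &Count)
}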

View File

@@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::io; use std::io;
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
use std::ops::{Bound, Range}; use std::ops::{Bound, Range};
@@ -69,7 +68,7 @@ use crate::{DateTime, DocId, Score};
/// ``` /// ```
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct RangeQuery { pub struct RangeQuery {
field: Cow<'static, str>, field: String,
value_type: Type, value_type: Type,
lower_bound: Bound<Vec<u8>>, lower_bound: Bound<Vec<u8>>,
upper_bound: Bound<Vec<u8>>, upper_bound: Bound<Vec<u8>>,
@@ -81,15 +80,15 @@ impl RangeQuery {
/// ///
/// If the value type is not correct, something may go terribly wrong when /// If the value type is not correct, something may go terribly wrong when
/// the `Weight` object is created. /// the `Weight` object is created.
pub fn new_term_bounds<F: Into<Cow<'static, str>>>( pub fn new_term_bounds(
field: F, field: String,
value_type: Type, value_type: Type,
lower_bound: &Bound<Term>, lower_bound: &Bound<Term>,
upper_bound: &Bound<Term>, upper_bound: &Bound<Term>,
) -> RangeQuery { ) -> RangeQuery {
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned(); let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
RangeQuery { RangeQuery {
field: field.into(), field,
value_type, value_type,
lower_bound: map_bound(lower_bound, verify_and_unwrap_term), lower_bound: map_bound(lower_bound, verify_and_unwrap_term),
upper_bound: map_bound(upper_bound, verify_and_unwrap_term), upper_bound: map_bound(upper_bound, verify_and_unwrap_term),
@@ -101,7 +100,7 @@ impl RangeQuery {
/// ///
/// If the field is not of the type `i64`, tantivy /// If the field is not of the type `i64`, tantivy
/// will panic when the `Weight` object is created. /// will panic when the `Weight` object is created.
pub fn new_i64<F: Into<Cow<'static, str>>>(field: F, range: Range<i64>) -> RangeQuery { pub fn new_i64(field: String, range: Range<i64>) -> RangeQuery {
RangeQuery::new_i64_bounds( RangeQuery::new_i64_bounds(
field, field,
Bound::Included(range.start), Bound::Included(range.start),
@@ -116,8 +115,8 @@ impl RangeQuery {
/// ///
/// If the field is not of the type `i64`, tantivy /// If the field is not of the type `i64`, tantivy
/// will panic when the `Weight` object is created. /// will panic when the `Weight` object is created.
pub fn new_i64_bounds<F: Into<Cow<'static, str>>>( pub fn new_i64_bounds(
field: F, field: String,
lower_bound: Bound<i64>, lower_bound: Bound<i64>,
upper_bound: Bound<i64>, upper_bound: Bound<i64>,
) -> RangeQuery { ) -> RangeQuery {
@@ -127,7 +126,7 @@ impl RangeQuery {
.to_owned() .to_owned()
}; };
RangeQuery { RangeQuery {
field: field.into(), field,
value_type: Type::I64, value_type: Type::I64,
lower_bound: map_bound(&lower_bound, make_term_val), lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val), upper_bound: map_bound(&upper_bound, make_term_val),
@@ -139,7 +138,7 @@ impl RangeQuery {
/// ///
/// If the field is not of the type `f64`, tantivy /// If the field is not of the type `f64`, tantivy
/// will panic when the `Weight` object is created. /// will panic when the `Weight` object is created.
pub fn new_f64<F: Into<Cow<'static, str>>>(field: F, range: Range<f64>) -> RangeQuery { pub fn new_f64(field: String, range: Range<f64>) -> RangeQuery {
RangeQuery::new_f64_bounds( RangeQuery::new_f64_bounds(
field, field,
Bound::Included(range.start), Bound::Included(range.start),
@@ -154,8 +153,8 @@ impl RangeQuery {
/// ///
/// If the field is not of the type `f64`, tantivy /// If the field is not of the type `f64`, tantivy
/// will panic when the `Weight` object is created. /// will panic when the `Weight` object is created.
pub fn new_f64_bounds<F: Into<Cow<'static, str>>>( pub fn new_f64_bounds(
field: F, field: String,
lower_bound: Bound<f64>, lower_bound: Bound<f64>,
upper_bound: Bound<f64>, upper_bound: Bound<f64>,
) -> RangeQuery { ) -> RangeQuery {
@@ -165,7 +164,7 @@ impl RangeQuery {
.to_owned() .to_owned()
}; };
RangeQuery { RangeQuery {
field: field.into(), field,
value_type: Type::F64, value_type: Type::F64,
lower_bound: map_bound(&lower_bound, make_term_val), lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val), upper_bound: map_bound(&upper_bound, make_term_val),
@@ -180,8 +179,8 @@ impl RangeQuery {
/// ///
/// If the field is not of the type `u64`, tantivy /// If the field is not of the type `u64`, tantivy
/// will panic when the `Weight` object is created. /// will panic when the `Weight` object is created.
pub fn new_u64_bounds<F: Into<Cow<'static, str>>>( pub fn new_u64_bounds(
field: F, field: String,
lower_bound: Bound<u64>, lower_bound: Bound<u64>,
upper_bound: Bound<u64>, upper_bound: Bound<u64>,
) -> RangeQuery { ) -> RangeQuery {
@@ -191,7 +190,7 @@ impl RangeQuery {
.to_owned() .to_owned()
}; };
RangeQuery { RangeQuery {
field: field.into(), field,
value_type: Type::U64, value_type: Type::U64,
lower_bound: map_bound(&lower_bound, make_term_val), lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val), upper_bound: map_bound(&upper_bound, make_term_val),
@@ -203,8 +202,8 @@ impl RangeQuery {
/// ///
/// If the field is not of the type `ip`, tantivy /// If the field is not of the type `ip`, tantivy
/// will panic when the `Weight` object is created. /// will panic when the `Weight` object is created.
pub fn new_ip_bounds<F: Into<Cow<'static, str>>>( pub fn new_ip_bounds(
field: F, field: String,
lower_bound: Bound<Ipv6Addr>, lower_bound: Bound<Ipv6Addr>,
upper_bound: Bound<Ipv6Addr>, upper_bound: Bound<Ipv6Addr>,
) -> RangeQuery { ) -> RangeQuery {
@@ -214,7 +213,7 @@ impl RangeQuery {
.to_owned() .to_owned()
}; };
RangeQuery { RangeQuery {
field: field.into(), field,
value_type: Type::IpAddr, value_type: Type::IpAddr,
lower_bound: map_bound(&lower_bound, make_term_val), lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val), upper_bound: map_bound(&upper_bound, make_term_val),
@@ -226,7 +225,7 @@ impl RangeQuery {
/// ///
/// If the field is not of the type `u64`, tantivy /// If the field is not of the type `u64`, tantivy
/// will panic when the `Weight` object is created. /// will panic when the `Weight` object is created.
pub fn new_u64<F: Into<Cow<'static, str>>>(field: F, range: Range<u64>) -> RangeQuery { pub fn new_u64(field: String, range: Range<u64>) -> RangeQuery {
RangeQuery::new_u64_bounds( RangeQuery::new_u64_bounds(
field, field,
Bound::Included(range.start), Bound::Included(range.start),
@@ -241,8 +240,8 @@ impl RangeQuery {
/// ///
/// If the field is not of the type `date`, tantivy /// If the field is not of the type `date`, tantivy
/// will panic when the `Weight` object is created. /// will panic when the `Weight` object is created.
pub fn new_date_bounds<F: Into<Cow<'static, str>>>( pub fn new_date_bounds(
field: F, field: String,
lower_bound: Bound<DateTime>, lower_bound: Bound<DateTime>,
upper_bound: Bound<DateTime>, upper_bound: Bound<DateTime>,
) -> RangeQuery { ) -> RangeQuery {
@@ -252,7 +251,7 @@ impl RangeQuery {
.to_owned() .to_owned()
}; };
RangeQuery { RangeQuery {
field: field.into(), field,
value_type: Type::Date, value_type: Type::Date,
lower_bound: map_bound(&lower_bound, make_term_val), lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val), upper_bound: map_bound(&upper_bound, make_term_val),
@@ -264,7 +263,7 @@ impl RangeQuery {
/// ///
/// If the field is not of the type `date`, tantivy /// If the field is not of the type `date`, tantivy
/// will panic when the `Weight` object is created. /// will panic when the `Weight` object is created.
pub fn new_date<F: Into<Cow<'static, str>>>(field: F, range: Range<DateTime>) -> RangeQuery { pub fn new_date(field: String, range: Range<DateTime>) -> RangeQuery {
RangeQuery::new_date_bounds( RangeQuery::new_date_bounds(
field, field,
Bound::Included(range.start), Bound::Included(range.start),
@@ -279,14 +278,14 @@ impl RangeQuery {
/// ///
/// If the field is not of the type `Str`, tantivy /// If the field is not of the type `Str`, tantivy
/// will panic when the `Weight` object is created. /// will panic when the `Weight` object is created.
pub fn new_str_bounds<F: Into<Cow<'static, str>>>( pub fn new_str_bounds(
field: F, field: String,
lower_bound: Bound<&str>, lower_bound: Bound<&str>,
upper_bound: Bound<&str>, upper_bound: Bound<&str>,
) -> RangeQuery { ) -> RangeQuery {
let make_term_val = |val: &&str| val.as_bytes().to_vec(); let make_term_val = |val: &&str| val.as_bytes().to_vec();
RangeQuery { RangeQuery {
field: field.into(), field,
value_type: Type::Str, value_type: Type::Str,
lower_bound: map_bound(&lower_bound, make_term_val), lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val), upper_bound: map_bound(&upper_bound, make_term_val),
@@ -298,7 +297,7 @@ impl RangeQuery {
/// ///
/// If the field is not of the type `Str`, tantivy /// If the field is not of the type `Str`, tantivy
/// will panic when the `Weight` object is created. /// will panic when the `Weight` object is created.
pub fn new_str<F: Into<Cow<'static, str>>>(field: F, range: Range<&str>) -> RangeQuery { pub fn new_str(field: String, range: Range<&str>) -> RangeQuery {
RangeQuery::new_str_bounds( RangeQuery::new_str_bounds(
field, field,
Bound::Included(range.start), Bound::Included(range.start),
@@ -359,7 +358,7 @@ impl Query for RangeQuery {
let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?; let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?;
let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?; let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?;
Ok(Box::new(IPFastFieldRangeWeight::new( Ok(Box::new(IPFastFieldRangeWeight::new(
self.field.clone(), self.field.to_string(),
lower_bound, lower_bound,
upper_bound, upper_bound,
))) )))
@@ -374,14 +373,14 @@ impl Query for RangeQuery {
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes); let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes); let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
Ok(Box::new(FastFieldRangeWeight::new_u64_lenient( Ok(Box::new(FastFieldRangeWeight::new_u64_lenient(
self.field.clone(), self.field.to_string(),
lower_bound, lower_bound,
upper_bound, upper_bound,
))) )))
} }
} else { } else {
Ok(Box::new(RangeWeight { Ok(Box::new(RangeWeight {
field: self.field.clone(), field: self.field.to_string(),
lower_bound: self.lower_bound.clone(), lower_bound: self.lower_bound.clone(),
upper_bound: self.upper_bound.clone(), upper_bound: self.upper_bound.clone(),
limit: self.limit, limit: self.limit,
@@ -391,7 +390,7 @@ impl Query for RangeQuery {
} }
pub struct RangeWeight { pub struct RangeWeight {
field: Cow<'static, str>, field: String,
lower_bound: Bound<Vec<u8>>, lower_bound: Bound<Vec<u8>>,
upper_bound: Bound<Vec<u8>>, upper_bound: Bound<Vec<u8>>,
limit: Option<u64>, limit: Option<u64>,
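Correspondingly, the `RangeQuery` constructors now take the field name as an owned `String`. A small sketch under the new signature, assuming the usual public `tantivy::` paths; the helper and the field name "id" are illustrative only.

use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::Index;

// Count documents whose u64 field "id" falls in the half-open range 10..20.
fn count_ids_in_range(index: &Index) -> tantivy::Result<usize> {
    let searcher = index.reader()?.searcher();
    let query = RangeQuery::new_u64("id".to_string(), 10..20);
    searcher.search(&query, &Count)
}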

View File

@@ -2,7 +2,6 @@
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is //! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
//! used, which uses the term dictionary + postings. //! used, which uses the term dictionary + postings.
use std::borrow::Cow;
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive}; use std::ops::{Bound, RangeInclusive};
@@ -14,18 +13,14 @@ use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries. /// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
pub struct IPFastFieldRangeWeight { pub struct IPFastFieldRangeWeight {
field: Cow<'static, str>, field: String,
lower_bound: Bound<Ipv6Addr>, lower_bound: Bound<Ipv6Addr>,
upper_bound: Bound<Ipv6Addr>, upper_bound: Bound<Ipv6Addr>,
} }
impl IPFastFieldRangeWeight { impl IPFastFieldRangeWeight {
/// Creates a new IPFastFieldRangeWeight. /// Creates a new IPFastFieldRangeWeight.
pub fn new( pub fn new(field: String, lower_bound: Bound<Ipv6Addr>, upper_bound: Bound<Ipv6Addr>) -> Self {
field: Cow<'static, str>,
lower_bound: Bound<Ipv6Addr>,
upper_bound: Bound<Ipv6Addr>,
) -> Self {
Self { Self {
field, field,
lower_bound, lower_bound,
@@ -176,7 +171,7 @@ pub mod tests {
writer.commit().unwrap(); writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher(); let searcher = index.reader().unwrap().searcher();
let range_weight = IPFastFieldRangeWeight { let range_weight = IPFastFieldRangeWeight {
field: Cow::Borrowed("ips"), field: "ips".to_string(),
lower_bound: Bound::Included(ip_addrs[1]), lower_bound: Bound::Included(ip_addrs[1]),
upper_bound: Bound::Included(ip_addrs[2]), upper_bound: Bound::Included(ip_addrs[2]),
}; };

View File

@@ -2,7 +2,6 @@
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is //! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
//! used, which uses the term dictionary + postings. //! used, which uses the term dictionary + postings.
use std::borrow::Cow;
use std::ops::{Bound, RangeInclusive}; use std::ops::{Bound, RangeInclusive};
use columnar::{ColumnType, HasAssociatedColumnType, MonotonicallyMappableToU64}; use columnar::{ColumnType, HasAssociatedColumnType, MonotonicallyMappableToU64};
@@ -15,7 +14,7 @@ use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `FastFieldRangeWeight` uses the fast field to execute range queries. /// `FastFieldRangeWeight` uses the fast field to execute range queries.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct FastFieldRangeWeight { pub struct FastFieldRangeWeight {
field: Cow<'static, str>, field: String,
lower_bound: Bound<u64>, lower_bound: Bound<u64>,
upper_bound: Bound<u64>, upper_bound: Bound<u64>,
column_type_opt: Option<ColumnType>, column_type_opt: Option<ColumnType>,
@@ -24,7 +23,7 @@ pub struct FastFieldRangeWeight {
impl FastFieldRangeWeight { impl FastFieldRangeWeight {
/// Create a new FastFieldRangeWeight, using the u64 representation of any fast field. /// Create a new FastFieldRangeWeight, using the u64 representation of any fast field.
pub(crate) fn new_u64_lenient( pub(crate) fn new_u64_lenient(
field: Cow<'static, str>, field: String,
lower_bound: Bound<u64>, lower_bound: Bound<u64>,
upper_bound: Bound<u64>, upper_bound: Bound<u64>,
) -> Self { ) -> Self {
@@ -40,7 +39,7 @@ impl FastFieldRangeWeight {
/// Create a new `FastFieldRangeWeight` for a range of a u64-mappable type. /// Create a new `FastFieldRangeWeight` for a range of a u64-mappable type.
pub fn new<T: HasAssociatedColumnType + MonotonicallyMappableToU64>( pub fn new<T: HasAssociatedColumnType + MonotonicallyMappableToU64>(
field: Cow<'static, str>, field: String,
lower_bound: Bound<T>, lower_bound: Bound<T>,
upper_bound: Bound<T>, upper_bound: Bound<T>,
) -> Self { ) -> Self {
@@ -131,7 +130,6 @@ fn bound_to_value_range<T: MonotonicallyMappableToU64>(
#[cfg(test)] #[cfg(test)]
pub mod tests { pub mod tests {
use std::borrow::Cow;
use std::ops::{Bound, RangeInclusive}; use std::ops::{Bound, RangeInclusive};
use proptest::prelude::*; use proptest::prelude::*;
@@ -216,7 +214,7 @@ pub mod tests {
writer.commit().unwrap(); writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher(); let searcher = index.reader().unwrap().searcher();
let range_query = FastFieldRangeWeight::new_u64_lenient( let range_query = FastFieldRangeWeight::new_u64_lenient(
Cow::Borrowed("test_field"), "test_field".to_string(),
Bound::Included(50_000), Bound::Included(50_000),
Bound::Included(50_002), Bound::Included(50_002),
); );

View File

@@ -63,7 +63,7 @@ impl RegexQuery {
/// Creates a new RegexQuery from a given pattern /// Creates a new RegexQuery from a given pattern
pub fn from_pattern(regex_pattern: &str, field: Field) -> crate::Result<Self> { pub fn from_pattern(regex_pattern: &str, field: Field) -> crate::Result<Self> {
let regex = Regex::new(regex_pattern) let regex = Regex::new(regex_pattern)
.map_err(|err| TantivyError::InvalidArgument(format!("RegexQueryError: {err}")))?; .map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_string()))?;
Ok(RegexQuery::from_regex(regex, field)) Ok(RegexQuery::from_regex(regex, field))
} }
@@ -176,16 +176,4 @@ mod test {
verify_regex_query(matching_one, matching_zero, reader); verify_regex_query(matching_one, matching_zero, reader);
Ok(()) Ok(())
} }
#[test]
pub fn test_pattern_error() {
let (_reader, field) = build_test_index().unwrap();
match RegexQuery::from_pattern(r"(foo", field) {
Err(crate::TantivyError::InvalidArgument(msg)) => {
assert!(msg.contains("error: unclosed group"))
}
res => panic!("unexpected result: {:?}", res),
}
}
} }
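After this change, an invalid pattern surfaces as `InvalidArgument` carrying the pattern itself instead of the regex crate's error message. A short sketch of handling that case; the pattern and helper name are illustrative.

use tantivy::query::RegexQuery;
use tantivy::schema::Field;
use tantivy::TantivyError;

// The unclosed group makes the pattern invalid; the error now carries the pattern string.
fn expect_invalid_pattern(field: Field) {
    match RegexQuery::from_pattern(r"(foo", field) {
        Err(TantivyError::InvalidArgument(pattern)) => assert_eq!(pattern, "(foo"),
        res => panic!("unexpected result: {res:?}"),
    }
}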

View File

@@ -117,6 +117,7 @@ impl SegmentSpaceUsage {
use self::ComponentSpaceUsage::*; use self::ComponentSpaceUsage::*;
use crate::SegmentComponent::*; use crate::SegmentComponent::*;
match component { match component {
FieldList => ComponentSpaceUsage::Basic(ByteCount::from(0u64)),
Postings => PerField(self.postings().clone()), Postings => PerField(self.postings().clone()),
Positions => PerField(self.positions().clone()), Positions => PerField(self.positions().clone()),
FastFields => PerField(self.fast_fields().clone()), FastFields => PerField(self.fast_fields().clone()),

View File

@@ -189,11 +189,6 @@ struct Page {
impl Page { impl Page {
fn new(page_id: usize) -> Page { fn new(page_id: usize) -> Page {
// We use 32-bits addresses.
// - 20 bits for the in-page addressing
// - 12 bits for the page id.
// This limits us to 2^12 - 1=4095 for the page id.
assert!(page_id < 4096);
Page { Page {
page_id, page_id,
len: 0, len: 0,
@@ -243,7 +238,6 @@ impl Page {
mod tests { mod tests {
use super::MemoryArena; use super::MemoryArena;
use crate::memory_arena::PAGE_SIZE;
#[test] #[test]
fn test_arena_allocate_slice() { fn test_arena_allocate_slice() {
@@ -261,31 +255,6 @@ mod tests {
assert_eq!(arena.slice(addr_b, b.len()), b); assert_eq!(arena.slice(addr_b, b.len()), b);
} }
#[test]
fn test_arena_allocate_end_of_page() {
let mut arena = MemoryArena::default();
// A big block
let len_a = PAGE_SIZE - 2;
let addr_a = arena.allocate_space(len_a);
*arena.slice_mut(addr_a, len_a).last_mut().unwrap() = 1;
// Single bytes
let addr_b = arena.allocate_space(1);
arena.slice_mut(addr_b, 1)[0] = 2;
let addr_c = arena.allocate_space(1);
arena.slice_mut(addr_c, 1)[0] = 3;
let addr_d = arena.allocate_space(1);
arena.slice_mut(addr_d, 1)[0] = 4;
assert_eq!(arena.slice(addr_a, len_a)[len_a - 1], 1);
assert_eq!(arena.slice(addr_b, 1)[0], 2);
assert_eq!(arena.slice(addr_c, 1)[0], 3);
assert_eq!(arena.slice(addr_d, 1)[0], 4);
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)] #[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct MyTest { struct MyTest {
pub a: usize, pub a: usize,

View File

@@ -295,8 +295,6 @@ impl SharedArenaHashMap {
/// will be in charge of returning a default value. /// will be in charge of returning a default value.
/// If the key already has an associated value, then it will be passed /// If the key already has an associated value, then it will be passed
/// `Some(previous_value)`. /// `Some(previous_value)`.
///
/// The key will be truncated to u16::MAX bytes.
#[inline] #[inline]
pub fn mutate_or_create<V>( pub fn mutate_or_create<V>(
&mut self, &mut self,
@@ -310,8 +308,6 @@ impl SharedArenaHashMap {
if self.is_saturated() { if self.is_saturated() {
self.resize(); self.resize();
} }
// Limit the key size to u16::MAX
let key = &key[..std::cmp::min(key.len(), u16::MAX as usize)];
let hash = self.get_hash(key); let hash = self.get_hash(key);
let mut probe = self.probe(hash); let mut probe = self.probe(hash);
let mut bucket = probe.next_probe(); let mut bucket = probe.next_probe();
@@ -383,36 +379,6 @@ mod tests {
} }
assert_eq!(vanilla_hash_map.len(), 2); assert_eq!(vanilla_hash_map.len(), 2);
} }
#[test]
fn test_long_key_truncation() {
// Keys longer than u16::MAX are truncated.
let mut memory_arena = MemoryArena::default();
let mut hash_map: SharedArenaHashMap = SharedArenaHashMap::default();
let key1 = (0..u16::MAX as usize).map(|i| i as u8).collect::<Vec<_>>();
hash_map.mutate_or_create(&key1, &mut memory_arena, |opt_val: Option<u32>| {
assert_eq!(opt_val, None);
4u32
});
// Due to truncation, this key is the same as key1
let key2 = (0..u16::MAX as usize + 1)
.map(|i| i as u8)
.collect::<Vec<_>>();
hash_map.mutate_or_create(&key2, &mut memory_arena, |opt_val: Option<u32>| {
assert_eq!(opt_val, Some(4));
3u32
});
let mut vanilla_hash_map = HashMap::new();
let iter_values = hash_map.iter(&memory_arena);
for (key, addr) in iter_values {
let val: u32 = memory_arena.read(addr);
vanilla_hash_map.insert(key.to_owned(), val);
assert_eq!(key.len(), key1[..].len());
assert_eq!(key, &key1[..])
}
assert_eq!(vanilla_hash_map.len(), 1); // Both map to the same key
}
#[test] #[test]
fn test_empty_hashmap() { fn test_empty_hashmap() {
let memory_arena = MemoryArena::default(); let memory_arena = MemoryArena::default();