Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-03 07:42:54 +00:00)

Compare commits: add-random...warming (23 commits)
Commits in this comparison (SHA1 only):
d4e2d2e40e, 732f6847c0, 1c6d9bdc6a, 3ea6800ac5, 395303b644, 2c200b46cb, 17e00df112, 3129d86743,
e5e252cbc0, b2da82f151, c81b3030fa, 9e66c75fc6, ebdbb6bd2e, c980b19dd9, 098eea843a, 466dc8233c,
03c2f6ece2, 1d4e9a29db, f378d9a57b, dde49ac8e2, c3cc93406d, bd0f9211da, c503c6e4fa
.github/workflows/test.yml (vendored): 4 changed lines
@@ -21,10 +21,10 @@ jobs:
    - name: Install latest nightly to test also against unstable feature flag
      uses: actions-rs/toolchain@v1
      with:
        toolchain: nightly
        toolchain: stable
        override: true
        components: rustfmt
    - name: Run tests
      run: cargo test --all-features --verbose --workspace
      run: cargo test --features mmap,brotli-compression,lz4-compression,snappy-compression,failpoints --verbose --workspace
    - name: Check Formatting
      run: cargo fmt --all -- --check
CHANGELOG.md: 10 changed lines
@@ -1,3 +1,13 @@
Tantivy 0.17
================================
- LogMergePolicy now triggers merges if the ratio of deleted documents reaches a threshold (@shikhar) [#115](https://github.com/quickwit-inc/tantivy/issues/115)
- Adds a searcher Warmer API (@shikhar)
- Change to a non-strict schema: fields present in a document but not defined in the schema are now ignored. Previously this returned an error. #1211
- Facets are necessarily indexed. Existing indexes with indexed facets should work out of the box. Indexes whose facet fields were marked with index: false will be broken (but they were already broken in a sense). (@fulmicoton) #1195
- Bugfix that could, in theory, impact durability on some filesystems [#1224](https://github.com/quickwit-inc/tantivy/issues/1224)
- Schema now offers the option of not indexing fieldnorms (@lpouget) [#922](https://github.com/quickwit-inc/tantivy/issues/922)
- Reduce the number of fsync calls [#1225](https://github.com/quickwit-inc/tantivy/issues/1225)

Tantivy 0.16.2
================================
- Bugfix in FuzzyTermQuery (transposition_cost_one was not doing anything)
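As a rough illustration of the fieldnorm entry above (#922), the sketch below declares a text field that skips fieldnorms, using the `TextFieldIndexing::set_fieldnorms` option that appears later in this diff; the field name is made up for the example.

```rust
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};

fn main() {
    let mut schema_builder = Schema::builder();
    // Index frequencies and positions as usual, but skip writing fieldnorms for this field.
    // Scoring on this field then happens without length normalization.
    let no_norms = TextOptions::default().set_indexing_options(
        TextFieldIndexing::default()
            .set_index_option(IndexRecordOption::WithFreqsAndPositions)
            .set_fieldnorms(false),
    );
    let _body = schema_builder.add_text_field("body", no_norms);
    let _schema = schema_builder.build();
}
```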
@@ -65,7 +65,7 @@ maplit = "1.0.2"
|
||||
matches = "0.1.8"
|
||||
proptest = "1.0"
|
||||
criterion = "0.3.5"
|
||||
test-env-log = "0.2.7"
|
||||
test-log = "0.2.8"
|
||||
env_logger = "0.9.0"
|
||||
|
||||
[dev-dependencies.fail]
|
||||
@@ -91,7 +91,6 @@ snappy-compression = ["snap"]
|
||||
|
||||
failpoints = ["fail/failpoints"]
|
||||
unstable = [] # useful for benches.
|
||||
wasm-bindgen = ["uuid/wasm-bindgen"]
|
||||
|
||||
[workspace]
|
||||
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"]
|
||||
|
||||
@@ -62,29 +62,30 @@ impl TinySet {
        self.0 = 0u64;
    }

    #[inline]
    /// Returns the complement of the set in `[0, 64[`.
    ///
    /// Careful on making this function public, as it will break the padding handling in the last
    /// bucket.
    #[inline]
    fn complement(self) -> TinySet {
        TinySet(!self.0)
    }

    #[inline]
    /// Returns true iff the `TinySet` contains the element `el`.
    #[inline]
    pub fn contains(self, el: u32) -> bool {
        !self.intersect(TinySet::singleton(el)).is_empty()
    }

    #[inline]
    /// Returns the number of elements in the TinySet.
    #[inline]
    pub fn len(self) -> u32 {
        self.0.count_ones()
    }

    #[inline]
    /// Returns the intersection of `self` and `other`
    #[inline]
    #[must_use]
    pub fn intersect(self, other: TinySet) -> TinySet {
        TinySet(self.0 & other.0)
    }
@@ -98,12 +99,14 @@ impl TinySet {

    /// Insert a new element within [0..64)
    #[inline]
    #[must_use]
    pub fn insert(self, el: u32) -> TinySet {
        self.union(TinySet::singleton(el))
    }

    /// Removes an element within [0..64)
    #[inline]
    #[must_use]
    pub fn remove(self, el: u32) -> TinySet {
        self.intersect(TinySet::singleton(el).complement())
    }
@@ -130,6 +133,7 @@ impl TinySet {

    /// Returns the union of two tinysets
    #[inline]
    #[must_use]
    pub fn union(self, other: TinySet) -> TinySet {
        TinySet(self.0 | other.0)
    }
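To see why these combinators warrant `#[must_use]`, here is a self-contained sketch (not tantivy's actual `TinySet`, which is crate-private) mirroring the same u64 bit-packing:

```rust
/// A standalone sketch of a set of integers in [0, 64) packed into one u64.
#[derive(Clone, Copy, Debug, PartialEq)]
struct MiniSet(u64);

impl MiniSet {
    fn empty() -> MiniSet { MiniSet(0) }
    fn singleton(el: u32) -> MiniSet { MiniSet(1u64 << el) }
    // These combinators return a new value instead of mutating in place,
    // which is exactly why the real methods gained `#[must_use]`.
    #[must_use] fn union(self, other: MiniSet) -> MiniSet { MiniSet(self.0 | other.0) }
    #[must_use] fn intersect(self, other: MiniSet) -> MiniSet { MiniSet(self.0 & other.0) }
    #[must_use] fn insert(self, el: u32) -> MiniSet { self.union(MiniSet::singleton(el)) }
    fn contains(self, el: u32) -> bool { self.intersect(MiniSet::singleton(el)).0 != 0 }
    fn len(self) -> u32 { self.0.count_ones() }
}

fn main() {
    let set = MiniSet::empty().insert(3).insert(17);
    assert!(set.contains(3) && set.contains(17) && !set.contains(4));
    assert_eq!(set.len(), 2);
    // Ignoring the return value would be a silent no-op;
    // `#[must_use]` turns that mistake into a compiler warning:
    // set.insert(5); // <- unused return value
}
```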
@@ -54,7 +54,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
    }
}

/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite.html#tymethod.terminate_ref) directly
///
/// The point is that while the type is public, it cannot be built by anyone
/// outside of this module.
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {

    let name = schema_builder.add_text_field("felin_name", TEXT | STORED);
    // this is our faceted field: its scientific classification
    let classification = schema_builder.add_facet_field("classification", INDEXED);
    let classification = schema_builder.add_facet_field("classification", FacetOptions::default());

    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
@@ -9,7 +9,7 @@ fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();

    let title = schema_builder.add_text_field("title", STORED);
    let ingredient = schema_builder.add_facet_field("ingredient", INDEXED);
    let ingredient = schema_builder.add_facet_field("ingredient", FacetOptions::default());

    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
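Both examples above swap the old `INDEXED` flag for `FacetOptions::default()`, in line with the changelog note that facets are now always indexed. A minimal sketch of the new-style declaration (field names are illustrative):

```rust
use tantivy::schema::{FacetOptions, Schema, STORED};

fn main() {
    let mut schema_builder = Schema::builder();
    // 0.17 style: facets are always indexed, so the old `INDEXED` flag is gone;
    // `FacetOptions::default()` is enough for an indexed-only facet field.
    let _tags = schema_builder.add_facet_field("tags", FacetOptions::default());
    // Flags such as `STORED` still convert into `FacetOptions` when storage is wanted,
    // as the facet reader tests later in this diff do.
    let _stored_tags = schema_builder.add_facet_field("stored_tags", STORED);
    let _schema = schema_builder.build();
}
```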
@@ -70,13 +70,13 @@ fn highlight(snippet: Snippet) -> String {
    let mut start_from = 0;

    for fragment_range in snippet.highlighted() {
        result.push_str(&snippet.fragments()[start_from..fragment_range.start]);
        result.push_str(&snippet.fragment()[start_from..fragment_range.start]);
        result.push_str(" --> ");
        result.push_str(&snippet.fragments()[fragment_range.clone()]);
        result.push_str(&snippet.fragment()[fragment_range.clone()]);
        result.push_str(" <-- ");
        start_from = fragment_range.end;
    }

    result.push_str(&snippet.fragments()[start_from..]);
    result.push_str(&snippet.fragment()[start_from..]);
    result
}
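For context, the `Snippet` consumed by `highlight` above typically comes from a snippet generator. The following is a rough, self-contained sketch assuming tantivy's `SnippetGenerator::create` / `snippet_from_doc` API of this period; it is not part of the diff, and the document content is invented:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index, SnippetGenerator};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
    writer.add_document(doc!(body => "sweet cooking olive oil from greece"))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![body]).parse_query("cooking")?;

    // Build a generator for the query, then turn a stored document into a Snippet
    // whose `fragment()` / `highlighted()` pair drives the highlight() function above.
    let snippet_generator = SnippetGenerator::create(&searcher, &query, body)?;
    let (_score, doc_address) = searcher.search(&query, &TopDocs::with_limit(1))?[0];
    let doc = searcher.doc(doc_address)?;
    let snippet = snippet_generator.snippet_from_doc(&doc);
    println!("{}", snippet.to_html());
    Ok(())
}
```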
examples/warmer.rs (new file): 223 lines
@@ -0,0 +1,223 @@
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::{Arc, RwLock, Weak};
|
||||
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::fastfield::FastFieldReader;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Field, Schema, FAST, TEXT};
|
||||
use tantivy::{doc, DocAddress, DocId, Index, IndexReader, SegmentReader, TrackedObject};
|
||||
use tantivy::{Opstamp, Searcher, SearcherGeneration, SegmentId, Warmer};
|
||||
|
||||
// This example shows how warmers can be used to
// load values from an external source using the Warmer API.
//
// In this example, we assume an e-commerce search engine.
|
||||
|
||||
type ProductId = u64;
|
||||
|
||||
/// Price
|
||||
type Price = u32;
|
||||
|
||||
pub trait PriceFetcher: Send + Sync + 'static {
|
||||
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
|
||||
}
|
||||
|
||||
struct DynamicPriceColumn {
|
||||
field: Field,
|
||||
price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
|
||||
price_fetcher: Box<dyn PriceFetcher>,
|
||||
}
|
||||
|
||||
impl DynamicPriceColumn {
|
||||
pub fn with_product_id_field<T: PriceFetcher>(field: Field, price_fetcher: T) -> Self {
|
||||
DynamicPriceColumn {
|
||||
field,
|
||||
price_cache: Default::default(),
|
||||
price_fetcher: Box::new(price_fetcher),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option<Arc<Vec<Price>>> {
|
||||
let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp());
|
||||
self.price_cache.read().unwrap().get(&segment_key).cloned()
|
||||
}
|
||||
}
|
||||
impl Warmer for DynamicPriceColumn {
|
||||
fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
|
||||
for segment in searcher.segment_readers() {
|
||||
let key = (segment.segment_id(), segment.delete_opstamp());
|
||||
let product_id_reader = segment.fast_fields().u64(self.field)?;
|
||||
let product_ids: Vec<ProductId> = segment
|
||||
.doc_ids_alive()
|
||||
.map(|doc| product_id_reader.get(doc))
|
||||
.collect();
|
||||
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
|
||||
let mut price_vals: Vec<Price> = Vec::new();
|
||||
for doc in 0..segment.max_doc() {
|
||||
if segment.is_deleted(doc) {
|
||||
price_vals.push(0);
|
||||
} else {
|
||||
price_vals.push(prices_it.next().unwrap())
|
||||
}
|
||||
}
|
||||
self.price_cache
|
||||
.write()
|
||||
.unwrap()
|
||||
.insert(key, Arc::new(price_vals));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn garbage_collect(&self, live_generations: &[TrackedObject<SearcherGeneration>]) {
|
||||
let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
|
||||
live_generations
|
||||
.iter()
|
||||
.flat_map(|gen| gen.segments())
|
||||
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
|
||||
.collect();
|
||||
let mut price_cache_wrt = self.price_cache.write().unwrap();
|
||||
// let price_cache = std::mem::take(&mut *price_cache_wrt);
|
||||
// Drain would be nicer here.
|
||||
*price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
|
||||
.into_iter()
|
||||
.filter(|(seg_id_and_op, _)| !live_segment_id_and_delete_ops.contains(seg_id_and_op))
|
||||
.collect();
|
||||
}
|
||||
}
|
||||
|
||||
/// For the sake of this example, the table is just an editable HashMap behind a RwLock.
/// It represents a (ProductId -> Price) mapping.
///
/// In practice, it could fetch prices from an external service, like a SQL table.
|
||||
///
|
||||
#[derive(Default, Clone)]
|
||||
pub struct ExternalPriceTable {
|
||||
prices: Arc<RwLock<HashMap<ProductId, Price>>>,
|
||||
}
|
||||
|
||||
impl ExternalPriceTable {
|
||||
pub fn update_price(&self, product_id: ProductId, price: Price) {
|
||||
let mut prices_wrt = self.prices.write().unwrap();
|
||||
prices_wrt.insert(product_id, price);
|
||||
}
|
||||
}
|
||||
|
||||
impl PriceFetcher for ExternalPriceTable {
|
||||
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
|
||||
let prices_read = self.prices.read().unwrap();
|
||||
product_ids
|
||||
.iter()
|
||||
.map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// Declaring our schema.
|
||||
let mut schema_builder = Schema::builder();
|
||||
// The product id is assumed to be a primary id for our external price source.
|
||||
let product_id = schema_builder.add_u64_field("product_id", FAST);
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
let schema: Schema = schema_builder.build();
|
||||
|
||||
let price_table = ExternalPriceTable::default();
|
||||
let price_dynamic_column = Arc::new(DynamicPriceColumn::with_product_id_field(
|
||||
product_id,
|
||||
price_table.clone(),
|
||||
));
|
||||
price_table.update_price(OLIVE_OIL, 12);
|
||||
price_table.update_price(GLOVES, 13);
|
||||
price_table.update_price(SNEAKERS, 80);
|
||||
|
||||
const OLIVE_OIL: ProductId = 323423;
|
||||
const GLOVES: ProductId = 3966623;
|
||||
const SNEAKERS: ProductId = 23222;
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
|
||||
writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
|
||||
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
|
||||
writer.commit()?;
|
||||
|
||||
let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
|
||||
&(price_dynamic_column.clone() as Arc<dyn Warmer>),
|
||||
)];
|
||||
let reader: IndexReader = index
|
||||
.reader_builder()
|
||||
.warmers(warmers)
|
||||
.num_searchers(1)
|
||||
.try_into()?;
|
||||
reader.reload()?;
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![text]);
|
||||
let query = query_parser.parse_query("cooking")?;
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let score_by_price = move |segment_reader: &SegmentReader| {
|
||||
let price = price_dynamic_column
|
||||
.price_for_segment(segment_reader)
|
||||
.unwrap();
|
||||
move |doc_id: DocId| Reverse(price[doc_id as usize])
|
||||
};
|
||||
|
||||
let most_expensive_first = TopDocs::with_limit(10).custom_score(score_by_price);
|
||||
|
||||
let hits = searcher.search(&query, &most_expensive_first)?;
|
||||
assert_eq!(
|
||||
&hits,
|
||||
&[
|
||||
(
|
||||
Reverse(12u32),
|
||||
DocAddress {
|
||||
segment_ord: 0,
|
||||
doc_id: 0u32
|
||||
}
|
||||
),
|
||||
(
|
||||
Reverse(13u32),
|
||||
DocAddress {
|
||||
segment_ord: 0,
|
||||
doc_id: 1u32
|
||||
}
|
||||
),
|
||||
]
|
||||
);
|
||||
|
||||
// Olive oil just got more expensive!
|
||||
price_table.update_price(OLIVE_OIL, 15);
|
||||
|
||||
// The price updates are directly reflected on `reload`.
//
// Be careful here though!
// You may have spotted that we are still using the same `Searcher`.
//
// It is up to the `Warmer` implementer to decide how
// to control this behavior.
|
||||
|
||||
reader.reload()?;
|
||||
|
||||
let hits_with_new_prices = searcher.search(&query, &most_expensive_first)?;
|
||||
assert_eq!(
|
||||
&hits_with_new_prices,
|
||||
&[
|
||||
(
|
||||
Reverse(13u32),
|
||||
DocAddress {
|
||||
segment_ord: 0,
|
||||
doc_id: 1u32
|
||||
}
|
||||
),
|
||||
(
|
||||
Reverse(15u32),
|
||||
DocAddress {
|
||||
segment_ord: 0,
|
||||
doc_id: 0u32
|
||||
}
|
||||
),
|
||||
]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
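Distilled from the example above: `warm` runs against every new `Searcher` before it is published, and `garbage_collect` receives the still-live `SearcherGeneration`s so that per-segment artifacts can be dropped. A minimal, do-nothing sketch, assuming the same trait signatures used in `examples/warmer.rs`:

```rust
use std::sync::Arc;
use tantivy::{Searcher, SearcherGeneration, TrackedObject, Warmer};

/// A warmer that only logs what it would do; a starting skeleton, not a real warmer.
struct NoopWarmer;

impl Warmer for NoopWarmer {
    fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
        // Called before the searcher becomes visible: a good place to build
        // per-(segment_id, delete_opstamp) caches, as DynamicPriceColumn does above.
        println!("warming {} segment(s)", searcher.segment_readers().len());
        Ok(())
    }

    fn garbage_collect(&self, live_generations: &[TrackedObject<SearcherGeneration>]) {
        // Anything keyed on a generation or segment not listed here can be dropped.
        println!("{} generation(s) still live", live_generations.len());
    }
}

fn main() {
    // Registration mirrors the example above: warmers are handed to the reader
    // builder as weak references to Arc<dyn Warmer>.
    let _warmer: Arc<dyn Warmer> = Arc::new(NoopWarmer);
}
```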
@@ -91,10 +91,6 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Almost monotonically increasing"));
|
||||
|
||||
let data = (1000..=200_000_u64)
|
||||
.map(|_| rand::random::<u8>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Random"));
|
||||
data_and_names
|
||||
}
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![allow(clippy::return_self_not_must_use)]
|
||||
|
||||
use stable_deref_trait::StableDeref;
|
||||
use std::convert::TryInto;
|
||||
use std::mem;
|
||||
@@ -35,6 +37,8 @@ impl OwnedBytes {
|
||||
}
|
||||
|
||||
/// creates a fileslice that is just a view over a slice of the data.
|
||||
#[must_use]
|
||||
#[inline]
|
||||
pub fn slice(&self, range: Range<usize>) -> Self {
|
||||
OwnedBytes {
|
||||
data: &self.data[range],
|
||||
@@ -63,6 +67,8 @@ impl OwnedBytes {
|
||||
/// On the other hand, both `left` and `right` retain a handle over
|
||||
/// the entire slice of memory. In other words, the memory will only
|
||||
/// be released when both left and right are dropped.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
|
||||
let right_box_stable_deref = self.box_stable_deref.clone();
|
||||
let left = OwnedBytes {
|
||||
@@ -97,7 +103,6 @@ impl OwnedBytes {
|
||||
|
||||
/// Drops the left most `advance_len` bytes.
|
||||
///
|
||||
/// See also [.clip(clip_len: usize))](#method.clip).
|
||||
#[inline]
|
||||
pub fn advance(&mut self, advance_len: usize) {
|
||||
self.data = &self.data[advance_len..]
|
||||
|
||||
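The `#[must_use]` additions above cover `slice` and `split`, which return new handles instead of mutating. A small usage sketch (the `OwnedBytes::new(Vec<u8>)` constructor and the `as_slice` accessor are assumptions about the `ownedbytes` crate, not shown in this diff):

```rust
use ownedbytes::OwnedBytes;

fn main() {
    let bytes = OwnedBytes::new(vec![0u8, 1, 2, 3, 4, 5, 6, 7]);

    // `slice` hands back a new view; silently dropping the result would be a bug,
    // hence `#[must_use]`.
    let view = bytes.slice(2..5);
    assert_eq!(view.as_slice(), &[2, 3, 4]);

    // Both halves keep the underlying allocation alive until both are dropped.
    let (left, right) = bytes.split(4);
    assert_eq!(left.as_slice(), &[0, 1, 2, 3]);
    assert_eq!(right.as_slice(), &[4, 5, 6, 7]);

    // `advance` mutates in place: it drops the left-most bytes from the view.
    let mut rest = right;
    rest.advance(2);
    assert_eq!(rest.as_slice(), &[6, 7]);
}
```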
@@ -91,6 +91,7 @@ pub enum UserInputAst {
|
||||
}
|
||||
|
||||
impl UserInputAst {
|
||||
#[must_use]
|
||||
pub fn unary(self, occur: Occur) -> UserInputAst {
|
||||
UserInputAst::Clause(vec![(Some(occur), self)])
|
||||
}
|
||||
|
||||
@@ -83,7 +83,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// ```rust
|
||||
/// use tantivy::collector::FacetCollector;
|
||||
/// use tantivy::query::AllQuery;
|
||||
/// use tantivy::schema::{Facet, Schema, INDEXED, TEXT};
|
||||
/// use tantivy::schema::{Facet, Schema, FacetOptions, TEXT};
|
||||
/// use tantivy::{doc, Index};
|
||||
///
|
||||
/// fn example() -> tantivy::Result<()> {
|
||||
@@ -92,7 +92,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// // Facets have their own specific type.
/// // It is not a bad practice to put all of your
/// // facet information in the same field.
/// let facet = schema_builder.add_facet_field("facet", INDEXED);
/// let facet = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
@@ -400,7 +400,7 @@ impl<'a> Iterator for FacetChildIterator<'a> {
|
||||
|
||||
impl FacetCounts {
|
||||
/// Returns an iterator over all of the facet count pairs inside this result.
|
||||
/// See the documentation for `FacetCollector` for a usage example.
|
||||
/// See the documentation for [FacetCollector] for a usage example.
|
||||
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator<'_>
|
||||
where
|
||||
Facet: From<T>,
|
||||
@@ -421,7 +421,7 @@ impl FacetCounts {
|
||||
}
|
||||
|
||||
/// Returns a vector of top `k` facets with their counts, sorted highest-to-lowest by counts.
|
||||
/// See the documentation for `FacetCollector` for a usage example.
|
||||
/// See the documentation for [FacetCollector] for a usage example.
|
||||
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
|
||||
where
|
||||
Facet: From<T>,
|
||||
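The `get` and `top_k` accessors documented above are consumed together with `FacetCollector`. A condensed usage sketch, assuming `FacetCollector::for_field` and `add_facet` as in tantivy's public API; the facet values are illustrative:

```rust
use tantivy::collector::FacetCollector;
use tantivy::query::AllQuery;
use tantivy::schema::{Facet, FacetOptions, Schema};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let facet = schema_builder.add_facet_field("facet", FacetOptions::default());
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
    writer.add_document(doc!(facet => Facet::from_text("/lang/en").unwrap()))?;
    writer.add_document(doc!(facet => Facet::from_text("/lang/en").unwrap()))?;
    writer.add_document(doc!(facet => Facet::from_text("/lang/fr").unwrap()))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();

    // Collect counts under the "/lang" prefix.
    let mut facet_collector = FacetCollector::for_field(facet);
    facet_collector.add_facet("/lang");
    let counts = searcher.search(&AllQuery, &facet_collector)?;

    // `get` iterates over (facet, count) pairs; `top_k` returns them sorted by count.
    for (sub_facet, count) in counts.get("/lang") {
        println!("{}: {}", sub_facet, count);
    }
    let top = counts.top_k("/lang", 1);
    assert_eq!(top[0].1, 2); // "/lang/en" appears twice
    Ok(())
}
```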
@@ -462,7 +462,7 @@ mod tests {
|
||||
use crate::collector::Count;
|
||||
use crate::core::Index;
|
||||
use crate::query::{AllQuery, QueryParser, TermQuery};
|
||||
use crate::schema::{Document, Facet, Field, IndexRecordOption, Schema, INDEXED};
|
||||
use crate::schema::{Document, Facet, FacetOptions, Field, IndexRecordOption, Schema};
|
||||
use crate::Term;
|
||||
use rand::distributions::Uniform;
|
||||
use rand::prelude::SliceRandom;
|
||||
@@ -472,7 +472,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_facet_collector_drilldown() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
@@ -533,7 +533,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_doc_unsorted_multifacet() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facets", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
@@ -558,7 +558,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_doc_search_by_facet() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
@@ -615,7 +615,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_facet_collector_topk() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
@@ -664,7 +664,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_facet_collector_topk_tie_break() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
@@ -727,7 +727,7 @@ mod bench {
|
||||
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
|
||||
use crate::schema::Field;
|
||||
use crate::{Score, SegmentReader, TantivyError};
|
||||
|
||||
/// The `FilterCollector` collector filters docs using a fast field value and a predicate.
|
||||
/// The `FilterCollector` filters docs using a fast field value and a predicate.
|
||||
/// Only the documents for which the predicate returned "true" will be passed on to the next collector.
|
||||
///
|
||||
/// ```rust
|
||||
|
||||
@@ -210,6 +210,7 @@ impl TopDocs {
|
||||
/// Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
#[must_use]
|
||||
pub fn and_offset(self, offset: usize) -> TopDocs {
|
||||
TopDocs(self.0.and_offset(offset))
|
||||
}
|
||||
|
||||
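`and_offset` composes with `with_limit` for paging through results. A one-line sketch:

```rust
use tantivy::collector::TopDocs;

fn main() {
    // Page 3 of 10-hit pages: skip the first 20 hits, then keep the next 10.
    let _page_3 = TopDocs::with_limit(10).and_offset(20);
}
```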
@@ -217,7 +217,7 @@ impl Index {
|
||||
/// Replace the default single thread search executor pool
|
||||
/// by a thread pool with a given number of threads.
|
||||
pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> {
|
||||
self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-")?);
|
||||
self.executor = Arc::new(Executor::multi_thread(num_threads, "tantivy-search-")?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
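A quick sketch of the renamed search thread pool in use (the thread count is arbitrary):

```rust
use tantivy::schema::Schema;
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let schema = Schema::builder().build();
    let mut index = Index::create_in_ram(schema);
    // Search across segments with 4 threads; the pool's threads are now named "tantivy-search-*".
    index.set_multithread_executor(4)?;
    Ok(())
}
```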
@@ -2,7 +2,7 @@ use super::SegmentComponent;
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use crate::{core::SegmentId, store::Compressor};
|
||||
use census::{Inventory, TrackedObject};
|
||||
use crate::{Inventory, TrackedObject};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::PathBuf;
|
||||
use std::{collections::HashSet, sync::atomic::AtomicBool};
|
||||
@@ -189,6 +189,10 @@ impl SegmentMeta {
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
|
||||
assert!(
|
||||
num_deleted_docs <= self.max_doc(),
|
||||
"There cannot be more deleted docs than there are docs."
|
||||
);
|
||||
let delete_meta = DeleteMeta {
|
||||
num_deleted_docs,
|
||||
opstamp,
|
||||
@@ -394,7 +398,7 @@ mod tests {
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ pub use self::index_meta::{
|
||||
IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta, SegmentMetaInventory,
|
||||
};
|
||||
pub use self::inverted_index_reader::InvertedIndexReader;
|
||||
pub use self::searcher::Searcher;
|
||||
pub use self::searcher::{Searcher, SearcherGeneration};
|
||||
pub use self::segment::Segment;
|
||||
pub use self::segment_component::SegmentComponent;
|
||||
pub use self::segment_id::SegmentId;
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use crate::collector::Collector;
|
||||
use crate::core::Executor;
|
||||
|
||||
use crate::core::SegmentReader;
|
||||
use crate::query::Query;
|
||||
use crate::schema::Document;
|
||||
@@ -10,9 +9,62 @@ use crate::space_usage::SearcherSpaceUsage;
|
||||
use crate::store::StoreReader;
|
||||
use crate::DocAddress;
|
||||
use crate::Index;
|
||||
use crate::Opstamp;
|
||||
use crate::SegmentId;
|
||||
use crate::TrackedObject;
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::{fmt, io};
|
||||
|
||||
/// Identifies the searcher generation accessed by a [Searcher].
|
||||
///
|
||||
/// While this might seem redundant, a [SearcherGeneration] contains
|
||||
/// both a `generation_id` AND a list of `(SegmentId, DeleteOpstamp)`.
|
||||
///
|
||||
/// This is on purpose. This object is used by the `Warmer` API.
|
||||
/// Having both pieces of information makes it possible to identify which
/// artifacts should be refreshed or garbage collected.
|
||||
///
|
||||
/// Depending on the use case, `Warmer`'s implementers can decide to
|
||||
/// produce artifacts per:
|
||||
/// - `generation_id` (e.g. some searcher level aggregates)
|
||||
/// - `(segment_id, delete_opstamp)` (e.g. segment level aggregates)
|
||||
/// - `segment_id` (e.g. for immutable document level information)
|
||||
/// - `(generation_id, segment_id)` (e.g. for consistent dynamic column)
|
||||
/// - ...
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct SearcherGeneration {
|
||||
segments: BTreeMap<SegmentId, Option<Opstamp>>,
|
||||
generation_id: u64,
|
||||
}
|
||||
|
||||
impl SearcherGeneration {
|
||||
pub(crate) fn from_segment_readers(
|
||||
segment_readers: &[SegmentReader],
|
||||
generation_id: u64,
|
||||
) -> Self {
|
||||
let mut segment_id_to_del_opstamp = BTreeMap::new();
|
||||
for segment_reader in segment_readers {
|
||||
segment_id_to_del_opstamp
|
||||
.insert(segment_reader.segment_id(), segment_reader.delete_opstamp());
|
||||
}
|
||||
Self {
|
||||
segments: segment_id_to_del_opstamp,
|
||||
generation_id,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the searcher generation id.
|
||||
pub fn generation_id(&self) -> u64 {
|
||||
self.generation_id
|
||||
}
|
||||
|
||||
/// Return a `(SegmentId -> DeleteOpstamp)` mapping.
|
||||
pub fn segments(&self) -> &BTreeMap<SegmentId, Option<Opstamp>> {
|
||||
&self.segments
|
||||
}
|
||||
}
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
///
|
||||
/// It guarantees that the `Segment` will not be removed before
|
||||
@@ -23,6 +75,7 @@ pub struct Searcher {
|
||||
index: Index,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
store_readers: Vec<StoreReader>,
|
||||
generation: TrackedObject<SearcherGeneration>,
|
||||
}
|
||||
|
||||
impl Searcher {
|
||||
@@ -31,6 +84,7 @@ impl Searcher {
|
||||
schema: Schema,
|
||||
index: Index,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
generation: TrackedObject<SearcherGeneration>,
|
||||
) -> io::Result<Searcher> {
|
||||
let store_readers: Vec<StoreReader> = segment_readers
|
||||
.iter()
|
||||
@@ -41,6 +95,7 @@ impl Searcher {
|
||||
index,
|
||||
segment_readers,
|
||||
store_readers,
|
||||
generation,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -49,6 +104,11 @@ impl Searcher {
|
||||
&self.index
|
||||
}
|
||||
|
||||
/// [SearcherGeneration] which identifies the version of the snapshot held by this `Searcher`.
|
||||
pub fn generation(&self) -> &SearcherGeneration {
|
||||
self.generation.as_ref()
|
||||
}
|
||||
|
||||
/// Fetches a document from tantivy's store given a `DocAddress`.
|
||||
///
|
||||
/// The searcher uses the segment ordinal to route the
|
||||
|
||||
@@ -17,6 +17,7 @@ use crate::space_usage::SegmentSpaceUsage;
|
||||
use crate::store::StoreReader;
|
||||
use crate::termdict::TermDictionary;
|
||||
use crate::DocId;
|
||||
use crate::Opstamp;
|
||||
use fail::fail_point;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
@@ -38,6 +39,8 @@ pub struct SegmentReader {
|
||||
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
|
||||
|
||||
segment_id: SegmentId,
|
||||
delete_opstamp: Option<Opstamp>,
|
||||
|
||||
max_doc: DocId,
|
||||
num_docs: DocId,
|
||||
|
||||
@@ -100,7 +103,7 @@ impl SegmentReader {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
|
||||
match field_entry.field_type() {
|
||||
FieldType::HierarchicalFacet(_) => {
|
||||
FieldType::Facet(_) => {
|
||||
let term_ords_reader = self.fast_fields().u64s(field)?;
|
||||
let termdict = self
|
||||
.termdict_composite
|
||||
@@ -127,13 +130,17 @@ impl SegmentReader {
|
||||
self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
|
||||
let field_name = self.schema.get_field_name(field);
|
||||
let err_msg = format!(
|
||||
"Field norm not found for field {:?}. Was it marked as indexed during indexing?",
|
||||
"Field norm not found for field {:?}. Was the field set to record norm during indexing?",
|
||||
field_name
|
||||
);
|
||||
crate::TantivyError::SchemaError(err_msg)
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn fieldnorms_readers(&self) -> &FieldNormReaders {
|
||||
&self.fieldnorm_readers
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `StoreReader`.
|
||||
pub fn get_store_reader(&self) -> io::Result<StoreReader> {
|
||||
StoreReader::open(self.store_file.clone())
|
||||
@@ -201,6 +208,7 @@ impl SegmentReader {
|
||||
fast_fields_readers: fast_field_readers,
|
||||
fieldnorm_readers,
|
||||
segment_id: segment.id(),
|
||||
delete_opstamp: segment.meta().delete_opstamp(),
|
||||
store_file,
|
||||
alive_bitset_opt,
|
||||
positions_composite,
|
||||
@@ -286,6 +294,11 @@ impl SegmentReader {
|
||||
self.segment_id
|
||||
}
|
||||
|
||||
/// Returns the delete opstamp
|
||||
pub fn delete_opstamp(&self) -> Option<Opstamp> {
|
||||
self.delete_opstamp
|
||||
}
|
||||
|
||||
/// Returns the bitset representing
|
||||
/// the documents that have been deleted.
|
||||
pub fn alive_bitset(&self) -> Option<&AliveBitSet> {
|
||||
|
||||
@@ -43,10 +43,8 @@ impl RetryPolicy {
|
||||
}
|
||||
|
||||
/// The `DirectoryLock` is an object that represents a file lock.
|
||||
/// See [`LockType`](struct.LockType.html)
|
||||
///
|
||||
/// It is transparently associated to a lock file, that gets deleted
|
||||
/// on `Drop.` The lock is released automatically on `Drop`.
|
||||
/// It is associated to a lock file, that gets deleted on `Drop.`
|
||||
pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);
|
||||
|
||||
struct DirectoryLockGuard {
|
||||
@@ -142,10 +140,16 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// Opens a writer for the *virtual file* associated with
|
||||
/// a Path.
|
||||
///
|
||||
/// Right after this call, the file should be created
|
||||
/// and any subsequent call to `open_read` for the
|
||||
/// Right after this call, for the span of the execution of the program
|
||||
/// the file should be created and any subsequent call to `open_read` for the
|
||||
/// same path should return a `FileSlice`.
|
||||
///
|
||||
/// However, depending on the directory implementation,
|
||||
/// it might be required to call `sync_directory` to ensure
|
||||
/// that the file is durably created.
|
||||
/// (The semantics here are the same when dealing with
|
||||
/// a posix filesystem.)
|
||||
///
|
||||
/// Write operations may be aggressively buffered.
|
||||
/// The client of this trait is responsible for calling flush
|
||||
/// to ensure that subsequent `read` operations
|
||||
@@ -176,6 +180,12 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// The file may or may not previously exist.
|
||||
fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()>;
|
||||
|
||||
/// Sync the directory.
|
||||
///
|
||||
/// This call is required to ensure that newly created files are
|
||||
/// effectively stored durably.
|
||||
fn sync_directory(&self) -> io::Result<()>;
|
||||
|
||||
/// Acquire a lock in the given directory.
|
||||
///
|
||||
/// The method is blocking or not depending on the `Lock` object.
|
||||
|
||||
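The new `sync_directory` trait method pairs with writes when durability matters: it is a no-op for `RamDirectory` and an fsync of the directory handle for `MmapDirectory`, as shown later in this diff. A rough sketch using only trait methods that appear here (the file name and payload are invented):

```rust
use std::path::Path;
use tantivy::directory::{Directory, RamDirectory};

fn main() -> std::io::Result<()> {
    let directory = RamDirectory::create();

    // `atomic_write` creates (or replaces) the file in one step...
    directory.atomic_write(Path::new("meta.json"), br#"{"opstamp": 0}"#)?;

    // ...and `sync_directory` is the hook for making the creation of new files
    // durable on filesystems that require syncing the parent directory.
    directory.sync_directory()?;

    assert!(directory.exists(Path::new("meta.json")).unwrap());
    Ok(())
}
```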
@@ -7,8 +7,8 @@ use std::path::PathBuf;
|
||||
/// [`LockParams`](./enum.LockParams.html).
|
||||
/// Tantivy itself uses only two locks but client application
|
||||
/// can use the directory facility to define their own locks.
|
||||
/// - [INDEX_WRITER_LOCK](./struct.INDEX_WRITER_LOCK.html)
|
||||
/// - [META_LOCK](./struct.META_LOCK.html)
|
||||
/// - [INDEX_WRITER_LOCK]
|
||||
/// - [META_LOCK]
|
||||
///
|
||||
/// Check out these locks documentation for more information.
|
||||
///
|
||||
|
||||
@@ -39,6 +39,16 @@ pub enum OpenDirectoryError {
|
||||
},
|
||||
}
|
||||
|
||||
impl OpenDirectoryError {
|
||||
/// Wraps an io error.
|
||||
pub fn wrap_io_error(io_error: io::Error, directory_path: PathBuf) -> Self {
|
||||
Self::IoError {
|
||||
io_error,
|
||||
directory_path,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that may occur when starting to write in a file
|
||||
#[derive(Debug, Error)]
|
||||
pub enum OpenWriteError {
|
||||
|
||||
@@ -66,6 +66,7 @@ impl FileSlice {
|
||||
|
||||
/// Wraps a FileHandle.
|
||||
#[doc(hidden)]
|
||||
#[must_use]
|
||||
pub fn new_with_num_bytes(file_handle: Box<dyn FileHandle>, num_bytes: usize) -> Self {
|
||||
FileSlice {
|
||||
data: Arc::from(file_handle),
|
||||
|
||||
@@ -43,14 +43,16 @@ impl FileWatcher {
|
||||
thread::Builder::new()
|
||||
.name("thread-tantivy-meta-file-watcher".to_string())
|
||||
.spawn(move || {
|
||||
let mut current_checksum = None;
|
||||
let mut current_checksum_opt = None;
|
||||
|
||||
while state.load(Ordering::SeqCst) == 1 {
|
||||
if let Ok(checksum) = FileWatcher::compute_checksum(&path) {
|
||||
// `None.unwrap_or_else(|| !checksum) != checksum` evaluates to `true`
|
||||
if current_checksum.unwrap_or_else(|| !checksum) != checksum {
|
||||
let metafile_has_changed = current_checksum_opt
|
||||
.map(|current_checksum| current_checksum != checksum)
|
||||
.unwrap_or(true);
|
||||
if metafile_has_changed {
|
||||
info!("Meta file {:?} was modified", path);
|
||||
current_checksum = Some(checksum);
|
||||
current_checksum_opt = Some(checksum);
|
||||
futures::executor::block_on(callbacks.broadcast());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -192,6 +192,7 @@ impl ManagedDirectory {
|
||||
for delete_file in &deleted_files {
|
||||
managed_paths_write.remove(delete_file);
|
||||
}
|
||||
self.directory.sync_directory()?;
|
||||
save_managed_paths(self.directory.as_mut(), &meta_informations_wlock)?;
|
||||
}
|
||||
|
||||
@@ -222,9 +223,22 @@ impl ManagedDirectory {
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
|
||||
if has_changed {
|
||||
save_managed_paths(self.directory.as_ref(), &meta_wlock)?;
|
||||
if !has_changed {
|
||||
return Ok(());
|
||||
}
|
||||
save_managed_paths(self.directory.as_ref(), &meta_wlock)?;
|
||||
// This is not the first file we add.
// Therefore, we are sure that `.managed.json` has already been
// properly created and we do not need to sync its parent directory.
//
// (It might seem like a nicer solution to create the managed.json on the
// creation of the ManagedDirectory instance, but it would actually
// prevent the use of read-only directories.)
|
||||
let managed_file_definitely_already_exists = meta_wlock.managed_paths.len() > 1;
|
||||
if managed_file_definitely_already_exists {
|
||||
return Ok(());
|
||||
}
|
||||
self.directory.sync_directory()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -310,6 +324,11 @@ impl Directory for ManagedDirectory {
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
|
||||
self.directory.watch(watch_callback)
|
||||
}
|
||||
|
||||
fn sync_directory(&self) -> io::Result<()> {
|
||||
self.directory.sync_directory()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for ManagedDirectory {
|
||||
|
||||
@@ -193,16 +193,19 @@ impl MmapDirectory {
|
||||
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let directory_path: &Path = directory_path.as_ref();
|
||||
if !directory_path.exists() {
|
||||
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
|
||||
return Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
|
||||
directory_path,
|
||||
)))
|
||||
} else if !directory_path.is_dir() {
|
||||
Err(OpenDirectoryError::NotADirectory(PathBuf::from(
|
||||
directory_path,
|
||||
)))
|
||||
} else {
|
||||
Ok(MmapDirectory::new(PathBuf::from(directory_path), None))
|
||||
)));
|
||||
}
|
||||
let canonical_path: PathBuf = directory_path.canonicalize().map_err(|io_err| {
|
||||
OpenDirectoryError::wrap_io_error(io_err, PathBuf::from(directory_path))
|
||||
})?;
|
||||
if !canonical_path.is_dir() {
|
||||
return Err(OpenDirectoryError::NotADirectory(PathBuf::from(
|
||||
directory_path,
|
||||
)));
|
||||
}
|
||||
Ok(MmapDirectory::new(canonical_path, None))
|
||||
}
|
||||
|
||||
/// Joins a relative_path to the directory `root_path`
|
||||
@@ -211,33 +214,6 @@ impl MmapDirectory {
|
||||
self.inner.root_path.join(relative_path)
|
||||
}
|
||||
|
||||
/// Sync the root directory.
|
||||
/// In certain FS, this is required to persistently create
|
||||
/// a file.
|
||||
fn sync_directory(&self) -> Result<(), io::Error> {
|
||||
let mut open_opts = OpenOptions::new();
|
||||
|
||||
// Linux needs read to be set, otherwise returns EINVAL
|
||||
// write must not be set, or it fails with EISDIR
|
||||
open_opts.read(true);
|
||||
|
||||
// On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
|
||||
// and calling sync_all() only works if write access is requested.
|
||||
#[cfg(windows)]
|
||||
{
|
||||
use std::os::windows::fs::OpenOptionsExt;
|
||||
use winapi::um::winbase;
|
||||
|
||||
open_opts
|
||||
.write(true)
|
||||
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
|
||||
}
|
||||
|
||||
let fd = open_opts.open(&self.inner.root_path)?;
|
||||
fd.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns some statistical information
|
||||
/// about the Mmap cache.
|
||||
///
|
||||
@@ -288,8 +264,7 @@ impl Write for SafeFileWriter {
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.0.flush()?;
|
||||
self.0.sync_all()
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -301,7 +276,9 @@ impl Seek for SafeFileWriter {
|
||||
|
||||
impl TerminatingWrite for SafeFileWriter {
|
||||
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
|
||||
self.flush()
|
||||
self.0.flush()?;
|
||||
self.0.sync_data()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -331,6 +308,7 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
|
||||
let mut tempfile = tempfile::Builder::new().tempfile_in(&parent_path)?;
|
||||
tempfile.write_all(content)?;
|
||||
tempfile.flush()?;
|
||||
tempfile.as_file_mut().sync_data()?;
|
||||
tempfile.into_temp_path().persist(path)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -365,22 +343,17 @@ impl Directory for MmapDirectory {
|
||||
/// removed before the file is deleted.
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
let full_path = self.resolve_path(path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => self.sync_directory().map_err(|e| DeleteError::IoError {
|
||||
io_error: e,
|
||||
filepath: path.to_path_buf(),
|
||||
}),
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Err(DeleteError::FileDoesNotExist(path.to_owned()))
|
||||
} else {
|
||||
Err(DeleteError::IoError {
|
||||
io_error: e,
|
||||
filepath: path.to_path_buf(),
|
||||
})
|
||||
fs::remove_file(&full_path).map_err(|e| {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
DeleteError::FileDoesNotExist(path.to_owned())
|
||||
} else {
|
||||
DeleteError::IoError {
|
||||
io_error: e,
|
||||
filepath: path.to_path_buf(),
|
||||
}
|
||||
}
|
||||
}
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn exists(&self, path: &Path) -> Result<bool, OpenReadError> {
|
||||
@@ -409,10 +382,13 @@ impl Directory for MmapDirectory {
|
||||
file.flush()
|
||||
.map_err(|io_error| OpenWriteError::wrap_io_error(io_error, path.to_path_buf()))?;
|
||||
|
||||
// Apparently, on some filesystems, syncing the parent
// directory is required.
|
||||
self.sync_directory()
|
||||
.map_err(|io_err| OpenWriteError::wrap_io_error(io_err, path.to_path_buf()))?;
|
||||
// Note we actually do not sync the parent directory here.
//
// A newly created file may, in some cases, be created and even flushed to disk,
// and then lost...
//
// The file will only be durably written after we terminate AND
// sync_directory() is called.
|
||||
|
||||
let writer = SafeFileWriter::new(file);
|
||||
Ok(BufWriter::new(Box::new(writer)))
|
||||
@@ -442,7 +418,7 @@ impl Directory for MmapDirectory {
|
||||
debug!("Atomic Write {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
atomic_write(&full_path, content)?;
|
||||
self.sync_directory()
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
|
||||
@@ -468,6 +444,30 @@ impl Directory for MmapDirectory {
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
|
||||
Ok(self.inner.watch(watch_callback))
|
||||
}
|
||||
|
||||
fn sync_directory(&self) -> Result<(), io::Error> {
|
||||
let mut open_opts = OpenOptions::new();
|
||||
|
||||
// Linux needs read to be set, otherwise returns EINVAL
|
||||
// write must not be set, or it fails with EISDIR
|
||||
open_opts.read(true);
|
||||
|
||||
// On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
|
||||
// and calling sync_all() only works if write access is requested.
|
||||
#[cfg(windows)]
|
||||
{
|
||||
use std::os::windows::fs::OpenOptionsExt;
|
||||
use winapi::um::winbase;
|
||||
|
||||
open_opts
|
||||
.write(true)
|
||||
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
|
||||
}
|
||||
|
||||
let fd = open_opts.open(&self.inner.root_path)?;
|
||||
fd.sync_data()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*!
|
||||
|
||||
WORM directory abstraction.
|
||||
WORM (Write Once Read Many) directory abstraction.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
@@ -225,6 +225,10 @@ impl Directory for RamDirectory {
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
|
||||
Ok(self.fs.write().unwrap().watch(watch_callback))
|
||||
}
|
||||
|
||||
fn sync_directory(&self) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -84,81 +84,12 @@ impl FacetReader {
|
||||
mod tests {
|
||||
use crate::Index;
|
||||
use crate::{
|
||||
schema::{Facet, FacetOptions, SchemaBuilder, Value, INDEXED, STORED},
|
||||
schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED},
|
||||
DocAddress, Document,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn test_facet_only_indexed() -> crate::Result<()> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
.segment_reader(0u32)
|
||||
.facet_reader(facet_field)
|
||||
.unwrap();
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert_eq!(&facet_ords, &[2u64]);
|
||||
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let value = doc.get_first(facet_field).and_then(Value::path);
|
||||
assert_eq!(value, None);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_facet_only_stored() -> crate::Result<()> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let facet_field = schema_builder.add_facet_field("facet", STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
.segment_reader(0u32)
|
||||
.facet_reader(facet_field)
|
||||
.unwrap();
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert!(facet_ords.is_empty());
|
||||
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let value = doc.get_first(facet_field).and_then(Value::path);
|
||||
assert_eq!(value, Some("/a/b".to_string()));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_facet_stored_and_indexed() -> crate::Result<()> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let facet_field = schema_builder.add_facet_field("facet", STORED | INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
.segment_reader(0u32)
|
||||
.facet_reader(facet_field)
|
||||
.unwrap();
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert_eq!(&facet_ords, &[2u64]);
|
||||
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let value = doc.get_first(facet_field).and_then(Value::path);
|
||||
assert_eq!(value, Some("/a/b".to_string()));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_facet_neither_stored_and_indexed() -> crate::Result<()> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
@@ -173,17 +104,40 @@ mod tests {
|
||||
.unwrap();
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert!(facet_ords.is_empty());
|
||||
assert_eq!(&facet_ords, &[2u64]);
|
||||
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let value = doc.get_first(facet_field).and_then(Value::path);
|
||||
let value = doc.get_first(facet_field).and_then(Value::facet);
|
||||
assert_eq!(value, None);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_facet_stored_and_indexed() -> crate::Result<()> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let facet_field = schema_builder.add_facet_field("facet", STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
.segment_reader(0u32)
|
||||
.facet_reader(facet_field)
|
||||
.unwrap();
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert_eq!(&facet_ords, &[2u64]);
|
||||
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::facet);
|
||||
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_facet_not_populated_for_all_docs() -> crate::Result<()> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
@@ -206,7 +160,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_facet_not_populated_for_any_docs() -> crate::Result<()> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
@@ -110,7 +110,7 @@ impl FastValue for u64 {
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
FieldType::HierarchicalFacet(_) => Some(Cardinality::MultiValues),
|
||||
FieldType::Facet(_) => Some(Cardinality::MultiValues),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,9 +12,9 @@ mod tests {
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::FacetOptions;
|
||||
use crate::schema::IntOptions;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::INDEXED;
|
||||
use crate::Document;
|
||||
use crate::Index;
|
||||
use crate::Term;
|
||||
@@ -23,7 +23,7 @@ mod tests {
|
||||
use proptest::prop_oneof;
|
||||
use proptest::proptest;
|
||||
use proptest::strategy::Strategy;
|
||||
use test_env_log::test;
|
||||
use test_log::test;
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_u64() -> crate::Result<()> {
|
||||
@@ -68,6 +68,7 @@ mod tests {
|
||||
IntOptions::default()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed()
|
||||
.set_fieldnorm()
|
||||
.set_stored(),
|
||||
);
|
||||
let time_i =
|
||||
@@ -334,7 +335,7 @@ mod tests {
|
||||
#[ignore]
|
||||
fn test_many_facets() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_facet_field("facetfield", INDEXED);
|
||||
let field = schema_builder.add_facet_field("facetfield", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
@@ -91,12 +91,12 @@ impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
|
||||
mod tests {
|
||||
|
||||
use crate::core::Index;
|
||||
use crate::schema::{Cardinality, Facet, IntOptions, Schema, INDEXED};
|
||||
use crate::schema::{Cardinality, Facet, FacetOptions, IntOptions, Schema};
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facets", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
@@ -40,7 +40,7 @@ fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality
|
||||
FieldType::Date(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::Date, cardinality)),
|
||||
FieldType::HierarchicalFacet(_) => Some((FastType::U64, Cardinality::MultiValues)),
|
||||
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -54,7 +54,7 @@ impl FastFieldsWriter {
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
FieldType::HierarchicalFacet(_) => {
|
||||
FieldType::Facet(_) => {
|
||||
let fast_field_writer = MultiValuedFastFieldWriter::new(field, true);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
|
||||
@@ -26,3 +26,137 @@ pub use self::serializer::FieldNormsSerializer;
|
||||
pub use self::writer::FieldNormsWriter;
|
||||
|
||||
use self::code::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::directory::CompositeFile;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::fieldnorm::FieldNormsSerializer;
|
||||
use crate::fieldnorm::FieldNormsWriter;
|
||||
use crate::query::Query;
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::TextFieldIndexing;
|
||||
use crate::schema::TextOptions;
|
||||
use crate::schema::TEXT;
|
||||
use crate::Index;
|
||||
use crate::Term;
|
||||
use crate::TERMINATED;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::path::Path;
|
||||
|
||||
use crate::schema::{Field, Schema, STORED};
|
||||
|
||||
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("field", STORED);
|
||||
schema_builder.add_text_field("txt_field", TEXT);
|
||||
schema_builder.add_text_field(
|
||||
"str_field",
|
||||
TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(IndexRecordOption::Basic)
|
||||
.set_fieldnorms(false),
|
||||
),
|
||||
);
|
||||
schema_builder.build()
|
||||
});
|
||||
|
||||
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
|
||||
pub static TXT_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("txt_field").unwrap());
|
||||
pub static STR_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("str_field").unwrap());
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Cannot register a given fieldnorm twice")]
|
||||
pub fn test_should_panic_when_recording_fieldnorm_twice_for_same_doc() {
|
||||
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
|
||||
fieldnorm_writers.record(0u32, *TXT_FIELD, 5);
|
||||
fieldnorm_writers.record(0u32, *TXT_FIELD, 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_fieldnorm() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test"))?;
|
||||
let serializer = FieldNormsSerializer::from_write(write)?;
|
||||
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
|
||||
fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
|
||||
fieldnorm_writers.record(3u32, *TXT_FIELD, 3);
|
||||
fieldnorm_writers.serialize(serializer, None)?;
|
||||
}
|
||||
let file = directory.open_read(&path)?;
|
||||
{
|
||||
let fields_composite = CompositeFile::open(&file)?;
|
||||
assert!(fields_composite.open_read(*FIELD).is_none());
|
||||
assert!(fields_composite.open_read(*STR_FIELD).is_none());
|
||||
let data = fields_composite.open_read(*TXT_FIELD).unwrap();
|
||||
let fieldnorm_reader = FieldNormReader::open(data)?;
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0u32), 0u32);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1u32), 0u32);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2u32), 5u32);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(3u32), 3u32);
|
||||
}
|
||||
Ok(())
|
||||
}

#[test]
fn test_fieldnorm_disabled() -> crate::Result<()> {
    let mut schema_builder = Schema::builder();
    let text_options = TextOptions::default()
        .set_indexing_options(TextFieldIndexing::default().set_fieldnorms(false));
    let text = schema_builder.add_text_field("text", text_options);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let mut writer = index.writer_for_tests()?;
    writer.add_document(doc!(text=>"hello"))?;
    writer.add_document(doc!(text=>"hello hello hello"))?;
    writer.commit()?;
    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query = TermQuery::new(
        Term::from_field_text(text, "hello"),
        IndexRecordOption::WithFreqs,
    );
    let weight = query.weight(&*searcher, true)?;
    let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
    assert_eq!(scorer.doc(), 0);
    assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
    assert_eq!(scorer.advance(), 1);
    assert_eq!(scorer.doc(), 1);
    assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
    assert_eq!(scorer.advance(), TERMINATED);
    Ok(())
}
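
For readers less familiar with the low-level `Weight`/`Scorer` API driven in these tests: the equivalent high-level call, reusing the `searcher` and `query` built above, would be the following sketch (`TopDocs` is tantivy's standard top-k collector).

```rust
use crate::collector::TopDocs;

// Same query, but going through Searcher::search with a collector
// instead of advancing the scorer manually.
let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;
for (score, doc_address) in top_docs {
    println!("score={} doc={:?}", score, doc_address);
}
```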

#[test]
fn test_fieldnorm_enabled() -> crate::Result<()> {
    let mut schema_builder = Schema::builder();
    let text_options = TextOptions::default()
        .set_indexing_options(TextFieldIndexing::default().set_fieldnorms(true));
    let text = schema_builder.add_text_field("text", text_options);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let mut writer = index.writer_for_tests()?;
    writer.add_document(doc!(text=>"hello"))?;
    writer.add_document(doc!(text=>"hello hello hello"))?;
    writer.commit()?;
    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query = TermQuery::new(
        Term::from_field_text(text, "hello"),
        IndexRecordOption::WithFreqs,
    );
    let weight = query.weight(&*searcher, true)?;
    let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
    assert_eq!(scorer.doc(), 0);
    assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
    assert_eq!(scorer.advance(), 1);
    assert_eq!(scorer.doc(), 1);
    assert!((scorer.score() - 0.15136132).abs() < 0.001f32);
    assert_eq!(scorer.advance(), TERMINATED);
    Ok(())
}
}
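
As a companion to these tests, a hedged sketch of how fieldnorms are read back from a live index outside the test harness; `searcher` and `txt_field` are assumed to come from an index built with a fieldnorm-enabled text field.

```rust
// Assumes `searcher: &Searcher` and `txt_field: Field` for an indexed text field.
let segment_reader = searcher.segment_reader(0);
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(txt_field)?;
// One byte per document; documents that never saw the field read back as 0.
let norm = fieldnorm_reader.fieldnorm(0u32);
println!("fieldnorm of doc 0: {}", norm);
```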
|
||||
|
||||
@@ -4,6 +4,7 @@ use super::fieldnorm_to_id;
|
||||
use super::FieldNormsSerializer;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::Schema;
|
||||
use std::cmp::Ordering;
|
||||
use std::{io, iter};
|
||||
|
||||
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
|
||||
@@ -12,8 +13,7 @@ use std::{io, iter};
|
||||
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
|
||||
/// byte per document per field.
|
||||
pub struct FieldNormsWriter {
|
||||
fields: Vec<Field>,
|
||||
fieldnorms_buffer: Vec<Vec<u8>>,
|
||||
fieldnorms_buffers: Vec<Option<Vec<u8>>>,
|
||||
}
|
||||
|
||||
impl FieldNormsWriter {
|
||||
@@ -23,7 +23,7 @@ impl FieldNormsWriter {
|
||||
schema
|
||||
.fields()
|
||||
.filter_map(|(field, field_entry)| {
|
||||
if field_entry.is_indexed() {
|
||||
if field_entry.is_indexed() && field_entry.has_fieldnorms() {
|
||||
Some(field)
|
||||
} else {
|
||||
None
|
||||
@@ -35,25 +35,20 @@ impl FieldNormsWriter {
|
||||
/// Initialize with state for tracking the field norm fields
|
||||
/// specified in the schema.
|
||||
pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
|
||||
let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
|
||||
let max_field = fields
|
||||
.iter()
|
||||
.map(Field::field_id)
|
||||
.max()
|
||||
.map(|max_field_id| max_field_id as usize + 1)
|
||||
.unwrap_or(0);
|
||||
FieldNormsWriter {
|
||||
fields,
|
||||
fieldnorms_buffer: iter::repeat_with(Vec::new)
|
||||
.take(max_field)
|
||||
.collect::<Vec<_>>(),
|
||||
let mut fieldnorms_buffers: Vec<Option<Vec<u8>>> = iter::repeat_with(|| None)
|
||||
.take(schema.num_fields())
|
||||
.collect();
|
||||
for field in FieldNormsWriter::fields_with_fieldnorm(schema) {
|
||||
fieldnorms_buffers[field.field_id() as usize] = Some(Vec::with_capacity(1_000));
|
||||
}
|
||||
FieldNormsWriter { fieldnorms_buffers }
|
||||
}
|
||||
|
||||
/// The memory used, including child buffers.
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.fieldnorms_buffer
|
||||
self.fieldnorms_buffers
|
||||
.iter()
|
||||
.flatten()
|
||||
.map(|buf| buf.capacity())
|
||||
.sum()
|
||||
}
|
||||
@@ -62,8 +57,10 @@ impl FieldNormsWriter {
|
||||
///
|
||||
/// Will extend with 0-bytes for documents that have not been seen.
|
||||
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
|
||||
for field in self.fields.iter() {
|
||||
self.fieldnorms_buffer[field.field_id() as usize].resize(max_doc as usize, 0u8);
|
||||
for fieldnorms_buffer_opt in self.fieldnorms_buffers.iter_mut() {
|
||||
if let Some(fieldnorms_buffer) = fieldnorms_buffer_opt.as_mut() {
|
||||
fieldnorms_buffer.resize(max_doc as usize, 0u8);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,14 +73,23 @@ impl FieldNormsWriter {
|
||||
/// * field - the field being set
|
||||
/// * fieldnorm - the number of terms present in document `doc` in field `field`
|
||||
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
|
||||
let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.field_id() as usize];
|
||||
assert!(
|
||||
fieldnorm_buffer.len() <= doc as usize,
|
||||
"Cannot register a given fieldnorm twice"
|
||||
);
|
||||
// we fill intermediary `DocId` as having a fieldnorm of 0.
|
||||
fieldnorm_buffer.resize(doc as usize + 1, 0u8);
|
||||
fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
|
||||
if let Some(fieldnorm_buffer) = self
|
||||
.fieldnorms_buffers
|
||||
.get_mut(field.field_id() as usize)
|
||||
.and_then(Option::as_mut)
|
||||
{
|
||||
match fieldnorm_buffer.len().cmp(&(doc as usize)) {
|
||||
Ordering::Less => {
|
||||
// we fill intermediary `DocId` as having a fieldnorm of 0.
|
||||
fieldnorm_buffer.resize(doc as usize, 0u8);
|
||||
}
|
||||
Ordering::Equal => {}
|
||||
Ordering::Greater => {
|
||||
panic!("Cannot register a given fieldnorm twice")
|
||||
}
|
||||
}
|
||||
fieldnorm_buffer.push(fieldnorm_to_id(fieldnorm));
|
||||
}
|
||||
}
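
To make the contract enforced by this `match` concrete, a small hypothetical call sequence (reusing the `SCHEMA` and `TXT_FIELD` test statics from the fieldnorm test module shown earlier):

```rust
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(0u32, *TXT_FIELD, 4); // ok: first doc
fieldnorm_writers.record(5u32, *TXT_FIELD, 2); // ok: docs 1..=4 get an implicit fieldnorm of 0
// fieldnorm_writers.record(5u32, *TXT_FIELD, 1); // would panic: "Cannot register a given fieldnorm twice"
```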
|
||||
|
||||
/// Serialize the seen fieldnorm values to the serializer for all fields.
|
||||
@@ -92,17 +98,18 @@ impl FieldNormsWriter {
|
||||
mut fieldnorms_serializer: FieldNormsSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
for &field in self.fields.iter() {
|
||||
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
|
||||
for (field, fieldnorms_buffer) in self.fieldnorms_buffers.iter().enumerate().filter_map(
|
||||
|(field_id, fieldnorms_buffer_opt)| {
|
||||
fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| {
|
||||
(Field::from_field_id(field_id as u32), fieldnorms_buffer)
|
||||
})
|
||||
},
|
||||
) {
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
let mut mapped_fieldnorm_values = vec![];
|
||||
mapped_fieldnorm_values.resize(fieldnorm_values.len(), 0u8);
|
||||
for (new_doc_id, old_doc_id) in doc_id_map.iter_old_doc_ids().enumerate() {
|
||||
mapped_fieldnorm_values[new_doc_id] = fieldnorm_values[old_doc_id as usize];
|
||||
}
|
||||
fieldnorms_serializer.serialize_field(field, &mapped_fieldnorm_values)?;
|
||||
let remapped_fieldnorm_buffer = doc_id_map.remap(fieldnorms_buffer);
|
||||
fieldnorms_serializer.serialize_field(field, &remapped_fieldnorm_buffer)?;
|
||||
} else {
|
||||
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
|
||||
fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?;
|
||||
}
|
||||
}
|
||||
fieldnorms_serializer.close()?;
|
||||
|
||||
@@ -63,6 +63,24 @@ pub struct DocIdMapping {
|
||||
}
|
||||
|
||||
impl DocIdMapping {
|
||||
pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
|
||||
let max_doc = new_doc_id_to_old.len();
|
||||
let old_max_doc = new_doc_id_to_old
|
||||
.iter()
|
||||
.cloned()
|
||||
.max()
|
||||
.map(|n| n + 1)
|
||||
.unwrap_or(0);
|
||||
let mut old_doc_id_to_new = vec![0; old_max_doc as usize];
|
||||
for i in 0..max_doc {
|
||||
old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
|
||||
}
|
||||
DocIdMapping {
|
||||
new_doc_id_to_old,
|
||||
old_doc_id_to_new,
|
||||
}
|
||||
}
|
||||
|
||||
/// returns the new doc_id for the old doc_id
|
||||
pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
|
||||
self.old_doc_id_to_new[doc_id as usize]
|
||||
@@ -75,6 +93,13 @@ impl DocIdMapping {
|
||||
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
|
||||
self.new_doc_id_to_old.iter().cloned()
|
||||
}
|
||||
/// Remaps a given array to the new doc ids.
|
||||
pub fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
|
||||
self.new_doc_id_to_old
|
||||
.iter()
|
||||
.map(|old_doc| els[*old_doc as usize])
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn expect_field_id_for_sort_field(
|
||||
@@ -122,23 +147,13 @@ pub(crate) fn get_doc_id_mapping_from_field(
|
||||
.into_iter()
|
||||
.map(|el| el.0)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// create old doc_id to new doc_id index (used in posting recorder)
|
||||
let max_doc = new_doc_id_to_old.len();
|
||||
let mut old_doc_id_to_new = vec![0; max_doc];
|
||||
for i in 0..max_doc {
|
||||
old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
|
||||
}
|
||||
let doc_id_map = DocIdMapping {
|
||||
new_doc_id_to_old,
|
||||
old_doc_id_to_new,
|
||||
};
|
||||
Ok(doc_id_map)
|
||||
Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests_indexsorting {
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::{collector::TopDocs, query::QueryParser, schema::*};
|
||||
use crate::{schema::Schema, DocAddress};
|
||||
use crate::{Index, IndexSettings, IndexSortByField, Order};
|
||||
@@ -475,4 +490,27 @@ mod tests_indexsorting {
|
||||
assert_eq!(vals, &[3]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_mapping() {
|
||||
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![3, 2, 5]);
|
||||
assert_eq!(doc_mapping.get_old_doc_id(0), 3);
|
||||
assert_eq!(doc_mapping.get_old_doc_id(1), 2);
|
||||
assert_eq!(doc_mapping.get_old_doc_id(2), 5);
|
||||
assert_eq!(doc_mapping.get_new_doc_id(0), 0);
|
||||
assert_eq!(doc_mapping.get_new_doc_id(1), 0);
|
||||
assert_eq!(doc_mapping.get_new_doc_id(2), 1);
|
||||
assert_eq!(doc_mapping.get_new_doc_id(3), 0);
|
||||
assert_eq!(doc_mapping.get_new_doc_id(4), 0);
|
||||
assert_eq!(doc_mapping.get_new_doc_id(5), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_mapping_remap() {
|
||||
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![2, 8, 3]);
|
||||
assert_eq!(
|
||||
&doc_mapping.remap(&[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]),
|
||||
&[2000, 8000, 3000]
|
||||
);
|
||||
}
|
||||
}
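
A short sketch (not part of the patch) of how the new constructor and `remap` compose; the doc ids and values are arbitrary.

```rust
use crate::indexer::doc_id_mapping::DocIdMapping;

// new_doc_id_to_old[i] is the old doc id that lands at new position i.
let mapping = DocIdMapping::from_new_id_to_old_id(vec![2, 0, 1]);
assert_eq!(mapping.get_old_doc_id(0), 2);
assert_eq!(mapping.get_new_doc_id(2), 0);
// Values laid out per old doc id (e.g. a fieldnorm or fast field column)...
let old_values = [10u64, 20, 30];
// ...come out in new doc id order after remap().
assert_eq!(mapping.remap(&old_values), vec![30, 10, 20]);
```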
|
||||
|
||||
@@ -553,7 +553,7 @@ impl IndexWriter {
|
||||
// marks the segment updater as killed. From now on, all
|
||||
// segment updates will be ignored.
|
||||
self.segment_updater.kill();
|
||||
let document_receiver = self.operation_receiver();
|
||||
let document_receiver_res = self.operation_receiver();
|
||||
|
||||
// take the directory lock to create a new index_writer.
|
||||
let directory_lock = self
|
||||
@@ -579,7 +579,9 @@ impl IndexWriter {
|
||||
//
|
||||
// This will reach an end as the only document_sender
|
||||
// was dropped with the index_writer.
|
||||
for _ in document_receiver {}
|
||||
if let Ok(document_receiver) = document_receiver_res {
|
||||
for _ in document_receiver {}
|
||||
}
|
||||
|
||||
Ok(self.committed_opstamp)
|
||||
}
|
||||
@@ -796,6 +798,7 @@ mod tests {
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::FacetOptions;
|
||||
use crate::schema::IntOptions;
|
||||
use crate::schema::TextFieldIndexing;
|
||||
use crate::schema::TextOptions;
|
||||
@@ -971,7 +974,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \
|
||||
level_log_size: 0.75 }"
|
||||
level_log_size: 0.75, del_docs_ratio_before_merge: 1.0 }"
|
||||
);
|
||||
let merge_policy = Box::new(NoMergePolicy::default());
|
||||
index_writer.set_merge_policy(merge_policy);
|
||||
@@ -1417,7 +1420,7 @@ mod tests {
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_stored(),
|
||||
);
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let settings = if sort_index {
|
||||
IndexSettings {
|
||||
|
||||
@@ -22,7 +22,7 @@ impl IndexWriterStatus {
|
||||
.receive_channel
|
||||
.read()
|
||||
.expect("This lock should never be poisoned");
|
||||
rlock.as_ref().map(|receiver| receiver.clone())
|
||||
rlock.as_ref().cloned()
|
||||
}
|
||||
|
||||
/// Create an index writer bomb.
|
||||
|
||||
@@ -2,12 +2,15 @@ use super::merge_policy::{MergeCandidate, MergePolicy};
|
||||
use crate::core::SegmentMeta;
|
||||
use itertools::Itertools;
|
||||
use std::cmp;
|
||||
use std::f64;
|
||||
|
||||
const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
|
||||
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
|
||||
const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8;
|
||||
const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000;
|
||||
// The default value of 1.0 means that deletes are not taken into account when
// identifying merge candidates. This is not a very sensible default: it was
// chosen for backward compatibility and may change in the near future.
|
||||
const DEFAULT_DEL_DOCS_RATIO_BEFORE_MERGE: f32 = 1.0f32;
|
||||
|
||||
/// `LogMergePolicy` tries to merge segments that have a similar number of
|
||||
/// documents.
|
||||
@@ -17,6 +20,7 @@ pub struct LogMergePolicy {
|
||||
max_docs_before_merge: usize,
|
||||
min_layer_size: u32,
|
||||
level_log_size: f64,
|
||||
del_docs_ratio_before_merge: f32,
|
||||
}
|
||||
|
||||
impl LogMergePolicy {
|
||||
@@ -52,19 +56,49 @@ impl LogMergePolicy {
|
||||
pub fn set_level_log_size(&mut self, level_log_size: f64) {
|
||||
self.level_log_size = level_log_size;
|
||||
}
|
||||
|
||||
/// Set the ratio of deleted documents in a segment to tolerate.
|
||||
///
|
||||
/// If it is exceeded by any segment at a log level, a merge
|
||||
/// will be triggered for that level.
|
||||
///
|
||||
/// If there is a single segment at a level, we effectively end up expunging
|
||||
/// deleted documents from it.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if del_docs_ratio_before_merge is not within (0..1].
|
||||
pub fn set_del_docs_ratio_before_merge(&mut self, del_docs_ratio_before_merge: f32) {
|
||||
assert!(del_docs_ratio_before_merge <= 1.0f32);
|
||||
assert!(del_docs_ratio_before_merge > 0f32);
|
||||
self.del_docs_ratio_before_merge = del_docs_ratio_before_merge;
|
||||
}
|
||||
|
||||
fn has_segment_above_deletes_threshold(&self, level: &[&SegmentMeta]) -> bool {
|
||||
level
|
||||
.iter()
|
||||
.any(|segment| deletes_ratio(segment) > self.del_docs_ratio_before_merge)
|
||||
}
|
||||
}
|
||||
|
||||
fn deletes_ratio(segment: &SegmentMeta) -> f32 {
|
||||
if segment.max_doc() == 0 {
|
||||
return 0f32;
|
||||
}
|
||||
segment.num_deleted_docs() as f32 / segment.max_doc() as f32
|
||||
}
|
||||
|
||||
impl MergePolicy for LogMergePolicy {
|
||||
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
|
||||
let mut size_sorted_segments = segments
|
||||
let size_sorted_segments = segments
|
||||
.iter()
|
||||
.filter(|segment_meta| segment_meta.num_docs() <= (self.max_docs_before_merge as u32))
|
||||
.filter(|seg| seg.num_docs() <= (self.max_docs_before_merge as u32))
|
||||
.sorted_by_key(|seg| std::cmp::Reverse(seg.max_doc()))
|
||||
.collect::<Vec<&SegmentMeta>>();
|
||||
|
||||
if size_sorted_segments.len() <= 1 {
|
||||
if size_sorted_segments.is_empty() {
|
||||
return vec![];
|
||||
}
|
||||
size_sorted_segments.sort_by_key(|seg| std::cmp::Reverse(seg.num_docs()));
|
||||
|
||||
let mut current_max_log_size = f64::MAX;
|
||||
let mut levels = vec![];
|
||||
@@ -82,7 +116,10 @@ impl MergePolicy for LogMergePolicy {
|
||||
|
||||
levels
|
||||
.iter()
|
||||
.filter(|level| level.len() >= self.min_num_segments)
|
||||
.filter(|level| {
|
||||
level.len() >= self.min_num_segments
|
||||
|| self.has_segment_above_deletes_threshold(level)
|
||||
})
|
||||
.map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect()))
|
||||
.collect()
|
||||
}
|
||||
@@ -95,6 +132,7 @@ impl Default for LogMergePolicy {
|
||||
max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE,
|
||||
min_layer_size: DEFAULT_MIN_LAYER_SIZE,
|
||||
level_log_size: DEFAULT_LEVEL_LOG_SIZE,
|
||||
del_docs_ratio_before_merge: DEFAULT_DEL_DOCS_RATIO_BEFORE_MERGE,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -288,4 +326,49 @@ mod tests {
|
||||
assert_eq!(result_list[0].0[1], test_input[4].id());
|
||||
assert_eq!(result_list[0].0[2], test_input[5].id());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_single_segment_with_deletes_below_threshold() {
|
||||
let mut test_merge_policy = test_merge_policy();
|
||||
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
|
||||
let test_input = vec![create_random_segment_meta(40_000).with_delete_meta(10_000, 1)];
|
||||
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
|
||||
assert!(merge_candidates.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_single_segment_with_deletes_above_threshold() {
|
||||
let mut test_merge_policy = test_merge_policy();
|
||||
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
|
||||
let test_input = vec![create_random_segment_meta(40_000).with_delete_meta(10_001, 1)];
|
||||
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
|
||||
assert_eq!(merge_candidates.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_segments_with_deletes_above_threshold_all_in_level() {
|
||||
let mut test_merge_policy = test_merge_policy();
|
||||
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
|
||||
let test_input = vec![
|
||||
create_random_segment_meta(40_000).with_delete_meta(10_001, 1),
|
||||
create_random_segment_meta(40_000),
|
||||
];
|
||||
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
|
||||
assert_eq!(merge_candidates.len(), 1);
|
||||
assert_eq!(merge_candidates[0].0.len(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_segments_with_deletes_above_threshold_different_level_not_involved() {
|
||||
let mut test_merge_policy = test_merge_policy();
|
||||
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
|
||||
let test_input = vec![
|
||||
create_random_segment_meta(100),
|
||||
create_random_segment_meta(40_000).with_delete_meta(10_001, 1),
|
||||
];
|
||||
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
|
||||
assert_eq!(merge_candidates.len(), 1);
|
||||
assert_eq!(merge_candidates[0].0.len(), 1);
|
||||
assert_eq!(merge_candidates[0].0[0], test_input[1].id());
|
||||
}
|
||||
}
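
For downstream users, a hedged sketch of opting into delete-triggered merges with the new setter; the 0.3 threshold is an arbitrary example, and 1.0 (the default) preserves the previous behavior.

```rust
use tantivy::merge_policy::LogMergePolicy;
use tantivy::IndexWriter;

// Assumes an already-created IndexWriter.
fn enable_delete_triggered_merges(index_writer: &IndexWriter) {
    let mut merge_policy = LogMergePolicy::default();
    // Merge a level as soon as any of its segments exceeds 30% deleted documents.
    merge_policy.set_del_docs_ratio_before_merge(0.3f32);
    index_writer.set_merge_policy(Box::new(merge_policy));
}
```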
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::Opstamp;
|
||||
use crate::SegmentId;
|
||||
use census::{Inventory, TrackedObject};
|
||||
use crate::{Inventory, TrackedObject};
|
||||
use std::collections::HashSet;
|
||||
use std::ops::Deref;
|
||||
|
||||
|
||||
@@ -41,31 +41,54 @@ use tantivy_bitpacker::minmax;
|
||||
/// We do not allow segments with more than
|
||||
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
|
||||
|
||||
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
|
||||
let mut total_tokens = 0u64;
|
||||
let mut count: [usize; 256] = [0; 256];
|
||||
for reader in readers {
|
||||
if reader.has_deletes() {
|
||||
// if there are deletes, then we use an approximation
|
||||
// using the fieldnorm
|
||||
let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
|
||||
for doc in reader.doc_ids_alive() {
|
||||
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
|
||||
count[fieldnorm_id as usize] += 1;
|
||||
}
|
||||
} else {
|
||||
total_tokens += reader.inverted_index(field)?.total_num_tokens();
|
||||
}
|
||||
fn estimate_total_num_tokens_in_single_segment(
|
||||
reader: &SegmentReader,
|
||||
field: Field,
|
||||
) -> crate::Result<u64> {
|
||||
// If there are no deletes, we simply use the value stored in the posting list.
// Note that even this value is not necessarily exact, as it could itself be the
// result of a merge of segments that contained deletes.
|
||||
if !reader.has_deletes() {
|
||||
return Ok(reader.inverted_index(field)?.total_num_tokens());
|
||||
}
|
||||
Ok(total_tokens
|
||||
+ count
|
||||
|
||||
// When there are deletes, we approximate the total number of tokens
// using the fieldnorms.
|
||||
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? {
|
||||
let mut count: [usize; 256] = [0; 256];
|
||||
for doc in reader.doc_ids_alive() {
|
||||
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
|
||||
count[fieldnorm_id as usize] += 1;
|
||||
}
|
||||
let total_num_tokens = count
|
||||
.iter()
|
||||
.cloned()
|
||||
.enumerate()
|
||||
.map(|(fieldnorm_ord, count)| {
|
||||
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
|
||||
})
|
||||
.sum::<u64>())
|
||||
.sum::<u64>();
|
||||
return Ok(total_num_tokens);
|
||||
}
|
||||
|
||||
// There are no fieldnorms available.
// Here we just do a pro-rata of the overall number of tokens with the ratio of
// documents still alive.
|
||||
let segment_num_tokens = reader.inverted_index(field)?.total_num_tokens();
|
||||
if reader.max_doc() == 0 {
|
||||
// That supposedly never happens, but let's be a bit defensive here.
|
||||
return Ok(0u64);
|
||||
}
|
||||
let ratio = reader.num_docs() as f64 / reader.max_doc() as f64;
|
||||
Ok((segment_num_tokens as f64 * ratio) as u64)
|
||||
}
|
||||
|
||||
fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
|
||||
let mut total_num_tokens: u64 = 0;
|
||||
for reader in readers {
|
||||
total_num_tokens += estimate_total_num_tokens_in_single_segment(reader, field)?;
|
||||
}
|
||||
Ok(total_num_tokens)
|
||||
}
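
To make the pro-rata fallback concrete, a tiny worked example with made-up numbers.

```rust
// Hypothetical segment: 1_000_000 tokens recorded for the field,
// 80_000 of 100_000 documents still alive, and no fieldnorms stored.
let segment_num_tokens = 1_000_000u64;
let (num_docs, max_doc) = (80_000u32, 100_000u32);
let ratio = num_docs as f64 / max_doc as f64; // 0.8
let estimate = (segment_num_tokens as f64 * ratio) as u64;
assert_eq!(estimate, 800_000);
```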
|
||||
|
||||
pub struct IndexMerger {
|
||||
@@ -271,10 +294,10 @@ impl IndexMerger {
|
||||
for (field, field_entry) in self.schema.fields() {
|
||||
let field_type = field_entry.field_type();
|
||||
match field_type {
|
||||
FieldType::HierarchicalFacet(_) => {
|
||||
FieldType::Facet(_) => {
|
||||
let term_ordinal_mapping = term_ord_mappings
|
||||
.remove(&field)
|
||||
.expect("Logic Error in Tantivy (Please report). HierarchicalFact field should have required a\
|
||||
.expect("Logic Error in Tantivy (Please report). Facet field should have required a\
|
||||
`term_ordinal_mapping`.");
|
||||
self.write_hierarchical_facet_field(
|
||||
field,
|
||||
@@ -317,14 +340,13 @@ impl IndexMerger {
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &SegmentDocIdMapping,
|
||||
) -> crate::Result<()> {
|
||||
let (min_value, max_value) = self.readers.iter().map(|reader|{
|
||||
let (min_value, max_value) = self.readers.iter().filter_map(|reader|{
|
||||
let u64_reader: DynamicFastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(field)
|
||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||
compute_min_max_val(&u64_reader, reader)
|
||||
})
|
||||
.flatten()
|
||||
.reduce(|a, b| {
|
||||
(a.0.min(b.0), a.1.max(b.1))
|
||||
}).expect("Unexpected error, empty readers in IndexMerger");
|
||||
@@ -634,12 +656,11 @@ impl IndexMerger {
|
||||
self.readers
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(reader_ordinal, reader)| {
|
||||
.flat_map(|(reader_ordinal, reader)| {
|
||||
reader
|
||||
.doc_ids_alive()
|
||||
.map(move |doc_id| (doc_id, reader_ordinal as SegmentOrdinal))
|
||||
})
|
||||
.flatten(),
|
||||
}),
|
||||
);
|
||||
Ok(SegmentDocIdMapping::new(mapping, true))
|
||||
}
|
||||
@@ -737,24 +758,18 @@ impl IndexMerger {
|
||||
fast_field_readers: &ff_readers,
|
||||
offsets,
|
||||
};
|
||||
let iter1 = doc_id_mapping
|
||||
.iter()
|
||||
.map(|(doc_id, reader_ordinal)| {
|
||||
let ff_reader = &ff_readers[*reader_ordinal as usize];
|
||||
let mut vals = vec![];
|
||||
ff_reader.get_vals(*doc_id, &mut vals);
|
||||
vals.into_iter()
|
||||
})
|
||||
.flatten();
|
||||
let iter2 = doc_id_mapping
|
||||
.iter()
|
||||
.map(|(doc_id, reader_ordinal)| {
|
||||
let ff_reader = &ff_readers[*reader_ordinal as usize];
|
||||
let mut vals = vec![];
|
||||
ff_reader.get_vals(*doc_id, &mut vals);
|
||||
vals.into_iter()
|
||||
})
|
||||
.flatten();
|
||||
let iter1 = doc_id_mapping.iter().flat_map(|(doc_id, reader_ordinal)| {
|
||||
let ff_reader = &ff_readers[*reader_ordinal as usize];
|
||||
let mut vals = vec![];
|
||||
ff_reader.get_vals(*doc_id, &mut vals);
|
||||
vals.into_iter()
|
||||
});
|
||||
let iter2 = doc_id_mapping.iter().flat_map(|(doc_id, reader_ordinal)| {
|
||||
let ff_reader = &ff_readers[*reader_ordinal as usize];
|
||||
let mut vals = vec![];
|
||||
ff_reader.get_vals(*doc_id, &mut vals);
|
||||
vals.into_iter()
|
||||
});
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
|
||||
field,
|
||||
stats,
|
||||
@@ -829,7 +844,7 @@ impl IndexMerger {
|
||||
}
|
||||
|
||||
let mut term_ord_mapping_opt = match field_type {
|
||||
FieldType::HierarchicalFacet(_) => Some(TermOrdinalMapping::new(max_term_ords)),
|
||||
FieldType::Facet(_) => Some(TermOrdinalMapping::new(max_term_ords)),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
@@ -851,10 +866,9 @@ impl IndexMerger {
|
||||
segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
|
||||
}
|
||||
|
||||
// The total number of tokens will only be exact when there has been no deletes.
|
||||
//
|
||||
// Otherwise, we approximate by removing deleted documents proportionally.
|
||||
let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field)?;
|
||||
// Note that the total number of tokens is not exact.
|
||||
// It is only used as a parameter in the BM25 formula.
|
||||
let total_num_tokens: u64 = estimate_total_num_tokens(&self.readers, indexed_field)?;
|
||||
|
||||
// Create the total list of doc ids
|
||||
// by stacking the doc ids from the different segment.
|
||||
@@ -1118,13 +1132,13 @@ mod tests {
|
||||
use crate::query::BooleanQuery;
|
||||
use crate::query::Scorer;
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::IntOptions;
|
||||
use crate::schema::Term;
|
||||
use crate::schema::TextFieldIndexing;
|
||||
use crate::schema::{Cardinality, TEXT};
|
||||
use crate::schema::{Document, FacetOptions};
|
||||
use crate::DocAddress;
|
||||
use crate::IndexSettings;
|
||||
use crate::IndexSortByField;
|
||||
@@ -1650,7 +1664,7 @@ mod tests {
|
||||
// ranges between segments so that merge algorithm can't apply certain optimizations
|
||||
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let int_options = IntOptions::default()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
.set_indexed();
|
||||
|
||||
@@ -1,22 +1,17 @@
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::collector::TopDocs;
|
||||
use crate::core::Index;
|
||||
use crate::fastfield::MultiValuedFastFieldReader;
|
||||
use crate::fastfield::{AliveBitSet, FastFieldReader};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::{
|
||||
collector::TopDocs,
|
||||
schema::{Cardinality, TextFieldIndexing},
|
||||
};
|
||||
use crate::{core::Index, fastfield::MultiValuedFastFieldReader};
|
||||
use crate::{
|
||||
query::QueryParser,
|
||||
schema::{IntOptions, TextOptions},
|
||||
};
|
||||
use crate::{schema::Facet, IndexSortByField};
|
||||
use crate::{schema::INDEXED, Order};
|
||||
use crate::{
|
||||
schema::{self, BytesOptions},
|
||||
DocAddress,
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{
|
||||
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, TextFieldIndexing,
|
||||
};
|
||||
use crate::schema::{IntOptions, TextOptions};
|
||||
use crate::DocAddress;
|
||||
use crate::IndexSortByField;
|
||||
use crate::Order;
|
||||
use crate::{DocSet, IndexSettings, Postings, Term};
|
||||
use futures::executor::block_on;
|
||||
|
||||
@@ -27,7 +22,7 @@ mod tests {
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
@@ -79,7 +74,7 @@ mod tests {
|
||||
|
||||
let bytes_options = BytesOptions::default().set_fast().set_indexed();
|
||||
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
|
||||
let multi_numbers = schema_builder.add_u64_field(
|
||||
"multi_numbers",
|
||||
@@ -521,7 +516,7 @@ mod bench_sorted_index_merge {
|
||||
let index_doc = |index_writer: &mut IndexWriter, val: u64| {
|
||||
let mut doc = Document::default();
|
||||
doc.add_u64(int_field, val);
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc).unwrap();
|
||||
};
|
||||
// 3 segments with 10_000 values in the fast fields
|
||||
for _ in 0..3 {
|
||||
|
||||
@@ -66,13 +66,10 @@ impl SegmentRegister {
|
||||
}
|
||||
|
||||
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
|
||||
let mut segment_ids: Vec<SegmentMeta> = self
|
||||
.segment_states
|
||||
self.segment_states
|
||||
.values()
|
||||
.map(|segment_entry| segment_entry.meta().clone())
|
||||
.collect();
|
||||
segment_ids.sort_by_key(SegmentMeta::id);
|
||||
segment_ids
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool {
|
||||
|
||||
@@ -61,7 +61,9 @@ pub fn save_new_metas(
|
||||
payload: None,
|
||||
},
|
||||
directory,
|
||||
)
|
||||
)?;
|
||||
directory.sync_directory()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Save the index meta file.
|
||||
@@ -82,6 +84,7 @@ fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()>
|
||||
io::ErrorKind::Other,
|
||||
msg.unwrap_or_else(|| "Undefined".to_string())
|
||||
))));
|
||||
directory.sync_directory()?;
|
||||
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
|
||||
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
|
||||
Ok(())
|
||||
|
||||
@@ -2,7 +2,6 @@ use super::{
|
||||
doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping},
|
||||
operation::AddOperation,
|
||||
};
|
||||
use crate::fastfield::FastFieldsWriter;
|
||||
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::postings::compute_table_size;
|
||||
@@ -18,6 +17,7 @@ use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
|
||||
use crate::tokenizer::{TokenStreamChain, Tokenizer};
|
||||
use crate::Opstamp;
|
||||
use crate::{core::Segment, store::StoreWriter};
|
||||
use crate::{fastfield::FastFieldsWriter, schema::Type};
|
||||
use crate::{DocId, SegmentComponent};
|
||||
|
||||
/// Computes the initial size of the hash table.
|
||||
@@ -173,18 +173,11 @@ impl SegmentWriter {
|
||||
let (term_buffer, multifield_postings) =
|
||||
(&mut self.term_buffer, &mut self.multifield_postings);
|
||||
match *field_entry.field_type() {
|
||||
FieldType::HierarchicalFacet(_) => {
|
||||
term_buffer.set_field(field);
|
||||
let facets =
|
||||
field_values
|
||||
.iter()
|
||||
.flat_map(|field_value| match *field_value.value() {
|
||||
Value::Facet(ref facet) => Some(facet.encoded_str()),
|
||||
_ => {
|
||||
panic!("Expected hierarchical facet");
|
||||
}
|
||||
});
|
||||
for facet_str in facets {
|
||||
FieldType::Facet(_) => {
|
||||
term_buffer.set_field(Type::Facet, field);
|
||||
for field_value in field_values {
|
||||
let facet = field_value.value().facet().ok_or_else(make_schema_error)?;
|
||||
let facet_str = facet.encoded_str();
|
||||
let mut unordered_term_id_opt = None;
|
||||
FacetTokenizer
|
||||
.token_stream(facet_str)
|
||||
@@ -241,12 +234,11 @@ impl SegmentWriter {
|
||||
term_buffer,
|
||||
)
|
||||
};
|
||||
|
||||
self.fieldnorms_writer.record(doc_id, field, num_tokens);
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
for field_value in field_values {
|
||||
term_buffer.set_field(field_value.field());
|
||||
term_buffer.set_field(Type::U64, field_value.field());
|
||||
let u64_val = field_value
|
||||
.value()
|
||||
.u64_value()
|
||||
@@ -257,7 +249,7 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::Date(_) => {
|
||||
for field_value in field_values {
|
||||
term_buffer.set_field(field_value.field());
|
||||
term_buffer.set_field(Type::Date, field_value.field());
|
||||
let date_val = field_value
|
||||
.value()
|
||||
.date_value()
|
||||
@@ -268,7 +260,7 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::I64(_) => {
|
||||
for field_value in field_values {
|
||||
term_buffer.set_field(field_value.field());
|
||||
term_buffer.set_field(Type::I64, field_value.field());
|
||||
let i64_val = field_value
|
||||
.value()
|
||||
.i64_value()
|
||||
@@ -279,7 +271,7 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::F64(_) => {
|
||||
for field_value in field_values {
|
||||
term_buffer.set_field(field_value.field());
|
||||
term_buffer.set_field(Type::F64, field_value.field());
|
||||
let f64_val = field_value
|
||||
.value()
|
||||
.f64_value()
|
||||
@@ -290,7 +282,7 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::Bytes(_) => {
|
||||
for field_value in field_values {
|
||||
term_buffer.set_field(field_value.field());
|
||||
term_buffer.set_field(Type::Bytes, field_value.field());
|
||||
let bytes = field_value
|
||||
.value()
|
||||
.bytes_value()
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
|
||||
#![warn(missing_docs)]
|
||||
#![allow(clippy::len_without_is_empty)]
|
||||
#![allow(clippy::return_self_not_must_use)]
|
||||
|
||||
//! # `tantivy`
|
||||
//!
|
||||
@@ -157,7 +158,7 @@ pub mod termdict;
|
||||
|
||||
mod reader;
|
||||
|
||||
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy};
|
||||
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer};
|
||||
mod snippet;
|
||||
pub use self::snippet::{Snippet, SnippetGenerator};
|
||||
|
||||
@@ -165,8 +166,8 @@ mod docset;
|
||||
pub use self::docset::{DocSet, TERMINATED};
|
||||
pub use crate::core::{Executor, SegmentComponent};
|
||||
pub use crate::core::{
|
||||
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher, Segment,
|
||||
SegmentId, SegmentMeta,
|
||||
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher,
|
||||
SearcherGeneration, Segment, SegmentId, SegmentMeta,
|
||||
};
|
||||
pub use crate::core::{InvertedIndexReader, SegmentReader};
|
||||
pub use crate::directory::Directory;
|
||||
@@ -178,6 +179,7 @@ pub use crate::indexer::{IndexWriter, PreparedCommit};
|
||||
pub use crate::postings::Postings;
|
||||
pub use crate::reader::LeasedItem;
|
||||
pub use crate::schema::{Document, Term};
|
||||
pub use census::{Inventory, TrackedObject};
|
||||
pub use common::HasLen;
|
||||
pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||
use std::fmt;
|
||||
@@ -237,6 +239,7 @@ pub fn version_string() -> &'static str {
|
||||
pub mod merge_policy {
|
||||
pub use crate::indexer::DefaultMergePolicy;
|
||||
pub use crate::indexer::LogMergePolicy;
|
||||
pub use crate::indexer::MergeCandidate;
|
||||
pub use crate::indexer::MergePolicy;
|
||||
pub use crate::indexer::NoMergePolicy;
|
||||
}
|
||||
|
||||
@@ -47,7 +47,6 @@ pub mod tests {
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::indexer::operation::AddOperation;
|
||||
use crate::indexer::SegmentWriter;
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::query::Scorer;
|
||||
use crate::schema::{Field, TextOptions};
|
||||
use crate::schema::{IndexRecordOption, TextFieldIndexing};
|
||||
@@ -154,10 +153,7 @@ pub mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_drop_token_that_are_too_long() -> crate::Result<()> {
|
||||
let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
|
||||
let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1);
|
||||
exceeding_token_text.push_str(" hello");
|
||||
pub fn test_index_max_length_token() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_options = TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
@@ -170,33 +166,54 @@ pub mod tests {
|
||||
index
|
||||
.tokenizers()
|
||||
.register("simple_no_truncation", SimpleTokenizer);
|
||||
let reader = index.reader().unwrap();
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
{
|
||||
index_writer.add_document(doc!(text_field=>exceeding_token_text))?;
|
||||
index_writer.commit()?;
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
assert_eq!(&bytes, b"hello");
|
||||
}
|
||||
{
|
||||
index_writer.add_document(doc!(text_field=>ok_token_text.clone()))?;
|
||||
index_writer.commit()?;
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(1u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
assert_eq!(&bytes[..], ok_token_text.as_bytes());
|
||||
}
|
||||
let reader = index.reader()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
|
||||
index_writer.add_document(doc!(text_field=>ok_token_text.clone()))?;
|
||||
index_writer.commit()?;
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
assert_eq!(&bytes[..], ok_token_text.as_bytes());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_drop_token_that_are_too_long() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_options = TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
|
||||
.set_tokenizer("simple_no_truncation"),
|
||||
);
|
||||
let text_field = schema_builder.add_text_field("text", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
index
|
||||
.tokenizers()
|
||||
.register("simple_no_truncation", SimpleTokenizer);
|
||||
let reader = index.reader()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1);
|
||||
exceeding_token_text.push_str(" hello");
|
||||
index_writer.add_document(doc!(text_field=>exceeding_token_text))?;
|
||||
index_writer.commit()?;
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
assert_eq!(&bytes, b"hello");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -600,7 +617,7 @@ mod bench {
|
||||
doc.add_text(text_field, "c");
|
||||
}
|
||||
doc.add_text(text_field, "d");
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
|
||||
@@ -5,8 +5,8 @@ use crate::postings::recorder::{
|
||||
};
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::postings::{FieldSerializer, InvertedIndexSerializer};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::{Field, FieldEntry, FieldType, Schema, Term};
|
||||
use crate::schema::{IndexRecordOption, Type};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::tokenizer::TokenStream;
|
||||
use crate::tokenizer::{Token, MAX_TOKEN_LEN};
|
||||
@@ -39,9 +39,7 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter>
|
||||
| FieldType::F64(_)
|
||||
| FieldType::Date(_)
|
||||
| FieldType::Bytes(_)
|
||||
| FieldType::HierarchicalFacet(_) => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
}
|
||||
| FieldType::Facet(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,11 +51,11 @@ pub struct MultiFieldPostingsWriter {
|
||||
}
|
||||
|
||||
fn make_field_partition(
|
||||
term_offsets: &[(&[u8], Addr, UnorderedTermId)],
|
||||
term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
) -> Vec<(Field, Range<usize>)> {
|
||||
let term_offsets_it = term_offsets
|
||||
.iter()
|
||||
.map(|(key, _, _)| Term::wrap(key).field())
|
||||
.map(|(term, _, _)| term.field())
|
||||
.enumerate();
|
||||
let mut prev_field_opt = None;
|
||||
let mut fields = vec![];
|
||||
@@ -132,10 +130,10 @@ impl MultiFieldPostingsWriter {
|
||||
fieldnorm_readers: FieldNormReaders,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
|
||||
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
|
||||
Vec::with_capacity(self.term_index.len());
|
||||
term_offsets.extend(self.term_index.iter());
|
||||
term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
|
||||
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
|
||||
|
||||
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
|
||||
HashMap::new();
|
||||
@@ -146,7 +144,7 @@ impl MultiFieldPostingsWriter {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(_) | FieldType::HierarchicalFacet(_) => {
|
||||
FieldType::Str(_) | FieldType::Facet(_) => {
|
||||
// populating the (unordered term ord) -> (ordered term ord) mapping
|
||||
// for the field.
|
||||
let unordered_term_ids = term_offsets[byte_offsets.clone()]
|
||||
@@ -210,7 +208,7 @@ pub trait PostingsWriter {
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
|
||||
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
term_heap: &MemoryArena,
|
||||
heap: &MemoryArena,
|
||||
@@ -227,7 +225,7 @@ pub trait PostingsWriter {
|
||||
heap: &mut MemoryArena,
|
||||
term_buffer: &mut Term,
|
||||
) -> u32 {
|
||||
term_buffer.set_field(field);
|
||||
term_buffer.set_field(Type::Str, field);
|
||||
let mut sink = |token: &Token| {
|
||||
// We skip all tokens with a len greater than u16.
|
||||
if token.text.len() <= MAX_TOKEN_LEN {
|
||||
@@ -281,7 +279,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
||||
) -> UnorderedTermId {
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
self.total_num_tokens += 1;
|
||||
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
|
||||
term_index.mutate_or_create(term.as_slice(), |opt_recorder: Option<Rec>| {
|
||||
if let Some(mut recorder) = opt_recorder {
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
@@ -301,17 +299,17 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
|
||||
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
serializer: &mut FieldSerializer<'_>,
|
||||
termdict_heap: &MemoryArena,
|
||||
heap: &MemoryArena,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for &(term_bytes, addr, _) in term_addrs {
|
||||
let recorder: Rec = termdict_heap.read(addr);
|
||||
for (term, addr, _) in term_addrs {
|
||||
let recorder: Rec = termdict_heap.read(*addr);
|
||||
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
||||
serializer.new_term(&term_bytes[4..], term_doc_freq)?;
|
||||
serializer.new_term(term.value_bytes(), term_doc_freq)?;
|
||||
recorder.serialize(&mut buffer_lender, serializer, heap, doc_id_map);
|
||||
serializer.close_term()?;
|
||||
}
|
||||
|
||||
@@ -308,10 +308,8 @@ pub struct PostingsSerializer<W: Write> {
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
|
||||
bm25_weight: Option<Bm25Weight>,
|
||||
|
||||
num_docs: u32, // Number of docs in the segment
|
||||
avg_fieldnorm: Score, // Average number of term in the field for that segment.
|
||||
// this value is used to compute the block wand information.
|
||||
// this value is used to compute the block wand information.
|
||||
}
|
||||
|
||||
impl<W: Write> PostingsSerializer<W> {
|
||||
@@ -321,10 +319,6 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
mode: IndexRecordOption,
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
) -> PostingsSerializer<W> {
|
||||
let num_docs = fieldnorm_reader
|
||||
.as_ref()
|
||||
.map(|fieldnorm_reader| fieldnorm_reader.num_docs())
|
||||
.unwrap_or(0u32);
|
||||
PostingsSerializer {
|
||||
output_write: CountingWriter::wrap(write),
|
||||
|
||||
@@ -339,21 +333,33 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
|
||||
fieldnorm_reader,
|
||||
bm25_weight: None,
|
||||
|
||||
num_docs,
|
||||
avg_fieldnorm,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_term(&mut self, term_doc_freq: u32) {
|
||||
if self.mode.has_freq() && self.num_docs > 0 {
|
||||
let bm25_weight = Bm25Weight::for_one_term(
|
||||
term_doc_freq as u64,
|
||||
self.num_docs as u64,
|
||||
self.avg_fieldnorm,
|
||||
);
|
||||
self.bm25_weight = Some(bm25_weight);
|
||||
self.bm25_weight = None;
|
||||
|
||||
if !self.mode.has_freq() {
|
||||
return;
|
||||
}
|
||||
|
||||
let num_docs_in_segment: u64 =
|
||||
if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
|
||||
fieldnorm_reader.num_docs() as u64
|
||||
} else {
|
||||
return;
|
||||
};
|
||||
|
||||
if num_docs_in_segment == 0 {
|
||||
return;
|
||||
}
|
||||
|
||||
self.bm25_weight = Some(Bm25Weight::for_one_term(
|
||||
term_doc_freq as u64,
|
||||
num_docs_in_segment,
|
||||
self.avg_fieldnorm,
|
||||
));
|
||||
}
|
||||
|
||||
fn write_block(&mut self) {
|
||||
|
||||
@@ -186,7 +186,6 @@ mod tests {
|
||||
use super::*;
|
||||
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
|
||||
|
||||
#[test]
|
||||
#[test]
|
||||
fn test_stack() {
|
||||
let mut heap = MemoryArena::new();
|
||||
|
||||
@@ -3,6 +3,7 @@ use murmurhash32::murmurhash2;
|
||||
use super::{Addr, MemoryArena};
|
||||
use crate::postings::stacker::memory_arena::store;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::Term;
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
@@ -81,13 +82,13 @@ pub struct Iter<'a> {
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Iter<'a> {
|
||||
type Item = (&'a [u8], Addr, UnorderedTermId);
|
||||
type Item = (Term<&'a [u8]>, Addr, UnorderedTermId);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.inner.next().cloned().map(move |bucket: usize| {
|
||||
let kv = self.hashmap.table[bucket];
|
||||
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
|
||||
(key, offset, kv.unordered_term_id)
|
||||
(Term::wrap(key), offset, kv.unordered_term_id)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -189,21 +190,19 @@ impl TermHashMap {
|
||||
/// will be in charge of returning a default value.
|
||||
/// If the key already has an associated value, then it will be passed
/// `Some(previous_value)`.
|
||||
pub fn mutate_or_create<S, V, TMutator>(
|
||||
pub fn mutate_or_create<V, TMutator>(
|
||||
&mut self,
|
||||
key: S,
|
||||
key: &[u8],
|
||||
mut updater: TMutator,
|
||||
) -> UnorderedTermId
|
||||
where
|
||||
S: AsRef<[u8]>,
|
||||
V: Copy + 'static,
|
||||
TMutator: FnMut(Option<V>) -> V,
|
||||
{
|
||||
if self.is_saturated() {
|
||||
self.resize();
|
||||
}
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let hash = murmurhash2(key.as_ref());
|
||||
let hash = murmurhash2(key);
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
@@ -211,21 +210,18 @@ impl TermHashMap {
|
||||
if kv.is_empty() {
|
||||
// The key does not exists yet.
|
||||
let val = updater(None);
|
||||
let num_bytes =
|
||||
std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
|
||||
let num_bytes = std::mem::size_of::<u16>() + key.len() + std::mem::size_of::<V>();
|
||||
let key_addr = self.heap.allocate_space(num_bytes);
|
||||
{
|
||||
let data = self.heap.slice_mut(key_addr, num_bytes);
|
||||
NativeEndian::write_u16(data, key_bytes.len() as u16);
|
||||
let stop = 2 + key_bytes.len();
|
||||
data[2..stop].copy_from_slice(key_bytes);
|
||||
NativeEndian::write_u16(data, key.len() as u16);
|
||||
let stop = 2 + key.len();
|
||||
data[2..stop].copy_from_slice(key);
|
||||
store(&mut data[stop..], val);
|
||||
}
|
||||
return self.set_bucket(hash, key_addr, bucket);
|
||||
} else if kv.hash == hash {
|
||||
if let Some(val_addr) =
|
||||
self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
|
||||
{
|
||||
if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) {
|
||||
let v = self.heap.read(val_addr);
|
||||
let new_v = updater(Some(v));
|
||||
self.heap.write_at(val_addr, new_v);
|
||||
@@ -245,25 +241,18 @@ mod tests {
|
||||
#[test]
|
||||
fn test_hash_map() {
|
||||
let mut hash_map: TermHashMap = TermHashMap::new(18);
|
||||
{
|
||||
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
3u32
|
||||
});
|
||||
}
|
||||
{
|
||||
hash_map.mutate_or_create("abcd", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
4u32
|
||||
});
|
||||
}
|
||||
{
|
||||
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, Some(3u32));
|
||||
5u32
|
||||
});
|
||||
}
|
||||
|
||||
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
3u32
|
||||
});
|
||||
hash_map.mutate_or_create(b"abcd", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
4u32
|
||||
});
|
||||
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, Some(3u32));
|
||||
5u32
|
||||
});
|
||||
let mut vanilla_hash_map = HashMap::new();
|
||||
let iter_values = hash_map.iter();
|
||||
for (key, addr, _) in iter_values {
|
||||
|
||||
@@ -132,10 +132,15 @@ impl FuzzyTermQuery {
|
||||
match LEV_BUILDER.get(&(self.distance, self.transposition_cost_one)) {
|
||||
// Unwrap the option and build the Ok(AutomatonWeight)
|
||||
Some(automaton_builder) => {
|
||||
let term_text = self.term.as_str().ok_or_else(|| {
|
||||
crate::TantivyError::InvalidArgument(
|
||||
"The fuzzy term query requires a string term.".to_string(),
|
||||
)
|
||||
})?;
|
||||
let automaton = if self.prefix {
|
||||
automaton_builder.build_prefix_dfa(self.term.text())
|
||||
automaton_builder.build_prefix_dfa(term_text)
|
||||
} else {
|
||||
automaton_builder.build_dfa(self.term.text())
|
||||
automaton_builder.build_dfa(term_text)
|
||||
};
|
||||
Ok(AutomatonWeight::new(
|
||||
self.term.field(),
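
With the new error path above, fuzzy queries are only valid against string terms. A hedged sketch of typical construction follows; `title` is an assumed text field of an existing schema, and the distance and transposition flag are arbitrary.

```rust
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::Field;
use tantivy::Term;

fn fuzzy_title_query(title: Field) -> FuzzyTermQuery {
    let term = Term::from_field_text(title, "hallo");
    // Edit distance 1, counting a transposition as a single edit.
    FuzzyTermQuery::new(term, 1, true)
}
```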
|
||||
|
||||
@@ -180,7 +180,7 @@ impl MoreLikeThis {
|
||||
|
||||
// extract the raw value, possibly tokenizing & filtering to update the term frequency map
|
||||
match field_entry.field_type() {
|
||||
FieldType::HierarchicalFacet(_) => {
|
||||
FieldType::Facet(_) => {
|
||||
let facets: Vec<&str> = field_values
|
||||
.iter()
|
||||
.map(|field_value| match *field_value.value() {
|
||||
|
||||
@@ -367,8 +367,8 @@ impl QueryParser {
|
||||
))
|
||||
}
|
||||
}
|
||||
FieldType::HierarchicalFacet(_) => match Facet::from_text(phrase) {
|
||||
Ok(facet) => Ok(vec![(0, Term::from_field_text(field, facet.encoded_str()))]),
|
||||
FieldType::Facet(_) => match Facet::from_text(phrase) {
|
||||
Ok(facet) => Ok(vec![(0, Term::from_facet(field, &facet))]),
|
||||
Err(e) => Err(QueryParserError::from(e)),
|
||||
},
|
||||
FieldType::Bytes(_) => {
|
||||
@@ -587,6 +587,7 @@ mod test {
|
||||
use super::QueryParser;
|
||||
use super::QueryParserError;
|
||||
use crate::query::Query;
|
||||
use crate::schema::FacetOptions;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
|
||||
use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT};
|
||||
@@ -615,8 +616,7 @@ mod test {
|
||||
schema_builder.add_text_field("with_stop_words", text_options);
|
||||
schema_builder.add_date_field("date", INDEXED);
|
||||
schema_builder.add_f64_field("float", INDEXED);
|
||||
schema_builder.add_facet_field("facet", INDEXED);
|
||||
schema_builder.add_facet_field("facet_not_indexed", STORED);
|
||||
schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
schema_builder.add_bytes_field("bytes", INDEXED);
|
||||
schema_builder.add_bytes_field("bytes_not_indexed", STORED);
|
||||
schema_builder.build()
|
||||
@@ -665,17 +665,10 @@ mod test {
|
||||
let query = query_parser.parse_query("facet:/root/branch/leaf").unwrap();
|
||||
assert_eq!(
|
||||
format!("{:?}", query),
|
||||
"TermQuery(Term(field=11,bytes=[114, 111, 111, 116, 0, 98, 114, 97, 110, 99, 104, 0, 108, 101, 97, 102]))"
|
||||
r#"TermQuery(Term(type=Facet, field=11, val="/root/branch/leaf"))"#
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_facet_not_indexed() {
|
||||
let error =
|
||||
parse_query_to_logical_ast("facet_not_indexed:/root/branch/leaf", false).unwrap_err();
|
||||
assert!(matches!(error, QueryParserError::FieldNotIndexed(_)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_with_boost() {
|
||||
let mut query_parser = make_query_parser();
|
||||
@@ -685,7 +678,7 @@ mod test {
|
||||
let query = query_parser.parse_query("text:hello").unwrap();
|
||||
assert_eq!(
|
||||
format!("{:?}", query),
|
||||
"Boost(query=TermQuery(Term(field=1,bytes=[104, 101, 108, 108, 111])), boost=2)"
|
||||
r#"Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2)"#
|
||||
);
|
||||
}
|
||||
|
||||
@@ -711,7 +704,7 @@ mod test {
|
||||
let query = query_parser.parse_query("text:hello^2").unwrap();
|
||||
assert_eq!(
|
||||
format!("{:?}", query),
|
||||
"Boost(query=Boost(query=TermQuery(Term(field=1,bytes=[104, 101, 108, 108, 111])), boost=2), boost=2)"
|
||||
r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2), boost=2)"#
|
||||
);
|
||||
}
|
||||
|
||||
@@ -746,8 +739,7 @@ mod test {
|
||||
pub fn test_parse_query_untokenized() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"nottokenized:\"wordone wordtwo\"",
|
||||
"Term(field=7,bytes=[119, 111, 114, 100, 111, 110, \
|
||||
101, 32, 119, 111, 114, 100, 116, 119, 111])",
|
||||
r#"Term(type=Str, field=7, val="wordone wordtwo")"#,
|
||||
false,
|
||||
);
|
||||
}
|
||||
@@ -790,7 +782,7 @@ mod test {
|
||||
.is_ok());
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"unsigned:2324",
|
||||
"Term(field=3,bytes=[0, 0, 0, 0, 0, 0, 9, 20])",
|
||||
"Term(type=U64, field=3, val=2324)",
|
||||
false,
|
||||
);
|
||||
|
||||
@@ -817,7 +809,7 @@ mod test {
|
||||
fn test_parse_bytes() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"bytes:YnVidQ==",
|
||||
"Term(field=13,bytes=[98, 117, 98, 117])",
|
||||
"Term(type=Bytes, field=12, val=[98, 117, 98, 117])",
|
||||
false,
|
||||
);
|
||||
}
|
||||
@@ -832,7 +824,7 @@ mod test {
|
||||
fn test_parse_bytes_phrase() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"bytes:\"YnVidQ==\"",
|
||||
"Term(field=13,bytes=[98, 117, 98, 117])",
|
||||
"Term(type=Bytes, field=12, val=[98, 117, 98, 117])",
|
||||
false,
|
||||
);
|
||||
}
|
||||
@@ -848,12 +840,12 @@ mod test {
|
||||
fn test_parse_query_to_ast_ab_c() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"(+title:a +title:b) title:c",
|
||||
"((+Term(field=0,bytes=[97]) +Term(field=0,bytes=[98])) Term(field=0,bytes=[99]))",
|
||||
r#"((+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) Term(type=Str, field=0, val="c"))"#,
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"(+title:a +title:b) title:c",
|
||||
"(+(+Term(field=0,bytes=[97]) +Term(field=0,bytes=[98])) +Term(field=0,bytes=[99]))",
|
||||
r#"(+(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) +Term(type=Str, field=0, val="c"))"#,
|
||||
true,
|
||||
);
|
||||
}
|
||||
@@ -862,19 +854,17 @@ mod test {
|
||||
pub fn test_parse_query_to_ast_single_term() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:toto",
|
||||
"Term(field=0,bytes=[116, 111, 116, 111])",
|
||||
r#"Term(type=Str, field=0, val="toto")"#,
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto",
|
||||
"Term(field=0,bytes=[116, 111, 116, 111])",
|
||||
r#"Term(type=Str, field=0, val="toto")"#,
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto -titi",
|
||||
"(+Term(field=0,bytes=[116, 111, 116, 111]) \
|
||||
-(Term(field=0,bytes=[116, 105, 116, 105]) \
|
||||
Term(field=1,bytes=[116, 105, 116, 105])))",
|
||||
r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#,
|
||||
false,
|
||||
);
|
||||
}
|
||||
@@ -891,13 +881,12 @@ mod test {
|
||||
pub fn test_parse_query_to_ast_two_terms() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:a b",
|
||||
"(Term(field=0,bytes=[97]) (Term(field=0,bytes=[98]) Term(field=1,bytes=[98])))",
|
||||
r#"(Term(type=Str, field=0, val="a") (Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#,
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:\"a b\"",
|
||||
"\"[(0, Term(field=0,bytes=[97])), \
|
||||
(1, Term(field=0,bytes=[98]))]\"",
|
||||
r#"title:"a b""#,
|
||||
r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#,
|
||||
false,
|
||||
);
|
||||
}
|
||||
@@ -906,46 +895,39 @@ mod test {
|
||||
pub fn test_parse_query_to_ast_ranges() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:[a TO b]",
|
||||
"(Included(Term(field=0,bytes=[97])) TO Included(Term(field=0,bytes=[98])))",
|
||||
r#"(Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b")))"#,
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"[a TO b]",
|
||||
"((Included(Term(field=0,bytes=[97])) TO \
|
||||
Included(Term(field=0,bytes=[98]))) \
|
||||
(Included(Term(field=1,bytes=[97])) TO \
|
||||
Included(Term(field=1,bytes=[98]))))",
|
||||
r#"((Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b"))) (Included(Term(type=Str, field=1, val="a")) TO Included(Term(type=Str, field=1, val="b"))))"#,
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:{titi TO toto}",
|
||||
"(Excluded(Term(field=0,bytes=[116, 105, 116, 105])) TO \
|
||||
Excluded(Term(field=0,bytes=[116, 111, 116, 111])))",
|
||||
r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Excluded(Term(type=Str, field=0, val="toto")))"#,
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:{* TO toto}",
|
||||
"(Unbounded TO Excluded(Term(field=0,bytes=[116, 111, 116, 111])))",
|
||||
r#"(Unbounded TO Excluded(Term(type=Str, field=0, val="toto")))"#,
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:{titi TO *}",
|
||||
"(Excluded(Term(field=0,bytes=[116, 105, 116, 105])) TO Unbounded)",
|
||||
r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Unbounded)"#,
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"signed:{-5 TO 3}",
|
||||
"(Excluded(Term(field=2,bytes=[127, 255, 255, 255, 255, 255, 255, 251])) TO \
|
||||
Excluded(Term(field=2,bytes=[128, 0, 0, 0, 0, 0, 0, 3])))",
|
||||
r#"(Excluded(Term(type=I64, field=2, val=-5)) TO Excluded(Term(type=I64, field=2, val=3)))"#,
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"float:{-1.5 TO 1.5}",
|
||||
"(Excluded(Term(field=10,bytes=[64, 7, 255, 255, 255, 255, 255, 255])) TO \
|
||||
Excluded(Term(field=10,bytes=[191, 248, 0, 0, 0, 0, 0, 0])))",
|
||||
r#"(Excluded(Term(type=F64, field=10, val=-1.5)) TO Excluded(Term(type=F64, field=10, val=1.5)))"#,
|
||||
false,
|
||||
);
|
||||
|
||||
test_parse_query_to_logical_ast_helper("*", "*", false);
|
||||
}
|
||||
|
||||
@@ -1072,32 +1054,27 @@ mod test {
|
||||
pub fn test_parse_query_to_ast_conjunction() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:toto",
|
||||
"Term(field=0,bytes=[116, 111, 116, 111])",
|
||||
r#"Term(type=Str, field=0, val="toto")"#,
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto",
|
||||
"Term(field=0,bytes=[116, 111, 116, 111])",
|
||||
r#"Term(type=Str, field=0, val="toto")"#,
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto -titi",
|
||||
"(+Term(field=0,bytes=[116, 111, 116, 111]) \
|
||||
-(Term(field=0,bytes=[116, 105, 116, 105]) \
|
||||
Term(field=1,bytes=[116, 105, 116, 105])))",
|
||||
r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#,
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:a b",
|
||||
"(+Term(field=0,bytes=[97]) \
|
||||
+(Term(field=0,bytes=[98]) \
|
||||
Term(field=1,bytes=[98])))",
|
||||
r#"(+Term(type=Str, field=0, val="a") +(Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#,
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:\"a b\"",
|
||||
"\"[(0, Term(field=0,bytes=[97])), \
|
||||
(1, Term(field=0,bytes=[98]))]\"",
|
||||
r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#,
|
||||
true,
|
||||
);
|
||||
}
|
||||
@@ -1106,8 +1083,8 @@ mod test {
|
||||
pub fn test_query_parser_hyphen() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:www-form-encoded",
|
||||
"\"[(0, Term(field=0,bytes=[119, 119, 119])), (1, Term(field=0,bytes=[102, 111, 114, 109])), (2, Term(field=0,bytes=[101, 110, 99, 111, 100, 101, 100]))]\"",
|
||||
false
|
||||
r#""[(0, Term(type=Str, field=0, val="www")), (1, Term(type=Str, field=0, val="form")), (2, Term(type=Str, field=0, val="encoded"))]""#,
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1116,7 +1093,7 @@ mod test {
|
||||
for &default_conjunction in &[false, true] {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:a AND title:b",
|
||||
"(+Term(field=0,bytes=[97]) +Term(field=0,bytes=[98]))",
|
||||
r#"(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b"))"#,
|
||||
default_conjunction,
|
||||
);
|
||||
}
|
||||
@@ -1127,7 +1104,7 @@ mod test {
|
||||
for &default_conjunction in &[false, true] {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:a OR title:b",
|
||||
"(Term(field=0,bytes=[97]) Term(field=0,bytes=[98]))",
|
||||
r#"(Term(type=Str, field=0, val="a") Term(type=Str, field=0, val="b"))"#,
|
||||
default_conjunction,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -175,7 +175,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", term_query),
|
||||
"TermQuery(Term(field=1,bytes=[104, 101, 108, 108, 111]))"
|
||||
r#"TermQuery(Term(type=Str, field=1, val="hello"))"#
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -92,21 +92,16 @@ impl TermQuery {
|
||||
searcher: &Searcher,
|
||||
scoring_enabled: bool,
|
||||
) -> crate::Result<TermWeight> {
|
||||
let term = self.term.clone();
|
||||
let field_entry = searcher.schema().get_field_entry(term.field());
|
||||
let field_entry = searcher.schema().get_field_entry(self.term.field());
|
||||
if !field_entry.is_indexed() {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Field {:?} is not indexed",
|
||||
field_entry.name()
|
||||
)));
|
||||
let error_msg = format!("Field {:?} is not indexed.", field_entry.name());
|
||||
return Err(crate::TantivyError::SchemaError(error_msg));
|
||||
}
|
||||
let bm25_weight;
|
||||
if scoring_enabled {
|
||||
bm25_weight = Bm25Weight::for_terms(searcher, &[term])?;
|
||||
let bm25_weight = if scoring_enabled {
|
||||
Bm25Weight::for_terms(searcher, &[self.term.clone()])?
|
||||
} else {
|
||||
bm25_weight =
|
||||
Bm25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32);
|
||||
}
|
||||
Bm25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32)
|
||||
};
|
||||
let index_record_option = if scoring_enabled {
|
||||
self.index_record_option
|
||||
} else {
|
||||
|
||||
@@ -31,11 +31,7 @@ impl Weight for TermWeight {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
let mut explanation = scorer.explain();
|
||||
explanation.add_context(format!(
|
||||
"Term ={:?}:{:?}",
|
||||
self.term.field(),
|
||||
self.term.value_bytes()
|
||||
));
|
||||
explanation.add_context(format!("Term={:?}", self.term,));
|
||||
Ok(explanation)
|
||||
}
|
||||
|
||||
@@ -106,11 +102,13 @@ impl TermWeight {
|
||||
) -> crate::Result<TermScorer> {
|
||||
let field = self.term.field();
|
||||
let inverted_index = reader.inverted_index(field)?;
|
||||
let fieldnorm_reader = if self.scoring_enabled {
|
||||
reader.get_fieldnorms_reader(field)?
|
||||
let fieldnorm_reader_opt = if self.scoring_enabled {
|
||||
reader.fieldnorms_readers().get_field(field)?
|
||||
} else {
|
||||
FieldNormReader::constant(reader.max_doc(), 1)
|
||||
None
|
||||
};
|
||||
let fieldnorm_reader =
|
||||
fieldnorm_reader_opt.unwrap_or_else(|| FieldNormReader::constant(reader.max_doc(), 1));
|
||||
let similarity_weight = self.similarity_weight.boost_by(boost);
|
||||
let postings_opt: Option<SegmentPostings> =
|
||||
inverted_index.read_postings(&self.term, self.index_record_option)?;
|
||||
|
||||
@@ -1,16 +1,23 @@
|
||||
mod pool;
|
||||
mod warming;
|
||||
|
||||
pub use self::pool::LeasedItem;
|
||||
use self::pool::Pool;
|
||||
use crate::core::Segment;
|
||||
use self::warming::WarmingState;
|
||||
use crate::core::searcher::SearcherGeneration;
|
||||
use crate::directory::WatchHandle;
|
||||
use crate::directory::META_LOCK;
|
||||
use crate::directory::{Directory, WatchCallback};
|
||||
use crate::Index;
|
||||
use crate::Searcher;
|
||||
use crate::SegmentReader;
|
||||
use crate::{Inventory, TrackedObject};
|
||||
use std::sync::atomic;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Weak;
|
||||
use std::{convert::TryInto, io};
|
||||
pub use warming::Warmer;
|
||||
|
||||
/// Defines when a new version of the index should be reloaded.
|
||||
///
|
||||
@@ -29,22 +36,20 @@ pub enum ReloadPolicy {
|
||||
OnCommit, // TODO add NEAR_REAL_TIME(target_ms)
|
||||
}
|
||||
|
||||
/// `IndexReader` builder
|
||||
/// [IndexReader] builder
|
||||
///
|
||||
/// It makes it possible to set the following values.
|
||||
///
|
||||
/// - `num_searchers` (by default, the number of detected CPU threads):
|
||||
///
|
||||
/// When `num_searchers` queries are requested at the same time, the `num_searchers` will block
|
||||
/// until the one of the searcher in-use gets released.
|
||||
/// - `reload_policy` (by default `ReloadPolicy::OnCommit`):
|
||||
///
|
||||
/// See [`ReloadPolicy`](./enum.ReloadPolicy.html) for more details.
|
||||
/// It makes it possible to configure:
|
||||
/// - [Searcher] pool size
|
||||
/// - [ReloadPolicy] defining when new index versions are detected
|
||||
/// - [Warmer] implementations
|
||||
/// - number of warming threads, for parallelizing warming work
|
||||
#[derive(Clone)]
|
||||
pub struct IndexReaderBuilder {
|
||||
num_searchers: usize,
|
||||
reload_policy: ReloadPolicy,
|
||||
index: Index,
|
||||
warmers: Vec<Weak<dyn Warmer>>,
|
||||
num_warming_threads: usize,
|
||||
}
|
||||
|
||||
impl IndexReaderBuilder {
|
||||
@@ -53,6 +58,8 @@ impl IndexReaderBuilder {
|
||||
num_searchers: num_cpus::get(),
|
||||
reload_policy: ReloadPolicy::OnCommit,
|
||||
index,
|
||||
warmers: Vec::new(),
|
||||
num_warming_threads: 1,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,11 +68,21 @@ impl IndexReaderBuilder {
|
||||
/// Building the reader is a non-trivial operation that requires
|
||||
/// opening different segment readers. It may take hundreds of milliseconds
|
||||
/// and may return an error.
|
||||
#[allow(clippy::needless_late_init)]
|
||||
pub fn try_into(self) -> crate::Result<IndexReader> {
|
||||
let searcher_generation_inventory = Inventory::default();
|
||||
let warming_state = WarmingState::new(
|
||||
self.num_warming_threads,
|
||||
self.warmers,
|
||||
searcher_generation_inventory.clone(),
|
||||
)?;
|
||||
let inner_reader = InnerIndexReader {
|
||||
index: self.index,
|
||||
num_searchers: self.num_searchers,
|
||||
searcher_pool: Pool::new(),
|
||||
warming_state,
|
||||
searcher_generation_counter: Default::default(),
|
||||
searcher_generation_inventory,
|
||||
};
|
||||
inner_reader.reload()?;
|
||||
let inner_reader_arc = Arc::new(inner_reader);
|
||||
@@ -106,11 +123,27 @@ impl IndexReaderBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the number of `Searcher` in the searcher pool.
|
||||
/// Sets the number of [Searcher] to pool.
|
||||
///
|
||||
/// See [Self::searcher()].
|
||||
pub fn num_searchers(mut self, num_searchers: usize) -> IndexReaderBuilder {
|
||||
self.num_searchers = num_searchers;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the [Warmer]s that are invoked when reloading searchable segments.
|
||||
pub fn warmers(mut self, warmers: Vec<Weak<dyn Warmer>>) -> IndexReaderBuilder {
|
||||
self.warmers = warmers;
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the number of warming threads.
|
||||
///
|
||||
/// This allows parallelizing warming work when there are multiple [Warmer]s registered with the [IndexReader].
|
||||
pub fn num_warming_threads(mut self, num_warming_threads: usize) -> IndexReaderBuilder {
|
||||
self.num_warming_threads = num_warming_threads;
|
||||
self
|
||||
}
|
||||
}
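// Illustrative sketch (not part of this change): wiring a warmer into the builder.
// `my_warmer` is a hypothetical `Arc<dyn Warmer>` owned by the caller; only a weak
// reference is registered, so dropping the owning `Arc` effectively unregisters it.
#[allow(dead_code)]
fn reader_with_warmer(
    index: &Index,
    my_warmer: &Arc<dyn Warmer>,
) -> crate::Result<IndexReader> {
    index
        .reader_builder()
        .warmers(vec![Arc::downgrade(my_warmer)])
        .num_warming_threads(2)
        .try_into()
}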
|
||||
|
||||
impl TryInto<IndexReader> for IndexReaderBuilder {
|
||||
@@ -123,35 +156,62 @@ impl TryInto<IndexReader> for IndexReaderBuilder {
|
||||
|
||||
struct InnerIndexReader {
|
||||
num_searchers: usize,
|
||||
searcher_pool: Pool<Searcher>,
|
||||
index: Index,
|
||||
warming_state: WarmingState,
|
||||
searcher_pool: Pool<Searcher>,
|
||||
searcher_generation_counter: Arc<AtomicU64>,
|
||||
searcher_generation_inventory: Inventory<SearcherGeneration>,
|
||||
}
|
||||
|
||||
impl InnerIndexReader {
|
||||
/// Opens a `SegmentReader` for each of the freshest searchable segments.
|
||||
///
|
||||
/// This function acquires a lock to prevent GC from removing files
|
||||
/// as we are opening our index.
|
||||
fn open_segment_readers(&self) -> crate::Result<Vec<SegmentReader>> {
|
||||
// Prevents segment files from getting deleted while we are in the process of opening them
|
||||
let _meta_lock = self.index.directory().acquire_lock(&META_LOCK)?;
|
||||
let searchable_segments = self.index.searchable_segments()?;
|
||||
let segment_readers = searchable_segments
|
||||
.iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect::<crate::Result<_>>()?;
|
||||
Ok(segment_readers)
|
||||
}
|
||||
|
||||
fn create_new_searcher_generation(
|
||||
&self,
|
||||
segment_readers: &[SegmentReader],
|
||||
) -> TrackedObject<SearcherGeneration> {
|
||||
let generation_id = self
|
||||
.searcher_generation_counter
|
||||
.fetch_add(1, atomic::Ordering::Relaxed);
|
||||
let searcher_generation =
|
||||
SearcherGeneration::from_segment_readers(segment_readers, generation_id);
|
||||
self.searcher_generation_inventory
|
||||
.track(searcher_generation)
|
||||
}
|
||||
|
||||
fn reload(&self) -> crate::Result<()> {
|
||||
let segment_readers: Vec<SegmentReader> = {
|
||||
let _meta_lock = self.index.directory().acquire_lock(&META_LOCK)?;
|
||||
let searchable_segments = self.searchable_segments()?;
|
||||
searchable_segments
|
||||
.iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect::<crate::Result<_>>()?
|
||||
};
|
||||
let segment_readers = self.open_segment_readers()?;
|
||||
let searcher_generation = self.create_new_searcher_generation(&segment_readers);
|
||||
let schema = self.index.schema();
|
||||
let searchers: Vec<Searcher> = std::iter::repeat_with(|| {
|
||||
Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())
|
||||
Searcher::new(
|
||||
schema.clone(),
|
||||
self.index.clone(),
|
||||
segment_readers.clone(),
|
||||
searcher_generation.clone(),
|
||||
)
|
||||
})
|
||||
.take(self.num_searchers)
|
||||
.collect::<io::Result<_>>()?;
|
||||
self.warming_state
|
||||
.warm_new_searcher_generation(&searchers[0])?;
|
||||
self.searcher_pool.publish_new_generation(searchers);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
fn searchable_segments(&self) -> crate::Result<Vec<Segment>> {
|
||||
self.index.searchable_segments()
|
||||
}
|
||||
|
||||
fn searcher(&self) -> LeasedItem<Searcher> {
|
||||
self.searcher_pool.acquire()
|
||||
}
|
||||
|
||||
343
src/reader/warming.rs
Normal file
343
src/reader/warming.rs
Normal file
@@ -0,0 +1,343 @@
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
sync::{Arc, Mutex, Weak},
|
||||
thread::JoinHandle,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use crate::{Executor, Searcher, SearcherGeneration, TantivyError};
|
||||
use crate::{Inventory, TrackedObject};
|
||||
|
||||
pub const GC_INTERVAL: Duration = Duration::from_secs(1);
|
||||
|
||||
/// `Warmer` can be used to maintain segment-level state, e.g. caches.
|
||||
///
|
||||
/// They must be registered with the [IndexReaderBuilder].
|
||||
pub trait Warmer: Sync + Send {
|
||||
/// Perform any warming work using the provided [Searcher].
|
||||
fn warm(&self, searcher: &Searcher) -> crate::Result<()>;
|
||||
|
||||
/// Discards internal state for any [SearcherGeneration] not provided.
|
||||
fn garbage_collect(&self, live_generations: &[TrackedObject<SearcherGeneration>]);
|
||||
}
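// Illustrative sketch (not part of this change): a `Warmer` that precomputes a value
// per segment and prunes entries for segments no longer referenced by any live
// searcher generation. `DocCountWarmer` and its cached `u64` are hypothetical
// stand-ins for whatever per-segment state an application would maintain.
#[allow(dead_code)]
#[derive(Default)]
struct DocCountWarmer {
    cache: std::sync::Mutex<std::collections::HashMap<crate::SegmentId, u64>>,
}

impl Warmer for DocCountWarmer {
    fn warm(&self, searcher: &Searcher) -> crate::Result<()> {
        let mut cache = self.cache.lock().unwrap();
        for segment_reader in searcher.segment_readers() {
            // Compute (or keep) the cached value for every searchable segment.
            cache
                .entry(segment_reader.segment_id())
                .or_insert_with(|| u64::from(segment_reader.num_docs()));
        }
        Ok(())
    }

    fn garbage_collect(&self, live_generations: &[TrackedObject<SearcherGeneration>]) {
        // Keep only the entries belonging to segments still referenced by a live generation.
        let live: std::collections::HashSet<crate::SegmentId> = live_generations
            .iter()
            .flat_map(|generation| generation.segments().keys().copied())
            .collect();
        self.cache.lock().unwrap().retain(|id, _| live.contains(id));
    }
}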
|
||||
|
||||
/// Warming-related state with interior mutability.
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct WarmingState(Arc<Mutex<WarmingStateInner>>);
|
||||
|
||||
impl WarmingState {
|
||||
pub fn new(
|
||||
num_warming_threads: usize,
|
||||
warmers: Vec<Weak<dyn Warmer>>,
|
||||
searcher_generation_inventory: Inventory<SearcherGeneration>,
|
||||
) -> crate::Result<Self> {
|
||||
Ok(Self(Arc::new(Mutex::new(WarmingStateInner {
|
||||
num_warming_threads,
|
||||
warmers,
|
||||
gc_thread: None,
|
||||
warmed_generation_ids: Default::default(),
|
||||
searcher_generation_inventory,
|
||||
}))))
|
||||
}
|
||||
|
||||
/// Start tracking a new generation of [Searcher], and [Warmer::warm] it if there are active warmers.
|
||||
///
|
||||
/// A background GC thread for [Warmer::garbage_collect] calls is created at most once, and only if there are active warmers.
|
||||
pub fn warm_new_searcher_generation(&self, searcher: &Searcher) -> crate::Result<()> {
|
||||
self.0
|
||||
.lock()
|
||||
.unwrap()
|
||||
.warm_new_searcher_generation(searcher, &self.0)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn gc_maybe(&self) -> bool {
|
||||
self.0.lock().unwrap().gc_maybe()
|
||||
}
|
||||
}
|
||||
|
||||
struct WarmingStateInner {
|
||||
num_warming_threads: usize,
|
||||
warmers: Vec<Weak<dyn Warmer>>,
|
||||
gc_thread: Option<JoinHandle<()>>,
|
||||
// Contains all generations that have been warmed up.
|
||||
// This list is used to avoid triggering the individual Warmer GCs
|
||||
// if no warmed generation needs to be collected.
|
||||
warmed_generation_ids: HashSet<u64>,
|
||||
searcher_generation_inventory: Inventory<SearcherGeneration>,
|
||||
}
|
||||
|
||||
impl WarmingStateInner {
|
||||
/// Start tracking the provided searcher as an exemplar of a new generation.
|
||||
/// If there are active warmers, warm them with the provided searcher, and start the background GC thread if it has not yet been started.
|
||||
/// Otherwise, prune state for dropped searcher generations inline.
|
||||
fn warm_new_searcher_generation(
|
||||
&mut self,
|
||||
searcher: &Searcher,
|
||||
this: &Arc<Mutex<Self>>,
|
||||
) -> crate::Result<()> {
|
||||
let warmers = self.pruned_warmers();
|
||||
// Avoid threads (warming as well as background GC) if there are no warmers
|
||||
if warmers.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
self.start_gc_thread_maybe(this)?;
|
||||
self.warmed_generation_ids
|
||||
.insert(searcher.generation().generation_id());
|
||||
warming_executor(self.num_warming_threads.min(warmers.len()))?
|
||||
.map(|warmer| warmer.warm(searcher), warmers.into_iter())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Attempt to upgrade the weak Warmer references, pruning those which cannot be upgraded.
|
||||
/// Return the strong references.
|
||||
fn pruned_warmers(&mut self) -> Vec<Arc<dyn Warmer>> {
|
||||
let strong_warmers = self
|
||||
.warmers
|
||||
.iter()
|
||||
.flat_map(|weak_warmer| weak_warmer.upgrade())
|
||||
.collect::<Vec<_>>();
|
||||
self.warmers = strong_warmers.iter().map(Arc::downgrade).collect();
|
||||
strong_warmers
|
||||
}
|
||||
|
||||
/// [Warmer::garbage_collect] active warmers if some searcher generation is observed to have been dropped.
|
||||
fn gc_maybe(&mut self) -> bool {
|
||||
let live_generations = self.searcher_generation_inventory.list();
|
||||
let live_generation_ids: HashSet<u64> = live_generations
|
||||
.iter()
|
||||
.map(|searcher_generation| searcher_generation.generation_id())
|
||||
.collect();
|
||||
let gc_not_required = self
|
||||
.warmed_generation_ids
|
||||
.iter()
|
||||
.all(|warmed_up_generation| live_generation_ids.contains(warmed_up_generation));
|
||||
if gc_not_required {
|
||||
return false;
|
||||
}
|
||||
for warmer in self.pruned_warmers() {
|
||||
warmer.garbage_collect(&live_generations);
|
||||
}
|
||||
self.warmed_generation_ids = live_generation_ids;
|
||||
true
|
||||
}
|
||||
|
||||
/// Start GC thread if one has not already been started.
|
||||
fn start_gc_thread_maybe(&mut self, this: &Arc<Mutex<Self>>) -> crate::Result<bool> {
|
||||
if self.gc_thread.is_some() {
|
||||
return Ok(false);
|
||||
}
|
||||
let weak_inner = Arc::downgrade(this);
|
||||
let handle = std::thread::Builder::new()
|
||||
.name("tantivy-warm-gc".to_owned())
|
||||
.spawn(|| Self::gc_loop(weak_inner))
|
||||
.map_err(|_| {
|
||||
TantivyError::SystemError("Failed to spawn warmer GC thread".to_owned())
|
||||
})?;
|
||||
self.gc_thread = Some(handle);
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// Every [GC_INTERVAL], attempt a GC, with panics caught and logged using [std::panic::catch_unwind].
|
||||
fn gc_loop(inner: Weak<Mutex<WarmingStateInner>>) {
|
||||
for _ in crossbeam::channel::tick(GC_INTERVAL) {
|
||||
if let Some(inner) = inner.upgrade() {
|
||||
// rely on deterministic gc in tests
|
||||
#[cfg(not(test))]
|
||||
if let Err(err) = std::panic::catch_unwind(|| inner.lock().unwrap().gc_maybe()) {
|
||||
error!("Panic in Warmer GC {:?}", err);
|
||||
}
|
||||
// avoid unused var warning in tests
|
||||
#[cfg(test)]
|
||||
drop(inner);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn warming_executor(num_threads: usize) -> crate::Result<Executor> {
|
||||
if num_threads <= 1 {
|
||||
Ok(Executor::single_thread())
|
||||
} else {
|
||||
Executor::multi_thread(num_threads, "tantivy-warm-")
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
sync::{
|
||||
atomic::{self, AtomicUsize},
|
||||
Arc, RwLock, Weak,
|
||||
},
|
||||
};
|
||||
|
||||
use crate::TrackedObject;
|
||||
use crate::{
|
||||
core::searcher::SearcherGeneration,
|
||||
directory::RamDirectory,
|
||||
schema::{Schema, INDEXED},
|
||||
Index, IndexSettings, ReloadPolicy, Searcher, SegmentId,
|
||||
};
|
||||
|
||||
use super::Warmer;
|
||||
|
||||
#[derive(Default)]
|
||||
struct TestWarmer {
|
||||
active_segment_ids: RwLock<HashSet<SegmentId>>,
|
||||
warm_calls: AtomicUsize,
|
||||
gc_calls: AtomicUsize,
|
||||
}
|
||||
|
||||
impl TestWarmer {
|
||||
fn live_segment_ids(&self) -> HashSet<SegmentId> {
|
||||
self.active_segment_ids.read().unwrap().clone()
|
||||
}
|
||||
|
||||
fn warm_calls(&self) -> usize {
|
||||
self.warm_calls.load(atomic::Ordering::Acquire)
|
||||
}
|
||||
|
||||
fn gc_calls(&self) -> usize {
|
||||
self.gc_calls.load(atomic::Ordering::Acquire)
|
||||
}
|
||||
|
||||
fn verify(
|
||||
&self,
|
||||
expected_warm_calls: usize,
|
||||
expected_gc_calls: usize,
|
||||
expected_segment_ids: HashSet<SegmentId>,
|
||||
) {
|
||||
assert_eq!(self.warm_calls(), expected_warm_calls);
|
||||
assert_eq!(self.gc_calls(), expected_gc_calls);
|
||||
assert_eq!(self.live_segment_ids(), expected_segment_ids);
|
||||
}
|
||||
}
|
||||
|
||||
impl Warmer for TestWarmer {
|
||||
fn warm(&self, searcher: &crate::Searcher) -> crate::Result<()> {
|
||||
self.warm_calls.fetch_add(1, atomic::Ordering::SeqCst);
|
||||
for reader in searcher.segment_readers() {
|
||||
self.active_segment_ids
|
||||
.write()
|
||||
.unwrap()
|
||||
.insert(reader.segment_id());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn garbage_collect(&self, live_generations: &[TrackedObject<SearcherGeneration>]) {
|
||||
self.gc_calls
|
||||
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
|
||||
let active_segment_ids = live_generations
|
||||
.iter()
|
||||
.flat_map(|searcher_generation| searcher_generation.segments().keys().copied())
|
||||
.collect();
|
||||
*self.active_segment_ids.write().unwrap() = active_segment_ids;
|
||||
}
|
||||
}
|
||||
|
||||
fn segment_ids(searcher: &Searcher) -> HashSet<SegmentId> {
|
||||
searcher
|
||||
.segment_readers()
|
||||
.iter()
|
||||
.map(|reader| reader.segment_id())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn test_warming(num_warming_threads: usize) -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field("pk", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let directory = RamDirectory::create();
|
||||
let index = Index::create(directory.clone(), schema, IndexSettings::default())?;
|
||||
|
||||
let num_writer_threads = 4;
|
||||
let mut writer = index
|
||||
.writer_with_num_threads(num_writer_threads, 25_000_000)
|
||||
.unwrap();
|
||||
|
||||
for i in 0u64..1000u64 {
|
||||
writer.add_document(doc!(field => i))?;
|
||||
}
|
||||
writer.commit()?;
|
||||
|
||||
let warmer1 = Arc::new(TestWarmer::default());
|
||||
let warmer2 = Arc::new(TestWarmer::default());
|
||||
warmer1.verify(0, 0, HashSet::new());
|
||||
warmer2.verify(0, 0, HashSet::new());
|
||||
|
||||
let num_searchers = 4;
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.num_warming_threads(num_warming_threads)
|
||||
.num_searchers(num_searchers)
|
||||
.warmers(vec![
|
||||
Arc::downgrade(&warmer1) as Weak<dyn Warmer>,
|
||||
Arc::downgrade(&warmer2) as Weak<dyn Warmer>,
|
||||
])
|
||||
.try_into()?;
|
||||
|
||||
let warming_state = &reader.inner.warming_state;
|
||||
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), num_writer_threads);
|
||||
assert!(
|
||||
!warming_state.gc_maybe(),
|
||||
"no GC after first searcher generation"
|
||||
);
|
||||
warmer1.verify(1, 0, segment_ids(&searcher));
|
||||
warmer2.verify(1, 0, segment_ids(&searcher));
|
||||
assert_eq!(searcher.num_docs(), 1000);
|
||||
|
||||
for i in 1000u64..2000u64 {
|
||||
writer.add_document(doc!(field => i))?;
|
||||
}
|
||||
writer.commit()?;
|
||||
writer.wait_merging_threads()?;
|
||||
|
||||
drop(warmer1);
|
||||
|
||||
let old_searcher = searcher;
|
||||
|
||||
reader.reload()?;
|
||||
|
||||
assert!(!warming_state.gc_maybe(), "old searcher still around");
|
||||
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 2000);
|
||||
|
||||
warmer2.verify(
|
||||
2,
|
||||
0,
|
||||
segment_ids(&old_searcher)
|
||||
.union(&segment_ids(&searcher))
|
||||
.copied()
|
||||
.collect(),
|
||||
);
|
||||
|
||||
drop(old_searcher);
|
||||
for _ in 0..num_searchers {
|
||||
// make sure the old searcher is dropped by the pool too
|
||||
let _ = reader.searcher();
|
||||
}
|
||||
assert!(warming_state.gc_maybe(), "old searcher dropped");
|
||||
|
||||
warmer2.verify(2, 1, segment_ids(&searcher));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn warming_single_thread() -> crate::Result<()> {
|
||||
test_warming(1)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn warming_four_threads() -> crate::Result<()> {
|
||||
test_warming(4)
|
||||
}
|
||||
}
|
||||
@@ -3,19 +3,51 @@ use std::ops::BitOr;
|
||||
|
||||
use super::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
|
||||
/// Define how a bytes field should be handled by tantivy.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(from = "BytesOptionsDeser")]
|
||||
pub struct BytesOptions {
|
||||
indexed: bool,
|
||||
fieldnorms: bool,
|
||||
fast: bool,
|
||||
stored: bool,
|
||||
}
|
||||
|
||||
/// For backward compatibility we add an intermediary to interpret the
|
||||
/// lack of fieldnorms attribute as "true" iff indexed.
|
||||
///
|
||||
/// (Downstream, for the moment, this attribute is not used anyway if not indexed...)
|
||||
/// Note that newly serialized BytesOptions will include the new attribute.
|
||||
#[derive(Deserialize)]
|
||||
struct BytesOptionsDeser {
|
||||
indexed: bool,
|
||||
#[serde(default)]
|
||||
fieldnorms: Option<bool>,
|
||||
fast: bool,
|
||||
stored: bool,
|
||||
}
|
||||
|
||||
impl From<BytesOptionsDeser> for BytesOptions {
|
||||
fn from(deser: BytesOptionsDeser) -> Self {
|
||||
BytesOptions {
|
||||
indexed: deser.indexed,
|
||||
fieldnorms: deser.fieldnorms.unwrap_or(deser.indexed),
|
||||
fast: deser.fast,
|
||||
stored: deser.stored,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesOptions {
|
||||
/// Returns true iff the value is indexed.
|
||||
pub fn is_indexed(&self) -> bool {
|
||||
self.indexed
|
||||
}
|
||||
|
||||
/// Returns true iff the value is normed.
|
||||
pub fn fieldnorms(&self) -> bool {
|
||||
self.fieldnorms
|
||||
}
|
||||
|
||||
/// Returns true iff the value is a fast field.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
self.fast
|
||||
@@ -35,6 +67,15 @@ impl BytesOptions {
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the field as normed.
|
||||
///
|
||||
/// Setting a bytes field as normed will generate
|
||||
/// the fieldnorm data for it.
|
||||
pub fn set_fieldnorms(mut self) -> BytesOptions {
|
||||
self.fieldnorms = true;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the field as a single-valued fast field.
|
||||
///
|
||||
/// Fast fields are designed for random access.
|
||||
@@ -63,6 +104,7 @@ impl<T: Into<BytesOptions>> BitOr<T> for BytesOptions {
|
||||
let other = other.into();
|
||||
BytesOptions {
|
||||
indexed: self.indexed | other.indexed,
|
||||
fieldnorms: self.fieldnorms | other.fieldnorms,
|
||||
stored: self.stored | other.stored,
|
||||
fast: self.fast | other.fast,
|
||||
}
|
||||
@@ -79,6 +121,7 @@ impl From<FastFlag> for BytesOptions {
|
||||
fn from(_: FastFlag) -> Self {
|
||||
BytesOptions {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
stored: false,
|
||||
fast: true,
|
||||
}
|
||||
@@ -89,6 +132,7 @@ impl From<StoredFlag> for BytesOptions {
|
||||
fn from(_: StoredFlag) -> Self {
|
||||
BytesOptions {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
stored: true,
|
||||
fast: false,
|
||||
}
|
||||
@@ -99,6 +143,7 @@ impl From<IndexedFlag> for BytesOptions {
|
||||
fn from(_: IndexedFlag) -> Self {
|
||||
BytesOptions {
|
||||
indexed: true,
|
||||
fieldnorms: true,
|
||||
stored: false,
|
||||
fast: false,
|
||||
}
|
||||
@@ -123,7 +168,10 @@ mod tests {
|
||||
#[test]
|
||||
fn test_bytes_option_fast_flag() {
|
||||
assert_eq!(BytesOptions::default().set_fast(), FAST.into());
|
||||
assert_eq!(BytesOptions::default().set_indexed(), INDEXED.into());
|
||||
assert_eq!(
|
||||
BytesOptions::default().set_indexed().set_fieldnorms(),
|
||||
INDEXED.into()
|
||||
);
|
||||
assert_eq!(BytesOptions::default().set_stored(), STORED.into());
|
||||
}
|
||||
#[test]
|
||||
@@ -133,11 +181,17 @@ mod tests {
|
||||
(FAST | STORED).into()
|
||||
);
|
||||
assert_eq!(
|
||||
BytesOptions::default().set_indexed().set_fast(),
|
||||
BytesOptions::default()
|
||||
.set_indexed()
|
||||
.set_fieldnorms()
|
||||
.set_fast(),
|
||||
(INDEXED | FAST).into()
|
||||
);
|
||||
assert_eq!(
|
||||
BytesOptions::default().set_stored().set_indexed(),
|
||||
BytesOptions::default()
|
||||
.set_stored()
|
||||
.set_fieldnorms()
|
||||
.set_indexed(),
|
||||
(STORED | INDEXED).into()
|
||||
);
|
||||
}
|
||||
@@ -147,8 +201,89 @@ mod tests {
|
||||
assert!(!BytesOptions::default().is_stored());
|
||||
assert!(!BytesOptions::default().is_fast());
|
||||
assert!(!BytesOptions::default().is_indexed());
|
||||
assert!(!BytesOptions::default().fieldnorms());
|
||||
assert!(BytesOptions::default().set_stored().is_stored());
|
||||
assert!(BytesOptions::default().set_fast().is_fast());
|
||||
assert!(BytesOptions::default().set_indexed().is_indexed());
|
||||
assert!(BytesOptions::default().set_fieldnorms().fieldnorms());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bytes_options_deser_if_fieldnorm_missing_indexed_true() {
|
||||
let json = r#"{
|
||||
"indexed": true,
|
||||
"fast": false,
|
||||
"stored": false
|
||||
}"#;
|
||||
let bytes_options: BytesOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(
|
||||
&bytes_options,
|
||||
&BytesOptions {
|
||||
indexed: true,
|
||||
fieldnorms: true,
|
||||
fast: false,
|
||||
stored: false
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bytes_options_deser_if_fieldnorm_missing_indexed_false() {
|
||||
let json = r#"{
|
||||
"indexed": false,
|
||||
"stored": false,
|
||||
"fast": false
|
||||
}"#;
|
||||
let bytes_options: BytesOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(
|
||||
&bytes_options,
|
||||
&BytesOptions {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
fast: false,
|
||||
stored: false
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bytes_options_deser_if_fieldnorm_false_indexed_true() {
|
||||
let json = r#"{
|
||||
"indexed": true,
|
||||
"fieldnorms": false,
|
||||
"fast": false,
|
||||
"stored": false
|
||||
}"#;
|
||||
let bytes_options: BytesOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(
|
||||
&bytes_options,
|
||||
&BytesOptions {
|
||||
indexed: true,
|
||||
fieldnorms: false,
|
||||
fast: false,
|
||||
stored: false
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bytes_options_deser_if_fieldnorm_true_indexed_false() {
|
||||
// this one is kind of useless, at least at the moment
|
||||
let json = r#"{
|
||||
"indexed": false,
|
||||
"fieldnorms": true,
|
||||
"fast": false,
|
||||
"stored": false
|
||||
}"#;
|
||||
let bytes_options: BytesOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(
|
||||
&bytes_options,
|
||||
&BytesOptions {
|
||||
indexed: false,
|
||||
fieldnorms: true,
|
||||
fast: false,
|
||||
stored: false
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,8 @@ use crate::tokenizer::PreTokenizedString;
|
||||
use crate::DateTime;
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::mem;
|
||||
|
||||
@@ -30,10 +32,21 @@ impl From<Vec<FieldValue>> for Document {
|
||||
impl PartialEq for Document {
|
||||
fn eq(&self, other: &Document) -> bool {
|
||||
// super slow, but only here for tests
|
||||
let mut self_field_values: Vec<&_> = self.field_values.iter().collect();
|
||||
let mut other_field_values: Vec<&_> = other.field_values.iter().collect();
|
||||
self_field_values.sort();
|
||||
other_field_values.sort();
|
||||
let convert_to_comparable_map = |field_values: &[FieldValue]| {
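// Group the values by field into sets of their JSON representations, so that
// equality ignores the ordering (and duplication) of field values.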
|
||||
let mut field_value_set: HashMap<Field, HashSet<String>> = Default::default();
|
||||
for field_value in field_values.iter() {
|
||||
let json_val = serde_json::to_string(field_value.value()).unwrap();
|
||||
field_value_set
|
||||
.entry(field_value.field())
|
||||
.or_default()
|
||||
.insert(json_val);
|
||||
}
|
||||
field_value_set
|
||||
};
|
||||
let self_field_values: HashMap<Field, HashSet<String>> =
|
||||
convert_to_comparable_map(&self.field_values);
|
||||
let other_field_values: HashMap<Field, HashSet<String>> =
|
||||
convert_to_comparable_map(&other.field_values);
|
||||
self_field_values.eq(&other_field_values)
|
||||
}
|
||||
}
|
||||
@@ -131,18 +144,17 @@ impl Document {
|
||||
let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect();
|
||||
field_values.sort_by_key(|field_value| field_value.field());
|
||||
|
||||
let mut grouped_field_values = vec![];
|
||||
|
||||
let mut current_field;
|
||||
let mut current_group;
|
||||
|
||||
let mut field_values_it = field_values.into_iter();
|
||||
if let Some(field_value) = field_values_it.next() {
|
||||
current_field = field_value.field();
|
||||
current_group = vec![field_value]
|
||||
|
||||
let first_field_value = if let Some(first_field_value) = field_values_it.next() {
|
||||
first_field_value
|
||||
} else {
|
||||
return grouped_field_values;
|
||||
}
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let mut grouped_field_values = vec![];
|
||||
let mut current_field = first_field_value.field();
|
||||
let mut current_group = vec![first_field_value];
|
||||
|
||||
for field_value in field_values_it {
|
||||
if field_value.field() == current_field {
|
||||
|
||||
@@ -3,9 +3,10 @@ use serde::{Deserialize, Serialize};
|
||||
use std::ops::BitOr;
|
||||
|
||||
/// Define how a facet field should be handled by tantivy.
|
||||
///
|
||||
/// Note that a Facet is always indexed and stored as a fastfield.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct FacetOptions {
|
||||
indexed: bool,
|
||||
stored: bool,
|
||||
}
|
||||
|
||||
@@ -15,11 +16,6 @@ impl FacetOptions {
|
||||
self.stored
|
||||
}
|
||||
|
||||
/// Returns true iff the value is indexed.
|
||||
pub fn is_indexed(&self) -> bool {
|
||||
self.indexed
|
||||
}
|
||||
|
||||
/// Set the field as stored.
|
||||
///
|
||||
/// Only the fields that are set as *stored* are
|
||||
@@ -28,15 +24,6 @@ impl FacetOptions {
|
||||
self.stored = true;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the field as indexed.
|
||||
///
|
||||
/// Setting a facet as indexed will generate
|
||||
/// a walkable path.
|
||||
pub fn set_indexed(mut self) -> FacetOptions {
|
||||
self.indexed = true;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl From<()> for FacetOptions {
|
||||
@@ -47,19 +34,7 @@ impl From<()> for FacetOptions {
|
||||
|
||||
impl From<StoredFlag> for FacetOptions {
|
||||
fn from(_: StoredFlag) -> Self {
|
||||
FacetOptions {
|
||||
indexed: false,
|
||||
stored: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<IndexedFlag> for FacetOptions {
|
||||
fn from(_: IndexedFlag) -> Self {
|
||||
FacetOptions {
|
||||
indexed: true,
|
||||
stored: false,
|
||||
}
|
||||
FacetOptions { stored: true }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,7 +44,6 @@ impl<T: Into<FacetOptions>> BitOr<T> for FacetOptions {
|
||||
fn bitor(self, other: T) -> FacetOptions {
|
||||
let other = other.into();
|
||||
FacetOptions {
|
||||
indexed: self.indexed | other.indexed,
|
||||
stored: self.stored | other.stored,
|
||||
}
|
||||
}
|
||||
@@ -85,3 +59,20 @@ where
|
||||
Self::from(head_tail.head) | Self::from(head_tail.tail)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<IndexedFlag> for FacetOptions {
|
||||
fn from(_: IndexedFlag) -> Self {
|
||||
FacetOptions { stored: false }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::schema::{FacetOptions, INDEXED};
|
||||
|
||||
#[test]
|
||||
fn test_from_index_flag() {
|
||||
let facet_option = FacetOptions::from(INDEXED);
|
||||
assert_eq!(facet_option, FacetOptions::default());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,7 +85,7 @@ impl FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::HierarchicalFacet(field_type),
|
||||
field_type: FieldType::Facet(field_type),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -114,6 +114,11 @@ impl FieldEntry {
|
||||
self.field_type.is_indexed()
|
||||
}
|
||||
|
||||
/// Returns true iff the field is normed
|
||||
pub fn has_fieldnorms(&self) -> bool {
|
||||
self.field_type.has_fieldnorms()
|
||||
}
|
||||
|
||||
/// Returns true iff the field is an int (signed or unsigned) fast field
|
||||
pub fn is_fast(&self) -> bool {
|
||||
match self.field_type {
|
||||
@@ -133,7 +138,7 @@ impl FieldEntry {
|
||||
| FieldType::F64(ref options)
|
||||
| FieldType::Date(ref options) => options.is_stored(),
|
||||
FieldType::Str(ref options) => options.is_stored(),
|
||||
FieldType::HierarchicalFacet(ref options) => options.is_stored(),
|
||||
FieldType::Facet(ref options) => options.is_stored(),
|
||||
FieldType::Bytes(ref options) => options.is_stored(),
|
||||
}
|
||||
}
|
||||
@@ -142,7 +147,10 @@ impl FieldEntry {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::schema::TEXT;
|
||||
use crate::{
|
||||
schema::{Schema, TextFieldIndexing, TEXT},
|
||||
Index,
|
||||
};
|
||||
use serde_json;
|
||||
|
||||
#[test]
|
||||
@@ -161,6 +169,7 @@ mod tests {
|
||||
"options": {
|
||||
"indexing": {
|
||||
"record": "position",
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "default"
|
||||
},
|
||||
"stored": false
|
||||
@@ -187,6 +196,7 @@ mod tests {
|
||||
"options": {
|
||||
"indexing": {
|
||||
"record": "position",
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "default"
|
||||
},
|
||||
"stored": false
|
||||
@@ -199,4 +209,21 @@ mod tests {
|
||||
_ => panic!("expected FieldType::Str"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_missing_fieldnorms() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let no_field_norm = TextOptions::default()
|
||||
.set_indexing_options(TextFieldIndexing::default().set_fieldnorms(false));
|
||||
let text = schema_builder.add_text_field("text", no_field_norm);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(text=>"abc"))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let err = searcher.segment_reader(0u32).get_fieldnorms_reader(text);
|
||||
assert!(matches!(err, Err(crate::TantivyError::SchemaError(_))));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,21 +30,60 @@ pub enum ValueParsingError {
|
||||
/// Contrary to FieldType, this does
|
||||
/// not include the way the field must be indexed.
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
|
||||
#[repr(u8)]
|
||||
pub enum Type {
|
||||
/// `&str`
|
||||
Str,
|
||||
Str = b's',
|
||||
/// `u64`
|
||||
U64,
|
||||
U64 = b'u',
|
||||
/// `i64`
|
||||
I64,
|
||||
I64 = b'i',
|
||||
/// `f64`
|
||||
F64,
|
||||
F64 = b'f',
|
||||
/// `date(i64) timestamp`
|
||||
Date,
|
||||
Date = b'd',
|
||||
/// `tantivy::schema::Facet`. Passed as a string in JSON.
|
||||
HierarchicalFacet,
|
||||
Facet = b'h',
|
||||
/// `Vec<u8>`
|
||||
Bytes,
|
||||
Bytes = b'b',
|
||||
}
|
||||
|
||||
const ALL_TYPES: [Type; 7] = [
|
||||
Type::Str,
|
||||
Type::U64,
|
||||
Type::I64,
|
||||
Type::F64,
|
||||
Type::Date,
|
||||
Type::Facet,
|
||||
Type::Bytes,
|
||||
];
|
||||
|
||||
impl Type {
|
||||
/// Returns an iterator over the different values
|
||||
/// the Type enum can tape.
|
||||
pub fn iter_values() -> impl Iterator<Item = Type> {
|
||||
ALL_TYPES.iter().cloned()
|
||||
}
|
||||
|
||||
/// Returns a 1 byte code used to identify the type.
|
||||
pub fn to_code(&self) -> u8 {
|
||||
*self as u8
|
||||
}
|
||||
|
||||
/// Interprets a 1-byte code as a type.
|
||||
/// Returns None if the code is invalid.
|
||||
pub fn from_code(code: u8) -> Option<Self> {
|
||||
match code {
|
||||
b's' => Some(Type::Str),
|
||||
b'u' => Some(Type::U64),
|
||||
b'i' => Some(Type::I64),
|
||||
b'f' => Some(Type::F64),
|
||||
b'd' => Some(Type::Date),
|
||||
b'h' => Some(Type::Facet),
|
||||
b'b' => Some(Type::Bytes),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
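// Illustrative sketch (not part of this change): the 1-byte codes round-trip through
// `to_code`/`from_code`, and unknown codes are rejected.
//
//     assert_eq!(Type::Str.to_code(), b's');
//     assert_eq!(Type::from_code(b'u'), Some(Type::U64));
//     assert_eq!(Type::from_code(b'z'), None);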
|
||||
|
||||
/// A `FieldType` describes the type (text, u64) of a field as well as
|
||||
@@ -65,7 +104,7 @@ pub enum FieldType {
|
||||
/// Signed 64-bits Date 64 field type configuration,
|
||||
Date(IntOptions),
|
||||
/// Hierarchical Facet
|
||||
HierarchicalFacet(FacetOptions),
|
||||
Facet(FacetOptions),
|
||||
/// Bytes (one per document)
|
||||
Bytes(BytesOptions),
|
||||
}
|
||||
@@ -79,7 +118,7 @@ impl FieldType {
|
||||
FieldType::I64(_) => Type::I64,
|
||||
FieldType::F64(_) => Type::F64,
|
||||
FieldType::Date(_) => Type::Date,
|
||||
FieldType::HierarchicalFacet(_) => Type::HierarchicalFacet,
|
||||
FieldType::Facet(_) => Type::Facet,
|
||||
FieldType::Bytes(_) => Type::Bytes,
|
||||
}
|
||||
}
|
||||
@@ -92,11 +131,27 @@ impl FieldType {
|
||||
| FieldType::I64(ref int_options)
|
||||
| FieldType::F64(ref int_options) => int_options.is_indexed(),
|
||||
FieldType::Date(ref date_options) => date_options.is_indexed(),
|
||||
FieldType::HierarchicalFacet(ref facet_options) => facet_options.is_indexed(),
|
||||
FieldType::Facet(ref _facet_options) => true,
|
||||
FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true iff the field is normed.
|
||||
pub fn has_fieldnorms(&self) -> bool {
|
||||
match *self {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|options| options.fieldnorms())
|
||||
.unwrap_or(false),
|
||||
FieldType::U64(ref int_options)
|
||||
| FieldType::I64(ref int_options)
|
||||
| FieldType::F64(ref int_options)
|
||||
| FieldType::Date(ref int_options) => int_options.fieldnorms(),
|
||||
FieldType::Facet(_) => false,
|
||||
FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a field configuration, return the maximal possible
|
||||
/// `IndexRecordOption` available.
|
||||
///
|
||||
@@ -116,13 +171,7 @@ impl FieldType {
|
||||
None
|
||||
}
|
||||
}
|
||||
FieldType::HierarchicalFacet(ref facet_options) => {
|
||||
if facet_options.is_indexed() {
|
||||
Some(IndexRecordOption::Basic)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
FieldType::Facet(ref _facet_options) => Some(IndexRecordOption::Basic),
|
||||
FieldType::Bytes(ref bytes_options) => {
|
||||
if bytes_options.is_indexed() {
|
||||
Some(IndexRecordOption::Basic)
|
||||
@@ -155,7 +204,7 @@ impl FieldType {
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => Err(
|
||||
ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)),
|
||||
),
|
||||
FieldType::HierarchicalFacet(_) => Ok(Value::Facet(Facet::from(field_text))),
|
||||
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(field_text))),
|
||||
FieldType::Bytes(_) => base64::decode(field_text).map(Value::Bytes).map_err(|_| {
|
||||
ValueParsingError::InvalidBase64(format!(
|
||||
"Expected base64 string, got {:?}",
|
||||
@@ -188,7 +237,7 @@ impl FieldType {
|
||||
Err(ValueParsingError::OverflowError(msg))
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) | FieldType::HierarchicalFacet(_) | FieldType::Bytes(_) => {
|
||||
FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => {
|
||||
let msg = format!("Expected a string, got {:?}", json);
|
||||
Err(ValueParsingError::TypeError(msg))
|
||||
}
|
||||
@@ -231,6 +280,7 @@ mod tests {
|
||||
use super::FieldType;
|
||||
use crate::schema::field_type::ValueParsingError;
|
||||
use crate::schema::TextOptions;
|
||||
use crate::schema::Type;
|
||||
use crate::schema::Value;
|
||||
use crate::schema::{Schema, INDEXED};
|
||||
use crate::tokenizer::{PreTokenizedString, Token};
|
||||
@@ -348,4 +398,13 @@ mod tests {
|
||||
|
||||
assert_eq!(serialized_value_json, pre_tokenized_string_json);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_type_codes() {
|
||||
for type_val in Type::iter_values() {
|
||||
let code = type_val.to_code();
|
||||
assert_eq!(Type::from_code(code), Some(type_val));
|
||||
}
|
||||
assert_eq!(Type::from_code(b'z'), None);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use common::BinarySerializable;
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
/// `FieldValue` holds together a `Field` and its `Value`.
|
||||
#[derive(Debug, Clone, Ord, PartialEq, Eq, PartialOrd, serde::Serialize, serde::Deserialize)]
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct FieldValue {
|
||||
field: Field,
|
||||
value: Value,
|
||||
|
||||
@@ -20,7 +20,7 @@ pub const STORED: SchemaFlagList<StoredFlag, ()> = SchemaFlagList {
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct IndexedFlag;
|
||||
/// Flag to mark the field as indexed. An indexed field is searchable.
|
||||
/// Flag to mark the field as indexed. An indexed field is searchable and has a fieldnorm.
|
||||
///
|
||||
/// The `INDEXED` flag can only be used when building `IntOptions` (`u64`, `i64` and `f64` fields)
|
||||
/// Of course, text fields can also be indexed... But this is expressed by using either the
|
||||
|
||||
@@ -16,13 +16,42 @@ pub enum Cardinality {
|
||||
|
||||
/// Define how a u64, i64, or f64 field should be handled by tantivy.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
#[serde(from = "IntOptionsDeser")]
|
||||
pub struct IntOptions {
|
||||
indexed: bool,
|
||||
// This boolean has no effect if the field is not marked as indexed too.
|
||||
fieldnorms: bool, // This attribute only has an effect if indexed is true.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
fast: Option<Cardinality>,
|
||||
stored: bool,
|
||||
}
|
||||
|
||||
/// For backward compatibility we add an intermediary to interpret the
|
||||
/// lack of fieldnorms attribute as "true" iff indexed.
|
||||
///
|
||||
/// (Downstream, for the moment, this attribute is not used anyway if not indexed...)
|
||||
/// Note that newly serialized IntOptions will include the new attribute.
|
||||
#[derive(Deserialize)]
|
||||
struct IntOptionsDeser {
|
||||
indexed: bool,
|
||||
#[serde(default)]
|
||||
fieldnorms: Option<bool>, // This attribute only has an effect if indexed is true.
|
||||
#[serde(default)]
|
||||
fast: Option<Cardinality>,
|
||||
stored: bool,
|
||||
}
|
||||
|
||||
impl From<IntOptionsDeser> for IntOptions {
|
||||
fn from(deser: IntOptionsDeser) -> Self {
|
||||
IntOptions {
|
||||
indexed: deser.indexed,
|
||||
fieldnorms: deser.fieldnorms.unwrap_or(deser.indexed),
|
||||
fast: deser.fast,
|
||||
stored: deser.stored,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntOptions {
|
||||
/// Returns true iff the value is stored.
|
||||
pub fn is_stored(&self) -> bool {
|
||||
@@ -34,6 +63,11 @@ impl IntOptions {
|
||||
self.indexed
|
||||
}
|
||||
|
||||
/// Returns true iff the field has fieldnorm.
|
||||
pub fn fieldnorms(&self) -> bool {
|
||||
self.fieldnorms && self.indexed
|
||||
}
|
||||
|
||||
/// Returns true iff the value is a fast field.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
self.fast.is_some()
|
||||
@@ -59,6 +93,15 @@ impl IntOptions {
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the field as normed.
|
||||
///
|
||||
/// Setting an integer field as normed will generate
|
||||
/// the fieldnorm data for it.
|
||||
pub fn set_fieldnorm(mut self) -> IntOptions {
|
||||
self.fieldnorms = true;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the field as a single-valued fast field.
|
||||
///
|
||||
/// Fast fields are designed for random access.
|
||||
@@ -89,6 +132,7 @@ impl From<FastFlag> for IntOptions {
|
||||
fn from(_: FastFlag) -> Self {
|
||||
IntOptions {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
stored: false,
|
||||
fast: Some(Cardinality::SingleValue),
|
||||
}
|
||||
@@ -99,6 +143,7 @@ impl From<StoredFlag> for IntOptions {
|
||||
fn from(_: StoredFlag) -> Self {
|
||||
IntOptions {
|
||||
indexed: false,
|
||||
fieldnorms: false,
|
||||
stored: true,
|
||||
fast: None,
|
}
@@ -109,6 +154,7 @@ impl From<IndexedFlag> for IntOptions {
fn from(_: IndexedFlag) -> Self {
IntOptions {
indexed: true,
fieldnorms: true,
stored: false,
fast: None,
}
@@ -122,6 +168,7 @@ impl<T: Into<IntOptions>> BitOr<T> for IntOptions {
let other = other.into();
IntOptions {
indexed: self.indexed | other.indexed,
fieldnorms: self.fieldnorms | other.fieldnorms,
stored: self.stored | other.stored,
fast: self.fast.or(other.fast),
}
@@ -138,3 +185,83 @@ where
Self::from(head_tail.head) | Self::from(head_tail.tail)
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_int_options_deser_if_fieldnorm_missing_indexed_true() {
let json = r#"{
"indexed": true,
"stored": false
}"#;
let int_options: IntOptions = serde_json::from_str(json).unwrap();
assert_eq!(
&int_options,
&IntOptions {
indexed: true,
fieldnorms: true,
fast: None,
stored: false
}
);
}

#[test]
fn test_int_options_deser_if_fieldnorm_missing_indexed_false() {
let json = r#"{
"indexed": false,
"stored": false
}"#;
let int_options: IntOptions = serde_json::from_str(json).unwrap();
assert_eq!(
&int_options,
&IntOptions {
indexed: false,
fieldnorms: false,
fast: None,
stored: false
}
);
}

#[test]
fn test_int_options_deser_if_fieldnorm_false_indexed_true() {
let json = r#"{
"indexed": true,
"fieldnorms": false,
"stored": false
}"#;
let int_options: IntOptions = serde_json::from_str(json).unwrap();
assert_eq!(
&int_options,
&IntOptions {
indexed: true,
fieldnorms: false,
fast: None,
stored: false
}
);
}

#[test]
fn test_int_options_deser_if_fieldnorm_true_indexed_false() {
// this one is kind of useless, at least at the moment
let json = r#"{
"indexed": false,
"fieldnorms": true,
"stored": false
}"#;
let int_options: IntOptions = serde_json::from_str(json).unwrap();
assert_eq!(
&int_options,
&IntOptions {
indexed: false,
fieldnorms: true,
fast: None,
stored: false
}
);
}
}

@@ -240,6 +240,11 @@ impl Schema {
self.get_field_entry(field).name()
}

/// Returns the number of fields in the schema.
pub fn num_fields(&self) -> usize {
self.0.fields.len()
}

/// Return the list of all the `Field`s.
pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
self.0
@@ -271,8 +276,6 @@ impl Schema {
let field_value = FieldValue::new(field, value);
document.add(field_value);
}
} else {
return Err(DocParsingError::NoSuchFieldInSchema(field_name));
}
}
Ok(document)
@@ -314,26 +317,25 @@ impl Schema {

let mut doc = Document::default();
for (field_name, json_value) in json_obj.iter() {
let field = self
.get_field(field_name)
.ok_or_else(|| DocParsingError::NoSuchFieldInSchema(field_name.clone()))?;
let field_entry = self.get_field_entry(field);
let field_type = field_entry.field_type();
match *json_value {
JsonValue::Array(ref json_items) => {
for json_item in json_items {
if let Some(field) = self.get_field(field_name) {
let field_entry = self.get_field_entry(field);
let field_type = field_entry.field_type();
match *json_value {
JsonValue::Array(ref json_items) => {
for json_item in json_items {
let value = field_type
.value_from_json(json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add(FieldValue::new(field, value));
}
}
_ => {
let value = field_type
.value_from_json(json_item)
.value_from_json(json_value)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add(FieldValue::new(field, value));
}
}
_ => {
let value = field_type
.value_from_json(json_value)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add(FieldValue::new(field, value));
}
}
}
Ok(doc)
@@ -398,9 +400,6 @@ pub enum DocParsingError {
/// One of the value node could not be parsed.
#[error("The field '{0:?}' could not be parsed: {1:?}")]
ValueError(String, ValueParsingError),
/// The json-document contains a field that is not declared in the schema.
#[error("The document contains a field that is not declared in the schema: {0:?}")]
NoSuchFieldInSchema(String),
}

#[cfg(test)]
@@ -433,9 +432,17 @@ mod tests {
.set_fast(Cardinality::SingleValue);
let score_options = IntOptions::default()
.set_indexed()
.set_fieldnorm()
.set_fast(Cardinality::SingleValue);
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field("author", STRING);
schema_builder.add_text_field(
"author",
TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("raw")
.set_fieldnorms(false),
),
);
schema_builder.add_u64_field("count", count_options);
schema_builder.add_i64_field("popularity", popularity_options);
schema_builder.add_f64_field("score", score_options);
@@ -448,6 +455,7 @@ mod tests {
"options": {
"indexing": {
"record": "position",
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false
@@ -459,6 +467,7 @@ mod tests {
"options": {
"indexing": {
"record": "basic",
"fieldnorms": false,
"tokenizer": "raw"
},
"stored": false
@@ -469,6 +478,7 @@ mod tests {
"type": "u64",
"options": {
"indexed": false,
"fieldnorms": false,
"fast": "single",
"stored": true
}
@@ -478,6 +488,7 @@ mod tests {
"type": "i64",
"options": {
"indexed": false,
"fieldnorms": false,
"fast": "single",
"stored": true
}
@@ -487,6 +498,7 @@ mod tests {
"type": "f64",
"options": {
"indexed": true,
"fieldnorms": true,
"fast": "single",
"stored": false
}
@@ -578,20 +590,16 @@ mod tests {
}

#[test]
pub fn test_document_from_nameddoc_error() {
pub fn test_document_missing_field_no_error() {
let schema = Schema::builder().build();
let mut named_doc_map = BTreeMap::default();
named_doc_map.insert(
"title".to_string(),
vec![Value::from("title1"), Value::from("title2")],
);
let err = schema
schema
.convert_named_doc(NamedFieldDocument(named_doc_map))
.unwrap_err();
assert_eq!(
err,
DocParsingError::NoSuchFieldInSchema("title".to_string())
);
.unwrap();
}

#[test]
@@ -644,8 +652,9 @@ mod tests {
);
}
{
let json_err = schema.parse_document(
let res = schema.parse_document(
r#"{
"thisfieldisnotdefinedintheschema": "my title",
"title": "my title",
"author": "fulmicoton",
"count": 4,
@@ -654,7 +663,7 @@ mod tests {
"jambon": "bayonne"
}"#,
);
assert_matches!(json_err, Err(DocParsingError::NoSuchFieldInSchema(_)));
assert!(res.is_ok());
}
{
let json_err = schema.parse_document(
@@ -752,6 +761,7 @@ mod tests {
let timestamp_options = IntOptions::default()
.set_stored()
.set_indexed()
.set_fieldnorm()
.set_fast(SingleValue);
schema_builder.add_text_field("_id", id_options);
schema_builder.add_date_field("_timestamp", timestamp_options);
@@ -763,6 +773,7 @@ mod tests {
"options": {
"indexing": {
"record": "position",
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false
@@ -773,6 +784,7 @@ mod tests {
"type": "i64",
"options": {
"indexed": false,
"fieldnorms": false,
"fast": "single",
"stored": true
}
@@ -793,6 +805,7 @@ mod tests {
"options": {
"indexing": {
"record": "basic",
"fieldnorms": true,
"tokenizer": "raw"
},
"stored": true
@@ -803,6 +816,7 @@ mod tests {
"type": "date",
"options": {
"indexed": true,
"fieldnorms": true,
"fast": "single",
"stored": true
}
@@ -813,6 +827,7 @@ mod tests {
"options": {
"indexing": {
"record": "position",
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false
@@ -823,6 +838,7 @@ mod tests {
"type": "i64",
"options": {
"indexed": false,
"fieldnorms": false,
"fast": "single",
"stored": true
}

@@ -1,18 +1,20 @@
use std::fmt;
use std::hash::{Hash, Hasher};

use super::Field;
use crate::schema::Facet;
use crate::fastfield::FastValue;
use crate::schema::{Facet, Type};
use crate::DateTime;
use common;
use std::str;

/// Size (in bytes) of the buffer of a int field.
const INT_TERM_LEN: usize = 4 + 8;
/// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
/// <field> + <type byte> + <value len>
const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;

/// Term represents the value that the token can take.
///
/// It actually wraps a `Vec<u8>`.
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
#[derive(Clone)]
pub struct Term<B = Vec<u8>>(B)
where
B: AsRef<[u8]>;
@@ -22,98 +24,61 @@ impl Term {
Term(Vec::with_capacity(100))
}

/// Builds a term given a field, and a i64-value
///
/// Assuming the term has a field id of 1, and a i64 value of 3234,
/// the Term will have 12 bytes.
///
/// The first four byte are dedicated to storing the field id as a u64.
/// The 8 following bytes are encoding the u64 value.
pub fn from_field_i64(field: Field, val: i64) -> Term {
let val_u64: u64 = common::i64_to_u64(val);
Term::from_field_u64(field, val_u64)
}

/// Builds a term given a field, and a f64-value
///
/// Assuming the term has a field id of 1, and a f64 value of 1.5,
/// the Term will have 12 bytes.
///
/// The first four byte are dedicated to storing the field id as a u64.
/// The 8 following bytes are encoding the f64 as a u64 value.
pub fn from_field_f64(field: Field, val: f64) -> Term {
let val_u64: u64 = common::f64_to_u64(val);
Term::from_field_u64(field, val_u64)
}

/// Builds a term given a field, and a DateTime value
///
/// Assuming the term has a field id of 1, and a timestamp i64 value of 3234,
/// the Term will have 12 bytes.
///
/// The first four byte are dedicated to storing the field id as a u64.
/// The 8 following bytes are encoding the DateTime as i64 timestamp value.
pub fn from_field_date(field: Field, val: &DateTime) -> Term {
let val_timestamp = val.timestamp();
Term::from_field_i64(field, val_timestamp)
}

/// Creates a `Term` given a facet.
pub fn from_facet(field: Field, facet: &Facet) -> Term {
let bytes = facet.encoded_str().as_bytes();
let buffer = Vec::with_capacity(4 + bytes.len());
let mut term = Term(buffer);
term.set_field(field);
term.set_bytes(bytes);
term
}

/// Builds a term given a field, and a string value
///
/// Assuming the term has a field id of 2, and a text value of "abc",
/// the Term will have 4 bytes.
/// The first byte is 2, and the three following bytes are the utf-8
/// representation of "abc".
pub fn from_field_text(field: Field, text: &str) -> Term {
let buffer = Vec::with_capacity(4 + text.len());
let mut term = Term(buffer);
term.set_field(field);
term.set_text(text);
fn from_fast_value<T: FastValue>(field: Field, val: &T) -> Term {
let mut term = Term(vec![0u8; FAST_VALUE_TERM_LEN]);
term.set_field(T::to_type(), field);
term.set_u64(val.to_u64());
term
}

/// Builds a term given a field, and a u64-value
///
/// Assuming the term has a field id of 1, and a u64 value of 3234,
/// the Term will have 12 bytes.
///
/// The first four byte are dedicated to storing the field id as a u64.
/// The 8 following bytes are encoding the u64 value.
pub fn from_field_u64(field: Field, val: u64) -> Term {
let mut term = Term(vec![0u8; INT_TERM_LEN]);
term.set_field(field);
term.set_u64(val);
Term::from_fast_value(field, &val)
}

/// Builds a term given a field, and a i64-value
pub fn from_field_i64(field: Field, val: i64) -> Term {
Term::from_fast_value(field, &val)
}

/// Builds a term given a field, and a f64-value
pub fn from_field_f64(field: Field, val: f64) -> Term {
Term::from_fast_value(field, &val)
}

/// Builds a term given a field, and a DateTime value
pub fn from_field_date(field: Field, val: &DateTime) -> Term {
Term::from_fast_value(field, val)
}

/// Creates a `Term` given a facet.
pub fn from_facet(field: Field, facet: &Facet) -> Term {
let facet_encoded_str = facet.encoded_str();
Term::create_bytes_term(Type::Facet, field, facet_encoded_str.as_bytes())
}

/// Builds a term given a field, and a string value
pub fn from_field_text(field: Field, text: &str) -> Term {
Term::create_bytes_term(Type::Str, field, text.as_bytes())
}

fn create_bytes_term(typ: Type, field: Field, bytes: &[u8]) -> Term {
let mut term = Term(vec![0u8; 5 + bytes.len()]);
term.set_field(typ, field);
term.0.extend_from_slice(bytes);
term
}

/// Builds a term bytes.
pub fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
let mut term = Term::for_field(field);
term.set_bytes(bytes);
term
Term::create_bytes_term(Type::Bytes, field, bytes)
}

/// Creates a new Term for a given field.
pub(crate) fn for_field(field: Field) -> Term {
let mut term = Term(Vec::with_capacity(100));
term.set_field(field);
term
}

pub(crate) fn set_field(&mut self, field: Field) {
pub(crate) fn set_field(&mut self, typ: Type, field: Field) {
self.0.clear();
self.0
.extend_from_slice(field.field_id().to_be_bytes().as_ref());
self.0.push(typ.to_code());
}

/// Sets a u64 value in the term.
@@ -123,23 +88,33 @@ impl Term {
/// The use of BigEndian has the benefit of preserving
/// the natural order of the values.
pub fn set_u64(&mut self, val: u64) {
self.0.resize(INT_TERM_LEN, 0u8);
self.set_fast_value(val);
self.set_bytes(val.to_be_bytes().as_ref());
}

fn set_fast_value<T: FastValue>(&mut self, val: T) {
self.0.resize(FAST_VALUE_TERM_LEN, 0u8);
self.set_bytes(val.to_u64().to_be_bytes().as_ref());
}

/// Sets a `i64` value in the term.
pub fn set_i64(&mut self, val: i64) {
self.set_u64(common::i64_to_u64(val));
self.set_fast_value(val);
}

/// Sets a `i64` value in the term.
pub fn set_date(&mut self, date: crate::DateTime) {
self.set_fast_value(date);
}

/// Sets a `f64` value in the term.
pub fn set_f64(&mut self, val: f64) {
self.set_u64(common::f64_to_u64(val));
self.set_fast_value(val);
}

/// Sets the value of a `Bytes` field.
pub fn set_bytes(&mut self, bytes: &[u8]) {
self.0.resize(4, 0u8);
self.0.resize(5, 0u8);
self.0.extend(bytes);
}

@@ -149,6 +124,44 @@ impl Term {
}
}

impl<B> Ord for Term<B>
where
B: AsRef<[u8]>,
{
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.as_slice().cmp(other.as_slice())
}
}

impl<B> PartialOrd for Term<B>
where
B: AsRef<[u8]>,
{
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}

impl<B> PartialEq for Term<B>
where
B: AsRef<[u8]>,
{
fn eq(&self, other: &Self) -> bool {
self.as_slice() == other.as_slice()
}
}

impl<B> Eq for Term<B> where B: AsRef<[u8]> {}

impl<B> Hash for Term<B>
where
B: AsRef<[u8]>,
{
fn hash<H: Hasher>(&self, state: &mut H) {
self.0.as_ref().hash(state)
}
}

impl<B> Term<B>
where
B: AsRef<[u8]>,
@@ -158,6 +171,15 @@ where
Term(data)
}

/// Return the type of the term.
pub fn typ(&self) -> Type {
assert!(
self.as_slice().len() >= 5,
"the type does byte representation is too short"
);
Type::from_code(self.as_slice()[4]).expect("The term has an invalid type code")
}

/// Returns the field.
pub fn field(&self) -> Field {
let mut field_id_bytes = [0u8; 4];
@@ -167,41 +189,86 @@ where

/// Returns the `u64` value stored in a term.
///
/// # Panics
/// ... or returns an invalid value
/// if the term is not a `u64` field.
pub fn get_u64(&self) -> u64 {
let mut field_id_bytes = [0u8; 8];
field_id_bytes.copy_from_slice(self.value_bytes());
u64::from_be_bytes(field_id_bytes)
/// Returns None if the term is not of the u64 type, or if the term byte representation
/// is invalid.
pub fn as_u64(&self) -> Option<u64> {
self.get_fast_type::<u64>()
}

fn get_fast_type<T: FastValue>(&self) -> Option<T> {
if self.typ() != T::to_type() || self.as_slice().len() != FAST_VALUE_TERM_LEN {
return None;
}
let mut value_bytes = [0u8; 8];
value_bytes.copy_from_slice(self.value_bytes());
let value_u64 = u64::from_be_bytes(value_bytes);
Some(FastValue::from_u64(value_u64))
}

/// Returns the `i64` value stored in a term.
///
/// # Panics
/// ... or returns an invalid value
/// if the term is not a `i64` field.
pub fn get_i64(&self) -> i64 {
common::u64_to_i64(self.get_u64())
/// Returns None if the term is not of the i64 type, or if the term byte representation
/// is invalid.
pub fn as_i64(&self) -> Option<i64> {
self.get_fast_type::<i64>()
}

/// Returns the `f64` value stored in a term.
///
/// # Panics
/// ... or returns an invalid value
/// if the term is not a `f64` field.
pub fn get_f64(&self) -> f64 {
common::u64_to_f64(self.get_u64())
/// Returns None if the term is not of the f64 type, or if the term byte representation
/// is invalid.
pub fn as_f64(&self) -> Option<f64> {
self.get_fast_type::<f64>()
}

/// Returns the `Date` value stored in a term.
///
/// Returns None if the term is not of the Date type, or if the term byte representation
/// is invalid.
pub fn as_date(&self) -> Option<crate::DateTime> {
self.get_fast_type::<crate::DateTime>()
}

/// Returns the text associated with the term.
///
/// # Panics
/// If the value is not valid utf-8. This may happen
/// if the index is corrupted or if you try to
/// call this method on a non-string type.
pub fn text(&self) -> &str {
str::from_utf8(self.value_bytes()).expect("Term does not contain valid utf-8.")
/// Returns None if the field is not of string type
/// or if the bytes are not valid utf-8.
pub fn as_str(&self) -> Option<&str> {
if self.as_slice().len() < 5 {
return None;
}
if self.typ() != Type::Str {
return None;
}
str::from_utf8(self.value_bytes()).ok()
}

/// Returns the facet associated with the term.
///
/// Returns None if the field is not of facet type
/// or if the bytes are not valid utf-8.
pub fn as_facet(&self) -> Option<Facet> {
if self.as_slice().len() < 5 {
return None;
}
if self.typ() != Type::Facet {
return None;
}
let facet_encode_str = str::from_utf8(self.value_bytes()).ok()?;
Some(Facet::from_encoded_string(facet_encode_str.to_string()))
}

/// Returns the bytes associated with the term.
///
/// Returns None if the field is not of bytes type.
pub fn as_bytes(&self) -> Option<&[u8]> {
if self.as_slice().len() < 5 {
return None;
}
if self.typ() != Type::Bytes {
return None;
}
Some(self.value_bytes())
}

/// Returns the serialized value of the term.
@@ -211,32 +278,61 @@ where
/// If the term is a u64, its value is encoded according
/// to `byteorder::LittleEndian`.
pub fn value_bytes(&self) -> &[u8] {
&self.0.as_ref()[4..]
&self.0.as_ref()[5..]
}

/// Returns the underlying `&[u8]`
pub fn as_slice(&self) -> &[u8] {
/// Returns the underlying `&[u8]`.
///
/// Do NOT rely on this byte representation in the index.
/// This value is likely to change in the future.
pub(crate) fn as_slice(&self) -> &[u8] {
self.0.as_ref()
}
}

impl<B> AsRef<[u8]> for Term<B>
where
B: AsRef<[u8]>,
{
fn as_ref(&self) -> &[u8] {
self.0.as_ref()
fn write_opt<T: std::fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<T>) -> fmt::Result {
if let Some(val) = val_opt {
write!(f, "{:?}", val)?;
}
Ok(())
}

impl fmt::Debug for Term {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"Term(field={},bytes={:?})",
self.field().field_id(),
self.value_bytes()
)
let field_id = self.field().field_id();
let typ = self.typ();
write!(f, "Term(type={:?}, field={}, val=", typ, field_id,)?;
match typ {
Type::Str => {
let s = str::from_utf8(self.value_bytes()).ok();
write_opt(f, s)?;
}
Type::U64 => {
write_opt(f, self.as_u64())?;
}
Type::I64 => {
let val_i64 = self.as_i64();
write_opt(f, val_i64)?;
}
Type::F64 => {
let val_f64 = self.as_f64();
write_opt(f, val_f64)?;
}
// TODO pretty print these types too.
Type::Date => {
let val_date = self.as_date();
write_opt(f, val_date)?;
}
Type::Facet => {
let facet = self.as_facet().map(|facet| facet.to_path_string());
write_opt(f, facet)?;
}
Type::Bytes => {
write_opt(f, self.as_bytes())?;
}
}
write!(f, ")",)?;
Ok(())
}
}

@@ -246,30 +342,24 @@ mod tests {
use crate::schema::*;

#[test]
pub fn test_term() {
pub fn test_term_str() {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("text", STRING);
let title_field = schema_builder.add_text_field("title", STRING);
let count_field = schema_builder.add_text_field("count", STRING);
{
let term = Term::from_field_text(title_field, "test");
assert_eq!(term.field(), title_field);
assert_eq!(&term.as_slice()[0..4], &[0u8, 0u8, 0u8, 1u8]);
assert_eq!(&term.as_slice()[4..], "test".as_bytes());
}
{
let term = Term::from_field_u64(count_field, 983u64);
assert_eq!(term.field(), count_field);
assert_eq!(&term.as_slice()[0..4], &[0u8, 0u8, 0u8, 2u8]);
assert_eq!(term.as_slice().len(), 4 + 8);
assert_eq!(term.as_slice()[4], 0u8);
assert_eq!(term.as_slice()[5], 0u8);
assert_eq!(term.as_slice()[6], 0u8);
assert_eq!(term.as_slice()[7], 0u8);
assert_eq!(term.as_slice()[8], 0u8);
assert_eq!(term.as_slice()[9], 0u8);
assert_eq!(term.as_slice()[10], (933u64 / 256u64) as u8);
assert_eq!(term.as_slice()[11], (983u64 % 256u64) as u8);
}
let term = Term::from_field_text(title_field, "test");
assert_eq!(term.field(), title_field);
assert_eq!(term.typ(), Type::Str);
assert_eq!(term.as_str(), Some("test"))
}

#[test]
pub fn test_term_u64() {
let mut schema_builder = Schema::builder();
let count_field = schema_builder.add_u64_field("count", INDEXED);
let term = Term::from_field_u64(count_field, 983u64);
assert_eq!(term.field(), count_field);
assert_eq!(term.typ(), Type::U64);
assert_eq!(term.as_slice().len(), super::FAST_VALUE_TERM_LEN);
assert_eq!(term.as_u64(), Some(983u64))
}
}

@@ -45,6 +45,7 @@ impl TextOptions {
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
pub struct TextFieldIndexing {
record: IndexRecordOption,
fieldnorms: bool,
tokenizer: Cow<'static, str>,
}

@@ -53,6 +54,7 @@ impl Default for TextFieldIndexing {
TextFieldIndexing {
tokenizer: Cow::Borrowed("default"),
record: IndexRecordOption::Basic,
fieldnorms: true,
}
}
}
@@ -69,6 +71,17 @@ impl TextFieldIndexing {
&self.tokenizer
}

/// Sets fieldnorms
pub fn set_fieldnorms(mut self, fieldnorms: bool) -> TextFieldIndexing {
self.fieldnorms = fieldnorms;
self
}

/// Returns true iff fieldnorms are stored.
pub fn fieldnorms(&self) -> bool {
self.fieldnorms
}

/// Sets which information should be indexed with the tokens.
///
/// See [IndexRecordOption](./enum.IndexRecordOption.html) for more detail.
@@ -89,6 +102,7 @@ impl TextFieldIndexing {
pub const STRING: TextOptions = TextOptions {
indexing: Some(TextFieldIndexing {
tokenizer: Cow::Borrowed("raw"),
fieldnorms: true,
record: IndexRecordOption::Basic,
}),
stored: false,
@@ -98,6 +112,7 @@ pub const STRING: TextOptions = TextOptions {
pub const TEXT: TextOptions = TextOptions {
indexing: Some(TextFieldIndexing {
tokenizer: Cow::Borrowed("default"),
fieldnorms: true,
record: IndexRecordOption::WithFreqsAndPositions,
}),
stored: false,

@@ -3,7 +3,7 @@ use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::{cmp::Ordering, fmt};
use std::fmt;

/// Value represents the value of a any field.
/// It is an enum over all over all of the possible field type.
@@ -21,53 +21,13 @@ pub enum Value {
F64(f64),
/// Signed 64-bits Date time stamp `date`
Date(DateTime),
/// Hierarchical Facet
/// Facet
Facet(Facet),
/// Arbitrarily sized byte array
Bytes(Vec<u8>),
}

impl Eq for Value {}
impl PartialOrd for Value {
fn partial_cmp(&self, other: &Value) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Value {
fn cmp(&self, other: &Self) -> Ordering {
match (self, other) {
(Value::Str(l), Value::Str(r)) => l.cmp(r),
(Value::PreTokStr(l), Value::PreTokStr(r)) => l.cmp(r),
(Value::U64(l), Value::U64(r)) => l.cmp(r),
(Value::I64(l), Value::I64(r)) => l.cmp(r),
(Value::Date(l), Value::Date(r)) => l.cmp(r),
(Value::Facet(l), Value::Facet(r)) => l.cmp(r),
(Value::Bytes(l), Value::Bytes(r)) => l.cmp(r),
(Value::F64(l), Value::F64(r)) => {
match (l.is_nan(), r.is_nan()) {
(false, false) => l.partial_cmp(r).unwrap(), // only fail on NaN
(true, true) => Ordering::Equal,
(true, false) => Ordering::Less, // we define NaN as less than -∞
(false, true) => Ordering::Greater,
}
}
(Value::Str(_), _) => Ordering::Less,
(_, Value::Str(_)) => Ordering::Greater,
(Value::PreTokStr(_), _) => Ordering::Less,
(_, Value::PreTokStr(_)) => Ordering::Greater,
(Value::U64(_), _) => Ordering::Less,
(_, Value::U64(_)) => Ordering::Greater,
(Value::I64(_), _) => Ordering::Less,
(_, Value::I64(_)) => Ordering::Greater,
(Value::F64(_), _) => Ordering::Less,
(_, Value::F64(_)) => Ordering::Greater,
(Value::Date(_), _) => Ordering::Less,
(_, Value::Date(_)) => Ordering::Greater,
(Value::Facet(_), _) => Ordering::Less,
(_, Value::Facet(_)) => Ordering::Greater,
}
}
}

impl Serialize for Value {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
@@ -137,19 +97,18 @@ impl Value {
}
}

/// Returns the path value, provided the value is of the `Facet` type.
/// Returns the facet value, provided the value is of the `Facet` type.
/// (Returns None if the value is not of the `Facet` type).
pub fn path(&self) -> Option<String> {
pub fn facet(&self) -> Option<&Facet> {
if let Value::Facet(facet) = self {
Some(facet.to_path_string())
Some(facet)
} else {
None
}
}

/// Returns the tokenized text, provided the value is of the `PreTokStr` type.
///
/// Returns None if the value is not of the `PreTokStr` type.
/// (Returns None if the value is not of the `PreTokStr` type.)
pub fn tokenized_text(&self) -> Option<&PreTokenizedString> {
if let Value::PreTokStr(tokenized_text) = self {
Some(tokenized_text)
@@ -159,8 +118,7 @@ impl Value {
}

/// Returns the u64-value, provided the value is of the `U64` type.
///
/// Returns None if the value is not of the `U64` type.
/// (Returns None if the value is not of the `U64` type)
pub fn u64_value(&self) -> Option<u64> {
if let Value::U64(val) = self {
Some(*val)

@@ -53,7 +53,7 @@ impl FragmentCandidate {
/// Contains a fragment of a document, and some highlighed parts inside it.
#[derive(Debug)]
pub struct Snippet {
fragments: String,
fragment: String,
highlighted: Vec<Range<usize>>,
}

@@ -64,7 +64,7 @@ impl Snippet {
/// Create a new, empty, `Snippet`
pub fn empty() -> Snippet {
Snippet {
fragments: String::new(),
fragment: String::new(),
highlighted: Vec::new(),
}
}
@@ -75,21 +75,21 @@ impl Snippet {
let mut start_from: usize = 0;

for item in self.highlighted.iter() {
html.push_str(&encode_minimal(&self.fragments[start_from..item.start]));
html.push_str(&encode_minimal(&self.fragment[start_from..item.start]));
html.push_str(HIGHLIGHTEN_PREFIX);
html.push_str(&encode_minimal(&self.fragments[item.clone()]));
html.push_str(&encode_minimal(&self.fragment[item.clone()]));
html.push_str(HIGHLIGHTEN_POSTFIX);
start_from = item.end;
}
html.push_str(&encode_minimal(
&self.fragments[start_from..self.fragments.len()],
&self.fragment[start_from..self.fragment.len()],
));
html
}

/// Returns a fragment from the `Snippet`.
pub fn fragments(&self) -> &str {
&self.fragments
/// Returns the fragment of text used in the snippet.
pub fn fragment(&self) -> &str {
&self.fragment
}

/// Returns a list of higlighted positions from the `Snippet`.
@@ -168,14 +168,14 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
.map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset)
.collect();
Snippet {
fragments: fragment_text.to_string(),
fragment: fragment_text.to_string(),
highlighted,
}
} else {
// when there no fragments to chose from,
// for now create a empty snippet
Snippet {
fragments: String::new(),
fragment: String::new(),
highlighted: vec![],
}
}
@@ -243,10 +243,15 @@ impl SnippetGenerator {
if term.field() != field {
continue;
}
let term_str = if let Some(term_str) = term.as_str() {
term_str
} else {
continue;
};
let doc_freq = searcher.doc_freq(&term)?;
if doc_freq > 0 {
let score = 1.0 / (1.0 + doc_freq as Score);
terms_text.insert(term.text().to_string(), score);
terms_text.insert(term_str.to_string(), score);
}
}
let tokenizer = searcher.index().tokenizer_for_field(field)?;
@@ -329,7 +334,7 @@ Survey in 2016, 2017, and 2018."#;
}
let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
assert_eq!(
snippet.fragments,
snippet.fragment,
"Rust is a systems programming language sponsored by\n\
Mozilla which describes it as a \"safe
);
@@ -391,7 +396,7 @@ Survey in 2016, 2017, and 2018."#;
}

let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragments, "c d");
assert_eq!(snippet.fragment, "c d");
assert_eq!(snippet.to_html(), "<b>c</b> d");
}

@@ -413,7 +418,7 @@ Survey in 2016, 2017, and 2018."#;
}

let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragments, "e f");
assert_eq!(snippet.fragment, "e f");
assert_eq!(snippet.to_html(), "e <b>f</b>");
}

@@ -436,7 +441,7 @@ Survey in 2016, 2017, and 2018."#;
}

let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragments, "e f g");
assert_eq!(snippet.fragment, "e f g");
assert_eq!(snippet.to_html(), "e <b>f</b> g");
}

@@ -452,7 +457,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(fragments.len(), 0);

let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragments, "");
assert_eq!(snippet.fragment, "");
assert_eq!(snippet.to_html(), "");
}

@@ -465,7 +470,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(fragments.len(), 0);

let snippet = select_best_fragment_combination(&fragments[..], text);
assert_eq!(snippet.fragments, "");
assert_eq!(snippet.fragment, "");
assert_eq!(snippet.to_html(), "");
}

@@ -2,8 +2,10 @@ use std::io;

#[inline]
pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
let mut params = brotli::enc::BrotliEncoderParams::default();
params.quality = 5;
let params = brotli::enc::BrotliEncoderParams {
quality: 5,
..Default::default()
};
compressed.clear();
brotli::BrotliCompress(&mut uncompressed, compressed, &params)?;
Ok(())

@@ -2,27 +2,30 @@ use std::io::{self};

use core::convert::TryInto;
use lz4_flex::{compress_into, decompress_into};
use std::mem;

#[inline]
#[allow(clippy::uninit_vec)]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let maximum_ouput_size = lz4_flex::block::get_maximum_output_size(uncompressed.len());
let maximum_ouput_size =
mem::size_of::<u32>() + lz4_flex::block::get_maximum_output_size(uncompressed.len());
compressed.reserve(maximum_ouput_size);

unsafe {
compressed.set_len(maximum_ouput_size + 4);
compressed.set_len(maximum_ouput_size);
}
let bytes_written = compress_into(uncompressed, &mut compressed[4..])
.map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?;
let num_bytes = uncompressed.len() as u32;
compressed[0..4].copy_from_slice(&num_bytes.to_le_bytes());
unsafe {
compressed.set_len(bytes_written + 4);
compressed.set_len(bytes_written + mem::size_of::<u32>());
}
Ok(())
}

#[inline]
#[allow(clippy::uninit_vec)]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
let uncompressed_size_bytes: &[u8; 4] = compressed

@@ -4,7 +4,7 @@ use std::io::{self, Read, Write};
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = snap::write::FrameEncoder::new(compressed);
encoder.write_all(&uncompressed)?;
encoder.write_all(uncompressed)?;
encoder.flush()?;
Ok(())
}

@@ -47,7 +47,7 @@ mod tests {

use crate::directory::OwnedBytes;
use crate::indexer::NoMergePolicy;
use crate::schema::{SchemaBuilder, STORED, STRING};
use crate::schema::{SchemaBuilder, STORED, TEXT};
use crate::store::index::Checkpoint;
use crate::{DocAddress, DocId, Index, Term};

@@ -128,7 +128,7 @@ mod tests {
#[test]
fn test_merge_store_with_stacking_reproducing_issue969() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default();
let text = schema_builder.add_text_field("text", STORED | STRING);
let text = schema_builder.add_text_field("text", STORED | TEXT);
let body = schema_builder.add_text_field("body", STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

@@ -19,9 +19,7 @@ impl<'a> Iterator for LayerCursor<'a> {
return None;
}
let (block_mut, remaining_mut) = (&mut self.block, &mut self.remaining);
if block_mut.deserialize(remaining_mut).is_err() {
return None;
}
block_mut.deserialize(remaining_mut).ok()?;
self.cursor = 0;
}
let res = Some(self.block.get(self.cursor));

@@ -259,11 +259,11 @@ pub mod tests {
let mut index_writer = index.writer_for_tests().unwrap();
// put enough data create enough blocks in the doc store to be considered for stacking
for _ in 0..200 {
index_writer.add_document(doc!(text_field=> LOREM));
index_writer.add_document(doc!(text_field=> LOREM))?;
}
assert!(index_writer.commit().is_ok());
for _ in 0..200 {
index_writer.add_document(doc!(text_field=> LOREM));
index_writer.add_document(doc!(text_field=> LOREM))?;
}
assert!(index_writer.commit().is_ok());
}

@@ -234,12 +234,12 @@ impl TermInfoStoreWriter {
};

term_info_block_meta.serialize(&mut self.buffer_block_metas)?;
for term_info in self.term_infos[1..].iter().cloned() {
for term_info in &self.term_infos[1..] {
bitpack_serialize(
&mut self.buffer_term_infos,
&mut bit_packer,
&term_info_block_meta,
&term_info,
term_info,
)?;
}

@@ -10,7 +10,7 @@ use tantivy_fst::Streamer;
/// Given a list of sorted term streams,
/// returns an iterator over sorted unique terms.
///
/// The item yield is actually a pair with
/// The item yielded is actually a pair with
/// - the term
/// - a slice with the ordinal of the segments containing
/// the term.
@@ -39,6 +39,11 @@ impl<'a> TermMerger<'a> {
}
}

/// Iterator over (segment ordinal, [TermOrdinal]) sorted by segment ordinal
///
/// This method may be called
/// if [Self::advance] has been called before
/// and `true` was returned.
pub fn matching_segments<'b: 'a>(&'b self) -> impl 'b + Iterator<Item = (usize, TermOrdinal)> {
self.current_segment_and_term_ordinals
.iter()
@@ -46,8 +51,8 @@ impl<'a> TermMerger<'a> {
}

/// Advance the term iterator to the next term.
/// Returns true if there is indeed another term
/// False if there is none.
/// Returns `true` if there is indeed another term
/// `false` if there is none.
pub fn advance(&mut self) -> bool {
if let Some((k, values)) = self.union.next() {
self.current_key.clear();
@@ -66,17 +71,17 @@ impl<'a> TermMerger<'a> {
/// Returns the current term.
///
/// This method may be called
/// iff advance() has been called before
/// and "true" was returned.
/// if [Self::advance] has been called before
/// and `true` was returned.
pub fn key(&self) -> &[u8] {
&self.current_key
}

/// Iterator over (segment ordinal, TermInfo) pairs iterator sorted by the ordinal.
/// Iterator over (segment ordinal, [TermInfo]) pairs iterator sorted by the ordinal.
///
/// This method may be called
/// iff advance() has been called before
/// and "true" was returned.
/// if [Self::advance] has been called before
/// and `true` was returned.
pub fn current_segment_ords_and_term_infos<'b: 'a>(
&'b self,
) -> impl 'b + Iterator<Item = (usize, TermInfo)> {

@@ -6,7 +6,7 @@ about the term.
Internally, the term dictionary relies on the `fst` crate to store
a sorted mapping that associate each term to its rank in the lexicographical order.
For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan",
the `TermOrdinal` are respectively `0`, `1`, `2`, and `3`.
the [TermOrdinal] are respectively `0`, `1`, `2`, and `3`.

For `u64`-terms, tantivy explicitely uses a `BigEndian` representation to ensure that the
lexicographical order matches the natural order of integers.
@@ -20,37 +20,18 @@ as `u64`.
A second datastructure makes it possible to access a [`TermInfo`](../postings/struct.TermInfo.html).
*/

use tantivy_fst::automaton::AlwaysMatch;

mod fst_termdict;
use fst_termdict as termdict;

mod merger;

#[cfg(test)]
mod tests;
pub use self::merger::TermMerger;
pub use self::termdict::TermDictionary;
pub use self::termdict::TermDictionaryBuilder;
pub use self::termdict::TermStreamer;

/// Position of the term in the sorted list of terms.
pub type TermOrdinal = u64;

/// The term dictionary contains all of the terms in
/// `tantivy index` in a sorted manner.
pub type TermDictionary = self::termdict::TermDictionary;

/// Builder for the new term dictionary.
///
/// Inserting must be done in the order of the `keys`.
pub type TermDictionaryBuilder<W> = self::termdict::TermDictionaryBuilder<W>;

/// Given a list of sorted term streams,
/// returns an iterator over sorted unique terms.
///
/// The item yield is actually a pair with
/// - the term
/// - a slice with the ordinal of the segments containing
/// the terms.
pub type TermMerger<'a> = self::merger::TermMerger<'a>;

/// `TermStreamer` acts as a cursor over a range of terms of a segment.
/// Terms are guaranteed to be sorted.
pub type TermStreamer<'a, A = AlwaysMatch> = self::termdict::TermStreamer<'a, A>;
#[cfg(test)]
mod tests;

@@ -157,8 +157,8 @@ pub use self::tokenizer_manager::TokenizerManager;
///
/// Tokenizer are in charge of not emitting tokens larger than this value.
/// Currently, if a faulty tokenizer implementation emits tokens with a length larger than
/// `2^16 - 1 - 4`, the token will simply be ignored downstream.
pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
/// `2^16 - 1 - 5`, the token will simply be ignored downstream.
pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 5;

#[cfg(test)]
pub mod tests {

@@ -29,7 +29,7 @@ impl<'a> SimpleTokenStream<'a> {
.filter(|&(_, ref c)| !c.is_alphanumeric())
.map(|(offset, _)| offset)
.next()
.unwrap_or_else(|| self.text.len())
.unwrap_or(self.text.len())
}
}

@@ -29,7 +29,7 @@ impl<'a> WhitespaceTokenStream<'a> {
.filter(|&(_, ref c)| c.is_ascii_whitespace())
.map(|(offset, _)| offset)
.next()
.unwrap_or_else(|| self.text.len())
.unwrap_or(self.text.len())
}
}
