Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-30 14:02:55 +00:00)
Compare commits
11 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 97227a2938 | |
| | 6a8a8557d2 | |
| | 3a65dc84c8 | |
| | ce42bbf5c9 | |
| | 7b21b3f25a | |
| | 46caec1040 | |
| | 1187a02a3e | |
| | f6c525b19e | |
| | 4a8f7712f3 | |
| | 2f867aad17 | |
| | 5c6580eb15 | |
@@ -9,7 +9,9 @@ Tantivy 0.11.0
- API change around `Box<BoxableTokenizer>`. See detail in #629
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
- Fix crash when committing multiple times with deleted documents. #681 (@brainlock)

## How to update?

- `Box<dyn BoxableTokenizer>` has been replaced by a `BoxedTokenizer` struct.
@@ -13,7 +13,7 @@ keywords = ["search", "information", "retrieval"]
edition = "2018"

[dependencies]
base64 = "0.10.0"
base64 = "0.11.0"
byteorder = "1.0"
crc32fast = "1.2.0"
once_cell = "1.0"

@@ -34,7 +34,7 @@ itertools = "0.8"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
notify = {version="4", optional=true}
bit-set = "0.5"
uuid = { version = "0.7.2", features = ["v4", "serde"] }
uuid = { version = "0.8", features = ["v4", "serde"] }
crossbeam = "0.7"
futures = "0.1"
futures-cpupool = "0.1"
README.md (57 changed lines)
@@ -21,9 +21,9 @@
[](https://www.patreon.com/fulmicoton)

**Tantivy** is a **full text search engine library** written in rust.
**Tantivy** is a **full text search engine library** written in Rust.

It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
to build such a search engine.

@@ -31,7 +31,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design.

# Benchmark

Tantivy is typically faster than Lucene, but the results will depend on
Tantivy is typically faster than Lucene, but the results depend on
the nature of the queries in your workload.

The following [benchmark](https://tantivy-search.github.io/bench/) break downs
@@ -40,19 +40,19 @@ performance for different type of queries / collection.

# Features

- Full-text search
- Configurable tokenizer. (stemming available for 17 latin languages. Third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)) and [Japanese](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)
- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)) and [Japanese](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter))
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
- Tiny startup time (<10ms), perfect for command line tools
- BM25 scoring (the same as lucene)
- Natural query language `(michael AND jackson) OR "king of pop"`
- Phrase queries search (`"michael jackson"`)
- BM25 scoring (the same as Lucene)
- Natural query language (e.g. `(michael AND jackson) OR "king of pop"`)
- Phrase queries search (e.g. `"michael jackson"`)
- Incremental indexing
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
- Mmap directory
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
- Single valued and multivalued u64, i64 and f64 fast fields (equivalent of doc values in Lucene)
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set
- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, f64, dates and hierarchical facet fields
- Text, i64, u64, f64, dates, and hierarchical facet fields
- LZ4 compressed document store
- Range queries
- Faceted search
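As an illustration (this snippet is not part of the README diff), the query-language and BM25 bullets above come together through tantivy's `QueryParser`. The sketch below is a rough, hedged example against an in-RAM index: the field name, document text and 3 MB writer budget are invented, and the parser calls should be checked against the linked examples rather than taken as canonical.

```rust
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    // One indexed text field is enough to demo the query syntax.
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(3_000_000)?;
    index_writer.add_document(doc!(title => "Michael Jackson, king of pop"));
    index_writer.commit()?;

    // The natural query language advertised in the feature list.
    let query_parser = QueryParser::for_index(&index, vec![title]);
    let query = query_parser
        .parse_query(r#"(michael AND jackson) OR "king of pop""#)
        .expect("a valid query");

    let searcher = index.reader()?.searcher();
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    assert!(!top_docs.is_empty());
    Ok(())
}
```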
@@ -61,43 +61,42 @@ performance for different type of queries / collection.

# Non-features

- Distributed search is out of the scope of tantivy. That being said, tantivy is meant as a
- Distributed search is out of the scope of Tantivy. That being said, Tantivy is a
library upon which one could build a distributed search. Serializable/mergeable collector state for instance,
are within the scope of tantivy.
are within the scope of Tantivy.

# Supported OS and compiler

Tantivy works on stable rust (>= 1.27) and supports Linux, MacOS and Windows.
Tantivy works on stable Rust (>= 1.27) and supports Linux, MacOS, and Windows.

# Getting started

- [tantivy's simple search example](https://tantivy-search.github.io/examples/basic_search.html)
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli).
`tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
index documents and search via the CLI or a small server with a REST API.
It will walk you through getting a wikipedia search engine up and running in a few minutes.
- [reference doc for the last released version](https://docs.rs/tantivy/)
- [Tantivy's simple search example](https://tantivy-search.github.io/examples/basic_search.html)
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli) - `tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
index documents, and search via the CLI or a small server with a REST API.
It walks you through getting a wikipedia search engine up and running in a few minutes.
- [Reference doc for the last released version](https://docs.rs/tantivy/)

# How can I support this project?

There are many ways to support this project.

- Use tantivy and tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
- Use Tantivy and tell us about your experience on [Gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
- Report bugs
- Write a blog post
- Help with documentation by asking questions or submitting PRs
- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
- Talk about tantivy around you
- Contribute code (you can join [our Gitter](https://gitter.im/tantivy-search/tantivy))
- Talk about Tantivy around you
- Drop a word on on [](https://saythanks.io/to/fulmicoton) or even [](https://www.patreon.com/fulmicoton)

# Contributing code

We use the GitHub Pull Request workflow - reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.
We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.

## Clone and build locally

Tantivy compiles on stable rust but requires `Rust >= 1.27`.
To check out and run tests, you can simply run :
Tantivy compiles on stable Rust but requires `Rust >= 1.27`.
To check out and run tests, you can simply run:

```bash
git clone https://github.com/tantivy-search/tantivy.git
@@ -108,7 +107,7 @@ To check out and run tests, you can simply run :
## Run tests

Some tests will not run with just `cargo test` because of `fail-rs`.
To run the tests exhaustively, run `./run-tests.sh`
To run the tests exhaustively, run `./run-tests.sh`.

## Debug

@@ -116,13 +115,13 @@ You might find it useful to step through the programme with a debugger.

### A failing test

Make sure you haven't run `cargo clean` after the most recent `cargo test` or `cargo build` to guarantee that `target/` dir exists. Use this bash script to find the most name of the most recent debug build of tantivy and run it under rust-gdb.
Make sure you haven't run `cargo clean` after the most recent `cargo test` or `cargo build` to guarantee that the `target/` directory exists. Use this bash script to find the name of the most recent debug build of Tantivy and run it under `rust-gdb`:

```bash
find target/debug/ -maxdepth 1 -executable -type f -name "tantivy*" -printf '%TY-%Tm-%Td %TT %p\n' | sort -r | cut -d " " -f 3 | xargs -I RECENT_DBG_TANTIVY rust-gdb RECENT_DBG_TANTIVY
```

Now that you are in rust-gdb, you can set breakpoints on lines and methods that match your source-code and run the debug executable with flags that you normally pass to `cargo test` to like this
Now that you are in `rust-gdb`, you can set breakpoints on lines and methods that match your source code and run the debug executable with flags that you normally pass to `cargo test` like this:

```bash
$gdb run --test-threads 1 --test $NAME_OF_TEST

@@ -130,7 +129,7 @@ $gdb run --test-threads 1 --test $NAME_OF_TEST

### An example

By default, rustc compiles everything in the `examples/` dir in debug mode. This makes it easy for you to make examples to reproduce bugs.
By default, `rustc` compiles everything in the `examples/` directory in debug mode. This makes it easy for you to make examples to reproduce bugs:

```bash
rust-gdb target/debug/examples/$EXAMPLE_NAME
@@ -2,7 +2,7 @@ use std::fmt;
use std::fmt::Write;

/// Defines whether a term in a query must be present,
/// should be present or must not be present.
/// should be present or must be not present.
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
pub enum Occur {
/// For a given document to be considered for scoring,
@@ -515,7 +515,7 @@ mod tests {
#[should_panic(expected = "Tried to add a facet which is a descendant of \
an already added facet.")]
fn test_misused_facet_collector() {
let mut facet_collector = FacetCollector::for_field(Field(0));
let mut facet_collector = FacetCollector::for_field(Field::from_field_id(0));
facet_collector.add_facet(Facet::from("/country"));
facet_collector.add_facet(Facet::from("/country/europe"));
}

@@ -546,7 +546,7 @@ mod tests {

#[test]
fn test_non_used_facet_collector() {
let mut facet_collector = FacetCollector::for_field(Field(0));
let mut facet_collector = FacetCollector::for_field(Field::from_field_id(0));
facet_collector.add_facet(Facet::from("/country"));
facet_collector.add_facet(Facet::from("/countryeurope"));
}
@@ -12,6 +12,9 @@ use std::collections::BinaryHeap;
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
/// default Rust heap is a max heap, whereas a min heap is needed.
///
/// Additionally, it guarantees stable sorting: in case of a tie on the feature, the document
/// address is used.
///
/// WARNING: equality is not what you would expect here.
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
/// is equal. This should be perfectly fine for this usage, but let's make sure this

@@ -21,29 +24,37 @@ struct ComparableDoc<T, D> {
doc: D,
}

impl<T: PartialOrd, D> PartialOrd for ComparableDoc<T, D> {
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}

impl<T: PartialOrd, D> Ord for ComparableDoc<T, D> {
impl<T: PartialOrd, D: PartialOrd> Ord for ComparableDoc<T, D> {
#[inline]
fn cmp(&self, other: &Self) -> Ordering {
other
// Reversed to make BinaryHeap work as a min-heap
let by_feature = other
.feature
.partial_cmp(&self.feature)
.unwrap_or_else(|| Ordering::Equal)
.unwrap_or(Ordering::Equal);

let lazy_by_doc_address = || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal);

// In case of a tie on the feature, we sort by ascending
// `DocAddress` in order to ensure a stable sorting of the
// documents.
by_feature.then_with(lazy_by_doc_address)
}
}

impl<T: PartialOrd, D> PartialEq for ComparableDoc<T, D> {
impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == Ordering::Equal
}
}

impl<T: PartialOrd, D> Eq for ComparableDoc<T, D> {}
impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}

pub(crate) struct TopCollector<T> {
limit: usize,
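The reversed comparison in the hunk above is what lets `BinaryHeap`, a max-heap, keep the current top-K with its worst element on top, while the `DocAddress` fallback is what makes equal scores come out in a deterministic order. The following standalone sketch (standard library only; it mirrors, but is not, tantivy's `ComparableDoc`) shows the same idea end to end; the `Entry`/`top_k` names are invented for the example.

```rust
use std::cmp::Ordering;
use std::collections::BinaryHeap;

// A (score, doc) pair whose ordering is reversed on the score so that the
// heap's top element is the *worst* of the currently kept hits.
#[derive(Debug, PartialEq)]
struct Entry {
    score: f32,
    doc: u32,
}

impl Eq for Entry {}

impl PartialOrd for Entry {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Entry {
    fn cmp(&self, other: &Self) -> Ordering {
        // Reversed on the score (min-heap behaviour), then tie-broken by
        // ascending doc id so that equal scores keep a deterministic order.
        other
            .score
            .partial_cmp(&self.score)
            .unwrap_or(Ordering::Equal)
            .then_with(|| self.doc.cmp(&other.doc))
    }
}

/// Keep the k best (score, doc) pairs, best first.
fn top_k(hits: impl IntoIterator<Item = (f32, u32)>, k: usize) -> Vec<(f32, u32)> {
    let mut heap: BinaryHeap<Entry> = BinaryHeap::with_capacity(k);
    for (score, doc) in hits {
        let candidate = Entry { score, doc };
        if heap.len() < k {
            heap.push(candidate);
        } else if let Some(mut worst) = heap.peek_mut() {
            // Replace the current worst entry only if the candidate beats it.
            if candidate < *worst {
                *worst = candidate;
            }
        }
    }
    // Ascending by this Ord means "best first", because the order is reversed.
    heap.into_sorted_vec()
        .into_iter()
        .map(|entry| (entry.score, entry.doc))
        .collect()
}

fn main() {
    // All scores equal: doc ids break the tie, so top-2 is a prefix of top-3.
    let hits = [(3.14f32, 4u32), (3.14, 5), (3.14, 6)];
    assert_eq!(top_k(hits.iter().copied(), 2), vec![(3.14, 4), (3.14, 5)]);
    assert_eq!(
        top_k(hits.iter().copied(), 3)[..2],
        top_k(hits.iter().copied(), 2)[..]
    );
}
```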
@@ -214,4 +225,94 @@ mod tests {
]
);
}

#[test]
fn test_top_segment_collector_stable_ordering_for_equal_feature() {
// given that the documents are collected in ascending doc id order,
// when harvesting we have to guarantee stable sorting in case of a tie
// on the score
let doc_ids_collection = [4, 5, 6];
let score = 3.14;

let mut top_collector_limit_2 = TopSegmentCollector::new(0, 2);
for id in &doc_ids_collection {
top_collector_limit_2.collect(*id, score);
}

let mut top_collector_limit_3 = TopSegmentCollector::new(0, 3);
for id in &doc_ids_collection {
top_collector_limit_3.collect(*id, score);
}

assert_eq!(
top_collector_limit_2.harvest(),
top_collector_limit_3.harvest()[..2].to_vec(),
);
}
}

#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::TopSegmentCollector;
use test::Bencher;

#[bench]
fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) {
let mut top_collector = TopSegmentCollector::new(0, 400);

b.iter(|| {
for i in 0..100 {
top_collector.collect(i, 0.8);
}
});
}

#[bench]
fn bench_top_segment_collector_collect_at_capacity(b: &mut Bencher) {
let mut top_collector = TopSegmentCollector::new(0, 100);

for i in 0..100 {
top_collector.collect(i, 0.8);
}

b.iter(|| {
for i in 0..100 {
top_collector.collect(i, 0.8);
}
});
}

#[bench]
fn bench_top_segment_collector_collect_and_harvest_many_ties(b: &mut Bencher) {
b.iter(|| {
let mut top_collector = TopSegmentCollector::new(0, 100);

for i in 0..100 {
top_collector.collect(i, 0.8);
}

// it would be nice to be able to do the setup N times but still
// measure only harvest(). We can't since harvest() consumes
// the top_collector.
top_collector.harvest()
});
}

#[bench]
fn bench_top_segment_collector_collect_and_harvest_no_tie(b: &mut Bencher) {
b.iter(|| {
let mut top_collector = TopSegmentCollector::new(0, 100);
let mut score = 1.0;

for i in 0..100 {
score += 1.0;
top_collector.collect(i, score);
}

// it would be nice to be able to do the setup N times but still
// measure only harvest(). We can't since harvest() consumes
// the top_collector.
top_collector.harvest()
});
}
}
@@ -15,13 +15,16 @@ use crate::SegmentLocalId;
use crate::SegmentReader;
use std::fmt;

/// The Top Score Collector keeps track of the K documents
/// The `TopDocs` collector keeps track of the top `K` documents
/// sorted by their score.
///
/// The implementation is based on a `BinaryHeap`.
/// The theorical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
///
/// This collector guarantees a stable sorting in case of a tie on the
/// document score. As such, it is suitable to implement pagination.
///
/// ```rust
/// use tantivy::collector::TopDocs;
/// use tantivy::query::QueryParser;

@@ -428,12 +431,13 @@ impl SegmentCollector for TopScoreSegmentCollector {
mod tests {
use super::TopDocs;
use crate::collector::Collector;
use crate::query::{Query, QueryParser};
use crate::query::{AllQuery, Query, QueryParser};
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::DocAddress;
use crate::Index;
use crate::IndexWriter;
use crate::Score;
use itertools::Itertools;

fn make_index() -> Index {
let mut schema_builder = Schema::builder();

@@ -494,6 +498,29 @@ mod tests {
);
}

#[test]
fn test_top_collector_stable_sorting() {
let index = make_index();

// using AllQuery to get a constant score
let searcher = index.reader().unwrap().searcher();

let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap();

let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap();

// precondition for the test to be meaningful: we did get documents
// with the same score
assert!(page_1.iter().map(|result| result.0).all_equal());
assert!(page_2.iter().map(|result| result.0).all_equal());

// sanity check since we're relying on make_index()
assert_eq!(page_1.len(), 2);
assert_eq!(page_2.len(), 3);

assert_eq!(page_1, &page_2[..page_1.len()]);
}

#[test]
#[should_panic]
fn test_top_0() {

@@ -551,7 +578,7 @@ mod tests {
));
});
let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field(2));
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
let segment_reader = searcher.segment_reader(0u32);
top_collector
.for_segment(0, segment_reader)
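The new `test_top_collector_stable_sorting` above is the property that makes paging safe: with ties broken by `DocAddress`, a smaller limit is always a prefix of a larger one. Below is a hedged sketch of pagination built only on calls visible in this diff; `fetch_page` is an invented helper, and it over-fetches and slices because this version of `TopDocs` appears to take only a limit, not an offset.

```rust
use tantivy::collector::TopDocs;
use tantivy::query::AllQuery;
use tantivy::{DocAddress, Index, Score};

/// Fetch 0-based page `page` of `page_size` hits by asking for
/// `(page + 1) * page_size` results and keeping only the last slice.
/// Pages stay consistent across calls because equal scores are now
/// ordered by DocAddress.
fn fetch_page(
    index: &Index,
    page: usize,
    page_size: usize,
) -> tantivy::Result<Vec<(Score, DocAddress)>> {
    let searcher = index.reader()?.searcher();
    let limit = (page + 1) * page_size;
    let mut hits = searcher.search(&AllQuery, &TopDocs::with_limit(limit))?;
    let start = (page * page_size).min(hits.len());
    Ok(hits.split_off(start))
}
```

Any real query can stand in for `AllQuery`; it is used here only because it gives every document the same score, which is exactly the tie case the new code handles.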
@@ -199,13 +199,13 @@ mod test {
let w = directory.open_write(path).unwrap();
let mut composite_write = CompositeWrite::wrap(w);
{
let mut write_0 = composite_write.for_field(Field(0u32));
let mut write_0 = composite_write.for_field(Field::from_field_id(0u32));
VInt(32431123u64).serialize(&mut write_0).unwrap();
write_0.flush().unwrap();
}

{
let mut write_4 = composite_write.for_field(Field(4u32));
let mut write_4 = composite_write.for_field(Field::from_field_id(4u32));
VInt(2).serialize(&mut write_4).unwrap();
write_4.flush().unwrap();
}

@@ -215,14 +215,18 @@ mod test {
let r = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&r).unwrap();
{
let file0 = composite_file.open_read(Field(0u32)).unwrap();
let file0 = composite_file
.open_read(Field::from_field_id(0u32))
.unwrap();
let mut file0_buf = file0.as_slice();
let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0;
assert_eq!(file0_buf.len(), 0);
assert_eq!(payload_0, 32431123u64);
}
{
let file4 = composite_file.open_read(Field(4u32)).unwrap();
let file4 = composite_file
.open_read(Field::from_field_id(4u32))
.unwrap();
let mut file4_buf = file4.as_slice();
let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0;
assert_eq!(file4_buf.len(), 0);
@@ -150,6 +150,21 @@ impl SegmentMeta {
self.num_deleted_docs() > 0
}

/// Updates the max_doc value from the `SegmentMeta`.
///
/// This method is only used when updating `max_doc` from 0
/// as we finalize a fresh new segment.
pub(crate) fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
assert_eq!(self.tracked.max_doc, 0);
assert!(self.tracked.deletes.is_none());
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
segment_id: inner_meta.segment_id,
max_doc,
deletes: None,
});
SegmentMeta { tracked }
}

#[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
let delete_meta = DeleteMeta {
@@ -50,6 +50,17 @@ impl Segment {
&self.meta
}

/// Updates the max_doc value from the `SegmentMeta`.
///
/// This method is only used when updating `max_doc` from 0
/// as we finalize a fresh new segment.
pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment {
Segment {
index: self.index,
meta: self.meta.with_max_doc(max_doc),
}
}

#[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
Segment {
@@ -76,7 +76,7 @@ impl SegmentId {
}

/// Error type used when parsing a `SegmentId` from a string fails.
pub struct SegmentIdParseError(uuid::parser::ParseError);
pub struct SegmentIdParseError(uuid::Error);

impl Error for SegmentIdParseError {}
@@ -327,8 +327,7 @@ mod tests_mmap_specific {
.unwrap();
assert!(managed_directory.exists(test_path1));
assert!(managed_directory.exists(test_path2));
let living_files: HashSet<PathBuf> =
[test_path1.to_owned()].into_iter().cloned().collect();
let living_files: HashSet<PathBuf> = [test_path1.to_owned()].iter().cloned().collect();
managed_directory.garbage_collect(|| living_files);
assert!(managed_directory.exists(test_path1));
assert!(!managed_directory.exists(test_path2));
@@ -10,11 +10,14 @@ use std::io::Write;
/// Write a delete `BitSet`
///
/// where `delete_bitset` is the set of deleted `DocId`.
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> {
let max_doc = delete_bitset.capacity();
pub fn write_delete_bitset(
delete_bitset: &BitSet,
max_doc: u32,
writer: &mut WritePtr,
) -> io::Result<()> {
let mut byte = 0u8;
let mut shift = 0u8;
for doc in 0..max_doc {
for doc in 0..(max_doc as usize) {
if delete_bitset.contains(doc) {
byte |= 1 << shift;
}

@@ -86,18 +89,17 @@ mod tests {
use bit_set::BitSet;
use std::path::PathBuf;

fn test_delete_bitset_helper(bitset: &BitSet) {
fn test_delete_bitset_helper(bitset: &BitSet, max_doc: u32) {
let test_path = PathBuf::from("test");
let mut directory = RAMDirectory::create();
{
let mut writer = directory.open_write(&*test_path).unwrap();
write_delete_bitset(bitset, &mut writer).unwrap();
write_delete_bitset(bitset, max_doc, &mut writer).unwrap();
}
{
let source = directory.open_read(&test_path).unwrap();
let delete_bitset = DeleteBitSet::open(source);
let n = bitset.capacity();
for doc in 0..n {
for doc in 0..max_doc as usize {
assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId));
}
assert_eq!(delete_bitset.len(), bitset.len());

@@ -110,7 +112,7 @@ mod tests {
let mut bitset = BitSet::with_capacity(10);
bitset.insert(1);
bitset.insert(9);
test_delete_bitset_helper(&bitset);
test_delete_bitset_helper(&bitset, 10);
}
{
let mut bitset = BitSet::with_capacity(8);

@@ -119,7 +121,7 @@ mod tests {
bitset.insert(3);
bitset.insert(5);
bitset.insert(7);
test_delete_bitset_helper(&bitset);
test_delete_bitset_helper(&bitset, 8);
}
}
}
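The new `max_doc` parameter exists because the old code derived the document count from `BitSet::capacity()`, which reflects the allocated blocks rather than the number of documents, so the serialized bitset could disagree with the segment's `max_doc` (the regression test's "at least 32 documents" comment points at the same block-size boundary). The packing itself is plain LSB-first bit packing; the sketch below is a standalone illustration with `Vec<bool>` standing in for the bitset and an invented function name, not the library's code.

```rust
use std::io::{self, Write};

/// Pack one bit per document, least-significant bit first, emitting a byte
/// every 8 documents. `max_doc` (not the container's capacity) decides how
/// many bits are meaningful, mirroring the new write_delete_bitset signature.
fn write_delete_bits(deleted: &[bool], max_doc: u32, writer: &mut impl Write) -> io::Result<()> {
    let mut byte = 0u8;
    let mut shift = 0u8;
    for doc in 0..(max_doc as usize) {
        if deleted.get(doc).copied().unwrap_or(false) {
            byte |= 1 << shift;
        }
        if shift == 7 {
            writer.write_all(&[byte])?;
            byte = 0;
            shift = 0;
        } else {
            shift += 1;
        }
    }
    // Flush the trailing partial byte, if any.
    if max_doc % 8 != 0 {
        writer.write_all(&[byte])?;
    }
    Ok(())
}

fn main() -> io::Result<()> {
    // Docs 1 and 9 deleted out of max_doc = 10.
    let mut deleted = vec![false; 10];
    deleted[1] = true;
    deleted[9] = true;
    let mut buf = Vec::new();
    write_delete_bits(&deleted, 10, &mut buf)?;
    // Bit 1 of the first byte (doc 1), bit 1 of the second byte (doc 9).
    assert_eq!(buf, vec![0b0000_0010, 0b0000_0010]);
    Ok(())
}
```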
@@ -59,8 +59,7 @@ impl FastFieldReaders {
fast_bytes: Default::default(),
fast_fields_composite: fast_fields_composite.clone(),
};
for (field_id, field_entry) in schema.fields().iter().enumerate() {
let field = Field(field_id as u32);
for (field, field_entry) in schema.fields() {
let field_type = field_entry.field_type();
if field_type == &FieldType::Bytes {
let idx_reader = fast_fields_composite
@@ -24,8 +24,7 @@ impl FastFieldsWriter {
let mut multi_values_writers = Vec::new();
let mut bytes_value_writers = Vec::new();

for (field_id, field_entry) in schema.fields().iter().enumerate() {
let field = Field(field_id as u32);
for (field, field_entry) in schema.fields() {
let default_value = match *field_entry.field_type() {
FieldType::I64(_) => common::i64_to_u64(0i64),
FieldType::F64(_) => common::f64_to_u64(0.0f64),
@@ -22,11 +22,14 @@ impl FieldNormsWriter {
pub(crate) fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
schema
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field, _)| Field(field as u32))
.collect::<Vec<Field>>()
.filter_map(|(field, field_entry)| {
if field_entry.is_indexed() {
Some(field)
} else {
None
}
})
.collect::<Vec<_>>()
}

/// Initialize with state for tracking the field norm fields

@@ -35,7 +38,7 @@ impl FieldNormsWriter {
let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
let max_field = fields
.iter()
.map(|field| field.0)
.map(Field::field_id)
.max()
.map(|max_field_id| max_field_id as usize + 1)
.unwrap_or(0);

@@ -50,8 +53,8 @@ impl FieldNormsWriter {
///
/// Will extend with 0-bytes for documents that have not been seen.
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
for &field in self.fields.iter() {
self.fieldnorms_buffer[field.0 as usize].resize(max_doc as usize, 0u8);
for field in self.fields.iter() {
self.fieldnorms_buffer[field.field_id() as usize].resize(max_doc as usize, 0u8);
}
}

@@ -64,7 +67,7 @@ impl FieldNormsWriter {
/// * field - the field being set
/// * fieldnorm - the number of terms present in document `doc` in field `field`
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.0 as usize];
let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.field_id() as usize];
assert!(
fieldnorm_buffer.len() <= doc as usize,
"Cannot register a given fieldnorm twice"

@@ -77,7 +80,7 @@ impl FieldNormsWriter {
/// Serialize the seen fieldnorm values to the serializer for all fields.
pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
for &field in self.fields.iter() {
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.0 as usize][..];
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
}
Ok(())
@@ -258,7 +258,7 @@ mod tests {
let delete_queue = DeleteQueue::new();

let make_op = |i: usize| {
let field = Field(1u32);
let field = Field::from_field_id(1u32);
DeleteOperation {
opstamp: i as u64,
term: Term::from_field_u64(field, i as u64),
@@ -148,7 +148,6 @@ pub(crate) fn advance_deletes(
};

let delete_cursor = segment_entry.delete_cursor();

compute_deleted_bitset(
&mut delete_bitset,
&segment_reader,

@@ -168,7 +167,7 @@ pub(crate) fn advance_deletes(
if num_deleted_docs > 0 {
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
write_delete_bitset(&delete_bitset, &mut delete_file)?;
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
delete_file.terminate()?;
}
}

@@ -178,13 +177,13 @@ pub(crate) fn advance_deletes(

fn index_documents(
memory_budget: usize,
segment: &Segment,
segment: Segment,
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> Result<bool> {
let schema = segment.schema();
let segment_id = segment.id();

let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
for document_group in grouped_document_iterator {
for doc in document_group {

@@ -204,21 +203,30 @@ fn index_documents(
return Ok(false);
}

let num_docs = segment_writer.max_doc();
let max_doc = segment_writer.max_doc();

// this is ensured by the call to peek before starting
// the worker thread.
assert!(num_docs > 0);
assert!(max_doc > 0);

let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
let segment_meta = segment.index().new_segment_meta(segment_id, num_docs);

let segment_with_max_doc = segment.with_max_doc(max_doc);

let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());

let delete_bitset_opt =
apply_deletes(&segment, &mut delete_cursor, &doc_opstamps, last_docstamp)?;
let delete_bitset_opt = apply_deletes(
&segment_with_max_doc,
&mut delete_cursor,
&doc_opstamps,
last_docstamp,
)?;

let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
let segment_entry = SegmentEntry::new(
segment_with_max_doc.meta().clone(),
delete_cursor,
delete_bitset_opt,
);
Ok(segment_updater.add_segment(segment_entry))
}

@@ -235,7 +243,9 @@ fn apply_deletes(
}
let segment_reader = SegmentReader::open(segment)?;
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let mut deleted_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize);

let max_doc = segment.meta().max_doc();
let mut deleted_bitset = BitSet::with_capacity(max_doc as usize);
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,

@@ -407,7 +417,7 @@ impl IndexWriter {
let segment = index.new_segment();
index_documents(
mem_budget,
&segment,
segment,
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone(),
@@ -190,8 +190,7 @@ impl IndexMerger {
fast_field_serializer: &mut FastFieldSerializer,
mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
) -> Result<()> {
for (field_id, field_entry) in self.schema.fields().iter().enumerate() {
let field = Field(field_id as u32);
for (field, field_entry) in self.schema.fields() {
let field_type = field_entry.field_type();
match *field_type {
FieldType::HierarchicalFacet => {

@@ -649,15 +648,12 @@ impl IndexMerger {
serializer: &mut InvertedIndexSerializer,
) -> Result<HashMap<Field, TermOrdinalMapping>> {
let mut term_ordinal_mappings = HashMap::new();
for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
for (field, field_entry) in self.schema.fields() {
if field_entry.is_indexed() {
let indexed_field = Field(field_ord as u32);
if let Some(term_ordinal_mapping) = self.write_postings_for_field(
indexed_field,
field_entry.field_type(),
serializer,
)? {
term_ordinal_mappings.insert(indexed_field, term_ordinal_mapping);
if let Some(term_ordinal_mapping) =
self.write_postings_for_field(field, field_entry.field_type(), serializer)?
{
term_ordinal_mappings.insert(field, term_ordinal_mapping);
}
}
}
@@ -28,3 +28,25 @@ pub use self::segment_writer::SegmentWriter;

/// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy;

#[cfg(test)]
mod tests {
use crate::schema::{self, Schema};
use crate::{Index, Term};
#[test]
fn test_advance_delete_bug() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"));
index_writer.delete_term(Term::from_field_text(text_field, "b"));
// we need enough data to trigger the bug (at least 32 documents)
for _ in 0..32 {
index_writer.add_document(doc!(text_field=>"c"));
}
index_writer.commit().unwrap();
index_writer.commit().unwrap();
}
}
@@ -6,11 +6,11 @@ use crate::fieldnorm::FieldNormsWriter;
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::postings::compute_table_size;
use crate::postings::MultiFieldPostingsWriter;
use crate::schema::FieldEntry;
use crate::schema::FieldType;
use crate::schema::Schema;
use crate::schema::Term;
use crate::schema::Value;
use crate::schema::{Field, FieldEntry};
use crate::tokenizer::BoxedTokenizer;
use crate::tokenizer::FacetTokenizer;
use crate::tokenizer::{TokenStream, Tokenizer};

@@ -70,12 +70,10 @@ impl SegmentWriter {
let table_num_bits = initial_table_size(memory_budget)?;
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
let tokenizers =
schema
.fields()
.iter()
.map(FieldEntry::field_type)
.map(|field_type| match *field_type {
let tokenizers = schema
.fields()
.map(
|(_, field_entry): (Field, &FieldEntry)| match field_entry.field_type() {
FieldType::Str(ref text_options) => text_options
.get_indexing_options()
.and_then(|text_index_option| {

@@ -83,8 +81,9 @@ impl SegmentWriter {
segment.index().tokenizers().get(tokenizer_name)
}),
_ => None,
})
.collect();
},
)
.collect();
Ok(SegmentWriter {
max_doc: 0,
multifield_postings,

@@ -160,7 +159,7 @@ impl SegmentWriter {
}
FieldType::Str(_) => {
let num_tokens = if let Some(ref mut tokenizer) =
self.tokenizers[field.0 as usize]
self.tokenizers[field.field_id() as usize]
{
let texts: Vec<&str> = field_values
.iter()
src/lib.rs (14 changed lines)
@@ -212,15 +212,13 @@ pub type Score = f32;
pub type SegmentLocalId = u32;

impl DocAddress {
/// Return the segment ordinal.
/// The segment ordinal is an id identifying the segment
/// hosting the document. It is only meaningful, in the context
/// of a searcher.
/// Return the segment ordinal id that identifies the segment
/// hosting the document in the `Searcher` it is called from.
pub fn segment_ord(self) -> SegmentLocalId {
self.0
}

/// Return the segment local `DocId`
/// Return the segment-local `DocId`
pub fn doc(self) -> DocId {
self.1
}

@@ -229,11 +227,11 @@ impl DocAddress {
/// `DocAddress` contains all the necessary information
/// to identify a document given a `Searcher` object.
///
/// It consists in an id identifying its segment, and
/// its segment-local `DocId`.
/// It consists of an id identifying its segment, and
/// a segment-local `DocId`.
///
/// The id used for the segment is actually an ordinal
/// in the list of segment hold by a `Searcher`.
/// in the list of `Segment`s held by a `Searcher`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct DocAddress(pub SegmentLocalId, pub DocId);
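As the revised docs say, a `DocAddress` is just the pair of a segment ordinal and a segment-local `DocId`, and it is only meaningful relative to the `Searcher` that produced it. A tiny illustrative snippet with arbitrary values (not part of the diff):

```rust
use tantivy::DocAddress;

fn main() {
    // Second segment (ordinal 1) of some Searcher, third document within it.
    let addr = DocAddress(1, 2);
    assert_eq!(addr.segment_ord(), 1);
    assert_eq!(addr.doc(), 2);

    // The derived ordering compares (segment_ord, doc), i.e. the ascending
    // order the stable tie-break in the collectors relies on.
    assert!(DocAddress(0, 10) < DocAddress(1, 0));
}
```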
@@ -356,9 +356,9 @@ pub mod tests {

#[test]
fn test_skip_next() {
let term_0 = Term::from_field_u64(Field(0), 0);
let term_1 = Term::from_field_u64(Field(0), 1);
let term_2 = Term::from_field_u64(Field(0), 2);
let term_0 = Term::from_field_u64(Field::from_field_id(0), 0);
let term_1 = Term::from_field_u64(Field::from_field_id(0), 1);
let term_2 = Term::from_field_u64(Field::from_field_id(0), 2);

let num_docs = 300u32;

@@ -511,19 +511,19 @@ pub mod tests {
}

pub static TERM_A: Lazy<Term> = Lazy::new(|| {
let field = Field(0);
let field = Field::from_field_id(0);
Term::from_field_text(field, "a")
});
pub static TERM_B: Lazy<Term> = Lazy::new(|| {
let field = Field(0);
let field = Field::from_field_id(0);
Term::from_field_text(field, "b")
});
pub static TERM_C: Lazy<Term> = Lazy::new(|| {
let field = Field(0);
let field = Field::from_field_id(0);
Term::from_field_text(field, "c")
});
pub static TERM_D: Lazy<Term> = Lazy::new(|| {
let field = Field(0);
let field = Field::from_field_id(0);
Term::from_field_text(field, "d")
});
@@ -61,12 +61,12 @@ fn make_field_partition(
.iter()
.map(|(key, _, _)| Term::wrap(key).field())
.enumerate();
let mut prev_field = Field(u32::max_value());
let mut prev_field_opt = None;
let mut fields = vec![];
let mut offsets = vec![];
for (offset, field) in term_offsets_it {
if field != prev_field {
prev_field = field;
if Some(field) != prev_field_opt {
prev_field_opt = Some(field);
fields.push(field);
offsets.push(offset);
}

@@ -86,8 +86,7 @@ impl MultiFieldPostingsWriter {
let term_index = TermHashMap::new(table_bits);
let per_field_postings_writers: Vec<_> = schema
.fields()
.iter()
.map(|field_entry| posting_from_field_entry(field_entry))
.map(|(_, field_entry)| posting_from_field_entry(field_entry))
.collect();
MultiFieldPostingsWriter {
heap: MemoryArena::new(),

@@ -107,7 +106,8 @@ impl MultiFieldPostingsWriter {
field: Field,
token_stream: &mut dyn TokenStream,
) -> u32 {
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
let postings_writer =
self.per_field_postings_writers[field.field_id() as usize].deref_mut();
postings_writer.index_text(
&mut self.term_index,
doc,

@@ -118,7 +118,8 @@ impl MultiFieldPostingsWriter {
}

pub fn subscribe(&mut self, doc: DocId, term: &Term) -> UnorderedTermId {
let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
let postings_writer =
self.per_field_postings_writers[term.field().field_id() as usize].deref_mut();
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, &mut self.heap)
}

@@ -160,7 +161,7 @@ impl MultiFieldPostingsWriter {
FieldType::Bytes => {}
}

let postings_writer = &self.per_field_postings_writers[field.0 as usize];
let postings_writer = &self.per_field_postings_writers[field.field_id() as usize];
let mut field_serializer =
serializer.new_field(field, postings_writer.total_num_tokens())?;
postings_writer.serialize(
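In `make_field_partition`, the old code used `Field(u32::max_value())` as a "no previous field" sentinel; the `Field(..)` tuple constructor is no longer usable outside the schema module, and the new code switches to the usual `Option` pattern for tracking a previous value. A generic standalone sketch of that pattern (the names are invented, standard library only):

```rust
/// Collect the offsets at which the key changes in a grouped run,
/// using an Option instead of a magic sentinel value.
fn partition_points<K: PartialEq>(keys: &[K]) -> Vec<usize> {
    let mut prev: Option<&K> = None;
    let mut offsets = Vec::new();
    for (offset, key) in keys.iter().enumerate() {
        if prev != Some(key) {
            prev = Some(key);
            offsets.push(offset);
        }
    }
    offsets
}

fn main() {
    // Terms come out grouped by field id; partitions start at offsets 0, 3, 4.
    let field_ids = [0u32, 0, 0, 1, 7, 7];
    assert_eq!(partition_points(&field_ids), vec![0, 3, 4]);
}
```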
@@ -9,7 +9,8 @@ use crate::Result;
use crate::Searcher;
use std::collections::BTreeSet;

/// The boolean query combines a set of queries
/// The boolean query returns a set of documents
/// that matches the Boolean combination of constituent subqueries.
///
/// The documents matched by the boolean query are
/// those which

@@ -19,6 +20,113 @@ use std::collections::BTreeSet;
/// `MustNot` occurence.
/// * match at least one of the subqueries that is not
/// a `MustNot` occurence.
///
///
/// You can combine other query types and their `Occur`ances into one `BooleanQuery`
///
/// ```rust
///use tantivy::collector::Count;
///use tantivy::doc;
///use tantivy::query::{BooleanQuery, Occur, PhraseQuery, Query, TermQuery};
///use tantivy::schema::{IndexRecordOption, Schema, TEXT};
///use tantivy::Term;
///use tantivy::{Index, Result};
///
///fn main() -> Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let body = schema_builder.add_text_field("body", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// body => "hidden",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// body => "found",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// // Make TermQuery's for "girl" and "diary" in the title
/// let girl_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(title, "girl"),
/// IndexRecordOption::Basic,
/// ));
/// let diary_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(title, "diary"),
/// IndexRecordOption::Basic,
/// ));
/// // A TermQuery with "found" in the body
/// let body_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(body, "found"),
/// IndexRecordOption::Basic,
/// ));
/// // TermQuery "diary" must and "girl" must not be present
/// let queries_with_occurs1 = vec![
/// (Occur::Must, diary_term_query.box_clone()),
/// (Occur::MustNot, girl_term_query),
/// ];
/// // Make a BooleanQuery equivalent to
/// // title:+diary title:-girl
/// let diary_must_and_girl_mustnot = BooleanQuery::from(queries_with_occurs1);
/// let count1 = searcher.search(&diary_must_and_girl_mustnot, &Count)?;
/// assert_eq!(count1, 1);
///
/// // TermQuery for "cow" in the title
/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(title, "cow"),
/// IndexRecordOption::Basic,
/// ));
/// // "title:diary OR title:cow"
/// let title_diary_or_cow = BooleanQuery::from(vec![
/// (Occur::Should, diary_term_query.box_clone()),
/// (Occur::Should, cow_term_query),
/// ]);
/// let count2 = searcher.search(&title_diary_or_cow, &Count)?;
/// assert_eq!(count2, 4);
///
/// // Make a `PhraseQuery` from a vector of `Term`s
/// let phrase_query: Box<dyn Query> = Box::new(PhraseQuery::new(vec![
/// Term::from_field_text(title, "dairy"),
/// Term::from_field_text(title, "cow"),
/// ]));
/// // You can combine subqueries of different types into 1 BooleanQuery:
/// // `TermQuery` and `PhraseQuery`
/// // "title:diary OR "dairy cow"
/// let term_of_phrase_query = BooleanQuery::from(vec![
/// (Occur::Should, diary_term_query.box_clone()),
/// (Occur::Should, phrase_query.box_clone()),
/// ]);
/// let count3 = searcher.search(&term_of_phrase_query, &Count)?;
/// assert_eq!(count3, 4);
///
/// // You can nest one BooleanQuery inside another
/// // body:found AND ("title:diary OR "dairy cow")
/// let nested_query = BooleanQuery::from(vec![
/// (Occur::Must, body_term_query),
/// (Occur::Must, Box::new(term_of_phrase_query))
/// ]);
/// let count4 = searcher.search(&nested_query, &Count)?;
/// assert_eq!(count4, 1);
/// Ok(())
///}
/// ```
#[derive(Debug)]
pub struct BooleanQuery {
subqueries: Vec<(Occur, Box<dyn Query>)>,
@@ -40,7 +40,7 @@ impl PhraseQuery {
PhraseQuery::new_with_offset(terms_with_offset)
}

/// Creates a new `PhraseQuery` given a list of terms and there offsets.
/// Creates a new `PhraseQuery` given a list of terms and their offsets.
///
/// Can be used to provide custom offset for each term.
pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhraseQuery {

@@ -73,7 +73,7 @@ impl PhraseQuery {
.collect::<Vec<Term>>()
}

/// Returns the `PhraseWeight` for the given phrase query given a specific `searcher`.
/// Returns the `PhraseWeight` for the given phrase query given a specific `searcher`.
///
/// This function is the same as `.weight(...)` except it returns
/// a specialized type `PhraseWeight` instead of a Boxed trait.
@@ -674,13 +674,19 @@ mod test {

test_parse_query_to_logical_ast_helper(
"signed:-2324",
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
&format!(
"{:?}",
Term::from_field_i64(Field::from_field_id(2u32), -2324)
),
false,
);

test_parse_query_to_logical_ast_helper(
"float:2.5",
&format!("{:?}", Term::from_field_f64(Field(10u32), 2.5)),
&format!(
"{:?}",
Term::from_field_f64(Field::from_field_id(10u32), 2.5)
),
false,
);
}
@@ -118,7 +118,7 @@ mod tests {
#[test]
fn test_term_query_debug() {
let term_query = TermQuery::new(
Term::from_field_text(Field(1), "hello"),
Term::from_field_text(Field::from_field_id(1), "hello"),
IndexRecordOption::WithFreqs,
);
assert_eq!(
@@ -3,14 +3,23 @@ use std::io;
use std::io::Read;
use std::io::Write;

/// `Field` is actually a `u8` identifying a `Field`
/// The schema is in charge of holding mapping between field names
/// to `Field` objects.
///
/// Because the field id is a `u8`, tantivy can only have at most `255` fields.
/// Value 255 is reserved.
/// `Field` is represented by an unsigned 32-bit integer type
/// The schema holds the mapping between field names and `Field` objects.
#[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
pub struct Field(pub u32);
pub struct Field(u32);

impl Field {
/// Create a new field object for the given FieldId.
pub fn from_field_id(field_id: u32) -> Field {
Field(field_id)
}

/// Returns a u32 identifying uniquely a field within a schema.
#[allow(clippy::trivially_copy_pass_by_ref)]
pub fn field_id(&self) -> u32 {
self.0
}
}

impl BinarySerializable for Field {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
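This is the change the rest of the diff fans out from: the `u32` inside `Field` is no longer public, so every `Field(n)` literal becomes `Field::from_field_id(n)` and every `.0` access becomes `.field_id()`. A before/after sketch of the call-site migration, with an arbitrary field id (not part of the diff):

```rust
use tantivy::schema::Field;

fn main() {
    // Before this change: `let title = Field(0);` and `title.0 as usize`.
    // After: construction and access go through the new methods.
    let title = Field::from_field_id(0);
    let idx = title.field_id() as usize;
    assert_eq!(idx, 0);

    // Round trip: the accessor returns exactly the id the field was built with.
    assert_eq!(Field::from_field_id(42).field_id(), 42);
}
```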
@@ -167,7 +167,7 @@ impl SchemaBuilder {

/// Adds a field entry to the schema in build.
fn add_field(&mut self, field_entry: FieldEntry) -> Field {
let field = Field(self.fields.len() as u32);
let field = Field::from_field_id(self.fields.len() as u32);
let field_name = field_entry.name().to_string();
self.fields.push(field_entry);
self.fields_map.insert(field_name, field);

@@ -223,7 +223,7 @@ pub struct Schema(Arc<InnerSchema>);
impl Schema {
/// Return the `FieldEntry` associated to a `Field`.
pub fn get_field_entry(&self, field: Field) -> &FieldEntry {
&self.0.fields[field.0 as usize]
&self.0.fields[field.field_id() as usize]
}

/// Return the field name for a given `Field`.

@@ -232,8 +232,12 @@ impl Schema {
}

/// Return the list of all the `Field`s.
pub fn fields(&self) -> &[FieldEntry] {
&self.0.fields
pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
self.0
.fields
.iter()
.enumerate()
.map(|(field_id, field_entry)| (Field::from_field_id(field_id as u32), field_entry))
}

/// Creates a new builder.

@@ -485,13 +489,32 @@ mod tests {

let schema: Schema = serde_json::from_str(expected).unwrap();

let mut fields = schema.fields().iter();

assert_eq!("title", fields.next().unwrap().name());
assert_eq!("author", fields.next().unwrap().name());
assert_eq!("count", fields.next().unwrap().name());
assert_eq!("popularity", fields.next().unwrap().name());
assert_eq!("score", fields.next().unwrap().name());
let mut fields = schema.fields();
{
let (field, field_entry) = fields.next().unwrap();
assert_eq!("title", field_entry.name());
assert_eq!(0, field.field_id());
}
{
let (field, field_entry) = fields.next().unwrap();
assert_eq!("author", field_entry.name());
assert_eq!(1, field.field_id());
}
{
let (field, field_entry) = fields.next().unwrap();
assert_eq!("count", field_entry.name());
assert_eq!(2, field.field_id());
}
{
let (field, field_entry) = fields.next().unwrap();
assert_eq!("popularity", field_entry.name());
assert_eq!(3, field.field_id());
}
{
let (field, field_entry) = fields.next().unwrap();
assert_eq!("score", field_entry.name());
assert_eq!(4, field.field_id());
}
assert!(fields.next().is_none());
}
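`Schema::fields()` used to return `&[FieldEntry]`, which is why call sites had to `.iter().enumerate()` and rebuild each `Field` by hand; it now yields `(Field, &FieldEntry)` pairs directly. A usage sketch with invented field names (not part of the diff):

```rust
use tantivy::schema::{Schema, STORED, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    // The new iterator form: no manual enumerate() + Field construction.
    for (field, field_entry) in schema.fields() {
        println!("field #{} -> {}", field.field_id(), field_entry.name());
    }

    // Field ids are still assigned in insertion order, starting at 0.
    let (first_field, first_entry) = schema.fields().next().unwrap();
    assert_eq!(first_field.field_id(), 0);
    assert_eq!(first_entry.name(), "title");
}
```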
@@ -105,7 +105,7 @@ impl Term {
if self.0.len() < 4 {
self.0.resize(4, 0u8);
}
BigEndian::write_u32(&mut self.0[0..4], field.0);
BigEndian::write_u32(&mut self.0[0..4], field.field_id());
}

/// Sets a u64 value in the term.

@@ -157,7 +157,7 @@ where

/// Returns the field.
pub fn field(&self) -> Field {
Field(BigEndian::read_u32(&self.0.as_ref()[..4]))
Field::from_field_id(BigEndian::read_u32(&self.0.as_ref()[..4]))
}

/// Returns the `u64` value stored in a term.

@@ -227,7 +227,7 @@ impl fmt::Debug for Term {
write!(
f,
"Term(field={},bytes={:?})",
self.field().0,
self.field().field_id(),
self.value_bytes()
)
}
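`Term` keeps the field id in its first four bytes, big-endian, which is why the accessor rename shows up here as well. A small round trip using only calls that appear elsewhere in this diff (illustrative, not part of the change):

```rust
use tantivy::schema::Field;
use tantivy::Term;

fn main() {
    let field = Field::from_field_id(1);
    let term = Term::from_field_text(field, "hello");

    // The field can be read back out of the serialized term bytes...
    assert_eq!(term.field().field_id(), 1);
    // ...and the Debug output now goes through field_id() as well.
    println!("{:?}", term);
}
```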
@@ -1,5 +1,4 @@
use fail;
use std::io::Write;
use std::path::Path;
use tantivy::directory::{Directory, ManagedDirectory, RAMDirectory, TerminatingWrite};
use tantivy::doc;