mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 10:02:55 +00:00

Compare commits: issue/526b ... 0.9.1 (2 commits)
Commits in this comparison: 2b28e491c2, 1d4fa4547c
@@ -29,7 +29,7 @@ addons:
 matrix:
 include:
 # Android
-- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
+- env: TARGET=aarch64-linux-android
 #- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
 #- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
 #- env: TARGET=i686-linux-android DISABLE_TESTS=1

@@ -68,11 +68,6 @@ cache: cargo
 before_cache:
 # Travis can't cache files that are not readable by "others"
 - chmod -R a+r $HOME/.cargo
-- find ./target/debug -type f -maxdepth 1 -delete
-- rm -f ./target/.rustc_info.json
-- rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
-- rm -r target/debug/examples/
-- ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete

 #branches:
 # only:

@@ -82,4 +77,4 @@ before_cache:

 notifications:
 email:
 on_success: never
CHANGELOG.md (42 changed lines)
@@ -1,14 +1,7 @@
-Tantivy 0.10.0
+Tantivy 0.9.1
-====================
+=====================


-Minor
----------
-- Small simplification of the code.
-Calling .freq() or .doc() when .advance() has never
-on segment postings should panic from now on.
-- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.

+Hotfix: The english stemmer was actually used for all languages.

 Tantivy 0.9.0
 =====================

@@ -27,35 +20,6 @@ previous index format.*
 for int fields. (@fulmicoton)
 - Added DateTime field (@barrotsteindev)
 - Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
-- SIMD linear search within blocks (@fulmicoton)
-
-## How to update ?
-
-tantivy 0.9 brought some API breaking change.
-To update from tantivy 0.8, you will need to go through the following steps.
-
-- `schema::INT_INDEXED` and `schema::INT_STORED` should be replaced by `schema::INDEXED` and `schema::INT_STORED`.
-- The index now does not hold the pool of searcher anymore. You are required to create an intermediary object called
-`IndexReader` for this.
-
-```rust
-// create the reader. You typically need to create 1 reader for the entire
-// lifetime of you program.
-let reader = index.reader()?;
-
-// Acquire a searcher (previously `index.searcher()`) is now written:
-let searcher = reader.searcher();
-
-// With the default setting of the reader, you are not required to
-// call `index.load_searchers()` anymore.
-//
-// The IndexReader will pick up that change automatically, regardless
-// of whether the update was done in a different process or not.
-// If this behavior is not wanted, you can create your reader with
-// the `ReloadPolicy::Manual`, and manually decide when to reload the index
-// by calling `reader.reload()?`.
-
-```

 Tantivy 0.8.2
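The upgrade notes removed above describe the `IndexReader` workflow introduced in tantivy 0.9. Below is a minimal consolidated sketch of that workflow, condensed from the removed notes and the reader tests further down in this comparison; it assumes `Index`, `ReloadPolicy` and the `Result` alias are re-exported at the crate root.

```rust
use tantivy::{Index, ReloadPolicy};

fn open_reader(index: &Index) -> tantivy::Result<()> {
    // One reader for the whole lifetime of the program; this replaces
    // index.searcher() and index.load_searchers() from tantivy 0.8.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::Manual) // the default is ReloadPolicy::OnCommit
        .try_into()?;

    // With ReloadPolicy::Manual, new commits are picked up explicitly.
    reader.reload()?;
    let searcher = reader.searcher();
    let _num_docs = searcher.num_docs();
    Ok(())
}
```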
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.10.0-dev"
+version = "0.9.1"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]

@@ -23,7 +23,7 @@ snap = {version="0.2"}
 atomicwrites = {version="0.2.2", optional=true}
 tempfile = "3.0"
 log = "0.4"
-combine = ">=3.6.0,<4.0.0"
+combine = "3"
 tempdir = "0.3"
 serde = "1.0"
 serde_derive = "1.0"
README.md (21 changed lines)

@@ -17,7 +17,6 @@
 [](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
 [](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)

-[](https://www.patreon.com/fulmicoton)


 **Tantivy** is a **full text search engine library** written in rust.

@@ -28,14 +27,6 @@ to build such a search engine.

 Tantivy is, in fact, strongly inspired by Lucene's design.

-# Benchmark
-
-Tantivy is typically faster than Lucene, but the results will depend on
-the nature of the queries in your workload.
-
-The following [benchmark](https://tantivy-search.github.io/bench/) break downs
-performance for different type of queries / collection.
-
 # Features

 - Full-text search

@@ -96,14 +87,6 @@ To check out and run tests, you can simply run :
 Some tests will not run with just `cargo test` because of `fail-rs`.
 To run the tests exhaustively, run `./run-tests.sh`.

-# How can I support this project ?
+# Contribute

-There are many ways to support this project.
+Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.

-- If you use tantivy, tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
-- Report bugs
-- Write a blog post
-- Complete documentation
-- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
-- Talk about tantivy around you
-- Drop a word on on [](https://saythanks.io/to/fulmicoton) or even [](https://www.patreon.com/fulmicoton)
@@ -13,10 +13,7 @@ pub use self::serialize::{BinarySerializable, FixedSize};
 pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
 pub use byteorder::LittleEndian as Endianness;

-/// Segment's max doc must be `< MAX_DOC_LIMIT`.
+use std::io;
-///
-/// We do not allow segments with more than
-pub const MAX_DOC_LIMIT: u32 = 1 << 31;

 /// Computes the number of bits that will be used for bitpacking.
 ///

@@ -55,6 +52,11 @@ pub(crate) fn is_power_of_2(n: usize) -> bool {
 (n > 0) && (n & (n - 1) == 0)
 }

+/// Create a default io error given a string.
+pub(crate) fn make_io_err(msg: String) -> io::Error {
+io::Error::new(io::ErrorKind::Other, msg)
+}

 /// Has length trait
 pub trait HasLen {
 /// Return length

@@ -132,11 +134,4 @@ pub(crate) mod test {
 assert_eq!(compute_num_bits(256), 9u8);
 assert_eq!(compute_num_bits(5_000_000_000), 33u8);
 }
-
-#[test]
-fn test_max_doc() {
-// this is the first time I write a unit test for a constant.
-assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
-assert!((super::MAX_DOC_LIMIT as i32) < 0);
-}
 }
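The `MAX_DOC_LIMIT` constant and the `test_max_doc` test removed above rest on a small two's-complement fact. A standalone sketch in plain Rust, mirroring the removed assertions, of why the limit sits exactly at `1 << 31`:

```rust
// MAX_DOC_LIMIT = 1 << 31 = 2_147_483_648 is the first u32 whose sign bit is
// set when reinterpreted as an i32, so every doc id below the limit still
// fits in a non-negative i32.
const MAX_DOC_LIMIT: u32 = 1 << 31;

fn main() {
    assert!(((MAX_DOC_LIMIT - 1) as i32) >= 0); // 0x7FFF_FFFF -> i32::MAX
    assert!((MAX_DOC_LIMIT as i32) < 0);        // 0x8000_0000 -> i32::MIN
}
```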
@@ -24,7 +24,6 @@ use schema::Schema;
 use serde_json;
 use std::borrow::BorrowMut;
 use std::fmt;
-#[cfg(feature = "mmap")]
 use std::path::Path;
 use std::sync::Arc;
 use tokenizer::BoxedTokenizer;

@@ -356,8 +355,10 @@ mod tests {
 use directory::RAMDirectory;
 use schema::Field;
 use schema::{Schema, INDEXED, TEXT};
+use std::path::PathBuf;
 use std::thread;
 use std::time::Duration;
+use tempdir::TempDir;
 use Index;
 use IndexReader;
 use IndexWriter;

@@ -443,69 +444,61 @@ mod tests {
 test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
 }

-#[cfg(feature = "mmap")]
+#[test]
-mod mmap_specific {
+fn test_index_on_commit_reload_policy_mmap() {
+let schema = throw_away_schema();
+let field = schema.get_field("num_likes").unwrap();
+let tempdir = TempDir::new("index").unwrap();
+let tempdir_path = PathBuf::from(tempdir.path());
+let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
+let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+writer.commit().unwrap();
+let reader = index
+.reader_builder()
+.reload_policy(ReloadPolicy::OnCommit)
+.try_into()
+.unwrap();
+assert_eq!(reader.searcher().num_docs(), 0);
+test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
+}

-use super::*;
+#[test]
-use std::path::PathBuf;
+fn test_index_manual_policy_mmap() {
-use tempdir::TempDir;
+let schema = throw_away_schema();
+let field = schema.get_field("num_likes").unwrap();
+let index = Index::create_from_tempdir(schema).unwrap();
+let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+writer.commit().unwrap();
+let reader = index
+.reader_builder()
+.reload_policy(ReloadPolicy::Manual)
+.try_into()
+.unwrap();
+assert_eq!(reader.searcher().num_docs(), 0);
+writer.add_document(doc!(field=>1u64));
+writer.commit().unwrap();
+thread::sleep(Duration::from_millis(500));
+assert_eq!(reader.searcher().num_docs(), 0);
+reader.reload().unwrap();
+assert_eq!(reader.searcher().num_docs(), 1);
+}

 #[test]
-fn test_index_on_commit_reload_policy_mmap() {
+fn test_index_on_commit_reload_policy_different_directories() {
 let schema = throw_away_schema();
 let field = schema.get_field("num_likes").unwrap();
 let tempdir = TempDir::new("index").unwrap();
 let tempdir_path = PathBuf::from(tempdir.path());
-let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
+let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
-let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+let read_index = Index::open_in_dir(&tempdir_path).unwrap();
-writer.commit().unwrap();
+let reader = read_index
-let reader = index
+.reader_builder()
-.reader_builder()
+.reload_policy(ReloadPolicy::OnCommit)
-.reload_policy(ReloadPolicy::OnCommit)
+.try_into()
-.try_into()
+.unwrap();
-.unwrap();
+assert_eq!(reader.searcher().num_docs(), 0);
-assert_eq!(reader.searcher().num_docs(), 0);
+let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
 test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
-}
-
-#[test]
-fn test_index_manual_policy_mmap() {
-let schema = throw_away_schema();
-let field = schema.get_field("num_likes").unwrap();
-let index = Index::create_from_tempdir(schema).unwrap();
-let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-writer.commit().unwrap();
-let reader = index
-.reader_builder()
-.reload_policy(ReloadPolicy::Manual)
-.try_into()
-.unwrap();
-assert_eq!(reader.searcher().num_docs(), 0);
-writer.add_document(doc!(field=>1u64));
-writer.commit().unwrap();
-thread::sleep(Duration::from_millis(500));
-assert_eq!(reader.searcher().num_docs(), 0);
-reader.reload().unwrap();
-assert_eq!(reader.searcher().num_docs(), 1);
-}
-
-#[test]
-fn test_index_on_commit_reload_policy_different_directories() {
-let schema = throw_away_schema();
-let field = schema.get_field("num_likes").unwrap();
-let tempdir = TempDir::new("index").unwrap();
-let tempdir_path = PathBuf::from(tempdir.path());
-let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
-let read_index = Index::open_in_dir(&tempdir_path).unwrap();
-let reader = read_index
-.reader_builder()
-.reload_policy(ReloadPolicy::OnCommit)
-.try_into()
-.unwrap();
-assert_eq!(reader.searcher().num_docs(), 0);
-let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
-test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
-}
 }

 fn test_index_on_commit_reload_policy_aux(
@@ -260,98 +260,95 @@ impl Clone for ManagedDirectory {
 #[cfg(test)]
 mod tests {

+use super::*;
 #[cfg(feature = "mmap")]
-mod mmap_specific {
+use directory::MmapDirectory;
+use std::io::Write;
+use std::path::Path;
+use tempdir::TempDir;

-use super::super::*;
+lazy_static! {
-use std::path::Path;
+static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
-use tempdir::TempDir;
+static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
+}
-lazy_static! {
-static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
-static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
-}

-use directory::MmapDirectory;
-use std::io::Write;

-#[test]
-fn test_managed_directory() {
-let tempdir = TempDir::new("index").unwrap();
-let tempdir_path = PathBuf::from(tempdir.path());
-{
-let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
-{
-let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
-write_file.flush().unwrap();
-}
-{
-managed_directory
-.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
-.unwrap();
-}
-{
-assert!(managed_directory.exists(*TEST_PATH1));
-assert!(managed_directory.exists(*TEST_PATH2));
-}
-{
-let living_files: HashSet<PathBuf> =
-[TEST_PATH1.to_owned()].into_iter().cloned().collect();
-managed_directory.garbage_collect(|| living_files);
-}
-{
-assert!(managed_directory.exists(*TEST_PATH1));
-assert!(!managed_directory.exists(*TEST_PATH2));
-}
-}
-{
-let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
-{
-assert!(managed_directory.exists(*TEST_PATH1));
-assert!(!managed_directory.exists(*TEST_PATH2));
-}
-{
-let living_files: HashSet<PathBuf> = HashSet::new();
-managed_directory.garbage_collect(|| living_files);
-}
-{
-assert!(!managed_directory.exists(*TEST_PATH1));
-assert!(!managed_directory.exists(*TEST_PATH2));
-}
-}
-}

-#[test]
-fn test_managed_directory_gc_while_mmapped() {
-let tempdir = TempDir::new("index").unwrap();
-let tempdir_path = PathBuf::from(tempdir.path());
-let living_files = HashSet::new();

+#[test]
+#[cfg(feature = "mmap")]
+fn test_managed_directory() {
+let tempdir = TempDir::new("index").unwrap();
+let tempdir_path = PathBuf::from(tempdir.path());
+{
 let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
 let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
-managed_directory
+{
-.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
+let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
-.unwrap();
+write_file.flush().unwrap();
-assert!(managed_directory.exists(*TEST_PATH1));
+}
+{
-let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
+managed_directory
-managed_directory.garbage_collect(|| living_files.clone());
+.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
-if cfg!(target_os = "windows") {
+.unwrap();
-// On Windows, gc should try and fail the file as it is mmapped.
+}
+{
 assert!(managed_directory.exists(*TEST_PATH1));
-// unmap should happen here.
+assert!(managed_directory.exists(*TEST_PATH2));
-drop(_mmap_read);
+}
-// The file should still be in the list of managed file and
+{
-// eventually be deleted once mmap is released.
+let living_files: HashSet<PathBuf> =
+[TEST_PATH1.to_owned()].into_iter().cloned().collect();
 managed_directory.garbage_collect(|| living_files);
-assert!(!managed_directory.exists(*TEST_PATH1));
+}
-} else {
+{
-assert!(!managed_directory.exists(*TEST_PATH1));
+assert!(managed_directory.exists(*TEST_PATH1));
+assert!(!managed_directory.exists(*TEST_PATH2));
 }
 }
+{
+let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
+let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
+{
+assert!(managed_directory.exists(*TEST_PATH1));
+assert!(!managed_directory.exists(*TEST_PATH2));
+}
+{
+let living_files: HashSet<PathBuf> = HashSet::new();
+managed_directory.garbage_collect(|| living_files);
+}
+{
+assert!(!managed_directory.exists(*TEST_PATH1));
+assert!(!managed_directory.exists(*TEST_PATH2));
+}
+}
+}

+#[test]
+#[cfg(feature = "mmap ")]
+fn test_managed_directory_gc_while_mmapped() {
+let tempdir = TempDir::new("index").unwrap();
+let tempdir_path = PathBuf::from(tempdir.path());
+let living_files = HashSet::new();

+let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
+let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
+managed_directory
+.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
+.unwrap();
+assert!(managed_directory.exists(*TEST_PATH1));

+let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
+managed_directory.garbage_collect(|| living_files.clone());
+if cfg!(target_os = "windows") {
+// On Windows, gc should try and fail the file as it is mmapped.
+assert!(managed_directory.exists(*TEST_PATH1));
+// unmap should happen here.
+drop(_mmap_read);
+// The file should still be in the list of managed file and
+// eventually be deleted once mmap is released.
+managed_directory.garbage_collect(|| living_files);
+assert!(!managed_directory.exists(*TEST_PATH1));
+} else {
+assert!(!managed_directory.exists(*TEST_PATH1));
+}
 }

 }
@@ -6,6 +6,7 @@ use self::notify::RawEvent;
 use self::notify::RecursiveMode;
 use self::notify::Watcher;
 use atomicwrites;
+use common::make_io_err;
 use core::META_FILEPATH;
 use directory::error::LockError;
 use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};

@@ -36,11 +37,6 @@ use std::sync::Weak;
 use std::thread;
 use tempdir::TempDir;

-/// Create a default io error given a string.
-pub(crate) fn make_io_err(msg: String) -> io::Error {
-io::Error::new(io::ErrorKind::Other, msg)
-}
-
 /// Returns None iff the file exists, can be read, but is empty (and hence
 /// cannot be mmapped)
 fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
@@ -1,5 +1,4 @@
 use common::BitSet;
-use fastfield::DeleteBitSet;
 use std::borrow::Borrow;
 use std::borrow::BorrowMut;
 use std::cmp::Ordering;

@@ -96,23 +95,9 @@ pub trait DocSet {
 }

 /// Returns the number documents matching.
-/// Calling this method consumes the `DocSet`.
-fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
-let mut count = 0u32;
-while self.advance() {
-if !delete_bitset.is_deleted(self.doc()) {
-count += 1u32;
-}
-}
-count
-}
-
-/// Returns the count of documents, deleted or not.
-/// Calling this method consumes the `DocSet`.
 ///
-/// Of course, the result is an upper bound of the result
+/// Calling this method consumes the `DocSet`.
-/// given by `count()`.
+fn count(&mut self) -> u32 {
-fn count_including_deleted(&mut self) -> u32 {
 let mut count = 0u32;
 while self.advance() {
 count += 1u32;

@@ -142,14 +127,9 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
 unboxed.size_hint()
 }

-fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
+fn count(&mut self) -> u32 {
 let unboxed: &mut TDocSet = self.borrow_mut();
-unboxed.count(delete_bitset)
+unboxed.count()
-}
-
-fn count_including_deleted(&mut self) -> u32 {
-let unboxed: &mut TDocSet = self.borrow_mut();
-unboxed.count_including_deleted()
 }

 fn append_to_bitset(&mut self, bitset: &mut BitSet) {
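After this refactor, `count` simply drains the `DocSet`, and the contract for callers is unchanged: `advance()` must return true before `doc()` or `freq()` is read, which the 0.10.0-dev changelog entry removed earlier in this comparison turns into a hard panic for segment postings. A small consumer-side sketch, assuming `DocSet` and `DocId` are re-exported at the crate root as in recent tantivy releases:

```rust
use tantivy::{DocId, DocSet};

// Drains a DocSet into a Vec. advance() must succeed before the first doc();
// calling doc() on a fresh, never-advanced DocSet is what now panics.
fn collect_docs<D: DocSet>(mut docset: D) -> Vec<DocId> {
    let mut docs = Vec::new();
    while docset.advance() {
        docs.push(docset.doc());
    }
    docs
}
```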
@@ -13,6 +13,7 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {

 #[test]
 #[ignore]
+#[cfg(feature = "mmap")]
 fn test_indexing() {
 let mut schema_builder = Schema::builder();
@@ -1,4 +1,3 @@
-use common::MAX_DOC_LIMIT;
 use core::Segment;
 use core::SegmentReader;
 use core::SerializableSegment;

@@ -24,7 +23,6 @@ use termdict::TermMerger;
 use termdict::TermOrdinal;
 use DocId;
 use Result;
-use TantivyError;

 fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
 let mut total_tokens = 0u64;

@@ -152,14 +150,6 @@ impl IndexMerger {
 readers.push(reader);
 }
 }
-if max_doc >= MAX_DOC_LIMIT {
-let err_msg = format!(
-"The segment resulting from this merge would have {} docs,\
-which exceeds the limit {}.",
-max_doc, MAX_DOC_LIMIT
-);
-return Err(TantivyError::InvalidArgument(err_msg));
-}
 Ok(IndexMerger {
 schema,
 readers,
@@ -420,7 +420,6 @@ impl SegmentUpdater {
 })
 .collect::<Vec<_>>();
 merge_candidates.extend(committed_merge_candidates.into_iter());
-
 for merge_operation in merge_candidates {
 match self.start_merge_impl(merge_operation) {
 Ok(merge_future) => {
@@ -174,7 +174,6 @@ extern crate downcast_rs;
 #[macro_use]
 extern crate fail;

-#[cfg(feature = "mmap")]
 #[cfg(test)]
 mod functional_test;
@@ -1,249 +0,0 @@
-use postings::compression::AlignedBuffer;
-
-/// This modules define the logic used to search for a doc in a given
-/// block. (at most 128 docs)
-///
-/// Searching within a block is a hotspot when running intersection.
-/// so it was worth defining it in its own module.
-
-#[cfg(target_arch = "x86_64")]
-mod sse2 {
-use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
-use std::arch::x86_64::__m128i as DataType;
-use std::arch::x86_64::_mm_add_epi32 as op_add;
-use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
-use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
-use std::arch::x86_64::_mm_set1_epi32 as set1;
-use std::arch::x86_64::_mm_setzero_si128 as set0;
-use std::arch::x86_64::_mm_sub_epi32 as op_sub;
-use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
-
-const MASK1: i32 = 78;
-const MASK2: i32 = 177;
-
-/// Performs an exhaustive linear search over the
-///
-/// There is no early exit here. We simply count the
-/// number of elements that are `< target`.
-pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
-unsafe {
-let ptr = arr as *const AlignedBuffer as *const DataType;
-let vkey = set1(target as i32);
-let mut cnt = set0();
-// We work over 4 `__m128i` at a time.
-// A single `__m128i` actual contains 4 `u32`.
-for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
-let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
-let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
-let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
-let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
-let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
-cnt = op_sub(cnt, sum);
-}
-cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
-cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
-_mm_cvtsi128_si32(cnt) as usize
-}
-}
-
-#[cfg(test)]
-mod test {
-use super::linear_search_sse2_128;
-use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
-
-#[test]
-fn test_linear_search_sse2_128_u32() {
-let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
-for el in 0u32..128u32 {
-block[el as usize] = el * 2 + 1 << 18;
-}
-let target = block[64] + 1;
-assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
-}
-}
-}
-
-/// This `linear search` browser exhaustively through the array.
-/// but the early exit is very difficult to predict.
-///
-/// Coupled with `exponential search` this function is likely
-/// to be called with the same `len`
-fn linear_search(arr: &[u32], target: u32) -> usize {
-arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
-}
-
-fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
-let end = arr.len();
-let mut begin = 0;
-for &pivot in &[1, 3, 7, 15, 31, 63] {
-if pivot >= end {
-break;
-}
-if arr[pivot] > target {
-return (begin, pivot);
-}
-begin = pivot;
-}
-(begin, end)
-}
-
-fn galloping(block_docs: &[u32], target: u32) -> usize {
-let (start, end) = exponential_search(&block_docs, target);
-start + linear_search(&block_docs[start..end], target)
-}
-
-/// Tantivy may rely on SIMD instructions to search for a specific document within
-/// a given block.
-#[derive(Clone, Copy, PartialEq)]
-pub enum BlockSearcher {
-#[cfg(target_arch = "x86_64")]
-SSE2,
-Scalar,
-}
-
-impl BlockSearcher {
-/// Search the first index containing an element greater or equal to
-/// the target.
-///
-/// The results should be equivalent to
-/// ```ignore
-/// block[..]
-// .iter()
-// .take_while(|&&val| val < target)
-// .count()
-/// ```
-///
-/// The `start` argument is just used to hint that the response is
-/// greater than beyond `start`. The implementation may or may not use
-/// it for optimization.
-///
-/// # Assumption
-///
-/// The array len is > start.
-/// The block is sorted
-/// The target is assumed greater or equal to the `arr[start]`.
-/// The target is assumed smaller or equal to the last element of the block.
-///
-/// Currently the scalar implementation starts by an exponential search, and
-/// then operates a linear search in the result subarray.
-///
-/// If SSE2 instructions are available in the `(platform, running CPU)`,
-/// then we use a different implementation that does an exhaustive linear search over
-/// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
-/// of branch.
-pub(crate) fn search_in_block(
-self,
-block_docs: &AlignedBuffer,
-len: usize,
-start: usize,
-target: u32,
-) -> usize {
-#[cfg(target_arch = "x86_64")]
-{
-use postings::compression::COMPRESSION_BLOCK_SIZE;
-if self == BlockSearcher::SSE2 && len == COMPRESSION_BLOCK_SIZE {
-return sse2::linear_search_sse2_128(block_docs, target);
-}
-}
-start + galloping(&block_docs.0[start..len], target)
-}
-}
-
-impl Default for BlockSearcher {
-fn default() -> BlockSearcher {
-#[cfg(target_arch = "x86_64")]
-{
-if is_x86_feature_detected!("sse2") {
-return BlockSearcher::SSE2;
-}
-}
-BlockSearcher::Scalar
-}
-}
-
-#[cfg(test)]
-mod tests {
-use super::exponential_search;
-use super::linear_search;
-use super::BlockSearcher;
-use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
-
-#[test]
-fn test_linear_search() {
-let len: usize = 50;
-let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
-for target in 1..*arr.last().unwrap() {
-let res = linear_search(&arr[..], target);
-if res > 0 {
-assert!(arr[res - 1] < target);
-}
-if res < len {
-assert!(arr[res] >= target);
-}
-}
-}
-
-#[test]
-fn test_exponentiel_search() {
-assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
-assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
-assert_eq!(
-exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
-(3, 7)
-);
-}
-
-fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
-let cursor = search_in_block_trivial_but_slow(block, target);
-assert!(block.len() < COMPRESSION_BLOCK_SIZE);
-let mut output_buffer = [u32::max_value(); COMPRESSION_BLOCK_SIZE];
-output_buffer[..block.len()].copy_from_slice(block);
-for i in 0..cursor {
-assert_eq!(
-block_searcher.search_in_block(
-&AlignedBuffer(output_buffer),
-block.len(),
-i,
-target
-),
-cursor
-);
-}
-}
-
-fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
-use std::collections::HashSet;
-let mut targets = HashSet::new();
-for (i, val) in block.iter().cloned().enumerate() {
-if i > 0 {
-targets.insert(val - 1);
-}
-targets.insert(val);
-}
-for target in targets {
-util_test_search_in_block(block_searcher, block, target);
-}
-}
-
-fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
-block.iter().take_while(|&&val| val < target).count()
-}
-
-fn test_search_in_block_util(block_searcher: BlockSearcher) {
-for len in 1u32..128u32 {
-let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
-util_test_search_in_block_all(block_searcher, &v[..]);
-}
-}
-
-#[test]
-fn test_search_in_block_scalar() {
-test_search_in_block_util(BlockSearcher::Scalar);
-}
-
-#[cfg(target_arch = "x86_64")]
-#[test]
-fn test_search_in_block_sse2() {
-test_search_in_block_util(BlockSearcher::SSE2);
-}
-}
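The scalar path of the module deleted above (an exponential probe followed by a linear count) is re-added lower in this comparison as free functions next to `impl DocSet for SegmentPostings`. Below is a self-contained sketch working through the probe sequence with the values from the removed `test_exponentiel_search` test; the two functions are copied from the patch, the `main` is illustrative only.

```rust
// Copied from the patch: counts elements < target, with no early exit.
fn linear_search(arr: &[u32], target: u32) -> usize {
    arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}

// Copied from the patch: probes indices 1, 3, 7, 15, 31, 63 and returns the
// window (begin, end) that must contain the first element >= target.
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
    let end = arr.len();
    let mut begin = 0;
    for &pivot in &[1, 3, 7, 15, 31, 63] {
        if pivot >= end {
            break;
        }
        if arr[pivot] > target {
            return (begin, pivot);
        }
        begin = pivot;
    }
    (begin, end)
}

fn main() {
    // arr = [1, 2, ..., 11], target = 7: arr[1] = 2 and arr[3] = 4 are not
    // greater than 7, arr[7] = 8 is, so the window is (3, 7).
    let arr: Vec<u32> = (1..=11).collect();
    assert_eq!(exponential_search(&arr, 7), (3, 7));
    // Within arr[3..7] = [4, 5, 6, 7], three elements are < 7, so the first
    // index holding a value >= 7 is 3 + 3 = 6 (arr[6] == 7).
    assert_eq!(3 + linear_search(&arr[3..7], 7), 6);
}
```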
@@ -43,14 +43,9 @@ impl BlockEncoder {
 }
 }

-/// We ensure that the OutputBuffer is align on 128 bits
-/// in order to run SSE2 linear search on it.
-#[repr(align(128))]
-pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
-
 pub struct BlockDecoder {
 bitpacker: BitPacker4x,
-output: AlignedBuffer,
+pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
 pub output_len: usize,
 }

@@ -60,9 +55,11 @@ impl BlockDecoder {
 }

 pub fn with_val(val: u32) -> BlockDecoder {
+let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
+output[COMPRESSION_BLOCK_SIZE] = 0u32;
 BlockDecoder {
 bitpacker: BitPacker4x::new(),
-output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
+output,
 output_len: 0,
 }
 }

@@ -75,28 +72,23 @@ impl BlockDecoder {
 ) -> usize {
 self.output_len = COMPRESSION_BLOCK_SIZE;
 self.bitpacker
-.decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
+.decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
 }

 pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
 self.output_len = COMPRESSION_BLOCK_SIZE;
 self.bitpacker
-.decompress(&compressed_data, &mut self.output.0, num_bits)
+.decompress(&compressed_data, &mut self.output, num_bits)
 }

 #[inline]
 pub fn output_array(&self) -> &[u32] {
-&self.output.0[..self.output_len]
+&self.output[..self.output_len]
-}
-
-#[inline]
-pub(crate) fn output_aligned(&self) -> (&AlignedBuffer, usize) {
-(&self.output, self.output_len)
 }

 #[inline]
 pub fn output(&self, idx: usize) -> u32 {
-self.output.0[idx]
+self.output[idx]
 }
 }

@@ -167,12 +159,12 @@ impl VIntDecoder for BlockDecoder {
 num_els: usize,
 ) -> usize {
 self.output_len = num_els;
-vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
+vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
 }

 fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
 self.output_len = num_els;
-vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
+vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
 }
 }
@@ -2,7 +2,6 @@
 Postings module (also called inverted index)
 */

-mod block_search;
 pub(crate) mod compression;
 /// Postings module
 ///

@@ -17,8 +16,6 @@ mod skip;
 mod stacker;
 mod term_info;

-pub(crate) use self::block_search::BlockSearcher;
-
 pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
 pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};

@@ -55,15 +52,13 @@ pub mod tests {
 use fieldnorm::FieldNormReader;
 use indexer::operation::AddOperation;
 use indexer::SegmentWriter;
-use merge_policy::NoMergePolicy;
 use query::Scorer;
 use rand::rngs::StdRng;
 use rand::{Rng, SeedableRng};
+use schema::Field;
+use schema::IndexRecordOption;
 use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
-use schema::{Field, TextOptions};
-use schema::{IndexRecordOption, TextFieldIndexing};
 use std::iter;
-use tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
 use DocId;
 use Score;

@@ -109,7 +104,9 @@ pub mod tests {
 let searcher = index.reader().unwrap().searcher();
 let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
 let term = Term::from_field_text(title, "abc");
+
 let mut positions = Vec::new();
+
 {
 let mut postings = inverted_index
 .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)

@@ -162,52 +159,6 @@ pub mod tests {
 }
 }

-#[test]
-pub fn test_drop_token_that_are_too_long() {
-let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
-let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
-exceeding_token_text.push_str(" hello");
-let mut schema_builder = Schema::builder();
-let text_options = TextOptions::default().set_indexing_options(
-TextFieldIndexing::default()
-.set_index_option(IndexRecordOption::WithFreqsAndPositions)
-.set_tokenizer("simple_no_truncation"),
-);
-let text_field = schema_builder.add_text_field("text", text_options);
-let schema = schema_builder.build();
-let index = Index::create_in_ram(schema.clone());
-index
-.tokenizers()
-.register("simple_no_truncation", SimpleTokenizer);
-let reader = index.reader().unwrap();
-let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-index_writer.set_merge_policy(Box::new(NoMergePolicy));
-{
-index_writer.add_document(doc!(text_field=>exceeding_token_text));
-index_writer.commit().unwrap();
-reader.reload().unwrap();
-let searcher = reader.searcher();
-let segment_reader = searcher.segment_reader(0u32);
-let inverted_index = segment_reader.inverted_index(text_field);
-assert_eq!(inverted_index.terms().num_terms(), 1);
-let mut bytes = vec![];
-assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
-assert_eq!(&bytes, b"hello");
-}
-{
-index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
-index_writer.commit().unwrap();
-reader.reload().unwrap();
-let searcher = reader.searcher();
-let segment_reader = searcher.segment_reader(1u32);
-let inverted_index = segment_reader.inverted_index(text_field);
-assert_eq!(inverted_index.terms().num_terms(), 1);
-let mut bytes = vec![];
-assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
-assert_eq!(&bytes[..], ok_token_text.as_bytes());
-}
-}
-
 #[test]
 pub fn test_position_and_fieldnorm1() {
 let mut positions = Vec::new();
@@ -12,8 +12,8 @@ use std::io;
 use std::marker::PhantomData;
 use std::ops::DerefMut;
 use termdict::TermOrdinal;
+use tokenizer::Token;
 use tokenizer::TokenStream;
-use tokenizer::{Token, MAX_TOKEN_LEN};
 use DocId;
 use Result;

@@ -210,18 +210,8 @@ pub trait PostingsWriter {
 ) -> u32 {
 let mut term = Term::for_field(field);
 let mut sink = |token: &Token| {
-// We skip all tokens with a len greater than u16.
+term.set_text(token.text.as_str());
-if token.text.len() <= MAX_TOKEN_LEN {
+self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
-term.set_text(token.text.as_str());
-self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
-} else {
-info!(
-"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
-MAX_TOKEN_LEN in the documentation for more information.",
-token.text.len(),
-MAX_TOKEN_LEN
-);
-}
 };
 token_stream.process(&mut sink)
 }
@@ -4,10 +4,9 @@ use common::{BinarySerializable, VInt};
|
|||||||
use docset::{DocSet, SkipResult};
|
use docset::{DocSet, SkipResult};
|
||||||
use owned_read::OwnedRead;
|
use owned_read::OwnedRead;
|
||||||
use positions::PositionReader;
|
use positions::PositionReader;
|
||||||
use postings::compression::{compressed_block_size, AlignedBuffer};
|
use postings::compression::compressed_block_size;
|
||||||
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
||||||
use postings::serializer::PostingsSerializer;
|
use postings::serializer::PostingsSerializer;
|
||||||
use postings::BlockSearcher;
|
|
||||||
use postings::FreqReadingOption;
|
use postings::FreqReadingOption;
|
||||||
use postings::Postings;
|
use postings::Postings;
|
||||||
use postings::SkipReader;
|
use postings::SkipReader;
|
||||||
@@ -61,7 +60,6 @@ pub struct SegmentPostings {
|
|||||||
block_cursor: BlockSegmentPostings,
|
block_cursor: BlockSegmentPostings,
|
||||||
cur: usize,
|
cur: usize,
|
||||||
position_computer: Option<PositionComputer>,
|
position_computer: Option<PositionComputer>,
|
||||||
block_searcher: BlockSearcher,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentPostings {
|
impl SegmentPostings {
|
||||||
@@ -72,7 +70,6 @@ impl SegmentPostings {
|
|||||||
block_cursor: empty_block_cursor,
|
block_cursor: empty_block_cursor,
|
||||||
cur: COMPRESSION_BLOCK_SIZE,
|
cur: COMPRESSION_BLOCK_SIZE,
|
||||||
position_computer: None,
|
position_computer: None,
|
||||||
block_searcher: BlockSearcher::default(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -120,33 +117,42 @@ impl SegmentPostings {
|
|||||||
block_cursor: segment_block_postings,
|
block_cursor: segment_block_postings,
|
||||||
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
|
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
|
||||||
position_computer: positions_stream_opt.map(PositionComputer::new),
|
position_computer: positions_stream_opt.map(PositionComputer::new),
|
||||||
block_searcher: BlockSearcher::default(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DocSet for SegmentPostings {
|
fn linear_search(arr: &[u32], target: u32) -> usize {
|
||||||
// goes to the next element.
|
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
|
||||||
// next needs to be called a first time to point to the correct element.
|
}
|
||||||
#[inline]
|
|
||||||
fn advance(&mut self) -> bool {
|
|
||||||
if self.position_computer.is_some() && self.cur < COMPRESSION_BLOCK_SIZE {
|
|
||||||
let term_freq = self.term_freq() as usize;
|
|
||||||
if let Some(position_computer) = self.position_computer.as_mut() {
|
|
||||||
position_computer.add_skip(term_freq);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self.cur += 1;
|
|
||||||
if self.cur >= self.block_cursor.block_len() {
|
|
||||||
self.cur = 0;
|
|
||||||
if !self.block_cursor.advance() {
|
|
||||||
self.cur = COMPRESSION_BLOCK_SIZE;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
true
|
|
||||||
}
|
|
||||||
|
|
||||||
|
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
|
||||||
|
let end = arr.len();
|
||||||
|
let mut begin = 0;
|
||||||
|
for &pivot in &[1, 3, 7, 15, 31, 63] {
|
||||||
|
if pivot >= end {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if arr[pivot] > target {
|
||||||
|
return (begin, pivot);
|
||||||
|
}
|
||||||
|
begin = pivot;
|
||||||
|
}
|
||||||
|
(begin, end)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Search the first index containing an element greater or equal to the target.
|
||||||
|
///
|
||||||
|
/// # Assumption
|
||||||
|
///
|
||||||
|
/// The array is assumed non empty.
|
||||||
|
/// The target is assumed greater or equal to the first element.
|
||||||
|
/// The target is assumed smaller or equal to the last element.
|
||||||
|
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
|
||||||
|
let (start, end) = exponential_search(block_docs, target);
|
||||||
|
start + linear_search(&block_docs[start..end], target)
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DocSet for SegmentPostings {
|
||||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||||
if !self.advance() {
|
if !self.advance() {
|
||||||
return SkipResult::End;
|
return SkipResult::End;
|
||||||
@@ -169,6 +175,7 @@ impl DocSet for SegmentPostings {
|
|||||||
|
|
||||||
// skip blocks until one that might contain the target
|
// skip blocks until one that might contain the target
|
||||||
// check if we need to go to the next block
|
// check if we need to go to the next block
|
||||||
|
let need_positions = self.position_computer.is_some();
|
||||||
let mut sum_freqs_skipped: u32 = 0;
|
let mut sum_freqs_skipped: u32 = 0;
|
||||||
if !self
|
if !self
|
||||||
.block_cursor
|
.block_cursor
|
||||||
@@ -182,7 +189,7 @@ impl DocSet for SegmentPostings {
|
|||||||
// we are not in the right block.
|
// we are not in the right block.
|
||||||
//
|
//
|
||||||
// First compute all of the freqs skipped from the current block.
|
// First compute all of the freqs skipped from the current block.
|
||||||
if self.position_computer.is_some() {
|
if need_positions {
|
||||||
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
|
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
|
||||||
match self.block_cursor.skip_to(target) {
|
match self.block_cursor.skip_to(target) {
|
||||||
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
|
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
|
||||||
@@ -201,21 +208,25 @@ impl DocSet for SegmentPostings {
|
|||||||
self.cur = 0;
|
self.cur = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
let cur = self.cur;
|
|
||||||
|
|
||||||
// we're in the right block now, start with an exponential search
|
// we're in the right block now, start with an exponential search
|
||||||
let (output, len) = self.block_cursor.docs_aligned();
|
let block_docs = self.block_cursor.docs();
|
||||||
let new_cur = self
|
let new_cur = self
|
||||||
.block_searcher
|
.cur
|
||||||
.search_in_block(&output, len, cur, target);
|
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
|
||||||
if let Some(position_computer) = self.position_computer.as_mut() {
|
|
||||||
sum_freqs_skipped += self.block_cursor.freqs()[cur..new_cur].iter().sum::<u32>();
|
if need_positions {
|
||||||
position_computer.add_skip(sum_freqs_skipped as usize);
|
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
|
||||||
|
.iter()
|
||||||
|
.sum::<u32>();
|
||||||
|
self.position_computer
|
||||||
|
.as_mut()
|
||||||
|
.unwrap()
|
||||||
|
.add_skip(sum_freqs_skipped as usize);
|
||||||
}
|
}
|
||||||
self.cur = new_cur;
|
self.cur = new_cur;
|
||||||
|
|
||||||
// `doc` is now the first element >= `target`
|
// `doc` is now the first element >= `target`
|
||||||
let doc = output.0[new_cur];
|
let doc = block_docs[new_cur];
|
||||||
debug_assert!(doc >= target);
|
debug_assert!(doc >= target);
|
||||||
if doc == target {
|
if doc == target {
|
||||||
SkipResult::Reached
|
SkipResult::Reached
|
||||||
@@ -224,25 +235,40 @@ impl DocSet for SegmentPostings
 }
 }
 
+// goes to the next element.
+// next needs to be called a first time to point to the correct element.
+#[inline]
+fn advance(&mut self) -> bool {
+if self.position_computer.is_some() {
+let term_freq = self.term_freq() as usize;
+self.position_computer.as_mut().unwrap().add_skip(term_freq);
+}
+self.cur += 1;
+if self.cur >= self.block_cursor.block_len() {
+self.cur = 0;
+if !self.block_cursor.advance() {
+self.cur = COMPRESSION_BLOCK_SIZE;
+return false;
+}
+}
+true
+}
+
+fn size_hint(&self) -> u32 {
+self.len() as u32
+}
+
 /// Return the current document's `DocId`.
-///
-/// # Panics
-///
-/// Will panics if called without having called advance before.
 #[inline]
 fn doc(&self) -> DocId {
 let docs = self.block_cursor.docs();
 debug_assert!(
 self.cur < docs.len(),
-"Have you forgotten to call `.advance()` at least once before calling `.doc()` ."
+"Have you forgotten to call `.advance()` at least once before calling .doc()."
 );
 docs[self.cur]
 }
 
-fn size_hint(&self) -> u32 {
-self.len() as u32
-}
-
 fn append_to_bitset(&mut self, bitset: &mut BitSet) {
 // finish the current block
 if self.advance() {
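
The `advance` method added above spells out the cursor contract that the `doc()` assertion relies on: `advance()` must be called once before reading, and it signals exhaustion by returning false. A self-contained toy model of that contract follows; the `BlockCursor` type and its start-before-first trick are illustrative, not tantivy's actual types.

```rust
struct BlockCursor {
    docs: Vec<u32>,
    cur: usize,
}

impl BlockCursor {
    fn new(docs: Vec<u32>) -> BlockCursor {
        // Start one step "before" the first element, so the first advance()
        // lands on index 0.
        BlockCursor { docs, cur: usize::max_value() }
    }

    // Returns false once the docs are exhausted.
    fn advance(&mut self) -> bool {
        self.cur = self.cur.wrapping_add(1);
        self.cur < self.docs.len()
    }

    // Only valid after advance() has returned true at least once.
    fn doc(&self) -> u32 {
        self.docs[self.cur]
    }
}

fn main() {
    let mut cursor = BlockCursor::new(vec![2, 3, 7]);
    let mut collected = Vec::new();
    while cursor.advance() {
        collected.push(cursor.doc());
    }
    assert_eq!(collected, vec![2, 3, 7]);
}
```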
@@ -266,33 +292,17 @@ impl HasLen for SegmentPostings
 }
 
 impl Postings for SegmentPostings {
-/// Returns the frequency associated to the current document.
-/// If the schema is set up so that no frequency have been encoded,
-/// this method should always return 1.
-///
-/// # Panics
-///
-/// Will panics if called without having called advance before.
 fn term_freq(&self) -> u32 {
-debug_assert!(
-// Here we do not use the len of `freqs()`
-// because it is actually ok to request for the freq of doc
-// even if no frequency were encoded for the field.
-//
-// In that case we hit the block just as if the frequency had been
-// decoded. The block is simply prefilled by the value 1.
-self.cur < COMPRESSION_BLOCK_SIZE,
-"Have you forgotten to call `.advance()` at least once before calling \
-`.term_freq()`."
-);
 self.block_cursor.freq(self.cur)
 }
 
 fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
-let term_freq = self.term_freq() as usize;
-if let Some(position_comp) = self.position_computer.as_mut() {
-output.resize(term_freq, 0u32);
-position_comp.positions_with_offset(offset, &mut output[..]);
+if self.position_computer.is_some() {
+output.resize(self.term_freq() as usize, 0u32);
+self.position_computer
+.as_mut()
+.unwrap()
+.positions_with_offset(offset, &mut output[..])
 } else {
 output.clear();
 }
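
A rough, self-contained sketch of the `positions_with_offset` contract visible in this hunk, under the assumption (suggested by the resize-to-term-freq call) that each stored position is emitted shifted by `offset`; the `raw_positions` argument is a stand-in for the position computer, not tantivy's API.

```rust
fn positions_with_offset(raw_positions: Option<&[u32]>, offset: u32, output: &mut Vec<u32>) {
    output.clear();
    if let Some(positions) = raw_positions {
        // One entry per occurrence of the term, shifted by `offset`.
        output.extend(positions.iter().map(|p| p + offset));
    }
    // With no positions indexed, the output stays empty, mirroring the
    // `output.clear()` branch above.
}

fn main() {
    let mut out = Vec::new();
    positions_with_offset(Some(&[0, 4, 9]), 10, &mut out);
    assert_eq!(out, vec![10, 14, 19]);
    positions_with_offset(None, 10, &mut out);
    assert!(out.is_empty());
}
```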
@@ -414,10 +424,6 @@ impl BlockSegmentPostings
 self.doc_decoder.output_array()
 }
 
-pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
-self.doc_decoder.output_aligned()
-}
-
 /// Return the document at index `idx` of the block.
 #[inline]
 pub fn doc(&self, idx: usize) -> u32 {
@@ -608,13 +614,16 @@ impl<'b> Streamer<'b> for BlockSegmentPostings
 
 #[cfg(test)]
 mod tests {
+
+use super::exponential_search;
+use super::linear_search;
+use super::search_within_block;
 use super::BlockSegmentPostings;
 use super::BlockSegmentPostingsSkipResult;
 use super::SegmentPostings;
 use common::HasLen;
 use core::Index;
 use docset::DocSet;
-use postings::postings::Postings;
 use schema::IndexRecordOption;
 use schema::Schema;
 use schema::Term;
@@ -623,6 +632,21 @@ mod tests {
 use DocId;
 use SkipResult;
 
+#[test]
+fn test_linear_search() {
+let len: usize = 50;
+let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
+for target in 1..*arr.last().unwrap() {
+let res = linear_search(&arr[..], target);
+if res > 0 {
+assert!(arr[res - 1] < target);
+}
+if res < len {
+assert!(arr[res] >= target);
+}
+}
+}
+
 #[test]
 fn test_empty_segment_postings() {
 let mut postings = SegmentPostings::empty();
@@ -631,18 +655,6 @@ mod tests {
 assert_eq!(postings.len(), 0);
 }
 
-#[test]
-#[should_panic(expected = "Have you forgotten to call `.advance()`")]
-fn test_panic_if_doc_called_before_advance() {
-SegmentPostings::empty().doc();
-}
-
-#[test]
-#[should_panic(expected = "Have you forgotten to call `.advance()`")]
-fn test_panic_if_freq_called_before_advance() {
-SegmentPostings::empty().term_freq();
-}
-
 #[test]
 fn test_empty_block_segment_postings() {
 let mut postings = BlockSegmentPostings::empty();
@@ -650,6 +662,56 @@ mod tests {
 assert_eq!(postings.doc_freq(), 0);
 }
 
+fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
+block
+.iter()
+.cloned()
+.enumerate()
+.filter(|&(_, ref val)| *val >= target)
+.next()
+.unwrap()
+.0
+}
+
+#[test]
+fn test_exponentiel_search() {
+assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
+assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
+assert_eq!(
+exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
+(3, 7)
+);
+}
+
+fn util_test_search_within_block(block: &[u32], target: u32) {
+assert_eq!(
+search_within_block(block, target),
+search_within_block_trivial_but_slow(block, target)
+);
+}
+
+fn util_test_search_within_block_all(block: &[u32]) {
+use std::collections::HashSet;
+let mut targets = HashSet::new();
+for (i, val) in block.iter().cloned().enumerate() {
+if i > 0 {
+targets.insert(val - 1);
+}
+targets.insert(val);
+}
+for target in targets {
+util_test_search_within_block(block, target);
+}
+}
+
+#[test]
+fn test_search_within_block() {
+for len in 1u32..128u32 {
+let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
+util_test_search_within_block_all(&v[..]);
+}
+}
+
 #[test]
 fn test_block_segment_postings() {
 let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
@@ -14,7 +14,7 @@ use termdict::{TermDictionaryBuilder, TermOrdinal};
 use DocId;
 use Result;
 
-/// `InvertedIndexSerializer` is in charge of serializing
+/// `PostingsSerializer` is in charge of serializing
 /// postings on disk, in the
 /// * `.idx` (inverted index)
 /// * `.pos` (positions file)
@@ -54,7 +54,7 @@ pub struct InvertedIndexSerializer {
 }
 
 impl InvertedIndexSerializer {
-/// Open a new `InvertedIndexSerializer` for the given segment
+/// Open a new `PostingsSerializer` for the given segment
 fn create(
 terms_write: CompositeWrite<WritePtr>,
 postings_write: CompositeWrite<WritePtr>,
@@ -14,35 +14,41 @@ use Score;
 /// specialized implementation if the two
 /// shortest scorers are `TermScorer`s.
 pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
-if scorers.is_empty() {
-return Box::new(EmptyScorer);
-}
-if scorers.len() == 1 {
-return scorers.pop().unwrap();
-}
-// We know that we have at least 2 elements.
 let num_docsets = scorers.len();
 scorers.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
-let left = scorers.pop().unwrap();
-let right = scorers.pop().unwrap();
+let rarest_opt = scorers.pop();
+let second_rarest_opt = scorers.pop();
 scorers.reverse();
-let all_term_scorers = [&left, &right]
-.iter()
-.all(|&scorer| scorer.is::<TermScorer>());
-if all_term_scorers {
-return Box::new(Intersection {
-left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
-right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
-others: scorers,
-num_docsets,
-});
+match (rarest_opt, second_rarest_opt) {
+(None, None) => Box::new(EmptyScorer),
+(Some(single_docset), None) => single_docset,
+(Some(left), Some(right)) => {
+{
+let all_term_scorers = [&left, &right]
+.iter()
+.all(|&scorer| scorer.is::<TermScorer>());
+if all_term_scorers {
+let left = *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap());
+let right = *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap());
+return Box::new(Intersection {
+left,
+right,
+others: scorers,
+num_docsets,
+});
+}
+}
+Box::new(Intersection {
+left,
+right,
+others: scorers,
+num_docsets,
+})
+}
+_ => {
+unreachable!();
+}
 }
-Box::new(Intersection {
-left,
-right,
-others: scorers,
-num_docsets,
-})
 }
 
 /// Creates a `DocSet` that iterator through the intersection of two `DocSet`s.
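
The rewritten `intersect_scorers` keeps the same driving idea as before: sort so that the rarest docset leads, then verify each of its candidates against the remaining docsets. Below is a self-contained sketch of that idea over plain sorted `Vec`s, which only stand in for the `Scorer` trait objects.

```rust
fn intersect_sorted(mut lists: Vec<Vec<u32>>) -> Vec<u32> {
    if lists.is_empty() {
        return Vec::new();
    }
    // Rarest (shortest) list first: it drives the candidates, like the
    // `size_hint`-based sort in the scorer code.
    lists.sort_by_key(|list| list.len());
    let (driver, others) = lists.split_first().unwrap();
    driver
        .iter()
        .cloned()
        .filter(|candidate| {
            // Keep a candidate only if every other list also contains it.
            others.iter().all(|list| list.binary_search(candidate).is_ok())
        })
        .collect()
}

fn main() {
    let lists = vec![vec![1, 3, 5, 7, 9], vec![3, 7], vec![2, 3, 5, 7, 11]];
    assert_eq!(intersect_sorted(lists), vec![3, 7]);
}
```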
@@ -118,6 +124,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
 return false;
 }
 }
+
 match left.skip_next(candidate) {
 SkipResult::Reached => {
 break;
@@ -133,36 +140,35 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
 }
 // test the remaining scorers;
 for (ord, docset) in self.others.iter_mut().enumerate() {
-if ord == other_candidate_ord {
-continue;
-}
-// `candidate_ord` is already at the
-// right position.
-//
-// Calling `skip_next` would advance this docset
-// and miss it.
-match docset.skip_next(candidate) {
-SkipResult::Reached => {}
-SkipResult::OverStep => {
-// this is not in the intersection,
-// let's update our candidate.
-candidate = docset.doc();
-match left.skip_next(candidate) {
-SkipResult::Reached => {
-other_candidate_ord = ord;
-}
-SkipResult::OverStep => {
-candidate = left.doc();
-other_candidate_ord = usize::max_value();
-}
-SkipResult::End => {
-return false;
+if ord != other_candidate_ord {
+// `candidate_ord` is already at the
+// right position.
+//
+// Calling `skip_next` would advance this docset
+// and miss it.
+match docset.skip_next(candidate) {
+SkipResult::Reached => {}
+SkipResult::OverStep => {
+// this is not in the intersection,
+// let's update our candidate.
+candidate = docset.doc();
+match left.skip_next(candidate) {
+SkipResult::Reached => {
+other_candidate_ord = ord;
+}
+SkipResult::OverStep => {
+candidate = left.doc();
+other_candidate_ord = usize::max_value();
+}
+SkipResult::End => {
+return false;
+}
 }
+continue 'outer;
+}
+SkipResult::End => {
+return false;
 }
-continue 'outer;
-}
-SkipResult::End => {
-return false;
-}
 }
 }
 }
@@ -98,20 +98,4 @@ mod tests {
 }
 }
 
-#[test]
-fn test_term_query_count_when_there_are_deletes() {
-let mut schema_builder = Schema::builder();
-let text_field = schema_builder.add_text_field("text", TEXT);
-let schema = schema_builder.build();
-let index = Index::create_in_ram(schema);
-let mut index_writer = index.writer_with_num_threads(1, 5_000_000).unwrap();
-index_writer.add_document(doc!(text_field=>"a b"));
-index_writer.add_document(doc!(text_field=>"a c"));
-index_writer.delete_term(Term::from_field_text(text_field, "b"));
-index_writer.commit().unwrap();
-let term_a = Term::from_field_text(text_field, "a");
-let term_query = TermQuery::new(term_a, IndexRecordOption::Basic);
-let reader = index.reader().unwrap();
-assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1);
-}
 }
@@ -39,15 +39,15 @@ impl Weight for TermWeight {
 }
 
 fn count(&self, reader: &SegmentReader) -> Result<u32> {
-if let Some(delete_bitset) = reader.delete_bitset() {
-Ok(self.scorer(reader)?.count(delete_bitset))
-} else {
+if reader.num_deleted_docs() == 0 {
 let field = self.term.field();
 Ok(reader
 .inverted_index(field)
 .get_term_info(&self.term)
 .map(|term_info| term_info.doc_freq)
 .unwrap_or(0))
+} else {
+Ok(self.scorer(reader)?.count())
 }
 }
 }
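
The hunk above reorders the two counting strategies: when the segment holds no deleted documents, the term dictionary's `doc_freq` is returned directly, and only segments with deletes pay for running the scorer. A hedged, self-contained sketch of that decision with illustrative names (plain slices instead of tantivy's bitsets):

```rust
fn count_matches(doc_freq: u32, matching: &[u32], deleted: &[u32]) -> u32 {
    if deleted.is_empty() {
        // No deletes: the precomputed document frequency is already exact.
        doc_freq
    } else {
        // With deletes, walk the matches and keep only live documents.
        matching
            .iter()
            .copied()
            .filter(|doc| !deleted.contains(doc))
            .count() as u32
    }
}

fn main() {
    assert_eq!(count_matches(2, &[0, 1], &[]), 2);
    assert_eq!(count_matches(2, &[0, 1], &[1]), 1);
}
```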
@@ -145,7 +145,7 @@ where
 }
 }
 
-fn count_including_deleted(&mut self) -> u32 {
+fn count(&mut self) -> u32 {
 let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
 .iter()
 .map(|bitset| bitset.len())
@@ -163,8 +163,6 @@ where
 count
 }
 
-// TODO implement `count` efficiently.
-
 fn skip_next(&mut self, target: DocId) -> SkipResult {
 if !self.advance() {
 return SkipResult::End;
@@ -302,7 +300,7 @@ mod tests {
 count += 1;
 }
 assert!(!union_expected.advance());
-assert_eq!(count, make_union().count_including_deleted());
+assert_eq!(count, make_union().count());
 }
 
 #[test]
@@ -13,11 +13,6 @@ pub trait Weight: Send + Sync + 'static {
 
 /// Returns the number documents within the given `SegmentReader`.
 fn count(&self, reader: &SegmentReader) -> Result<u32> {
-let mut scorer = self.scorer(reader)?;
-if let Some(delete_bitset) = reader.delete_bitset() {
-Ok(scorer.count(delete_bitset))
-} else {
-Ok(scorer.count_including_deleted())
-}
+Ok(self.scorer(reader)?.count())
 }
 }
@@ -97,8 +97,6 @@
 //! If you built your schema programmatically, a complete example
 //! could like this for instance.
 //!
-//! Note that tokens with a len greater or equal to [`MAX_TOKEN_LEN`](./constant.MAX_TOKEN_LEN.html).
-//!
 //! # Example
 //!
 //! ```
@@ -159,13 +157,6 @@ pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
 
-/// Maximum authorized len (in bytes) for a token.
-///
-/// Tokenizer are in charge of not emitting tokens larger than this value.
-/// Currently, if a faulty tokenizer implementation emits tokens with a length larger than
-/// `2^16 - 1 - 4`, the token will simply be ignored downstream.
-pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
-
 #[cfg(test)]
 pub mod tests {
 use super::{