Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-30 22:12:55 +00:00)

Compare commits: 0.5.2 ... common-cra (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 1658be3792 | |
| | 23fad88b35 | |
.gitignore (vendored): 3 lines changed

@@ -1,4 +1,3 @@
*.swp
target
target/debug
.vscode
@@ -9,4 +8,4 @@ benchmark
cpp/simdcomp/bitpackingbenchmark
*.bk
.idea
trace.dat
trace.dat
.travis.yml: 25 lines changed

@@ -1,6 +1,4 @@
language: rust
sudo: required
cache: cargo
rust:
- nightly
env:
@@ -13,7 +11,6 @@ addons:
apt:
sources:
- ubuntu-toolchain-r-test
- kalakris-cmake
packages:
- gcc-4.8
- g++-4.8
@@ -21,18 +18,18 @@ addons:
- libelf-dev
- libdw-dev
- binutils-dev
- cmake
before_script:
- export PATH=$HOME/.cargo/bin:$PATH
- cargo install cargo-update || echo "cargo-update already installed"
- cargo install cargo-travis || echo "cargo-travis already installed"
- cargo install-update -a # update outdated cached binaries
- |
pip install 'travis-cargo<0.2' --user &&
export PATH=$HOME/.local/bin:$PATH
script:
- cargo build
- cargo test
- cargo test -- --ignored
- |
travis-cargo build &&
travis-cargo test &&
travis-cargo bench
- cargo run --example simple_search
- cargo doc
after_success:
- cargo coveralls --exclude-pattern cpp/,src/functional_test.rs
- cargo doc-upload
- bash ./script/build-doc.sh
- travis-cargo doc-upload
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then travis-cargo coveralls --no-sudo --verify; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./kcov/build/src/kcov --verify --coveralls-id=$TRAVIS_JOB_ID --include-path=`pwd`/src --exclude-path=`pwd`/cpp --exclude-pattern=/.cargo target/kcov target/debug/tantivy-*; fi
.vimrc: 13 lines changed

@@ -1,13 +0,0 @@
set wildignore+=*/examples/*

set tabstop=2
set shiftwidth=2
set softtabstop=2
set expandtab
set nosmarttab

set textwidth=100

autocmd BufRead *.rs :setlocal tags=./rusty-tags.vi;/
autocmd BufWritePost *.rs :silent! exec "!rusty-tags vi -o --quiet --start-dir=" . expand('%:p:h') . "&" | redraw!
CHANGELOG.md: 23 lines changed

@@ -1,26 +1,3 @@
Tantivy 0.5.2
===========================
- bugfix #274
- bugfix #280
- bugfix #289

Tantivy 0.5.1
==========================
- bugfix #254 : tantivy failed if no documents in a segment contained a specific field.

Tantivy 0.5
==========================
- Faceting
- RangeQuery
- Configurable tokenization pipeline
- Bugfix in PhraseQuery
- Various query optimisation
- Allowing very large indexes
- 64 bits file address
- Smarter encoding of the `TermInfo` objects

Tantivy 0.4.3
==========================
Cargo.toml: 15 lines changed

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.5.2"
version = "0.5.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
build = "build.rs"
license = "MIT"
@@ -14,10 +14,11 @@ keywords = ["search", "information", "retrieval"]

[dependencies]
byteorder = "1.0"
memmap = "0.4"
lazy_static = "0.2.1"
tinysegmenter = "0.1.0"
regex = "0.2"
fst = "0.2"
fst = "0.1.37"
atomicwrites = "0.1.3"
tempfile = "2.1"
log = "0.3.6"
@@ -26,14 +27,16 @@ tempdir = "0.3"
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
libc = { version = "0.2.20", optional=true }
bincode = "0.8"
libc = {version = "0.2.20", optional=true}
num_cpus = "1.2"
itertools = "0.5.9"
lz4 = "1.20"
bit-set = "0.4.0"
time = "0.1"
uuid = { version = "0.6", features = ["v4", "serde"] }
uuid = { version = "0.5", features = ["v4", "serde"] }
chan = "0.1"
version = "2"
crossbeam = "0.3"
futures = "0.1"
futures-cpupool = "0.1"
@@ -41,8 +44,6 @@ error-chain = "0.8"
owning_ref = "0.3"
stable_deref_trait = "1.0.0"
rust-stemmers = "0.1.0"
downcast = { version="0.9", features = ["nightly"]}
matches = "0.1"

[target.'cfg(windows)'.dependencies]
winapi = "0.2"
@@ -52,7 +53,7 @@ rand = "0.3"
env_logger = "0.4"

[build-dependencies]
cc = { version="1.0.0", optional=true }
cc = {version = "1.0.0", optional=true}

[profile.release]
opt-level = 3
@@ -20,7 +20,10 @@ fn main() {
}
}

fn run_example(index_path: &Path) -> tantivy::Result<()> {

// # Defining the schema
//
// The Tantivy index requires a very strict schema.
@@ -28,6 +31,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// and for each field, its type and "the way it should
// be indexed".

// first we need to define a schema ...
let mut schema_builder = SchemaBuilder::default();

@@ -58,6 +62,8 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {

let schema = schema_builder.build();

// # Indexing documents
//
// Let's create a brand new index.
@@ -66,6 +72,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// with our schema in the directory.
let index = Index::create(index_path, schema.clone())?;

// To insert documents we need an index writer.
// There must be only one writer at a time.
// This single `IndexWriter` is already
@@ -78,6 +85,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// Let's index our documents!
// We first need a handle on the title and the body field.

// ### Create a document "manually".
//
// We can create a document manually, by setting the fields
@@ -90,7 +98,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
old_man_doc.add_text(
body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.",
he had gone eighty-four days now without taking a fish.",
);

// ... and add it to the `IndexWriter`.
@@ -137,6 +145,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// Indexing 5 million articles of the English wikipedia takes
// around 4 minutes on my computer!

// ### Committing
//
// At this point our documents are not searchable.
@@ -158,6 +167,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// tantivy behaves as if it has rolled back to its last
// commit.

// # Searching
//
// Let's search our index. Start by reloading
@@ -182,6 +192,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// A ticket has been opened regarding this problem.
let query = query_parser.parse_query("sea whale")?;

// A query defines a set of documents, as
// well as the way they should be scored.
//
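The hunks above only show fragments of the example file. For orientation, here is a minimal, self-contained sketch of the same workflow (define a schema, index a document, commit, reload searchers, then search), written against the 0.5-era API that the diff suggests. The 50 MB writer budget, the in-RAM index, and the sample document text are illustrative assumptions, not part of the diff.

```rust
#[macro_use]
extern crate tantivy;

use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, STORED, TEXT};
use tantivy::Index;

fn run() -> tantivy::Result<()> {
    // Define the schema: two indexed text fields, the title also stored.
    let mut schema_builder = SchemaBuilder::default();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let body = schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    // Create the index in RAM to keep the sketch self-contained
    // (the real example uses Index::create with a directory path).
    let index = Index::create_in_ram(schema);

    // A single IndexWriter at a time; 50 MB of indexing buffer is arbitrary.
    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(
        title => "The Old Man and the Sea",
        body => "He was an old man who fished alone in a skiff in the Gulf Stream."
    ));
    // Documents only become searchable after commit().
    index_writer.commit()?;

    // Reload searchers so the committed segment is visible, then search.
    index.load_searchers()?;
    let searcher = index.searcher();
    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    let query = query_parser.parse_query("sea whale")?;

    let mut top_collector = TopCollector::with_limit(10);
    searcher.search(&*query, &mut top_collector)?;
    for doc_address in top_collector.docs() {
        let retrieved = searcher.doc(&doc_address)?;
        println!("{}", index.schema().to_json(&retrieved));
    }
    Ok(())
}

fn main() {
    run().expect("example failed");
}
```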
@@ -1 +0,0 @@
use_try_shorthand = true
@@ -16,10 +16,6 @@ impl Collector for DoNothingCollector {
}
#[inline]
fn collect(&mut self, _doc: DocId, _score: Score) {}
#[inline]
fn requires_scoring(&self) -> bool {
false
}
}

/// Zero-cost abstraction used to collect on multiple collectors.
@@ -46,8 +42,8 @@ impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Rig
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
self.left.set_segment(segment_local_id, segment)?;
self.right.set_segment(segment_local_id, segment)?;
try!(self.left.set_segment(segment_local_id, segment));
try!(self.right.set_segment(segment_local_id, segment));
Ok(())
}

@@ -55,10 +51,6 @@ impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Rig
self.left.collect(doc, score);
self.right.collect(doc, score);
}

fn requires_scoring(&self) -> bool {
self.left.requires_scoring() || self.right.requires_scoring()
}
}

/// Creates a `ChainedCollector`
@@ -7,7 +7,6 @@ use SegmentLocalId;

/// `CountCollector` collector only counts how many
/// documents match the query.
#[derive(Default)]
pub struct CountCollector {
count: usize,
}
@@ -20,6 +19,12 @@ impl CountCollector {
}
}

impl Default for CountCollector {
fn default() -> CountCollector {
CountCollector { count: 0 }
}
}

impl Collector for CountCollector {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
@@ -28,27 +33,23 @@ impl Collector for CountCollector {
fn collect(&mut self, _: DocId, _: Score) {
self.count += 1;
}

fn requires_scoring(&self) -> bool {
false
}
}

#[cfg(test)]
mod tests {

use collector::{Collector, CountCollector};
use super::*;
use test::Bencher;
use collector::Collector;

#[test]
fn test_count_collector() {
let mut count_collector = CountCollector::default();
assert_eq!(count_collector.count(), 0);
count_collector.collect(0u32, 1f32);
assert_eq!(count_collector.count(), 1);
assert_eq!(count_collector.count(), 1);
count_collector.collect(1u32, 1f32);
assert_eq!(count_collector.count(), 2);
assert!(!count_collector.requires_scoring());
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {
let mut count_collector = CountCollector::default();
for doc in 0..1_000_000 {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
}

}
@@ -1,637 +1,113 @@
|
||||
use std::mem;
|
||||
use std::cmp::Eq;
|
||||
use std::collections::HashMap;
|
||||
use std::hash::Hash;
|
||||
|
||||
use collector::Collector;
|
||||
use fastfield::FacetReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
use std::cell::UnsafeCell;
|
||||
use schema::Facet;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::Bound;
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermStreamer;
|
||||
use termdict::TermStreamerBuilder;
|
||||
use std::collections::BTreeSet;
|
||||
use termdict::TermMerger;
|
||||
use docset::SkipResult;
|
||||
use std::{usize, u64};
|
||||
use std::iter::Peekable;
|
||||
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
struct Hit<'a> {
|
||||
count: u64,
|
||||
facet: &'a Facet,
|
||||
}
|
||||
|
||||
impl<'a> Eq for Hit<'a> {}
|
||||
|
||||
impl<'a> PartialEq<Hit<'a>> for Hit<'a> {
|
||||
fn eq(&self, other: &Hit) -> bool {
|
||||
self.count == other.count
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> PartialOrd<Hit<'a>> for Hit<'a> {
|
||||
fn partial_cmp(&self, other: &Hit) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Ord for Hit<'a> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other.count.cmp(&self.count)
|
||||
}
|
||||
}
|
||||
|
||||
struct SegmentFacetCounter {
|
||||
pub facet_reader: FacetReader,
|
||||
pub facet_ords: Vec<u64>,
|
||||
pub facet_counts: Vec<u64>,
|
||||
}
|
||||
|
||||
fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
if facet_bytes.is_empty() {
|
||||
0
|
||||
} else {
|
||||
facet_bytes.iter().cloned().filter(|b| *b == 0u8).count() + 1
|
||||
}
|
||||
}
|
||||
|
||||
/// Collector for faceting
|
||||
///
|
||||
/// The collector collects all facets. You need to configure it
|
||||
/// beforehand with the facet you want to extract.
|
||||
///
|
||||
/// This is done by calling `.add_facet(...)` with the root of the
|
||||
/// facet you want to extract as argument.
|
||||
///
|
||||
/// Facet counts will only be computed for the facet that are direct children
|
||||
/// of such a root facet.
|
||||
///
|
||||
/// For instance, if your index represents books, your hierarchy of facets
|
||||
/// may contain `category`, `language`.
|
||||
///
|
||||
/// The category facet may include `subcategories`. For instance, a book
|
||||
/// could belong to `/category/fiction/fantasy`.
|
||||
///
|
||||
/// If you request the facet counts for `/category`, the result will be
|
||||
/// the breakdown of counts for the direct children of `/category`
|
||||
/// (e.g. `/category/fiction`, `/category/biography`, `/category/personal_development`).
|
||||
///
|
||||
/// Once collection is finished, you can harvest its results in the form
|
||||
/// of a `FacetCounts` object, and extract your facet counts from it.
|
||||
///
|
||||
/// This implementation assumes you are working with a number of facets that
|
||||
/// is hundreds of times lower than your number of documents.
|
||||
///
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{Facet, SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::FacetCollector;
|
||||
/// use tantivy::query::AllQuery;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
///
|
||||
/// // Facet have their own specific type.
|
||||
/// // It is not a bad practise to put all of your
|
||||
/// // facet information in the same field.
|
||||
/// let facet = schema_builder.add_facet_field("facet");
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// // a document can be associated to any number of facets
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/fiction/fantasy")
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "Dune",
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/fiction/sci-fi")
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "La Vénus d'Ille",
|
||||
/// facet => Facet::from("/lang/fr"),
|
||||
/// facet => Facet::from("/category/fiction/fantasy"),
|
||||
/// facet => Facet::from("/category/fiction/horror")
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/biography")
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||
/// facet_collector.add_facet("/lang");
|
||||
/// facet_collector.add_facet("/category");
|
||||
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
///
|
||||
/// // this object contains count aggregate for all of the facets.
|
||||
/// let counts = facet_collector.harvest();
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = counts
|
||||
/// .get("/category")
|
||||
/// .collect();
|
||||
/// assert_eq!(facets, vec![
|
||||
/// (&Facet::from("/category/biography"), 1),
|
||||
/// (&Facet::from("/category/fiction"), 3)
|
||||
/// ]);
|
||||
/// }
|
||||
///
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||
/// facet_collector.add_facet("/category/fiction");
|
||||
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
///
|
||||
/// // this object contains count aggregate for all of the facets.
|
||||
/// let counts = facet_collector.harvest();
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = counts
|
||||
/// .get("/category/fiction")
|
||||
/// .collect();
|
||||
/// assert_eq!(facets, vec![
|
||||
/// (&Facet::from("/category/fiction/fantasy"), 2),
|
||||
/// (&Facet::from("/category/fiction/horror"), 1),
|
||||
/// (&Facet::from("/category/fiction/sci-fi"), 1)
|
||||
/// ]);
|
||||
/// }
|
||||
///
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||
/// facet_collector.add_facet("/category/fiction");
|
||||
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
///
|
||||
/// // this object contains count aggregate for all of the facets.
|
||||
/// let counts = facet_collector.harvest();
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = counts.top_k("/category/fiction", 1);
|
||||
/// assert_eq!(facets, vec![
|
||||
/// (&Facet::from("/category/fiction/fantasy"), 2)
|
||||
/// ]);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct FacetCollector {
|
||||
facet_ords: Vec<u64>,
|
||||
/// Facet collector for i64/u64 fast field
|
||||
pub struct FacetCollector<T>
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
counters: HashMap<T::ValueType, u64>,
|
||||
field: Field,
|
||||
ff_reader: Option<UnsafeCell<FacetReader>>,
|
||||
segment_counters: Vec<SegmentFacetCounter>,
|
||||
|
||||
// facet_ord -> collapse facet_id
|
||||
current_segment_collapse_mapping: Vec<usize>,
|
||||
// collapse facet_id -> count
|
||||
current_segment_counts: Vec<u64>,
|
||||
// collapse facet_id -> facet_ord
|
||||
current_collapse_facet_ords: Vec<u64>,
|
||||
|
||||
facets: BTreeSet<Facet>,
|
||||
ff_reader: Option<T>,
|
||||
}
|
||||
|
||||
fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
||||
target: &[u8],
|
||||
collapse_it: &mut Peekable<I>,
|
||||
) -> SkipResult {
|
||||
loop {
|
||||
match collapse_it.peek() {
|
||||
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
|
||||
Ordering::Less => {}
|
||||
Ordering::Greater => {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
Ordering::Equal => {
|
||||
return SkipResult::Reached;
|
||||
}
|
||||
},
|
||||
None => {
|
||||
return SkipResult::End;
|
||||
}
|
||||
}
|
||||
collapse_it.next();
|
||||
}
|
||||
}
|
||||
|
||||
impl FacetCollector {
|
||||
/// Create a facet collector to collect the facets
|
||||
/// from a specific facet `Field`.
|
||||
///
|
||||
/// This function does not check whether the field
|
||||
/// is of the proper type.
|
||||
pub fn for_field(field: Field) -> FacetCollector {
|
||||
impl<T> FacetCollector<T>
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
/// Creates a new facet collector for aggregating a given field.
|
||||
pub fn new(field: Field) -> FacetCollector<T> {
|
||||
FacetCollector {
|
||||
facet_ords: Vec::with_capacity(255),
|
||||
segment_counters: Vec::new(),
|
||||
field,
|
||||
counters: HashMap::new(),
|
||||
field: field,
|
||||
ff_reader: None,
|
||||
facets: BTreeSet::new(),
|
||||
|
||||
current_segment_collapse_mapping: Vec::new(),
|
||||
current_collapse_facet_ords: Vec::new(),
|
||||
current_segment_counts: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Adds a facet that we want to record counts
|
||||
///
|
||||
/// Adding facet `Facet::from("/country")` for instance,
|
||||
/// will record the counts of all of the direct children of the facet country
|
||||
/// (e.g. `/country/FR`, `/country/UK`).
|
||||
///
|
||||
/// Adding two facets within which one is the prefix of the other is forbidden.
|
||||
/// If you need the correct number of unique documents for two such facets,
|
||||
/// just add them in separate `FacetCollector`.
|
||||
pub fn add_facet<T>(&mut self, facet_from: T)
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
let facet = Facet::from(facet_from);
|
||||
for old_facet in &self.facets {
|
||||
assert!(
|
||||
!old_facet.is_prefix_of(&facet),
|
||||
"Tried to add a facet which is a descendant of an already added facet."
|
||||
);
|
||||
assert!(
|
||||
!facet.is_prefix_of(old_facet),
|
||||
"Tried to add a facet which is an ancestor of an already added facet."
|
||||
);
|
||||
}
|
||||
self.facets.insert(facet);
|
||||
}
|
||||
|
||||
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
|
||||
self.current_segment_collapse_mapping.clear();
|
||||
self.current_collapse_facet_ords.clear();
|
||||
self.current_segment_counts.clear();
|
||||
let mut collapse_facet_it = self.facets.iter().peekable();
|
||||
self.current_collapse_facet_ords.push(0);
|
||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
||||
if !facet_streamer.advance() {
|
||||
return;
|
||||
}
|
||||
'outer: loop {
|
||||
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
|
||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||
match skip_result {
|
||||
SkipResult::Reached => {
|
||||
// we reach a facet we decided to collapse.
|
||||
let collapse_depth = facet_depth(facet_streamer.key());
|
||||
let mut collapsed_id = 0;
|
||||
self.current_segment_collapse_mapping.push(0);
|
||||
while facet_streamer.advance() {
|
||||
let depth = facet_depth(facet_streamer.key());
|
||||
if depth <= collapse_depth {
|
||||
continue 'outer;
|
||||
}
|
||||
if depth == collapse_depth + 1 {
|
||||
collapsed_id = self.current_collapse_facet_ords.len();
|
||||
self.current_collapse_facet_ords
|
||||
.push(facet_streamer.term_ord());
|
||||
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||
} else {
|
||||
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
SkipResult::End | SkipResult::OverStep => {
|
||||
self.current_segment_collapse_mapping.push(0);
|
||||
if !facet_streamer.advance() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn finalize_segment(&mut self) {
|
||||
if self.ff_reader.is_some() {
|
||||
self.segment_counters.push(SegmentFacetCounter {
|
||||
facet_reader: self.ff_reader.take().unwrap().into_inner(),
|
||||
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
|
||||
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the results of the collection.
|
||||
///
|
||||
/// This method does not just return the counters,
|
||||
/// it also translates the facet ordinals of the last segment.
|
||||
pub fn harvest(mut self) -> FacetCounts {
|
||||
self.finalize_segment();
|
||||
|
||||
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_ords[..])
|
||||
.collect();
|
||||
let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_counts[..])
|
||||
.collect();
|
||||
|
||||
let facet_streams = self.segment_counters
|
||||
.iter()
|
||||
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut facet_merger = TermMerger::new(facet_streams);
|
||||
let mut facet_counts = BTreeMap::new();
|
||||
|
||||
while facet_merger.advance() {
|
||||
let count = facet_merger
|
||||
.current_kvs()
|
||||
.iter()
|
||||
.map(|it| {
|
||||
let seg_ord = it.segment_ord;
|
||||
let term_ord = it.streamer.term_ord();
|
||||
collapsed_facet_ords[seg_ord]
|
||||
.binary_search(&term_ord)
|
||||
.map(|collapsed_term_id| {
|
||||
if collapsed_term_id == 0 {
|
||||
0
|
||||
} else {
|
||||
collapsed_facet_counts[seg_ord][collapsed_term_id]
|
||||
}
|
||||
})
|
||||
.unwrap_or(0)
|
||||
})
|
||||
.sum();
|
||||
if count > 0u64 {
|
||||
let bytes = facet_merger.key().to_owned();
|
||||
facet_counts.insert(Facet::from_encoded(bytes), count);
|
||||
}
|
||||
}
|
||||
FacetCounts { facet_counts }
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for FacetCollector {
|
||||
impl<T> Collector for FacetCollector<T>
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.finalize_segment();
|
||||
let facet_reader = reader.facet_reader(self.field)?;
|
||||
self.set_collapse_mapping(&facet_reader);
|
||||
self.current_segment_counts
|
||||
.resize(self.current_collapse_facet_ords.len(), 0);
|
||||
self.ff_reader = Some(UnsafeCell::new(facet_reader));
|
||||
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: DocId, _: Score) {
|
||||
let facet_reader: &mut FacetReader = unsafe {
|
||||
&mut *self.ff_reader
|
||||
.as_ref()
|
||||
.expect("collect() was called before set_segment. This should never happen.")
|
||||
.get()
|
||||
};
|
||||
facet_reader.facet_ords(doc, &mut self.facet_ords);
|
||||
let mut previous_collapsed_ord: usize = usize::MAX;
|
||||
for &facet_ord in &self.facet_ords {
|
||||
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
|
||||
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
|
||||
{
|
||||
0
|
||||
} else {
|
||||
1
|
||||
};
|
||||
previous_collapsed_ord = collapsed_ord;
|
||||
}
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Intermediary result of the `FacetCollector` that stores
|
||||
/// the facet counts for all the segments.
|
||||
pub struct FacetCounts {
|
||||
facet_counts: BTreeMap<Facet, u64>,
|
||||
}
|
||||
|
||||
impl FacetCounts {
|
||||
#[allow(needless_lifetimes)] //< compiler fails if we remove the lifetime
|
||||
pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator<Item = (&'a Facet, u64)>
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
let facet = Facet::from(facet_from);
|
||||
let left_bound = Bound::Excluded(facet.clone());
|
||||
let right_bound = if facet.is_root() {
|
||||
Bound::Unbounded
|
||||
} else {
|
||||
let mut facet_after_bytes = facet.encoded_bytes().to_owned();
|
||||
facet_after_bytes.push(1u8);
|
||||
let facet_after = Facet::from_encoded(facet_after_bytes);
|
||||
Bound::Excluded(facet_after)
|
||||
};
|
||||
|
||||
self.facet_counts
|
||||
.range((left_bound, right_bound))
|
||||
.map(|(facet, count)| (facet, *count))
|
||||
}
|
||||
|
||||
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
let mut heap = BinaryHeap::with_capacity(k);
|
||||
let mut it = self.get(facet);
|
||||
|
||||
for (facet, count) in (&mut it).take(k) {
|
||||
heap.push(Hit { count, facet });
|
||||
}
|
||||
|
||||
let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN);
|
||||
for (facet, count) in it {
|
||||
if count > lowest_count {
|
||||
lowest_count = count;
|
||||
if let Some(mut head) = heap.peek_mut() {
|
||||
*head = Hit { count, facet };
|
||||
}
|
||||
}
|
||||
}
|
||||
heap.into_sorted_vec()
|
||||
.into_iter()
|
||||
.map(|hit| (hit.facet, hit.count))
|
||||
.collect::<Vec<_>>()
|
||||
let val = self.ff_reader
|
||||
.as_ref()
|
||||
.expect("collect() was called before set_segment. This should never happen.")
|
||||
.get(doc);
|
||||
*(self.counters.entry(val).or_insert(0)) += 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use test::Bencher;
|
||||
use core::Index;
|
||||
use schema::{Document, Facet, SchemaBuilder};
|
||||
use query::AllQuery;
|
||||
use super::{FacetCollector, FacetCounts};
|
||||
use std::iter;
|
||||
use schema::Field;
|
||||
use rand::{thread_rng, Rng};
|
||||
|
||||
use collector::{chain, FacetCollector};
|
||||
use query::QueryParser;
|
||||
use fastfield::{I64FastFieldReader, U64FastFieldReader};
|
||||
use schema::{self, FAST, STRING};
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_drilldown() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
// create 10 documents, set num field value to 0 or 1 for even/odd ones
|
||||
// make sure we have facet counters correctly filled
|
||||
fn test_facet_collector_results() {
|
||||
let mut schema_builder = schema::SchemaBuilder::new();
|
||||
let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
|
||||
let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
|
||||
let text_field = schema_builder.add_text_field("text", STRING);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let num_facets: usize = 3 * 4 * 5;
|
||||
let facets: Vec<Facet> = (0..num_facets)
|
||||
.map(|mut n| {
|
||||
let top = n % 3;
|
||||
n /= 3;
|
||||
let mid = n % 4;
|
||||
n /= 4;
|
||||
let leaf = n % 5;
|
||||
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
|
||||
})
|
||||
.collect();
|
||||
for i in 0..num_facets * 10 {
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, facets[i % num_facets].clone());
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet(Facet::from("/top1"));
|
||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
|
||||
let counts: FacetCounts = facet_collector.harvest();
|
||||
{
|
||||
let facets: Vec<(String, u64)> = counts
|
||||
.get("/top1")
|
||||
.map(|(facet, count)| (facet.to_string(), count))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
facets,
|
||||
[
|
||||
("/top1/mid0", 50),
|
||||
("/top1/mid1", 50),
|
||||
("/top1/mid2", 50),
|
||||
("/top1/mid3", 50),
|
||||
].iter()
|
||||
.map(|&(facet_str, count)| (String::from(facet_str), count))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Tried to add a facet which is a descendant of \
|
||||
an already added facet.")]
|
||||
fn test_misused_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||
facet_collector.add_facet(Facet::from("/country"));
|
||||
facet_collector.add_facet(Facet::from("/country/europe"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_non_used_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||
facet_collector.add_facet(Facet::from("/country"));
|
||||
facet_collector.add_facet(Facet::from("/countryeurope"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_topk() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut docs: Vec<Document> = vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
|
||||
.into_iter()
|
||||
.flat_map(|(c, count)| {
|
||||
let facet = Facet::from(&format!("/facet_{}", c));
|
||||
let doc = doc!(facet_field => facet);
|
||||
iter::repeat(doc).take(count)
|
||||
})
|
||||
.collect();
|
||||
thread_rng().shuffle(&mut docs[..]);
|
||||
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let searcher = index.searcher();
|
||||
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet("/");
|
||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
|
||||
let counts: FacetCounts = facet_collector.harvest();
|
||||
{
|
||||
let facets: Vec<(&Facet, u64)> = counts.top_k("/", 3);
|
||||
assert_eq!(
|
||||
facets,
|
||||
vec![
|
||||
(&Facet::from("/facet_b"), 100),
|
||||
(&Facet::from("/facet_e"), 21),
|
||||
(&Facet::from("/facet_d"), 12),
|
||||
]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_facet_collector(b: &mut Bencher) {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut docs = vec![];
|
||||
for val in 0..50 {
|
||||
let facet = Facet::from(&format!("/facet_{}", val));
|
||||
for _ in 0..val * val {
|
||||
docs.push(doc!(facet_field=>facet.clone()));
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
for i in 0u64..10u64 {
|
||||
index_writer.add_document(doc!(
|
||||
num_field_i64 => ((i as i64) % 3i64) as i64,
|
||||
num_field_u64 => (i % 2u64) as u64,
|
||||
text_field => "text"
|
||||
));
|
||||
}
|
||||
}
|
||||
assert_eq!(index_writer.commit().unwrap(), 10u64);
|
||||
}
|
||||
// 40425 docs
|
||||
thread_rng().shuffle(&mut docs[..]);
|
||||
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let mut ffvf_i64: FacetCollector<I64FastFieldReader> = FacetCollector::new(num_field_i64);
|
||||
let mut ffvf_u64: FacetCollector<U64FastFieldReader> = FacetCollector::new(num_field_u64);
|
||||
|
||||
b.iter(|| {
|
||||
let searcher = index.searcher();
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
});
|
||||
{
|
||||
// perform the query
|
||||
let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
|
||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||
let query = query_parser.parse_query("text:text").unwrap();
|
||||
query.search(&searcher, &mut facet_collectors).unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(ffvf_u64.counters[&0], 5);
|
||||
assert_eq!(ffvf_u64.counters[&1], 5);
|
||||
assert_eq!(ffvf_i64.counters[&0], 4);
|
||||
assert_eq!(ffvf_i64.counters[&1], 3);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,123 +0,0 @@
|
||||
use std::cmp::Eq;
|
||||
use std::collections::HashMap;
|
||||
use std::hash::Hash;
|
||||
|
||||
use collector::Collector;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
|
||||
|
||||
/// Facet collector for i64/u64 fast field
|
||||
pub struct IntFacetCollector<T>
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
counters: HashMap<T::ValueType, u64>,
|
||||
field: Field,
|
||||
ff_reader: Option<T>,
|
||||
}
|
||||
|
||||
|
||||
impl<T> IntFacetCollector<T>
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
/// Creates a new facet collector for aggregating a given field.
|
||||
pub fn new(field: Field) -> IntFacetCollector<T> {
|
||||
IntFacetCollector {
|
||||
counters: HashMap::new(),
|
||||
field: field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<T> Collector for IntFacetCollector<T>
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: DocId, _: Score) {
|
||||
let val = self.ff_reader
|
||||
.as_ref()
|
||||
.expect(
|
||||
"collect() was called before set_segment. \
|
||||
This should never happen.",
|
||||
)
|
||||
.get(doc);
|
||||
*(self.counters.entry(val).or_insert(0)) += 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use collector::{chain, IntFacetCollector};
|
||||
use query::QueryParser;
|
||||
use fastfield::{I64FastFieldReader, U64FastFieldReader};
|
||||
use schema::{self, FAST, STRING};
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
// create 10 documents, set num field value to 0 or 1 for even/odd ones
|
||||
// make sure we have facet counters correctly filled
|
||||
fn test_facet_collector_results() {
|
||||
|
||||
let mut schema_builder = schema::SchemaBuilder::new();
|
||||
let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
|
||||
let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
|
||||
let text_field = schema_builder.add_text_field("text", STRING);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
for i in 0u64..10u64 {
|
||||
index_writer.add_document(doc!(
|
||||
num_field_i64 => ((i as i64) % 3i64) as i64,
|
||||
num_field_u64 => (i % 2u64) as u64,
|
||||
text_field => "text"
|
||||
));
|
||||
}
|
||||
}
|
||||
assert_eq!(index_writer.commit().unwrap(), 10u64);
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64);
|
||||
let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64);
|
||||
|
||||
{
|
||||
// perform the query
|
||||
let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
|
||||
let mut query_parser = QueryParser::for_index(index, vec![text_field]);
|
||||
let query = query_parser.parse_query("text:text").unwrap();
|
||||
query.search(&searcher, &mut facet_collectors).unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(ffvf_u64.counters[&0], 5);
|
||||
assert_eq!(ffvf_u64.counters[&1], 5);
|
||||
assert_eq!(ffvf_i64.counters[&0], 4);
|
||||
assert_eq!(ffvf_i64.counters[&1], 3);
|
||||
|
||||
}
|
||||
}
|
||||
@@ -62,9 +62,6 @@ pub trait Collector {
|
||||
) -> Result<()>;
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score);
|
||||
|
||||
/// Returns true iff the collector requires to compute scores for documents.
|
||||
fn requires_scoring(&self) -> bool;
|
||||
}
|
||||
|
||||
impl<'a, C: Collector> Collector for &'a mut C {
|
||||
@@ -77,11 +74,7 @@ impl<'a, C: Collector> Collector for &'a mut C {
|
||||
}
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
C::collect(self, doc, score)
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
C::requires_scoring(self)
|
||||
(*self).collect(doc, score);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,6 +87,7 @@ pub mod tests {
|
||||
use Score;
|
||||
use core::SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use fastfield::U64FastFieldReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
|
||||
@@ -134,10 +128,6 @@ pub mod tests {
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
self.docs.push(doc + self.offset);
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Collects in order all of the fast fields for all of the
|
||||
@@ -147,14 +137,14 @@ pub mod tests {
|
||||
pub struct FastFieldTestCollector {
|
||||
vals: Vec<u64>,
|
||||
field: Field,
|
||||
ff_reader: Option<FastFieldReader<u64>>,
|
||||
ff_reader: Option<U64FastFieldReader>,
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
pub fn for_field(field: Field) -> FastFieldTestCollector {
|
||||
FastFieldTestCollector {
|
||||
vals: Vec::new(),
|
||||
field,
|
||||
field: field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
@@ -166,7 +156,7 @@ pub mod tests {
|
||||
|
||||
impl Collector for FastFieldTestCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
|
||||
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -174,9 +164,6 @@ pub mod tests {
|
||||
let val = self.ff_reader.as_ref().unwrap().get(doc);
|
||||
self.vals.push(val);
|
||||
}
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
#[bench]
|
||||
|
||||
@@ -16,7 +16,9 @@ pub struct MultiCollector<'a> {
|
||||
impl<'a> MultiCollector<'a> {
|
||||
/// Constructor
|
||||
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
|
||||
MultiCollector { collectors }
|
||||
MultiCollector {
|
||||
collectors: collectors,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,7 +29,7 @@ impl<'a> Collector for MultiCollector<'a> {
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
for collector in &mut self.collectors {
|
||||
collector.set_segment(segment_local_id, segment)?;
|
||||
try!(collector.set_segment(segment_local_id, segment));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -37,11 +39,6 @@ impl<'a> Collector for MultiCollector<'a> {
|
||||
collector.collect(doc, score);
|
||||
}
|
||||
}
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collectors
|
||||
.iter()
|
||||
.any(|collector| collector.requires_scoring())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -60,7 +60,7 @@ impl TopCollector {
|
||||
panic!("Limit must be strictly greater than 0.");
|
||||
}
|
||||
TopCollector {
|
||||
limit,
|
||||
limit: limit,
|
||||
heap: BinaryHeap::with_capacity(limit),
|
||||
segment_id: 0,
|
||||
}
|
||||
@@ -119,16 +119,12 @@ impl Collector for TopCollector {
|
||||
}
|
||||
} else {
|
||||
let wrapped_doc = GlobalScoredDoc {
|
||||
score,
|
||||
score: score,
|
||||
doc_address: DocAddress(self.segment_id, doc),
|
||||
};
|
||||
self.heap.push(wrapped_doc);
|
||||
}
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -4,35 +4,64 @@ use common::serialize::BinarySerializable;
use std::mem;
use std::ops::Deref;

pub(crate) struct BitPacker {
/// Computes the number of bits that will be used for bitpacking.
///
/// In general the target is the minimum number of bits
/// required to express the amplitude given in argument.
///
/// e.g. If the amplitude is 10, we can store all ints on simply 4 bits.
///
/// The logic is slightly more convoluted here because, for optimization
/// reasons, we want to ensure that a value spans over at most 8
/// aligned bytes.
///
/// Spanning over 9 bytes is possible, for instance, if we do
/// bitpacking with an amplitude of 63 bits.
/// In this case, the second int will start on bit
/// 63 (which belongs to byte 7) and end at byte 15;
/// hence 9 bytes (from byte 7 to byte 15 included).
///
/// To avoid this, we force the number of bits to 64 bits
/// when the result is greater than `64 - 8 = 56` bits.
///
/// Note that this only affects rare use cases spanning over
/// a very large range of values. Even in this case, it results
/// in an extra cost of at most 12% compared to the optimal
/// number of bits.
pub fn compute_num_bits(amplitude: u64) -> u8 {
let amplitude = (64u32 - amplitude.leading_zeros()) as u8;
if amplitude <= 64 - 8 {
amplitude
} else {
64
}
}
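To make the capping rule above concrete, here is a small self-contained sketch; `num_bits_for` is a hypothetical stand-in that mirrors the function shown in this hunk, not part of tantivy's public API. The minimal bit width is used up to 56 bits, after which the width jumps straight to 64 so that no packed value straddles more than 8 aligned bytes.

```rust
// Sketch of the bit-width rule described above (illustration only).
fn num_bits_for(amplitude: u64) -> u8 {
    let bits = (64 - amplitude.leading_zeros()) as u8;
    if bits <= 64 - 8 {
        bits // minimal width: a packed value spans at most 8 aligned bytes
    } else {
        64 // widths of 57..=63 could straddle 9 bytes, so round up to 64
    }
}

fn main() {
    assert_eq!(num_bits_for(255), 8);
    assert_eq!(num_bits_for(256), 9);
    assert_eq!(num_bits_for(5_000_000_000), 33);
    // An amplitude needing 61 bits gets padded out to a full 64.
    assert_eq!(num_bits_for(1u64 << 60), 64);
}
```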
|
||||
|
||||
pub struct BitPacker {
|
||||
mini_buffer: u64,
|
||||
mini_buffer_written: usize,
|
||||
num_bits: usize,
|
||||
}
|
||||
|
||||
impl BitPacker {
|
||||
pub fn new() -> BitPacker {
|
||||
pub fn new(num_bits: usize) -> BitPacker {
|
||||
BitPacker {
|
||||
mini_buffer: 0u64,
|
||||
mini_buffer_written: 0,
|
||||
num_bits,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write<TWrite: Write>(
|
||||
&mut self,
|
||||
val: u64,
|
||||
num_bits: u8,
|
||||
output: &mut TWrite,
|
||||
) -> io::Result<()> {
|
||||
pub fn write<TWrite: Write>(&mut self, val: u64, output: &mut TWrite) -> io::Result<()> {
|
||||
let val_u64 = val as u64;
|
||||
let num_bits = num_bits as usize;
|
||||
if self.mini_buffer_written + num_bits > 64 {
|
||||
if self.mini_buffer_written + self.num_bits > 64 {
|
||||
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
|
||||
self.mini_buffer.serialize(output)?;
|
||||
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
|
||||
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
|
||||
self.mini_buffer_written = self.mini_buffer_written + (self.num_bits as usize) - 64;
|
||||
} else {
|
||||
self.mini_buffer |= val_u64 << self.mini_buffer_written;
|
||||
self.mini_buffer_written += num_bits;
|
||||
self.mini_buffer_written += self.num_bits;
|
||||
if self.mini_buffer_written == 64 {
|
||||
self.mini_buffer.serialize(output)?;
|
||||
self.mini_buffer_written = 0;
|
||||
@@ -42,7 +71,7 @@ impl BitPacker {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
pub(crate) fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
if self.mini_buffer_written > 0 {
|
||||
let num_bytes = (self.mini_buffer_written + 7) / 8;
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
|
||||
@@ -60,7 +89,6 @@ impl BitPacker {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BitUnpacker<Data>
|
||||
where
|
||||
Data: Deref<Target = [u8]>,
|
||||
@@ -74,14 +102,14 @@ impl<Data> BitUnpacker<Data>
|
||||
where
|
||||
Data: Deref<Target = [u8]>,
|
||||
{
|
||||
pub fn new(data: Data, num_bits: u8) -> BitUnpacker<Data> {
|
||||
pub fn new(data: Data, num_bits: usize) -> BitUnpacker<Data> {
|
||||
let mask: u64 = if num_bits == 64 {
|
||||
!0u64
|
||||
} else {
|
||||
(1u64 << num_bits) - 1u64
|
||||
};
|
||||
BitUnpacker {
|
||||
num_bits: num_bits as usize,
|
||||
num_bits,
|
||||
mask,
|
||||
data,
|
||||
}
|
||||
@@ -89,7 +117,7 @@ where
|
||||
|
||||
pub fn get(&self, idx: usize) -> u64 {
|
||||
if self.num_bits == 0 {
|
||||
return 0u64;
|
||||
return 0;
|
||||
}
|
||||
let data: &[u8] = &*self.data;
|
||||
let num_bits = self.num_bits;
|
||||
@@ -105,9 +133,10 @@ where
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
|
||||
let val_unshifted_unmasked: u64 =
|
||||
unsafe { *(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
(val_shifted & mask)
|
||||
} else {
|
||||
let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
|
||||
unsafe { *(data[addr..].as_ptr() as *const u64) }
|
||||
@@ -118,19 +147,15 @@ where
|
||||
}
|
||||
unsafe { *(buffer[..].as_ptr() as *const u64) }
|
||||
};
|
||||
let val_shifted = val_unshifted_unmasked >> (bit_shift as u64);
|
||||
val_shifted & mask
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
(val_shifted & mask)
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads a range of values from the fast field.
|
||||
///
|
||||
/// The range of values read is from
|
||||
/// `[start..start + output.len()[`
|
||||
pub fn get_range(&self, start: u32, output: &mut [u64]) {
|
||||
if self.num_bits == 0 {
|
||||
for val in output.iter_mut() {
|
||||
*val = 0u64;
|
||||
*val = 0;
|
||||
}
|
||||
} else {
|
||||
let data: &[u8] = &*self.data;
|
||||
@@ -140,7 +165,8 @@ where
|
||||
for output_val in output.iter_mut() {
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
|
||||
let val_unshifted_unmasked: u64 =
|
||||
unsafe { *(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
*output_val = val_shifted & mask;
|
||||
addr_in_bits += num_bits;
|
||||
@@ -151,25 +177,37 @@ where
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::{BitPacker, BitUnpacker};
|
||||
use super::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker<Vec<u8>>, Vec<u64>) {
|
||||
#[test]
|
||||
fn test_compute_num_bits() {
|
||||
assert_eq!(compute_num_bits(1), 1u8);
|
||||
assert_eq!(compute_num_bits(0), 0u8);
|
||||
assert_eq!(compute_num_bits(2), 2u8);
|
||||
assert_eq!(compute_num_bits(3), 2u8);
|
||||
assert_eq!(compute_num_bits(4), 3u8);
|
||||
assert_eq!(compute_num_bits(255), 8u8);
|
||||
assert_eq!(compute_num_bits(256), 9u8);
|
||||
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
|
||||
}
|
||||
|
||||
fn create_fastfield_bitpacker(len: usize, num_bits: usize) -> (BitUnpacker<Vec<u8>>, Vec<u64>) {
|
||||
let mut data = Vec::new();
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
|
||||
let mut bitpacker = BitPacker::new(num_bits);
|
||||
let max_val: u64 = (1 << num_bits) - 1;
|
||||
let vals: Vec<u64> = (0u64..len as u64)
|
||||
.map(|i| if max_val == 0 { 0 } else { i % max_val })
|
||||
.collect();
|
||||
for &val in &vals {
|
||||
bitpacker.write(val, num_bits, &mut data).unwrap();
|
||||
bitpacker.write(val, &mut data).unwrap();
|
||||
}
|
||||
bitpacker.close(&mut data).unwrap();
|
||||
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
|
||||
assert_eq!(data.len(), (num_bits * len + 7) / 8 + 7);
|
||||
let bitunpacker = BitUnpacker::new(data, num_bits);
|
||||
(bitunpacker, vals)
|
||||
}
|
||||
|
||||
fn test_bitpacker_util(len: usize, num_bits: u8) {
|
||||
fn test_bitpacker_util(len: usize, num_bits: usize) {
|
||||
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
|
||||
for (i, val) in vals.iter().enumerate() {
|
||||
assert_eq!(bitunpacker.get(i), *val);
|
||||
|
||||
@@ -1,390 +0,0 @@
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||
pub(crate) struct TinySet(u64);
|
||||
|
||||
impl fmt::Debug for TinySet {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
self.into_iter().collect::<Vec<u32>>().fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TinySetIterator(TinySet);
|
||||
impl Iterator for TinySetIterator {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.0.pop_lowest()
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoIterator for TinySet {
|
||||
type Item = u32;
|
||||
type IntoIter = TinySetIterator;
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
TinySetIterator(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl TinySet {
|
||||
/// Returns an empty `TinySet`.
|
||||
pub fn empty() -> TinySet {
|
||||
TinySet(0u64)
|
||||
}
|
||||
|
||||
/// Returns the complement of the set in `[0, 64[`.
|
||||
fn complement(&self) -> TinySet {
|
||||
TinySet(!self.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` contains the element `el`.
|
||||
pub fn contains(&self, el: u32) -> bool {
|
||||
!self.intersect(TinySet::singleton(el)).is_empty()
|
||||
}
|
||||
|
||||
/// Returns the intersection of `self` and `other`
|
||||
pub fn intersect(&self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 & other.0)
|
||||
}
|
||||
|
||||
/// Creates a new `TinySet` containing only one element
|
||||
/// within `[0; 64[`
|
||||
#[inline(always)]
|
||||
pub fn singleton(el: u32) -> TinySet {
|
||||
TinySet(1u64 << u64::from(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
#[inline(always)]
|
||||
pub fn insert(self, el: u32) -> TinySet {
|
||||
self.union(TinySet::singleton(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
#[inline(always)]
|
||||
pub fn insert_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
*self = old.insert(el);
|
||||
old != *self
|
||||
}
|
||||
|
||||
/// Returns the union of two tinysets
|
||||
#[inline(always)]
|
||||
pub fn union(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 | other.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` is empty.
|
||||
#[inline(always)]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0 == 0u64
|
||||
}
|
||||
|
||||
/// Returns the lowest element in the `TinySet`
|
||||
/// and removes it.
|
||||
#[inline(always)]
|
||||
pub fn pop_lowest(&mut self) -> Option<u32> {
|
||||
if let Some(lowest) = self.lowest() {
|
||||
self.0 ^= TinySet::singleton(lowest).0;
|
||||
Some(lowest)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the lowest element in the `TinySet`
|
||||
/// (or None if the set is empty).
|
||||
#[inline(always)]
|
||||
pub fn lowest(&mut self) -> Option<u32> {
|
||||
if self.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let least_significant_bit = self.0.trailing_zeros() as u32;
|
||||
Some(least_significant_bit)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a `TinySet` than contains all values up
|
||||
/// to limit excluded.
|
||||
///
|
||||
/// The limit is assumed to be strictly lower than 64.
|
||||
pub fn range_lower(upper_bound: u32) -> TinySet {
|
||||
TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
|
||||
}
|
||||
|
||||
/// Returns a `TinySet` that contains all values greater
|
||||
/// or equal to the given limit, included. (and up to 63)
|
||||
///
|
||||
/// The limit is assumed to be strictly lower than 64.
|
||||
pub fn range_greater_or_equal(from_included: u32) -> TinySet {
|
||||
TinySet::range_lower(from_included).complement()
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.0 = 0u64;
|
||||
}
|
||||
|
||||
pub fn len(&self) -> u32 {
|
||||
self.0.count_ones()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BitSet {
|
||||
tinysets: Box<[TinySet]>,
|
||||
len: usize, //< Technically it should be u32, but we
|
||||
// count multiple inserts.
|
||||
// `usize` guards us from overflow.
|
||||
max_value: u32,
|
||||
}
|
||||
|
||||
fn num_buckets(max_val: u32) -> u32 {
|
||||
(max_val + 63u32) / 64u32
|
||||
}
|
||||
|
||||
impl BitSet {
/// Create a new `BitSet` that may contain elements
/// within `[0, max_value)`.
pub fn with_max_value(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value);
let tinysets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
BitSet {
tinysets,
len: 0,
max_value,
}
}

/// Removes all elements from the `BitSet`.
pub fn clear(&mut self) {
for tinyset in self.tinysets.iter_mut() {
*tinyset = TinySet::empty();
}
}

/// Returns the number of elements in the `BitSet`.
pub fn len(&self) -> usize {
self.len
}

/// Inserts an element in the `BitSet`.
pub fn insert(&mut self, el: u32) {
// we do not check saturated els.
let higher = el / 64u32;
let lower = el % 64u32;
self.len += if self.tinysets[higher as usize].insert_mut(lower) {
1
} else {
0
};
}

/// Returns true iff the element is in the `BitSet`.
pub fn contains(&self, el: u32) -> bool {
self.tinyset(el / 64u32).contains(el % 64)
}

/// Returns the first non-empty `TinySet` associated with a bucket equal
/// to or greater than `bucket`.
///
/// Reminder: the tiny set with the bucket `bucket` represents the
/// elements from `bucket * 64` to `(bucket + 1) * 64`.
pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
self.tinysets[bucket as usize..]
.iter()
.cloned()
.position(|tinyset| !tinyset.is_empty())
.map(|delta_bucket| bucket + delta_bucket as u32)
}

pub fn max_value(&self) -> u32 {
self.max_value
}

/// Returns the tiny bitset representing the
/// set restricted to the number range from
/// `bucket * 64` to `(bucket + 1) * 64`.
pub(crate) fn tinyset(&self, bucket: u32) -> TinySet {
self.tinysets[bucket as usize]
}
}

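// A short usage sketch of the `BitSet` above (illustration only; duplicate
// inserts do not change `len`, since `TinySet::insert_mut` reports whether
// the underlying bit actually changed):
//
//     let mut bits = BitSet::with_max_value(1_000);
//     bits.insert(3u32);
//     bits.insert(3u32); // duplicate: `len` stays the same
//     bits.insert(70u32);
//     assert!(bits.contains(3u32) && bits.contains(70u32));
//     assert_eq!(bits.len(), 2);
//     assert_eq!(bits.first_non_empty_bucket(1), Some(1)); // 70 lives in bucket 1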
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
extern crate test;
|
||||
use tests;
|
||||
use std::collections::HashSet;
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use tests::generate_nonunique_unsorted;
|
||||
use std::collections::BTreeSet;
|
||||
use query::BitSetDocSet;
|
||||
use docset::DocSet;
|
||||
|
||||
#[test]
|
||||
fn test_tiny_set() {
|
||||
assert!(TinySet::empty().is_empty());
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none())
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1u32).insert(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none())
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(2u32);
|
||||
assert_eq!(u.pop_lowest(), Some(2u32));
|
||||
u.insert_mut(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(63u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset() {
|
||||
let test_against_hashset = |els: &[u32], max_value: u32| {
|
||||
let mut hashset: HashSet<u32> = HashSet::new();
|
||||
let mut bitset = BitSet::with_max_value(max_value);
|
||||
for &el in els {
|
||||
assert!(el < max_value);
|
||||
hashset.insert(el);
|
||||
bitset.insert(el);
|
||||
}
|
||||
for el in 0..max_value {
|
||||
assert_eq!(hashset.contains(&el), bitset.contains(el));
|
||||
}
|
||||
assert_eq!(bitset.max_value(), max_value);
|
||||
};
|
||||
|
||||
test_against_hashset(&[], 0);
|
||||
test_against_hashset(&[], 1);
|
||||
test_against_hashset(&[0u32], 1);
|
||||
test_against_hashset(&[0u32], 100);
|
||||
test_against_hashset(&[1u32, 2u32], 4);
|
||||
test_against_hashset(&[99u32], 100);
|
||||
test_against_hashset(&[63u32], 64);
|
||||
test_against_hashset(&[62u32, 63u32], 64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_large() {
|
||||
let arr = generate_nonunique_unsorted(1_000_000, 50_000);
|
||||
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
|
||||
let mut bitset = BitSet::with_max_value(1_000_000);
|
||||
for el in arr {
|
||||
btreeset.insert(el);
|
||||
bitset.insert(el);
|
||||
}
|
||||
for i in 0..1_000_000 {
|
||||
assert_eq!(btreeset.contains(&i), bitset.contains(i));
|
||||
}
|
||||
assert_eq!(btreeset.len(), bitset.len());
|
||||
let mut bitset_docset = BitSetDocSet::from(bitset);
|
||||
for el in btreeset.into_iter() {
|
||||
bitset_docset.advance();
|
||||
assert_eq!(bitset_docset.doc(), el);
|
||||
}
|
||||
assert!(!bitset_docset.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_num_buckets() {
|
||||
use super::num_buckets;
|
||||
assert_eq!(num_buckets(0u32), 0);
|
||||
assert_eq!(num_buckets(1u32), 1);
|
||||
assert_eq!(num_buckets(64u32), 1);
|
||||
assert_eq!(num_buckets(65u32), 2);
|
||||
assert_eq!(num_buckets(128u32), 2);
|
||||
assert_eq!(num_buckets(129u32), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tinyset_range() {
|
||||
assert_eq!(
|
||||
TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
|
||||
[0, 1, 2]
|
||||
);
|
||||
assert!(TinySet::range_lower(0).is_empty());
|
||||
assert_eq!(
|
||||
TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
|
||||
(0u32..63u32).collect::<Vec<_>>()
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
|
||||
[0]
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
|
||||
[0, 1]
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_greater_or_equal(3)
|
||||
.into_iter()
|
||||
.collect::<Vec<u32>>(),
|
||||
(3u32..64u32).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_len() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
assert_eq!(bitset.len(), 0);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 1);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(104u32);
|
||||
assert_eq!(bitset.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_clear() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
let els = tests::sample(1_000, 0.01f32);
|
||||
for &el in &els {
|
||||
bitset.insert(el);
|
||||
}
|
||||
assert!(els.iter().all(|el| bitset.contains(*el)));
|
||||
bitset.clear();
|
||||
for el in 0u32..1000u32 {
|
||||
assert!(!bitset.contains(el));
|
||||
}
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_pop(b: &mut test::Bencher) {
|
||||
b.iter(|| test::black_box(TinySet::singleton(31u32)).pop_lowest());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_sum(b: &mut test::Bencher) {
|
||||
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
|
||||
b.iter(|| {
|
||||
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyarr_sum(b: &mut test::Bencher) {
|
||||
let v = [10u32, 14u32, 21u32];
|
||||
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitset_initialize(b: &mut test::Bencher) {
|
||||
b.iter(|| BitSet::with_max_value(1_000_000));
|
||||
}
|
||||
}
|
||||
@@ -4,43 +4,14 @@ use std::collections::HashMap;
use schema::Field;
use common::VInt;
use directory::WritePtr;
use std::io::{self, Read};
use std::io;
use directory::ReadOnlySource;
use common::BinarySerializable;

#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
pub struct FileAddr {
field: Field,
idx: usize,
}

impl FileAddr {
fn new(field: Field, idx: usize) -> FileAddr {
FileAddr { field, idx }
}
}

impl BinarySerializable for FileAddr {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
self.field.serialize(writer)?;
VInt(self.idx as u64).serialize(writer)?;
Ok(())
}

fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let field = Field::deserialize(reader)?;
let idx = VInt::deserialize(reader)?.0 as usize;
Ok(FileAddr {
field,
idx,
})
}
}

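// A round-trip sketch for the `BinarySerializable` impl of `FileAddr` above
// (illustration only; `field` stands for any `Field` obtained from the schema):
//
//     let mut buffer: Vec<u8> = Vec::new();
//     let addr = FileAddr::new(field, 2);
//     addr.serialize(&mut buffer).unwrap();
//     let mut cursor = &buffer[..];
//     assert_eq!(FileAddr::deserialize(&mut cursor).unwrap(), addr);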
/// A `CompositeWrite` is used to write a `CompositeFile`.
|
||||
pub struct CompositeWrite<W = WritePtr> {
|
||||
write: CountingWriter<W>,
|
||||
offsets: HashMap<FileAddr, usize>,
|
||||
offsets: HashMap<Field, usize>,
|
||||
}
|
||||
|
||||
impl<W: Write> CompositeWrite<W> {
|
||||
@@ -55,15 +26,9 @@ impl<W: Write> CompositeWrite<W> {
|
||||
|
||||
/// Start writing a new field.
|
||||
pub fn for_field(&mut self, field: Field) -> &mut CountingWriter<W> {
|
||||
self.for_field_with_idx(field, 0)
|
||||
}
|
||||
|
||||
/// Start writing a new field.
|
||||
pub fn for_field_with_idx(&mut self, field: Field, idx: usize) -> &mut CountingWriter<W> {
|
||||
let offset = self.write.written_bytes();
|
||||
let file_addr = FileAddr::new(field, idx);
|
||||
assert!(!self.offsets.contains_key(&file_addr));
|
||||
self.offsets.insert(file_addr, offset);
|
||||
assert!(!self.offsets.contains_key(&field));
|
||||
self.offsets.insert(field, offset);
|
||||
&mut self.write
|
||||
}
|
||||
|
||||
@@ -77,16 +42,16 @@ impl<W: Write> CompositeWrite<W> {
|
||||
|
||||
let mut offset_fields: Vec<_> = self.offsets
|
||||
.iter()
|
||||
.map(|(file_addr, offset)| (*offset, *file_addr))
|
||||
.map(|(field, offset)| (offset, field))
|
||||
.collect();
|
||||
|
||||
offset_fields.sort();
|
||||
|
||||
let mut prev_offset = 0;
|
||||
for (offset, file_addr) in offset_fields {
|
||||
for (offset, field) in offset_fields {
|
||||
VInt((offset - prev_offset) as u64).serialize(&mut self.write)?;
|
||||
file_addr.serialize(&mut self.write)?;
|
||||
prev_offset = offset;
|
||||
field.serialize(&mut self.write)?;
|
||||
prev_offset = *offset;
|
||||
}
|
||||
|
||||
let footer_len = (self.write.written_bytes() - footer_offset) as u32;
|
||||
@@ -105,7 +70,7 @@ impl<W: Write> CompositeWrite<W> {
|
||||
#[derive(Clone)]
|
||||
pub struct CompositeFile {
|
||||
data: ReadOnlySource,
|
||||
offsets_index: HashMap<FileAddr, (usize, usize)>,
|
||||
offsets_index: HashMap<Field, (usize, usize)>,
|
||||
}
|
||||
|
||||
impl CompositeFile {
|
||||
@@ -121,7 +86,7 @@ impl CompositeFile {
|
||||
let mut footer_buffer = footer_data.as_slice();
|
||||
let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;
|
||||
|
||||
let mut file_addrs = vec![];
|
||||
let mut fields = vec![];
|
||||
let mut offsets = vec![];
|
||||
|
||||
let mut field_index = HashMap::new();
|
||||
@@ -129,16 +94,16 @@ impl CompositeFile {
|
||||
let mut offset = 0;
|
||||
for _ in 0..num_fields {
|
||||
offset += VInt::deserialize(&mut footer_buffer)?.0 as usize;
|
||||
let file_addr = FileAddr::deserialize(&mut footer_buffer)?;
|
||||
let field = Field::deserialize(&mut footer_buffer)?;
|
||||
offsets.push(offset);
|
||||
file_addrs.push(file_addr);
|
||||
fields.push(field);
|
||||
}
|
||||
offsets.push(footer_start);
|
||||
for i in 0..num_fields {
|
||||
let file_addr = file_addrs[i];
|
||||
let field = fields[i];
|
||||
let start_offset = offsets[i];
|
||||
let end_offset = offsets[i + 1];
|
||||
field_index.insert(file_addr, (start_offset, end_offset));
|
||||
field_index.insert(field, (start_offset, end_offset));
|
||||
}
|
||||
|
||||
Ok(CompositeFile {
|
||||
@@ -159,14 +124,8 @@ impl CompositeFile {
|
||||
/// Returns the `ReadOnlySource` associated
|
||||
/// to a given `Field` and stored in a `CompositeFile`.
|
||||
pub fn open_read(&self, field: Field) -> Option<ReadOnlySource> {
|
||||
self.open_read_with_idx(field, 0)
|
||||
}
|
||||
|
||||
/// Returns the `ReadOnlySource` associated
|
||||
/// to a given `Field` and stored in a `CompositeFile`.
|
||||
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<ReadOnlySource> {
|
||||
self.offsets_index
|
||||
.get(&FileAddr { field, idx, })
|
||||
.get(&field)
|
||||
.map(|&(from, to)| self.data.slice(from, to))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,60 +4,19 @@ mod vint;
|
||||
mod counting_writer;
|
||||
mod composite_file;
|
||||
pub mod bitpacker;
|
||||
mod bitset;
|
||||
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::serialize::{BinarySerializable, FixedSize};
|
||||
pub use self::serialize::BinarySerializable;
|
||||
pub use self::timer::Timing;
|
||||
pub use self::timer::TimerTree;
|
||||
pub use self::timer::OpenTimer;
|
||||
pub use self::vint::VInt;
|
||||
pub use self::counting_writer::CountingWriter;
|
||||
pub use self::bitset::BitSet;
|
||||
pub(crate) use self::bitset::TinySet;
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
use std::io;
|
||||
|
||||
/// Computes the number of bits that will be used for bitpacking.
///
/// In general the target is the minimum number of bits
/// required to express the amplitude given in argument.
///
/// e.g. if the amplitude is 10, we can store all ints on just 4 bits.
///
/// The logic is slightly more convoluted here because, for optimization
/// reasons, we want to ensure that a value spans over at most 8
/// aligned bytes.
///
/// Spanning over 9 bytes is possible, for instance, if we do
/// bitpacking with an amplitude of 63 bits.
/// In this case, the second int will start on bit
/// 63 (which belongs to byte 7) and end at byte 15;
/// hence 9 bytes (from byte 7 to byte 15 included).
///
/// To avoid this, we force the number of bits to 64 bits
/// when the result is greater than `64 - 8 = 56` bits.
///
/// Note that this only affects rare use cases spanning over
/// a very large range of values. Even in this case, it results
/// in an extra cost of at most 12% compared to the optimal
/// number of bits.
pub(crate) fn compute_num_bits(n: u64) -> u8 {
let amplitude = (64u32 - n.leading_zeros()) as u8;
if amplitude <= 64 - 8 {
amplitude
} else {
64
}
}
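// A worked example for `compute_num_bits` (illustration only): an amplitude
// of 10 fits on 4 bits (2^4 = 16 > 10), 255 needs exactly 8 bits, and an
// amplitude needing 60 bits exceeds the 56-bit threshold, so it is rounded
// up to 64 bits.
//
//     assert_eq!(compute_num_bits(10u64), 4u8);
//     assert_eq!(compute_num_bits(255u64), 8u8);
//     assert_eq!(compute_num_bits((1u64 << 60) - 1), 64u8);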

pub(crate) fn is_power_of_2(n: usize) -> bool {
(n > 0) && (n & (n - 1) == 0)
}

/// Create a default io error given a string.
pub(crate) fn make_io_err(msg: String) -> io::Error {
pub fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}

@@ -106,10 +65,9 @@ pub fn u64_to_i64(val: u64) -> i64 {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
mod test {
|
||||
|
||||
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
|
||||
pub use super::serialize::test::fixed_size_test;
|
||||
use super::{i64_to_u64, u64_to_i64};
|
||||
|
||||
fn test_i64_converter_helper(val: i64) {
|
||||
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
||||
@@ -126,16 +84,4 @@ pub(crate) mod test {
|
||||
test_i64_converter_helper(i);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compute_num_bits() {
|
||||
assert_eq!(compute_num_bits(1), 1u8);
|
||||
assert_eq!(compute_num_bits(0), 0u8);
|
||||
assert_eq!(compute_num_bits(2), 2u8);
|
||||
assert_eq!(compute_num_bits(3), 2u8);
|
||||
assert_eq!(compute_num_bits(4), 3u8);
|
||||
assert_eq!(compute_num_bits(255), 8u8);
|
||||
assert_eq!(compute_num_bits(256), 9u8);
|
||||
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,25 +1,16 @@
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||
use common::Endianness;
|
||||
use byteorder::LittleEndian as Endianness;
|
||||
use std::fmt;
|
||||
use std::io::Write;
|
||||
use std::io::Read;
|
||||
use std::io;
|
||||
use common::VInt;
|
||||
|
||||
/// Trait for a simple binary serialization.
|
||||
pub trait BinarySerializable: fmt::Debug + Sized {
|
||||
/// Serialize
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()>;
|
||||
/// Deserialize
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
|
||||
}
|
||||
|
||||
/// `FixedSize` marks a `BinarySerializable` as
|
||||
/// always serializing to the same size.
|
||||
pub trait FixedSize: BinarySerializable {
|
||||
const SIZE_IN_BYTES: usize;
|
||||
}
|
||||
|
||||
impl BinarySerializable for () {
|
||||
fn serialize<W: Write>(&self, _: &mut W) -> io::Result<()> {
|
||||
Ok(())
|
||||
@@ -29,10 +20,6 @@ impl BinarySerializable for () {
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for () {
|
||||
const SIZE_IN_BYTES: usize = 0;
|
||||
}
|
||||
|
||||
impl<T: BinarySerializable> BinarySerializable for Vec<T> {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
VInt(self.len() as u64).serialize(writer)?;
|
||||
@@ -72,10 +59,6 @@ impl BinarySerializable for u32 {
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for u32 {
|
||||
const SIZE_IN_BYTES: usize = 4;
|
||||
}
|
||||
|
||||
impl BinarySerializable for u64 {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_u64::<Endianness>(*self)
|
||||
@@ -85,10 +68,6 @@ impl BinarySerializable for u64 {
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for u64 {
|
||||
const SIZE_IN_BYTES: usize = 8;
|
||||
}
|
||||
|
||||
impl BinarySerializable for i64 {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_i64::<Endianness>(*self)
|
||||
@@ -98,10 +77,6 @@ impl BinarySerializable for i64 {
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for i64 {
|
||||
const SIZE_IN_BYTES: usize = 8;
|
||||
}
|
||||
|
||||
impl BinarySerializable for u8 {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_u8(*self)
|
||||
@@ -111,10 +86,6 @@ impl BinarySerializable for u8 {
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for u8 {
|
||||
const SIZE_IN_BYTES: usize = 1;
|
||||
}
|
||||
|
||||
impl BinarySerializable for String {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let data: &[u8] = self.as_bytes();
|
||||
@@ -133,78 +104,63 @@ impl BinarySerializable for String {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
mod test {
|
||||
|
||||
use common::VInt;
|
||||
use super::*;
|
||||
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
O::default().serialize(&mut buffer).unwrap();
|
||||
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
|
||||
}
|
||||
|
||||
fn serialize_test<T: BinarySerializable + Eq>(v: T) -> usize {
|
||||
fn serialize_test<T: BinarySerializable + Eq>(v: T, num_bytes: usize) {
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
v.serialize(&mut buffer).unwrap();
|
||||
let num_bytes = buffer.len();
|
||||
if num_bytes != 0 {
|
||||
v.serialize(&mut buffer).unwrap();
|
||||
assert_eq!(buffer.len(), num_bytes);
|
||||
} else {
|
||||
v.serialize(&mut buffer).unwrap();
|
||||
}
|
||||
let mut cursor = &buffer[..];
|
||||
let deser = T::deserialize(&mut cursor).unwrap();
|
||||
assert_eq!(deser, v);
|
||||
num_bytes
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_u8() {
|
||||
fixed_size_test::<u8>();
|
||||
serialize_test(3u8, 1);
|
||||
serialize_test(5u8, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_u32() {
|
||||
fixed_size_test::<u32>();
|
||||
assert_eq!(4, serialize_test(3u32));
|
||||
assert_eq!(4, serialize_test(5u32));
|
||||
assert_eq!(4, serialize_test(u32::max_value()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_i64() {
|
||||
fixed_size_test::<i64>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_u64() {
|
||||
fixed_size_test::<u64>();
|
||||
serialize_test(3u32, 4);
|
||||
serialize_test(5u32, 4);
|
||||
serialize_test(u32::max_value(), 4);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_string() {
|
||||
assert_eq!(serialize_test(String::from("")), 1);
|
||||
assert_eq!(serialize_test(String::from("ぽよぽよ")), 1 + 3 * 4);
|
||||
assert_eq!(
|
||||
serialize_test(String::from("富士さん見える。")),
|
||||
1 + 3 * 8
|
||||
);
|
||||
serialize_test(String::from(""), 1);
|
||||
serialize_test(String::from("ぽよぽよ"), 1 + 3 * 4);
|
||||
serialize_test(String::from("富士さん見える。"), 1 + 3 * 8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_vec() {
|
||||
assert_eq!(serialize_test(Vec::<u8>::new()), 1);
|
||||
assert_eq!(serialize_test(vec![1u32, 3u32]), 1 + 4 * 2);
|
||||
let v: Vec<u8> = Vec::new();
|
||||
serialize_test(v, 1);
|
||||
serialize_test(vec![1u32, 3u32], 1 + 4 * 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_vint() {
|
||||
for i in 0..10_000 {
|
||||
serialize_test(VInt(i as u64));
|
||||
serialize_test(VInt(i as u64), 0);
|
||||
}
|
||||
assert_eq!(serialize_test(VInt(7u64)), 1);
|
||||
assert_eq!(serialize_test(VInt(127u64)), 1);
|
||||
assert_eq!(serialize_test(VInt(128u64)), 2);
|
||||
assert_eq!(serialize_test(VInt(129u64)), 2);
|
||||
assert_eq!(serialize_test(VInt(1234u64)), 2);
|
||||
assert_eq!(serialize_test(VInt(16_383u64)), 2);
|
||||
assert_eq!(serialize_test(VInt(16_384u64)), 3);
|
||||
assert_eq!(serialize_test(VInt(u64::max_value())), 10);
|
||||
serialize_test(VInt(7u64), 1);
|
||||
serialize_test(VInt(127u64), 1);
|
||||
serialize_test(VInt(128u64), 2);
|
||||
serialize_test(VInt(129u64), 2);
|
||||
serialize_test(VInt(1234u64), 2);
|
||||
serialize_test(VInt(16_383), 2);
|
||||
serialize_test(VInt(16_384), 3);
|
||||
serialize_test(VInt(u64::max_value()), 10);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ impl<'a> OpenTimer<'a> {
|
||||
/// when the `OpenTimer` is dropped.
|
||||
pub fn open(&mut self, name: &'static str) -> OpenTimer {
|
||||
OpenTimer {
|
||||
name,
|
||||
name: name,
|
||||
timer_tree: self.timer_tree,
|
||||
start: PreciseTime::now(),
|
||||
depth: self.depth + 1,
|
||||
@@ -58,7 +58,7 @@ impl TimerTree {
|
||||
/// Open a new named subtask
|
||||
pub fn open(&mut self, name: &'static str) -> OpenTimer {
|
||||
OpenTimer {
|
||||
name,
|
||||
name: name,
|
||||
timer_tree: self,
|
||||
start: PreciseTime::now(),
|
||||
depth: 0,
|
||||
|
||||
@@ -11,10 +11,6 @@ impl VInt {
|
||||
pub fn val(&self) -> u64 {
|
||||
self.0
|
||||
}
|
||||
|
||||
pub fn deserialize_u64<R: Read>(reader: &mut R) -> io::Result<u64> {
|
||||
VInt::deserialize(reader).map(|vint| vint.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for VInt {
|
||||
|
||||
@@ -5,13 +5,6 @@ mod stream;
|
||||
|
||||
pub use self::stream::CompressedIntStream;
|
||||
|
||||
pub const COMPRESSION_BLOCK_SIZE: usize = 128;

/// Returns the size in bytes of a compressed block, given `num_bits`.
pub fn compressed_block_size(num_bits: u8) -> usize {
1 + (num_bits as usize) * 16
}

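// The arithmetic behind `compressed_block_size` (illustration only): a block
// packs COMPRESSION_BLOCK_SIZE = 128 values at `num_bits` bits each, i.e.
// 128 * num_bits / 8 = 16 * num_bits bytes, plus one leading byte that stores
// `num_bits` itself.
//
//     assert_eq!(compressed_block_size(7u8), 1 + 7 * 16); // 113 bytes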
#[cfg(not(feature = "simdcompression"))]
|
||||
mod pack {
|
||||
mod compression_pack_nosimd;
|
||||
@@ -38,6 +31,11 @@ mod vint {
|
||||
pub(crate) use self::compression_vint_simd::*;
|
||||
}
|
||||
|
||||
/// Returns the size in bytes of a compressed block, given `num_bits`.
|
||||
pub fn compressed_block_size(num_bits: u8) -> usize {
|
||||
1 + (num_bits as usize) * 16
|
||||
}
|
||||
|
||||
pub trait VIntEncoder {
|
||||
/// Compresses an array of `u32` integers,
|
||||
/// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
|
||||
@@ -114,6 +112,8 @@ impl VIntDecoder for BlockDecoder {
|
||||
}
|
||||
}
|
||||
|
||||
pub const COMPRESSION_BLOCK_SIZE: usize = 128;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use common::compute_num_bits;
|
||||
use common::bitpacker::compute_num_bits;
|
||||
use common::bitpacker::{BitPacker, BitUnpacker};
|
||||
use common::CountingWriter;
|
||||
use std::cmp;
|
||||
use std::io::Write;
|
||||
use super::super::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
|
||||
use super::super::COMPRESSION_BLOCK_SIZE;
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
|
||||
|
||||
@@ -23,15 +23,11 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz
|
||||
let num_bits = compute_num_bits(max_delta as u64);
|
||||
counting_writer.write_all(&[num_bits]).unwrap();
|
||||
|
||||
let mut bit_packer = BitPacker::new();
|
||||
let mut bit_packer = BitPacker::new(num_bits as usize);
|
||||
for val in vals {
|
||||
bit_packer
|
||||
.write(*val as u64, num_bits, &mut counting_writer)
|
||||
.unwrap();
|
||||
bit_packer.write(*val as u64, &mut counting_writer).unwrap();
|
||||
}
|
||||
let compressed_size = counting_writer.written_bytes();
|
||||
assert_eq!(compressed_size, compressed_block_size(num_bits));
|
||||
compressed_size
|
||||
counting_writer.written_bytes()
|
||||
}
|
||||
|
||||
pub struct BlockEncoder {
|
||||
@@ -65,15 +61,13 @@ impl BlockEncoder {
|
||||
let num_bits = compute_num_bits(max as u64);
|
||||
let mut counting_writer = CountingWriter::wrap(output);
|
||||
counting_writer.write_all(&[num_bits]).unwrap();
|
||||
let mut bit_packer = BitPacker::new();
|
||||
let mut bit_packer = BitPacker::new(num_bits as usize);
|
||||
for val in vals {
|
||||
bit_packer
|
||||
.write(*val as u64, num_bits, &mut counting_writer)
|
||||
.unwrap();
|
||||
bit_packer.write(*val as u64, &mut counting_writer).unwrap();
|
||||
}
|
||||
for _ in vals.len()..COMPRESSION_BLOCK_SIZE {
|
||||
bit_packer
|
||||
.write(vals[0] as u64, num_bits, &mut counting_writer)
|
||||
.write(vals[0] as u64, &mut counting_writer)
|
||||
.unwrap();
|
||||
}
|
||||
bit_packer.flush(&mut counting_writer).expect(
|
||||
@@ -112,14 +106,14 @@ impl BlockDecoder {
|
||||
) -> usize {
|
||||
let consumed_size = {
|
||||
let num_bits = compressed_data[0];
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits);
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
|
||||
for i in 0..COMPRESSION_BLOCK_SIZE {
|
||||
let delta = bit_unpacker.get(i);
|
||||
let val = offset + delta as u32;
|
||||
self.output[i] = val;
|
||||
offset = val;
|
||||
}
|
||||
compressed_block_size(num_bits)
|
||||
1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8
|
||||
};
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
consumed_size
|
||||
@@ -127,7 +121,7 @@ impl BlockDecoder {
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
|
||||
let num_bits = compressed_data[0];
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits);
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
|
||||
for i in 0..COMPRESSION_BLOCK_SIZE {
|
||||
self.output[i] = bit_unpacker.get(i) as u32;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use super::super::COMPRESSION_BLOCK_SIZE;
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
|
||||
|
||||
@@ -25,7 +25,9 @@ fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize {
|
||||
}
|
||||
|
||||
fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
|
||||
unsafe { simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset) }
|
||||
unsafe {
|
||||
simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset)
|
||||
}
|
||||
}
|
||||
|
||||
fn compress_unsorted(vals: &[u32], output: &mut [u8]) -> usize {
|
||||
|
||||
@@ -18,7 +18,6 @@ use core::SegmentMeta;
|
||||
use super::pool::LeasedItem;
|
||||
use std::path::Path;
|
||||
use core::IndexMeta;
|
||||
use indexer::DirectoryLock;
|
||||
use IndexWriter;
|
||||
use directory::ManagedDirectory;
|
||||
use core::META_FILEPATH;
|
||||
@@ -114,9 +113,12 @@ impl Index {
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
/// Reads the index meta file from the directory.
|
||||
pub fn load_metas(&self) -> Result<IndexMeta> {
|
||||
load_metas(self.directory())
|
||||
/// Returns the index opstamp.
|
||||
///
|
||||
/// The opstamp is the number of documents that have been added
|
||||
/// from the beginning of time, and until the moment of the last commit.
|
||||
pub fn opstamp(&self) -> u64 {
|
||||
load_metas(self.directory()).unwrap().opstamp
|
||||
}
|
||||
|
||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||
@@ -139,8 +141,7 @@ impl Index {
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes: usize,
|
||||
) -> Result<IndexWriter> {
|
||||
let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
|
||||
open_index_writer(self, num_threads, heap_size_in_bytes, directory_lock)
|
||||
open_index_writer(self, num_threads, heap_size_in_bytes)
|
||||
}
|
||||
|
||||
/// Creates a multithreaded writer
|
||||
@@ -193,7 +194,7 @@ impl Index {
|
||||
/// Reads the meta.json and returns the list of
|
||||
/// `SegmentMeta` from the last commit.
|
||||
pub fn searchable_segment_metas(&self) -> Result<Vec<SegmentMeta>> {
|
||||
Ok(self.load_metas()?.segments)
|
||||
Ok(load_metas(self.directory())?.segments)
|
||||
}
|
||||
|
||||
/// Returns the list of segment ids that are searchable.
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
use schema::Schema;
|
||||
use core::SegmentMeta;
|
||||
use std::fmt;
|
||||
use serde_json;
|
||||
|
||||
/// Meta information about the `Index`.
|
||||
///
|
||||
@@ -11,13 +9,11 @@ use serde_json;
|
||||
/// * the index `docstamp`
|
||||
/// * the schema
|
||||
///
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct IndexMeta {
|
||||
pub segments: Vec<SegmentMeta>,
|
||||
pub schema: Schema,
|
||||
pub opstamp: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub payload: Option<String>,
|
||||
}
|
||||
|
||||
impl IndexMeta {
|
||||
@@ -26,43 +22,6 @@ impl IndexMeta {
|
||||
segments: vec![],
|
||||
schema,
|
||||
opstamp: 0u64,
|
||||
payload: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for IndexMeta {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}",
|
||||
serde_json::ser::to_string(self)
|
||||
.expect("JSON serialization for IndexMeta should never fail.")
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use serde_json;
|
||||
use super::IndexMeta;
|
||||
use schema::{SchemaBuilder, TEXT};
|
||||
|
||||
#[test]
|
||||
fn test_serialize_metas() {
|
||||
let schema = {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
schema_builder.add_text_field("text", TEXT);
|
||||
schema_builder.build()
|
||||
};
|
||||
let index_metas = IndexMeta {
|
||||
segments: Vec::new(),
|
||||
schema: schema,
|
||||
opstamp: 0u64,
|
||||
payload: None,
|
||||
};
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(json, r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,10 +4,10 @@ use postings::{BlockSegmentPostings, SegmentPostings};
|
||||
use postings::TermInfo;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use std::cmp;
|
||||
use fastfield::DeleteBitSet;
|
||||
use schema::Schema;
|
||||
use compression::CompressedIntStream;
|
||||
use postings::FreqReadingOption;
|
||||
use schema::FieldType;
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
@@ -27,44 +27,29 @@ pub struct InvertedIndexReader {
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
delete_bitset: DeleteBitSet,
|
||||
record_option: IndexRecordOption,
|
||||
schema: Schema,
|
||||
}
|
||||
|
||||
impl InvertedIndexReader {
|
||||
pub(crate) fn new(
|
||||
termdict: TermDictionaryImpl,
|
||||
termdict_source: ReadOnlySource,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
delete_bitset: DeleteBitSet,
|
||||
record_option: IndexRecordOption,
|
||||
schema: Schema,
|
||||
) -> InvertedIndexReader {
|
||||
InvertedIndexReader {
|
||||
termdict,
|
||||
termdict: TermDictionaryImpl::from_source(termdict_source),
|
||||
postings_source,
|
||||
positions_source,
|
||||
delete_bitset,
|
||||
record_option,
|
||||
schema,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates an empty `InvertedIndexReader` object, which
|
||||
/// contains no terms at all.
|
||||
pub fn empty(field_type: FieldType) -> InvertedIndexReader {
|
||||
let record_option = field_type
|
||||
.get_index_record_option()
|
||||
.unwrap_or(IndexRecordOption::Basic);
|
||||
InvertedIndexReader::new(
|
||||
TermDictionaryImpl::empty(field_type),
|
||||
ReadOnlySource::empty(),
|
||||
ReadOnlySource::empty(),
|
||||
DeleteBitSet::empty(),
|
||||
record_option,
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns the term info associated with the term.
|
||||
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
|
||||
self.termdict.get(term.value_bytes())
|
||||
self.termdict.get(term.as_slice())
|
||||
}
|
||||
|
||||
/// Return the term dictionary datastructure.
|
||||
@@ -101,19 +86,15 @@ impl InvertedIndexReader {
|
||||
pub fn read_block_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
requested_option: IndexRecordOption,
|
||||
option: IndexRecordOption,
|
||||
) -> BlockSegmentPostings {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data = self.postings_source.slice_from(offset);
|
||||
let freq_reading_option = match (self.record_option, requested_option) {
|
||||
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
|
||||
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
|
||||
(_, _) => FreqReadingOption::ReadFreq,
|
||||
};
|
||||
let has_freq = option.has_freq();
|
||||
BlockSegmentPostings::from_data(
|
||||
term_info.doc_freq as usize,
|
||||
SourceRead::from(postings_data),
|
||||
freq_reading_option,
|
||||
has_freq,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -153,14 +134,19 @@ impl InvertedIndexReader {
|
||||
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
|
||||
/// with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
|
||||
let field = term.field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
let maximum_option = get!(field_entry.field_type().get_index_record_option());
|
||||
let best_effort_option = cmp::min(maximum_option, option);
|
||||
Some(self.read_postings_from_terminfo(&term_info, best_effort_option))
|
||||
}
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
self.get_term_info(term)
|
||||
.map(|term_info| term_info.doc_freq)
|
||||
.unwrap_or(0u32)
|
||||
match self.get_term_info(term) {
|
||||
Some(term_info) => term_info.doc_freq,
|
||||
None => 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,7 +37,7 @@ impl Searcher {
self.segment_readers
.iter()
.map(|segment_reader| segment_reader.num_docs())
.sum::<u32>()
.fold(0u32, |acc, val| acc + val)
}

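// The two variants in the hunk above are equivalent (illustration only):
// `Iterator::sum::<u32>()` is the idiomatic shorthand for folding the
// per-segment counts with `fold(0u32, |acc, val| acc + val)`.
//
//     let per_segment = [2u32, 5u32, 1u32];
//     assert_eq!(per_segment.iter().cloned().sum::<u32>(), 8u32);
//     assert_eq!(per_segment.iter().cloned().fold(0u32, |acc, val| acc + val), 8u32);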
/// Return the overall number of documents containing
|
||||
@@ -46,7 +46,7 @@ impl Searcher {
|
||||
self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.inverted_index(term.field()).doc_freq(term))
|
||||
.sum::<u32>()
|
||||
.fold(0u32, |acc, val| acc + val)
|
||||
}
|
||||
|
||||
/// Return the list of segment readers
|
||||
|
||||
@@ -8,6 +8,7 @@ use core::SegmentMeta;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::DeleteBitSet;
|
||||
use store::StoreReader;
|
||||
use directory::ReadOnlySource;
|
||||
use schema::Document;
|
||||
use DocId;
|
||||
use std::sync::Arc;
|
||||
@@ -16,15 +17,8 @@ use common::CompositeFile;
|
||||
use std::fmt;
|
||||
use core::InvertedIndexReader;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use error::ErrorKind;
|
||||
use termdict::TermDictionaryImpl;
|
||||
use fastfield::FacetReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::{FastFieldReader, U64FastFieldReader};
|
||||
use schema::Schema;
|
||||
use termdict::TermDictionary;
|
||||
use fastfield::{FastValue, MultiValueIntFastFieldReader};
|
||||
use schema::Cardinality;
|
||||
|
||||
/// Entry point to access all of the datastructures of the `Segment`
|
||||
///
|
||||
@@ -37,8 +31,6 @@ use schema::Cardinality;
|
||||
/// The segment reader has a very low memory footprint,
|
||||
/// as close to all of the memory data is mmapped.
|
||||
///
|
||||
///
|
||||
/// TODO fix not decoding docfreq
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentReader {
|
||||
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
|
||||
@@ -91,76 +83,21 @@ impl SegmentReader {
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn fast_field_reader<Item: FastValue>(
|
||||
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
) -> fastfield::Result<TFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
|
||||
{
|
||||
if !TFastFieldReader::is_enabled(field_entry.field_type()) {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
} else {
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
.map(TFastFieldReader::open)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
idx: usize
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
|
||||
Ok(FastFieldReader::open(ff_source))
|
||||
} else {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
|
||||
/// May panic if the field is not a multivalued fastfield of the type `Item`.
|
||||
pub fn multi_fast_field_reader<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
|
||||
{
|
||||
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
|
||||
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
|
||||
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the `FacetReader` associated to a given `Field`.
|
||||
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if field_entry.field_type() != &FieldType::HierarchicalFacet {
|
||||
return Err(ErrorKind::InvalidArgument(format!(
|
||||
"The field {:?} is not a \
|
||||
hierarchical facet.",
|
||||
field_entry
|
||||
)).into());
|
||||
}
|
||||
let term_ords_reader = self.multi_fast_field_reader(field)?;
|
||||
let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
|
||||
ErrorKind::InvalidArgument(format!(
|
||||
"The field \"{}\" is a hierarchical \
|
||||
but this segment does not seem to have the field term \
|
||||
dictionary.",
|
||||
field_entry.name()
|
||||
))
|
||||
})?;
|
||||
let termdict = TermDictionaryImpl::from_source(termdict_source);
|
||||
let facet_reader = FacetReader::new(term_ords_reader, termdict);
|
||||
Ok(facet_reader)
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `Field norms`'s reader.
|
||||
///
|
||||
/// Field norms are the length (in tokens) of the fields.
|
||||
@@ -169,10 +106,10 @@ impl SegmentReader {
|
||||
///
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<FastFieldReader<u64>> {
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U64FastFieldReader> {
|
||||
self.fieldnorms_composite
|
||||
.open_read(field)
|
||||
.map(FastFieldReader::open)
|
||||
.map(U64FastFieldReader::open)
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `StoreReader`.
|
||||
@@ -229,8 +166,6 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
/// Returns a field reader associated to the field given in argument.
|
||||
/// If the field was not present in the index during indexing time,
|
||||
/// the InvertedIndexReader is empty.
|
||||
///
|
||||
/// The field reader is in charge of iterating through the
|
||||
/// term dictionary associated to a specific field,
|
||||
@@ -241,44 +176,27 @@ impl SegmentReader {
|
||||
.expect("Lock poisoned. This should never happen")
|
||||
.get(&field)
|
||||
{
|
||||
return Arc::clone(inv_idx_reader);
|
||||
}
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let record_option_opt = field_type.get_index_record_option();
|
||||
|
||||
if record_option_opt.is_none() {
|
||||
panic!("Field {:?} does not seem indexed.", field_entry.name());
|
||||
Arc::clone(inv_idx_reader);
|
||||
}
|
||||
|
||||
let record_option = record_option_opt.unwrap();
|
||||
|
||||
let postings_source_opt = self.postings_composite.open_read(field);
|
||||
|
||||
if postings_source_opt.is_none() {
|
||||
// no documents in the segment contained this field.
|
||||
// As a result, no data is associated to the inverted index.
|
||||
//
|
||||
// Returns an empty inverted index.
|
||||
return Arc::new(InvertedIndexReader::empty(field_type.clone()));
|
||||
}
|
||||
|
||||
let postings_source = postings_source_opt.unwrap();
|
||||
|
||||
let termdict_source = self.termdict_composite
|
||||
let termdict_source: ReadOnlySource = self.termdict_composite
|
||||
.open_read(field)
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
|
||||
.expect("Index corrupted. Failed to open field term dictionary in composite file.");
|
||||
|
||||
let postings_source = self.postings_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field postings in composite file.");
|
||||
|
||||
let positions_source = self.positions_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
|
||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||
TermDictionaryImpl::from_source(termdict_source),
|
||||
termdict_source,
|
||||
postings_source,
|
||||
positions_source,
|
||||
self.delete_bitset.clone(),
|
||||
record_option,
|
||||
self.schema.clone(),
|
||||
));
|
||||
|
||||
// by releasing the lock in between, we may end up opening the inverting index
|
||||
|
||||
@@ -9,12 +9,12 @@ pub use self::skiplist::SkipList;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::{SkipList, SkipListBuilder};
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_skiplist() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(8);
|
||||
let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(10);
|
||||
skip_list_builder.insert(2, &3).unwrap();
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
let mut skip_list: SkipList<u32> = SkipList::from(output.as_slice());
|
||||
@@ -24,7 +24,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_skiplist2() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(8);
|
||||
let skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(10);
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
let mut skip_list: SkipList<u32> = SkipList::from(output.as_slice());
|
||||
assert_eq!(skip_list.next(), None);
|
||||
@@ -71,7 +71,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_skiplist5() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3);
|
||||
skip_list_builder.insert(2, &()).unwrap();
|
||||
skip_list_builder.insert(3, &()).unwrap();
|
||||
skip_list_builder.insert(5, &()).unwrap();
|
||||
@@ -103,7 +103,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_skiplist7() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3);
|
||||
for i in 0..1000 {
|
||||
skip_list_builder.insert(i, &()).unwrap();
|
||||
}
|
||||
@@ -121,48 +121,35 @@ mod tests {
|
||||
#[test]
|
||||
fn test_skiplist8() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<u64> = SkipListBuilder::new(8);
|
||||
let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(10);
|
||||
skip_list_builder.insert(2, &3).unwrap();
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
assert_eq!(output.len(), 11);
|
||||
assert_eq!(output.len(), 13);
|
||||
assert_eq!(output[0], 1u8 + 128u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skiplist9() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<u64> = SkipListBuilder::new(4);
|
||||
for i in 0..4 * 4 * 4 {
|
||||
let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(3);
|
||||
for i in 0..9 {
|
||||
skip_list_builder.insert(i, &i).unwrap();
|
||||
}
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
assert_eq!(output.len(), 774);
|
||||
assert_eq!(output[0], 4u8 + 128u8);
|
||||
assert_eq!(output.len(), 117);
|
||||
assert_eq!(output[0], 3u8 + 128u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skiplist10() {
|
||||
// checking that void gets serialized to nothing.
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
|
||||
for i in 0..((4 * 4 * 4) - 1) {
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3);
|
||||
for i in 0..9 {
|
||||
skip_list_builder.insert(i, &()).unwrap();
|
||||
}
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
assert_eq!(output.len(), 230);
|
||||
assert_eq!(output[0], 128u8 + 3u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skiplist11() {
|
||||
// checking that void gets serialized to nothing.
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
|
||||
for i in 0..(4 * 4) {
|
||||
skip_list_builder.insert(i, &()).unwrap();
|
||||
}
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
assert_eq!(output.len(), 65);
|
||||
assert_eq!(output.len(), 81);
|
||||
assert_eq!(output[0], 128u8 + 3u8);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use common::{BinarySerializable, VInt};
|
||||
use common::BinarySerializable;
|
||||
use std::marker::PhantomData;
|
||||
use DocId;
|
||||
use std::cmp::max;
|
||||
|
||||
static EMPTY: [u8; 0] = [];
|
||||
@@ -7,20 +8,21 @@ static EMPTY: [u8; 0] = [];
|
||||
struct Layer<'a, T> {
|
||||
data: &'a [u8],
|
||||
cursor: &'a [u8],
|
||||
next_id: Option<u64>,
|
||||
next_id: DocId,
|
||||
_phantom_: PhantomData<T>,
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
|
||||
type Item = (u64, T);
|
||||
type Item = (DocId, T);
|
||||
|
||||
fn next(&mut self) -> Option<(u64, T)> {
|
||||
if let Some(cur_id) = self.next_id {
|
||||
let cur_val = T::deserialize(&mut self.cursor).unwrap();
|
||||
self.next_id = VInt::deserialize_u64(&mut self.cursor).ok();
|
||||
Some((cur_id, cur_val))
|
||||
} else {
|
||||
fn next(&mut self) -> Option<(DocId, T)> {
|
||||
if self.next_id == u32::max_value() {
|
||||
None
|
||||
} else {
|
||||
let cur_val = T::deserialize(&mut self.cursor).unwrap();
|
||||
let cur_id = self.next_id;
|
||||
self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
|
||||
Some((cur_id, cur_val))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -28,7 +30,7 @@ impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
|
||||
impl<'a, T: BinarySerializable> From<&'a [u8]> for Layer<'a, T> {
|
||||
fn from(data: &'a [u8]) -> Layer<'a, T> {
|
||||
let mut cursor = data;
|
||||
let next_id = VInt::deserialize_u64(&mut cursor).ok();
|
||||
let next_id = u32::deserialize(&mut cursor).unwrap_or(u32::max_value());
|
||||
Layer {
|
||||
data,
|
||||
cursor,
|
||||
@@ -43,14 +45,14 @@ impl<'a, T: BinarySerializable> Layer<'a, T> {
|
||||
Layer {
|
||||
data: &EMPTY,
|
||||
cursor: &EMPTY,
|
||||
next_id: None,
|
||||
next_id: DocId::max_value(),
|
||||
_phantom_: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
fn seek_offset(&mut self, offset: usize) {
|
||||
self.cursor = &self.data[offset..];
|
||||
self.next_id = VInt::deserialize_u64(&mut self.cursor).ok();
|
||||
self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
|
||||
}
|
||||
|
||||
// Returns the last element (key, val)
|
||||
@@ -58,61 +60,54 @@ impl<'a, T: BinarySerializable> Layer<'a, T> {
|
||||
//
|
||||
// If there is no such element anymore,
|
||||
// returns None.
|
||||
//
|
||||
// If the element exists, it will be returned
|
||||
// at the next call to `.next()`.
|
||||
fn seek(&mut self, key: u64) -> Option<(u64, T)> {
|
||||
let mut result: Option<(u64, T)> = None;
|
||||
loop {
|
||||
if let Some(next_id) = self.next_id {
|
||||
if next_id < key {
|
||||
if let Some(v) = self.next() {
|
||||
result = Some(v);
|
||||
continue;
|
||||
}
|
||||
fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> {
|
||||
let mut val = None;
|
||||
while self.next_id < doc_id {
|
||||
match self.next() {
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
v => {
|
||||
val = v;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
val
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SkipList<'a, T: BinarySerializable> {
|
||||
data_layer: Layer<'a, T>,
|
||||
skip_layers: Vec<Layer<'a, u64>>,
|
||||
skip_layers: Vec<Layer<'a, u32>>,
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> Iterator for SkipList<'a, T> {
|
||||
type Item = (u64, T);
|
||||
type Item = (DocId, T);
|
||||
|
||||
fn next(&mut self) -> Option<(u64, T)> {
|
||||
fn next(&mut self) -> Option<(DocId, T)> {
|
||||
self.data_layer.next()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> SkipList<'a, T> {
|
||||
pub fn seek(&mut self, key: u64) -> Option<(u64, T)> {
|
||||
let mut next_layer_skip: Option<(u64, u64)> = None;
|
||||
pub fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> {
|
||||
let mut next_layer_skip: Option<(DocId, u32)> = None;
|
||||
for skip_layer in &mut self.skip_layers {
|
||||
if let Some((_, offset)) = next_layer_skip {
|
||||
skip_layer.seek_offset(offset as usize);
|
||||
}
|
||||
next_layer_skip = skip_layer.seek(key);
|
||||
next_layer_skip = skip_layer.seek(doc_id);
|
||||
}
|
||||
if let Some((_, offset)) = next_layer_skip {
|
||||
self.data_layer.seek_offset(offset as usize);
|
||||
}
|
||||
self.data_layer.seek(key)
|
||||
self.data_layer.seek(doc_id)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> From<&'a [u8]> for SkipList<'a, T> {
|
||||
fn from(mut data: &'a [u8]) -> SkipList<'a, T> {
|
||||
let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|el| el.0)
|
||||
.collect();
|
||||
let offsets: Vec<u32> = Vec::deserialize(&mut data).unwrap();
|
||||
let num_layers = offsets.len();
|
||||
let layers_data: &[u8] = data;
|
||||
let data_layer: Layer<'a, T> = if num_layers == 0 {
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
use std::io::Write;
|
||||
use common::{BinarySerializable, VInt, is_power_of_2};
|
||||
use common::BinarySerializable;
|
||||
use std::marker::PhantomData;
|
||||
use DocId;
|
||||
use std::io;
|
||||
|
||||
struct LayerBuilder<T: BinarySerializable> {
|
||||
period_mask: usize,
|
||||
period: usize,
|
||||
buffer: Vec<u8>,
|
||||
remaining: usize,
|
||||
len: usize,
|
||||
_phantom_: PhantomData<T>,
|
||||
}
|
||||
@@ -21,33 +23,34 @@ impl<T: BinarySerializable> LayerBuilder<T> {
|
||||
}
|
||||
|
||||
fn with_period(period: usize) -> LayerBuilder<T> {
|
||||
assert!(is_power_of_2(period), "The period has to be a power of 2.");
|
||||
LayerBuilder {
|
||||
period_mask: (period - 1),
|
||||
period,
|
||||
buffer: Vec::new(),
|
||||
remaining: period,
|
||||
len: 0,
|
||||
_phantom_: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
fn insert(&mut self, key: u64, value: &T) -> io::Result<Option<(u64, u64)>> {
|
||||
fn insert(&mut self, doc_id: DocId, value: &T) -> io::Result<Option<(DocId, u32)>> {
|
||||
self.remaining -= 1;
|
||||
self.len += 1;
|
||||
let offset = self.written_size() as u64;
|
||||
VInt(key).serialize(&mut self.buffer)?;
|
||||
let offset = self.written_size() as u32;
|
||||
doc_id.serialize(&mut self.buffer)?;
|
||||
value.serialize(&mut self.buffer)?;
|
||||
let emit_skip_info = (self.period_mask & self.len) == 0;
|
||||
if emit_skip_info {
|
||||
Ok(Some((key, offset)))
|
||||
Ok(if self.remaining == 0 {
|
||||
self.remaining = self.period;
|
||||
Some((doc_id, offset))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
None
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SkipListBuilder<T: BinarySerializable> {
|
||||
period: usize,
|
||||
data_layer: LayerBuilder<T>,
|
||||
skip_layers: Vec<LayerBuilder<u64>>,
|
||||
skip_layers: Vec<LayerBuilder<u32>>,
|
||||
}
|
||||
|
||||
impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
@@ -59,7 +62,7 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_skip_layer(&mut self, layer_id: usize) -> &mut LayerBuilder<u64> {
|
||||
fn get_skip_layer(&mut self, layer_id: usize) -> &mut LayerBuilder<u32> {
|
||||
if layer_id == self.skip_layers.len() {
|
||||
let layer_builder = LayerBuilder::with_period(self.period);
|
||||
self.skip_layers.push(layer_builder);
|
||||
@@ -67,9 +70,9 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
&mut self.skip_layers[layer_id]
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, key: u64, dest: &T) -> io::Result<()> {
|
||||
pub fn insert(&mut self, doc_id: DocId, dest: &T) -> io::Result<()> {
|
||||
let mut layer_id = 0;
|
||||
let mut skip_pointer = self.data_layer.insert(key, dest)?;
|
||||
let mut skip_pointer = self.data_layer.insert(doc_id, dest)?;
|
||||
loop {
|
||||
skip_pointer = match skip_pointer {
|
||||
Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id)
|
||||
@@ -83,11 +86,13 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
}
|
||||
|
||||
pub fn write<W: Write>(self, output: &mut W) -> io::Result<()> {
|
||||
let mut size: u64 = self.data_layer.buffer.len() as u64;
|
||||
let mut layer_sizes = vec![VInt(size)];
|
||||
let mut size: u32 = 0;
|
||||
let mut layer_sizes: Vec<u32> = Vec::new();
|
||||
size += self.data_layer.buffer.len() as u32;
|
||||
layer_sizes.push(size);
|
||||
for layer in self.skip_layers.iter().rev() {
|
||||
size += layer.buffer.len() as u64;
|
||||
layer_sizes.push(VInt(size));
|
||||
size += layer.buffer.len() as u32;
|
||||
layer_sizes.push(size);
|
||||
}
|
||||
layer_sizes.serialize(output)?;
|
||||
self.data_layer.write(output)?;
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
use postings::UnorderedTermId;
|
||||
use super::heap::{BytesRef, Heap, HeapAllocable};
|
||||
|
||||
mod murmurhash2 {
|
||||
@@ -59,8 +58,10 @@ mod murmurhash2 {
///
/// Returns (the heap size in bytes, the hash table size in number of bits)
pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
let table_size_limit: usize = per_thread_memory_budget / 3;
let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
let table_size_limit: usize = per_thread_memory_budget / 5;
let compute_table_size = |num_bits: usize| {
(1 << num_bits) * mem::size_of::<KeyValue>()
};
let table_num_bits: usize = (1..)
.into_iter()
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
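// A worked example for `split_memory` on the `/ 3` side of this hunk
// (illustration only; it assumes `size_of::<KeyValue>() == 8`, which matches
// the test value `split_memory(100_000) == (67232, 12)` further down):
// the table limit is 100_000 / 3 ≈ 33_333 bytes, the largest power-of-two
// table under that limit holds 2^12 entries * 8 bytes = 32_768 bytes, and the
// remaining 100_000 - 32_768 = 67_232 bytes go to the heap.
//
//     assert_eq!(split_memory(100_000), (67_232, 12));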
@@ -102,7 +103,7 @@ impl KeyValue {
|
||||
/// the computation of the hash of the key twice,
|
||||
/// or copying the key as long as there is no insert.
|
||||
///
|
||||
pub struct TermHashMap<'a> {
|
||||
pub struct HashMap<'a> {
|
||||
table: Box<[KeyValue]>,
|
||||
heap: &'a Heap,
|
||||
mask: usize,
|
||||
@@ -117,11 +118,7 @@ struct QuadraticProbing {
|
||||
|
||||
impl QuadraticProbing {
|
||||
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
|
||||
QuadraticProbing {
|
||||
hash,
|
||||
i: 0,
|
||||
mask,
|
||||
}
|
||||
QuadraticProbing { hash, i: 0, mask }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -131,11 +128,11 @@ impl QuadraticProbing {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TermHashMap<'a> {
|
||||
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
|
||||
impl<'a> HashMap<'a> {
|
||||
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a> {
|
||||
let table_size = 1 << num_bucket_power_of_2;
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
|
||||
TermHashMap {
|
||||
HashMap {
|
||||
table: table.into_boxed_slice(),
|
||||
heap,
|
||||
mask: table_size - 1,
|
||||
@@ -158,25 +155,22 @@ impl<'a> TermHashMap<'a> {
|
||||
(key_bytes, expull_addr)
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
|
||||
pub fn set_bucket(&mut self, hash: u32, key_bytes_ref: BytesRef, bucket: usize) {
|
||||
self.occupied.push(bucket);
|
||||
self.table[bucket] = KeyValue {
|
||||
key_value_addr, hash
|
||||
key_value_addr: key_bytes_ref,
|
||||
hash,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32, UnorderedTermId)> + 'b {
|
||||
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32)> + 'b {
|
||||
self.occupied.iter().cloned().map(move |bucket: usize| {
|
||||
let kv = self.table[bucket];
|
||||
let (key, offset) = self.get_key_value(kv.key_value_addr);
|
||||
(key, offset, bucket as UnorderedTermId)
|
||||
self.get_key_value(kv.key_value_addr)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
|
||||
&mut self,
|
||||
key: S,
|
||||
) -> (UnorderedTermId, &mut V) {
|
||||
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(&mut self, key: S) -> &mut V {
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let hash = murmurhash2::murmurhash2(key.as_ref());
|
||||
let mut probe = self.probe(hash);
|
||||
@@ -188,14 +182,11 @@ impl<'a> TermHashMap<'a> {
|
||||
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
|
||||
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
|
||||
self.set_bucket(hash, key_bytes_ref, bucket);
|
||||
return (bucket as UnorderedTermId, val);
|
||||
return val;
|
||||
} else if kv.hash == hash {
|
||||
let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
|
||||
if stored_key == key_bytes {
|
||||
return (
|
||||
bucket as UnorderedTermId,
|
||||
self.heap.get_mut_ref(expull_addr),
|
||||
);
|
||||
return self.heap.get_mut_ref(expull_addr);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -228,41 +219,41 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(split_memory(100_000), (67232, 12));
|
||||
assert_eq!(split_memory(1_000_000), (737856, 15));
|
||||
assert_eq!(split_memory(10_000_000), (7902848, 18));
|
||||
assert_eq!(split_memory(100_000), (67232, 9));
|
||||
assert_eq!(split_memory(1_000_000), (737856, 12));
|
||||
assert_eq!(split_memory(10_000_000), (7902848, 15));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hash_map() {
|
||||
let heap = Heap::with_capacity(2_000_000);
|
||||
let mut hash_map: TermHashMap = TermHashMap::new(18, &heap);
|
||||
let mut hash_map: HashMap = HashMap::new(18, &heap);
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc").1;
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc");
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 3u32;
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd");
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 4u32;
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc").1;
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc");
|
||||
assert_eq!(v.val, 3u32);
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd");
|
||||
assert_eq!(v.val, 4u32);
|
||||
}
|
||||
let mut iter_values = hash_map.iter();
|
||||
{
|
||||
let (_, addr, _) = iter_values.next().unwrap();
|
||||
let (_, addr) = iter_values.next().unwrap();
|
||||
let val: &TestValue = heap.get_ref(addr);
|
||||
assert_eq!(val.val, 3u32);
|
||||
}
|
||||
{
|
||||
let (_, addr, _) = iter_values.next().unwrap();
|
||||
let (_, addr) = iter_values.next().unwrap();
|
||||
let val: &TestValue = heap.get_ref(addr);
|
||||
assert_eq!(val.val, 4u32);
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ mod expull;
|
||||
|
||||
pub use self::heap::{Heap, HeapAllocable};
|
||||
pub use self::expull::ExpUnrolledLinkedList;
|
||||
pub use self::hashmap::TermHashMap;
|
||||
pub use self::hashmap::HashMap;
|
||||
|
||||
#[test]
|
||||
fn test_unrolled_linked_list() {
|
||||
@@ -16,15 +16,15 @@ fn test_unrolled_linked_list() {
|
||||
ks.push(2);
|
||||
ks.push(3);
|
||||
for k in (1..5).map(|k| k * 100) {
|
||||
let mut hashmap: TermHashMap = TermHashMap::new(10, &heap);
|
||||
let mut hashmap: HashMap = HashMap::new(10, &heap);
|
||||
for j in 0..k {
|
||||
for i in 0..500 {
|
||||
let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string()).1;
|
||||
let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string());
|
||||
v.push(i * j, &heap);
|
||||
}
|
||||
}
|
||||
let mut map_addr: collections::HashMap<Vec<u8>, u32> = collections::HashMap::new();
|
||||
for (key, addr, _) in hashmap.iter() {
|
||||
for (key, addr) in hashmap.iter() {
|
||||
map_addr.insert(Vec::from(key), addr);
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ use directory::ReadOnlySource;
|
||||
use directory::shared_vec_slice::SharedVecSlice;
|
||||
use directory::WritePtr;
|
||||
use fst::raw::MmapReadOnly;
|
||||
use memmap::{Mmap, Protection};
|
||||
use std::collections::hash_map::Entry as HashMapEntry;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::From;
|
||||
@@ -14,17 +15,16 @@ use std::fs::{self, File};
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::{self, Seek, SeekFrom};
|
||||
use std::io::{BufWriter, Read, Write};
|
||||
use std::mem;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::Weak;
|
||||
use tempdir::TempDir;
|
||||
|
||||
/// Returns None iff the file exists, can be read, but is empty (and hence
|
||||
/// cannot be mmapped).
|
||||
///
|
||||
fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadError> {
|
||||
let file = File::open(full_path).map_err(|e| {
|
||||
fn open_mmap(full_path: &Path) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
|
||||
let file = File::open(&full_path).map_err(|e| {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
OpenReadError::FileDoesNotExist(full_path.to_owned())
|
||||
} else {
|
||||
@@ -36,13 +36,14 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
|
||||
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
|
||||
if meta_data.len() == 0 {
|
||||
// if the file size is 0, it will not be possible
|
||||
// to mmap the file, so we return None
|
||||
// to mmap the file, so we return an anonymous mmap_cache
|
||||
// instead.
|
||||
return Ok(None);
|
||||
}
|
||||
MmapReadOnly::open(&file)
|
||||
.map(Some)
|
||||
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
|
||||
match Mmap::open(&file, Protection::Read) {
|
||||
Ok(mmap) => Ok(Some(Arc::new(mmap))),
|
||||
Err(e) => Err(IOError::with_path(full_path.to_owned(), e))?,
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
|
||||
@@ -51,7 +52,10 @@ pub struct CacheCounters {
|
||||
pub hit: usize,
|
||||
// Number of time tantivy had to call `mmap`
|
||||
// as no entry was in the cache.
|
||||
pub miss: usize,
|
||||
pub miss_empty: usize,
|
||||
// Number of time tantivy had to call `mmap`
|
||||
// as the entry in the cache was evinced.
|
||||
pub miss_weak: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
@@ -62,25 +66,38 @@ pub struct CacheInfo {
|
||||
|
||||
struct MmapCache {
|
||||
counters: CacheCounters,
|
||||
cache: HashMap<PathBuf, MmapReadOnly>,
|
||||
cache: HashMap<PathBuf, Weak<Mmap>>,
|
||||
purge_weak_limit: usize,
|
||||
}
|
||||
|
||||
const STARTING_PURGE_WEAK_LIMIT: usize = 1_000;
|
||||
|
||||
impl Default for MmapCache {
|
||||
fn default() -> MmapCache {
|
||||
MmapCache {
|
||||
counters: CacheCounters::default(),
|
||||
cache: HashMap::new(),
|
||||
purge_weak_limit: STARTING_PURGE_WEAK_LIMIT,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MmapCache {
|
||||
/// Removes a `MmapReadOnly` entry from the mmap cache.
|
||||
fn discard_from_cache(&mut self, full_path: &Path) -> bool {
|
||||
self.cache.remove(full_path).is_some()
|
||||
fn cleanup(&mut self) {
|
||||
let previous_cache_size = self.cache.len();
|
||||
let mut new_cache = HashMap::new();
|
||||
mem::swap(&mut new_cache, &mut self.cache);
|
||||
self.cache = new_cache
|
||||
.into_iter()
|
||||
.filter(|&(_, ref weak_ref)| weak_ref.upgrade().is_some())
|
||||
.collect();
|
||||
if self.cache.len() == previous_cache_size {
|
||||
self.purge_weak_limit *= 2;
|
||||
}
|
||||
}
|
||||
|
||||
fn get_info(&mut self) -> CacheInfo {
|
||||
self.cleanup();
|
||||
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
|
||||
CacheInfo {
|
||||
counters: self.counters.clone(),
|
||||
@@ -88,18 +105,33 @@ impl MmapCache {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<MmapReadOnly>, OpenReadError> {
|
||||
Ok(match self.cache.entry(full_path.to_owned()) {
|
||||
HashMapEntry::Occupied(occupied_entry) => {
|
||||
let mmap = occupied_entry.get();
|
||||
self.counters.hit += 1;
|
||||
Some(mmap.clone())
|
||||
fn get_mmap(&mut self, full_path: &PathBuf) -> Result<Option<Arc<Mmap>>, OpenReadError> {
|
||||
// if we exceed this limit, then we go through the weak
|
||||
// and remove those that are obsolete.
|
||||
if self.cache.len() > self.purge_weak_limit {
|
||||
self.cleanup();
|
||||
}
|
||||
Ok(match self.cache.entry(full_path.clone()) {
|
||||
HashMapEntry::Occupied(mut occupied_entry) => {
|
||||
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
|
||||
self.counters.hit += 1;
|
||||
Some(Arc::clone(&mmap_arc))
|
||||
} else {
|
||||
// The entry exists but the weak ref has been destroyed.
|
||||
self.counters.miss_weak += 1;
|
||||
if let Some(mmap_arc) = open_mmap(full_path)? {
|
||||
occupied_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
HashMapEntry::Vacant(vacant_entry) => {
|
||||
self.counters.miss += 1;
|
||||
if let Some(mmap) = open_mmap(full_path)? {
|
||||
vacant_entry.insert(mmap.clone());
|
||||
Some(mmap)
|
||||
self.counters.miss_empty += 1;
|
||||
if let Some(mmap_arc) = open_mmap(full_path)? {
|
||||
vacant_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -196,7 +228,6 @@ impl MmapDirectory {
|
||||
fd.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns some statistical information
|
||||
/// about the Mmap cache.
|
||||
///
|
||||
@@ -253,6 +284,7 @@ impl Directory for MmapDirectory {
|
||||
|
||||
Ok(mmap_cache
|
||||
.get_mmap(&full_path)?
|
||||
.map(MmapReadOnly::from)
|
||||
.map(ReadOnlySource::Mmap)
|
||||
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
|
||||
}
|
||||
@@ -287,8 +319,6 @@ impl Directory for MmapDirectory {
|
||||
Ok(BufWriter::new(Box::new(writer)))
|
||||
}
|
||||
|
||||
/// Any entry associated to the path in the mmap will be
|
||||
/// removed before the file is deleted.
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
debug!("Deleting file {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
@@ -300,8 +330,6 @@ impl Directory for MmapDirectory {
|
||||
);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
mmap_cache.discard_from_cache(path);
|
||||
|
||||
// Removing the entry in the MMap cache.
|
||||
// The munmap will appear on Drop,
|
||||
// when the last reference is gone.
|
||||
@@ -387,8 +415,7 @@ mod tests {
|
||||
// here we test if the cache releases
|
||||
// mmaps correctly.
|
||||
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
|
||||
let num_paths = 10;
|
||||
let paths: Vec<PathBuf> = (0..num_paths)
|
||||
let paths: Vec<PathBuf> = (0..10)
|
||||
.map(|i| PathBuf::from(&*format!("file_{}", i)))
|
||||
.collect();
|
||||
{
|
||||
@@ -399,24 +426,49 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
for (i, path) in paths.iter().enumerate() {
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
|
||||
}
|
||||
for path in paths.iter() {
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
|
||||
}
|
||||
for (i, path) in paths.iter().enumerate() {
|
||||
mmap_directory.delete(path).unwrap();
|
||||
assert_eq!(
|
||||
mmap_directory.get_cache_info().mmapped.len(),
|
||||
num_paths - i - 1
|
||||
);
|
||||
for path in &paths {
|
||||
{
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 1);
|
||||
}
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
||||
}
|
||||
}
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 10);
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.miss_empty, 10);
|
||||
|
||||
{
|
||||
// test weak miss
|
||||
// the first pass create the weak refs.
|
||||
for path in &paths {
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
}
|
||||
// ... the second hits the weak refs.
|
||||
for path in &paths {
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
}
|
||||
let cache_info = mmap_directory.get_cache_info();
|
||||
assert_eq!(cache_info.counters.miss_empty, 20);
|
||||
assert_eq!(cache_info.counters.miss_weak, 10);
|
||||
}
|
||||
|
||||
{
|
||||
let mut saved_readmmaps = vec![];
|
||||
// Keeps reference alive
|
||||
for (i, path) in paths.iter().enumerate() {
|
||||
let r = mmap_directory.open_read(path).unwrap();
|
||||
saved_readmmaps.push(r);
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
|
||||
}
|
||||
let cache_info = mmap_directory.get_cache_info();
|
||||
assert_eq!(cache_info.counters.miss_empty, 30);
|
||||
assert_eq!(cache_info.counters.miss_weak, 10);
|
||||
assert_eq!(cache_info.mmapped.len(), 10);
|
||||
|
||||
for saved_readmmap in saved_readmmaps {
|
||||
assert_eq!(saved_readmmap.as_slice(), content);
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
||||
}
|
||||
|
||||
|
||||
@@ -116,6 +116,9 @@ mod tests {
|
||||
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||
let _w = directory.open_write(*TEST_PATH).unwrap();
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
if let Err(e) = directory.open_read(*TEST_PATH) {
|
||||
println!("{:?}", e);
|
||||
}
|
||||
assert!(directory.open_read(*TEST_PATH).is_ok());
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use super::shared_vec_slice::SharedVecSlice;
|
||||
use common::HasLen;
|
||||
use std::slice;
|
||||
use std::io::{self, Read};
|
||||
use stable_deref_trait::{CloneStableDeref, StableDeref};
|
||||
use stable_deref_trait::StableDeref;
|
||||
|
||||
/// Read object that represents files in tantivy.
|
||||
///
|
||||
@@ -20,7 +20,6 @@ pub enum ReadOnlySource {
|
||||
}
|
||||
|
||||
unsafe impl StableDeref for ReadOnlySource {}
|
||||
unsafe impl CloneStableDeref for ReadOnlySource {}
|
||||
|
||||
impl Deref for ReadOnlySource {
|
||||
type Target = [u8];
|
||||
|
||||
@@ -1,68 +0,0 @@
|
||||
use super::MultiValueIntFastFieldReader;
|
||||
use DocId;
|
||||
use termdict::TermOrdinal;
|
||||
use schema::Facet;
|
||||
use termdict::{TermDictionary, TermDictionaryImpl};
|
||||
|
||||
/// The facet reader makes it possible to access the list of
|
||||
/// facets associated to a given document in a specific
|
||||
/// segment.
|
||||
///
|
||||
/// Rather than manipulating `Facet` object directly, the API
|
||||
/// exposes those in the form of list of `Facet` ordinal.
|
||||
///
|
||||
/// A segment ordinal can then be translated into a facet via
|
||||
/// `.facet_from_ord(...)`.
|
||||
///
|
||||
/// Facet ordinals are defined as their position in the sorted
|
||||
/// list of facets. This ordinal is segment local and
|
||||
/// only makes sense for a given segment.
|
||||
pub struct FacetReader {
|
||||
term_ords: MultiValueIntFastFieldReader<u64>,
|
||||
term_dict: TermDictionaryImpl,
|
||||
}
|
||||
|
||||
impl FacetReader {
|
||||
/// Creates a new `FacetReader`.
|
||||
///
|
||||
/// A facet reader just wraps :
|
||||
/// - a `MultiValueIntFastFieldReader` that makes it possible to
|
||||
/// access the list of facet ords for a given document.
|
||||
/// - a `TermDictionaryImpl` that helps associating a facet to
|
||||
/// an ordinal and vice versa.
|
||||
pub fn new(
|
||||
term_ords: MultiValueIntFastFieldReader<u64>,
|
||||
term_dict: TermDictionaryImpl,
|
||||
) -> FacetReader {
|
||||
FacetReader {
|
||||
term_ords,
|
||||
term_dict,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the size of the sets of facets in the segment.
|
||||
/// This does not take in account the documents that may be marked
|
||||
/// as deleted.
|
||||
///
|
||||
/// `Facet` ordinals range from `0` to `num_facets() - 1`.
|
||||
pub fn num_facets(&self) -> usize {
|
||||
self.term_dict.num_terms()
|
||||
}
|
||||
|
||||
/// Accessor for the facet term dictionary.
|
||||
pub fn facet_dict(&self) -> &TermDictionaryImpl {
|
||||
&self.term_dict
|
||||
}
|
||||
|
||||
/// Given a term ordinal returns the term associated to it.
|
||||
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
|
||||
let found_term = self.term_dict
|
||||
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
|
||||
assert!(found_term, "Term ordinal {} no found.", facet_ord);
|
||||
}
|
||||
|
||||
/// Return the list of facet ordinals associated to a document.
|
||||
pub fn facet_ords(&mut self, doc: DocId, output: &mut Vec<u64>) {
|
||||
self.term_ords.get_vals(doc, output);
|
||||
}
|
||||
}
|
||||
@@ -23,119 +23,36 @@ values stored.
|
||||
Read access performance is comparable to that of an array lookup.
|
||||
*/
|
||||
|
||||
use common;
|
||||
use schema::Cardinality;
|
||||
use schema::FieldType;
|
||||
use schema::Value;
|
||||
pub use self::delete::DeleteBitSet;
|
||||
pub use self::delete::write_delete_bitset;
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::MultiValueIntFastFieldReader;
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
|
||||
mod reader;
|
||||
mod writer;
|
||||
mod serializer;
|
||||
mod error;
|
||||
mod delete;
|
||||
mod facet_reader;
|
||||
mod multivalued;
|
||||
|
||||
/// Trait for types that are allowed for fast fields: (u64 or i64).
|
||||
pub trait FastValue: Default + Clone + Copy {
|
||||
/// Converts a value from u64
|
||||
///
|
||||
/// Internally all fast field values are encoded as u64.
|
||||
fn from_u64(val: u64) -> Self;
|
||||
|
||||
/// Converts a value to u64.
|
||||
///
|
||||
/// Internally all fast field values are encoded as u64.
|
||||
fn to_u64(&self) -> u64;
|
||||
|
||||
/// Returns the fast field cardinality that can be extracted from the given
|
||||
/// `FieldType`.
|
||||
///
|
||||
/// If the type is not a fast field, `None` is returned.
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality>;
|
||||
|
||||
/// Cast value to `u64`.
|
||||
/// The value is just reinterpreted in memory.
|
||||
fn as_u64(&self) -> u64;
|
||||
}
|
||||
|
||||
impl FastValue for u64 {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
val
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
FieldType::HierarchicalFacet => Some(Cardinality::MultiValues),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for i64 {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
common::u64_to_i64(val)
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
common::i64_to_u64(*self)
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::I64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self as u64
|
||||
}
|
||||
}
|
||||
|
||||
fn value_to_u64(value: &Value) -> u64 {
|
||||
match *value {
|
||||
Value::U64(ref val) => *val,
|
||||
Value::I64(ref val) => common::i64_to_u64(*val),
|
||||
_ => panic!("Expected a u64/i64 field, got {:?} ", value),
|
||||
}
|
||||
}
|
||||
pub use self::delete::write_delete_bitset;
|
||||
pub use self::delete::DeleteBitSet;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
pub use self::reader::{I64FastFieldReader, U64FastFieldReader};
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use common::CompositeFile;
|
||||
use super::*;
|
||||
use schema::Field;
|
||||
use std::path::Path;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use schema::Document;
|
||||
use schema::{Schema, SchemaBuilder};
|
||||
use schema::FAST;
|
||||
use test::Bencher;
|
||||
use test;
|
||||
use fastfield::FastFieldReader;
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use common::CompositeFile;
|
||||
use rand::XorShiftRng;
|
||||
use schema::{Schema, SchemaBuilder};
|
||||
use schema::Document;
|
||||
use schema::FAST;
|
||||
use schema::Field;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use super::*;
|
||||
use test;
|
||||
use test::Bencher;
|
||||
|
||||
lazy_static! {
|
||||
static ref SCHEMA: Schema = {
|
||||
@@ -148,9 +65,15 @@ mod tests {
|
||||
};
|
||||
}
|
||||
|
||||
fn add_single_field_doc(fast_field_writers: &mut FastFieldsWriter, field: Field, value: u64) {
|
||||
let mut doc = Document::default();
|
||||
doc.add_u64(field, value);
|
||||
fast_field_writers.add_document(&doc);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield() {
|
||||
let test_fastfield = FastFieldReader::<u64>::from(vec![100, 200, 300]);
|
||||
let test_fastfield = U64FastFieldReader::from(vec![100, 200, 300]);
|
||||
assert_eq!(test_fastfield.get(0), 100);
|
||||
assert_eq!(test_fastfield.get(1), 200);
|
||||
assert_eq!(test_fastfield.get(2), 300);
|
||||
@@ -164,22 +87,20 @@ mod tests {
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>13u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>14u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>2u64));
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 2u64);
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 36 as usize);
|
||||
assert_eq!(source.len(), 35 as usize);
|
||||
}
|
||||
{
|
||||
let composite_file = CompositeFile::open(&source).unwrap();
|
||||
let field_source = composite_file.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(field_source);
|
||||
let fast_field_reader: U64FastFieldReader = U64FastFieldReader::open(field_source);
|
||||
assert_eq!(fast_field_reader.get(0), 13u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14u64);
|
||||
assert_eq!(fast_field_reader.get(2), 2u64);
|
||||
@@ -194,28 +115,26 @@ mod tests {
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>4u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>3_052u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>9_002u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>15_001u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>777u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>1_002u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>1_501u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>215u64));
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 3_052u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 9002u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 15_001u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 777u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 215u64);
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 61 as usize);
|
||||
assert_eq!(source.len(), 60 as usize);
|
||||
}
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data);
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
assert_eq!(fast_field_reader.get(0), 4u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
|
||||
assert_eq!(fast_field_reader.get(2), 3_052u64);
|
||||
@@ -238,21 +157,19 @@ mod tests {
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for _ in 0..10_000 {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>100_000u64));
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64);
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 34 as usize);
|
||||
assert_eq!(source.len(), 33 as usize);
|
||||
}
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data);
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
for doc in 0..10_000 {
|
||||
assert_eq!(fast_field_reader.get(doc), 100_000u64);
|
||||
}
|
||||
@@ -269,23 +186,26 @@ mod tests {
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
// forcing the amplitude to be high
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>0u64));
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64);
|
||||
for i in 0u64..10_000u64 {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + i));
|
||||
add_single_field_doc(
|
||||
&mut fast_field_writers,
|
||||
*FIELD,
|
||||
5_000_000_000_000_000_000u64 + i,
|
||||
);
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 80042 as usize);
|
||||
assert_eq!(source.len(), 80041 as usize);
|
||||
}
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data);
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
|
||||
assert_eq!(fast_field_reader.get(0), 0u64);
|
||||
for doc in 1..10_001 {
|
||||
assert_eq!(
|
||||
@@ -313,19 +233,17 @@ mod tests {
|
||||
doc.add_i64(i64_field, i);
|
||||
fast_field_writers.add_document(&doc);
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 17709 as usize);
|
||||
assert_eq!(source.len(), 17708 as usize);
|
||||
}
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<i64>::open(data);
|
||||
let fast_field_reader: I64FastFieldReader =
|
||||
I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap());
|
||||
|
||||
assert_eq!(fast_field_reader.min_value(), -100i64);
|
||||
assert_eq!(fast_field_reader.max_value(), 9_999i64);
|
||||
@@ -354,17 +272,15 @@ mod tests {
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
let doc = Document::default();
|
||||
fast_field_writers.add_document(&doc);
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<i64>::open(data);
|
||||
let fast_field_reader: I64FastFieldReader =
|
||||
I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap());
|
||||
assert_eq!(fast_field_reader.get(0u32), 0i64);
|
||||
}
|
||||
}
|
||||
@@ -387,19 +303,17 @@ mod tests {
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
for x in &permutation {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data);
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
|
||||
let mut a = 0u64;
|
||||
for _ in 0..n {
|
||||
@@ -415,7 +329,7 @@ mod tests {
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in Iterator::step_by(0u32..n, 7) {
|
||||
for i in Iterator::step_by((0u32..n), 7) {
|
||||
a ^= permutation[i as usize];
|
||||
}
|
||||
a
|
||||
@@ -444,24 +358,22 @@ mod tests {
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
for x in &permutation {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data);
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in Iterator::step_by(0u32..n, 7) {
|
||||
for i in Iterator::step_by((0u32..n), 7) {
|
||||
a ^= fast_field_reader.get(i);
|
||||
}
|
||||
a
|
||||
@@ -478,19 +390,17 @@ mod tests {
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
for x in &permutation {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data);
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
|
||||
@@ -1,88 +0,0 @@
|
||||
mod writer;
|
||||
mod reader;
|
||||
|
||||
pub use self::writer::MultiValueIntFastFieldWriter;
|
||||
pub use self::reader::MultiValueIntFastFieldReader;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Cardinality;
|
||||
use schema::IntOptions;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_u64() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_u64_field(
|
||||
"multifield",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(field=>1u64, field=>3u64));
|
||||
index_writer.add_document(doc!());
|
||||
index_writer.add_document(doc!(field=>4u64));
|
||||
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let mut vals = Vec::new();
|
||||
let multi_value_reader = reader.multi_fast_field_reader::<u64>(field).unwrap();
|
||||
{
|
||||
multi_value_reader.get_vals(2, &mut vals);
|
||||
assert_eq!(&vals, &[4u64]);
|
||||
}
|
||||
{
|
||||
multi_value_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[1u64, 3u64]);
|
||||
}
|
||||
{
|
||||
multi_value_reader.get_vals(1, &mut vals);
|
||||
assert!(vals.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_i64() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_i64_field(
|
||||
"multifield",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(field=> 1i64, field => 3i64));
|
||||
index_writer.add_document(doc!());
|
||||
index_writer.add_document(doc!(field=> -4i64));
|
||||
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let mut vals = Vec::new();
|
||||
let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
|
||||
{
|
||||
multi_value_reader.get_vals(2, &mut vals);
|
||||
assert_eq!(&vals, &[-4i64]);
|
||||
}
|
||||
{
|
||||
multi_value_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[1i64, 3i64]);
|
||||
}
|
||||
{
|
||||
multi_value_reader.get_vals(1, &mut vals);
|
||||
assert!(vals.is_empty());
|
||||
}
|
||||
{
|
||||
multi_value_reader.get_vals(3, &mut vals);
|
||||
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,127 +0,0 @@
|
||||
use DocId;
|
||||
use fastfield::{FastFieldReader, FastValue};
|
||||
|
||||
/// Reader for a multivalued `u64` fast field.
|
||||
///
|
||||
/// The reader is implemented as two `u64` fast field.
|
||||
///
|
||||
/// The `vals_reader` will access the concatenated list of all
|
||||
/// values for all reader.
|
||||
/// The `idx_reader` associated, for each document, the index of its first value.
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct MultiValueIntFastFieldReader<Item: FastValue> {
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
vals_reader: FastFieldReader<Item>,
|
||||
}
|
||||
|
||||
impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
pub(crate) fn open(
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
vals_reader: FastFieldReader<Item>,
|
||||
) -> MultiValueIntFastFieldReader<Item> {
|
||||
MultiValueIntFastFieldReader {
|
||||
idx_reader,
|
||||
vals_reader,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `(start, stop)`, such that the values associated
|
||||
/// to the given document are `start..stop`.
|
||||
fn range(&self, doc: DocId) -> (u64, u64) {
|
||||
let start = self.idx_reader.get(doc);
|
||||
let stop = self.idx_reader.get(doc + 1);
|
||||
(start, stop)
|
||||
}
|
||||
|
||||
/// Returns the number of values associated to a given document.
|
||||
pub fn num_vals(&self, doc: DocId) -> usize {
|
||||
let (start, stop) = self.range(doc);
|
||||
(stop - start) as usize
|
||||
}
|
||||
|
||||
/// Returns the overall number of values associated to documents.
|
||||
pub(crate) fn total_num_vals(&self) -> u64 {
|
||||
self.idx_reader.max_value()
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
|
||||
let (start, stop) = self.range(doc);
|
||||
let len = (stop - start) as usize;
|
||||
vals.resize(len, Item::default());
|
||||
self.vals_reader.get_range(start as u32, &mut vals[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use core::Index;
|
||||
use schema::{Document, Facet, SchemaBuilder};
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let facet_field = schema_builder.add_facet_field("facets");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index
|
||||
.writer_with_num_threads(1, 30_000_000)
|
||||
.expect("Failed to create index writer.");
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat2");
|
||||
doc.add_facet(facet_field, "/category/cat1");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat2");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat3");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().expect("Commit failed");
|
||||
index.load_searchers().expect("Reloading searchers");
|
||||
let searcher = index.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
|
||||
|
||||
let mut facet = Facet::root();
|
||||
{
|
||||
facet_reader.facet_from_ord(1, &mut facet);
|
||||
assert_eq!(facet, Facet::from("/category"));
|
||||
}
|
||||
{
|
||||
facet_reader.facet_from_ord(2, &mut facet);
|
||||
assert_eq!(facet, Facet::from("/category/cat1"));
|
||||
}
|
||||
{
|
||||
facet_reader.facet_from_ord(3, &mut facet);
|
||||
assert_eq!(format!("{}", facet), "/category/cat2");
|
||||
assert_eq!(facet, Facet::from("/category/cat2"));
|
||||
}
|
||||
{
|
||||
facet_reader.facet_from_ord(4, &mut facet);
|
||||
assert_eq!(facet, Facet::from("/category/cat3"));
|
||||
}
|
||||
|
||||
let mut vals = Vec::new();
|
||||
{
|
||||
facet_reader.facet_ords(0, &mut vals);
|
||||
assert_eq!(&vals[..], &[3, 2]);
|
||||
}
|
||||
{
|
||||
facet_reader.facet_ords(1, &mut vals);
|
||||
assert_eq!(&vals[..], &[3]);
|
||||
}
|
||||
{
|
||||
facet_reader.facet_ords(2, &mut vals);
|
||||
assert_eq!(&vals[..], &[4]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,112 +0,0 @@
|
||||
use fastfield::FastFieldSerializer;
|
||||
use fastfield::serializer::FastSingleFieldSerializer;
|
||||
use fastfield::value_to_u64;
|
||||
use std::collections::HashMap;
|
||||
use postings::UnorderedTermId;
|
||||
use schema::{Document, Field};
|
||||
use std::io;
|
||||
use itertools::Itertools;
|
||||
use termdict::TermOrdinal;
|
||||
|
||||
pub struct MultiValueIntFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u64>,
|
||||
doc_index: Vec<u64>,
|
||||
is_facet: bool,
|
||||
}
|
||||
|
||||
impl MultiValueIntFastFieldWriter {
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub fn new(field: Field, is_facet: bool) -> Self {
|
||||
MultiValueIntFastFieldWriter {
|
||||
field,
|
||||
vals: Vec::new(),
|
||||
doc_index: Vec::new(),
|
||||
is_facet,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
|
||||
pub fn next_doc(&mut self) {
|
||||
self.doc_index.push(self.vals.len() as u64);
|
||||
}
|
||||
|
||||
/// Records a new value.
|
||||
///
|
||||
/// The n-th value being recorded is implicitely
|
||||
/// associated to the document with the `DocId` n.
|
||||
/// (Well, `n-1` actually because of 0-indexing)
|
||||
pub fn add_val(&mut self, val: UnorderedTermId) {
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
if !self.is_facet {
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field() == self.field {
|
||||
self.add_val(value_to_u64(field_value.value()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializes fast field values by pushing them to the `FastFieldSerializer`.
|
||||
///
|
||||
/// HashMap makes it possible to remap them before serializing.
|
||||
/// Specifically, string terms are first stored in the writer as their
|
||||
/// position in the `IndexWriter`'s `HashMap`. This value is called
|
||||
/// an `UnorderedTermId`.
|
||||
///
|
||||
/// During the serialization of the segment, terms gets sorted and
|
||||
/// `tantivy` builds a mapping to convert this `UnorderedTermId` into
|
||||
/// term ordinals.
|
||||
///
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
mapping_opt: Option<&HashMap<UnorderedTermId, TermOrdinal>>,
|
||||
) -> io::Result<()> {
|
||||
{
|
||||
// writing the offset index
|
||||
let mut doc_index_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
|
||||
for &offset in &self.doc_index {
|
||||
doc_index_serializer.add_val(offset)?;
|
||||
}
|
||||
doc_index_serializer.add_val(self.vals.len() as u64)?;
|
||||
doc_index_serializer.close_field()?;
|
||||
}
|
||||
{
|
||||
// writing the values themselves.
|
||||
let mut value_serializer: FastSingleFieldSerializer<_>;
|
||||
match mapping_opt {
|
||||
Some(mapping) => {
|
||||
value_serializer = serializer.new_u64_fast_field_with_idx(
|
||||
self.field,
|
||||
0u64,
|
||||
mapping.len() as u64,
|
||||
1,
|
||||
)?;
|
||||
for val in &self.vals {
|
||||
let remapped_val = *mapping.get(val).expect("Missing term ordinal");
|
||||
value_serializer.add_val(remapped_val)?;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let val_min_max = self.vals.iter().cloned().minmax();
|
||||
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0u64));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
for &val in &self.vals {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
value_serializer.close_field()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,35 +1,107 @@
|
||||
use common::BinarySerializable;
|
||||
use common::bitpacker::BitUnpacker;
|
||||
use common::CompositeFile;
|
||||
use common::compute_num_bits;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use directory::ReadOnlySource;
|
||||
use common::{self, BinarySerializable};
|
||||
use common::bitpacker::{compute_num_bits, BitUnpacker};
|
||||
use DocId;
|
||||
use fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use owning_ref::OwningRef;
|
||||
use schema::FAST;
|
||||
use schema::SchemaBuilder;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::mem;
|
||||
use std::path::Path;
|
||||
use super::FastValue;
|
||||
use schema::FAST;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use schema::FieldType;
|
||||
use std::mem;
|
||||
use common::CompositeFile;
|
||||
use owning_ref::OwningRef;
|
||||
|
||||
/// Trait for accessing a fastfield.
|
||||
///
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct FastFieldReader<Item: FastValue> {
|
||||
bit_unpacker: BitUnpacker<OwningRef<ReadOnlySource, [u8]>>,
|
||||
min_value_u64: u64,
|
||||
max_value_u64: u64,
|
||||
_phantom: PhantomData<Item>,
|
||||
pub trait FastFieldReader: Sized {
|
||||
/// Type of the value stored in the fastfield.
|
||||
type ValueType;
|
||||
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `doc` is greater than the segment
|
||||
// `maxdoc`.
|
||||
fn get(&self, doc: DocId) -> Self::ValueType;
|
||||
|
||||
/// Fills an output buffer with the fast field values
|
||||
/// associated with the `DocId` going from
|
||||
/// `start` to `start + output.len()`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
fn get_range(&self, start: u32, output: &mut [Self::ValueType]);
|
||||
|
||||
/// Opens a fast field given a source.
|
||||
fn open(source: ReadOnlySource) -> Self;
|
||||
|
||||
/// Returns true iff the given field_type makes
|
||||
/// it possible to access the field values via a
|
||||
/// fastfield.
|
||||
fn is_enabled(field_type: &FieldType) -> bool;
|
||||
}
|
||||
|
||||
impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// Opens a fast field given a source.
|
||||
pub fn open(data: ReadOnlySource) -> Self {
|
||||
/// `FastFieldReader` for unsigned 64-bits integers.
|
||||
pub struct U64FastFieldReader {
|
||||
bit_unpacker: BitUnpacker<OwningRef<ReadOnlySource, [u8]>>,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
}
|
||||
|
||||
impl U64FastFieldReader {
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The min value does not take in account of possible
|
||||
/// deleted document, and should be considered as a lower bound
|
||||
/// of the actual minimum value.
|
||||
pub fn min_value(&self) -> u64 {
|
||||
self.min_value
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn max_value(&self) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldReader for U64FastFieldReader {
|
||||
type ValueType = u64;
|
||||
|
||||
fn get(&self, doc: DocId) -> u64 {
|
||||
self.min_value + self.bit_unpacker.get(doc as usize)
|
||||
}
|
||||
|
||||
fn is_enabled(field_type: &FieldType) -> bool {
|
||||
match *field_type {
|
||||
FieldType::U64(ref integer_options) => integer_options.is_fast(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_range(&self, start: u32, output: &mut [Self::ValueType]) {
|
||||
self.bit_unpacker.get_range(start, output);
|
||||
for out in output.iter_mut() {
|
||||
*out += self.min_value;
|
||||
}
|
||||
}
|
||||
|
||||
/// Opens a new fast field reader given a read only source.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if the data is corrupted.
|
||||
fn open(data: ReadOnlySource) -> U64FastFieldReader {
|
||||
let min_value: u64;
|
||||
let amplitude: u64;
|
||||
{
|
||||
@@ -42,67 +114,17 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let owning_ref = OwningRef::new(data).map(|data| &data[16..]);
|
||||
let bit_unpacker = BitUnpacker::new(owning_ref, num_bits);
|
||||
FastFieldReader {
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
_phantom: PhantomData,
|
||||
let bit_unpacker = BitUnpacker::new(owning_ref, num_bits as usize);
|
||||
U64FastFieldReader {
|
||||
min_value: min_value,
|
||||
max_value: max_value,
|
||||
bit_unpacker: bit_unpacker,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `doc` is greater than the segment
|
||||
// `maxdoc`.
|
||||
pub fn get(&self, doc: DocId) -> Item {
|
||||
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc as usize))
|
||||
}
|
||||
|
||||
/// Fills an output buffer with the fast field values
|
||||
/// associated with the `DocId` going from
|
||||
/// `start` to `start + output.len()`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
///
|
||||
// TODO change start to `u64`.
|
||||
// For multifastfield, start is an index in a second fastfield, not a `DocId`
|
||||
pub fn get_range(&self, start: u32, output: &mut [Item]) {
|
||||
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
|
||||
self.bit_unpacker.get_range(start, output_u64);
|
||||
for out in output_u64.iter_mut() {
|
||||
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn min_value(&self) -> Item {
|
||||
Item::from_u64(self.min_value_u64)
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn max_value(&self) -> Item {
|
||||
Item::from_u64(self.max_value_u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
|
||||
fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
|
||||
impl From<Vec<u64>> for U64FastFieldReader {
|
||||
fn from(vals: Vec<u64>) -> U64FastFieldReader {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_u64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
@@ -120,21 +142,89 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
|
||||
.get_field_writer(field)
|
||||
.expect("With a RAMDirectory, this should never fail.");
|
||||
for val in vals {
|
||||
fast_field_writer.add_val(val.to_u64());
|
||||
fast_field_writer.add_val(val);
|
||||
}
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
|
||||
let source = directory.open_read(path).expect("Failed to open the file");
|
||||
let composite_file =
|
||||
CompositeFile::open(&source).expect("Failed to read the composite file");
|
||||
|
||||
let field_source = composite_file
|
||||
.open_read(field)
|
||||
.expect("File component not found");
|
||||
FastFieldReader::open(field_source)
|
||||
U64FastFieldReader::open(field_source)
|
||||
}
|
||||
}
|
||||
|
||||
/// `FastFieldReader` for signed 64-bits integers.
|
||||
pub struct I64FastFieldReader {
|
||||
underlying: U64FastFieldReader,
|
||||
}
|
||||
|
||||
impl I64FastFieldReader {
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The min value does not take in account of possible
|
||||
/// deleted document, and should be considered as a lower bound
|
||||
/// of the actual minimum value.
|
||||
pub fn min_value(&self) -> i64 {
|
||||
common::u64_to_i64(self.underlying.min_value())
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn max_value(&self) -> i64 {
|
||||
common::u64_to_i64(self.underlying.max_value())
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldReader for I64FastFieldReader {
|
||||
type ValueType = i64;
|
||||
|
||||
///
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic or return wrong random result if `doc`
|
||||
/// is greater or equal to the segment's `maxdoc`.
|
||||
fn get(&self, doc: DocId) -> i64 {
|
||||
common::u64_to_i64(self.underlying.get(doc))
|
||||
}
|
||||
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic or return wrong random result if `doc`
|
||||
/// is greater or equal to the segment's `maxdoc`.
|
||||
fn get_range(&self, start: u32, output: &mut [Self::ValueType]) {
|
||||
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
|
||||
self.underlying.get_range(start, output_u64);
|
||||
for mut_val in output_u64.iter_mut() {
|
||||
*mut_val = common::u64_to_i64(*mut_val as u64) as u64;
|
||||
}
|
||||
}
|
||||
|
||||
/// Opens a new fast field reader given a read only source.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if the data is corrupted.
|
||||
fn open(data: ReadOnlySource) -> I64FastFieldReader {
|
||||
I64FastFieldReader {
|
||||
underlying: U64FastFieldReader::open(data),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_enabled(field_type: &FieldType) -> bool {
|
||||
match *field_type {
|
||||
FieldType::I64(ref integer_options) => integer_options.is_fast(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
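
// Sketch (not part of the diff above): `I64FastFieldReader` stores i64 values as u64
// through an order-preserving mapping (`common::i64_to_u64` / `common::u64_to_i64`).
// A common way to implement such a mapping is to flip the sign bit; the helpers
// below are illustrative stand-ins, not necessarily tantivy's exact code.
fn i64_to_u64_example(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

fn u64_to_i64_example(val: u64) -> i64 {
    (val ^ (1u64 << 63)) as i64
}

#[test]
fn order_preserving_mapping_example() {
    // Negative values sort before non-negative ones after the mapping.
    assert!(i64_to_u64_example(-1) < i64_to_u64_example(0));
    // The mapping round-trips.
    assert_eq!(u64_to_i64_example(i64_to_u64_example(-42)), -42);
}
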
@@ -1,8 +1,7 @@
|
||||
use common::BinarySerializable;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use common::bitpacker::BitPacker;
|
||||
use common::compute_num_bits;
|
||||
use common::bitpacker::{compute_num_bits, BitPacker};
|
||||
use common::CountingWriter;
|
||||
use common::CompositeWrite;
|
||||
use std::io::{self, Write};
|
||||
@@ -46,18 +45,7 @@ impl FastFieldSerializer {
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
idx: usize,
|
||||
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
let field_write = self.composite_write.for_field(field);
|
||||
FastSingleFieldSerializer::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
@@ -73,43 +61,30 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encodes the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// the minimum number of bits required to encode the
/// values.
fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
min_value.serialize(write)?;
|
||||
let amplitude = max_value - min_value;
|
||||
amplitude.serialize(write)?;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_packer = BitPacker::new();
|
||||
let bit_packer = BitPacker::new(num_bits as usize);
|
||||
Ok(FastSingleFieldSerializer {
|
||||
write,
|
||||
bit_packer,
|
||||
min_value,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer
|
||||
.write(val_to_write, self.num_bits, &mut self.write)?;
|
||||
self.bit_packer.write(val_to_write, &mut self.write)?;
|
||||
Ok(())
|
||||
}
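
// Sketch (not part of the diff above): the fast field serializer bitpacks
// `(val - min_value)` using the minimum number of bits able to represent
// `amplitude = max_value - min_value`. The helper below illustrates that
// bit-width computation; the actual implementation lives in
// `common::compute_num_bits` together with `BitPacker`.
fn compute_num_bits_example(amplitude: u64) -> u8 {
    (64 - amplitude.leading_zeros()) as u8
}

#[test]
fn bit_width_example() {
    // Values in [min_value, min_value + 255] fit in 8 bits each.
    assert_eq!(compute_num_bits_example(255), 8);
    // A constant column (amplitude 0) needs 0 bits per value.
    assert_eq!(compute_num_bits_example(0), 0);
}
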
@@ -1,120 +1,81 @@
|
||||
use schema::{Cardinality, Document, Field, Schema};
|
||||
use schema::{Document, Field, Schema};
|
||||
use fastfield::FastFieldSerializer;
|
||||
use std::io;
|
||||
use schema::Value;
|
||||
use DocId;
|
||||
use schema::FieldType;
|
||||
use common;
|
||||
use common::VInt;
|
||||
use std::collections::HashMap;
|
||||
use postings::UnorderedTermId;
|
||||
use super::multivalued::MultiValueIntFastFieldWriter;
|
||||
use common::BinarySerializable;
|
||||
use termdict::TermOrdinal;
|
||||
|
||||
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
|
||||
single_value_writers: Vec<IntFastFieldWriter>,
|
||||
multi_values_writers: Vec<MultiValueIntFastFieldWriter>,
|
||||
field_writers: Vec<IntFastFieldWriter>,
|
||||
}
|
||||
|
||||
impl FastFieldsWriter {
|
||||
/// Create all `FastFieldWriter` required by the schema.
|
||||
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
|
||||
let mut single_value_writers = Vec::new();
|
||||
let mut multi_values_writers = Vec::new();
|
||||
|
||||
for (field_id, field_entry) in schema.fields().iter().enumerate() {
|
||||
let field = Field(field_id as u32);
|
||||
let default_value = if let FieldType::I64(_) = *field_entry.field_type() {
|
||||
common::i64_to_u64(0i64)
|
||||
} else {
|
||||
0u64
|
||||
};
|
||||
match *field_entry.field_type() {
|
||||
FieldType::I64(ref int_options) | FieldType::U64(ref int_options) => {
|
||||
match int_options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
let field_writers: Vec<IntFastFieldWriter> = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.flat_map(|(field_id, field_entry)| {
|
||||
let field = Field(field_id as u32);
|
||||
match *field_entry.field_type() {
|
||||
FieldType::I64(ref int_options) => {
|
||||
if int_options.is_fast() {
|
||||
let mut fast_field_writer = IntFastFieldWriter::new(field);
|
||||
fast_field_writer.set_val_if_missing(default_value);
|
||||
single_value_writers.push(fast_field_writer);
|
||||
fast_field_writer.set_val_if_missing(common::i64_to_u64(0i64));
|
||||
Some(fast_field_writer)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
Some(Cardinality::MultiValues) => {
|
||||
let fast_field_writer = MultiValueIntFastFieldWriter::new(field, false);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
FieldType::U64(ref int_options) => {
|
||||
if int_options.is_fast() {
|
||||
Some(IntFastFieldWriter::new(field))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
FieldType::HierarchicalFacet => {
|
||||
let fast_field_writer = MultiValueIntFastFieldWriter::new(field, true);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
FastFieldsWriter {
|
||||
single_value_writers,
|
||||
multi_values_writers,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
FastFieldsWriter { field_writers }
|
||||
}
|
||||
|
||||
/// Returns a `FastFieldsWriter` with a `u64` `IntFastFieldWriter` for each
/// Returns a `FastFieldsWriter`
/// with an `IntFastFieldWriter` for each
/// of the fields given in argument.
pub(crate) fn new(fields: Vec<Field>) -> FastFieldsWriter {
|
||||
pub fn new(fields: Vec<Field>) -> FastFieldsWriter {
|
||||
FastFieldsWriter {
|
||||
single_value_writers: fields.into_iter().map(IntFastFieldWriter::new).collect(),
|
||||
multi_values_writers: vec![],
|
||||
field_writers: fields.into_iter().map(IntFastFieldWriter::new).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.single_value_writers
|
||||
self.field_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Returns the fast field multi-value writer for the given field.
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
/// configured as a multivalued fastfield in the schema.
|
||||
pub(crate) fn get_multivalue_writer(
|
||||
&mut self,
|
||||
field: Field,
|
||||
) -> Option<&mut MultiValueIntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
// TODO expose for users
|
||||
self.multi_values_writers
|
||||
.iter_mut()
|
||||
.find(|multivalue_writer| multivalue_writer.field() == field)
|
||||
.find(|field_writer| field_writer.field == field)
|
||||
}
|
||||
|
||||
/// Indexes all of the fastfields of a new document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
for field_writer in &mut self.single_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.multi_values_writers {
|
||||
field_writer.next_doc();
|
||||
for field_writer in &mut self.field_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializes all of the `FastFieldWriter`s by pushing them in
|
||||
/// order to the fast field serializer.
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
mapping: &HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>,
|
||||
) -> io::Result<()> {
|
||||
for field_writer in &self.single_value_writers {
|
||||
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
|
||||
for field_writer in &self.field_writers {
|
||||
field_writer.serialize(serializer)?;
|
||||
}
|
||||
for field_writer in &self.multi_values_writers {
|
||||
let field = field_writer.field();
|
||||
field_writer.serialize(serializer, mapping.get(&field))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -123,7 +84,7 @@ impl FastFieldsWriter {
|
||||
///
|
||||
/// The missing values will be filled with 0.
|
||||
pub fn fill_val_up_to(&mut self, doc: DocId) {
|
||||
for field_writer in &mut self.single_value_writers {
|
||||
for field_writer in &mut self.field_writers {
|
||||
field_writer.fill_val_up_to(doc);
|
||||
}
|
||||
}
|
||||
@@ -166,11 +127,6 @@ impl IntFastFieldWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the field that this writer is targeting.
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
|
||||
/// Sets the default value.
|
||||
///
|
||||
/// This default value is recorded for documents if
|
||||
@@ -224,7 +180,11 @@ impl IntFastFieldWriter {
|
||||
/// only the first one is taken in account.
|
||||
fn extract_val(&self, doc: &Document) -> u64 {
|
||||
match doc.get_first(self.field) {
|
||||
Some(v) => super::value_to_u64(v),
|
||||
Some(v) => match *v {
|
||||
Value::U64(ref val) => *val,
|
||||
Value::I64(ref val) => common::i64_to_u64(*val),
|
||||
_ => panic!("Expected a u64field, got {:?} ", v),
|
||||
},
|
||||
None => self.val_if_missing,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ use indexer::stamper::Stamper;
|
||||
use datastruct::stacker::Heap;
|
||||
use directory::FileProtection;
|
||||
use error::{Error, ErrorKind, Result, ResultExt};
|
||||
use Directory;
|
||||
use fastfield::write_delete_bitset;
|
||||
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||
use futures::Canceled;
|
||||
@@ -20,17 +21,17 @@ use indexer::MergePolicy;
|
||||
use indexer::operation::DeleteOperation;
|
||||
use indexer::SegmentEntry;
|
||||
use indexer::SegmentWriter;
|
||||
use docset::DocSet;
|
||||
use postings::DocSet;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Document;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use std::mem;
|
||||
use std::mem::swap;
|
||||
use std::thread::JoinHandle;
|
||||
use indexer::DirectoryLock;
|
||||
use super::directory_lock::DirectoryLock;
|
||||
use super::operation::AddOperation;
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use std::thread;
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
@@ -56,7 +57,7 @@ type DocumentReceiver = chan::Receiver<AddOperation>;
|
||||
pub struct IndexWriter {
|
||||
// the lock is just used to bind the
|
||||
// lifetime of the lock with that of the IndexWriter.
|
||||
_directory_lock: Option<DirectoryLock>,
|
||||
_directory_lock: DirectoryLock,
|
||||
|
||||
index: Index,
|
||||
|
||||
@@ -103,20 +104,22 @@ pub fn open_index_writer(
|
||||
index: &Index,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
directory_lock: DirectoryLock,
|
||||
) -> Result<IndexWriter> {
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_LIMIT as usize {
|
||||
if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize {
|
||||
panic!(format!(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_LIMIT
|
||||
));
|
||||
}
|
||||
|
||||
let directory_lock = DirectoryLock::lock(index.directory().box_clone())?;
|
||||
|
||||
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
let current_opstamp = index.load_metas()?.opstamp;
|
||||
let current_opstamp = index.opstamp();
|
||||
|
||||
let stamper = Stamper::new(current_opstamp);
|
||||
|
||||
@@ -124,7 +127,7 @@ pub fn open_index_writer(
|
||||
SegmentUpdater::new(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
||||
|
||||
let mut index_writer = IndexWriter {
|
||||
_directory_lock: Some(directory_lock),
|
||||
_directory_lock: directory_lock,
|
||||
|
||||
heap_size_in_bytes_per_thread,
|
||||
index: index.clone(),
|
||||
@@ -200,6 +203,7 @@ pub fn advance_deletes(
|
||||
target_opstamp: u64,
|
||||
) -> Result<Option<FileProtection>> {
|
||||
let mut file_protect: Option<FileProtection> = None;
|
||||
|
||||
{
|
||||
if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
|
||||
// We are already up-to-date here.
|
||||
@@ -240,6 +244,7 @@ pub fn advance_deletes(
|
||||
}
|
||||
}
|
||||
segment_entry.set_meta(segment.meta().clone());
|
||||
|
||||
Ok(file_protect)
|
||||
}
|
||||
|
||||
@@ -247,18 +252,17 @@ fn index_documents(
|
||||
heap: &mut Heap,
|
||||
table_size: usize,
|
||||
segment: &Segment,
|
||||
schema: &Schema,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> Result<bool> {
|
||||
heap.clear();
|
||||
let schema = segment.schema();
|
||||
let segment_id = segment.id();
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(heap, table_size, segment.clone(), &schema)?;
|
||||
let mut segment_writer = SegmentWriter::for_segment(heap, table_size, segment.clone(), schema)?;
|
||||
for doc in document_iterator {
|
||||
segment_writer.add_document(doc, &schema)?;
|
||||
segment_writer.add_document(&doc, schema)?;
|
||||
// There are two possible conditions for closing the segment.
// One is that the memory arena dedicated to the segment is
// getting full.
@@ -282,11 +286,6 @@ fn index_documents(
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !segment_updater.is_alive() {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let num_docs = segment_writer.max_doc();
|
||||
|
||||
// this is ensured by the call to peek before starting
|
||||
@@ -357,11 +356,7 @@ impl IndexWriter {
|
||||
.add_segment(self.generation, segment_entry);
|
||||
}
|
||||
|
||||
/// *Experimental & Advanced API* Creates a new segment.
|
||||
/// and marks it as currently in write.
|
||||
///
|
||||
/// This method is useful only for users trying to do complex
|
||||
/// operations, like converting an index format to another.
|
||||
#[doc(hidden)]
|
||||
pub fn new_segment(&self) -> Segment {
|
||||
self.segment_updater.new_segment()
|
||||
}
|
||||
@@ -370,6 +365,7 @@ impl IndexWriter {
|
||||
/// The thread consumes documents from the pipeline.
|
||||
///
|
||||
fn add_indexing_worker(&mut self) -> Result<()> {
|
||||
let schema = self.index.schema();
|
||||
let document_receiver_clone = self.document_receiver.clone();
|
||||
let mut segment_updater = self.segment_updater.clone();
|
||||
let (heap_size, table_size) = split_memory(self.heap_size_in_bytes_per_thread);
|
||||
@@ -410,6 +406,7 @@ impl IndexWriter {
|
||||
&mut heap,
|
||||
table_size,
|
||||
&segment,
|
||||
&schema,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
@@ -479,66 +476,41 @@ impl IndexWriter {
|
||||
/// state as it was after the last commit.
|
||||
///
|
||||
/// The opstamp at the last commit is returned.
|
||||
pub fn rollback(&mut self) -> Result<()> {
|
||||
pub fn rollback(mut self) -> Result<IndexWriter> {
|
||||
info!("Rolling back to opstamp {}", self.committed_opstamp);
|
||||
|
||||
// marks the segment updater as killed. From now on, all
|
||||
// segment updates will be ignored.
|
||||
self.segment_updater.kill();
|
||||
|
||||
let document_receiver = self.document_receiver.clone();
|
||||
|
||||
// take the directory lock to create a new index_writer.
|
||||
let directory_lock = self._directory_lock
|
||||
.take()
|
||||
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
|
||||
|
||||
let new_index_writer: IndexWriter = open_index_writer(
|
||||
&self.index,
|
||||
self.num_threads,
|
||||
self.heap_size_in_bytes_per_thread,
|
||||
directory_lock,
|
||||
)?;
|
||||
|
||||
// the current `self` is dropped right away because of this call.
|
||||
//
|
||||
// This will drop the document queue, and the thread
|
||||
// should terminate.
|
||||
mem::replace(self, new_index_writer);
|
||||
|
||||
// Drains the document receiver pipeline :
|
||||
// Workers don't need to index the pending documents.
|
||||
//
|
||||
// This will reach an end as the only document_sender
|
||||
// was dropped with the index_writer.
|
||||
for _ in document_receiver.clone() {}
|
||||
let receiver_clone = self.document_receiver.clone();
|
||||
let index = self.index.clone();
|
||||
let num_threads = self.num_threads;
|
||||
let heap_size_in_bytes_per_thread = self.heap_size_in_bytes_per_thread;
|
||||
drop(self);
|
||||
for _ in receiver_clone {}
|
||||
|
||||
Ok(())
|
||||
let index_writer = open_index_writer(&index, num_threads, heap_size_in_bytes_per_thread)?;
|
||||
|
||||
Ok(index_writer)
|
||||
}
|
||||
|
||||
/// Prepares a commit.
|
||||
/// Commits all of the pending changes
|
||||
///
|
||||
/// Calling `prepare_commit()` will cut the indexing
|
||||
/// queue. All pending documents will be sent to the
|
||||
/// indexing workers. They will then terminate, regardless
|
||||
/// of the size of their current segment and flush their
|
||||
/// work on disk.
|
||||
/// A call to commit blocks.
|
||||
/// After it returns, all of the documents that
/// were added since the last commit are published
/// and persisted.
|
||||
///
|
||||
/// Once a commit is "prepared", you can either
|
||||
/// call
|
||||
/// * `.commit()`: to accept this commit
|
||||
/// * `.abort()`: to cancel this commit.
|
||||
/// In case of a crash or a hardware failure (as
/// long as the hard disk is spared), it will be possible
/// to resume indexing from this point.
///
|
||||
/// In the current implementation, `PreparedCommit` borrows
|
||||
/// the `IndexWriter` mutably so we are guaranteed that no new
|
||||
/// document can be added as long as it is committed or is
|
||||
/// dropped.
|
||||
/// Commit returns the `opstamp` of the last document
|
||||
/// that made it in the commit.
|
||||
///
|
||||
/// It is also possible to add a payload to the `commit`
|
||||
/// using this API.
|
||||
/// See [`PreparedCommit::set_payload()`](PreparedCommit.html)
|
||||
pub fn prepare_commit(&mut self) -> Result<PreparedCommit> {
|
||||
// Here, because we join all of the worker threads,
|
||||
pub fn commit(&mut self) -> Result<u64> {
|
||||
// here, because we join all of the worker threads,
|
||||
// all of the segment update for this commit have been
|
||||
// sent.
|
||||
//
|
||||
@@ -548,7 +520,8 @@ impl IndexWriter {
|
||||
|
||||
// This will move uncommitted segments to the state of
|
||||
// committed segments.
|
||||
info!("Preparing commit");
|
||||
self.committed_opstamp = self.stamper.stamp();
|
||||
info!("committing {}", self.committed_opstamp);
|
||||
|
||||
// this will drop the current document channel
|
||||
// and recreate a new one channels.
|
||||
@@ -570,32 +543,10 @@ impl IndexWriter {
|
||||
self.add_indexing_worker()?;
|
||||
}
|
||||
|
||||
let commit_opstamp = self.stamper.stamp();
|
||||
let prepared_commit = PreparedCommit::new(self, commit_opstamp);
|
||||
info!("Prepared commit {}", commit_opstamp);
|
||||
Ok(prepared_commit)
|
||||
}
|
||||
// wait for the segment update thread to have processed the info
|
||||
self.segment_updater.commit(self.committed_opstamp)?;
|
||||
|
||||
/// Commits all of the pending changes
|
||||
///
|
||||
/// A call to commit blocks.
|
||||
/// After it returns, all of the documents that
/// were added since the last commit are published
/// and persisted.
|
||||
///
|
||||
/// In case of a crash or a hardware failure (as
/// long as the hard disk is spared), it will be possible
/// to resume indexing from this point.
///
|
||||
/// Commit returns the `opstamp` of the last document
|
||||
/// that made it in the commit.
|
||||
///
|
||||
pub fn commit(&mut self) -> Result<u64> {
|
||||
self.prepare_commit()?.commit()
|
||||
}
|
||||
|
||||
pub(crate) fn segment_updater(&self) -> &SegmentUpdater {
|
||||
&self.segment_updater
|
||||
Ok(self.committed_opstamp)
|
||||
}
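
// Sketch (not part of the diff above): typical usage of the `IndexWriter` API
// touched by this change, mirroring the tests further down. It assumes the
// `doc!` macro is in scope (`#[macro_use] extern crate tantivy;`). `commit()`
// returns the opstamp of the last document included in the commit.
fn index_writer_usage_example(
    index: &tantivy::Index,
    text_field: tantivy::schema::Field,
) -> tantivy::Result<()> {
    let mut index_writer = index.writer_with_num_threads(3, 40_000_000)?;
    index_writer.add_document(doc!(text_field => "hello"));
    let _opstamp = index_writer.commit()?;
    // Deletes only become visible once they are committed as well.
    index_writer.delete_term(tantivy::Term::from_field_text(text_field, "hello"));
    index_writer.commit()?;
    Ok(())
}
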
/// Delete all documents containing a given term.
|
||||
@@ -701,21 +652,33 @@ mod tests {
|
||||
|
||||
let num_docs_containing = |s: &str| {
|
||||
let searcher = index.searcher();
|
||||
let term = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term)
|
||||
let term_a = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term_a)
|
||||
};
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(3, 40_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
index_writer.rollback().unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
|
||||
index_writer = index_writer.rollback().unwrap();
|
||||
|
||||
assert_eq!(index_writer.commit_opstamp(), 0u64);
|
||||
assert_eq!(num_docs_containing("a"), 0);
|
||||
|
||||
{
|
||||
index_writer.add_document(doc!(text_field=>"b"));
|
||||
index_writer.add_document(doc!(text_field=>"c"));
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "b");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "c");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
assert_eq!(index_writer.commit().unwrap(), 2u64);
|
||||
index.load_searchers().unwrap();
|
||||
@@ -765,78 +728,4 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prepare_with_commit_message() {
|
||||
let _ = env_logger::init();
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
}
|
||||
{
|
||||
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
|
||||
prepared_commit.set_payload("first commit");
|
||||
assert_eq!(prepared_commit.opstamp(), 100);
|
||||
prepared_commit.commit().expect("commit failed");
|
||||
}
|
||||
{
|
||||
let metas = index.load_metas().unwrap();
|
||||
assert_eq!(metas.payload.unwrap(), "first commit");
|
||||
}
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
{
|
||||
let metas = index.load_metas().unwrap();
|
||||
assert!(metas.payload.is_none());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prepare_but_rollback() {
|
||||
let _ = env_logger::init();
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
}
|
||||
{
|
||||
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
|
||||
prepared_commit.set_payload("first commit");
|
||||
assert_eq!(prepared_commit.opstamp(), 100);
|
||||
prepared_commit.abort().expect("commit failed");
|
||||
}
|
||||
{
|
||||
let metas = index.load_metas().unwrap();
|
||||
assert!(metas.payload.is_none());
|
||||
}
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "b"));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let num_docs_containing = |s: &str| {
|
||||
let searcher = index.searcher();
|
||||
let term_a = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term_a)
|
||||
};
|
||||
assert_eq!(num_docs_containing("a"), 0);
|
||||
assert_eq!(num_docs_containing("b"), 100);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
@@ -13,9 +13,7 @@ mod segment_entry;
|
||||
mod doc_opstamp_mapping;
|
||||
pub mod operation;
|
||||
mod stamper;
|
||||
mod prepared_commit;
|
||||
|
||||
pub use self::prepared_commit::PreparedCommit;
|
||||
pub use self::segment_entry::{SegmentEntry, SegmentState};
|
||||
pub use self::segment_serializer::SegmentSerializer;
|
||||
pub use self::segment_writer::SegmentWriter;
|
||||
@@ -23,7 +21,6 @@ pub use self::index_writer::IndexWriter;
|
||||
pub use self::log_merge_policy::LogMergePolicy;
|
||||
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||
pub use self::segment_manager::SegmentManager;
|
||||
pub(crate) use self::directory_lock::DirectoryLock;
|
||||
|
||||
/// Alias for the default merge policy, which is the `LogMergePolicy`.
|
||||
pub type DefaultMergePolicy = LogMergePolicy;
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
use Result;
|
||||
use super::IndexWriter;
|
||||
|
||||
/// A prepared commit
|
||||
pub struct PreparedCommit<'a> {
|
||||
index_writer: &'a mut IndexWriter,
|
||||
payload: Option<String>,
|
||||
opstamp: u64,
|
||||
}
|
||||
|
||||
impl<'a> PreparedCommit<'a> {
|
||||
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
|
||||
PreparedCommit {
|
||||
index_writer,
|
||||
payload: None,
|
||||
opstamp
|
||||
}
|
||||
}
|
||||
|
||||
pub fn opstamp(&self) -> u64 {
|
||||
self.opstamp
|
||||
}
|
||||
|
||||
pub fn set_payload(&mut self, payload: &str) {
|
||||
self.payload = Some(payload.to_string())
|
||||
}
|
||||
|
||||
pub fn abort(self) -> Result<()> {
|
||||
self.index_writer.rollback()
|
||||
}
|
||||
|
||||
pub fn commit(self) -> Result<u64> {
|
||||
info!("committing {}", self.opstamp);
|
||||
self.index_writer
|
||||
.segment_updater()
|
||||
.commit(self.opstamp, self.payload)?;
|
||||
Ok(self.opstamp)
|
||||
}
|
||||
}
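
// Sketch (not part of the diff above): the two-phase commit flow defined by
// `PreparedCommit` (the API this compare removes), as exercised by
// `test_prepare_with_commit_message` earlier in the diff.
fn prepared_commit_usage_example(index_writer: &mut tantivy::IndexWriter) -> tantivy::Result<u64> {
    let mut prepared_commit = index_writer.prepare_commit()?;
    prepared_commit.set_payload("first commit");
    let opstamp = prepared_commit.opstamp();
    // Either `commit()` to publish, or `abort()` to roll back to the last commit.
    prepared_commit.commit()?;
    Ok(opstamp)
}
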
@@ -56,6 +56,7 @@ impl SegmentSerializer {
|
||||
}
|
||||
|
||||
/// Finalize the segment serialization.
|
||||
#[inline(never)]
|
||||
pub fn close(self) -> Result<()> {
|
||||
self.fast_field_serializer.close()?;
|
||||
self.postings_serializer.close()?;
|
||||
|
||||
@@ -46,7 +46,7 @@ use super::segment_manager::{get_mergeable_segments, SegmentManager};
|
||||
///
|
||||
/// This method is not part of tantivy's public API
|
||||
pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> {
|
||||
save_metas(vec![], schema, opstamp, None, directory)
|
||||
save_metas(vec![], schema, opstamp, directory)
|
||||
}
|
||||
|
||||
/// Save the index meta file.
|
||||
@@ -62,14 +62,12 @@ pub fn save_metas(
|
||||
segment_metas: Vec<SegmentMeta>,
|
||||
schema: Schema,
|
||||
opstamp: u64,
|
||||
payload: Option<String>,
|
||||
directory: &mut Directory,
|
||||
) -> Result<()> {
|
||||
let metas = IndexMeta {
|
||||
segments: segment_metas,
|
||||
schema,
|
||||
opstamp,
|
||||
payload,
|
||||
};
|
||||
let mut buffer = serde_json::to_vec_pretty(&metas)?;
|
||||
write!(&mut buffer, "\n")?;
|
||||
@@ -224,7 +222,7 @@ impl SegmentUpdater {
|
||||
self.0.killed.store(true, Ordering::Release);
|
||||
}
|
||||
|
||||
pub fn is_alive(&self) -> bool {
|
||||
fn is_alive(&self) -> bool {
|
||||
!self.0.killed.load(Ordering::Acquire)
|
||||
}
|
||||
|
||||
@@ -241,7 +239,7 @@ impl SegmentUpdater {
|
||||
Ok(segment_entries)
|
||||
}
|
||||
|
||||
pub fn save_metas(&self, opstamp: u64, commit_message: Option<String>) {
|
||||
pub fn save_metas(&self, opstamp: u64) {
|
||||
if self.is_alive() {
|
||||
let index = &self.0.index;
|
||||
let directory = index.directory();
|
||||
@@ -249,7 +247,6 @@ impl SegmentUpdater {
|
||||
self.0.segment_manager.committed_segment_metas(),
|
||||
index.schema(),
|
||||
opstamp,
|
||||
commit_message,
|
||||
directory.box_clone().borrow_mut(),
|
||||
).expect("Could not save metas.");
|
||||
}
|
||||
@@ -269,14 +266,14 @@ impl SegmentUpdater {
|
||||
.garbage_collect(|| self.0.segment_manager.list_files());
|
||||
}
|
||||
|
||||
pub fn commit(&self, opstamp: u64, payload: Option<String>) -> Result<()> {
|
||||
pub fn commit(&self, opstamp: u64) -> Result<()> {
|
||||
self.run_async(move |segment_updater| {
|
||||
if segment_updater.is_alive() {
|
||||
let segment_entries = segment_updater
|
||||
.purge_deletes(opstamp)
|
||||
.expect("Failed purge deletes");
|
||||
segment_updater.0.segment_manager.commit(segment_entries);
|
||||
segment_updater.save_metas(opstamp, payload);
|
||||
segment_updater.save_metas(opstamp);
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
segment_updater.consider_merge_options();
|
||||
}
|
||||
@@ -385,12 +382,7 @@ impl SegmentUpdater {
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||
let mut _file_protection_opt = None;
|
||||
if let Some(delete_operation) = delete_cursor.get() {
|
||||
let committed_opstamp = segment_updater
|
||||
.0
|
||||
.index
|
||||
.load_metas()
|
||||
.expect("Failed to read opstamp")
|
||||
.opstamp;
|
||||
let committed_opstamp = segment_updater.0.index.opstamp();
|
||||
if delete_operation.opstamp < committed_opstamp {
|
||||
let index = &segment_updater.0.index;
|
||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||
@@ -426,8 +418,7 @@ impl SegmentUpdater {
|
||||
.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
info!("save metas");
|
||||
let previous_metas = segment_updater.0.index.load_metas().unwrap();
|
||||
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload);
|
||||
segment_updater.save_metas(segment_updater.0.index.opstamp());
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
}).wait()
|
||||
}
|
||||
|
||||
@@ -1,23 +1,20 @@
|
||||
use Result;
|
||||
use DocId;
|
||||
use std::io;
|
||||
use std::str;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use core::Segment;
|
||||
use core::SerializableSegment;
|
||||
use fastfield::FastFieldsWriter;
|
||||
use schema::Field;
|
||||
use schema::FieldValue;
|
||||
use schema::FieldType;
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use std::collections::HashMap;
|
||||
use datastruct::stacker::Heap;
|
||||
use indexer::index_writer::MARGIN_IN_BYTES;
|
||||
use super::operation::AddOperation;
|
||||
use postings::MultiFieldPostingsWriter;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::FacetTokenizer;
|
||||
use tokenizer::{TokenStream, Tokenizer};
|
||||
use schema::Value;
|
||||
|
||||
/// A `SegmentWriter` is in charge of creating segment index from a
|
||||
@@ -126,69 +123,36 @@ impl<'a> SegmentWriter<'a> {
|
||||
/// Indexes a new document
|
||||
///
|
||||
/// As a user, you should rather use `IndexWriter`'s add_document.
|
||||
pub fn add_document(&mut self, add_operation: AddOperation, schema: &Schema) -> io::Result<()> {
|
||||
pub fn add_document(
|
||||
&mut self,
|
||||
add_operation: &AddOperation,
|
||||
schema: &Schema,
|
||||
) -> io::Result<()> {
|
||||
let doc_id = self.max_doc;
|
||||
let mut doc = add_operation.document;
|
||||
let doc = &add_operation.document;
|
||||
self.doc_opstamps.push(add_operation.opstamp);
|
||||
|
||||
self.fast_field_writers.add_document(&doc);
|
||||
|
||||
for (field, field_values) in doc.get_sorted_field_values() {
|
||||
let field_options = schema.get_field_entry(field);
|
||||
if !field_options.is_indexed() {
|
||||
continue;
|
||||
}
|
||||
match *field_options.field_type() {
|
||||
FieldType::HierarchicalFacet => {
|
||||
let facets: Vec<&[u8]> = field_values
|
||||
.iter()
|
||||
.flat_map(|field_value| match *field_value.value() {
|
||||
Value::Facet(ref facet) => Some(facet.encoded_bytes()),
|
||||
_ => {
|
||||
panic!("Expected hierarchical facet");
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
term.set_field(field);
|
||||
for facet_bytes in facets {
|
||||
let mut unordered_term_id_opt = None;
|
||||
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
|
||||
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
|
||||
term.set_text(&token.text);
|
||||
let unordered_term_id =
|
||||
self.multifield_postings.subscribe(doc_id, &term);
|
||||
unordered_term_id_opt = Some(unordered_term_id);
|
||||
});
|
||||
if let Some(unordered_term_id) = unordered_term_id_opt {
|
||||
self.fast_field_writers
|
||||
.get_multivalue_writer(field)
|
||||
.expect("multified writer for facet missing")
|
||||
.add_val(unordered_term_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) => {
|
||||
let num_tokens = if let Some(ref mut tokenizer) =
|
||||
self.tokenizers[field.0 as usize]
|
||||
{
|
||||
let texts: Vec<&str> = field_values
|
||||
.iter()
|
||||
.flat_map(|field_value| match *field_value.value() {
|
||||
Value::Str(ref text) => Some(text.as_str()),
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
if texts.is_empty() {
|
||||
0
|
||||
} else {
|
||||
let num_tokens =
|
||||
if let Some(ref mut tokenizer) = self.tokenizers[field.0 as usize] {
|
||||
let texts: Vec<&str> = field_values
|
||||
.iter()
|
||||
.flat_map(|field_value| match *field_value.value() {
|
||||
Value::Str(ref text) => Some(text.as_str()),
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
|
||||
self.multifield_postings
|
||||
.index_text(doc_id, field, &mut token_stream)
|
||||
}
|
||||
} else {
|
||||
0
|
||||
};
|
||||
} else {
|
||||
0
|
||||
};
|
||||
self.fieldnorms_writer
|
||||
.get_field_writer(field)
|
||||
.map(|field_norms_writer| {
|
||||
@@ -220,9 +184,13 @@ impl<'a> SegmentWriter<'a> {
|
||||
}
|
||||
}
|
||||
self.fieldnorms_writer.fill_val_up_to(doc_id);
|
||||
doc.filter_fields(|field| schema.get_field_entry(field).is_stored());
|
||||
self.fast_field_writers.add_document(doc);
|
||||
let stored_fieldvalues: Vec<&FieldValue> = doc.field_values()
|
||||
.iter()
|
||||
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
|
||||
.collect();
|
||||
let doc_writer = self.segment_serializer.get_store_writer();
|
||||
doc_writer.store(&doc)?;
|
||||
doc_writer.store(&stored_fieldvalues)?;
|
||||
self.max_doc += 1;
|
||||
Ok(())
|
||||
}
|
||||
@@ -255,9 +223,9 @@ fn write(
|
||||
fieldnorms_writer: &FastFieldsWriter,
|
||||
mut serializer: SegmentSerializer,
|
||||
) -> Result<()> {
|
||||
let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?;
|
||||
fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?;
|
||||
fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer(), &HashMap::new())?;
|
||||
multifield_postings.serialize(serializer.get_postings_serializer())?;
|
||||
fast_field_writers.serialize(serializer.get_fast_field_serializer())?;
|
||||
fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;
|
||||
serializer.close()?;
|
||||
|
||||
Ok(())
|
||||
|
||||
161  src/lib.rs
@@ -4,15 +4,12 @@
|
||||
#![feature(box_syntax)]
|
||||
#![feature(optin_builtin_traits)]
|
||||
#![feature(conservative_impl_trait)]
|
||||
#![feature(collections_range)]
|
||||
#![feature(integer_atomics)]
|
||||
#![feature(drain_filter)]
|
||||
#![cfg_attr(test, feature(test))]
|
||||
#![cfg_attr(test, feature(iterator_step_by))]
|
||||
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
|
||||
#![allow(unknown_lints)]
|
||||
#![allow(new_without_default)]
|
||||
#![allow(decimal_literal_representation)]
|
||||
#![warn(missing_docs)]
|
||||
|
||||
//! # `tantivy`
|
||||
@@ -20,98 +17,6 @@
|
||||
//! Tantivy is a search engine library.
|
||||
//! Think `Lucene`, but in Rust.
|
||||
//!
|
||||
//! ```rust
|
||||
|
||||
//! # extern crate tempdir;
|
||||
//! #
|
||||
//! #[macro_use]
|
||||
//! extern crate tantivy;
|
||||
//!
|
||||
//! // ...
|
||||
//!
|
||||
//! # use std::path::Path;
|
||||
//! # use tempdir::TempDir;
|
||||
//! # use tantivy::Index;
|
||||
//! # use tantivy::schema::*;
|
||||
//! # use tantivy::collector::TopCollector;
|
||||
//! # use tantivy::query::QueryParser;
|
||||
//! #
|
||||
//! # fn main() {
|
||||
//! # // Let's create a temporary directory for the
|
||||
//! # // sake of this example
|
||||
//! # if let Ok(dir) = TempDir::new("tantivy_example_dir") {
|
||||
//! # run_example(dir.path()).unwrap();
|
||||
//! # dir.close().unwrap();
|
||||
//! # }
|
||||
//! # }
|
||||
//! #
|
||||
//! # fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
//! // First we need to define a schema ...
|
||||
//!
|
||||
//! // `TEXT` means the field should be tokenized and indexed,
|
||||
//! // along with its term frequency and term positions.
|
||||
//! //
|
||||
//! // `STORED` means that the field will also be saved
|
||||
//! // in a compressed, row-oriented key-value store.
|
||||
//! // This store is useful to reconstruct the
|
||||
//! // documents that were selected during the search phase.
|
||||
//! let mut schema_builder = SchemaBuilder::default();
|
||||
//! let title = schema_builder.add_text_field("title", TEXT | STORED);
|
||||
//! let body = schema_builder.add_text_field("body", TEXT);
|
||||
//! let schema = schema_builder.build();
|
||||
//!
|
||||
//! // Indexing documents
|
||||
//!
|
||||
//! let index = Index::create(index_path, schema.clone())?;
|
||||
//!
|
||||
//! // Here we use a buffer of 100MB that will be split
|
||||
//! // between indexing threads.
|
||||
//! let mut index_writer = index.writer(100_000_000)?;
|
||||
//!
|
||||
//! // Let's index one document!
//! index_writer.add_document(doc!(
|
||||
//! title => "The Old Man and the Sea",
|
||||
//! body => "He was an old man who fished alone in a skiff in \
|
||||
//! the Gulf Stream and he had gone eighty-four days \
|
||||
//! now without taking a fish."
|
||||
//! ));
|
||||
//!
|
||||
//! // We need to call .commit() explicitly to force the
|
||||
//! // index_writer to finish processing the documents in the queue,
|
||||
//! // flush the current index to the disk, and advertise
|
||||
//! // the existence of new documents.
|
||||
//! index_writer.commit()?;
|
||||
//!
|
||||
//! // # Searching
|
||||
//!
|
||||
//! index.load_searchers()?;
|
||||
//!
|
||||
//! let searcher = index.searcher();
|
||||
//!
|
||||
//! let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
//!
|
||||
//! // QueryParser may fail if the query is not in the right
|
||||
//! // format. For user facing applications, this can be a problem.
|
||||
//! // A ticket has been opened regarding this problem.
|
||||
//! let query = query_parser.parse_query("sea whale")?;
|
||||
//!
|
||||
//! let mut top_collector = TopCollector::with_limit(10);
|
||||
//! searcher.search(&*query, &mut top_collector)?;
|
||||
//!
|
||||
//! // Our top collector now contains the 10
|
||||
//! // most relevant doc ids...
|
||||
//! let doc_addresses = top_collector.docs();
|
||||
//! for doc_address in doc_addresses {
|
||||
//! let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
//! println!("{}", schema.to_json(&retrieved_doc));
|
||||
//! }
|
||||
//!
|
||||
//! # Ok(())
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//!
|
||||
//!
|
||||
//! A good place for you to get started is to check out
|
||||
//! the example code (
|
||||
//! [literate programming](http://fulmicoton.com/tantivy-examples/simple_search.html) /
|
||||
@@ -140,6 +45,7 @@ extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
extern crate itertools;
|
||||
extern crate lz4;
|
||||
extern crate memmap;
|
||||
extern crate num_cpus;
|
||||
extern crate owning_ref;
|
||||
extern crate regex;
|
||||
@@ -148,13 +54,10 @@ extern crate serde;
|
||||
extern crate serde_json;
|
||||
extern crate stable_deref_trait;
|
||||
extern crate tempdir;
|
||||
extern crate tempfile;
|
||||
extern crate time;
|
||||
extern crate uuid;
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
extern crate matches;
|
||||
extern crate version;
|
||||
|
||||
#[cfg(test)]
|
||||
extern crate env_logger;
|
||||
@@ -172,9 +75,6 @@ extern crate test;
|
||||
|
||||
extern crate tinysegmenter;
|
||||
|
||||
#[macro_use]
|
||||
extern crate downcast;
|
||||
|
||||
#[cfg(test)]
|
||||
mod functional_test;
|
||||
|
||||
@@ -205,9 +105,6 @@ pub mod postings;
|
||||
pub mod schema;
|
||||
pub mod fastfield;
|
||||
|
||||
mod docset;
|
||||
pub use self::docset::{DocSet, SkipResult};
|
||||
|
||||
pub use directory::Directory;
|
||||
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use indexer::IndexWriter;
|
||||
@@ -215,6 +112,7 @@ pub use schema::{Document, Term};
|
||||
pub use core::{InvertedIndexReader, SegmentReader};
|
||||
pub use self::common::TimerTree;
|
||||
|
||||
pub use postings::DocSet;
|
||||
pub use postings::Postings;
|
||||
pub use core::SegmentComponent;
|
||||
|
||||
@@ -224,9 +122,9 @@ pub use common::{i64_to_u64, u64_to_i64};
|
||||
/// whether it was compiled with the simd compression.
|
||||
pub fn version() -> &'static str {
|
||||
if cfg!(feature = "simdcompression") {
|
||||
concat!(env!("CARGO_PKG_VERSION"), "-simd")
|
||||
concat!(version!(), "-simd")
|
||||
} else {
|
||||
concat!(env!("CARGO_PKG_VERSION"), "-nosimd")
|
||||
concat!(version!(), "-nosimd")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -287,12 +185,13 @@ mod tests {
|
||||
use Index;
|
||||
use core::SegmentReader;
|
||||
use query::BooleanQuery;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::*;
|
||||
use docset::DocSet;
|
||||
use DocSet;
|
||||
use IndexWriter;
|
||||
use fastfield::{FastFieldReader, I64FastFieldReader, U64FastFieldReader};
|
||||
use Postings;
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
|
||||
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
@@ -303,20 +202,11 @@ mod tests {
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, 4];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
let between = Range::new(0u32, max_value);
|
||||
(0..n_elems)
|
||||
.map(|_| between.ind_sample(&mut rng))
|
||||
.collect::<Vec<u32>>()
|
||||
}
|
||||
|
||||
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
generate_array_with_seed(n, ratio, 4)
|
||||
}
|
||||
|
||||
pub fn sample_with_seed(n: u32, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
fn sample_with_seed(n: u32, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
(0..n).filter(|_| rng.next_f32() < ratio).collect()
|
||||
@@ -557,7 +447,7 @@ mod tests {
|
||||
{
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
||||
}
|
||||
index_writer.rollback().unwrap();
|
||||
index_writer = index_writer.rollback().unwrap();
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "a"));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
@@ -645,22 +535,6 @@ mod tests {
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_indexedfield_not_in_documents() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let absent_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
assert!(index.load_searchers().is_ok());
|
||||
let searcher = index.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
segment_reader.inverted_index(absent_field); //< should not panic
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_postings2() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
@@ -875,26 +749,31 @@ mod tests {
|
||||
let searcher = index.searcher();
|
||||
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(text_field);
|
||||
let fast_field_reader_res =
|
||||
segment_reader.get_fast_field_reader::<U64FastFieldReader>(text_field);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(stored_int_field);
|
||||
let fast_field_reader_res =
|
||||
segment_reader.get_fast_field_reader::<U64FastFieldReader>(stored_int_field);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(fast_field_signed);
|
||||
let fast_field_reader_res =
|
||||
segment_reader.get_fast_field_reader::<U64FastFieldReader>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
|
||||
let fast_field_reader_res =
|
||||
segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
}
|
||||
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
|
||||
let fast_field_reader_res =
|
||||
segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
|
||||
@@ -54,7 +54,7 @@ macro_rules! doc(
|
||||
($crate::Document::default())
|
||||
}
|
||||
}; // avoids a warning due to the useless `mut`.
|
||||
($($field:expr => $value:expr),*) => {
|
||||
($($field:ident => $value:expr),*) => {
|
||||
{
|
||||
let mut document = $crate::Document::default();
|
||||
$(
|
||||
@@ -63,41 +63,4 @@ macro_rules! doc(
|
||||
document
|
||||
}
|
||||
};
|
||||
// if there is a trailing comma retry with the trailing comma stripped.
|
||||
($($field:expr => $value:expr),+ ,) => {
|
||||
doc!( $( $field => $value ), *);
|
||||
};
|
||||
);
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use schema::{SchemaBuilder, FAST, TEXT};
|
||||
|
||||
#[test]
|
||||
fn test_doc_basic() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let title = schema_builder.add_text_field("title", TEXT);
|
||||
let author = schema_builder.add_text_field("text", TEXT);
|
||||
let likes = schema_builder.add_u64_field("num_u64", FAST);
|
||||
let _schema = schema_builder.build();
|
||||
let _doc = doc!(
|
||||
title => "Life Aquatic",
|
||||
author => "Wes Anderson",
|
||||
likes => 4u64
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_trailing_comma() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let title = schema_builder.add_text_field("title", TEXT);
|
||||
let author = schema_builder.add_text_field("text", TEXT);
|
||||
let likes = schema_builder.add_u64_field("num_u64", FAST);
|
||||
let _schema = schema_builder.build();
|
||||
let _doc = doc!(
|
||||
title => "Life Aquatic",
|
||||
author => "Wes Anderson",
|
||||
likes => 4u64,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@ use DocId;
|
||||
use std::borrow::Borrow;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::cmp::Ordering;
|
||||
use common::BitSet;
|
||||
|
||||
/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
@@ -34,9 +33,6 @@ pub trait DocSet {
|
||||
/// More specifically, if the docset is already positioned on the target,
/// skipping will advance to the next position and return `SkipResult::OverStep`.
///
/// If `.skip_next()` oversteps, then the docset must be positioned correctly
/// on an existing document. In other words, `.doc()` should return the first document
/// greater than the target `DocId`.
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
@@ -83,27 +79,20 @@ pub trait DocSet {
|
||||
/// Returns the current document
|
||||
fn doc(&self) -> DocId;
|
||||
|
||||
/// Advances the cursor to the next document.
/// `None` is returned if the `DocSet`
/// has already been entirely consumed.
fn next(&mut self) -> Option<DocId> {
|
||||
if self.advance() {
|
||||
Some(self.doc())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a best-effort hint of the
|
||||
/// length of the docset.
|
||||
fn size_hint(&self) -> u32;
|
||||
|
||||
/// Appends all docs to a `bitset`.
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
while self.advance() {
|
||||
bitset.insert(self.doc());
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of matching documents.
///
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
fn count(&mut self) -> u32 {
|
||||
let mut count = 0u32;
|
||||
while self.advance() {
|
||||
count += 1u32;
|
||||
}
|
||||
count
|
||||
}
|
||||
fn size_hint(&self) -> usize;
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
@@ -122,18 +111,30 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
unboxed.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
fn size_hint(&self) -> usize {
|
||||
let unboxed: &TDocSet = self.borrow();
|
||||
unboxed.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
fn count(&mut self) -> u32 {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count()
|
||||
impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
|
||||
fn advance(&mut self) -> bool {
|
||||
let unref: &mut TDocSet = *self;
|
||||
unref.advance()
|
||||
}
|
||||
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.append_to_bitset(bitset);
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
let unref: &mut TDocSet = *self;
|
||||
unref.skip_next(target)
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
let unref: &TDocSet = *self;
|
||||
unref.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> usize {
|
||||
let unref: &TDocSet = *self;
|
||||
unref.size_hint()
|
||||
}
|
||||
}
|
||||
131  src/postings/intersection.rs (new file)
@@ -0,0 +1,131 @@
|
||||
use postings::DocSet;
|
||||
use postings::SkipResult;
|
||||
use DocId;
|
||||
|
||||
/// Creates a `DocSet` that iterates through the intersection of two or more `DocSet`s.
pub struct IntersectionDocSet<TDocSet: DocSet> {
|
||||
docsets: Vec<TDocSet>,
|
||||
finished: bool,
|
||||
doc: DocId,
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> From<Vec<TDocSet>> for IntersectionDocSet<TDocSet> {
|
||||
fn from(mut docsets: Vec<TDocSet>) -> IntersectionDocSet<TDocSet> {
|
||||
assert!(docsets.len() >= 2);
|
||||
docsets.sort_by_key(|docset| docset.size_hint());
|
||||
IntersectionDocSet {
|
||||
docsets,
|
||||
finished: false,
|
||||
doc: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> {
|
||||
/// Returns a slice over the underlying `DocSet`s of the intersection.
/// These `DocSet`s are in the same position as the `IntersectionDocSet`,
/// so that the user can access their `docfreq` and `positions`.
pub fn docsets(&self) -> &[TDocSet] {
|
||||
&self.docsets[..]
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
|
||||
fn size_hint(&self) -> usize {
|
||||
self.docsets
|
||||
.iter()
|
||||
.map(|docset| docset.size_hint())
|
||||
.min()
|
||||
.unwrap() // safe as docsets cannot be empty.
|
||||
}
|
||||
|
||||
#[allow(never_loop)]
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.finished {
|
||||
return false;
|
||||
}
|
||||
|
||||
let mut candidate_doc = self.doc;
|
||||
let mut candidate_ord = self.docsets.len();
|
||||
|
||||
'outer: loop {
|
||||
for (ord, docset) in self.docsets.iter_mut().enumerate() {
|
||||
if ord != candidate_ord {
|
||||
// `candidate_ord` is already at the
|
||||
// right position.
|
||||
//
|
||||
// Calling `skip_next` would advance this docset
|
||||
// and miss it.
|
||||
match docset.skip_next(candidate_doc) {
|
||||
SkipResult::Reached => {}
|
||||
SkipResult::OverStep => {
|
||||
// this is not in the intersection,
|
||||
// let's update our candidate.
|
||||
candidate_doc = docset.doc();
|
||||
candidate_ord = ord;
|
||||
continue 'outer;
|
||||
}
|
||||
SkipResult::End => {
|
||||
self.finished = true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.doc = candidate_doc;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use postings::{DocSet, IntersectionDocSet, VecPostings};
|
||||
|
||||
#[test]
|
||||
fn test_intersection() {
|
||||
{
|
||||
let left = VecPostings::from(vec![1, 3, 9]);
|
||||
let right = VecPostings::from(vec![3, 4, 9, 18]);
|
||||
let mut intersection = IntersectionDocSet::from(vec![left, right]);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 3);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 9);
|
||||
assert!(!intersection.advance());
|
||||
}
|
||||
{
|
||||
let a = VecPostings::from(vec![1, 3, 9]);
|
||||
let b = VecPostings::from(vec![3, 4, 9, 18]);
|
||||
let c = VecPostings::from(vec![1, 5, 9, 111]);
|
||||
let mut intersection = IntersectionDocSet::from(vec![a, b, c]);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 9);
|
||||
assert!(!intersection.advance());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersection_zero() {
|
||||
let left = VecPostings::from(vec![0]);
|
||||
let right = VecPostings::from(vec![0]);
|
||||
let mut intersection = IntersectionDocSet::from(vec![left, right]);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersection_empty() {
|
||||
let a = VecPostings::from(vec![1, 3]);
|
||||
let b = VecPostings::from(vec![1, 4]);
|
||||
let c = VecPostings::from(vec![3, 9]);
|
||||
let mut intersection = IntersectionDocSet::from(vec![a, b, c]);
|
||||
assert!(!intersection.advance());
|
||||
}
|
||||
}
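
// Sketch (not part of the diff above): driving an `IntersectionDocSet` by hand,
// following the tests in this file. It assumes the test-module imports
// (`use postings::{DocSet, IntersectionDocSet, VecPostings};`); `VecPostings`
// is a test-only helper.
fn intersection_usage_example() {
    let left = VecPostings::from(vec![1, 3, 9]);
    let right = VecPostings::from(vec![3, 4, 9, 18]);
    let mut intersection = IntersectionDocSet::from(vec![left, right]);
    // `advance()` positions the docset on the next document present in every input.
    while intersection.advance() {
        println!("matching doc: {}", intersection.doc());
    }
}
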
@@ -12,8 +12,12 @@ mod recorder;
mod serializer;
mod postings_writer;
mod term_info;
mod vec_postings;
mod segment_postings;
mod intersection;
mod docset;

pub use self::docset::{DocSet, SkipResult};
use self::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
@@ -21,28 +25,17 @@ pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::term_info::TermInfo;
pub use self::postings::Postings;

pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
#[cfg(test)]
pub use self::vec_postings::VecPostings;

pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
pub use self::intersection::IntersectionDocSet;
pub use common::HasLen;

pub(crate) type UnorderedTermId = u64;

#[allow(enum_variant_names)]
pub(crate) enum FreqReadingOption {
    NoFreq,
    SkipFreq,
    ReadFreq,
}

#[cfg(test)]
pub mod tests {
mod tests {

    use super::*;
    use docset::{DocSet, SkipResult};
    use DocId;
    use Score;
    use query::Intersection;
    use query::Scorer;
    use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
    use core::SegmentComponent;
    use indexer::SegmentWriter;
@@ -51,6 +44,7 @@ pub mod tests {
    use schema::IndexRecordOption;
    use std::iter;
    use datastruct::stacker::Heap;
    use fastfield::FastFieldReader;
    use query::TermQuery;
    use schema::Field;
    use test::{self, Bencher};
@@ -82,71 +76,6 @@ pub mod tests {
|
||||
assert!(read.len() <= 140);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_skip_positions() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let title = schema_builder.add_text_field("title", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 30_000_000).unwrap();
|
||||
index_writer.add_document(doc!(title => r#"abc abc abc"#));
|
||||
index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
|
||||
for _ in 0..1_000 {
|
||||
index_writer.add_document(doc!(title => r#"abc abc abc"#));
|
||||
}
|
||||
index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let query = TermQuery::new(
|
||||
Term::from_field_text(title, "abc"),
|
||||
IndexRecordOption::WithFreqsAndPositions,
|
||||
);
|
||||
let weight = query.specialized_weight(&*searcher, true);
|
||||
{
|
||||
let mut scorer = weight
|
||||
.specialized_scorer(searcher.segment_reader(0u32))
|
||||
.unwrap();
|
||||
scorer.advance();
|
||||
assert_eq!(&[0, 1, 2], scorer.postings().positions());
|
||||
scorer.advance();
|
||||
assert_eq!(&[0, 5], scorer.postings().positions());
|
||||
}
|
||||
{
|
||||
let mut scorer = weight
|
||||
.specialized_scorer(searcher.segment_reader(0u32))
|
||||
.unwrap();
|
||||
scorer.advance();
|
||||
scorer.advance();
|
||||
assert_eq!(&[0, 5], scorer.postings().positions());
|
||||
}
|
||||
{
|
||||
let mut scorer = weight
|
||||
.specialized_scorer(searcher.segment_reader(0u32))
|
||||
.unwrap();
|
||||
assert_eq!(scorer.skip_next(1), SkipResult::Reached);
|
||||
assert_eq!(scorer.doc(), 1);
|
||||
assert_eq!(&[0, 5], scorer.postings().positions());
|
||||
}
|
||||
{
|
||||
let mut scorer = weight
|
||||
.specialized_scorer(searcher.segment_reader(0u32))
|
||||
.unwrap();
|
||||
assert_eq!(scorer.skip_next(1002), SkipResult::Reached);
|
||||
assert_eq!(scorer.doc(), 1002);
|
||||
assert_eq!(&[0, 5], scorer.postings().positions());
|
||||
}
|
||||
{
|
||||
let mut scorer = weight
|
||||
.specialized_scorer(searcher.segment_reader(0u32))
|
||||
.unwrap();
|
||||
assert_eq!(scorer.skip_next(100), SkipResult::Reached);
|
||||
assert_eq!(scorer.skip_next(1002), SkipResult::Reached);
|
||||
assert_eq!(scorer.doc(), 1002);
|
||||
assert_eq!(&[0, 5], scorer.postings().positions());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_position_and_fieldnorm1() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
@@ -168,7 +97,7 @@ pub mod tests {
|
||||
opstamp: 0u64,
|
||||
document: doc,
|
||||
};
|
||||
segment_writer.add_document(op, &schema).unwrap();
|
||||
segment_writer.add_document(&op, &schema).unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
@@ -177,7 +106,7 @@ pub mod tests {
|
||||
opstamp: 1u64,
|
||||
document: doc,
|
||||
};
|
||||
segment_writer.add_document(op, &schema).unwrap();
|
||||
segment_writer.add_document(&op, &schema).unwrap();
|
||||
}
|
||||
for i in 2..1000 {
|
||||
let mut doc = Document::default();
|
||||
@@ -188,7 +117,7 @@ pub mod tests {
|
||||
opstamp: 2u64,
|
||||
document: doc,
|
||||
};
|
||||
segment_writer.add_document(op, &schema).unwrap();
|
||||
segment_writer.add_document(&op, &schema).unwrap();
|
||||
}
|
||||
segment_writer.finalize().unwrap();
|
||||
}
|
||||
@@ -282,7 +211,7 @@ pub mod tests {
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let searcher = index.searcher();
|
||||
let mut term_weight = term_query.specialized_weight(&*searcher, true);
|
||||
let mut term_weight = term_query.specialized_weight(&*searcher);
|
||||
term_weight.index_record_option = IndexRecordOption::WithFreqsAndPositions;
|
||||
let segment_reader = &searcher.segment_readers()[0];
|
||||
let mut term_scorer = term_weight.specialized_scorer(segment_reader).unwrap();
|
||||
@@ -551,7 +480,7 @@ pub mod tests {
|
||||
.inverted_index(TERM_D.field())
|
||||
.read_postings(&*TERM_D, IndexRecordOption::Basic)
|
||||
.unwrap();
|
||||
let mut intersection = Intersection::from(vec![
|
||||
let mut intersection = IntersectionDocSet::from(vec![
|
||||
segment_postings_a,
|
||||
segment_postings_b,
|
||||
segment_postings_c,
|
||||
@@ -633,70 +562,4 @@ pub mod tests {
|
||||
});
|
||||
}
|
||||
|
||||
/// Wraps a given docset, and forward alls call but the
|
||||
/// `.skip_next(...)`. This is useful to test that a specialized
|
||||
/// implementation of `.skip_next(...)` is consistent
|
||||
/// with the default implementation.
|
||||
pub(crate) struct UnoptimizedDocSet<TDocSet: DocSet>(TDocSet);
|
||||
|
||||
impl<TDocSet: DocSet> UnoptimizedDocSet<TDocSet> {
|
||||
pub fn wrap(docset: TDocSet) -> UnoptimizedDocSet<TDocSet> {
|
||||
UnoptimizedDocSet(docset)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for UnoptimizedDocSet<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.0.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.0.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.0.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> Scorer for UnoptimizedDocSet<TScorer> {
|
||||
fn score(&mut self) -> Score {
|
||||
self.0.score()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn test_skip_against_unoptimized<F: Fn() -> Box<DocSet>>(
|
||||
postings_factory: F,
|
||||
targets: Vec<u32>,
|
||||
) {
|
||||
for target in targets {
|
||||
let mut postings_opt = postings_factory();
|
||||
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
|
||||
let skip_result_opt = postings_opt.skip_next(target);
|
||||
let skip_result_unopt = postings_unopt.skip_next(target);
|
||||
assert_eq!(
|
||||
skip_result_unopt, skip_result_opt,
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
match skip_result_opt {
|
||||
SkipResult::Reached => assert_eq!(postings_opt.doc(), target),
|
||||
SkipResult::OverStep => assert!(postings_opt.doc() > target),
|
||||
SkipResult::End => {
|
||||
return;
|
||||
}
|
||||
}
|
||||
while postings_opt.advance() {
|
||||
assert!(postings_unopt.advance());
|
||||
assert_eq!(
|
||||
postings_opt.doc(),
|
||||
postings_unopt.doc(),
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
}
|
||||
assert!(!postings_unopt.advance());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
use docset::DocSet;
use std::borrow::Borrow;
use postings::docset::DocSet;

/// Postings (also called inverted list)
///
@@ -17,3 +18,27 @@ pub trait Postings: DocSet {
    /// token ordinals.
    fn positions(&self) -> &[u32];
}

impl<TPostings: Postings> Postings for Box<TPostings> {
    fn term_freq(&self) -> u32 {
        let unboxed: &TPostings = self.borrow();
        unboxed.term_freq()
    }

    fn positions(&self) -> &[u32] {
        let unboxed: &TPostings = self.borrow();
        unboxed.positions()
    }
}

impl<'a, TPostings: Postings> Postings for &'a mut TPostings {
    fn term_freq(&self) -> u32 {
        let unref: &TPostings = *self;
        unref.term_freq()
    }

    fn positions(&self) -> &[u32] {
        let unref: &TPostings = *self;
        unref.positions()
    }
}
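The two blanket impls above only forward `term_freq` and `positions`, so code that is generic over `Postings` keeps working when it is handed a `Box<TPostings>` or a `&mut TPostings` instead of the concrete type. A minimal sketch of the same forwarding pattern, using a toy trait rather than tantivy's:

```rust
// Toy trait standing in for `Postings`; the point is only the forwarding impls.
trait TermFreq {
    fn term_freq(&self) -> u32;
}

struct Constant(u32);

impl TermFreq for Constant {
    fn term_freq(&self) -> u32 {
        self.0
    }
}

// Forward through a Box, like `Postings for Box<TPostings>` above.
impl<T: TermFreq> TermFreq for Box<T> {
    fn term_freq(&self) -> u32 {
        (**self).term_freq()
    }
}

// Forward through a mutable reference, like `Postings for &'a mut TPostings`.
impl<'a, T: TermFreq> TermFreq for &'a mut T {
    fn term_freq(&self) -> u32 {
        (**self).term_freq()
    }
}

fn print_term_freq<P: TermFreq>(postings: &P) {
    println!("term_freq = {}", postings.term_freq());
}

fn main() {
    let mut plain = Constant(3);
    print_term_freq(&plain);                 // concrete value
    print_term_freq(&Box::new(Constant(3))); // boxed value
    print_term_freq(&&mut plain);            // mutable reference
}
```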
@@ -2,21 +2,18 @@ use DocId;
|
||||
use schema::Term;
|
||||
use postings::{FieldSerializer, InvertedIndexSerializer};
|
||||
use std::io;
|
||||
use std::collections::HashMap;
|
||||
use postings::Recorder;
|
||||
use Result;
|
||||
use schema::{Field, Schema};
|
||||
use tokenizer::Token;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::DerefMut;
|
||||
use datastruct::stacker::{Heap, TermHashMap};
|
||||
use datastruct::stacker::{HashMap, Heap};
|
||||
use postings::{NothingRecorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use tokenizer::Token;
|
||||
use tokenizer::TokenStream;
|
||||
use schema::IndexRecordOption;
|
||||
use postings::UnorderedTermId;
|
||||
use termdict::TermOrdinal;
|
||||
|
||||
fn posting_from_field_entry<'a>(
|
||||
field_entry: &FieldEntry,
|
||||
@@ -37,7 +34,7 @@ fn posting_from_field_entry<'a>(
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
|
||||
FieldType::U64(_) | FieldType::I64(_) => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
}
|
||||
@@ -45,8 +42,7 @@ fn posting_from_field_entry<'a>(
|
||||
|
||||
pub struct MultiFieldPostingsWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
schema: Schema,
|
||||
term_index: TermHashMap<'a>,
|
||||
term_index: HashMap<'a>,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
|
||||
}
|
||||
|
||||
@@ -54,14 +50,14 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
/// Create a new `MultiFieldPostingsWriter` given
|
||||
/// a schema and a heap.
|
||||
pub fn new(schema: &Schema, table_bits: usize, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
|
||||
let term_index = TermHashMap::new(table_bits, heap);
|
||||
let term_index = HashMap::new(table_bits, heap);
|
||||
let per_field_postings_writers: Vec<_> = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field_entry| posting_from_field_entry(field_entry, heap))
|
||||
.collect();
|
||||
|
||||
MultiFieldPostingsWriter {
|
||||
schema: schema.clone(),
|
||||
heap,
|
||||
term_index,
|
||||
per_field_postings_writers,
|
||||
@@ -73,32 +69,26 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
postings_writer.index_text(&mut self.term_index, doc, field, token_stream, self.heap)
|
||||
}
|
||||
|
||||
pub fn subscribe(&mut self, doc: DocId, term: &Term) -> UnorderedTermId {
|
||||
pub fn subscribe(&mut self, doc: DocId, term: &Term) {
|
||||
let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
|
||||
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, self.heap)
|
||||
postings_writer.suscribe(&mut self.term_index, doc, 0u32, term, self.heap)
|
||||
}
|
||||
|
||||
/// Serialize the inverted index.
|
||||
/// It pushes all term, one field at a time, towards the
|
||||
/// postings serializer.
|
||||
#[allow(needless_range_loop)]
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
|
||||
term_offsets.sort_by_key(|&(k, _, _)| k);
|
||||
pub fn serialize(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
|
||||
let mut term_offsets: Vec<(&[u8], u32)> = self.term_index.iter().collect();
|
||||
term_offsets.sort_by_key(|&(k, _v)| k);
|
||||
|
||||
let mut offsets: Vec<(Field, usize)> = vec![];
|
||||
let term_offsets_it = term_offsets
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|(key, _, _)| Term::wrap(key).field())
|
||||
.map(|(key, _)| Term::wrap(key).field())
|
||||
.enumerate();
|
||||
|
||||
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
|
||||
HashMap::new();
|
||||
|
||||
let mut prev_field = Field(u32::max_value());
|
||||
for (offset, field) in term_offsets_it {
|
||||
if field != prev_field {
|
||||
@@ -107,29 +97,9 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
}
|
||||
}
|
||||
offsets.push((Field(0), term_offsets.len()));
|
||||
|
||||
for i in 0..(offsets.len() - 1) {
|
||||
let (field, start) = offsets[i];
|
||||
let (_, stop) = offsets[i + 1];
|
||||
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
|
||||
match field_entry.field_type() {
|
||||
FieldType::Str(_) | FieldType::HierarchicalFacet => {
|
||||
// populating the (unordered term ord) -> (ordered term ord) mapping
|
||||
// for the field.
|
||||
let mut unordered_term_ids = term_offsets[start..stop]
|
||||
.iter()
|
||||
.map(|&(_, _, bucket)| bucket);
|
||||
let mut mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
|
||||
.enumerate()
|
||||
.map(|(term_ord, unord_term_id)| (unord_term_id as UnorderedTermId, term_ord as TermOrdinal))
|
||||
.collect();
|
||||
unordered_term_mappings.insert(field, mapping);
|
||||
}
|
||||
FieldType::U64(_) | FieldType::I64(_) => {}
|
||||
}
|
||||
|
||||
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
|
||||
let mut field_serializer = serializer.new_field(field)?;
|
||||
postings_writer.serialize(
|
||||
@@ -139,7 +109,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
)?;
|
||||
field_serializer.close()?;
|
||||
}
|
||||
Ok(unordered_term_mappings)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return true iff the term dictionary is saturated.
|
||||
@@ -160,28 +130,28 @@ pub trait PostingsWriter {
|
||||
/// * term - the term
|
||||
/// * heap - heap used to store the postings informations as well as the terms
|
||||
/// in the hashmap.
|
||||
fn subscribe(
|
||||
fn suscribe(
|
||||
&mut self,
|
||||
term_index: &mut TermHashMap,
|
||||
term_index: &mut HashMap,
|
||||
doc: DocId,
|
||||
pos: u32,
|
||||
term: &Term,
|
||||
heap: &Heap,
|
||||
) -> UnorderedTermId;
|
||||
);
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], u32, UnorderedTermId)],
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()>;
|
||||
|
||||
/// Tokenize a text and subscribe all of its token.
|
||||
/// Tokenize a text and suscribe all of its token.
|
||||
fn index_text(
|
||||
&mut self,
|
||||
term_index: &mut TermHashMap,
|
||||
term_index: &mut HashMap,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
token_stream: &mut TokenStream,
|
||||
@@ -191,8 +161,9 @@ pub trait PostingsWriter {
|
||||
term.set_field(field);
|
||||
let mut sink = |token: &Token| {
|
||||
term.set_text(token.text.as_str());
|
||||
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
|
||||
self.suscribe(term_index, doc_id, token.position as u32, &term, heap);
|
||||
};
|
||||
|
||||
token_stream.process(&mut sink)
|
||||
}
|
||||
}
|
||||
@@ -220,16 +191,16 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
|
||||
fn subscribe(
|
||||
fn suscribe(
|
||||
&mut self,
|
||||
term_index: &mut TermHashMap,
|
||||
term_index: &mut HashMap,
|
||||
doc: DocId,
|
||||
position: u32,
|
||||
term: &Term,
|
||||
heap: &Heap,
|
||||
) -> UnorderedTermId {
|
||||
) {
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
let (term_ord, recorder): (UnorderedTermId, &mut Rec) = term_index.get_or_create(term);
|
||||
let recorder: &mut Rec = term_index.get_or_create(term);
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
if current_doc != u32::max_value() {
|
||||
@@ -238,18 +209,17 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
|
||||
recorder.new_doc(doc, heap);
|
||||
}
|
||||
recorder.record_position(position, heap);
|
||||
term_ord
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], u32, UnorderedTermId)],
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
for &(term_bytes, addr, _) in term_addrs {
|
||||
for &(term_bytes, addr) in term_addrs {
|
||||
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
|
||||
serializer.new_term(&term_bytes[4..])?;
|
||||
serializer.new_term(term_bytes)?;
|
||||
recorder.serialize(addr, serializer, heap)?;
|
||||
serializer.close_term()?;
|
||||
}
|
||||
|
||||
@@ -1,18 +1,11 @@
|
||||
use compression::{BlockDecoder, CompressedIntStream, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
||||
use DocId;
|
||||
|
||||
use common::BitSet;
|
||||
use common::HasLen;
|
||||
use postings::Postings;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use postings::{DocSet, HasLen, Postings, SkipResult};
|
||||
use std::cmp;
|
||||
use fst::Streamer;
|
||||
use compression::compressed_block_size;
|
||||
use fastfield::DeleteBitSet;
|
||||
use std::cell::UnsafeCell;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use postings::FreqReadingOption;
|
||||
use postings::serializer::PostingsSerializer;
|
||||
|
||||
const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
|
||||
|
||||
@@ -47,6 +40,7 @@ impl PositionComputer {
|
||||
pub fn positions(&mut self, term_freq: usize) -> &[u32] {
|
||||
if let Some(num_skip) = self.position_to_skip {
|
||||
self.positions.resize(term_freq, 0u32);
|
||||
|
||||
self.positions_stream.skip(num_skip);
|
||||
self.positions_stream.read(&mut self.positions[..term_freq]);
|
||||
|
||||
@@ -74,32 +68,6 @@ pub struct SegmentPostings {
|
||||
}
|
||||
|
||||
impl SegmentPostings {
|
||||
/// Creates a segment postings object with the given documents
|
||||
/// and no frequency encoded.
|
||||
///
|
||||
/// This method is mostly useful for unit tests.
|
||||
///
|
||||
/// It serializes the doc ids using tantivy's codec
|
||||
/// and returns a `SegmentPostings` object that embeds a
|
||||
/// buffer with the serialized data.
|
||||
pub fn create_from_docs(docs: &[u32]) -> SegmentPostings {
|
||||
let mut buffer = Vec::new();
|
||||
{
|
||||
let mut postings_serializer = PostingsSerializer::new(&mut buffer, false);
|
||||
for &doc in docs {
|
||||
postings_serializer.write_doc(doc, 1u32).unwrap();
|
||||
}
|
||||
postings_serializer.close_term().unwrap();
|
||||
}
|
||||
let data = ReadOnlySource::from(buffer);
|
||||
let block_segment_postings = BlockSegmentPostings::from_data(
|
||||
docs.len(),
|
||||
SourceRead::from(data),
|
||||
FreqReadingOption::NoFreq,
|
||||
);
|
||||
SegmentPostings::from_block_postings(block_segment_postings, DeleteBitSet::empty(), None)
|
||||
}
|
||||
|
||||
/// Reads a Segment postings from an &[u8]
|
||||
///
|
||||
/// * `len` - number of document in the posting lists.
|
||||
@@ -148,7 +116,6 @@ impl DocSet for SegmentPostings {
|
||||
#[inline]
|
||||
fn advance(&mut self) -> bool {
|
||||
loop {
|
||||
self.position_add_skip(|| self.term_freq() as usize);
|
||||
self.cur += 1;
|
||||
if self.cur >= self.block_cursor.block_len() {
|
||||
self.cur = 0;
|
||||
@@ -157,6 +124,7 @@ impl DocSet for SegmentPostings {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
self.position_add_skip(|| self.term_freq() as usize);
|
||||
if !self.delete_bitset.is_deleted(self.doc()) {
|
||||
return true;
|
||||
}
|
||||
@@ -268,8 +236,8 @@ impl DocSet for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.len() as u32
|
||||
fn size_hint(&self) -> usize {
|
||||
self.len()
|
||||
}
|
||||
|
||||
/// Return the current document's `DocId`.
|
||||
@@ -282,21 +250,6 @@ impl DocSet for SegmentPostings {
|
||||
);
|
||||
docs[self.cur]
|
||||
}
|
||||
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
// finish the current block
|
||||
if self.advance() {
|
||||
for &doc in &self.block_cursor.docs()[self.cur..] {
|
||||
bitset.insert(doc);
|
||||
}
|
||||
// ... iterate through the remaining blocks.
|
||||
while self.block_cursor.advance() {
|
||||
for &doc in self.block_cursor.docs() {
|
||||
bitset.insert(doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl HasLen for SegmentPostings {
|
||||
@@ -331,7 +284,7 @@ impl Postings for SegmentPostings {
|
||||
pub struct BlockSegmentPostings {
|
||||
doc_decoder: BlockDecoder,
|
||||
freq_decoder: BlockDecoder,
|
||||
freq_reading_option: FreqReadingOption,
|
||||
has_freq: bool,
|
||||
|
||||
doc_freq: usize,
|
||||
doc_offset: DocId,
|
||||
@@ -344,7 +297,7 @@ impl BlockSegmentPostings {
|
||||
pub(crate) fn from_data(
|
||||
doc_freq: usize,
|
||||
data: SourceRead,
|
||||
freq_reading_option: FreqReadingOption,
|
||||
has_freq: bool,
|
||||
) -> BlockSegmentPostings {
|
||||
let num_bitpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE;
|
||||
let num_vint_docs = (doc_freq as usize) - COMPRESSION_BLOCK_SIZE * num_bitpacked_blocks;
|
||||
@@ -353,7 +306,7 @@ impl BlockSegmentPostings {
|
||||
num_vint_docs,
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option,
|
||||
has_freq,
|
||||
remaining_data: data,
|
||||
doc_offset: 0,
|
||||
doc_freq,
|
||||
@@ -433,17 +386,11 @@ impl BlockSegmentPostings {
|
||||
let num_consumed_bytes = self.doc_decoder
|
||||
.uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq => {}
|
||||
FreqReadingOption::SkipFreq => {
|
||||
let num_bytes_to_skip = compressed_block_size(self.remaining_data.as_ref()[0]);
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref());
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
|
||||
if self.has_freq {
|
||||
let num_consumed_bytes = self.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref());
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
// it will be used as the next offset.
|
||||
self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
|
||||
@@ -456,12 +403,9 @@ impl BlockSegmentPostings {
|
||||
self.num_vint_docs,
|
||||
);
|
||||
self.remaining_data.advance(num_compressed_bytes);
|
||||
match self.freq_reading_option {
|
||||
FreqReadingOption::NoFreq | FreqReadingOption::SkipFreq => {}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
self.freq_decoder
|
||||
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
|
||||
}
|
||||
if self.has_freq {
|
||||
self.freq_decoder
|
||||
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
|
||||
}
|
||||
self.num_vint_docs = 0;
|
||||
true
|
||||
@@ -478,7 +422,7 @@ impl BlockSegmentPostings {
|
||||
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option: FreqReadingOption::NoFreq,
|
||||
has_freq: false,
|
||||
|
||||
remaining_data: From::from(ReadOnlySource::empty()),
|
||||
doc_offset: 0,
|
||||
@@ -502,7 +446,7 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use docset::DocSet;
|
||||
use DocSet;
|
||||
use super::SegmentPostings;
|
||||
use schema::SchemaBuilder;
|
||||
use core::Index;
|
||||
|
||||
@@ -13,7 +13,7 @@ use std::io::{self, Write};
|
||||
use compression::VIntEncoder;
|
||||
use common::CountingWriter;
|
||||
use common::CompositeWrite;
|
||||
use termdict::{TermOrdinal, TermDictionaryBuilder};
|
||||
use termdict::TermDictionaryBuilder;
|
||||
|
||||
/// `PostingsSerializer` is in charge of serializing
|
||||
/// postings on disk, in the
|
||||
@@ -114,7 +114,6 @@ pub struct FieldSerializer<'a> {
|
||||
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
|
||||
current_term_info: TermInfo,
|
||||
term_open: bool,
|
||||
num_terms: TermOrdinal,
|
||||
}
|
||||
|
||||
impl<'a> FieldSerializer<'a> {
|
||||
@@ -153,7 +152,6 @@ impl<'a> FieldSerializer<'a> {
|
||||
positions_serializer_opt,
|
||||
current_term_info: TermInfo::default(),
|
||||
term_open: false,
|
||||
num_terms: TermOrdinal::default(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -174,7 +172,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
/// * term - the term. It needs to come after the previous term according
|
||||
/// to the lexicographical order.
|
||||
/// * doc_freq - return the number of document containing the term.
|
||||
pub fn new_term(&mut self, term: &[u8]) -> io::Result<TermOrdinal> {
|
||||
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
|
||||
assert!(
|
||||
!self.term_open,
|
||||
"Called new_term, while the previous term was not closed."
|
||||
@@ -182,10 +180,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
self.term_open = true;
|
||||
self.postings_serializer.clear();
|
||||
self.current_term_info = self.current_term_info();
|
||||
self.term_dictionary_builder.insert_key(term)?;
|
||||
let term_ordinal = self.num_terms;
|
||||
self.num_terms += 1;
|
||||
Ok(term_ordinal)
|
||||
self.term_dictionary_builder.insert_key(term)
|
||||
}
|
||||
|
||||
/// Serialize the information that a document contains the current term,
|
||||
@@ -237,7 +232,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostingsSerializer<W: Write> {
|
||||
struct PostingsSerializer<W: Write> {
|
||||
postings_write: CountingWriter<W>,
|
||||
last_doc_id_encoded: u32,
|
||||
|
||||
@@ -249,7 +244,7 @@ pub struct PostingsSerializer<W: Write> {
|
||||
}
|
||||
|
||||
impl<W: Write> PostingsSerializer<W> {
|
||||
pub fn new(write: W, termfreq_enabled: bool) -> PostingsSerializer<W> {
|
||||
fn new(write: W, termfreq_enabled: bool) -> PostingsSerializer<W> {
|
||||
PostingsSerializer {
|
||||
postings_write: CountingWriter::wrap(write),
|
||||
|
||||
@@ -262,7 +257,7 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
|
||||
fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
|
||||
self.doc_ids.push(doc_id);
|
||||
if self.termfreq_enabled {
|
||||
self.term_freqs.push(term_freq as u32);
|
||||
@@ -287,7 +282,7 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn close_term(&mut self) -> io::Result<()> {
|
||||
fn close_term(&mut self) -> io::Result<()> {
|
||||
if !self.doc_ids.is_empty() {
|
||||
// we have doc ids waiting to be written
|
||||
// this happens when the number of doc ids is
|
||||
|
||||
@@ -1,4 +1,4 @@
use common::{BinarySerializable, FixedSize};
use common::BinarySerializable;
use std::io;

/// `TermInfo` contains all of the information
@@ -23,15 +23,6 @@ pub struct TermInfo {
    pub positions_inner_offset: u8,
}

impl FixedSize for TermInfo {
    /// Size required for the binary serialization of `TermInfo`.
    /// This is large, but in practise, all `TermInfo` but the first one
    /// of the block are bitpacked.
    ///
    /// See `TermInfoStore`.
    const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES + u8::SIZE_IN_BYTES;
}

impl BinarySerializable for TermInfo {
    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
        self.doc_freq.serialize(writer)?;
@@ -53,15 +44,3 @@ impl BinarySerializable for TermInfo {
        })
    }
}

#[cfg(test)]
mod tests {

    use super::TermInfo;
    use common::test::fixed_size_test;

    #[test]
    fn test_fixed_size() {
        fixed_size_test::<TermInfo>();
    }
}
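The `FixedSize` impl above is plain arithmetic over the struct's fixed-width fields. Assuming the primitive `SIZE_IN_BYTES` constants match `std::mem::size_of` for `u32`, `u64` and `u8` (an assumption about that trait, not something shown in this hunk), the constant works out to `4 + 2 * 8 + 1 = 21` bytes per uncompressed `TermInfo`. A quick standalone check of that arithmetic:

```rust
use std::mem::size_of;

fn main() {
    // Mirrors `u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES + u8::SIZE_IN_BYTES`
    // under the assumption that each constant equals the primitive's width.
    let term_info_bytes = size_of::<u32>() + 2 * size_of::<u64>() + size_of::<u8>();
    assert_eq!(term_info_bytes, 21);
    println!("uncompressed TermInfo: {} bytes", term_info_bytes);
}
```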
@@ -1,8 +1,7 @@
|
||||
#![allow(dead_code)]
|
||||
|
||||
use DocId;
|
||||
use docset::DocSet;
|
||||
use common::HasLen;
|
||||
use postings::{DocSet, HasLen, Postings};
|
||||
use std::num::Wrapping;
|
||||
|
||||
const EMPTY_ARRAY: [u32; 0] = [];
|
||||
@@ -12,21 +11,21 @@ const EMPTY_ARRAY: [u32; 0] = [];
|
||||
///
|
||||
/// Term frequencies always return 1.
|
||||
/// No positions are returned.
|
||||
pub struct VecDocSet {
|
||||
pub struct VecPostings {
|
||||
doc_ids: Vec<DocId>,
|
||||
cursor: Wrapping<usize>,
|
||||
}
|
||||
|
||||
impl From<Vec<DocId>> for VecDocSet {
|
||||
fn from(doc_ids: Vec<DocId>) -> VecDocSet {
|
||||
VecDocSet {
|
||||
impl From<Vec<DocId>> for VecPostings {
|
||||
fn from(doc_ids: Vec<DocId>) -> VecPostings {
|
||||
VecPostings {
|
||||
doc_ids,
|
||||
cursor: Wrapping(usize::max_value()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for VecDocSet {
|
||||
impl DocSet for VecPostings {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.cursor += Wrapping(1);
|
||||
self.doc_ids.len() > self.cursor.0
|
||||
@@ -36,32 +35,43 @@ impl DocSet for VecDocSet {
|
||||
self.doc_ids[self.cursor.0]
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.len() as u32
|
||||
fn size_hint(&self) -> usize {
|
||||
self.len()
|
||||
}
|
||||
}
|
||||
|
||||
impl HasLen for VecDocSet {
|
||||
impl HasLen for VecPostings {
|
||||
fn len(&self) -> usize {
|
||||
self.doc_ids.len()
|
||||
}
|
||||
}
|
||||
|
||||
impl Postings for VecPostings {
|
||||
fn term_freq(&self) -> u32 {
|
||||
1u32
|
||||
}
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
&EMPTY_ARRAY
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use DocId;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use postings::{DocSet, Postings, SkipResult};
|
||||
|
||||
#[test]
|
||||
pub fn test_vec_postings() {
|
||||
let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e * 3).collect();
|
||||
let mut postings = VecDocSet::from(doc_ids);
|
||||
let mut postings = VecPostings::from(doc_ids);
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0u32);
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3u32);
|
||||
assert_eq!(postings.term_freq(), 1u32);
|
||||
assert_eq!(postings.skip_next(14u32), SkipResult::OverStep);
|
||||
assert_eq!(postings.doc(), 15u32);
|
||||
assert_eq!(postings.skip_next(300u32), SkipResult::Reached);
|
||||
@@ -69,20 +79,4 @@ pub mod tests {
|
||||
assert_eq!(postings.skip_next(6000u32), SkipResult::End);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_fill_buffer() {
|
||||
let doc_ids: Vec<DocId> = (1u32..210u32).collect();
|
||||
let mut postings = VecDocSet::from(doc_ids);
|
||||
let mut buffer = vec![1000u32; 100];
|
||||
assert_eq!(postings.fill_buffer(&mut buffer[..]), 100);
|
||||
for i in 0u32..100u32 {
|
||||
assert_eq!(buffer[i as usize], i + 1);
|
||||
}
|
||||
assert_eq!(postings.fill_buffer(&mut buffer[..]), 100);
|
||||
for i in 0u32..100u32 {
|
||||
assert_eq!(buffer[i as usize], i + 101);
|
||||
}
|
||||
assert_eq!(postings.fill_buffer(&mut buffer[..]), 9);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,91 +0,0 @@
|
||||
use query::Query;
|
||||
use query::Weight;
|
||||
use query::Scorer;
|
||||
use core::SegmentReader;
|
||||
use docset::DocSet;
|
||||
use Result;
|
||||
use Score;
|
||||
use DocId;
|
||||
use core::Searcher;
|
||||
use fastfield::DeleteBitSet;
|
||||
|
||||
/// Query that matches all of the documents.
|
||||
///
|
||||
/// All of the document get the score 1f32.
|
||||
#[derive(Debug)]
|
||||
pub struct AllQuery;
|
||||
|
||||
impl Query for AllQuery {
|
||||
fn weight(&self, _: &Searcher, _: bool) -> Result<Box<Weight>> {
|
||||
Ok(box AllWeight)
|
||||
}
|
||||
}
|
||||
|
||||
/// Weight associated to the `AllQuery` query.
|
||||
pub struct AllWeight;
|
||||
|
||||
impl Weight for AllWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
Ok(box AllScorer {
|
||||
state: State::NotStarted,
|
||||
doc: 0u32,
|
||||
max_doc: reader.max_doc(),
|
||||
deleted_bitset: reader.delete_bitset().clone()
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
enum State {
|
||||
NotStarted,
|
||||
Started,
|
||||
Finished
|
||||
}
|
||||
|
||||
/// Scorer associated to the `AllQuery` query.
|
||||
pub struct AllScorer {
|
||||
state: State,
|
||||
doc: DocId,
|
||||
max_doc: DocId,
|
||||
deleted_bitset: DeleteBitSet
|
||||
}
|
||||
|
||||
impl DocSet for AllScorer {
|
||||
fn advance(&mut self) -> bool {
|
||||
loop {
|
||||
match self.state {
|
||||
State::NotStarted => {
|
||||
self.state = State::Started;
|
||||
self.doc = 0;
|
||||
}
|
||||
State::Started => {
|
||||
self.doc += 1u32;
|
||||
}
|
||||
State::Finished => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if self.doc < self.max_doc {
|
||||
if !self.deleted_bitset.is_deleted(self.doc) {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
self.state = State::Finished;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.max_doc
|
||||
}
|
||||
}
|
||||
|
||||
impl Scorer for AllScorer {
|
||||
fn score(&mut self) -> Score {
|
||||
1f32
|
||||
}
|
||||
}
|
||||
@@ -1,257 +0,0 @@
|
||||
use common::{BitSet, TinySet};
|
||||
use DocId;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use std::cmp::Ordering;
|
||||
|
||||
/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`.
|
||||
///
|
||||
/// # Implementation detail
|
||||
///
|
||||
/// Skipping is relatively fast here as we can directly point to the
|
||||
/// right tiny bitset bucket.
|
||||
///
|
||||
/// TODO: Consider implementing a `BitTreeSet` in order to advance faster
|
||||
/// when the bitset is sparse
|
||||
pub struct BitSetDocSet {
|
||||
docs: BitSet,
|
||||
cursor_bucket: u32, //< index associated to the current tiny bitset
|
||||
cursor_tinybitset: TinySet,
|
||||
doc: u32,
|
||||
}
|
||||
|
||||
impl BitSetDocSet {
|
||||
fn go_to_bucket(&mut self, bucket_addr: u32) {
|
||||
self.cursor_bucket = bucket_addr;
|
||||
self.cursor_tinybitset = self.docs.tinyset(bucket_addr);
|
||||
}
|
||||
}
|
||||
|
||||
impl From<BitSet> for BitSetDocSet {
|
||||
fn from(docs: BitSet) -> BitSetDocSet {
|
||||
let first_tiny_bitset = if docs.max_value() == 0 {
|
||||
TinySet::empty()
|
||||
} else {
|
||||
docs.tinyset(0)
|
||||
};
|
||||
BitSetDocSet {
|
||||
docs,
|
||||
cursor_bucket: 0,
|
||||
cursor_tinybitset: first_tiny_bitset,
|
||||
doc: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for BitSetDocSet {
|
||||
fn advance(&mut self) -> bool {
|
||||
if let Some(lower) = self.cursor_tinybitset.pop_lowest() {
|
||||
self.doc = (self.cursor_bucket as u32 * 64u32) | lower;
|
||||
return true;
|
||||
}
|
||||
if let Some(cursor_bucket) = self.docs.first_non_empty_bucket(self.cursor_bucket + 1) {
|
||||
self.go_to_bucket(cursor_bucket);
|
||||
let lower = self.cursor_tinybitset.pop_lowest().unwrap();
|
||||
self.doc = (cursor_bucket * 64u32) | lower;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
// skip is required to advance.
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
let target_bucket = target / 64u32;
|
||||
|
||||
// Mask for all of the bits greater or equal
|
||||
// to our target document.
|
||||
match target_bucket.cmp(&self.cursor_bucket) {
|
||||
Ordering::Greater => {
|
||||
self.go_to_bucket(target_bucket);
|
||||
let greater_filter: TinySet = TinySet::range_greater_or_equal(target);
|
||||
self.cursor_tinybitset = self.cursor_tinybitset.intersect(greater_filter);
|
||||
if !self.advance() {
|
||||
SkipResult::End
|
||||
} else if self.doc() == target {
|
||||
SkipResult::Reached
|
||||
} else {
|
||||
debug_assert!(self.doc() > target);
|
||||
SkipResult::OverStep
|
||||
}
|
||||
}
|
||||
Ordering::Equal => loop {
|
||||
match self.doc().cmp(&target) {
|
||||
Ordering::Less => {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
}
|
||||
Ordering::Equal => {
|
||||
return SkipResult::Reached;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
debug_assert!(self.doc() > target);
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
}
|
||||
},
|
||||
Ordering::Less => {
|
||||
debug_assert!(self.doc() > target);
|
||||
SkipResult::OverStep
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the current document
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
|
||||
/// Returns half of the `max_doc`
|
||||
/// This is quite a terrible heuristic,
|
||||
/// but we don't have access to any better
|
||||
/// value.
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docs.len() as u32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use DocId;
|
||||
use common::BitSet;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use super::BitSetDocSet;
|
||||
extern crate test;
|
||||
use tests;
|
||||
|
||||
fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet {
|
||||
let mut docset = BitSet::with_max_value(max_doc);
|
||||
for &doc in docs {
|
||||
docset.insert(doc);
|
||||
}
|
||||
BitSetDocSet::from(docset)
|
||||
}
|
||||
|
||||
fn test_go_through_sequential(docs: &[DocId]) {
|
||||
let mut docset = create_docbitset(docs, 1_000u32);
|
||||
for &doc in docs {
|
||||
assert!(docset.advance());
|
||||
assert_eq!(doc, docset.doc());
|
||||
}
|
||||
assert!(!docset.advance());
|
||||
assert!(!docset.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_docbitset_sequential() {
|
||||
test_go_through_sequential(&[]);
|
||||
test_go_through_sequential(&[1, 2, 3]);
|
||||
test_go_through_sequential(&[1, 2, 3, 4, 5, 63, 64, 65]);
|
||||
test_go_through_sequential(&[63, 64, 65]);
|
||||
test_go_through_sequential(&[1, 2, 3, 4, 95, 96, 97, 98, 99]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_docbitset_skip() {
|
||||
{
|
||||
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
|
||||
assert_eq!(docset.skip_next(7), SkipResult::Reached);
|
||||
assert_eq!(docset.doc(), 7);
|
||||
assert!(docset.advance(), 7);
|
||||
assert_eq!(docset.doc(), 5112);
|
||||
assert!(!docset.advance());
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112], 10_000);
|
||||
assert_eq!(docset.skip_next(3), SkipResult::OverStep);
|
||||
assert_eq!(docset.doc(), 5);
|
||||
assert!(docset.advance());
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[5112], 10_000);
|
||||
assert_eq!(docset.skip_next(5112), SkipResult::Reached);
|
||||
assert_eq!(docset.doc(), 5112);
|
||||
assert!(!docset.advance());
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[5112], 10_000);
|
||||
assert_eq!(docset.skip_next(5113), SkipResult::End);
|
||||
assert!(!docset.advance());
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[5112], 10_000);
|
||||
assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
|
||||
assert_eq!(docset.doc(), 5112);
|
||||
assert!(!docset.advance());
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000);
|
||||
assert_eq!(docset.skip_next(5112), SkipResult::Reached);
|
||||
assert_eq!(docset.doc(), 5112);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.doc(), 5500);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.doc(), 6666);
|
||||
assert!(!docset.advance());
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5500, 6666], 10_000);
|
||||
assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
|
||||
assert_eq!(docset.doc(), 5112);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.doc(), 5500);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.doc(), 6666);
|
||||
assert!(!docset.advance());
|
||||
}
|
||||
{
|
||||
let mut docset = create_docbitset(&[1, 5, 6, 7, 5112, 5513, 6666], 10_000);
|
||||
assert_eq!(docset.skip_next(5111), SkipResult::OverStep);
|
||||
assert_eq!(docset.doc(), 5112);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.doc(), 5513);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.doc(), 6666);
|
||||
assert!(!docset.advance());
|
||||
}
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitset_1pct_insert(b: &mut test::Bencher) {
|
||||
use tests;
|
||||
let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
|
||||
b.iter(|| {
|
||||
let mut bitset = BitSet::with_max_value(1_000_000);
|
||||
for el in els.iter().cloned() {
|
||||
bitset.insert(el);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitset_1pct_clone(b: &mut test::Bencher) {
|
||||
use tests;
|
||||
let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
|
||||
let mut bitset = BitSet::with_max_value(1_000_000);
|
||||
for el in els {
|
||||
bitset.insert(el);
|
||||
}
|
||||
b.iter(|| bitset.clone());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitset_1pct_clone_iterate(b: &mut test::Bencher) {
|
||||
let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
|
||||
let mut bitset = BitSet::with_max_value(1_000_000);
|
||||
for el in els {
|
||||
bitset.insert(el);
|
||||
}
|
||||
b.iter(|| {
|
||||
let mut docset = BitSetDocSet::from(bitset.clone());
|
||||
while docset.advance() {}
|
||||
});
|
||||
}
|
||||
}
|
||||
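The `skip_next` implementation shown above hinges on one small piece of addressing: a doc id lives in bucket `doc / 64` at bit `doc % 64`, so a skip can jump straight to bucket `target / 64` and mask away the bits below `target` (that is what `range_greater_or_equal` does). A tiny self-contained sketch of that addressing, with plain `u64` words standing in for `TinySet` and illustrative names throughout:

```rust
// Insert a doc id into a vector of 64-bit buckets.
fn insert(words: &mut Vec<u64>, doc: u32) {
    let bucket = (doc / 64) as usize;
    if words.len() <= bucket {
        words.resize(bucket + 1, 0);
    }
    words[bucket] |= 1u64 << (doc % 64);
}

/// First doc >= target, scanning bucket by bucket like `skip_next` does.
fn first_doc_from(words: &[u64], target: u32) -> Option<u32> {
    let start_bucket = (target / 64) as usize;
    for bucket in start_bucket..words.len() {
        let mut word = words[bucket];
        if bucket == start_bucket {
            // Keep only the bits >= target, mirroring `range_greater_or_equal`.
            word &= !0u64 << (target % 64);
        }
        if word != 0 {
            return Some(bucket as u32 * 64 + word.trailing_zeros());
        }
    }
    None
}

fn main() {
    let mut words = Vec::new();
    for &doc in &[1u32, 5, 6, 7, 5112] {
        insert(&mut words, doc);
    }
    assert_eq!(first_doc_from(&words, 7), Some(7));    // reached
    assert_eq!(first_doc_from(&words, 3), Some(5));    // overstep
    assert_eq!(first_doc_from(&words, 5113), None);    // end
}
```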
@@ -1,4 +1,5 @@
|
||||
use Result;
|
||||
use std::any::Any;
|
||||
use super::boolean_weight::BooleanWeight;
|
||||
use query::Weight;
|
||||
use Searcher;
|
||||
@@ -7,6 +8,7 @@ use schema::Term;
|
||||
use query::TermQuery;
|
||||
use schema::IndexRecordOption;
|
||||
use query::Occur;
|
||||
use query::OccurFilter;
|
||||
|
||||
/// The boolean query combines a set of queries
|
||||
///
|
||||
@@ -30,14 +32,21 @@ impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {
|
||||
}
|
||||
|
||||
impl Query for BooleanQuery {
|
||||
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
|
||||
fn as_any(&self) -> &Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
let sub_weights = self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref occur, ref subquery)| {
|
||||
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
|
||||
})
|
||||
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
|
||||
.collect::<Result<_>>()?;
|
||||
Ok(box BooleanWeight::new(sub_weights, scoring_enabled))
|
||||
let occurs: Vec<Occur> = self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref occur, ref _subquery)| *occur)
|
||||
.collect();
|
||||
let filter = OccurFilter::new(&occurs);
|
||||
Ok(box BooleanWeight::new(sub_weights, filter))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
147
src/query/boolean_query/boolean_scorer.rs
Normal file
@@ -0,0 +1,147 @@
|
||||
use query::Scorer;
|
||||
use DocId;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::cmp::Ordering;
|
||||
use postings::DocSet;
|
||||
use query::OccurFilter;
|
||||
use query::boolean_query::ScoreCombiner;
|
||||
|
||||
/// Each `HeapItem` represents the head of
|
||||
/// one of scorer being merged.
|
||||
///
|
||||
/// * `doc` - is the current doc id for the given segment postings
|
||||
/// * `ord` - is the ordinal used to identify to which segment postings
|
||||
/// this heap item belong to.
|
||||
#[derive(Eq, PartialEq)]
|
||||
struct HeapItem {
|
||||
doc: DocId,
|
||||
ord: u32,
|
||||
}
|
||||
|
||||
/// `HeapItem` are ordered by the document
|
||||
impl PartialOrd for HeapItem {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for HeapItem {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
(other.doc).cmp(&self.doc)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BooleanScorer<TScorer: Scorer> {
|
||||
scorers: Vec<TScorer>,
|
||||
queue: BinaryHeap<HeapItem>,
|
||||
doc: DocId,
|
||||
score_combiner: ScoreCombiner,
|
||||
occur_filter: OccurFilter,
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> BooleanScorer<TScorer> {
|
||||
pub fn new(scorers: Vec<TScorer>, occur_filter: OccurFilter) -> BooleanScorer<TScorer> {
|
||||
let score_combiner = ScoreCombiner::default_for_num_scorers(scorers.len());
|
||||
let mut non_empty_scorers: Vec<TScorer> = Vec::new();
|
||||
for mut posting in scorers {
|
||||
let non_empty = posting.advance();
|
||||
if non_empty {
|
||||
non_empty_scorers.push(posting);
|
||||
}
|
||||
}
|
||||
let heap_items: Vec<HeapItem> = non_empty_scorers
|
||||
.iter()
|
||||
.map(|posting| posting.doc())
|
||||
.enumerate()
|
||||
.map(|(ord, doc)| HeapItem {
|
||||
doc,
|
||||
ord: ord as u32,
|
||||
})
|
||||
.collect();
|
||||
BooleanScorer {
|
||||
scorers: non_empty_scorers,
|
||||
queue: BinaryHeap::from(heap_items),
|
||||
doc: 0u32,
|
||||
score_combiner,
|
||||
occur_filter,
|
||||
}
|
||||
}
|
||||
|
||||
/// Advances the head of our heap (the segment posting with the lowest doc)
|
||||
/// It will also update the new current `DocId` as well as the term frequency
|
||||
/// associated with the segment postings.
|
||||
///
|
||||
/// After advancing the `SegmentPosting`, the postings is removed from the heap
|
||||
/// if it has been entirely consumed, or pushed back into the heap.
|
||||
///
|
||||
/// # Panics
|
||||
/// This method will panic if the head `SegmentPostings` is not empty.
|
||||
fn advance_head(&mut self) {
|
||||
{
|
||||
let mut mutable_head = self.queue.peek_mut().unwrap();
|
||||
let cur_scorers = &mut self.scorers[mutable_head.ord as usize];
|
||||
if cur_scorers.advance() {
|
||||
mutable_head.doc = cur_scorers.doc();
|
||||
return;
|
||||
}
|
||||
}
|
||||
self.queue.pop();
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
|
||||
fn size_hint(&self) -> usize {
|
||||
// TODO fix this. it should be the min
|
||||
// of the MUST scorer
|
||||
// and the max of the SHOULD scorers.
|
||||
self.scorers
|
||||
.iter()
|
||||
.map(|scorer| scorer.size_hint())
|
||||
.max()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
loop {
|
||||
self.score_combiner.clear();
|
||||
let mut ord_bitset = 0u64;
|
||||
match self.queue.peek() {
|
||||
Some(heap_item) => {
|
||||
let ord = heap_item.ord as usize;
|
||||
self.doc = heap_item.doc;
|
||||
let score = self.scorers[ord].score();
|
||||
self.score_combiner.update(score);
|
||||
ord_bitset |= 1 << ord;
|
||||
}
|
||||
None => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
self.advance_head();
|
||||
while let Some(&HeapItem { doc, ord }) = self.queue.peek() {
|
||||
if doc == self.doc {
|
||||
let ord = ord as usize;
|
||||
let score = self.scorers[ord].score();
|
||||
self.score_combiner.update(score);
|
||||
ord_bitset |= 1 << ord;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
self.advance_head();
|
||||
}
|
||||
if self.occur_filter.accept(ord_bitset) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> Scorer for BooleanScorer<TScorer> {
|
||||
fn score(&self) -> f32 {
|
||||
self.score_combiner.score()
|
||||
}
|
||||
}
|
||||
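The `BooleanScorer` added above is a k-way merge: every sub-scorer's current doc sits in a binary heap whose reversed `Ord` pops the smallest doc first, all heads equal to that doc are drained while their scores are combined, and the resulting ord bitset is checked against the `OccurFilter`. A compact standalone sketch of that merge over plain sorted lists (illustrative names only, scoring left out):

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Union-merge sorted doc id lists, reporting each doc once together with a
/// bitset of which inputs contained it -- the shape of information the boolean
/// scorer hands to its occur filter.
fn union_with_ords(lists: &[Vec<u32>]) -> Vec<(u32, u64)> {
    // `Reverse` turns std's max-heap into the min-heap the merge needs.
    let mut heap: BinaryHeap<Reverse<(u32, usize)>> = BinaryHeap::new();
    let mut cursors = vec![0usize; lists.len()];
    for (ord, list) in lists.iter().enumerate() {
        if let Some(&doc) = list.first() {
            heap.push(Reverse((doc, ord)));
        }
    }
    let mut out = Vec::new();
    while let Some(&Reverse((doc, _))) = heap.peek() {
        let mut ord_bitset = 0u64;
        // Drain every head sitting on the same doc id ("advance_head").
        while let Some(&Reverse((d, ord))) = heap.peek() {
            if d != doc {
                break;
            }
            heap.pop();
            ord_bitset |= 1u64 << ord;
            cursors[ord] += 1;
            if let Some(&next) = lists[ord].get(cursors[ord]) {
                heap.push(Reverse((next, ord)));
            }
        }
        out.push((doc, ord_bitset));
    }
    out
}

fn main() {
    let left = vec![1, 2, 3];
    let right = vec![1, 3, 8];
    assert_eq!(
        union_with_ords(&[left, right]),
        vec![(1, 0b11), (2, 0b01), (3, 0b11), (8, 0b10)]
    );
}
```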
@@ -1,142 +1,31 @@
|
||||
use query::Weight;
|
||||
use core::SegmentReader;
|
||||
use query::{Intersection, Union};
|
||||
use std::collections::HashMap;
|
||||
use query::EmptyScorer;
|
||||
use query::Scorer;
|
||||
use downcast::Downcast;
|
||||
use query::term_query::TermScorer;
|
||||
use std::borrow::Borrow;
|
||||
use query::Exclude;
|
||||
use query::Occur;
|
||||
use query::RequiredOptionalScorer;
|
||||
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
|
||||
use super::BooleanScorer;
|
||||
use query::OccurFilter;
|
||||
use Result;
|
||||
|
||||
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<Scorer>>) -> Box<Scorer>
|
||||
where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
{
|
||||
assert!(!scorers.is_empty());
|
||||
if scorers.len() == 1 {
|
||||
scorers.into_iter().next().unwrap() //< we checked the size beforehands
|
||||
} else {
|
||||
let is_all_term_queries = scorers.iter().all(|scorer| {
|
||||
let scorer_ref: &Scorer = scorer.borrow();
|
||||
Downcast::<TermScorer>::is_type(scorer_ref)
|
||||
});
|
||||
if is_all_term_queries {
|
||||
let scorers: Vec<TermScorer> = scorers
|
||||
.into_iter()
|
||||
.map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
|
||||
.collect();
|
||||
let scorer: Box<Scorer> = box Union::<TermScorer, TScoreCombiner>::from(scorers);
|
||||
scorer
|
||||
} else {
|
||||
let scorer: Box<Scorer> = box Union::<_, TScoreCombiner>::from(scorers);
|
||||
scorer
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BooleanWeight {
|
||||
weights: Vec<(Occur, Box<Weight>)>,
|
||||
scoring_enabled: bool,
|
||||
weights: Vec<Box<Weight>>,
|
||||
occur_filter: OccurFilter,
|
||||
}
|
||||
|
||||
impl BooleanWeight {
|
||||
pub fn new(weights: Vec<(Occur, Box<Weight>)>, scoring_enabled: bool) -> BooleanWeight {
|
||||
pub fn new(weights: Vec<Box<Weight>>, occur_filter: OccurFilter) -> BooleanWeight {
|
||||
BooleanWeight {
|
||||
weights,
|
||||
scoring_enabled,
|
||||
}
|
||||
}
|
||||
|
||||
fn complex_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<Box<Scorer>> {
|
||||
let mut per_occur_scorers: HashMap<Occur, Vec<Box<Scorer>>> = HashMap::new();
|
||||
for &(ref occur, ref subweight) in &self.weights {
|
||||
let sub_scorer: Box<Scorer> = subweight.scorer(reader)?;
|
||||
per_occur_scorers
|
||||
.entry(*occur)
|
||||
.or_insert_with(Vec::new)
|
||||
.push(sub_scorer);
|
||||
}
|
||||
|
||||
let should_scorer_opt: Option<Box<Scorer>> = per_occur_scorers
|
||||
.remove(&Occur::Should)
|
||||
.map(scorer_union::<TScoreCombiner>);
|
||||
|
||||
let exclude_scorer_opt: Option<Box<Scorer>> = per_occur_scorers
|
||||
.remove(&Occur::MustNot)
|
||||
.map(scorer_union::<TScoreCombiner>);
|
||||
|
||||
let must_scorer_opt: Option<Box<Scorer>> =
|
||||
per_occur_scorers.remove(&Occur::Must).map(|scorers| {
|
||||
if scorers.len() == 1 {
|
||||
scorers.into_iter().next().unwrap()
|
||||
} else {
|
||||
let is_all_term_queries = scorers.iter().all(|scorer| {
|
||||
let scorer_ref: &Scorer = scorer.borrow();
|
||||
Downcast::<TermScorer>::is_type(scorer_ref)
|
||||
});
|
||||
if is_all_term_queries {
|
||||
let scorers: Vec<TermScorer> = scorers
|
||||
.into_iter()
|
||||
.map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
|
||||
.collect();
|
||||
let scorer: Box<Scorer> = box Intersection::from(scorers);
|
||||
scorer
|
||||
} else {
|
||||
let scorer: Box<Scorer> = box Intersection::from(scorers);
|
||||
scorer
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let positive_scorer: Box<Scorer> = match (should_scorer_opt, must_scorer_opt) {
|
||||
(Some(should_scorer), Some(must_scorer)) => {
|
||||
if self.scoring_enabled {
|
||||
box RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
|
||||
must_scorer,
|
||||
should_scorer,
|
||||
)
|
||||
} else {
|
||||
must_scorer
|
||||
}
|
||||
}
|
||||
(None, Some(must_scorer)) => must_scorer,
|
||||
(Some(should_scorer), None) => should_scorer,
|
||||
(None, None) => {
|
||||
return Ok(box EmptyScorer);
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(exclude_scorer) = exclude_scorer_opt {
|
||||
Ok(box Exclude::new(positive_scorer, exclude_scorer))
|
||||
} else {
|
||||
Ok(positive_scorer)
|
||||
occur_filter,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Weight for BooleanWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
if self.weights.is_empty() {
|
||||
Ok(box EmptyScorer)
|
||||
} else if self.weights.len() == 1 {
|
||||
let &(occur, ref weight) = &self.weights[0];
|
||||
if occur == Occur::MustNot {
|
||||
Ok(box EmptyScorer)
|
||||
} else {
|
||||
weight.scorer(reader)
|
||||
}
|
||||
} else if self.scoring_enabled {
|
||||
self.complex_scorer::<SumWithCoordsCombiner>(reader)
|
||||
} else {
|
||||
self.complex_scorer::<DoNothingCombiner>(reader)
|
||||
}
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let sub_scorers: Vec<Box<Scorer + 'a>> = self.weights
|
||||
.iter()
|
||||
.map(|weight| weight.scorer(reader))
|
||||
.collect::<Result<_>>()?;
|
||||
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
|
||||
Ok(box boolean_scorer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,27 +1,35 @@
|
||||
mod boolean_query;
|
||||
mod boolean_scorer;
|
||||
mod boolean_weight;
|
||||
mod score_combiner;
|
||||
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
pub use self::boolean_scorer::BooleanScorer;
|
||||
pub use self::score_combiner::ScoreCombiner;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use postings::{DocSet, VecPostings};
|
||||
use query::Scorer;
|
||||
use query::OccurFilter;
|
||||
use query::term_query::TermScorer;
|
||||
use query::Occur;
|
||||
use query::Query;
|
||||
use query::TermQuery;
|
||||
use query::Intersection;
|
||||
use query::Scorer;
|
||||
use query::term_query::TermScorer;
|
||||
use collector::tests::TestCollector;
|
||||
use Index;
|
||||
use downcast::Downcast;
|
||||
use schema::*;
|
||||
use query::QueryParser;
|
||||
use query::RequiredOptionalScorer;
|
||||
use query::score_combiner::SumWithCoordsCombiner;
|
||||
use fastfield::U64FastFieldReader;
|
||||
use schema::IndexRecordOption;
|
||||
|
||||
fn aux_test_helper() -> (Index, Field) {
|
||||
fn abs_diff(left: f32, right: f32) -> f32 {
|
||||
(right - left).abs()
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_boolean_query() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
@@ -51,72 +59,6 @@ mod tests {
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
(index, text_field)
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_boolean_non_all_term_disjunction() {
|
||||
let (index, text_field) = aux_test_helper();
|
||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||
let query = query_parser.parse_query("(+a +b) d").unwrap();
|
||||
assert_eq!(query.count(&*index.searcher()).unwrap(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_boolean_single_must_clause() {
|
||||
let (index, text_field) = aux_test_helper();
|
||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||
let query = query_parser.parse_query("+a").unwrap();
|
||||
let searcher = index.searcher();
|
||||
let weight = query.weight(&*searcher, true).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
assert!(Downcast::<TermScorer>::is_type(&*scorer));
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_boolean_termonly_intersection() {
|
||||
let (index, text_field) = aux_test_helper();
|
||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||
let searcher = index.searcher();
|
||||
{
|
||||
let query = query_parser.parse_query("+a +b +c").unwrap();
|
||||
let weight = query.weight(&*searcher, true).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
assert!(Downcast::<Intersection<TermScorer>>::is_type(&*scorer));
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("+a +(b c)").unwrap();
|
||||
let weight = query.weight(&*searcher, true).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
assert!(Downcast::<Intersection<Box<Scorer>>>::is_type(&*scorer));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_boolean_reqopt() {
|
||||
let (index, text_field) = aux_test_helper();
|
||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||
let searcher = index.searcher();
|
||||
{
|
||||
let query = query_parser.parse_query("+a b").unwrap();
|
||||
let weight = query.weight(&*searcher, true).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
assert!(Downcast::<
|
||||
RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>,
|
||||
>::is_type(&*scorer));
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("+a b").unwrap();
|
||||
let weight = query.weight(&*searcher, false).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
assert!(Downcast::<TermScorer>::is_type(&*scorer));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_boolean_query() {
|
||||
let (index, text_field) = aux_test_helper();
|
||||
|
||||
let make_term_query = |text: &str| {
|
||||
let term_query = TermQuery::new(
|
||||
@@ -127,13 +69,14 @@ mod tests {
|
||||
query
|
||||
};
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let matching_docs = |boolean_query: &Query| {
|
||||
let searcher = index.searcher();
|
||||
let mut test_collector = TestCollector::default();
|
||||
searcher.search(boolean_query, &mut test_collector).unwrap();
|
||||
test_collector.docs()
|
||||
};
|
||||
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a"))]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
|
||||
@@ -169,4 +112,41 @@ mod tests {
|
||||
assert_eq!(matching_docs(&boolean_query), Vec::<u32>::new());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_boolean_scorer() {
|
||||
let occurs = vec![Occur::Should, Occur::Should];
|
||||
let occur_filter = OccurFilter::new(&occurs);
|
||||
|
||||
let left_fieldnorms =
|
||||
U64FastFieldReader::from((0u64..9u64).map(|doc| doc * 3).collect::<Vec<u64>>());
|
||||
|
||||
let left = VecPostings::from(vec![1, 2, 3]);
|
||||
let left_scorer = TermScorer {
|
||||
idf: 1f32,
|
||||
fieldnorm_reader_opt: Some(left_fieldnorms),
|
||||
postings: left,
|
||||
};
|
||||
|
||||
let right_fieldnorms =
|
||||
U64FastFieldReader::from((0u64..9u64).map(|doc| doc * 5).collect::<Vec<u64>>());
|
||||
let right = VecPostings::from(vec![1, 3, 8]);
|
||||
|
||||
let right_scorer = TermScorer {
|
||||
idf: 4f32,
|
||||
fieldnorm_reader_opt: Some(right_fieldnorms),
|
||||
postings: right,
|
||||
};
|
||||
|
||||
let mut boolean_scorer = BooleanScorer::new(vec![left_scorer, right_scorer], occur_filter);
|
||||
assert_eq!(boolean_scorer.next(), Some(1u32));
|
||||
assert!(abs_diff(boolean_scorer.score(), 2.3662047) < 0.001);
|
||||
assert_eq!(boolean_scorer.next(), Some(2u32));
|
||||
assert!(abs_diff(boolean_scorer.score(), 0.20412415) < 0.001f32);
|
||||
assert_eq!(boolean_scorer.next(), Some(3u32));
|
||||
assert_eq!(boolean_scorer.next(), Some(8u32));
|
||||
assert!(abs_diff(boolean_scorer.score(), 0.31622776) < 0.001f32);
|
||||
assert!(!boolean_scorer.advance());
|
||||
}
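
Aside: the expected constants in `test_boolean_scorer` above are consistent with a per-term score of `idf * sqrt(term_freq) / sqrt(fieldnorm)`, summed over matching clauses and scaled by a coord factor of `matching_clauses / total_clauses`. That formula is inferred from the constants, not quoted from tantivy's documentation; the sketch below only reproduces the arithmetic.

// Hedged reconstruction of the expected scores above. The per-term formula
// and the coord factor are inferences from the test constants, not a
// statement of tantivy's scoring API.
fn term_score(idf: f32, term_freq: f32, fieldnorm: f32) -> f32 {
    idf * term_freq.sqrt() / fieldnorm.sqrt()
}

fn main() {
    // doc 1: both clauses match; fieldnorms are 1 * 3 and 1 * 5; coord = 2 / 2
    let doc1 = (term_score(1.0, 1.0, 3.0) + term_score(4.0, 1.0, 5.0)) * 2.0 / 2.0;
    // doc 2: only the left clause matches; fieldnorm 2 * 3 = 6; coord = 1 / 2
    let doc2 = term_score(1.0, 1.0, 6.0) * 1.0 / 2.0;
    // doc 8: only the right clause matches; fieldnorm 8 * 5 = 40; coord = 1 / 2
    let doc8 = term_score(4.0, 1.0, 40.0) * 1.0 / 2.0;
    println!("{} {} {}", doc1, doc2, doc8); // ~2.3662, ~0.20412, ~0.31623
}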
|
||||
|
||||
}
|
||||
|
||||
45 src/query/boolean_query/score_combiner.rs (new file)
@@ -0,0 +1,45 @@
use Score;

pub struct ScoreCombiner {
    coords: Vec<Score>,
    num_fields: usize,
    score: Score,
}

impl ScoreCombiner {
    pub fn update(&mut self, score: Score) {
        self.score += score;
        self.num_fields += 1;
    }

    pub fn clear(&mut self) {
        self.score = 0f32;
        self.num_fields = 0;
    }

    /// Compute the coord term
    fn coord(&self) -> f32 {
        self.coords[self.num_fields]
    }

    pub fn score(&self) -> Score {
        self.score * self.coord()
    }

    pub fn default_for_num_scorers(num_scorers: usize) -> ScoreCombiner {
        let query_coords: Vec<Score> = (0..num_scorers + 1)
            .map(|i| (i as Score) / (num_scorers as Score))
            .collect();
        ScoreCombiner::from(query_coords)
    }
}

impl From<Vec<Score>> for ScoreCombiner {
    fn from(coords: Vec<Score>) -> ScoreCombiner {
        ScoreCombiner {
            coords,
            num_fields: 0,
            score: 0f32,
        }
    }
}
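
A minimal usage sketch of the combiner defined above, assuming direct access to the crate-private type; the clause scores below are illustrative values, not taken from the diff.

// Sketch: coords[i] = i / num_scorers, so with 3 scorers and 2 matching
// clauses the coord factor is 2/3, and the summed score is scaled by it.
fn score_combiner_demo() {
    let mut combiner = ScoreCombiner::default_for_num_scorers(3);
    combiner.update(0.5f32);
    combiner.update(1.5f32);
    assert!((combiner.score() - (0.5f32 + 1.5f32) * 2.0 / 3.0).abs() < 1e-6);
    combiner.clear(); // reset before moving on to the next document
}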
@@ -1,179 +0,0 @@
|
||||
use query::Scorer;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use Score;
|
||||
use DocId;
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
enum State {
|
||||
ExcludeOne(DocId),
|
||||
Finished,
|
||||
}
|
||||
|
||||
/// Filters a given `DocSet` by removing the docs that appear
/// in a second, excluding `DocSet`.
///
/// The excluding docset has no impact on scoring.
|
||||
pub struct Exclude<TDocSet, TDocSetExclude> {
|
||||
underlying_docset: TDocSet,
|
||||
excluding_docset: TDocSetExclude,
|
||||
excluding_state: State,
|
||||
}
|
||||
|
||||
impl<TDocSet, TDocSetExclude> Exclude<TDocSet, TDocSetExclude>
|
||||
where
|
||||
TDocSetExclude: DocSet,
|
||||
{
|
||||
/// Creates a new `ExcludeScorer`
|
||||
pub fn new(
|
||||
underlying_docset: TDocSet,
|
||||
mut excluding_docset: TDocSetExclude,
|
||||
) -> Exclude<TDocSet, TDocSetExclude> {
|
||||
let state = if excluding_docset.advance() {
|
||||
State::ExcludeOne(excluding_docset.doc())
|
||||
} else {
|
||||
State::Finished
|
||||
};
|
||||
Exclude {
|
||||
underlying_docset,
|
||||
excluding_docset,
|
||||
excluding_state: state,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet, TDocSetExclude> Exclude<TDocSet, TDocSetExclude>
|
||||
where
|
||||
TDocSet: DocSet,
|
||||
TDocSetExclude: DocSet,
|
||||
{
|
||||
/// Returns true iff the doc is not removed.
///
/// The method has to be called with non-decreasing
/// (i.e. non-strictly increasing) values of `doc`.
|
||||
fn accept(&mut self) -> bool {
|
||||
let doc = self.underlying_docset.doc();
|
||||
match self.excluding_state {
|
||||
State::ExcludeOne(excluded_doc) => {
|
||||
if doc == excluded_doc {
|
||||
false
|
||||
} else if excluded_doc > doc {
|
||||
true
|
||||
} else {
|
||||
match self.excluding_docset.skip_next(doc) {
|
||||
SkipResult::OverStep => {
|
||||
self.excluding_state = State::ExcludeOne(self.excluding_docset.doc());
|
||||
true
|
||||
}
|
||||
SkipResult::End => {
|
||||
self.excluding_state = State::Finished;
|
||||
true
|
||||
}
|
||||
SkipResult::Reached => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
State::Finished => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet, TDocSetExclude> DocSet for Exclude<TDocSet, TDocSetExclude>
|
||||
where
|
||||
TDocSet: DocSet,
|
||||
TDocSetExclude: DocSet,
|
||||
{
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.underlying_docset.advance() {
|
||||
if self.accept() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
let underlying_skip_result = self.underlying_docset.skip_next(target);
|
||||
if underlying_skip_result == SkipResult::End {
|
||||
return SkipResult::End;
|
||||
}
|
||||
if self.accept() {
|
||||
underlying_skip_result
|
||||
} else if self.advance() {
|
||||
SkipResult::OverStep
|
||||
} else {
|
||||
SkipResult::End
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.underlying_docset.doc()
|
||||
}
|
||||
|
||||
/// `.size_hint()` directly returns the size
/// of the underlying docset without taking into account
/// the fact that docs might be deleted.
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.underlying_docset.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer, TDocSetExclude> Scorer for Exclude<TScorer, TDocSetExclude>
|
||||
where
|
||||
TScorer: Scorer,
|
||||
TDocSetExclude: DocSet + 'static,
|
||||
{
|
||||
fn score(&mut self) -> Score {
|
||||
self.underlying_docset.score()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use tests::sample_with_seed;
|
||||
use postings::tests::test_skip_against_unoptimized;
|
||||
use super::*;
|
||||
use query::VecDocSet;
|
||||
|
||||
#[test]
|
||||
fn test_exclude() {
|
||||
let mut exclude_scorer = Exclude::new(
|
||||
VecDocSet::from(vec![1, 2, 5, 8, 10, 15, 24]),
|
||||
VecDocSet::from(vec![1, 2, 3, 10, 16, 24]),
|
||||
);
|
||||
let mut els = vec![];
|
||||
while exclude_scorer.advance() {
|
||||
els.push(exclude_scorer.doc());
|
||||
}
|
||||
assert_eq!(els, vec![5, 8, 15]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_skip() {
|
||||
test_skip_against_unoptimized(
|
||||
|| {
|
||||
box Exclude::new(
|
||||
VecDocSet::from(vec![1, 2, 5, 8, 10, 15, 24]),
|
||||
VecDocSet::from(vec![1, 2, 3, 10, 16, 24]),
|
||||
)
|
||||
},
|
||||
vec![1, 2, 5, 8, 10, 15, 24],
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_skip_random() {
|
||||
let sample_include = sample_with_seed(10_000, 0.1, 1);
|
||||
let sample_exclude = sample_with_seed(10_000, 0.05, 2);
|
||||
let sample_skip = sample_with_seed(10_000, 0.005, 3);
|
||||
test_skip_against_unoptimized(
|
||||
|| {
|
||||
box Exclude::new(
|
||||
VecDocSet::from(sample_include.clone()),
|
||||
VecDocSet::from(sample_exclude.clone()),
|
||||
)
|
||||
},
|
||||
sample_skip,
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,222 +0,0 @@
|
||||
use docset::{DocSet, SkipResult};
|
||||
use query::Scorer;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
/// Creates a `DocSet` that iterates through the intersection of two `DocSet`s.
|
||||
pub struct Intersection<TDocSet: DocSet> {
|
||||
docsets: Vec<TDocSet>,
|
||||
finished: bool,
|
||||
doc: DocId,
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> From<Vec<TDocSet>> for Intersection<TDocSet> {
|
||||
fn from(mut docsets: Vec<TDocSet>) -> Intersection<TDocSet> {
|
||||
assert!(docsets.len() >= 2);
|
||||
docsets.sort_by_key(|docset| docset.size_hint());
|
||||
Intersection {
|
||||
docsets,
|
||||
finished: false,
|
||||
doc: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> Intersection<TDocSet> {
|
||||
/// Returns an array of the underlying `DocSet`s of the intersection.
/// These `DocSet`s are in the same position as the `IntersectionDocSet`,
/// so that the user can access their `docfreq` and `positions`.
|
||||
pub fn docsets(&self) -> &[TDocSet] {
|
||||
&self.docsets[..]
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for Intersection<TDocSet> {
|
||||
#[allow(never_loop)]
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.finished {
|
||||
return false;
|
||||
}
|
||||
|
||||
let mut candidate_doc = self.doc;
|
||||
let mut candidate_ord = self.docsets.len();
|
||||
|
||||
'outer: loop {
|
||||
for (ord, docset) in self.docsets.iter_mut().enumerate() {
|
||||
if ord != candidate_ord {
|
||||
// `candidate_ord` is already at the
|
||||
// right position.
|
||||
//
|
||||
// Calling `skip_next` would advance this docset
|
||||
// and miss it.
|
||||
match docset.skip_next(candidate_doc) {
|
||||
SkipResult::Reached => {}
|
||||
SkipResult::OverStep => {
|
||||
// this is not in the intersection,
|
||||
// let's update our candidate.
|
||||
candidate_doc = docset.doc();
|
||||
candidate_ord = ord;
|
||||
continue 'outer;
|
||||
}
|
||||
SkipResult::End => {
|
||||
self.finished = true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.doc = candidate_doc;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
// We optimize skipping by skipping every single member
|
||||
// of the intersection to target.
|
||||
let mut current_target: DocId = target;
|
||||
let mut current_ord = self.docsets.len();
|
||||
|
||||
'outer: loop {
|
||||
for (ord, docset) in self.docsets.iter_mut().enumerate() {
|
||||
if ord == current_ord {
|
||||
continue;
|
||||
}
|
||||
match docset.skip_next(current_target) {
|
||||
SkipResult::End => {
|
||||
return SkipResult::End;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
// update the target
|
||||
// for the remaining members of the intersection.
|
||||
current_target = docset.doc();
|
||||
current_ord = ord;
|
||||
continue 'outer;
|
||||
}
|
||||
SkipResult::Reached => {}
|
||||
}
|
||||
}
|
||||
|
||||
self.doc = current_target;
|
||||
if target == current_target {
|
||||
return SkipResult::Reached;
|
||||
} else {
|
||||
assert!(current_target > target);
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docsets
|
||||
.iter()
|
||||
.map(|docset| docset.size_hint())
|
||||
.min()
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer> Scorer for Intersection<TScorer>
|
||||
where
|
||||
TScorer: Scorer,
|
||||
{
|
||||
fn score(&mut self) -> Score {
|
||||
self.docsets.iter_mut().map(Scorer::score).sum()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use docset::{DocSet, SkipResult};
|
||||
use super::Intersection;
|
||||
use query::VecDocSet;
|
||||
use postings::tests::test_skip_against_unoptimized;
|
||||
|
||||
#[test]
|
||||
fn test_intersection() {
|
||||
{
|
||||
let left = VecDocSet::from(vec![1, 3, 9]);
|
||||
let right = VecDocSet::from(vec![3, 4, 9, 18]);
|
||||
let mut intersection = Intersection::from(vec![left, right]);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 3);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 9);
|
||||
assert!(!intersection.advance());
|
||||
}
|
||||
{
|
||||
let a = VecDocSet::from(vec![1, 3, 9]);
|
||||
let b = VecDocSet::from(vec![3, 4, 9, 18]);
|
||||
let c = VecDocSet::from(vec![1, 5, 9, 111]);
|
||||
let mut intersection = Intersection::from(vec![a, b, c]);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 9);
|
||||
assert!(!intersection.advance());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersection_zero() {
|
||||
let left = VecDocSet::from(vec![0]);
|
||||
let right = VecDocSet::from(vec![0]);
|
||||
let mut intersection = Intersection::from(vec![left, right]);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersection_skip() {
|
||||
let left = VecDocSet::from(vec![0, 1, 2, 4]);
|
||||
let right = VecDocSet::from(vec![2, 5]);
|
||||
let mut intersection = Intersection::from(vec![left, right]);
|
||||
assert_eq!(intersection.skip_next(2), SkipResult::Reached);
|
||||
assert_eq!(intersection.doc(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersection_skip_against_unoptimized() {
|
||||
test_skip_against_unoptimized(
|
||||
|| {
|
||||
let left = VecDocSet::from(vec![4]);
|
||||
let right = VecDocSet::from(vec![2, 5]);
|
||||
box Intersection::from(vec![left, right])
|
||||
},
|
||||
vec![0, 2, 4, 5, 6],
|
||||
);
|
||||
test_skip_against_unoptimized(
|
||||
|| {
|
||||
let mut left = VecDocSet::from(vec![1, 4, 5, 6]);
|
||||
let mut right = VecDocSet::from(vec![2, 5, 10]);
|
||||
left.advance();
|
||||
right.advance();
|
||||
box Intersection::from(vec![left, right])
|
||||
},
|
||||
vec![0, 1, 2, 3, 4, 5, 6, 7, 10, 11],
|
||||
);
|
||||
test_skip_against_unoptimized(
|
||||
|| {
|
||||
box Intersection::from(vec![
|
||||
VecDocSet::from(vec![1, 4, 5, 6]),
|
||||
VecDocSet::from(vec![1, 2, 5, 6]),
|
||||
VecDocSet::from(vec![1, 4, 5, 6]),
|
||||
VecDocSet::from(vec![1, 5, 6]),
|
||||
VecDocSet::from(vec![2, 4, 5, 7, 8]),
|
||||
])
|
||||
},
|
||||
vec![0, 1, 2, 3, 4, 5, 6, 7, 10, 11],
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersection_empty() {
|
||||
let a = VecDocSet::from(vec![1, 3]);
|
||||
let b = VecDocSet::from(vec![1, 4]);
|
||||
let c = VecDocSet::from(vec![3, 9]);
|
||||
let mut intersection = Intersection::from(vec![a, b, c]);
|
||||
assert!(!intersection.advance());
|
||||
}
|
||||
}
|
||||
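
For readers following `Intersection::advance` and `skip_next` above, here is a standalone sketch of the same leapfrog idea over plain sorted vectors; it mirrors the strategy but is not tantivy code.

// Leapfrog intersection: seek every list to the current candidate; whenever
// a list oversteps, its document becomes the new candidate.
fn leapfrog_intersection(mut lists: Vec<Vec<u32>>) -> Vec<u32> {
    let mut result = Vec::new();
    if lists.is_empty() {
        return result;
    }
    lists.sort_by_key(|list| list.len()); // the cheapest list drives the candidates
    let mut cursors = vec![0usize; lists.len()];
    let mut candidate = match lists[0].first() {
        Some(&doc) => doc,
        None => return result,
    };
    'outer: loop {
        for (i, list) in lists.iter().enumerate() {
            // advance this list until it reaches or passes the candidate
            while cursors[i] < list.len() && list[cursors[i]] < candidate {
                cursors[i] += 1;
            }
            match list.get(cursors[i]) {
                None => break 'outer, // one list is exhausted: we are done
                Some(&doc) if doc > candidate => {
                    candidate = doc; // overstepped: restart with the new candidate
                    continue 'outer;
                }
                Some(_) => {} // reached the candidate exactly
            }
        }
        result.push(candidate); // every list agreed on `candidate`
        candidate += 1; // look for the next intersection point
    }
    result
}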
@@ -7,32 +7,13 @@ mod boolean_query;
|
||||
mod scorer;
|
||||
mod occur;
|
||||
mod weight;
|
||||
mod occur_filter;
|
||||
mod term_query;
|
||||
mod query_parser;
|
||||
mod phrase_query;
|
||||
mod all_query;
|
||||
mod bitset;
|
||||
mod range_query;
|
||||
mod exclude;
|
||||
mod union;
|
||||
mod intersection;
|
||||
mod reqopt_scorer;
|
||||
|
||||
#[cfg(test)]
|
||||
mod vec_docset;
|
||||
|
||||
pub(crate) mod score_combiner;
|
||||
|
||||
pub use self::intersection::Intersection;
|
||||
pub use self::union::Union;
|
||||
|
||||
#[cfg(test)]
|
||||
pub use self::vec_docset::VecDocSet;
|
||||
|
||||
pub use self::reqopt_scorer::RequiredOptionalScorer;
|
||||
pub use self::exclude::Exclude;
|
||||
pub use self::bitset::BitSetDocSet;
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
pub use self::occur_filter::OccurFilter;
|
||||
pub use self::occur::Occur;
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::query_parser::QueryParserError;
|
||||
@@ -42,6 +23,3 @@ pub use self::scorer::EmptyScorer;
|
||||
pub use self::scorer::Scorer;
|
||||
pub use self::term_query::TermQuery;
|
||||
pub use self::weight::Weight;
|
||||
pub use self::all_query::{AllQuery, AllScorer, AllWeight};
|
||||
pub use self::range_query::RangeQuery;
|
||||
pub use self::scorer::ConstScorer;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/// Defines whether a term in a query must be present,
|
||||
/// should be present or must not be present.
|
||||
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
|
||||
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum Occur {
|
||||
/// For a given document to be considered for scoring,
|
||||
/// at least one of the document with the Should or the Must
|
||||
|
||||
39 src/query/occur_filter.rs (new file)
@@ -0,0 +1,39 @@
use query::Occur;

/// An `OccurFilter` represents a filter over a bitset of
/// at most 64 elements.
///
/// It wraps some simple bitmask to compute the filter
/// rapidly.
#[derive(Clone, Copy)]
pub struct OccurFilter {
    and_mask: u64,
    result: u64,
}

impl OccurFilter {
    /// Returns true if the bitset is matching the occur list.
    pub fn accept(&self, ord_set: u64) -> bool {
        (self.and_mask & ord_set) == self.result
    }

    /// Builds an `OccurFilter` from a list of `Occur`.
    pub fn new(occurs: &[Occur]) -> OccurFilter {
        let mut and_mask = 0u64;
        let mut result = 0u64;
        for (i, occur) in occurs.iter().enumerate() {
            let shift = 1 << i;
            match *occur {
                Occur::Must => {
                    and_mask |= shift;
                    result |= shift;
                }
                Occur::MustNot => {
                    and_mask |= shift;
                }
                Occur::Should => {}
            }
        }
        OccurFilter { and_mask, result }
    }
}
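
A small sketch of how the bitmask behaves, assuming access to `OccurFilter` and `Occur` as defined above; bit `i` of `ord_set` is set iff clause `i` matched the candidate document.

// Clause 0 is required, clause 1 is optional, clause 2 is forbidden,
// so and_mask = 0b101 and result = 0b001.
fn occur_filter_demo() {
    let filter = OccurFilter::new(&[Occur::Must, Occur::Should, Occur::MustNot]);
    assert!(filter.accept(0b001));  // only the Must clause matched
    assert!(filter.accept(0b011));  // Must and Should matched
    assert!(!filter.accept(0b010)); // the Must clause is missing
    assert!(!filter.accept(0b101)); // the MustNot clause matched
}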
@@ -74,47 +74,4 @@ mod tests {
|
||||
assert_eq!(test_query(vec!["g", "a"]), empty_vec);
|
||||
}
|
||||
|
||||
#[test] // motivated by #234
|
||||
pub fn test_phrase_query_docfreq_order() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
// 0
|
||||
let doc = doc!(text_field=>"b");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
// 1
|
||||
let doc = doc!(text_field=>"a b");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
// 2
|
||||
let doc = doc!(text_field=>"b a");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let test_query = |texts: Vec<&str>| {
|
||||
let mut test_collector = TestCollector::default();
|
||||
let terms: Vec<Term> = texts
|
||||
.iter()
|
||||
.map(|text| Term::from_field_text(text_field, text))
|
||||
.collect();
|
||||
let phrase_query = PhraseQuery::from(terms);
|
||||
searcher
|
||||
.search(&phrase_query, &mut test_collector)
|
||||
.expect("search should succeed");
|
||||
test_collector.docs()
|
||||
};
|
||||
assert_eq!(test_query(vec!["a", "b"]), vec![1]);
|
||||
assert_eq!(test_query(vec!["b", "a"]), vec![2]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ use schema::Term;
|
||||
use query::Query;
|
||||
use core::searcher::Searcher;
|
||||
use super::PhraseWeight;
|
||||
use std::any::Any;
|
||||
use query::Weight;
|
||||
use Result;
|
||||
|
||||
@@ -25,14 +26,17 @@ pub struct PhraseQuery {
|
||||
}
|
||||
|
||||
impl Query for PhraseQuery {
|
||||
/// Used to make it possible to cast Box<Query>
|
||||
/// into a specific type. This is mostly useful for unit tests.
|
||||
fn as_any(&self) -> &Any {
|
||||
self
|
||||
}
|
||||
|
||||
/// Create the weight associated to a query.
|
||||
///
|
||||
/// See [`Weight`](./trait.Weight.html).
|
||||
fn weight(&self, _searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
|
||||
Ok(box PhraseWeight::new(
|
||||
self.phrase_terms.clone(),
|
||||
scoring_enabled,
|
||||
))
|
||||
fn weight(&self, _searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
Ok(box PhraseWeight::from(self.phrase_terms.clone()))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,77 +1,21 @@
|
||||
use query::Scorer;
|
||||
use DocSet;
|
||||
use postings::SegmentPostings;
|
||||
use postings::Postings;
|
||||
use postings::IntersectionDocSet;
|
||||
use DocId;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use postings::{Postings, SegmentPostings};
|
||||
use query::{Intersection, Scorer};
|
||||
|
||||
struct PostingsWithOffset {
|
||||
offset: u32,
|
||||
segment_postings: SegmentPostings,
|
||||
}
|
||||
|
||||
impl PostingsWithOffset {
|
||||
pub fn new(segment_postings: SegmentPostings, offset: u32) -> PostingsWithOffset {
|
||||
PostingsWithOffset {
|
||||
offset,
|
||||
segment_postings,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Postings for PostingsWithOffset {
|
||||
fn term_freq(&self) -> u32 {
|
||||
self.segment_postings.term_freq()
|
||||
}
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
self.segment_postings.positions()
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for PostingsWithOffset {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.segment_postings.advance()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.segment_postings.skip_next(target)
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.segment_postings.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.segment_postings.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PhraseScorer {
|
||||
intersection_docset: Intersection<PostingsWithOffset>,
|
||||
pub intersection_docset: IntersectionDocSet<SegmentPostings>,
|
||||
}
|
||||
|
||||
impl PhraseScorer {
|
||||
pub fn new(term_postings: Vec<SegmentPostings>) -> PhraseScorer {
|
||||
let postings_with_offsets: Vec<_> = term_postings
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(offset, postings)| PostingsWithOffset::new(postings, offset as u32))
|
||||
.collect();
|
||||
PhraseScorer {
|
||||
intersection_docset: Intersection::from(postings_with_offsets),
|
||||
}
|
||||
}
|
||||
|
||||
fn phrase_match(&self) -> bool {
|
||||
// TODO maybe we could avoid decoding positions lazily for all terms
|
||||
// when there is > 2 terms.
|
||||
//
|
||||
// For instance for the query "A B C", the position of "C" do not need
|
||||
// to be decoded if "A B" had no match.
|
||||
let docsets = self.intersection_docset.docsets();
|
||||
let mut positions_arr: Vec<&[u32]> = vec![&[]; docsets.len()];
|
||||
for docset in docsets {
|
||||
positions_arr[docset.offset as usize] = docset.positions();
|
||||
}
|
||||
let mut positions_arr: Vec<&[u32]> = self.intersection_docset
|
||||
.docsets()
|
||||
.iter()
|
||||
.map(|posting| posting.positions())
|
||||
.collect();
|
||||
|
||||
let num_postings = positions_arr.len() as u32;
|
||||
|
||||
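
Since the diff cuts off the rest of `phrase_match`, here is a standalone sketch of the classic exact-phrase test over per-term position lists: the phrase matches if some position `p` exists such that term `i` occurs at position `p + i` for every `i`. This illustrates the idea only; it is not the body of tantivy's `phrase_match`.

// Classic exact-phrase check over sorted position lists, one list per term.
fn is_phrase_match(positions_per_term: &[&[u32]]) -> bool {
    let first = match positions_per_term.first() {
        Some(first) => *first,
        None => return false,
    };
    // try every occurrence of the first term as a potential phrase start
    first.iter().any(|&start| {
        positions_per_term
            .iter()
            .enumerate()
            .skip(1)
            .all(|(i, positions)| positions.binary_search(&(start + i as u32)).is_ok())
    })
}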
@@ -119,35 +63,17 @@ impl DocSet for PhraseScorer {
|
||||
false
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if self.intersection_docset.skip_next(target) == SkipResult::End {
|
||||
return SkipResult::End;
|
||||
}
|
||||
if self.phrase_match() {
|
||||
if self.doc() == target {
|
||||
return SkipResult::Reached;
|
||||
} else {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
}
|
||||
if self.advance() {
|
||||
SkipResult::OverStep
|
||||
} else {
|
||||
SkipResult::End
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.intersection_docset.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
fn size_hint(&self) -> usize {
|
||||
self.intersection_docset.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl Scorer for PhraseScorer {
|
||||
fn score(&mut self) -> f32 {
|
||||
fn score(&self) -> f32 {
|
||||
1f32
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ use schema::Term;
|
||||
use schema::IndexRecordOption;
|
||||
use core::SegmentReader;
|
||||
use super::PhraseScorer;
|
||||
use postings::IntersectionDocSet;
|
||||
use query::EmptyScorer;
|
||||
use Result;
|
||||
|
||||
@@ -11,32 +12,27 @@ pub struct PhraseWeight {
|
||||
phrase_terms: Vec<Term>,
|
||||
}
|
||||
|
||||
impl PhraseWeight {
|
||||
/// Creates a new phrase weight.
///
/// Right now `scoring_enabled` is actually ignored.
/// In the future, disabling scoring will result in a small performance boost.
// TODO use the scoring-disabled information to avoid computing the
// phrase freq in that case, and compute the phrase freq when scoring is enabled.
// Right now we never compute it :|
|
||||
pub fn new(phrase_terms: Vec<Term>, _scoring_enabled: bool) -> PhraseWeight {
|
||||
impl From<Vec<Term>> for PhraseWeight {
|
||||
fn from(phrase_terms: Vec<Term>) -> PhraseWeight {
|
||||
PhraseWeight { phrase_terms }
|
||||
}
|
||||
}
|
||||
|
||||
impl Weight for PhraseWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let mut term_postings_list = Vec::new();
|
||||
for term in &self.phrase_terms {
|
||||
if let Some(postings) = reader
|
||||
.inverted_index(term.field())
|
||||
.read_postings(term, IndexRecordOption::WithFreqsAndPositions)
|
||||
{
|
||||
term_postings_list.push(postings);
|
||||
let inverted_index = reader.inverted_index(term.field());
|
||||
let term_postings_option =
|
||||
inverted_index.read_postings(term, IndexRecordOption::WithFreqsAndPositions);
|
||||
if let Some(term_postings) = term_postings_option {
|
||||
term_postings_list.push(term_postings);
|
||||
} else {
|
||||
return Ok(box EmptyScorer);
|
||||
}
|
||||
}
|
||||
Ok(box PhraseScorer::new(term_postings_list))
|
||||
Ok(box PhraseScorer {
|
||||
intersection_docset: IntersectionDocSet::from(term_postings_list),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ use common::TimerTree;
|
||||
use SegmentLocalId;
|
||||
use super::Weight;
|
||||
use std::fmt;
|
||||
use std::any::Any;
|
||||
|
||||
/// The `Query` trait defines a set of documents and a scoring method
|
||||
/// for those documents.
|
||||
@@ -40,23 +41,14 @@ use std::fmt;
|
||||
/// When implementing a new type of `Query`, it is normal to implement a
|
||||
/// dedicated `Query`, `Weight` and `Scorer`.
|
||||
pub trait Query: fmt::Debug {
|
||||
/// Used to make it possible to cast Box<Query>
|
||||
/// into a specific type. This is mostly useful for unit tests.
|
||||
fn as_any(&self) -> &Any;
|
||||
|
||||
/// Create the weight associated to a query.
|
||||
///
|
||||
/// If scoring is not required, setting `scoring_enabled` to `false`
|
||||
/// can increase performances.
|
||||
///
|
||||
/// See [`Weight`](./trait.Weight.html).
|
||||
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>>;
|
||||
|
||||
/// Returns the number of documents matching the query.
|
||||
fn count(&self, searcher: &Searcher) -> Result<usize> {
|
||||
let weight = self.weight(searcher, false)?;
|
||||
let mut result = 0;
|
||||
for reader in searcher.segment_readers() {
|
||||
result += weight.count(reader)? as usize;
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>>;
|
||||
|
||||
/// Search works as follows :
|
||||
///
|
||||
@@ -69,8 +61,7 @@ pub trait Query: fmt::Debug {
|
||||
///
|
||||
fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result<TimerTree> {
|
||||
let mut timer_tree = TimerTree::default();
|
||||
let scoring_enabled = collector.requires_scoring();
|
||||
let weight = self.weight(searcher, scoring_enabled)?;
|
||||
let weight = self.weight(searcher)?;
|
||||
{
|
||||
let mut search_timer = timer_tree.open("search");
|
||||
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
|
||||
|
||||
@@ -41,10 +41,9 @@ fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where
|
||||
I: Stream<Item = char>,
|
||||
{
|
||||
(char('-'), parser(leaf))
|
||||
(char('-'), parser(literal))
|
||||
.map(|(_, expr)| UserInputAST::Not(box expr))
|
||||
.or((char('+'), parser(leaf)).map(|(_, expr)| UserInputAST::Must(box expr)))
|
||||
.or((char('('), parser(parse_to_ast), char(')')).map(|(_, expr, _)| expr))
|
||||
.or((char('+'), parser(literal)).map(|(_, expr)| UserInputAST::Must(box expr)))
|
||||
.or(parser(literal))
|
||||
.parse_stream(input)
|
||||
}
|
||||
@@ -81,15 +80,11 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_ast() {
|
||||
test_parse_query_to_ast_helper("+(a b) +d", "(+((\"a\" \"b\")) +(\"d\"))");
|
||||
test_parse_query_to_ast_helper("(+a +b) d", "((+(\"a\") +(\"b\")) \"d\")");
|
||||
test_parse_query_to_ast_helper("(+a)", "+(\"a\")");
|
||||
test_parse_query_to_ast_helper("(+a +b)", "(+(\"a\") +(\"b\"))");
|
||||
test_parse_query_to_ast_helper("abc:toto", "abc:\"toto\"");
|
||||
test_parse_query_to_ast_helper("+abc:toto", "+(abc:\"toto\")");
|
||||
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+(abc:\"toto\") -(\"titi\"))");
|
||||
test_parse_query_to_ast_helper("+abc:toto -titi", "+(abc:\"toto\") -(\"titi\")");
|
||||
test_parse_query_to_ast_helper("-abc:toto", "-(abc:\"toto\")");
|
||||
test_parse_query_to_ast_helper("abc:a b", "(abc:\"a\" \"b\")");
|
||||
test_parse_query_to_ast_helper("abc:a b", "abc:\"a\" \"b\"");
|
||||
test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
|
||||
test_is_parse_err("abc + ");
|
||||
}
|
||||
|
||||
@@ -206,10 +206,6 @@ impl QueryParser {
|
||||
))
|
||||
}
|
||||
}
|
||||
FieldType::HierarchicalFacet => {
|
||||
let term = Term::from_field_text(field, phrase);
|
||||
Ok(Some(LogicalLiteral::Term(term)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -328,11 +324,8 @@ mod test {
|
||||
use tokenizer::TokenizerManager;
|
||||
use query::Query;
|
||||
use schema::Field;
|
||||
use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
|
||||
use super::QueryParser;
|
||||
use super::QueryParserError;
|
||||
use Index;
|
||||
use tokenizer::SimpleTokenizer;
|
||||
use super::super::logical_ast::*;
|
||||
|
||||
fn make_query_parser() -> QueryParser {
|
||||
@@ -380,7 +373,7 @@ mod test {
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_nonindexed_field_yields_error() {
|
||||
let query_parser = make_query_parser();
|
||||
let query_parser = make_query_parser();
|
||||
|
||||
let is_not_indexed_err = |query: &str| {
|
||||
let result: Result<Box<Query>, QueryParserError> = query_parser.parse_query(query);
|
||||
@@ -492,73 +485,6 @@ mod test {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_query_parser_field_does_not_exist() {
|
||||
let query_parser = make_query_parser();
|
||||
assert_matches!(
|
||||
query_parser.parse_query("boujou:\"18446744073709551615\""),
|
||||
Err(QueryParserError::FieldDoesNotExist(_))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_query_parser_field_not_indexed() {
|
||||
let query_parser = make_query_parser();
|
||||
assert_matches!(
|
||||
query_parser.parse_query("notindexed_text:\"18446744073709551615\""),
|
||||
Err(QueryParserError::FieldNotIndexed(_))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_unknown_tokenizer() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("nonexistingtokenizer")
|
||||
.set_index_option(IndexRecordOption::Basic);
|
||||
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
|
||||
let title = schema_builder.add_text_field("title", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let default_fields = vec![title];
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let query_parser = QueryParser::new(schema, default_fields, tokenizer_manager);
|
||||
assert_matches!(
|
||||
query_parser.parse_query("title:\"happy tax payer\""),
|
||||
Err(QueryParserError::UnknownTokenizer(_, _))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_query_parser_from_index() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("customtokenizer")
|
||||
.set_index_option(IndexRecordOption::Basic);
|
||||
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
|
||||
let title = schema_builder.add_text_field("title", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
index
|
||||
.tokenizers()
|
||||
.register("customtokenizer", SimpleTokenizer);
|
||||
let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
assert!(query_parser.parse_query("title:\"happy tax\"").is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_query_parser_expected_int() {
|
||||
let query_parser = make_query_parser();
|
||||
assert_matches!(
|
||||
query_parser.parse_query("unsigned:18a"),
|
||||
Err(QueryParserError::ExpectedInt(_))
|
||||
);
|
||||
assert!(query_parser.parse_query("unsigned:\"18\"").is_ok());
|
||||
assert_matches!(
|
||||
query_parser.parse_query("signed:18b"),
|
||||
Err(QueryParserError::ExpectedInt(_))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_conjunction() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
|
||||
@@ -35,12 +35,10 @@ impl fmt::Debug for UserInputAST {
|
||||
if subqueries.is_empty() {
|
||||
write!(formatter, "<emptyclause>")?;
|
||||
} else {
|
||||
write!(formatter, "(")?;
|
||||
write!(formatter, "{:?}", &subqueries[0])?;
|
||||
for subquery in &subqueries[1..] {
|
||||
write!(formatter, " {:?}", subquery)?;
|
||||
}
|
||||
write!(formatter, ")")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,287 +0,0 @@
|
||||
use schema::{Field, IndexRecordOption, Term};
|
||||
use query::{Query, Scorer, Weight};
|
||||
use termdict::{TermDictionary, TermStreamer, TermStreamerBuilder};
|
||||
use core::SegmentReader;
|
||||
use common::BitSet;
|
||||
use Result;
|
||||
use core::Searcher;
|
||||
use query::BitSetDocSet;
|
||||
use query::ConstScorer;
|
||||
use std::collections::Bound;
|
||||
use std::collections::range::RangeArgument;
|
||||
|
||||
fn map_bound<TFrom, Transform: Fn(TFrom) -> Vec<u8>>(
|
||||
bound: Bound<TFrom>,
|
||||
transform: &Transform,
|
||||
) -> Bound<Vec<u8>> {
|
||||
use self::Bound::*;
|
||||
match bound {
|
||||
Excluded(from_val) => Excluded(transform(from_val)),
|
||||
Included(from_val) => Included(transform(from_val)),
|
||||
Unbounded => Unbounded,
|
||||
}
|
||||
}
|
||||
|
||||
/// `RangeQuery` matches all documents that have at least one term within a defined range.
///
/// Matched documents all get a constant `Score` of one.
///
/// # Implementation
///
/// The current implementation iterates over the terms within the range
/// and inserts all of the matching documents into a `BitSet`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
///
|
||||
/// # #[macro_use]
|
||||
/// # extern crate tantivy;
|
||||
/// # use tantivy::Index;
|
||||
/// # use tantivy::schema::{SchemaBuilder, INT_INDEXED};
|
||||
/// # use tantivy::collector::CountCollector;
|
||||
/// # use tantivy::query::Query;
|
||||
/// # use tantivy::Result;
|
||||
/// # use tantivy::query::RangeQuery;
|
||||
/// #
|
||||
/// # fn run() -> Result<()> {
|
||||
/// # let mut schema_builder = SchemaBuilder::new();
|
||||
/// # let year_field = schema_builder.add_u64_field("year", INT_INDEXED);
|
||||
/// # let schema = schema_builder.build();
|
||||
/// #
|
||||
/// # let index = Index::create_in_ram(schema);
|
||||
/// # {
|
||||
/// # let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap();
|
||||
/// # for year in 1950u64..2017u64 {
|
||||
/// # let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
|
||||
/// # for _ in 0..num_docs_within_year {
|
||||
/// # index_writer.add_document(doc!(year_field => year));
|
||||
/// # }
|
||||
/// # }
|
||||
/// # index_writer.commit().unwrap();
|
||||
/// # }
|
||||
/// # index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970);
|
||||
///
|
||||
/// // ... or `1960..=1969` if inclusive range is enabled.
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// docs_in_the_sixties.search(&*searcher, &mut count_collector)?;
|
||||
///
|
||||
/// let num_60s_books = count_collector.count();
|
||||
///
|
||||
/// # assert_eq!(num_60s_books, 2285);
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// #
|
||||
/// # fn main() {
|
||||
/// # run().unwrap()
|
||||
/// # }
|
||||
/// ```
|
||||
#[derive(Debug)]
|
||||
pub struct RangeQuery {
|
||||
field: Field,
|
||||
left_bound: Bound<Vec<u8>>,
|
||||
right_bound: Bound<Vec<u8>>,
|
||||
}
|
||||
|
||||
impl RangeQuery {
|
||||
/// Create a new `RangeQuery` over a `i64` field.
|
||||
pub fn new_i64<TRangeArgument: RangeArgument<i64>>(
|
||||
field: Field,
|
||||
range: TRangeArgument,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &i64| Term::from_field_i64(field, *val).value_bytes().to_owned();
|
||||
RangeQuery {
|
||||
field,
|
||||
left_bound: map_bound(range.start(), &make_term_val),
|
||||
right_bound: map_bound(range.end(), &make_term_val),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `u64` field.
|
||||
pub fn new_u64<TRangeArgument: RangeArgument<u64>>(
|
||||
field: Field,
|
||||
range: TRangeArgument,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &u64| Term::from_field_u64(field, *val).value_bytes().to_owned();
|
||||
RangeQuery {
|
||||
field,
|
||||
left_bound: map_bound(range.start(), &make_term_val),
|
||||
right_bound: map_bound(range.end(), &make_term_val),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `Str` field.
|
||||
pub fn new_str<'b, TRangeArgument: RangeArgument<&'b str>>(
|
||||
field: Field,
|
||||
range: TRangeArgument,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &&str| val.as_bytes().to_vec();
|
||||
RangeQuery {
|
||||
field,
|
||||
left_bound: map_bound(range.start(), &make_term_val),
|
||||
right_bound: map_bound(range.end(), &make_term_val),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for RangeQuery {
|
||||
fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<Weight>> {
|
||||
Ok(box RangeWeight {
|
||||
field: self.field,
|
||||
left_bound: self.left_bound.clone(),
|
||||
right_bound: self.right_bound.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RangeWeight {
|
||||
field: Field,
|
||||
left_bound: Bound<Vec<u8>>,
|
||||
right_bound: Bound<Vec<u8>>,
|
||||
}
|
||||
|
||||
impl RangeWeight {
|
||||
fn term_range<'a, T>(&self, term_dict: &'a T) -> T::Streamer
|
||||
where
|
||||
T: TermDictionary<'a> + 'a,
|
||||
{
|
||||
use std::collections::Bound::*;
|
||||
let mut term_stream_builder = term_dict.range();
|
||||
term_stream_builder = match self.left_bound {
|
||||
Included(ref term_val) => term_stream_builder.ge(term_val),
|
||||
Excluded(ref term_val) => term_stream_builder.gt(term_val),
|
||||
Unbounded => term_stream_builder,
|
||||
};
|
||||
term_stream_builder = match self.right_bound {
|
||||
Included(ref term_val) => term_stream_builder.le(term_val),
|
||||
Excluded(ref term_val) => term_stream_builder.lt(term_val),
|
||||
Unbounded => term_stream_builder,
|
||||
};
|
||||
term_stream_builder.into_stream()
|
||||
}
|
||||
}
|
||||
|
||||
impl Weight for RangeWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
let max_doc = reader.max_doc();
|
||||
let mut doc_bitset = BitSet::with_max_value(max_doc);
|
||||
|
||||
let inverted_index = reader.inverted_index(self.field);
|
||||
let term_dict = inverted_index.terms();
|
||||
let mut term_range = self.term_range(term_dict);
|
||||
while term_range.advance() {
|
||||
let term_info = term_range.value();
|
||||
let mut block_segment_postings = inverted_index
|
||||
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
|
||||
while block_segment_postings.advance() {
|
||||
for &doc in block_segment_postings.docs() {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
let doc_bitset = BitSetDocSet::from(doc_bitset);
|
||||
Ok(box ConstScorer::new(doc_bitset))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use Index;
|
||||
use schema::{Document, Field, SchemaBuilder, INT_INDEXED};
|
||||
use collector::CountCollector;
|
||||
use std::collections::Bound;
|
||||
use query::Query;
|
||||
use Result;
|
||||
use super::RangeQuery;
|
||||
|
||||
#[test]
|
||||
fn test_range_query_simple() {
|
||||
fn run() -> Result<()> {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let year_field = schema_builder.add_u64_field("year", INT_INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap();
|
||||
for year in 1950u64..2017u64 {
|
||||
let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
|
||||
for _ in 0..num_docs_within_year {
|
||||
index_writer.add_document(doc!(year_field => year));
|
||||
}
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
|
||||
let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960u64..1970u64);
|
||||
|
||||
// ... or `1960..=1969` if inclusive range is enabled.
|
||||
let mut count_collector = CountCollector::default();
|
||||
docs_in_the_sixties.search(&*searcher, &mut count_collector)?;
|
||||
assert_eq!(count_collector.count(), 2285);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
run().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_range_query() {
|
||||
let int_field: Field;
|
||||
let schema = {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
int_field = schema_builder.add_i64_field("intfield", INT_INDEXED);
|
||||
schema_builder.build()
|
||||
};
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
|
||||
|
||||
for i in 1..100 {
|
||||
let mut doc = Document::new();
|
||||
for j in 1..100 {
|
||||
if i % j == 0 {
|
||||
doc.add_i64(int_field, j as i64);
|
||||
}
|
||||
}
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let count_multiples = |range_query: RangeQuery| {
|
||||
let mut count_collector = CountCollector::default();
|
||||
range_query
|
||||
.search(&*searcher, &mut count_collector)
|
||||
.unwrap();
|
||||
count_collector.count()
|
||||
};
|
||||
|
||||
assert_eq!(count_multiples(RangeQuery::new_i64(int_field, 10..11)), 9);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_i64(
|
||||
int_field,
|
||||
(Bound::Included(10), Bound::Included(11))
|
||||
)),
|
||||
18
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_i64(
|
||||
int_field,
|
||||
(Bound::Excluded(9), Bound::Included(10))
|
||||
)),
|
||||
9
|
||||
);
|
||||
assert_eq!(count_multiples(RangeQuery::new_i64(int_field, 9..)), 91);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,194 +0,0 @@
|
||||
use DocId;
|
||||
use query::Scorer;
|
||||
use query::score_combiner::ScoreCombiner;
|
||||
use Score;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use std::cmp::Ordering;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
/// Given a required scorer and an optional scorer,
/// matches all documents from the required scorer
/// and complements the score using the optional scorer.
|
||||
///
|
||||
/// This is useful for queries like `+somethingrequired somethingoptional`.
|
||||
///
|
||||
/// Note that `somethingoptional` has no impact on the `DocSet`.
|
||||
pub struct RequiredOptionalScorer<TReqScorer, TOptScorer, TScoreCombiner> {
|
||||
req_scorer: TReqScorer,
|
||||
opt_scorer: TOptScorer,
|
||||
score_cache: Option<Score>,
|
||||
opt_finished: bool,
|
||||
_phantom: PhantomData<TScoreCombiner>,
|
||||
}
|
||||
|
||||
impl<TReqScorer, TOptScorer, TScoreCombiner>
|
||||
RequiredOptionalScorer<TReqScorer, TOptScorer, TScoreCombiner>
|
||||
where
|
||||
TOptScorer: DocSet,
|
||||
{
|
||||
/// Creates a new `RequiredOptionalScorer`.
|
||||
pub fn new(
|
||||
req_scorer: TReqScorer,
|
||||
mut opt_scorer: TOptScorer,
|
||||
) -> RequiredOptionalScorer<TReqScorer, TOptScorer, TScoreCombiner> {
|
||||
let opt_finished = !opt_scorer.advance();
|
||||
RequiredOptionalScorer {
|
||||
req_scorer,
|
||||
opt_scorer,
|
||||
score_cache: None,
|
||||
opt_finished,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TReqScorer, TOptScorer, TScoreCombiner> DocSet
|
||||
for RequiredOptionalScorer<TReqScorer, TOptScorer, TScoreCombiner>
|
||||
where
|
||||
TReqScorer: DocSet,
|
||||
TOptScorer: DocSet,
|
||||
{
|
||||
fn advance(&mut self) -> bool {
|
||||
self.score_cache = None;
|
||||
self.req_scorer.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.req_scorer.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.req_scorer.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TReqScorer, TOptScorer, TScoreCombiner> Scorer
|
||||
for RequiredOptionalScorer<TReqScorer, TOptScorer, TScoreCombiner>
|
||||
where
|
||||
TReqScorer: Scorer,
|
||||
TOptScorer: Scorer,
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
{
|
||||
fn score(&mut self) -> Score {
|
||||
if let Some(score) = self.score_cache {
|
||||
return score;
|
||||
}
|
||||
let doc = self.doc();
|
||||
let mut score_combiner = TScoreCombiner::default();
|
||||
score_combiner.update(&mut self.req_scorer);
|
||||
if !self.opt_finished {
|
||||
match self.opt_scorer.doc().cmp(&doc) {
|
||||
Ordering::Greater => {}
|
||||
Ordering::Equal => {
|
||||
score_combiner.update(&mut self.opt_scorer);
|
||||
}
|
||||
Ordering::Less => match self.opt_scorer.skip_next(doc) {
|
||||
SkipResult::Reached => {
|
||||
score_combiner.update(&mut self.opt_scorer);
|
||||
}
|
||||
SkipResult::End => {
|
||||
self.opt_finished = true;
|
||||
}
|
||||
SkipResult::OverStep => {}
|
||||
},
|
||||
}
|
||||
}
|
||||
let score = score_combiner.score();
|
||||
self.score_cache = Some(score);
|
||||
score
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tests::sample_with_seed;
|
||||
use super::RequiredOptionalScorer;
|
||||
use query::VecDocSet;
|
||||
use query::ConstScorer;
|
||||
use docset::DocSet;
|
||||
use postings::tests::test_skip_against_unoptimized;
|
||||
use query::Scorer;
|
||||
use query::score_combiner::{DoNothingCombiner, SumCombiner};
|
||||
|
||||
#[test]
|
||||
fn test_reqopt_scorer_empty() {
|
||||
let req = vec![1, 3, 7];
|
||||
let mut reqoptscorer: RequiredOptionalScorer<_, _, SumCombiner> =
|
||||
RequiredOptionalScorer::new(
|
||||
ConstScorer::new(VecDocSet::from(req.clone())),
|
||||
ConstScorer::new(VecDocSet::from(vec![])),
|
||||
);
|
||||
let mut docs = vec![];
|
||||
while reqoptscorer.advance() {
|
||||
docs.push(reqoptscorer.doc());
|
||||
}
|
||||
assert_eq!(docs, req);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_reqopt_scorer() {
|
||||
let mut reqoptscorer: RequiredOptionalScorer<_, _, SumCombiner> =
|
||||
RequiredOptionalScorer::new(
|
||||
ConstScorer::new(VecDocSet::from(vec![1, 3, 7, 8, 9, 10, 13, 15])),
|
||||
ConstScorer::new(VecDocSet::from(vec![1, 2, 7, 11, 12, 15])),
|
||||
);
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.doc(), 1);
|
||||
assert_eq!(reqoptscorer.score(), 2f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.doc(), 3);
|
||||
assert_eq!(reqoptscorer.score(), 1f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.doc(), 7);
|
||||
assert_eq!(reqoptscorer.score(), 2f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.doc(), 8);
|
||||
assert_eq!(reqoptscorer.score(), 1f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.doc(), 9);
|
||||
assert_eq!(reqoptscorer.score(), 1f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.doc(), 10);
|
||||
assert_eq!(reqoptscorer.score(), 1f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.doc(), 13);
|
||||
assert_eq!(reqoptscorer.score(), 1f32);
|
||||
}
|
||||
{
|
||||
assert!(reqoptscorer.advance());
|
||||
assert_eq!(reqoptscorer.doc(), 15);
|
||||
assert_eq!(reqoptscorer.score(), 2f32);
|
||||
}
|
||||
assert!(!reqoptscorer.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_reqopt_scorer_skip() {
|
||||
let req_docs = sample_with_seed(10_000, 0.02, 1);
|
||||
let opt_docs = sample_with_seed(10_000, 0.02, 2);
|
||||
let skip_docs = sample_with_seed(10_000, 0.001, 3);
|
||||
test_skip_against_unoptimized(
|
||||
|| {
|
||||
box RequiredOptionalScorer::<_, _, DoNothingCombiner>::new(
|
||||
ConstScorer::new(VecDocSet::from(req_docs.clone())),
|
||||
ConstScorer::new(VecDocSet::from(opt_docs.clone())),
|
||||
)
|
||||
},
|
||||
skip_docs,
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,80 +0,0 @@
|
||||
use Score;
|
||||
use query::Scorer;
|
||||
|
||||
/// The `ScoreCombiner` trait defines how to compute
|
||||
/// an overall score given a list of scores.
|
||||
pub trait ScoreCombiner: Default + Clone + Copy + 'static {
|
||||
/// Aggregates the score combiner with the given scorer.
|
||||
///
|
||||
/// The `ScoreCombiner` may decide to call `.scorer.score()`
|
||||
/// or not.
|
||||
fn update<TScorer: Scorer>(&mut self, scorer: &mut TScorer);
|
||||
|
||||
/// Clears the score combiner state back to its initial state.
|
||||
fn clear(&mut self);
|
||||
|
||||
/// Returns the aggregate score.
|
||||
fn score(&self) -> Score;
|
||||
}
|
||||
|
||||
/// Just ignores scores. The `DoNothingCombiner` does not
/// even call the scorer's `.score()` function.
|
||||
///
|
||||
/// It is useful to optimize the case when scoring is disabled.
|
||||
///
|
||||
#[derive(Default, Clone, Copy)] //< these should not be too much work :)
|
||||
pub struct DoNothingCombiner;
|
||||
|
||||
impl ScoreCombiner for DoNothingCombiner {
|
||||
fn update<TScorer: Scorer>(&mut self, _scorer: &mut TScorer) {}
|
||||
|
||||
fn clear(&mut self) {}
|
||||
|
||||
fn score(&self) -> Score {
|
||||
1f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Sums the score of different scorers.
|
||||
#[derive(Default, Clone, Copy)]
|
||||
pub struct SumCombiner {
|
||||
score: Score,
|
||||
}
|
||||
|
||||
impl ScoreCombiner for SumCombiner {
|
||||
fn update<TScorer: Scorer>(&mut self, scorer: &mut TScorer) {
|
||||
self.score += scorer.score();
|
||||
}
|
||||
|
||||
fn clear(&mut self) {
|
||||
self.score = 0f32;
|
||||
}
|
||||
|
||||
fn score(&self) -> Score {
|
||||
self.score
|
||||
}
|
||||
}
|
||||
|
||||
/// Sums the score of different scorers and keeps the count
|
||||
/// of scorers which matched.
|
||||
#[derive(Default, Clone, Copy)]
|
||||
pub struct SumWithCoordsCombiner {
|
||||
num_fields: usize,
|
||||
score: Score,
|
||||
}
|
||||
|
||||
impl ScoreCombiner for SumWithCoordsCombiner {
|
||||
fn update<TScorer: Scorer>(&mut self, scorer: &mut TScorer) {
|
||||
self.score += scorer.score();
|
||||
self.num_fields += 1;
|
||||
}
|
||||
|
||||
fn clear(&mut self) {
|
||||
self.score = 0f32;
|
||||
self.num_fields = 0;
|
||||
}
|
||||
|
||||
fn score(&self) -> Score {
|
||||
self.score
|
||||
}
|
||||
}
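
A hedged sketch of how a `ScoreCombiner` is typically driven: each clause scorer positioned on the current document is fed to `update`, and `score` yields the aggregate. The helper below is illustrative only, not part of the crate.

// Fold several clause scorers (all positioned on the same document) into one
// score through a ScoreCombiner. Illustrative helper, not crate API.
fn combined_score<TScorer: Scorer, TCombiner: ScoreCombiner>(scorers: &mut [TScorer]) -> Score {
    let mut combiner = TCombiner::default();
    for scorer in scorers.iter_mut() {
        combiner.update(scorer);
    }
    combiner.score()
}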
|
||||
@@ -1,19 +1,17 @@
|
||||
use DocSet;
|
||||
use DocId;
|
||||
use Score;
|
||||
use collector::Collector;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use common::BitSet;
|
||||
use std::ops::DerefMut;
|
||||
use downcast;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
/// Scored set of documents matching a query within a specific segment.
|
||||
///
|
||||
/// See [`Query`](./trait.Query.html).
|
||||
pub trait Scorer: downcast::Any + DocSet + 'static {
|
||||
pub trait Scorer: DocSet {
|
||||
/// Returns the score.
|
||||
///
|
||||
/// This method will perform a bit of computation and is not cached.
|
||||
fn score(&mut self) -> Score;
|
||||
fn score(&self) -> Score;
|
||||
|
||||
/// Consumes the complete `DocSet` and
|
||||
/// push the scored documents to the collector.
|
||||
@@ -24,19 +22,16 @@ pub trait Scorer: downcast::Any + DocSet + 'static {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(missing_docs)]
|
||||
mod downcast_impl {
|
||||
downcast!(super::Scorer);
|
||||
}
|
||||
|
||||
impl Scorer for Box<Scorer> {
|
||||
fn score(&mut self) -> Score {
|
||||
self.deref_mut().score()
|
||||
impl<'a> Scorer for Box<Scorer + 'a> {
|
||||
fn score(&self) -> Score {
|
||||
self.deref().score()
|
||||
}
|
||||
|
||||
fn collect(&mut self, collector: &mut Collector) {
|
||||
let scorer = self.deref_mut();
|
||||
scorer.collect(collector);
|
||||
while scorer.advance() {
|
||||
collector.collect(scorer.doc(), scorer.score());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,95 +46,16 @@ impl DocSet for EmptyScorer {
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
panic!(
|
||||
"You may not call .doc() on a scorer \
|
||||
where the last call to advance() did not return true."
|
||||
);
|
||||
DocId::max_value()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
fn size_hint(&self) -> usize {
|
||||
0
|
||||
}
|
||||
}
|
||||
|
||||
impl Scorer for EmptyScorer {
|
||||
fn score(&mut self) -> Score {
|
||||
fn score(&self) -> Score {
|
||||
0f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Wraps a `DocSet` and simply returns a constant `Scorer`.
/// The `ConstScorer` is useful if you have a `DocSet` but
/// need a scorer.
|
||||
///
|
||||
/// The `ConstScorer`'s constant score can be set
|
||||
/// by calling `.set_score(...)`.
|
||||
pub struct ConstScorer<TDocSet: DocSet> {
|
||||
docset: TDocSet,
|
||||
score: Score,
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> ConstScorer<TDocSet> {
|
||||
/// Creates a new `ConstScorer`.
|
||||
pub fn new(docset: TDocSet) -> ConstScorer<TDocSet> {
|
||||
ConstScorer {
|
||||
docset,
|
||||
score: 1f32,
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the constant score to a different value.
|
||||
pub fn set_score(&mut self, score: Score) {
|
||||
self.score = score;
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.docset.advance()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.docset.skip_next(target)
|
||||
}
|
||||
|
||||
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
|
||||
self.docset.fill_buffer(buffer)
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.docset.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docset.size_hint()
|
||||
}
|
||||
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
self.docset.append_to_bitset(bitset);
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet + 'static> Scorer for ConstScorer<TDocSet> {
|
||||
fn score(&mut self) -> Score {
|
||||
1f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::EmptyScorer;
|
||||
use DocSet;
|
||||
|
||||
#[test]
|
||||
fn test_empty_scorer() {
|
||||
let mut empty_scorer = EmptyScorer;
|
||||
assert!(!empty_scorer.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_empty_scorer_panic_on_doc_call() {
|
||||
EmptyScorer.doc();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,10 +9,11 @@ pub use self::term_scorer::TermScorer;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use docset::DocSet;
|
||||
use postings::SegmentPostings;
|
||||
use query::{Query, Scorer};
|
||||
use postings::{DocSet, VecPostings};
|
||||
use query::Scorer;
|
||||
use query::term_query::TermScorer;
|
||||
use query::Query;
|
||||
use fastfield::U64FastFieldReader;
|
||||
use query::TermQuery;
|
||||
use Index;
|
||||
use schema::*;
|
||||
@@ -45,7 +46,7 @@ mod tests {
|
||||
Term::from_field_text(text_field, "a"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let term_weight = term_query.weight(&searcher, true).unwrap();
|
||||
let term_weight = term_query.weight(&searcher).unwrap();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut term_scorer = term_weight.scorer(segment_reader).unwrap();
|
||||
assert!(term_scorer.advance());
|
||||
@@ -55,10 +56,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
pub fn test_term_scorer() {
|
||||
let left_fieldnorms = FastFieldReader::from(vec![10, 4]);
|
||||
let left_fieldnorms = U64FastFieldReader::from(vec![10, 4]);
|
||||
assert_eq!(left_fieldnorms.get(0), 10);
|
||||
assert_eq!(left_fieldnorms.get(1), 4);
|
||||
let left = SegmentPostings::create_from_docs(&[1]);
|
||||
let left = VecPostings::from(vec![1]);
|
||||
let mut left_scorer = TermScorer {
|
||||
idf: 0.30685282,
|
||||
fieldnorm_reader_opt: Some(left_fieldnorms),
|
||||
|
||||
@@ -5,6 +5,7 @@ use query::Query;
|
||||
use query::Weight;
|
||||
use schema::IndexRecordOption;
|
||||
use Searcher;
|
||||
use std::any::Any;
|
||||
|
||||
/// A Term query matches all of the documents
|
||||
/// containing a specific term.
|
||||
@@ -35,23 +36,22 @@ impl TermQuery {
|
||||
/// While `.weight(...)` returns a boxed trait object,
|
||||
/// this method returns a specific implementation.
|
||||
/// This is useful for optimization purposes.
|
||||
pub fn specialized_weight(&self, searcher: &Searcher, scoring_enabled: bool) -> TermWeight {
|
||||
let index_record_option = if scoring_enabled {
|
||||
self.index_record_option
|
||||
} else {
|
||||
IndexRecordOption::Basic
|
||||
};
|
||||
pub fn specialized_weight(&self, searcher: &Searcher) -> TermWeight {
|
||||
TermWeight {
|
||||
num_docs: searcher.num_docs(),
|
||||
doc_freq: searcher.doc_freq(&self.term),
|
||||
term: self.term.clone(),
|
||||
index_record_option,
|
||||
index_record_option: self.index_record_option,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for TermQuery {
|
||||
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
|
||||
Ok(box self.specialized_weight(searcher, scoring_enabled))
|
||||
fn as_any(&self) -> &Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
Ok(box self.specialized_weight(searcher))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,24 +1,33 @@
|
||||
use Score;
|
||||
use DocId;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use postings::SegmentPostings;
|
||||
use fastfield::U64FastFieldReader;
|
||||
use postings::DocSet;
|
||||
use query::Scorer;
|
||||
use postings::Postings;
|
||||
use fastfield::FastFieldReader;
|
||||
|
||||
pub struct TermScorer {
|
||||
pub struct TermScorer<TPostings>
|
||||
where
|
||||
TPostings: Postings,
|
||||
{
|
||||
pub idf: Score,
|
||||
pub fieldnorm_reader_opt: Option<FastFieldReader<u64>>,
|
||||
pub postings: SegmentPostings,
|
||||
pub fieldnorm_reader_opt: Option<U64FastFieldReader>,
|
||||
pub postings: TPostings,
|
||||
}
|
||||
|
||||
impl TermScorer {
|
||||
pub fn postings(&self) -> &SegmentPostings {
|
||||
impl<TPostings> TermScorer<TPostings>
|
||||
where
|
||||
TPostings: Postings,
|
||||
{
|
||||
pub fn postings(&self) -> &TPostings {
|
||||
&self.postings
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for TermScorer {
|
||||
impl<TPostings> DocSet for TermScorer<TPostings>
|
||||
where
|
||||
TPostings: Postings,
|
||||
{
|
||||
fn advance(&mut self) -> bool {
|
||||
self.postings.advance()
|
||||
}
|
||||
@@ -27,17 +36,16 @@ impl DocSet for TermScorer {
|
||||
self.postings.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
fn size_hint(&self) -> usize {
|
||||
self.postings.size_hint()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.postings.skip_next(target)
|
||||
}
|
||||
}
|
||||
|
||||
impl Scorer for TermScorer {
|
||||
fn score(&mut self) -> Score {
|
||||
impl<TPostings> Scorer for TermScorer<TPostings>
|
||||
where
|
||||
TPostings: Postings,
|
||||
{
|
||||
fn score(&self) -> Score {
|
||||
let doc = self.postings.doc();
|
||||
let tf = match self.fieldnorm_reader_opt {
|
||||
Some(ref fieldnorm_reader) => {
|
||||
|
||||
@@ -2,7 +2,6 @@ use Term;
|
||||
use query::Weight;
|
||||
use core::SegmentReader;
|
||||
use query::Scorer;
|
||||
use docset::DocSet;
|
||||
use postings::SegmentPostings;
|
||||
use schema::IndexRecordOption;
|
||||
use super::term_scorer::TermScorer;
|
||||
@@ -16,23 +15,10 @@ pub struct TermWeight {
|
||||
}
|
||||
|
||||
impl Weight for TermWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let specialized_scorer = self.specialized_scorer(reader)?;
|
||||
Ok(box specialized_scorer)
|
||||
}
|
||||
|
||||
fn count(&self, reader: &SegmentReader) -> Result<u32> {
|
||||
if reader.num_deleted_docs() == 0 {
|
||||
let field = self.term.field();
|
||||
Ok(reader
|
||||
.inverted_index(field)
|
||||
.get_term_info(&self.term)
|
||||
.map(|term_info| term_info.doc_freq)
|
||||
.unwrap_or(0))
|
||||
} else {
|
||||
Ok(self.specialized_scorer(reader)?.count())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TermWeight {
|
||||
@@ -41,7 +27,10 @@ impl TermWeight {
|
||||
}
|
||||
|
||||
/// If the field is not found, returns an empty `DocSet`.
|
||||
pub fn specialized_scorer(&self, reader: &SegmentReader) -> Result<TermScorer> {
|
||||
pub fn specialized_scorer(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<TermScorer<SegmentPostings>> {
|
||||
let field = self.term.field();
|
||||
let inverted_index = reader.inverted_index(field);
|
||||
let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field);
|
||||
|
||||
@@ -1,428 +0,0 @@
|
||||
use docset::{DocSet, SkipResult};
|
||||
use query::Scorer;
|
||||
use common::TinySet;
|
||||
use std::cmp::Ordering;
|
||||
use DocId;
|
||||
use Score;
|
||||
use query::score_combiner::{DoNothingCombiner, ScoreCombiner};
|
||||
|
||||
const HORIZON_NUM_TINYBITSETS: usize = 64;
|
||||
const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;
|
||||
|
||||
/// Creates a `DocSet` that iterates through the union of the given `DocSet`s.
|
||||
pub struct Union<TScorer, TScoreCombiner = DoNothingCombiner> {
|
||||
docsets: Vec<TScorer>,
|
||||
bitsets: Box<[TinySet; HORIZON_NUM_TINYBITSETS]>,
|
||||
scores: Box<[TScoreCombiner; HORIZON as usize]>,
|
||||
cursor: usize,
|
||||
offset: DocId,
|
||||
doc: DocId,
|
||||
score: Score,
|
||||
}
|
||||
|
||||
impl<TScorer, TScoreCombiner> From<Vec<TScorer>> for Union<TScorer, TScoreCombiner>
|
||||
where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
TScorer: Scorer,
|
||||
{
|
||||
fn from(docsets: Vec<TScorer>) -> Union<TScorer, TScoreCombiner> {
|
||||
let non_empty_docsets: Vec<TScorer> = docsets
|
||||
.into_iter()
|
||||
.flat_map(
|
||||
|mut docset| {
|
||||
if docset.advance() {
|
||||
Some(docset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
)
|
||||
.collect();
|
||||
Union {
|
||||
docsets: non_empty_docsets,
|
||||
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
|
||||
scores: Box::new([TScoreCombiner::default(); HORIZON as usize]),
|
||||
cursor: HORIZON_NUM_TINYBITSETS,
|
||||
offset: 0,
|
||||
doc: 0,
|
||||
score: 0f32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
|
||||
scorers: &mut Vec<TScorer>,
|
||||
bitsets: &mut [TinySet; HORIZON_NUM_TINYBITSETS],
|
||||
score_combiner: &mut [TScoreCombiner; HORIZON as usize],
|
||||
min_doc: DocId,
|
||||
) {
|
||||
scorers.drain_filter(|scorer| {
|
||||
let horizon = min_doc + HORIZON as u32;
|
||||
loop {
|
||||
let doc = scorer.doc();
|
||||
if doc >= horizon {
|
||||
return false;
|
||||
}
|
||||
// add this document
|
||||
let delta = doc - min_doc;
|
||||
bitsets[(delta / 64) as usize].insert_mut(delta % 64u32);
|
||||
score_combiner[delta as usize].update(scorer);
|
||||
if !scorer.advance() {
|
||||
// remove the docset, it has been entirely consumed.
|
||||
return true;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
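The `refill` function above buffers each scorer's hits for a window of `HORIZON` (64 × 64 = 4096) documents starting at `min_doc`: the delta to `min_doc` selects one of the 64 `TinySet`s and one bit within it. A self-contained sketch of that bucket/bit arithmetic, using plain Rust and no tantivy types:

// Self-contained illustration of the delta -> (bucket, bit) mapping used above.
const HORIZON_NUM_TINYBITSETS: usize = 64;
const HORIZON: u32 = 64 * HORIZON_NUM_TINYBITSETS as u32; // 4096 docs per window

fn bucket_and_bit(min_doc: u32, doc: u32) -> Option<(usize, u32)> {
    let delta = doc.checked_sub(min_doc)?;
    if delta >= HORIZON {
        return None; // beyond the buffered window: handled by a later refill
    }
    Some(((delta / 64) as usize, delta % 64))
}

fn main() {
    // doc 4163 with min_doc 100 -> delta 4063 -> bucket 63, bit 31
    assert_eq!(bucket_and_bit(100, 4_163), Some((63, 31)));
    // A doc outside the 4096-wide window is not buffered in this pass.
    assert_eq!(bucket_and_bit(100, 100 + HORIZON), None);
}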
|
||||
|
||||
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombiner> {
|
||||
fn refill(&mut self) -> bool {
|
||||
if let Some(min_doc) = self.docsets.iter_mut().map(|docset| docset.doc()).min() {
|
||||
self.offset = min_doc;
|
||||
self.cursor = 0;
|
||||
refill(
|
||||
&mut self.docsets,
|
||||
&mut *self.bitsets,
|
||||
&mut *self.scores,
|
||||
min_doc,
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn advance_buffered(&mut self) -> bool {
|
||||
while self.cursor < HORIZON_NUM_TINYBITSETS {
|
||||
if let Some(val) = self.bitsets[self.cursor].pop_lowest() {
|
||||
let delta = val + (self.cursor as u32) * 64;
|
||||
self.doc = self.offset + delta;
|
||||
let score_combiner = &mut self.scores[delta as usize];
|
||||
self.score = score_combiner.score();
|
||||
score_combiner.clear();
|
||||
return true;
|
||||
} else {
|
||||
self.cursor += 1;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer, TScoreCombiner> DocSet for Union<TScorer, TScoreCombiner>
|
||||
where
|
||||
TScorer: Scorer,
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
{
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.advance_buffered() {
|
||||
return true;
|
||||
}
|
||||
if self.refill() {
|
||||
self.advance();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn count(&mut self) -> u32 {
|
||||
let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
|
||||
.iter()
|
||||
.map(|bitset| bitset.len())
|
||||
.sum::<u32>();
|
||||
for bitset in self.bitsets.iter_mut() {
|
||||
bitset.clear();
|
||||
}
|
||||
while self.refill() {
|
||||
count += self.bitsets.iter().map(|bitset| bitset.len()).sum::<u32>();
|
||||
for bitset in self.bitsets.iter_mut() {
|
||||
bitset.clear();
|
||||
}
|
||||
}
|
||||
self.cursor = HORIZON_NUM_TINYBITSETS;
|
||||
count
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
match self.doc.cmp(&target) {
|
||||
Ordering::Equal => {
|
||||
return SkipResult::Reached;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
Ordering::Less => {}
|
||||
}
|
||||
let gap = target - self.offset;
|
||||
if gap < HORIZON {
|
||||
// Our value is within the buffered horizon.
|
||||
|
||||
// Skipping to corresponding bucket.
|
||||
let new_cursor = gap as usize / 64;
|
||||
for obsolete_tinyset in &mut self.bitsets[self.cursor..new_cursor] {
|
||||
obsolete_tinyset.clear();
|
||||
}
|
||||
for score_combiner in &mut self.scores[self.cursor * 64..new_cursor * 64] {
|
||||
score_combiner.clear();
|
||||
}
|
||||
self.cursor = new_cursor;
|
||||
|
||||
// Advancing until we reach the end of the bucket
|
||||
// or we reach a doc greater than or equal to the target.
|
||||
while self.advance() {
|
||||
match self.doc().cmp(&target) {
|
||||
Ordering::Equal => {
|
||||
return SkipResult::Reached;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
Ordering::Less => {}
|
||||
}
|
||||
}
|
||||
SkipResult::End
|
||||
} else {
|
||||
// clear the buffered info.
|
||||
for obsolete_tinyset in self.bitsets.iter_mut() {
|
||||
*obsolete_tinyset = TinySet::empty();
|
||||
}
|
||||
for score_combiner in self.scores.iter_mut() {
|
||||
score_combiner.clear();
|
||||
}
|
||||
|
||||
// The target is outside of the buffered horizon.
|
||||
// advance all docsets to a doc >= the target.
|
||||
self.docsets
|
||||
.drain_filter(|docset| match docset.doc().cmp(&target) {
|
||||
Ordering::Less => match docset.skip_next(target) {
|
||||
SkipResult::End => true,
|
||||
SkipResult::Reached | SkipResult::OverStep => false,
|
||||
},
|
||||
Ordering::Equal | Ordering::Greater => false,
|
||||
});
|
||||
|
||||
// at this point all of the docsets
|
||||
// are positioned on a doc >= the target.
|
||||
if self.refill() {
|
||||
self.advance();
|
||||
if self.doc() == target {
|
||||
SkipResult::Reached
|
||||
} else {
|
||||
debug_assert!(self.doc() > target);
|
||||
SkipResult::OverStep
|
||||
}
|
||||
} else {
|
||||
SkipResult::End
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
0u32
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer, TScoreCombiner> Scorer for Union<TScorer, TScoreCombiner>
|
||||
where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
TScorer: Scorer,
|
||||
{
|
||||
fn score(&mut self) -> Score {
|
||||
self.score
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::Union;
|
||||
use tests;
|
||||
use test::Bencher;
|
||||
use DocId;
|
||||
use std::collections::BTreeSet;
|
||||
use super::HORIZON;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use postings::tests::test_skip_against_unoptimized;
|
||||
use query::VecDocSet;
|
||||
use query::ConstScorer;
|
||||
use query::score_combiner::DoNothingCombiner;
|
||||
|
||||
fn aux_test_union(vals: Vec<Vec<u32>>) {
|
||||
let mut val_set: BTreeSet<u32> = BTreeSet::new();
|
||||
for vs in &vals {
|
||||
for &v in vs {
|
||||
val_set.insert(v);
|
||||
}
|
||||
}
|
||||
let union_vals: Vec<u32> = val_set.into_iter().collect();
|
||||
let mut union_expected = VecDocSet::from(union_vals);
|
||||
let make_union = || {
|
||||
Union::from(
|
||||
vals.iter()
|
||||
.cloned()
|
||||
.map(VecDocSet::from)
|
||||
.map(ConstScorer::new)
|
||||
.collect::<Vec<ConstScorer<VecDocSet>>>(),
|
||||
)
|
||||
};
|
||||
let mut union: Union<_, DoNothingCombiner> = make_union();
|
||||
let mut count = 0;
|
||||
while union.advance() {
|
||||
assert!(union_expected.advance());
|
||||
assert_eq!(union_expected.doc(), union.doc());
|
||||
count += 1;
|
||||
}
|
||||
assert!(!union_expected.advance());
|
||||
assert_eq!(count, make_union().count());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union() {
|
||||
aux_test_union(vec![
|
||||
vec![1, 3333, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![],
|
||||
]);
|
||||
aux_test_union(vec![
|
||||
vec![1, 3333, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![],
|
||||
]);
|
||||
aux_test_union(vec![
|
||||
tests::sample_with_seed(100_000, 0.01, 1),
|
||||
tests::sample_with_seed(100_000, 0.05, 2),
|
||||
tests::sample_with_seed(100_000, 0.001, 3),
|
||||
]);
|
||||
}
|
||||
|
||||
fn test_aux_union_skip(docs_list: &[Vec<DocId>], skip_targets: Vec<DocId>) {
|
||||
let mut btree_set = BTreeSet::new();
|
||||
for docs in docs_list {
|
||||
for &doc in docs.iter() {
|
||||
btree_set.insert(doc);
|
||||
}
|
||||
}
|
||||
let docset_factory = || {
|
||||
let res: Box<DocSet> = box Union::<_, DoNothingCombiner>::from(
|
||||
docs_list
|
||||
.iter()
|
||||
.map(|docs| docs.clone())
|
||||
.map(VecDocSet::from)
|
||||
.map(ConstScorer::new)
|
||||
.collect::<Vec<_>>(),
|
||||
);
|
||||
res
|
||||
};
|
||||
let mut docset = docset_factory();
|
||||
for el in btree_set {
|
||||
assert!(docset.advance());
|
||||
assert_eq!(el, docset.doc());
|
||||
}
|
||||
assert!(!docset.advance());
|
||||
test_skip_against_unoptimized(docset_factory, skip_targets);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case() {
|
||||
test_aux_union_skip(&[vec![165132, 167382], vec![25029, 25091]], vec![25029]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case2() {
|
||||
test_aux_union_skip(
|
||||
&[vec![1u32, 1u32 + HORIZON], vec![2u32, 1000u32, 10_000u32]],
|
||||
vec![0u32, 1u32, 2u32, 3u32, 1u32 + HORIZON, 2u32 + HORIZON],
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case3() {
|
||||
let mut docset = Union::<_, DoNothingCombiner>::from(vec![
|
||||
ConstScorer::new(VecDocSet::from(vec![0u32, 5u32])),
|
||||
ConstScorer::new(VecDocSet::from(vec![1u32, 4u32])),
|
||||
]);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.doc(), 0u32);
|
||||
assert_eq!(docset.skip_next(0u32), SkipResult::OverStep);
|
||||
assert_eq!(docset.doc(), 1u32)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_random() {
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
vec![1, 2, 3, 7],
|
||||
vec![1, 3, 9, 10000],
|
||||
vec![1, 3, 8, 9, 100],
|
||||
],
|
||||
vec![1, 2, 3, 5, 6, 7, 8, 100],
|
||||
);
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
tests::sample_with_seed(100_000, 0.001, 1),
|
||||
tests::sample_with_seed(100_000, 0.002, 2),
|
||||
tests::sample_with_seed(100_000, 0.005, 3),
|
||||
],
|
||||
tests::sample_with_seed(100_000, 0.01, 4),
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_specific() {
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
vec![1, 2, 3, 7],
|
||||
vec![1, 3, 9, 10000],
|
||||
vec![1, 3, 8, 9, 100],
|
||||
],
|
||||
vec![1, 2, 3, 7, 8, 9, 99, 100, 101, 500, 20000],
|
||||
);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_union_3_high(bench: &mut Bencher) {
|
||||
let union_docset: Vec<Vec<DocId>> = vec![
|
||||
tests::sample_with_seed(100_000, 0.1, 0),
|
||||
tests::sample_with_seed(100_000, 0.2, 1),
|
||||
];
|
||||
bench.iter(|| {
|
||||
let mut v = Union::<_, DoNothingCombiner>::from(
|
||||
union_docset
|
||||
.iter()
|
||||
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
|
||||
.map(ConstScorer::new)
|
||||
.collect::<Vec<_>>(),
|
||||
);
|
||||
while v.advance() {}
|
||||
});
|
||||
}
|
||||
#[bench]
|
||||
fn bench_union_3_low(bench: &mut Bencher) {
|
||||
let union_docset: Vec<Vec<DocId>> = vec![
|
||||
tests::sample_with_seed(100_000, 0.01, 0),
|
||||
tests::sample_with_seed(100_000, 0.05, 1),
|
||||
tests::sample_with_seed(100_000, 0.001, 2),
|
||||
];
|
||||
bench.iter(|| {
|
||||
let mut v = Union::<_, DoNothingCombiner>::from(
|
||||
union_docset
|
||||
.iter()
|
||||
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
|
||||
.map(ConstScorer::new)
|
||||
.collect::<Vec<_>>(),
|
||||
);
|
||||
while v.advance() {}
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
@@ -9,10 +9,5 @@ use core::SegmentReader;
|
||||
pub trait Weight {
|
||||
/// Returns the scorer for the given segment.
|
||||
/// See [`Query`](./trait.Query.html).
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;
|
||||
|
||||
/// Returns the number of documents within the given `SegmentReader`.
|
||||
fn count(&self, reader: &SegmentReader) -> Result<u32> {
|
||||
Ok(self.scorer(reader)?.count())
|
||||
}
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>>;
|
||||
}
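For orientation, a hedged sketch of how a `Weight` is driven, mirroring the term-query test earlier in this diff. The two sides of this diff disagree on the exact signatures (one adds a `scoring_enabled` flag to `weight(...)` and a default `count()` method), so the calls below follow the flagged variant and are illustrative only.

use docset::DocSet;
use query::{Query, Scorer, Weight};
use Searcher;

fn count_matches(searcher: &Searcher, query: &Query) -> u32 {
    // Single segment for brevity, as in the term-query test above.
    let weight = query.weight(searcher, true).unwrap();
    let segment_reader = searcher.segment_reader(0);
    let mut scorer = weight.scorer(segment_reader).unwrap();
    let mut count = 0u32;
    while scorer.advance() {
        let _doc = scorer.doc();     // valid only after advance() returned true
        let _score = scorer.score();
        count += 1;
    }
    count
}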
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
use super::*;
|
||||
use itertools::Itertools;
|
||||
use common::VInt;
|
||||
use std::io::{self, Read, Write};
|
||||
use common::BinarySerializable;
|
||||
|
||||
/// Tantivy's Document is the object that can
|
||||
/// be indexed and then searched for.
|
||||
@@ -14,17 +11,11 @@ use common::BinarySerializable;
|
||||
|
||||
/// Documents are really just a list of `(field, value)` pairs.
|
||||
/// In this list, one field may appear more than once.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
pub struct Document {
|
||||
field_values: Vec<FieldValue>,
|
||||
}
|
||||
|
||||
impl From<Vec<FieldValue>> for Document {
|
||||
fn from(field_values: Vec<FieldValue>) -> Self {
|
||||
Document { field_values }
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for Document {
|
||||
fn eq(&self, other: &Document) -> bool {
|
||||
// super slow, but only here for tests
|
||||
@@ -54,23 +45,6 @@ impl Document {
|
||||
self.field_values.is_empty()
|
||||
}
|
||||
|
||||
/// Retain only the fields matching the
|
||||
/// given predicate.
|
||||
pub fn filter_fields<P: Fn(Field) -> bool>(&mut self, predicate: P) {
|
||||
self.field_values
|
||||
.retain(|field_value| predicate(field_value.field()));
|
||||
}
|
||||
|
||||
/// Adding a facet to the document.
|
||||
pub fn add_facet<F>(&mut self, field: Field, path: F)
|
||||
where
|
||||
Facet: From<F>,
|
||||
{
|
||||
let facet = Facet::from(path);
|
||||
let value = Value::Facet(facet);
|
||||
self.add(FieldValue::new(field, value));
|
||||
}
|
||||
|
||||
/// Add a text field.
|
||||
pub fn add_text(&mut self, field: Field, text: &str) {
|
||||
let value = Value::Str(String::from(text));
|
||||
@@ -130,22 +104,11 @@ impl Document {
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for Document {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let field_values = self.field_values();
|
||||
VInt(field_values.len() as u64).serialize(writer)?;
|
||||
for field_value in field_values {
|
||||
field_value.serialize(writer)?;
|
||||
impl From<Vec<FieldValue>> for Document {
|
||||
fn from(field_values: Vec<FieldValue>) -> Document {
|
||||
Document {
|
||||
field_values: field_values,
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let num_field_values = VInt::deserialize(reader)?.val() as usize;
|
||||
let field_values = (0..num_field_values)
|
||||
.map(|_| FieldValue::deserialize(reader))
|
||||
.collect::<io::Result<Vec<FieldValue>>>()?;
|
||||
Ok(Document::from(field_values))
|
||||
}
|
||||
}
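The `BinarySerializable` impl above writes a `VInt` count followed by each `(field, value)` pair. A round-trip sketch, assuming it runs inside the crate (the `common` module is internal) and that the usual `SchemaBuilder`/`TEXT` helpers used by the schema tests further down are available:

use std::io;
use common::BinarySerializable;
use schema::{Document, SchemaBuilder, TEXT};

fn document_roundtrip() -> io::Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();

    let mut doc = Document::default();
    doc.add_text(title, "hello tantivy");

    // Serialize into an in-memory buffer, then read the document back.
    let mut buffer: Vec<u8> = Vec::new();
    doc.serialize(&mut buffer)?;
    let decoded = Document::deserialize(&mut &buffer[..])?;
    assert_eq!(doc, decoded); // the PartialEq above is slow but fine for tests
    Ok(())
}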
|
||||
|
||||
|
||||
@@ -1,244 +0,0 @@
|
||||
use std::fmt::{self, Debug, Display, Formatter};
|
||||
use std::str;
|
||||
use std::io::{self, Read, Write};
|
||||
use regex::Regex;
|
||||
use std::borrow::Borrow;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use std::borrow::Cow;
|
||||
use common::BinarySerializable;
|
||||
|
||||
const SLASH_BYTE: u8 = b'/';
|
||||
const ESCAPE_BYTE: u8 = b'\\';
|
||||
|
||||
/// Byte used as a level separator in the binary
|
||||
/// representation of facets.
|
||||
pub const FACET_SEP_BYTE: u8 = 0u8;
|
||||
|
||||
/// A Facet represents a point in a given hierarchy.
|
||||
///
|
||||
/// They are typically represented similarly to a filepath.
|
||||
/// For instance, an e-commerce website could
|
||||
/// have a `Facet` for `/electronics/tv_and_video/led_tv`.
|
||||
///
|
||||
/// A document can be associated to any number of facets.
|
||||
/// The hierarchy implicitly implies that a document
|
||||
/// belonging to a facet also belongs to the ancestors of
|
||||
/// its facet. In the example above, `/electronics/tv_and_video/`
|
||||
/// and `/electronics`.
|
||||
#[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
|
||||
pub struct Facet(Vec<u8>);
|
||||
|
||||
impl Facet {
|
||||
/// Returns a new instance of the "root facet"
|
||||
/// Equivalent to `/`.
|
||||
pub fn root() -> Facet {
|
||||
Facet(vec![])
|
||||
}
|
||||
|
||||
/// Returns true iff the facet is the root facet `/`.
|
||||
pub fn is_root(&self) -> bool {
|
||||
self.encoded_bytes().is_empty()
|
||||
}
|
||||
|
||||
/// Returns a binary representation of the facet.
|
||||
///
|
||||
/// In this representation, `0u8` is used as a separator
|
||||
/// and the string parts of the facet are unescaped.
|
||||
/// (The first `/` is not encoded at all).
|
||||
///
|
||||
/// This representation has the benefit of making it possible to
|
||||
/// express "being a child of a given facet" as a range over
|
||||
/// the term ordinals.
|
||||
pub fn encoded_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
|
||||
/// Creates a `Facet` from its binary representation.
|
||||
pub(crate) fn from_encoded(encoded_bytes: Vec<u8>) -> Facet {
|
||||
Facet(encoded_bytes)
|
||||
}
|
||||
|
||||
/// Parse a text representation of a facet.
|
||||
///
|
||||
/// Conceptually, if one of the steps of this path
|
||||
/// contains a `/` or a `\`, it should be escaped
|
||||
/// using a backslash `\`.
|
||||
pub fn from_text<T>(path: &T) -> Facet
|
||||
where
|
||||
T: ?Sized + AsRef<str>,
|
||||
{
|
||||
From::from(path)
|
||||
}
|
||||
|
||||
/// Returns a `Facet` from an iterator over the different
|
||||
/// steps of the facet path.
|
||||
///
|
||||
/// The steps are expected to be unescaped.
|
||||
pub fn from_path<Path>(path: Path) -> Facet
|
||||
where
|
||||
Path: IntoIterator,
|
||||
Path::Item: ToString,
|
||||
{
|
||||
let mut facet_bytes: Vec<u8> = Vec::with_capacity(100);
|
||||
let mut step_it = path.into_iter();
|
||||
if let Some(step) = step_it.next() {
|
||||
facet_bytes.extend_from_slice(step.to_string().as_bytes());
|
||||
}
|
||||
for step in step_it {
|
||||
facet_bytes.push(FACET_SEP_BYTE);
|
||||
facet_bytes.extend_from_slice(step.to_string().as_bytes());
|
||||
}
|
||||
Facet(facet_bytes)
|
||||
}
|
||||
|
||||
/// Accessor for the inner buffer of the `Facet`.
|
||||
pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec<u8> {
|
||||
&mut self.0
|
||||
}
|
||||
|
||||
/// Returns `true` iff other is a subfacet of `self`.
|
||||
#[allow(collapsible_if)]
|
||||
pub fn is_prefix_of(&self, other: &Facet) -> bool {
|
||||
let self_bytes: &[u8] = self.encoded_bytes();
|
||||
let other_bytes: &[u8] = other.encoded_bytes();
|
||||
if self_bytes.len() < other_bytes.len() {
|
||||
if other_bytes.starts_with(self_bytes) {
|
||||
return other_bytes[self_bytes.len()] == 0u8;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
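Tying the pieces above together: in the text form, `/` separates steps and `\` escapes; in the encoded form, unescaped steps are joined by a `0u8` byte with no leading separator, which is what makes `is_prefix_of` a cheap byte-prefix check. A small sketch using only the methods shown above:

use schema::Facet;

fn facet_demo() {
    // Text form -> encoded form: the leading '/' is dropped and each
    // separator '/' becomes a 0u8 byte between unescaped steps.
    let facet = Facet::from("/electronics/tv_and_video");
    assert_eq!(facet.encoded_bytes(), &b"electronics\0tv_and_video"[..]);

    // An ancestor facet is a byte prefix followed by the 0u8 separator.
    let parent = Facet::from("/electronics");
    assert!(parent.is_prefix_of(&facet));
    assert!(!facet.is_prefix_of(&parent));

    // Display re-escapes a '/' occurring inside a step with a backslash.
    let tricky = Facet::from_path(vec!["first", "sec/ond"]);
    assert_eq!(format!("{}", tricky), "/first/sec\\/ond");
}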
|
||||
|
||||
impl Borrow<[u8]> for Facet {
|
||||
fn borrow(&self) -> &[u8] {
|
||||
self.encoded_bytes()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
|
||||
fn from(path_asref: &'a T) -> Facet {
|
||||
#[derive(Copy, Clone)]
|
||||
enum State {
|
||||
Escaped,
|
||||
Idle,
|
||||
}
|
||||
let path: &str = path_asref.as_ref();
|
||||
let mut facet_encoded = Vec::new();
|
||||
let mut state = State::Idle;
|
||||
let path_bytes = path.as_bytes();
|
||||
for &c in &path_bytes[1..] {
|
||||
match (state, c) {
|
||||
(State::Idle, ESCAPE_BYTE) => state = State::Escaped,
|
||||
(State::Idle, SLASH_BYTE) => {
|
||||
facet_encoded.push(FACET_SEP_BYTE);
|
||||
}
|
||||
(State::Escaped, any_char) => {
|
||||
state = State::Idle;
|
||||
facet_encoded.push(any_char);
|
||||
}
|
||||
(State::Idle, other_char) => {
|
||||
facet_encoded.push(other_char);
|
||||
}
|
||||
}
|
||||
}
|
||||
Facet(facet_encoded)
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for Facet {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
<Vec<u8> as BinarySerializable>::serialize(&self.0, writer)
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let bytes = <Vec<u8> as BinarySerializable>::deserialize(reader)?;
|
||||
Ok(Facet(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Facet {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
for step in self.0.split(|&b| b == FACET_SEP_BYTE) {
|
||||
write!(f, "/")?;
|
||||
let step_str = unsafe { str::from_utf8_unchecked(step) };
|
||||
write!(f, "{}", escape_slashes(step_str))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn escape_slashes(s: &str) -> Cow<str> {
|
||||
lazy_static! {
|
||||
static ref SLASH_PTN: Regex = Regex::new(r"[\\/]").unwrap();
|
||||
}
|
||||
SLASH_PTN.replace_all(s, "\\/")
|
||||
}
|
||||
|
||||
impl Serialize for Facet {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
serializer.serialize_str(&self.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for Facet {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
<&'de str as Deserialize<'de>>::deserialize(deserializer).map(Facet::from)
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for Facet {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
write!(f, "Facet({})", self)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::Facet;
|
||||
|
||||
#[test]
|
||||
fn test_root() {
|
||||
assert_eq!(Facet::root(), Facet::from("/"));
|
||||
assert_eq!(format!("{}", Facet::root()), "/");
|
||||
assert!(Facet::root().is_root());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_from_path() {
|
||||
assert_eq!(
|
||||
Facet::from_path(vec!["top", "a", "firstdoc"]),
|
||||
Facet::from("/top/a/firstdoc")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_facet_display() {
|
||||
{
|
||||
let v = ["first", "second", "third"];
|
||||
let facet = Facet::from_path(v.iter());
|
||||
assert_eq!(format!("{}", facet), "/first/second/third");
|
||||
}
|
||||
{
|
||||
let v = ["first", "sec/ond", "third"];
|
||||
let facet = Facet::from_path(v.iter());
|
||||
assert_eq!(format!("{}", facet), "/first/sec\\/ond/third");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_facet_debug() {
|
||||
let v = ["first", "second", "third"];
|
||||
let facet = Facet::from_path(v.iter());
|
||||
assert_eq!(format!("{:?}", facet), "Facet(/first/second/third)");
|
||||
}
|
||||
|
||||
}
|
||||
@@ -48,14 +48,6 @@ impl FieldEntry {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a field entry for a facet.
|
||||
pub fn new_facet(field_name: String) -> FieldEntry {
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::HierarchicalFacet,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the name of the field
|
||||
pub fn name(&self) -> &str {
|
||||
&self.name
|
||||
@@ -71,7 +63,6 @@ impl FieldEntry {
|
||||
match self.field_type {
|
||||
FieldType::Str(ref options) => options.get_indexing_options().is_some(),
|
||||
FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_indexed(),
|
||||
FieldType::HierarchicalFacet => true,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -88,8 +79,6 @@ impl FieldEntry {
|
||||
match self.field_type {
|
||||
FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_stored(),
|
||||
FieldType::Str(ref options) => options.is_stored(),
|
||||
FieldType::HierarchicalFacet => true,
|
||||
// TODO make stored hierarchical facet optional
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -115,9 +104,6 @@ impl Serialize for FieldEntry {
|
||||
s.serialize_field("type", "i64")?;
|
||||
s.serialize_field("options", options)?;
|
||||
}
|
||||
FieldType::HierarchicalFacet => {
|
||||
s.serialize_field("type", "hierarchical_facet")?;
|
||||
}
|
||||
}
|
||||
|
||||
s.end()
|
||||
@@ -168,9 +154,6 @@ impl<'de> Deserialize<'de> for FieldEntry {
|
||||
return Err(de::Error::duplicate_field("type"));
|
||||
}
|
||||
ty = Some(map.next_value()?);
|
||||
if ty == Some("hierarchical_facet") {
|
||||
field_type = Some(FieldType::HierarchicalFacet);
|
||||
}
|
||||
}
|
||||
Field::Options => match ty {
|
||||
None => {
|
||||
|
||||
@@ -3,7 +3,6 @@ use schema::{IntOptions, TextOptions};
|
||||
use serde_json::Value as JsonValue;
|
||||
use schema::Value;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Facet;
|
||||
|
||||
/// Possible error that may occur while parsing a field value
|
||||
/// At this point the JSON is known to be valid.
|
||||
@@ -19,7 +18,7 @@ pub enum ValueParsingError {
|
||||
|
||||
/// A `FieldType` describes the type (text, u64) of a field as well as
|
||||
/// how it should be handled by tantivy.
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum FieldType {
|
||||
/// String field type configuration
|
||||
Str(TextOptions),
|
||||
@@ -27,8 +26,6 @@ pub enum FieldType {
|
||||
U64(IntOptions),
|
||||
/// Signed 64-bit integer field type configuration
|
||||
I64(IntOptions),
|
||||
/// Hierarchical Facet
|
||||
HierarchicalFacet,
|
||||
}
|
||||
|
||||
impl FieldType {
|
||||
@@ -39,7 +36,6 @@ impl FieldType {
|
||||
FieldType::U64(ref int_options) | FieldType::I64(ref int_options) => {
|
||||
int_options.is_indexed()
|
||||
}
|
||||
FieldType::HierarchicalFacet => true,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,7 +55,6 @@ impl FieldType {
|
||||
None
|
||||
}
|
||||
}
|
||||
FieldType::HierarchicalFacet => Some(IndexRecordOption::Basic),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -75,7 +70,6 @@ impl FieldType {
|
||||
FieldType::U64(_) | FieldType::I64(_) => Err(ValueParsingError::TypeError(
|
||||
format!("Expected an integer, got {:?}", json),
|
||||
)),
|
||||
FieldType::HierarchicalFacet => Ok(Value::Facet(Facet::from(field_text))),
|
||||
},
|
||||
JsonValue::Number(ref field_val_num) => match *self {
|
||||
FieldType::I64(_) => {
|
||||
@@ -94,7 +88,7 @@ impl FieldType {
|
||||
Err(ValueParsingError::OverflowError(msg))
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) | FieldType::HierarchicalFacet => {
|
||||
FieldType::Str(_) => {
|
||||
let msg = format!("Expected a string, got {:?}", json);
|
||||
Err(ValueParsingError::TypeError(msg))
|
||||
}
|
||||
|
||||
@@ -1,23 +1,10 @@
|
||||
use std::ops::BitOr;
|
||||
|
||||
/// Expresses whether a field is single-valued or multi-valued.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)]
|
||||
pub enum Cardinality {
|
||||
/// The document must have exactly one value associated with it.
|
||||
#[serde(rename = "single")]
|
||||
SingleValue,
|
||||
/// The document can have any number of values associated with it.
|
||||
/// This is more memory- and CPU-intensive than the SingleValue option.
|
||||
#[serde(rename = "multi")]
|
||||
MultiValues,
|
||||
}
|
||||
|
||||
/// Define how an int field should be handled by tantivy.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct IntOptions {
|
||||
indexed: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
fast: Option<Cardinality>,
|
||||
fast: bool,
|
||||
stored: bool,
|
||||
}
|
||||
|
||||
@@ -34,7 +21,7 @@ impl IntOptions {
|
||||
|
||||
/// Returns true iff the value is a fast field.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
self.fast.is_some()
|
||||
self.fast
|
||||
}
|
||||
|
||||
/// Set the u64 options as stored.
|
||||
@@ -55,32 +42,24 @@ impl IntOptions {
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the u64 options as a single-valued fast field.
|
||||
/// Set the u64 options as a fast field.
|
||||
///
|
||||
/// Fast fields are designed for random access.
|
||||
/// Access times are similar to a random lookup in an array.
|
||||
/// If more than one value is associated with a fast field, only the last one is
|
||||
/// kept.
|
||||
pub fn set_fast(mut self, cardinality: Cardinality) -> IntOptions {
|
||||
self.fast = Some(cardinality);
|
||||
pub fn set_fast(mut self) -> IntOptions {
|
||||
self.fast = true;
|
||||
self
|
||||
}
|
||||
|
||||
/// Returns the cardinality of the fastfield.
|
||||
///
|
||||
/// If the field has not been declared as a fastfield, then
|
||||
/// the method returns None.
|
||||
pub fn get_fastfield_cardinality(&self) -> Option<Cardinality> {
|
||||
self.fast
|
||||
}
|
||||
}
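One side of this diff models fast fields with a `Cardinality` enum (serialized as "single"/"multi" in the schema JSON further down) rather than a plain boolean. A sketch against that variant of the API:

use schema::{Cardinality, IntOptions, FAST, INT_INDEXED, INT_STORED};

fn int_options_demo() {
    // Builder style: a stored, single-valued fast field.
    let opts = IntOptions::default()
        .set_stored()
        .set_fast(Cardinality::SingleValue);
    assert!(opts.is_fast());
    assert_eq!(
        opts.get_fastfield_cardinality(),
        Some(Cardinality::SingleValue)
    );

    // The shortcut constants can be combined through the BitOr impl.
    let combined = INT_INDEXED | INT_STORED | FAST;
    assert!(combined.is_fast());
}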
|
||||
|
||||
impl Default for IntOptions {
|
||||
fn default() -> IntOptions {
|
||||
IntOptions {
|
||||
fast: false,
|
||||
indexed: false,
|
||||
stored: false,
|
||||
fast: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -91,7 +70,7 @@ impl Default for IntOptions {
|
||||
pub const FAST: IntOptions = IntOptions {
|
||||
indexed: false,
|
||||
stored: false,
|
||||
fast: Some(Cardinality::SingleValue),
|
||||
fast: true,
|
||||
};
|
||||
|
||||
/// Shortcut for a u64 indexed field.
|
||||
@@ -100,7 +79,7 @@ pub const FAST: IntOptions = IntOptions {
|
||||
pub const INT_INDEXED: IntOptions = IntOptions {
|
||||
indexed: true,
|
||||
stored: false,
|
||||
fast: None,
|
||||
fast: false,
|
||||
};
|
||||
|
||||
/// Shortcut for a u64 stored field.
|
||||
@@ -109,7 +88,7 @@ pub const INT_INDEXED: IntOptions = IntOptions {
|
||||
pub const INT_STORED: IntOptions = IntOptions {
|
||||
indexed: false,
|
||||
stored: true,
|
||||
fast: None,
|
||||
fast: false,
|
||||
};
|
||||
|
||||
impl BitOr for IntOptions {
|
||||
@@ -119,7 +98,7 @@ impl BitOr for IntOptions {
|
||||
let mut res = IntOptions::default();
|
||||
res.indexed = self.indexed | other.indexed;
|
||||
res.stored = self.stored | other.stored;
|
||||
res.fast = self.fast.or(other.fast);
|
||||
res.fast = self.fast | other.fast;
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,7 +103,6 @@ the field is required during scoring or collection for instance.
|
||||
mod schema;
|
||||
mod term;
|
||||
mod document;
|
||||
mod facet;
|
||||
|
||||
mod field_type;
|
||||
mod field_entry;
|
||||
@@ -121,9 +120,6 @@ pub use self::schema::{Schema, SchemaBuilder};
|
||||
pub use self::value::Value;
|
||||
pub use self::schema::DocParsingError;
|
||||
|
||||
pub use self::facet::Facet;
|
||||
pub use self::facet::FACET_SEP_BYTE;
|
||||
|
||||
pub use self::document::Document;
|
||||
pub use self::field::Field;
|
||||
pub use self::term::Term;
|
||||
@@ -143,7 +139,6 @@ pub use self::int_options::IntOptions;
|
||||
pub use self::int_options::FAST;
|
||||
pub use self::int_options::INT_INDEXED;
|
||||
pub use self::int_options::INT_STORED;
|
||||
pub use self::int_options::Cardinality;
|
||||
|
||||
use regex::Regex;
|
||||
|
||||
|
||||
@@ -89,12 +89,6 @@ impl SchemaBuilder {
|
||||
self.add_field(field_entry)
|
||||
}
|
||||
|
||||
/// Adds a facet field to the schema.
|
||||
pub fn add_facet_field(&mut self, field_name: &str) -> Field {
|
||||
let field_entry = FieldEntry::new_facet(field_name.to_string());
|
||||
self.add_field(field_entry)
|
||||
}
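Putting the facet-related additions together: a sketch that declares a facet field with `add_facet_field` above and attaches a facet to a document with `Document::add_facet` from earlier in this diff. The `TEXT` text-options constant is assumed from the schema module, as in the tests below.

use schema::{Document, SchemaBuilder, TEXT};

fn facet_schema_demo() {
    let mut schema_builder = SchemaBuilder::default();
    let title = schema_builder.add_text_field("title", TEXT);
    let category = schema_builder.add_facet_field("category");
    let _schema = schema_builder.build();

    let mut doc = Document::default();
    doc.add_text(title, "32-inch led tv");
    // add_facet accepts anything convertible into a Facet, e.g. a path string.
    doc.add_facet(category, "/electronics/tv_and_video/led_tv");
}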
|
||||
|
||||
/// Adds a field entry to the schema being built.
|
||||
fn add_field(&mut self, field_entry: FieldEntry) -> Field {
|
||||
let field = Field(self.fields.len() as u32);
|
||||
@@ -123,6 +117,7 @@ impl Default for SchemaBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct InnerSchema {
|
||||
fields: Vec<FieldEntry>,
|
||||
fields_map: HashMap<String, Field>, // transient
|
||||
@@ -242,6 +237,12 @@ impl Schema {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Schema {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Schema {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
@@ -290,6 +291,12 @@ impl<'de> Deserialize<'de> for Schema {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SchemaBuilder> for Schema {
|
||||
fn from(schema_builder: SchemaBuilder) -> Schema {
|
||||
schema_builder.build()
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that may happen when deserializing
|
||||
/// a document from JSON.
|
||||
#[derive(Debug)]
|
||||
@@ -321,12 +328,8 @@ mod tests {
|
||||
#[test]
|
||||
pub fn test_schema_serialization() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let count_options = IntOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let popularity_options = IntOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let count_options = IntOptions::default().set_stored().set_fast();
|
||||
let popularity_options = IntOptions::default().set_stored().set_fast();
|
||||
schema_builder.add_text_field("title", TEXT);
|
||||
schema_builder.add_text_field("author", STRING);
|
||||
schema_builder.add_u64_field("count", count_options);
|
||||
@@ -361,7 +364,7 @@ mod tests {
|
||||
"type": "u64",
|
||||
"options": {
|
||||
"indexed": false,
|
||||
"fast": "single",
|
||||
"fast": true,
|
||||
"stored": true
|
||||
}
|
||||
},
|
||||
@@ -370,7 +373,7 @@ mod tests {
|
||||
"type": "i64",
|
||||
"options": {
|
||||
"indexed": false,
|
||||
"fast": "single",
|
||||
"fast": true,
|
||||
"stored": true
|
||||
}
|
||||
}
|
||||
@@ -390,9 +393,7 @@ mod tests {
|
||||
#[test]
|
||||
pub fn test_document_to_json() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let count_options = IntOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let count_options = IntOptions::default().set_stored().set_fast();
|
||||
schema_builder.add_text_field("title", TEXT);
|
||||
schema_builder.add_text_field("author", STRING);
|
||||
schema_builder.add_u64_field("count", count_options);
|
||||
@@ -411,12 +412,8 @@ mod tests {
|
||||
#[test]
|
||||
pub fn test_parse_document() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let count_options = IntOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let popularity_options = IntOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let count_options = IntOptions::default().set_stored().set_fast();
|
||||
let popularity_options = IntOptions::default().set_stored().set_fast();
|
||||
let title_field = schema_builder.add_text_field("title", TEXT);
|
||||
let author_field = schema_builder.add_text_field("author", STRING);
|
||||
let count_field = schema_builder.add_u64_field("count", count_options);
|
||||
@@ -452,7 +449,14 @@ mod tests {
|
||||
"jambon": "bayonne"
|
||||
}"#,
|
||||
);
|
||||
assert_matches!(json_err, Err(DocParsingError::NoSuchFieldInSchema(_)));
|
||||
match json_err {
|
||||
Err(DocParsingError::NoSuchFieldInSchema(field_name)) => {
|
||||
assert_eq!(field_name, "jambon");
|
||||
}
|
||||
_ => {
|
||||
panic!("expected additional field 'jambon' to fail but didn't");
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(
|
||||
@@ -464,10 +468,14 @@ mod tests {
|
||||
"jambon": "bayonne"
|
||||
}"#,
|
||||
);
|
||||
assert_matches!(
|
||||
json_err,
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_)))
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => {
|
||||
assert!(true);
|
||||
}
|
||||
_ => {
|
||||
panic!("expected string of 5 to fail but didn't");
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(
|
||||
@@ -478,10 +486,14 @@ mod tests {
|
||||
"popularity": 10
|
||||
}"#,
|
||||
);
|
||||
assert_matches!(
|
||||
json_err,
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_)))
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
|
||||
assert!(true);
|
||||
}
|
||||
_ => {
|
||||
panic!("expected -5 to fail but didn't");
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(
|
||||
@@ -492,10 +504,14 @@ mod tests {
|
||||
"popularity": 10
|
||||
}"#,
|
||||
);
|
||||
assert!(!matches!(
|
||||
json_err,
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_)))
|
||||
));
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
|
||||
panic!("expected 9223372036854775808 to fit into u64, but it didn't");
|
||||
}
|
||||
_ => {
|
||||
assert!(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(
|
||||
@@ -506,10 +522,14 @@ mod tests {
|
||||
"popularity": 9223372036854775808
|
||||
}"#,
|
||||
);
|
||||
assert_matches!(
|
||||
json_err,
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_)))
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
|
||||
assert!(true);
|
||||
}
|
||||
_ => {
|
||||
panic!("expected 9223372036854775808 to overflow i64, but it didn't");
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(
|
||||
@@ -519,7 +539,14 @@ mod tests {
|
||||
"count": 50,
|
||||
}"#,
|
||||
);
|
||||
assert_matches!(json_err, Err(NotJSON(_)));
|
||||
match json_err {
|
||||
Err(NotJSON(_)) => {
|
||||
assert!(true);
|
||||
}
|
||||
_ => {
|
||||
panic!("expected invalid JSON to fail parsing, but it didn't");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.