Compare commits


11 Commits

75 changed files with 1836 additions and 2600 deletions

View File

@@ -1,127 +1,37 @@
# Based on the "trust" template v0.1.2
# https://github.com/japaric/trust/tree/v0.1.2
dist: trusty
language: rust
services: docker
sudo: required
cache: cargo
rust:
- nightly
env:
global:
- CRATE_NAME=tantivy
matrix:
include:
# Android
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
- env: TARGET=i686-linux-android DISABLE_TESTS=1
- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
# iOS
#- env: TARGET=aarch64-apple-ios DISABLE_TESTS=1
# os: osx
#- env: TARGET=armv7-apple-ios DISABLE_TESTS=1
# os: osx
#- env: TARGET=armv7s-apple-ios DISABLE_TESTS=1
# os: osx
#- env: TARGET=i386-apple-ios DISABLE_TESTS=1
# os: osx
- env: TARGET=x86_64-apple-ios DISABLE_TESTS=1
os: osx
# Linux
- env: TARGET=aarch64-unknown-linux-gnu
# - env: TARGET=arm-unknown-linux-gnueabi
# - env: TARGET=armv7-unknown-linux-gnueabihf
- env: TARGET=i686-unknown-linux-gnu
#- env: TARGET=i686-unknown-linux-musl
#- env: TARGET=mips-unknown-linux-gnu
#- env: TARGET=mips64-unknown-linux-gnuabi64
#- env: TARGET=mips64el-unknown-linux-gnuabi64
#- env: TARGET=mipsel-unknown-linux-gnu
#- env: TARGET=powerpc-unknown-linux-gnu
#- env: TARGET=powerpc64-unknown-linux-gnu
#- env: TARGET=powerpc64le-unknown-linux-gnu
#- env: TARGET=s390x-unknown-linux-gnu DISABLE_TESTS=1
- env: TARGET=x86_64-unknown-linux-gnu
- env: TARGET=x86_64-unknown-linux-musl
# OSX
#- env: TARGET=i686-apple-darwin
# os: osx
- env: TARGET=x86_64-apple-darwin
os: osx
# *BSD
#- env: TARGET=i686-unknown-freebsd DISABLE_TESTS=1
#- env: TARGET=x86_64-unknown-freebsd DISABLE_TESTS=1
#- env: TARGET=x86_64-unknown-netbsd DISABLE_TESTS=1
# Windows
#- env: TARGET=x86_64-pc-windows-gnu
# Bare metal
# These targets don't support std and as such are likely not suitable for
# most crates.
# - env: TARGET=thumbv6m-none-eabi
# - env: TARGET=thumbv7em-none-eabi
# - env: TARGET=thumbv7em-none-eabihf
# - env: TARGET=thumbv7m-none-eabi
# Testing other channels
#- env: TARGET=x86_64-unknown-linux-gnu
# rust: nightly
#- env: TARGET=x86_64-apple-darwin
# os: osx
# rust: nightly
before_install:
- set -e
- rustup self update
install:
- sh ci/install.sh
- source ~/.cargo/env || true
- CC=gcc-4.8
- CXX=g++-4.8
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
- secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- kalakris-cmake
packages:
- gcc-4.8
- g++-4.8
- libcurl4-openssl-dev
- libelf-dev
- libdw-dev
- binutils-dev
- cmake
before_script:
- export PATH=$HOME/.cargo/bin:$PATH
- cargo install cargo-update || echo "cargo-update already installed"
- cargo install cargo-travis || echo "cargo-travis already installed"
script:
- bash ci/script.sh
after_script: set +e
before_deploy:
- sh ci/before_deploy.sh
#
#deploy:
# # - Create a `public_repo` GitHub token. Go to: https://github.com/settings/tokens/new
# # - Encrypt it: `travis encrypt 0123456789012345678901234567890123456789
# # - Paste the output down here
# api_key:
# secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
# file_glob: true
# file: $CRATE_NAME-$TRAVIS_TAG-$TARGET.*
# on:
# # TODO Here you can pick which targets will generate binary releases
# # In this example, there are some targets that are tested using the stable
# # and nightly channels. This condition makes sure there is only one release
# # for such targets and that's generated using the stable channel
# condition: $TRAVIS_RUST_VERSION = stable
# tags: true
# provider: releases
# skip_cleanup: true
cache: cargo
before_cache:
# Travis can't cache files that are not readable by "others"
- chmod -R a+r $HOME/.cargo
#branches:
# only:
# # release tags
# - /^v\d+\.\d+\.\d+.*$/
# - master
notifications:
email:
on_success: never
- cargo build
- cargo test
- cargo test -- --ignored
- cargo run --example simple_search
- cargo doc
after_success:
- cargo coveralls --exclude-pattern src/functional_test.rs
- cargo doc-upload

AUTHORS
View File

@@ -1,11 +0,0 @@
# This is the list of authors of tantivy for copyright purposes.
Paul Masurel
Laurentiu Nicola
Dru Sellers
Ashley Mannix
Michael J. Curry
Jason Wolfe
# As an employee of Google I am required to add Google LLC
# in the list of authors, but this project is not affiliated to Google
# in any other way.
Google LLC

View File

@@ -1,23 +1,14 @@
Tantivy 0.6
==========================
Special thanks to @drusellers and @jason-wolfe for their contributions
to this release!
- Removed C code. Tantivy is now pure Rust. (@pmasurel)
- BM25 (@pmasurel)
- Approximate field norms encoded over 1 byte. (@pmasurel)
- Compiles on stable rust (@pmasurel)
- Removed C code. Tantivy is now pure Rust.
- BM25
- Approximate field norms encoded over 1 byte.
- Compiles on stable rust
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
- Completely uncompressed
- Internally: One u64 fast field for indexes, one fast field for the bytes themselves.
- Add NGram token support (@drusellers)
- Add Stopword Filter support (@drusellers)
- Add a FuzzyTermQuery (@drusellers)
- Add a RegexQuery (@drusellers)
- Various performance improvements (@pmasurel)
Tantivy 0.5.2
===========================

View File

@@ -1,15 +1,15 @@
[package]
name = "tantivy"
version = "0.6.0"
version = "0.6.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Search engine library"""
description = """Tantivy is a search engine library."""
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/tantivy-search/tantivy"
readme = "README.md"
keywords = ["search", "search engine", "information", "retrieval"]
keywords = ["search", "information", "retrieval"]
[dependencies]
base64 = "0.9.1"
@@ -18,10 +18,7 @@ lazy_static = "0.2.1"
tinysegmenter = "0.1.0"
regex = "0.2"
fst = {version="0.3", default-features=false}
fst-regex = { version="0.2" }
lz4 = {version="1.20", optional=true}
snap = {version="0.2"}
atomicwrites = {version="0.2.2", optional=true}
atomicwrites = {version="0.1", optional=true}
tempfile = "2.1"
log = "0.3.6"
combine = "2.2"
@@ -32,6 +29,7 @@ serde_json = "1.0"
num_cpus = "1.2"
itertools = "0.5.9"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
lz4 = "1.20"
bit-set = "0.4.0"
uuid = { version = "0.6", features = ["v4", "serde"] }
chan = "0.1"
@@ -44,7 +42,7 @@ stable_deref_trait = "1.0.0"
rust-stemmers = "0.1.0"
downcast = { version="0.9" }
matches = "0.1"
bitpacking = "0.5"
bitpacking = "0.4"
fnv = "1.0.6"
[target.'cfg(windows)'.dependencies]
@@ -62,8 +60,9 @@ debug-assertions = false
[features]
default = ["mmap"]
simd = ["bitpacking/simd"]
mmap = ["fst/mmap", "atomicwrites"]
lz4-compression = ["lz4"]
unstable = ["simd"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

View File

@@ -1,4 +1,4 @@
Copyright (c) 2018 by the project authors, as listed in the AUTHORS file.
Copyright (c) 2018 by Paul Masurel, Google LLC
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

View File

@@ -4,50 +4,36 @@
[![Coverage Status](https://coveralls.io/repos/github/tantivy-search/tantivy/badge.svg?branch=master&refresh1)](https://coveralls.io/github/tantivy-search/tantivy?branch=master)
[![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/master?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy)
**Tantivy** is a **full text search engine library** written in rust.
It is closer to Lucene than to Elasticsearch and Solr in the sense that it is not
an off-the-shelf search engine server, but rather a crate that can be used
to build such a search engine.
Tantivy is, in fact, strongly inspired by Lucene's design.
It is strongly inspired by Lucene's design.
# Features
- Full-text search
- Tiny startup time (<10ms), perfect for command line tools
- BM25 scoring (the same as lucene)
- Basic query language (`+michael +jackson`)
- Phrase queries search ("michael jackson")
- tf-idf scoring
- Basic query language
- Phrase queries
- Incremental indexing
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
- Mmap directory
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
- optional SIMD integer compression
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- LZ4 compressed document store
- Range queries
- Faceted search
- Configurable indexing (optional term frequency and position indexing)
- Faceting
- configurable indexing (optional term frequency and position indexing)
- Cheesy logo with a horse
# Non-features
Tantivy supports Linux, MacOS and Windows.
- Distributed search is not in the scope of tantivy.
# Supported OS and compiler
Tantivy works on stable rust (>= 1.27) and supports Linux, MacOS and Windows.
# Getting started
- [tantivy's simple search example](http://fulmicoton.com/tantivy-examples/simple_search.html)
- [tantivy's usage example](http://fulmicoton.com/tantivy-examples/simple_search.html)
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli).
`tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
index documents and search via the CLI or a small server with a REST API.
It will walk you through getting a Wikipedia search engine up and running in a few minutes.
- [reference doc]
- [For the last released version](https://docs.rs/tantivy/)
@@ -57,14 +43,40 @@ It will walk you through getting a wikipedia search engine up and running in a f
## Development
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
To check out and run tests, you can simply run :
Tantivy now compiles on stable rust.
To check out and run tests, you can simply run:
git clone git@github.com:tantivy-search/tantivy.git
cd tantivy
cargo build
## Note on release build and performance
If your project depends on `tantivy`, for better performance, make sure to enable
`sse3` instructions by setting `RUSTFLAGS`. (This instruction set is likely to
be available on most `x86_64` CPUs you will encounter.)
For instance,
RUSTFLAGS='-C target-feature=+sse3'
Or, if you are targeting a specific CPU
RUSTFLAGS='-C target-cpu=native' cargo build --release
Regardless of the flags you pass, by default `tantivy` will contain `SSE3` instructions.
If you want to disable those, you can run the following command:
cargo build --no-default-features
Alternatively, if you are trying to compile `tantivy` without simd compression,
you can disable this functionality. In this case, this submodule is not required
and you can compile tantivy by using the `--no-default-features` flag.
cargo build --no-default-features
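If you want to double-check which instruction set a given build actually ended up with, one option (purely illustrative, not part of tantivy) is a compile-time `cfg!` check in your own crate:

```rust
fn main() {
    // `cfg!(target_feature = ...)` is evaluated at compile time, so this reflects
    // whatever RUSTFLAGS (e.g. '-C target-feature=+sse3' or '-C target-cpu=native')
    // were in effect when the binary was built.
    if cfg!(target_feature = "sse3") {
        println!("compiled with sse3 enabled");
    } else {
        println!("compiled without sse3");
    }
}
```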
# Contribute
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.

View File

@@ -6,6 +6,9 @@ environment:
matrix:
- channel: nightly
target: x86_64-pc-windows-msvc
- channel: nightly
target: x86_64-pc-windows-gnu
msys_bits: 64
install:
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe

View File

@@ -1,23 +0,0 @@
# This script takes care of packaging the build artifacts that will go in the
# release zipfile
$SRC_DIR = $PWD.Path
$STAGE = [System.Guid]::NewGuid().ToString()
Set-Location $ENV:Temp
New-Item -Type Directory -Name $STAGE
Set-Location $STAGE
$ZIP = "$SRC_DIR\$($Env:CRATE_NAME)-$($Env:APPVEYOR_REPO_TAG_NAME)-$($Env:TARGET).zip"
# TODO Update this to package the right artifacts
Copy-Item "$SRC_DIR\target\$($Env:TARGET)\release\hello.exe" '.\'
7z a "$ZIP" *
Push-AppveyorArtifact "$ZIP"
Remove-Item *.* -Force
Set-Location ..
Remove-Item $STAGE
Set-Location $SRC_DIR

View File

@@ -1,33 +0,0 @@
# This script takes care of building your crate and packaging it for release
set -ex
main() {
local src=$(pwd) \
stage=
case $TRAVIS_OS_NAME in
linux)
stage=$(mktemp -d)
;;
osx)
stage=$(mktemp -d -t tmp)
;;
esac
test -f Cargo.lock || cargo generate-lockfile
# TODO Update this to build the artifacts that matter to you
cross rustc --bin hello --target $TARGET --release -- -C lto
# TODO Update this to package the right artifacts
cp target/$TARGET/release/hello $stage/
cd $stage
tar czf $src/$CRATE_NAME-$TRAVIS_TAG-$TARGET.tar.gz *
cd $src
rm -rf $stage
}
main

View File

@@ -1,47 +0,0 @@
set -ex
main() {
local target=
if [ $TRAVIS_OS_NAME = linux ]; then
target=x86_64-unknown-linux-musl
sort=sort
else
target=x86_64-apple-darwin
sort=gsort # for `sort --sort-version`, from brew's coreutils.
fi
# Builds for iOS are done on OSX, but require the specific target to be
# installed.
case $TARGET in
aarch64-apple-ios)
rustup target install aarch64-apple-ios
;;
armv7-apple-ios)
rustup target install armv7-apple-ios
;;
armv7s-apple-ios)
rustup target install armv7s-apple-ios
;;
i386-apple-ios)
rustup target install i386-apple-ios
;;
x86_64-apple-ios)
rustup target install x86_64-apple-ios
;;
esac
# This fetches latest stable release
local tag=$(git ls-remote --tags --refs --exit-code https://github.com/japaric/cross \
| cut -d/ -f3 \
| grep -E '^v[0.1.0-9.]+$' \
| $sort --version-sort \
| tail -n1)
curl -LSfs https://japaric.github.io/trust/install.sh | \
sh -s -- \
--force \
--git japaric/cross \
--tag $tag \
--target $target
}
main

View File

@@ -1,23 +0,0 @@
# This script takes care of testing your crate
set -ex
main() {
cross build --target $TARGET
cross build --target $TARGET --release
if [ ! -z $DISABLE_TESTS ]; then
return
fi
cross test --target $TARGET
# cross test --target $TARGET --release
# cross run --target $TARGET
# cross run --target $TARGET --release
}
# we don't run the "test phase" when doing deploys
if [ -z $TRAVIS_TAG ]; then
main
fi

View File

@@ -61,7 +61,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
//
// This will actually just save a meta.json
// with our schema in the directory.
let index = Index::create_in_dir(index_path, schema.clone())?;
let index = Index::create(index_path, schema.clone())?;
// here we are registering our custom tokenizer
// this will store tokens of 3 characters each

View File

@@ -64,7 +64,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
//
// This will actually just save a meta.json
// with our schema in the directory.
let index = Index::create_in_dir(index_path, schema.clone())?;
let index = Index::create(index_path, schema.clone())?;
// To insert document we need an index writer.
// There must be only one writer at a time.
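Both examples above now call `Index::create` where they previously called `Index::create_in_dir` (the rename shows up in `src/core/index.rs` further down). As a minimal sketch of the renamed API, assuming an already-existing directory, a hypothetical helper might look like:

```rust
use std::path::Path;
use tantivy::schema::Schema;
use tantivy::Index;

// Hypothetical helper (not part of tantivy): create the index if the directory
// has no meta.json yet, otherwise re-open the existing one.
fn create_or_open(path: &Path, schema: Schema) -> tantivy::Result<Index> {
    if path.join("meta.json").exists() {
        Index::open(path) // formerly Index::open_in_dir
    } else {
        Index::create(path, schema) // formerly Index::create_in_dir
    }
}
```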

View File

@@ -4,111 +4,87 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;
use collector::CollectorWrapper;
/// Collector that does nothing.
/// This is used in the chain Collector and will hopefully
/// be optimized away by the compiler.
pub struct DoNothingCollector;
impl Collector for DoNothingCollector {
type Child = DoNothingCollector;
#[inline]
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<DoNothingCollector> {
Ok(DoNothingCollector)
}
#[inline]
fn collect(&mut self, _doc: DocId, _score: Score) {}
#[inline]
fn requires_scoring(&self) -> bool {
false
}
}
impl SegmentCollector for DoNothingCollector {
type CollectionResult = ();
#[inline]
fn collect(&mut self, _doc: DocId, _score: Score) {}
fn finalize(self) -> () {
()
}
}
/// Zero-cost abstraction used to collect on multiple collectors.
/// This contraption is only usable if the types of your collectors
/// are known at compile time.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::{CountCollector, TopCollector, chain};
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 2);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
pub struct ChainedCollector<Left: Collector, Right: Collector> {
left: Left,
right: Right,
}
pub struct ChainedSegmentCollector<Left: SegmentCollector, Right: SegmentCollector> {
left: Left,
right: Right,
}
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
/// Adds a collector
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, CollectorWrapper<C>> {
ChainedCollector {
left: self,
right: new_collector,
right: CollectorWrapper::new(new_collector),
}
}
}
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
fn set_segment(
type Child = ChainedSegmentCollector<Left::Child, Right::Child>;
fn for_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
self.left.set_segment(segment_local_id, segment)?;
self.right.set_segment(segment_local_id, segment)?;
Ok(())
) -> Result<Self::Child> {
Ok(ChainedSegmentCollector {
left: self.left.for_segment(segment_local_id, segment)?,
right: self.right.for_segment(segment_local_id, segment)?,
})
}
fn requires_scoring(&self) -> bool {
self.left.requires_scoring() || self.right.requires_scoring()
}
}
impl<Left: SegmentCollector, Right: SegmentCollector> SegmentCollector for ChainedSegmentCollector<Left, Right> {
type CollectionResult = (Left::CollectionResult, Right::CollectionResult);
fn collect(&mut self, doc: DocId, score: Score) {
self.left.collect(doc, score);
self.right.collect(doc, score);
}
fn requires_scoring(&self) -> bool {
self.left.requires_scoring() || self.right.requires_scoring()
fn finalize(self) -> Self::CollectionResult {
(self.left.finalize(), self.right.finalize())
}
}
@@ -122,19 +98,35 @@ pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
#[cfg(test)]
mod tests {
use super::*;
use collector::{Collector, CountCollector, TopCollector};
use collector::{CountCollector, SegmentCollector, TopCollector};
use schema::SchemaBuilder;
use Index;
use Document;
#[test]
fn test_chained_collector() {
let schema_builder = SchemaBuilder::new();
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(3_000_000).unwrap();
let doc = Document::new();
index_writer.add_document(doc);
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_readers = searcher.segment_readers();
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);
let mut segment_collector = collectors.for_segment(0, &segment_readers[0]).unwrap();
segment_collector.collect(1, 0.2);
segment_collector.collect(2, 0.1);
segment_collector.collect(3, 0.5);
collectors.merge_children(vec![segment_collector]);
}
assert_eq!(count_collector.count(), 3);
assert!(top_collector.at_capacity());

View File

@@ -4,56 +4,11 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;
use collector::Combinable;
/// `CountCollector` collector only counts how many
/// documents match the query.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::CountCollector;
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut count_collector = CountCollector::default();
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut count_collector).unwrap();
///
/// assert_eq!(count_collector.count(), 2);
/// }
///
/// Ok(())
/// }
/// ```
#[derive(Default)]
pub struct CountCollector {
count: usize,
@@ -68,12 +23,10 @@ impl CountCollector {
}
impl Collector for CountCollector {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}
type Child = CountCollector;
fn collect(&mut self, _: DocId, _: Score) {
self.count += 1;
fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<CountCollector> {
Ok(CountCollector::default())
}
fn requires_scoring(&self) -> bool {
@@ -81,10 +34,28 @@ impl Collector for CountCollector {
}
}
impl Combinable for CountCollector {
fn combine_into(&mut self, other: Self) {
self.count += other.count;
}
}
impl SegmentCollector for CountCollector {
type CollectionResult = CountCollector;
fn collect(&mut self, _: DocId, _: Score) {
self.count += 1;
}
fn finalize(self) -> CountCollector {
self
}
}
#[cfg(test)]
mod tests {
use collector::{Collector, CountCollector};
use collector::{Collector, CountCollector, SegmentCollector};
#[test]
fn test_count_collector() {

View File

@@ -3,14 +3,12 @@ use docset::SkipResult;
use fastfield::FacetReader;
use schema::Facet;
use schema::Field;
use std::cell::UnsafeCell;
use std::collections::btree_map;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::BinaryHeap;
use std::collections::Bound;
use std::iter::Peekable;
use std::mem;
use std::{u64, usize};
use termdict::TermMerger;
@@ -20,6 +18,7 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;
struct Hit<'a> {
count: u64,
@@ -194,19 +193,22 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// }
/// ```
pub struct FacetCollector {
facet_ords: Vec<u64>,
field: Field,
ff_reader: Option<UnsafeCell<FacetReader>>,
segment_counters: Vec<SegmentFacetCounter>,
facets: BTreeSet<Facet>,
}
pub struct FacetSegmentCollector {
reader: FacetReader,
facet_ords_buf: Vec<u64>,
// facet_ord -> collapse facet_id
current_segment_collapse_mapping: Vec<usize>,
collapse_mapping: Vec<usize>,
// collapse facet_id -> count
current_segment_counts: Vec<u64>,
counts: Vec<u64>,
// collapse facet_id -> facet_ord
current_collapse_facet_ords: Vec<u64>,
facets: BTreeSet<Facet>,
collapse_facet_ords: Vec<u64>,
}
fn skip<'a, I: Iterator<Item = &'a Facet>>(
@@ -240,15 +242,9 @@ impl FacetCollector {
/// is of the proper type.
pub fn for_field(field: Field) -> FacetCollector {
FacetCollector {
facet_ords: Vec::with_capacity(255),
segment_counters: Vec::new(),
field,
ff_reader: None,
facets: BTreeSet::new(),
current_segment_collapse_mapping: Vec::new(),
current_collapse_facet_ords: Vec::new(),
current_segment_counts: Vec::new(),
}
}
@@ -279,82 +275,21 @@ impl FacetCollector {
self.facets.insert(facet);
}
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
self.current_segment_collapse_mapping.clear();
self.current_collapse_facet_ords.clear();
self.current_segment_counts.clear();
let mut collapse_facet_it = self.facets.iter().peekable();
self.current_collapse_facet_ords.push(0);
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
if !facet_streamer.advance() {
return;
}
'outer: loop {
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Reached => {
// we reach a facet we decided to collapse.
let collapse_depth = facet_depth(facet_streamer.key());
let mut collapsed_id = 0;
self.current_segment_collapse_mapping.push(0);
while facet_streamer.advance() {
let depth = facet_depth(facet_streamer.key());
if depth <= collapse_depth {
continue 'outer;
}
if depth == collapse_depth + 1 {
collapsed_id = self.current_collapse_facet_ords.len();
self.current_collapse_facet_ords
.push(facet_streamer.term_ord());
self.current_segment_collapse_mapping.push(collapsed_id);
} else {
self.current_segment_collapse_mapping.push(collapsed_id);
}
}
break;
}
SkipResult::End | SkipResult::OverStep => {
self.current_segment_collapse_mapping.push(0);
if !facet_streamer.advance() {
break;
}
}
}
}
}
fn finalize_segment(&mut self) {
if self.ff_reader.is_some() {
self.segment_counters.push(SegmentFacetCounter {
facet_reader: self.ff_reader.take().unwrap().into_inner(),
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
});
}
}
/// Returns the results of the collection.
///
/// This method does not just return the counters,
/// it also translates the facet ordinals of the last segment.
pub fn harvest(mut self) -> FacetCounts {
self.finalize_segment();
let collapsed_facet_ords: Vec<&[u64]> = self
.segment_counters
pub fn harvest(self) -> FacetCounts {
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
.iter()
.map(|segment_counter| &segment_counter.facet_ords[..])
.collect();
let collapsed_facet_counts: Vec<&[u64]> = self
.segment_counters
let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters
.iter()
.map(|segment_counter| &segment_counter.facet_counts[..])
.collect();
let facet_streams = self
.segment_counters
let facet_streams = self.segment_counters
.iter()
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
.collect::<Vec<_>>();
@@ -392,31 +327,92 @@ impl FacetCollector {
}
}
impl FacetSegmentCollector {
fn into_segment_facet_counter(self) -> SegmentFacetCounter {
SegmentFacetCounter {
facet_reader: self.reader,
facet_ords: self.collapse_facet_ords,
facet_counts: self.counts,
}
}
}
impl Collector for FacetCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.finalize_segment();
type Child = FacetSegmentCollector;
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<FacetSegmentCollector> {
let facet_reader = reader.facet_reader(self.field)?;
self.set_collapse_mapping(&facet_reader);
self.current_segment_counts
.resize(self.current_collapse_facet_ords.len(), 0);
self.ff_reader = Some(UnsafeCell::new(facet_reader));
Ok(())
let mut collapse_mapping = Vec::new();
let mut counts = Vec::new();
let mut collapse_facet_ords = Vec::new();
let mut collapse_facet_it = self.facets.iter().peekable();
collapse_facet_ords.push(0);
{
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
if facet_streamer.advance() {
'outer: loop {
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Reached => {
// we reach a facet we decided to collapse.
let collapse_depth = facet_depth(facet_streamer.key());
let mut collapsed_id = 0;
collapse_mapping.push(0);
while facet_streamer.advance() {
let depth = facet_depth(facet_streamer.key());
if depth <= collapse_depth {
continue 'outer;
}
if depth == collapse_depth + 1 {
collapsed_id = collapse_facet_ords.len();
collapse_facet_ords.push(facet_streamer.term_ord());
collapse_mapping.push(collapsed_id);
} else {
collapse_mapping.push(collapsed_id);
}
}
break;
}
SkipResult::End | SkipResult::OverStep => {
collapse_mapping.push(0);
if !facet_streamer.advance() {
break;
}
}
}
}
}
}
counts.resize(collapse_facet_ords.len(), 0);
Ok(FacetSegmentCollector {
reader: facet_reader,
facet_ords_buf: Vec::with_capacity(255),
collapse_mapping,
counts,
collapse_facet_ords,
})
}
fn requires_scoring(&self) -> bool {
false
}
}
impl SegmentCollector for FacetSegmentCollector {
type CollectionResult = Vec<SegmentFacetCounter>;
fn collect(&mut self, doc: DocId, _: Score) {
let facet_reader: &mut FacetReader = unsafe {
&mut *self
.ff_reader
.as_ref()
.expect("collect() was called before set_segment. This should never happen.")
.get()
};
facet_reader.facet_ords(doc, &mut self.facet_ords);
self.reader.facet_ords(doc, &mut self.facet_ords_buf);
let mut previous_collapsed_ord: usize = usize::MAX;
for &facet_ord in &self.facet_ords {
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
{
for &facet_ord in &self.facet_ords_buf {
let collapsed_ord = self.collapse_mapping[facet_ord as usize];
self.counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord {
0
} else {
1
@@ -425,8 +421,8 @@ impl Collector for FacetCollector {
}
}
fn requires_scoring(&self) -> bool {
false
fn finalize(self) -> Vec<SegmentFacetCounter> {
vec![self.into_segment_facet_counter()]
}
}
@@ -511,7 +507,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer(3_000_000).unwrap();
let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| {
@@ -591,7 +587,7 @@ mod tests {
.collect();
thread_rng().shuffle(&mut docs[..]);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer(3_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc);
}
@@ -648,7 +644,7 @@ mod bench {
// 40425 docs
thread_rng().shuffle(&mut docs[..]);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer(3_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc);
}
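The per-document counting logic earlier in this file (the `previous_collapsed_ord` comparison in `FacetSegmentCollector::collect`) is easy to miss. Below is a self-contained sketch of that trick, assuming (as the collector appears to) that a document's facet ordinals arrive in sorted order, so ordinals collapsing to the same bucket are adjacent:

```rust
// Sketch (not tantivy code): count each collapsed facet bucket at most once per
// document by comparing with the previously seen collapsed ordinal.
fn count_facets(docs_facet_ords: &[Vec<u64>], collapse_mapping: &[usize], counts: &mut [u64]) {
    for facet_ords in docs_facet_ords {
        let mut previous_collapsed_ord = usize::MAX;
        for &facet_ord in facet_ords {
            let collapsed_ord = collapse_mapping[facet_ord as usize];
            // A doc with several facets in the same collapsed bucket contributes 1, not N.
            counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord { 0 } else { 1 };
            previous_collapsed_ord = collapsed_ord;
        }
    }
}

fn main() {
    // Ordinals 1 and 2 collapse into bucket 1; ordinal 0 stays in bucket 0.
    let collapse_mapping = vec![0, 1, 1];
    let mut counts = vec![0u64; 2];
    // Doc A has facet ords [1, 2] (same bucket, counted once); doc B has [0].
    count_facets(&[vec![1, 2], vec![0]], &collapse_mapping, &mut counts);
    assert_eq!(counts, vec![1u64, 1]);
}
```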

View File

@@ -7,12 +7,15 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use query::Query;
use Searcher;
use downcast;
mod count_collector;
pub use self::count_collector::CountCollector;
mod multi_collector;
pub use self::multi_collector::MultiCollector;
//mod multi_collector;
//pub use self::multi_collector::MultiCollector;
mod top_collector;
pub use self::top_collector::TopCollector;
@@ -21,7 +24,7 @@ mod facet_collector;
pub use self::facet_collector::FacetCollector;
mod chained_collector;
pub use self::chained_collector::{chain, ChainedCollector};
pub use self::chained_collector::chain;
/// Collectors are in charge of collecting and retaining relevant
/// information from the document found and scored by the query.
@@ -53,31 +56,90 @@ pub use self::chained_collector::{chain, ChainedCollector};
///
/// Segments are not guaranteed to be visited in any specific order.
pub trait Collector {
type Child : SegmentCollector + 'static;
/// `set_segment` is called before beginning to enumerate
/// on this segment.
fn set_segment(
fn for_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
) -> Result<Self::Child>;
/// Returns true iff the collector requires to compute scores for documents.
fn requires_scoring(&self) -> bool;
/// Search works as follows:
///
/// First the weight object associated to the query is created.
///
/// Then, the query loops over the segments and for each segment :
/// - setup the collector and informs it that the segment being processed has changed.
/// - creates a SegmentCollector for collecting documents associated to the segment
/// - creates a `Scorer` object associated for this segment
/// - iterate through the matched documents and push them to the segment collector.
/// - turn the segment collector into a Combinable segment result
///
/// Combining all of the segment results gives a single Child::CollectionResult, which is returned.
///
/// The result will be Ok(None) in case of having no segments.
fn search(&mut self, searcher: &Searcher, query: &Query) -> Result<Option<<Self::Child as SegmentCollector>::CollectionResult>> {
let scoring_enabled = self.requires_scoring();
let weight = query.weight(searcher, scoring_enabled)?;
let mut results = Vec::new();
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
let mut child: Self::Child = self.for_segment(segment_ord as SegmentLocalId, segment_reader)?;
let mut scorer = weight.scorer(segment_reader)?;
scorer.collect(&mut child, segment_reader.delete_bitset());
results.push(child.finalize());
}
Ok(results.into_iter().fold1(|x,y| {
x.combine_into(y);
x
}))
}
}
pub trait Combinable {
fn combine_into(&mut self, other: Self);
}
impl Combinable for () {
fn combine_into(&mut self, other: Self) {
()
}
}
impl<T> Combinable for Vec<T> {
fn combine_into(&mut self, other: Self) {
self.extend(other.into_iter());
}
}
impl<L: Combinable, R: Combinable> Combinable for (L, R) {
fn combine_into(&mut self, other: Self) {
self.0.combine_into(other.0);
self.1.combine_into(other.1);
}
}
pub trait SegmentCollector: downcast::Any + 'static {
type CollectionResult: Combinable + downcast::Any + 'static;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
/// Turn into the final result
fn finalize(self) -> Self::CollectionResult;
}
impl<'a, C: Collector> Collector for &'a mut C {
fn set_segment(
&mut self,
type Child = C::Child;
fn for_segment(
&mut self, // TODO Ask Jason : why &mut self here!?
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
(*self).set_segment(segment_local_id, segment)
}
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score) {
C::collect(self, doc, score)
) -> Result<C::Child> {
(*self).for_segment(segment_local_id, segment)
}
fn requires_scoring(&self) -> bool {
@@ -85,6 +147,61 @@ impl<'a, C: Collector> Collector for &'a mut C {
}
}
pub struct CollectorWrapper<'a, TCollector: 'a + Collector>(&'a mut TCollector);
impl<'a, T: 'a + Collector> CollectorWrapper<'a, T> {
pub fn new(collector: &'a mut T) -> CollectorWrapper<'a, T> {
CollectorWrapper(collector)
}
}
impl<'a, T: 'a + Collector> Collector for CollectorWrapper<'a, T> {
type Child = T::Child;
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<T::Child> {
self.0.for_segment(segment_local_id, segment)
}
fn requires_scoring(&self) -> bool {
self.0.requires_scoring()
}
}
trait UntypedCollector {
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<Box<UntypedSegmentCollector>>;
}
impl<'a, TCollector:'a + Collector> UntypedCollector for CollectorWrapper<'a, TCollector> {
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<Box<UntypedSegmentCollector>> {
let segment_collector = self.0.for_segment(segment_local_id, segment)?;
Ok(Box::new(segment_collector))
}
}
trait UntypedSegmentCollector {
fn finalize(self) -> Box<UntypedCombinable>;
}
trait UntypedCombinable {
fn combine_into(&mut self, other: Box<UntypedCombinable>);
}
pub struct CombinableWrapper<'a, T: 'a + Combinable>(&'a mut T);
impl<'a, T: 'a + Combinable> CombinableWrapper<'a, T> {
pub fn new(combinable: &'a mut T) -> CombinableWrapper<'a, T> {
CombinableWrapper(combinable)
}
}
impl<'a, T: 'a + Combinable> Combinable for CombinableWrapper<'a, T> {
fn combine_into(&mut self, other: Self) {
self.0.combine_into(*::downcast::Downcast::<T>::downcast(other).unwrap())
}
}
#[cfg(test)]
pub mod tests {
@@ -102,8 +219,13 @@ pub mod tests {
/// It is unusable in practice, as it does not store
/// the segment ordinals
pub struct TestCollector {
next_offset: DocId,
docs: Vec<DocId>,
scores: Vec<Score>,
}
pub struct TestSegmentCollector {
offset: DocId,
segment_max_doc: DocId,
docs: Vec<DocId>,
scores: Vec<Score>,
}
@@ -122,8 +244,7 @@ pub mod tests {
impl Default for TestCollector {
fn default() -> TestCollector {
TestCollector {
offset: 0,
segment_max_doc: 0,
next_offset: 0,
docs: Vec::new(),
scores: Vec::new(),
}
@@ -131,19 +252,33 @@ pub mod tests {
}
impl Collector for TestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.offset += self.segment_max_doc;
self.segment_max_doc = reader.max_doc();
Ok(())
type Child = TestSegmentCollector;
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<TestSegmentCollector> {
let offset = self.next_offset;
self.next_offset += reader.max_doc();
Ok(TestSegmentCollector {
offset,
docs: Vec::new(),
scores: Vec::new(),
})
}
fn requires_scoring(&self) -> bool {
true
}
}
impl SegmentCollector for TestSegmentCollector {
type CollectionResult = Vec<TestSegmentCollector>;
fn collect(&mut self, doc: DocId, score: Score) {
self.docs.push(doc + self.offset);
self.scores.push(score);
}
fn requires_scoring(&self) -> bool {
true
fn finalize(self) -> Vec<TestSegmentCollector> {
vec![self]
}
}
@@ -152,17 +287,26 @@ pub mod tests {
///
/// This collector is mainly useful for tests.
pub struct FastFieldTestCollector {
vals: Vec<u64>,
next_counter: usize,
field: Field,
ff_reader: Option<FastFieldReader<u64>>,
}
#[derive(Default)]
pub struct FastFieldSegmentCollectorState {
counter: usize,
vals: Vec<u64>,
}
pub struct FastFieldSegmentCollector {
state: FastFieldSegmentCollectorState,
reader: FastFieldReader<u64>,
}
impl FastFieldTestCollector {
pub fn for_field(field: Field) -> FastFieldTestCollector {
FastFieldTestCollector {
vals: Vec::new(),
next_counter: 0,
field,
ff_reader: None,
}
}
@@ -172,20 +316,35 @@ pub mod tests {
}
impl Collector for FastFieldTestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
Ok(())
type Child = FastFieldSegmentCollector;
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<FastFieldSegmentCollector> {
let counter = self.next_counter;
self.next_counter += 1;
Ok(FastFieldSegmentCollector {
state: FastFieldSegmentCollectorState::default(),
reader: reader.fast_field_reader(self.field)?,
})
}
fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.ff_reader.as_ref().unwrap().get(doc);
self.vals.push(val);
}
fn requires_scoring(&self) -> bool {
false
}
}
impl SegmentCollector for FastFieldSegmentCollector {
type CollectionResult = Vec<FastFieldSegmentCollectorState>;
fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.reader.get(doc);
self.vals.push(val);
}
fn finalize(self) -> Vec<FastFieldSegmentCollectorState> {
vec![self.state]
}
}
/// Collects in order all of the fast field bytes for all of the
/// docs in the `DocSet`
///
@@ -193,7 +352,11 @@ pub mod tests {
pub struct BytesFastFieldTestCollector {
vals: Vec<u8>,
field: Field,
ff_reader: Option<BytesFastFieldReader>,
}
pub struct BytesFastFieldSegmentCollector {
vals: Vec<u8>,
reader: BytesFastFieldReader,
}
impl BytesFastFieldTestCollector {
@@ -201,7 +364,6 @@ pub mod tests {
BytesFastFieldTestCollector {
vals: Vec::new(),
field,
ff_reader: None,
}
}
@@ -211,20 +373,32 @@ pub mod tests {
}
impl Collector for BytesFastFieldTestCollector {
fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> {
self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?);
Ok(())
}
type Child = BytesFastFieldSegmentCollector;
fn collect(&mut self, doc: u32, _score: f32) {
let val = self.ff_reader.as_ref().unwrap().get_val(doc);
self.vals.extend(val);
fn for_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<BytesFastFieldSegmentCollector> {
Ok(BytesFastFieldSegmentCollector {
vals: Vec::new(),
reader: segment.bytes_fast_field_reader(self.field)?,
})
}
fn requires_scoring(&self) -> bool {
false
}
}
impl SegmentCollector for BytesFastFieldSegmentCollector {
type CollectionResult = Vec<Vec<u8>>;
fn collect(&mut self, doc: u32, _score: f32) {
let val = self.reader.get_val(doc);
self.vals.extend(val);
}
fn finalize(self) -> Vec<Vec<u8>> {
vec![self.vals]
}
}
}
#[cfg(all(test, feature = "unstable"))]
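To make the new split concrete, here is a self-contained sketch of the flow that the `search` method above describes: one `SegmentCollector` per segment, `finalize` per segment, then folding the results with `combine_into`. The trait definitions below mirror the diff but drop tantivy-specific bounds such as `downcast::Any`, so the snippet compiles on its own and is not tantivy code:

```rust
type DocId = u32;
type Score = f32;

trait Combinable {
    fn combine_into(&mut self, other: Self);
}

trait SegmentCollector {
    type CollectionResult: Combinable;
    fn collect(&mut self, doc: DocId, score: Score);
    fn finalize(self) -> Self::CollectionResult;
}

impl Combinable for usize {
    fn combine_into(&mut self, other: Self) {
        *self += other;
    }
}

// Per-segment counter; its finalized result is just the count.
#[derive(Default)]
struct CountSegmentCollector {
    count: usize,
}

impl SegmentCollector for CountSegmentCollector {
    type CollectionResult = usize;
    fn collect(&mut self, _doc: DocId, _score: Score) {
        self.count += 1;
    }
    fn finalize(self) -> usize {
        self.count
    }
}

fn main() {
    // Stand-in for two segments' worth of matching documents.
    let segments: Vec<Vec<DocId>> = vec![vec![0, 1, 2], vec![0, 1]];
    let mut total: Option<usize> = None;
    for docs in segments {
        let mut child = CountSegmentCollector::default(); // for_segment(...)
        for doc in docs {
            child.collect(doc, 1.0);
        }
        let result = child.finalize();
        // Equivalent of the fold + combine_into step in Collector::search.
        match total.as_mut() {
            Some(acc) => acc.combine_into(result),
            None => total = Some(result),
        }
    }
    assert_eq!(total, Some(5));
}
```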

View File

@@ -1,119 +1,122 @@
use super::Collector;
use super::SegmentCollector;
use DocId;
use Result;
use Score;
use Result;
use SegmentLocalId;
use SegmentReader;
use downcast::Downcast;
/// Multicollector makes it possible to collect on more than one collector.
/// It should only be used for use cases where the Collector types are unknown
/// at compile time.
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::{CountCollector, TopCollector, MultiCollector};
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors =
/// MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 2);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
pub struct MultiCollector<'a> {
collectors: Vec<&'a mut Collector>,
collector_wrappers: Vec<Box<UntypedCollector + 'a>>
}
impl<'a> MultiCollector<'a> {
/// Constructor
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
MultiCollector { collectors }
pub fn new() -> MultiCollector<'a> {
MultiCollector {
collector_wrappers: Vec::new()
}
}
pub fn add_collector<TCollector: 'a + Collector>(&mut self, collector: &'a mut TCollector) {
let collector_wrapper = CollectorWrapper(collector);
self.collector_wrappers.push(Box::new(collector_wrapper));
}
}
impl<'a> Collector for MultiCollector<'a> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
for collector in &mut self.collectors {
collector.set_segment(segment_local_id, segment)?;
}
Ok(())
type Child = MultiCollectorChild;
fn for_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<MultiCollectorChild> {
let children = self.collector_wrappers
.iter_mut()
.map(|collector_wrapper| {
collector_wrapper.for_segment(segment_local_id, segment)
})
.collect::<Result<Vec<_>>>()?;
Ok(MultiCollectorChild {
children
})
}
fn collect(&mut self, doc: DocId, score: Score) {
for collector in &mut self.collectors {
collector.collect(doc, score);
fn requires_scoring(&self) -> bool {
self.collector_wrappers
.iter()
.any(|c| c.requires_scoring())
}
fn merge_children(&mut self, children: Vec<MultiCollectorChild>) {
let mut per_collector_children: Vec<Vec<Box<SegmentCollector>>> =
(0..self.collector_wrappers.len())
.map(|_| Vec::with_capacity(children.len()))
.collect::<Vec<_>>();
for child in children {
for (idx, segment_collector) in child.children.into_iter().enumerate() {
per_collector_children[idx].push(segment_collector);
}
}
for (collector, children) in self.collector_wrappers.iter_mut().zip(per_collector_children) {
collector.merge_children_anys(children);
}
}
fn requires_scoring(&self) -> bool {
self.collectors
.iter()
.any(|collector| collector.requires_scoring())
}
pub struct MultiCollectorChild {
children: Vec<Box<SegmentCollector>>
}
impl SegmentCollector for MultiCollectorChild {
fn collect(&mut self, doc: DocId, score: Score) {
for child in &mut self.children {
child.collect(doc, score);
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use collector::{Collector, CountCollector, TopCollector};
use schema::{TEXT, SchemaBuilder};
use query::TermQuery;
use Index;
use Term;
use schema::IndexRecordOption;
#[test]
fn test_multi_collector() {
let mut schema_builder = SchemaBuilder::new();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text=>"abc"));
index_writer.add_document(doc!(text=>"abc abc abc"));
index_writer.add_document(doc!(text=>"abc abc"));
index_writer.commit().unwrap();
index_writer.add_document(doc!(text=>""));
index_writer.add_document(doc!(text=>"abc abc abc abc"));
index_writer.add_document(doc!(text=>"abc"));
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_text(text, "abc");
let query = TermQuery::new(term, IndexRecordOption::Basic);
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors =
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);
let mut collectors = MultiCollector::new();
collectors.add_collector(&mut top_collector);
collectors.add_collector(&mut count_collector);
collectors.search(&*searcher, &query).unwrap();
}
assert_eq!(count_collector.count(), 3);
assert!(top_collector.at_capacity());
assert_eq!(count_collector.count(), 5);
}
}

View File

@@ -7,6 +7,8 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;
use collector::Combinable;
// Rust heap is a max-heap and we need a min heap.
#[derive(Clone, Copy)]
@@ -43,61 +45,7 @@ impl Eq for GlobalScoredDoc {}
/// with the best scores.
///
/// The implementation is based on a `BinaryHeap`.
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result, DocId, Score};
/// use tantivy::collector::TopCollector;
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut top_collector).unwrap();
///
/// let score_docs: Vec<(Score, DocId)> = top_collector
/// .score_docs()
/// .into_iter()
/// .map(|(score, doc_address)| (score, doc_address.doc()))
/// .collect();
///
/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]);
/// }
///
/// Ok(())
/// }
/// ```
/// The theoretical complexity is `O(n log K)`.
pub struct TopCollector {
limit: usize,
heap: BinaryHeap<GlobalScoredDoc>,
@@ -153,21 +101,42 @@ impl TopCollector {
}
impl Collector for TopCollector {
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
self.segment_id = segment_id;
Ok(())
type Child = TopCollector;
fn for_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<TopCollector> {
Ok(TopCollector {
limit: self.limit,
heap: BinaryHeap::new(),
segment_id,
})
}
fn requires_scoring(&self) -> bool {
true
}
}
impl Combinable for TopCollector {
// TODO: I think this could be a bit better
fn combine_into(&mut self, other: Self) {
self.segment_id = other.segment_id;
while let Some(doc) = other.heap.pop() {
self.collect(doc.doc_address.doc(), doc.score);
}
}
}
impl SegmentCollector for TopCollector {
type CollectionResult = TopCollector;
fn collect(&mut self, doc: DocId, score: Score) {
if self.at_capacity() {
// It's ok to unwrap as long as a limit of 0 is forbidden.
let limit_doc: GlobalScoredDoc = *self
.heap
let limit_doc: GlobalScoredDoc = *self.heap
.peek()
.expect("Top collector with size 0 is forbidden");
if limit_doc.score < score {
let mut mut_head = self
.heap
let mut mut_head = self.heap
.peek_mut()
.expect("Top collector with size 0 is forbidden");
mut_head.score = score;
@@ -182,8 +151,8 @@ impl Collector for TopCollector {
}
}
fn requires_scoring(&self) -> bool {
true
fn finalize(self) -> TopCollector {
self
}
}
@@ -191,7 +160,6 @@ impl Collector for TopCollector {
mod tests {
use super::*;
use collector::Collector;
use DocId;
use Score;
@@ -242,5 +210,4 @@ mod tests {
fn test_top_0() {
TopCollector::with_limit(0);
}
}

View File

@@ -46,7 +46,7 @@ impl BitPacker {
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
output.write_all(&arr[..num_bytes])?;
self.mini_buffer_written = 0;
}
@@ -98,14 +98,31 @@ where
let addr_in_bits = idx * num_bits;
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
debug_assert!(
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 =
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
if cfg!(feature = "simdcompression") {
// for simdcompression,
// the bitpacker is only used for fastfields,
// and we expect them to be always padded.
debug_assert!(
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 =
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
} else {
let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }
} else {
let mut buffer = [0u8; 8];
for i in addr..data.len() {
buffer[i - addr] += data[i];
}
unsafe { ptr::read_unaligned(buffer[..].as_ptr() as *const u64) }
};
let val_shifted = val_unshifted_unmasked >> (bit_shift as u64);
val_shifted & mask
}
}
/// Reads a range of values from the fast field.
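The address arithmetic in the bit-unpacking code above is easier to see in isolation. Below is a self-contained sketch (not the tantivy implementation) that follows the safe fallback path, copying at most 8 bytes into a local buffer instead of doing an unaligned read:

```rust
// Sketch: extract the idx-th value of a little-endian bit-packed sequence,
// where each value occupies `num_bits` bits.
fn get_packed(data: &[u8], idx: usize, num_bits: usize) -> u64 {
    let mask = if num_bits == 64 { u64::MAX } else { (1u64 << num_bits) - 1 };
    let addr_in_bits = idx * num_bits;
    let addr = addr_in_bits >> 3;     // first byte containing the value
    let bit_shift = addr_in_bits & 7; // bit offset within that byte
    let mut buffer = [0u8; 8];
    let len = data.len().min(addr + 8) - addr;
    buffer[..len].copy_from_slice(&data[addr..addr + len]);
    let val_unshifted_unmasked = u64::from_le_bytes(buffer);
    (val_unshifted_unmasked >> bit_shift) & mask
}

fn main() {
    // The values 0..8 packed on 3 bits each (least significant bits first).
    let packed: [u8; 3] = [0b1000_1000, 0b1100_0110, 0b1111_1010];
    for i in 0..8 {
        assert_eq!(get_packed(&packed, i, 3), i as u64);
    }
}
```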

View File

@@ -72,8 +72,7 @@ impl<W: Write> CompositeWrite<W> {
let footer_offset = self.write.written_bytes();
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
let mut offset_fields: Vec<_> = self
.offsets
let mut offset_fields: Vec<_> = self.offsets
.iter()
.map(|(file_addr, offset)| (*offset, *file_addr))
.collect();

View File

@@ -34,8 +34,7 @@ impl BlockEncoder {
let num_bits = self.bitpacker.num_bits_sorted(offset, block);
self.output[0] = num_bits;
let written_size =
1 + self
.bitpacker
1 + self.bitpacker
.compress_sorted(offset, block, &mut self.output[1..], num_bits);
&self.output[..written_size]
}
@@ -43,8 +42,7 @@ impl BlockEncoder {
pub fn compress_block_unsorted(&mut self, block: &[u32]) -> &[u8] {
let num_bits = self.bitpacker.num_bits(block);
self.output[0] = num_bits;
let written_size = 1 + self
.bitpacker
let written_size = 1 + self.bitpacker
.compress(block, &mut self.output[1..], num_bits);
&self.output[..written_size]
}
@@ -85,8 +83,7 @@ impl BlockDecoder {
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
let num_bits = compressed_data[0];
self.output_len = COMPRESSION_BLOCK_SIZE;
1 + self
.bitpacker
1 + self.bitpacker
.decompress(&compressed_data[1..], &mut self.output, num_bits)
}

View File

@@ -42,8 +42,7 @@ impl CompressedIntStream {
// no need to read.
self.cached_next_addr
} else {
let next_addr = addr + self
.block_decoder
let next_addr = addr + self.block_decoder
.uncompress_block_unsorted(self.buffer.slice_from(addr));
self.cached_addr = addr;
self.cached_next_addr = next_addr;

View File

@@ -21,7 +21,6 @@ use directory::ManagedDirectory;
use directory::MmapDirectory;
use directory::{Directory, RAMDirectory};
use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN;
use indexer::segment_updater::save_new_metas;
use indexer::DirectoryLock;
use num_cpus;
@@ -52,7 +51,12 @@ impl Index {
/// This should only be used for unit tests.
pub fn create_in_ram(schema: Schema) -> Index {
let ram_directory = RAMDirectory::create();
Index::create(ram_directory, schema).expect("Creating a RAMDirectory should never fail")
// unwrap is ok here
let directory = ManagedDirectory::new(ram_directory).expect(
"Creating a managed directory from a brand new RAM directory \
should never fail.",
);
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
}
/// Creates a new index in a given filepath.
@@ -60,9 +64,15 @@ impl Index {
///
/// If a previous index was in this directory, then its meta file will be destroyed.
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
Index::create(mmap_directory, schema)
let directory = ManagedDirectory::new(mmap_directory)?;
Index::from_directory(directory, schema)
}
/// Accessor for the tokenizer manager.
pub fn tokenizers(&self) -> &TokenizerManager {
&self.tokenizers
}
/// Creates a new index in a temp directory.
@@ -76,22 +86,10 @@ impl Index {
#[cfg(feature = "mmap")]
pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::create_from_tempdir()?;
Index::create(mmap_directory, schema)
}
/// Creates a new index given an implementation of the trait `Directory`
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
let directory = ManagedDirectory::new(dir)?;
let directory = ManagedDirectory::new(mmap_directory)?;
Index::from_directory(directory, schema)
}
/// Create a new index from a directory.
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas)
}
/// Creates a new index given a directory and an `IndexMeta`.
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
let schema = metas.schema.clone();
@@ -105,22 +103,24 @@ impl Index {
Ok(index)
}
/// Accessor for the tokenizer manager.
pub fn tokenizers(&self) -> &TokenizerManager {
&self.tokenizers
/// Open the index using the provided directory
pub fn open_directory<D: Directory>(directory: D) -> Result<Index> {
let directory = ManagedDirectory::new(directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}
/// Opens a new directory from an index path.
#[cfg(feature = "mmap")]
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
Index::open(mmap_directory)
Index::open_directory(mmap_directory)
}
/// Open the index using the provided directory
pub fn open<D: Directory>(directory: D) -> Result<Index> {
let directory = ManagedDirectory::new(directory)?;
let metas = load_metas(&directory)?;
/// Create a new index from a directory.
pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas)
}
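To summarize the renaming above, a hedged usage sketch (illustrative only; assumes a writable `index_path`, an existing `schema`, and the crate-level `Result` alias):
use std::path::Path;

fn create_then_reopen(index_path: &Path, schema: Schema) -> Result<Index> {
    let _index = Index::create(index_path, schema)?; // formerly `create_in_dir`
    Index::open(index_path)                          // formerly `open_in_dir`
}
Directory-based construction now goes through `Index::open_directory` and `Index::from_directory` instead.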
@@ -137,13 +137,9 @@ impl Index {
/// `IndexWriter` on the system is accessing the index directory,
/// it is safe to manually delete the lockfile.
///
/// - `num_threads` defines the number of indexing workers that
/// num_threads specifies the number of indexing workers that
/// should work at the same time.
///
/// - `overall_heap_size_in_bytes` sets the amount of memory
/// allocated for all indexing thread.
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
@@ -151,35 +147,21 @@ impl Index {
pub fn writer_with_num_threads(
&self,
num_threads: usize,
overall_heap_size_in_bytes: usize,
heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
open_index_writer(
self,
num_threads,
heap_size_in_bytes_per_thread,
directory_lock,
)
open_index_writer(self, num_threads, heap_size_in_bytes, directory_lock)
}
/// Creates a multithreaded writer
///
/// Tantivy will automatically define the number of threads to use.
/// `overall_heap_size_in_bytes` is the total target memory usage that will be split
/// between a given number of threads.
/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> Result<IndexWriter> {
let mut num_threads = num_cpus::get();
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
num_threads = (overall_heap_size_in_bytes / HEAP_SIZE_MIN).max(1);
}
self.writer_with_num_threads(num_threads, overall_heap_size_in_bytes)
pub fn writer(&self, heap_size_in_bytes: usize) -> Result<IndexWriter> {
self.writer_with_num_threads(num_cpus::get(), heap_size_in_bytes)
}
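A minimal helper sketch for the writer API after this change (illustrative; assumes the budget is at least `HEAP_SIZE_LIMIT`, per the check in `open_index_writer`):
fn forty_mb_writer(index: &Index) -> Result<IndexWriter> {
    // One indexing thread with a 40 MB heap budget, mirroring the tests below.
    index.writer_with_num_threads(1, 40_000_000)
}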
/// Accessor to the index schema
@@ -191,8 +173,7 @@ impl Index {
/// Returns the list of segments that are searchable
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
Ok(self
.searchable_segment_metas()?
Ok(self.searchable_segment_metas()?
.into_iter()
.map(|segment_meta| self.segment(segment_meta))
.collect())
@@ -227,8 +208,7 @@ impl Index {
/// Returns the list of segment ids that are searchable.
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
Ok(self
.searchable_segment_metas()?
Ok(self.searchable_segment_metas()?
.iter()
.map(|segment_meta| segment_meta.id())
.collect())

View File

@@ -87,8 +87,7 @@ impl<T> Deref for LeasedItem<T> {
type Target = T;
fn deref(&self) -> &T {
&self
.gen_item
&self.gen_item
.as_ref()
.expect("Unwrapping a leased item should never fail")
.item // unwrap is safe here
@@ -97,8 +96,7 @@ impl<T> Deref for LeasedItem<T> {
impl<T> DerefMut for LeasedItem<T> {
fn deref_mut(&mut self) -> &mut T {
&mut self
.gen_item
&mut self.gen_item
.as_mut()
.expect("Unwrapping a mut leased item should never fail")
.item // unwrap is safe here

View File

@@ -73,13 +73,12 @@ impl Searcher {
/// Runs a query on the segment readers wrapped by the searcher
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
query.search(self, collector)
collector.search(self, query)
}
/// Return the field searcher associated to a `Field`.
pub fn field(&self, field: Field) -> FieldSearcher {
let inv_index_readers = self
.segment_readers
let inv_index_readers = self.segment_readers
.iter()
.map(|segment_reader| segment_reader.inverted_index(field))
.collect::<Vec<_>>();
@@ -99,8 +98,7 @@ impl FieldSearcher {
/// Returns a Stream over all of the sorted unique terms
/// for the given field.
pub fn terms(&self) -> TermMerger {
let term_streamers: Vec<_> = self
.inv_index_readers
let term_streamers: Vec<_> = self.inv_index_readers
.iter()
.map(|inverted_index| inverted_index.terms().stream())
.collect();
@@ -110,8 +108,7 @@ impl FieldSearcher {
impl fmt::Debug for Searcher {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let segment_ids = self
.segment_readers
let segment_ids = self.segment_readers
.iter()
.map(|segment_reader| segment_reader.segment_id())
.collect::<Vec<_>>();

View File

@@ -156,13 +156,11 @@ impl SegmentReader {
&FieldType::Bytes => {}
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
}
let idx_reader = self
.fast_fields_composite
let idx_reader = self.fast_fields_composite
.open_read_with_idx(field, 0)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
let values = self
.fast_fields_composite
let values = self.fast_fields_composite
.open_read_with_idx(field, 1)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
Ok(BytesFastFieldReader::open(idx_reader, values))
@@ -274,8 +272,7 @@ impl SegmentReader {
/// term dictionary associated to a specific field,
/// and opening the posting list associated to any term.
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
if let Some(inv_idx_reader) = self
.inv_idx_reader_cache
if let Some(inv_idx_reader) = self.inv_idx_reader_cache
.read()
.expect("Lock poisoned. This should never happen")
.get(&field)
@@ -304,13 +301,11 @@ impl SegmentReader {
let postings_source = postings_source_opt.unwrap();
let termdict_source = self
.termdict_composite
let termdict_source = self.termdict_composite
.open_read(field)
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
let positions_source = self
.positions_composite
let positions_source = self.positions_composite
.open_read(field)
.expect("Index corrupted. Failed to open field positions in composite file.");

src/datastruct/mod.rs Normal file
View File

@@ -0,0 +1,4 @@
mod skip;
pub mod stacker;
pub use self::skip::{SkipList, SkipListBuilder};

View File

@@ -72,8 +72,7 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
let mut skip_pointer = self.data_layer.insert(key, dest)?;
loop {
skip_pointer = match skip_pointer {
Some((skip_doc_id, skip_offset)) => self
.get_skip_layer(layer_id)
Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset)?,
None => {
return Ok(());

View File

@@ -0,0 +1,168 @@
use super::heap::{Heap, HeapAllocable};
use std::mem;
#[inline]
pub fn is_power_of_2(val: u32) -> bool {
val & (val - 1) == 0
}
#[inline]
pub fn jump_needed(val: u32) -> bool {
val > 3 && is_power_of_2(val)
}
#[derive(Debug, Clone)]
pub struct ExpUnrolledLinkedList {
len: u32,
end: u32,
val0: u32,
val1: u32,
val2: u32,
next: u32, // inline of the first block
}
impl ExpUnrolledLinkedList {
pub fn iter<'a>(&self, addr: u32, heap: &'a Heap) -> ExpUnrolledLinkedListIterator<'a> {
ExpUnrolledLinkedListIterator {
heap,
addr: addr + 2u32 * (mem::size_of::<u32>() as u32),
len: self.len,
consumed: 0,
}
}
pub fn push(&mut self, val: u32, heap: &Heap) {
self.len += 1;
if jump_needed(self.len) {
// we need to allocate another block.
// ... As we want to grow blocks exponentially,
// the next block has a size of (the length so far),
// and we need to add 1u32 to store the pointer
// to the next element.
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
let new_block_addr: u32 = heap.allocate_space(new_block_size);
heap.set(self.end, &new_block_addr);
self.end = new_block_addr;
}
heap.set(self.end, &val);
self.end += mem::size_of::<u32>() as u32;
}
}
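To make the growth pattern concrete, a small hedged check mirroring `jump_needed` above (illustrative, not part of this diff): a fresh block is allocated whenever the element count crosses 4, 8, 16, ..., so capacity roughly doubles each time.
#[test]
fn block_growth_points() {
    let jump = |len: u32| len > 3 && (len & (len - 1)) == 0;
    assert!(!jump(3));  // the first three pushes use the inlined slots
    assert!(jump(4));   // first heap block
    assert!(!jump(5));
    assert!(jump(8));   // next block, twice as large
    assert!(jump(16));
}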
impl HeapAllocable for u32 {
fn with_addr(_addr: u32) -> u32 {
0u32
}
}
impl HeapAllocable for ExpUnrolledLinkedList {
fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
let last_addr = addr + mem::size_of::<u32>() as u32 * 2u32;
ExpUnrolledLinkedList {
len: 0u32,
end: last_addr,
val0: 0u32,
val1: 0u32,
val2: 0u32,
next: 0u32,
}
}
}
pub struct ExpUnrolledLinkedListIterator<'a> {
heap: &'a Heap,
addr: u32,
len: u32,
consumed: u32,
}
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
type Item = u32;
fn next(&mut self) -> Option<u32> {
if self.consumed == self.len {
None
} else {
let addr: u32;
self.consumed += 1;
if jump_needed(self.consumed) {
addr = *self.heap.get_mut_ref(self.addr);
} else {
addr = self.addr;
}
self.addr = addr + mem::size_of::<u32>() as u32;
Some(*self.heap.get_mut_ref(addr))
}
}
}
#[cfg(test)]
mod tests {
use super::super::heap::Heap;
use super::*;
#[test]
fn test_stack() {
let heap = Heap::with_capacity(1_000_000);
let (addr, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
stack.push(1u32, &heap);
stack.push(2u32, &heap);
stack.push(4u32, &heap);
stack.push(8u32, &heap);
{
let mut it = stack.iter(addr, &heap);
assert_eq!(it.next().unwrap(), 1u32);
assert_eq!(it.next().unwrap(), 2u32);
assert_eq!(it.next().unwrap(), 4u32);
assert_eq!(it.next().unwrap(), 8u32);
assert!(it.next().is_none());
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::ExpUnrolledLinkedList;
use super::Heap;
use test::Bencher;
const NUM_STACK: usize = 10_000;
const STACK_SIZE: u32 = 1000;
#[bench]
fn bench_push_vec(bench: &mut Bencher) {
bench.iter(|| {
let mut vecs = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
vecs.push(Vec::new());
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;
vecs[t].push(i);
}
}
});
}
#[bench]
fn bench_push_stack(bench: &mut Bencher) {
let heap = Heap::with_capacity(64_000_000);
bench.iter(|| {
let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
stacks.push(stack);
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;
stacks[t].push(i, &heap);
}
}
heap.clear();
});
}
}

View File

@@ -0,0 +1,335 @@
use super::heap::{BytesRef, Heap, HeapAllocable};
use postings::UnorderedTermId;
use std::iter;
use std::mem;
use std::slice;
mod murmurhash2 {
const SEED: u32 = 3_242_157_231u32;
const M: u32 = 0x5bd1_e995;
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let len = key.len() as u32;
let mut h: u32 = SEED ^ len;
let num_blocks = len >> 2;
for _ in 0..num_blocks {
let mut k: u32 = unsafe { *key_ptr }; // ok because of num_blocks definition
k = k.wrapping_mul(M);
k ^= k >> 24;
k = k.wrapping_mul(M);
h = h.wrapping_mul(M);
h ^= k;
key_ptr = key_ptr.wrapping_offset(1);
}
// Handle the last few bytes of the input array
let remaining: &[u8] = &key[key.len() & !3..];
match remaining.len() {
3 => {
h ^= u32::from(remaining[2]) << 16;
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
2 => {
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
1 => {
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
_ => {}
}
h ^= h >> 13;
h = h.wrapping_mul(M);
h ^ (h >> 15)
}
}
/// Split the thread memory budget into
/// - the heap size
/// - the hash table "table" itself.
///
/// Returns (the heap size in bytes, the hash table size in number of bits)
pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
let table_size_limit: usize = per_thread_memory_budget / 3;
let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
let table_num_bits: usize = (1..)
.into_iter()
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
.last()
.expect(&format!(
"Per thread memory is too small: {}",
per_thread_memory_budget
));
let table_size = compute_table_size(table_num_bits);
let heap_size = per_thread_memory_budget - table_size;
(heap_size, table_num_bits)
}
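A worked example of the split, consistent with the `test_hashmap_size` unit test further down (assuming `KeyValue` occupies 8 bytes):
#[test]
fn split_memory_worked_example() {
    // 1 MB budget: the table may use at most ~333 KB. The largest power-of-two
    // table that fits is 2^15 entries * 8 bytes = 262_144 bytes, which leaves
    // 1_000_000 - 262_144 = 737_856 bytes for the heap.
    assert_eq!(split_memory(1_000_000), (737_856, 15));
}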
/// `KeyValue` is the item stored in the hash table.
/// The key is actually a `BytesRef` object stored in an external heap.
/// The `value_addr` also points to an address in the heap.
///
/// The key and the value are stored contiguously.
/// For this reason, the (start, stop) information is redundant
/// and can be simplified in the future.
#[derive(Copy, Clone, Default)]
struct KeyValue {
key_value_addr: BytesRef,
hash: u32,
}
impl KeyValue {
fn is_empty(&self) -> bool {
self.key_value_addr.is_null()
}
}
/// Customized `HashMap` with string keys
///
/// This `HashMap` takes strings as keys. Keys are
/// stored in a user-defined heap.
///
/// The quirky API has the benefit of avoiding
/// the computation of the hash of the key twice,
/// or copying the key as long as there is no insert.
///
pub struct TermHashMap<'a> {
table: Box<[KeyValue]>,
heap: &'a Heap,
mask: usize,
occupied: Vec<usize>,
}
struct QuadraticProbing {
hash: usize,
i: usize,
mask: usize,
}
impl QuadraticProbing {
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
QuadraticProbing { hash, i: 0, mask }
}
#[inline]
fn next_probe(&mut self) -> usize {
self.i += 1;
(self.hash + self.i * self.i) & self.mask
}
}
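For intuition, the probe order can be written as an iterator (a hedged sketch, not part of this diff): buckets are visited at offsets 1, 4, 9, 16, ... from the initial hash, wrapped by the table mask.
// Illustrative only; mirrors `next_probe` above, with wrapping arithmetic
// to keep the sketch panic-free.
fn probe_sequence(hash: usize, mask: usize) -> impl Iterator<Item = usize> {
    (1usize..).map(move |i| hash.wrapping_add(i * i) & mask)
}
Combined with `is_saturated` keeping the table at most roughly one-third full, probe chains stay short in practice.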
pub struct Iter<'a: 'b, 'b> {
hashmap: &'b TermHashMap<'a>,
inner: slice::Iter<'a, usize>,
}
impl<'a, 'b> Iterator for Iter<'a, 'b> {
type Item = (&'b [u8], u32, UnorderedTermId);
fn next(&mut self) -> Option<Self::Item> {
self.inner.next().cloned().map(move |bucket: usize| {
let kv = self.hashmap.table[bucket];
let (key, offset): (&'b [u8], u32) = self.hashmap.get_key_value(kv.key_value_addr);
(key, offset, bucket as UnorderedTermId)
})
}
}
impl<'a> TermHashMap<'a> {
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
let table_size = 1 << num_bucket_power_of_2;
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
TermHashMap {
table: table.into_boxed_slice(),
heap,
mask: table_size - 1,
occupied: Vec::with_capacity(table_size / 2),
}
}
fn probe(&self, hash: u32) -> QuadraticProbing {
QuadraticProbing::compute(hash as usize, self.mask)
}
pub fn is_saturated(&self) -> bool {
self.table.len() < self.occupied.len() * 3
}
#[inline(never)]
fn get_key_value(&self, bytes_ref: BytesRef) -> (&[u8], u32) {
let key_bytes: &[u8] = self.heap.get_slice(bytes_ref);
let expull_addr: u32 = bytes_ref.addr() + 2 + key_bytes.len() as u32;
(key_bytes, expull_addr)
}
pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
self.occupied.push(bucket);
self.table[bucket] = KeyValue {
key_value_addr,
hash,
};
}
pub fn iter<'b: 'a>(&'b self) -> Iter<'a, 'b> {
Iter {
inner: self.occupied.iter(),
hashmap: &self,
}
}
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
&mut self,
key: S,
) -> (UnorderedTermId, &mut V) {
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
let key_bytes_ref = self.heap.allocate_and_set(key_bytes);
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
self.set_bucket(hash, key_bytes_ref, bucket);
return (bucket as UnorderedTermId, val);
} else if kv.hash == hash {
let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
if stored_key == key_bytes {
return (
bucket as UnorderedTermId,
self.heap.get_mut_ref(expull_addr),
);
}
}
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::murmurhash2::murmurhash2;
use test::Bencher;
#[bench]
fn bench_murmurhash2(b: &mut Bencher) {
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
let mut s = 0;
for &key in &keys {
s ^= murmurhash2(key.as_bytes());
}
s
});
}
}
#[cfg(test)]
mod tests {
use super::super::heap::{Heap, HeapAllocable};
use super::murmurhash2::murmurhash2;
use super::split_memory;
use super::*;
use std::collections::HashSet;
struct TestValue {
val: u32,
_addr: u32,
}
impl HeapAllocable for TestValue {
fn with_addr(addr: u32) -> TestValue {
TestValue {
val: 0u32,
_addr: addr,
}
}
}
#[test]
fn test_hashmap_size() {
assert_eq!(split_memory(100_000), (67232, 12));
assert_eq!(split_memory(1_000_000), (737856, 15));
assert_eq!(split_memory(10_000_000), (7902848, 18));
}
#[test]
fn test_hash_map() {
let heap = Heap::with_capacity(2_000_000);
let mut hash_map: TermHashMap = TermHashMap::new(18, &heap);
{
let v: &mut TestValue = hash_map.get_or_create("abc").1;
assert_eq!(v.val, 0u32);
v.val = 3u32;
}
{
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
assert_eq!(v.val, 0u32);
v.val = 4u32;
}
{
let v: &mut TestValue = hash_map.get_or_create("abc").1;
assert_eq!(v.val, 3u32);
}
{
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
assert_eq!(v.val, 4u32);
}
let mut iter_values = hash_map.iter();
{
let (_, addr, _) = iter_values.next().unwrap();
let val: &TestValue = heap.get_ref(addr);
assert_eq!(val.val, 3u32);
}
{
let (_, addr, _) = iter_values.next().unwrap();
let val: &TestValue = heap.get_ref(addr);
assert_eq!(val.val, 4u32);
}
assert!(iter_values.next().is_none());
}
#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
#[test]
fn test_murmur_against_reference_impl() {
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
}
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
assert_eq!(set.len(), 10_000);
}
}

View File

@@ -0,0 +1,233 @@
use byteorder::{ByteOrder, NativeEndian};
use std::cell::UnsafeCell;
use std::mem;
use std::ptr;
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
///
/// The slice encodes the length of the `&[u8]` slice
/// on 16 bits, followed by the data itself.
#[derive(Copy, Clone)]
pub struct BytesRef(u32);
impl BytesRef {
pub fn is_null(&self) -> bool {
self.0 == u32::max_value()
}
pub fn addr(&self) -> u32 {
self.0
}
}
impl Default for BytesRef {
fn default() -> BytesRef {
BytesRef(u32::max_value())
}
}
/// Object that can be allocated in tantivy's custom `Heap`.
pub trait HeapAllocable {
fn with_addr(addr: u32) -> Self;
}
/// Tantivy's custom `Heap`.
pub struct Heap {
inner: UnsafeCell<InnerHeap>,
}
#[cfg_attr(feature = "cargo-clippy", allow(mut_from_ref))]
impl Heap {
/// Creates a new heap with a given capacity
pub fn with_capacity(num_bytes: usize) -> Heap {
Heap {
inner: UnsafeCell::new(InnerHeap::with_capacity(num_bytes)),
}
}
fn inner(&self) -> &mut InnerHeap {
unsafe { &mut *self.inner.get() }
}
/// Clears the heap. All the underlying data is lost.
///
/// This heap does not support deallocation.
/// This method is the only way to free memory.
pub fn clear(&self) {
self.inner().clear();
}
/// Return amount of free space, in bytes.
pub fn num_free_bytes(&self) -> u32 {
self.inner().num_free_bytes()
}
/// Allocates a given amount of space and returns an address
/// in the Heap.
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
self.inner().allocate_space(num_bytes)
}
/// Allocate an object in the heap
pub fn allocate_object<V: HeapAllocable>(&self) -> (u32, &mut V) {
let addr = self.inner().allocate_space(mem::size_of::<V>());
let v: V = V::with_addr(addr);
self.inner().set(addr, &v);
(addr, self.inner().get_mut_ref(addr))
}
/// Stores a `&[u8]` in the heap and returns the destination BytesRef.
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
self.inner().allocate_and_set(data)
}
/// Fetches the `&[u8]` stored in the slice defined by the `BytesRef`
/// given as argument.
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
self.inner().get_slice(bytes_ref)
}
/// Stores an item's data in the heap, at the given `address`.
pub fn set<Item>(&self, addr: u32, val: &Item) {
self.inner().set(addr, val);
}
/// Returns a mutable reference to the `Item` stored at the given address.
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
}
/// Returns a mutable reference to an `Item` at a given `addr`.
#[cfg(test)]
pub fn get_ref<Item>(&self, addr: u32) -> &mut Item {
self.get_mut_ref(addr)
}
}
struct InnerHeap {
buffer: Vec<u8>,
buffer_len: u32,
used: u32,
next_heap: Option<Box<InnerHeap>>,
}
impl InnerHeap {
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
let buffer: Vec<u8> = vec![0u8; num_bytes];
InnerHeap {
buffer,
buffer_len: num_bytes as u32,
next_heap: None,
used: 0u32,
}
}
pub fn clear(&mut self) {
self.used = 0u32;
self.next_heap = None;
}
// Returns the number of free bytes. If the buffer
// has reached its capacity and overflowed to another buffer, returns 0.
pub fn num_free_bytes(&self) -> u32 {
if self.next_heap.is_some() {
0u32
} else {
self.buffer_len - self.used
}
}
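A small hedged check of the overflow behaviour described in the comment above (illustrative; the exact numbers are arbitrary):
#[test]
fn overflow_reports_no_free_bytes() {
    let heap = Heap::with_capacity(16);
    heap.allocate_space(10);
    heap.allocate_space(10); // spills into a chained buffer
    assert_eq!(heap.num_free_bytes(), 0);
}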
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
let addr = self.used;
self.used += num_bytes as u32;
if self.used <= self.buffer_len {
addr
} else {
if self.next_heap.is_none() {
info!(
r#"Exceeded heap size. The segment will be committed right
after indexing this document."#,
);
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
}
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
}
}
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
let start = bytes_ref.0;
if start >= self.buffer_len {
self.next_heap
.as_ref()
.unwrap()
.get_slice(BytesRef(start - self.buffer_len))
} else {
let start = start as usize;
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
&self.buffer[start + 2..start + 2 + len]
}
}
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
if start >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
} else {
&mut self.buffer[start as usize..stop as usize]
}
}
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
assert!(data.len() < u16::max_value() as usize);
let total_len = 2 + data.len();
let start = self.allocate_space(total_len);
let total_buff = self.get_mut_slice(start, start + total_len as u32);
NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16);
total_buff[2..].clone_from_slice(data);
BytesRef(start)
}
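To illustrate the layout written by `allocate_and_set`, a hedged sketch (not part of this diff; `encode_slice` is a hypothetical mirror of the on-heap encoding):
use byteorder::{ByteOrder, NativeEndian};

// Mirrors the heap layout: a 2-byte native-endian length prefix, then the bytes.
// On a little-endian host, b"abc" becomes [0x03, 0x00, b'a', b'b', b'c'].
fn encode_slice(data: &[u8]) -> Vec<u8> {
    let mut buf = vec![0u8; 2 + data.len()];
    NativeEndian::write_u16(&mut buf[0..2], data.len() as u16);
    buf[2..].copy_from_slice(data);
    buf
}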
fn get_mut(&mut self, addr: u32) -> *mut u8 {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut(addr - self.buffer_len)
} else {
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
}
}
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_ref(addr - self.buffer_len)
} else {
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
unsafe { &mut *v_ptr }
}
}
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.set(addr - self.buffer_len, val);
} else {
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
unsafe {
let dest_ptr: *mut u8 = self.get_mut(addr);
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
}
}
}
}

View File

@@ -0,0 +1,43 @@
mod expull;
pub(crate) mod hashmap;
mod heap;
pub use self::expull::ExpUnrolledLinkedList;
pub use self::hashmap::TermHashMap;
pub use self::heap::{Heap, HeapAllocable};
#[test]
fn test_unrolled_linked_list() {
use std::collections;
let heap = Heap::with_capacity(30_000_000);
{
heap.clear();
let mut ks: Vec<usize> = (1..5).map(|k| k * 100).collect();
ks.push(2);
ks.push(3);
for k in (1..5).map(|k| k * 100) {
let mut hashmap: TermHashMap = TermHashMap::new(10, &heap);
for j in 0..k {
for i in 0..500 {
let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string()).1;
v.push(i * j, &heap);
}
}
let mut map_addr: collections::HashMap<Vec<u8>, u32> = collections::HashMap::new();
for (key, addr, _) in hashmap.iter() {
map_addr.insert(Vec::from(key), addr);
}
for i in 0..500 {
let key: String = i.to_string();
let addr: u32 = *map_addr.get(key.as_bytes()).unwrap();
let exp_pull: &ExpUnrolledLinkedList = heap.get_ref(addr);
let mut it = exp_pull.iter(addr, &heap);
for j in 0..k {
assert_eq!(it.next().unwrap(), i * j);
}
assert!(it.next().is_none());
}
}
}
}

View File

@@ -117,8 +117,7 @@ impl ManagedDirectory {
let mut files_to_delete = vec![];
{
// releasing the lock as .delete() will use it too.
let meta_informations_rlock = self
.meta_informations
let meta_informations_rlock = self.meta_informations
.read()
.expect("Managed directory rlock poisoned in garbage collect.");
@@ -171,8 +170,7 @@ impl ManagedDirectory {
if !deleted_files.is_empty() {
// update the list of managed files by removing
// the file that were removed.
let mut meta_informations_wlock = self
.meta_informations
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed directory wlock poisoned (2).");
{
@@ -195,8 +193,7 @@ impl ManagedDirectory {
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
let pathbuf = path.to_owned();
{
let mut meta_informations_wlock = self
.meta_informations
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned on protect");
*meta_informations_wlock
@@ -218,8 +215,7 @@ impl ManagedDirectory {
/// will not lead to garbage files that will
/// never get removed.
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
let mut meta_wlock = self
.meta_informations
let mut meta_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned");
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
@@ -252,8 +248,7 @@ impl Directory for ManagedDirectory {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
{
let metas_rlock = self
.meta_informations
let metas_rlock = self.meta_informations
.read()
.expect("poisoned lock in managed directory meta");
if let Some(counter) = metas_rlock.protected_files.get(path) {

View File

@@ -32,8 +32,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
}
})?;
let meta_data = file
.metadata()
let meta_data = file.metadata()
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
if meta_data.len() == 0 {
// if the file size is 0, it will not be possible
@@ -310,8 +309,7 @@ impl Directory for MmapDirectory {
// when the last reference is gone.
mmap_cache.cache.remove(&full_path);
match fs::remove_file(&full_path) {
Ok(_) => self
.sync_directory()
Ok(_) => self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {

View File

@@ -170,8 +170,7 @@ impl Directory for RAMDirectory {
let path_buf = PathBuf::from(path);
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
let exists = self
.fs
let exists = self.fs
.write(path_buf.clone(), &Vec::new())
.map_err(|err| IOError::with_path(path.to_owned(), err))?;

View File

@@ -41,8 +41,7 @@ pub struct DeleteBitSet {
impl DeleteBitSet {
/// Opens a delete bitset given its data source.
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
let num_deleted: usize = data
.as_slice()
let num_deleted: usize = data.as_slice()
.iter()
.map(|b| b.count_ones() as usize)
.sum();

View File

@@ -56,8 +56,7 @@ impl FacetReader {
/// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
let found_term = self
.term_dict
let found_term = self.term_dict
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
assert!(found_term, "Term ordinal {} not found.", facet_ord);
}

View File

@@ -52,8 +52,7 @@ impl DeleteQueue {
//
// Past delete operations are not accessible.
pub fn cursor(&self) -> DeleteCursor {
let last_block = self
.inner
let last_block = self.inner
.read()
.expect("Read lock poisoned when opening delete queue cursor")
.last_block
@@ -93,8 +92,7 @@ impl DeleteQueue {
// be some unflushed operations.
//
fn flush(&self) -> Option<Arc<Block>> {
let mut self_wlock = self
.inner
let mut self_wlock = self.inner
.write()
.expect("Failed to acquire write lock on delete queue writer");
@@ -134,8 +132,7 @@ impl From<DeleteQueue> for NextBlock {
impl NextBlock {
fn next_block(&self) -> Option<Arc<Block>> {
{
let next_read_lock = self
.0
let next_read_lock = self.0
.read()
.expect("Failed to acquire write lock in delete queue");
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
@@ -144,8 +141,7 @@ impl NextBlock {
}
let next_block;
{
let mut next_write_lock = self
.0
let mut next_write_lock = self.0
.write()
.expect("Failed to acquire write lock in delete queue");
match *next_write_lock {

View File

@@ -9,6 +9,8 @@ use core::SegmentComponent;
use core::SegmentId;
use core::SegmentMeta;
use core::SegmentReader;
use datastruct::stacker::hashmap::split_memory;
use datastruct::stacker::Heap;
use directory::FileProtection;
use docset::DocSet;
use error::{Error, ErrorKind, Result, ResultExt};
@@ -22,7 +24,6 @@ use indexer::DirectoryLock;
use indexer::MergePolicy;
use indexer::SegmentEntry;
use indexer::SegmentWriter;
use postings::compute_table_size;
use schema::Document;
use schema::IndexRecordOption;
use schema::Term;
@@ -33,11 +34,10 @@ use std::thread::JoinHandle;
// Size of the margin for the heap. A segment is closed when the remaining memory
// in the heap goes below MARGIN_IN_BYTES.
pub const MARGIN_IN_BYTES: usize = 1_000_000;
pub const MARGIN_IN_BYTES: u32 = 1_000_000u32;
// We impose the memory per thread to be at least 3 MB.
pub const HEAP_SIZE_MIN: usize = ((MARGIN_IN_BYTES as u32) * 3u32) as usize;
pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32;
// Adding a document will block if the number of docs waiting in the queue to be indexed
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`.
@@ -46,24 +46,6 @@ const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
type DocumentSender = chan::Sender<AddOperation>;
type DocumentReceiver = chan::Receiver<AddOperation>;
/// Split the thread memory budget into
/// - the heap size
/// - the hash table "table" itself.
///
/// Returns (the heap size in bytes, the hash table size in number of bits)
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
let table_size_limit: usize = per_thread_memory_budget / 3;
(1..)
.into_iter()
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
.last()
.expect(&format!(
"Per thread memory is too small: {}",
per_thread_memory_budget
))
.min(19) // we cap it at 512K
}
/// `IndexWriter` is the user entry-point to add documents to an index.
///
/// It manages a small number of indexing threads, as well as a shared
@@ -118,16 +100,11 @@ pub fn open_index_writer(
heap_size_in_bytes_per_thread: usize,
directory_lock: DirectoryLock,
) -> Result<IndexWriter> {
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
let err_msg = format!(
if heap_size_in_bytes_per_thread < HEAP_SIZE_LIMIT as usize {
panic!(format!(
"The heap size per thread needs to be at least {}.",
HEAP_SIZE_MIN
);
bail!(ErrorKind::InvalidArgument(err_msg));
}
if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX {
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
bail!(ErrorKind::InvalidArgument(err_msg));
HEAP_SIZE_LIMIT
));
}
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
@@ -262,29 +239,43 @@ pub fn advance_deletes(
}
fn index_documents(
memory_budget: usize,
heap: &mut Heap,
table_size: usize,
segment: &Segment,
generation: usize,
document_iterator: &mut Iterator<Item = AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> Result<bool> {
heap.clear();
let schema = segment.schema();
let segment_id = segment.id();
let table_size = initial_table_size(memory_budget);
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
let mut segment_writer =
SegmentWriter::for_segment(heap, table_size, segment.clone(), &schema)?;
for doc in document_iterator {
segment_writer.add_document(doc, &schema)?;
let mem_usage = segment_writer.mem_usage();
if mem_usage >= memory_budget - MARGIN_IN_BYTES {
// There are two possible conditions for closing the segment.
// One is that the memory arena dedicated to the segment is
// getting full.
if segment_writer.is_buffer_full() {
info!(
"Buffer limit reached, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
break;
}
// The second is that the term dictionary hash table
// is reaching saturation.
//
// Tantivy does not resize its hash table. When it reaches
// capacity, we simply stop indexing new documents.
if segment_writer.is_term_saturated() {
info!(
"Term dic saturated, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
break;
}
}
if !segment_updater.is_alive() {
@@ -342,8 +333,7 @@ impl IndexWriter {
}
drop(self.workers_join_handle);
let result = self
.segment_updater
let result = self.segment_updater
.wait_merging_thread()
.chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into()));
@@ -377,12 +367,14 @@ impl IndexWriter {
fn add_indexing_worker(&mut self) -> Result<()> {
let document_receiver_clone = self.document_receiver.clone();
let mut segment_updater = self.segment_updater.clone();
let (heap_size, table_size) = split_memory(self.heap_size_in_bytes_per_thread);
info!("heap size {}, table_size {}", heap_size, table_size);
let mut heap = Heap::with_capacity(heap_size);
let generation = self.generation;
let mut delete_cursor = self.delete_queue.cursor();
let mem_budget = self.heap_size_in_bytes_per_thread;
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
.name(format!(
"indexing thread {} for gen {}",
@@ -410,7 +402,8 @@ impl IndexWriter {
}
let segment = segment_updater.new_segment();
index_documents(
mem_budget,
&mut heap,
table_size,
&segment,
generation,
&mut document_iterator,
@@ -488,8 +481,7 @@ impl IndexWriter {
let document_receiver = self.document_receiver.clone();
// take the directory lock to create a new index_writer.
let directory_lock = self
._directory_lock
let directory_lock = self._directory_lock
.take()
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
@@ -645,7 +637,6 @@ impl IndexWriter {
#[cfg(test)]
mod tests {
use super::initial_table_size;
use env_logger;
use error::*;
use indexer::NoMergePolicy;
@@ -708,7 +699,7 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(3, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
index_writer.rollback().unwrap();
@@ -741,7 +732,7 @@ mod tests {
};
{
// writing the segment
let mut index_writer = index.writer(12_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
let mut doc = Document::default();
@@ -775,7 +766,7 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer(12_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
@@ -810,7 +801,7 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
@@ -840,12 +831,4 @@ mod tests {
assert_eq!(num_docs_containing("b"), 100);
}
#[test]
fn test_hashmap_size() {
assert_eq!(initial_table_size(100_000), 12);
assert_eq!(initial_table_size(1_000_000), 15);
assert_eq!(initial_table_size(10_000_000), 18);
assert_eq!(initial_table_size(1_000_000_000), 19);
}
}

View File

@@ -440,8 +440,7 @@ impl IndexMerger {
) -> Result<Option<TermOrdinalMapping>> {
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
let mut delta_computer = DeltaComputer::new();
let field_readers = self
.readers
let field_readers = self.readers
.iter()
.map(|reader| reader.inverted_index(indexed_field))
.collect::<Vec<_>>();
@@ -684,7 +683,7 @@ mod tests {
};
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
// writing the segment
{
@@ -734,7 +733,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.wait()
@@ -1139,125 +1138,126 @@ mod tests {
}
}
#[test]
fn test_merge_facets() {
let mut schema_builder = schema::SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet");
let index = Index::create_in_ram(schema_builder.build());
use schema::Facet;
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default();
for facet in doc_facets {
doc.add_facet(facet_field, Facet::from(facet));
}
index_writer.add_document(doc);
};
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
index_doc(&mut index_writer, &["/top/a", "/top/b"]);
index_doc(&mut index_writer, &["/top/a"]);
index_doc(&mut index_writer, &["/top/b", "/top/d"]);
index_doc(&mut index_writer, &["/top/d"]);
index_doc(&mut index_writer, &["/top/e"]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/a"]);
index_doc(&mut index_writer, &["/top/b"]);
index_doc(&mut index_writer, &["/top/c"]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
index_writer.commit().expect("committed");
}
index.load_searchers().unwrap();
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top"));
use collector::{CountCollector, MultiCollector};
let mut count_collector = CountCollector::default();
{
let mut multi_collectors =
MultiCollector::from(vec![&mut count_collector, &mut facet_collector]);
searcher.search(&AllQuery, &mut multi_collectors).unwrap();
}
assert_eq!(count_collector.count(), expected_num_docs);
let facet_counts = facet_collector.harvest();
let facets: Vec<(String, u64)> = facet_counts
.get("/top")
.map(|(facet, count)| (facet.to_string(), count))
.collect();
assert_eq!(
facets,
expected
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
);
};
test_searcher(
11,
&[
("/top/a", 5),
("/top/b", 5),
("/top/c", 2),
("/top/d", 2),
("/top/e", 2),
("/top/f", 1),
],
);
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
index.load_searchers().unwrap();
test_searcher(
11,
&[
("/top/a", 5),
("/top/b", 5),
("/top/c", 2),
("/top/d", 2),
("/top/e", 2),
("/top/f", 1),
],
);
}
// Deleting one term
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term);
index_writer.commit().unwrap();
index.load_searchers().unwrap();
test_searcher(
9,
&[
("/top/a", 3),
("/top/b", 3),
("/top/c", 1),
("/top/d", 2),
("/top/e", 2),
("/top/f", 1),
],
);
}
}
// #[test]
// fn test_merge_facets() {
// let mut schema_builder = schema::SchemaBuilder::default();
// let facet_field = schema_builder.add_facet_field("facet");
// let index = Index::create_in_ram(schema_builder.build());
// use schema::Facet;
// {
// let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
// let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
// let mut doc = Document::default();
// for facet in doc_facets {
// doc.add_facet(facet_field, Facet::from(facet));
// }
// index_writer.add_document(doc);
// };
//
// index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
// index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
// index_doc(&mut index_writer, &["/top/a", "/top/b"]);
// index_doc(&mut index_writer, &["/top/a"]);
//
// index_doc(&mut index_writer, &["/top/b", "/top/d"]);
// index_doc(&mut index_writer, &["/top/d"]);
// index_doc(&mut index_writer, &["/top/e"]);
// index_writer.commit().expect("committed");
//
// index_doc(&mut index_writer, &["/top/a"]);
// index_doc(&mut index_writer, &["/top/b"]);
// index_doc(&mut index_writer, &["/top/c"]);
// index_writer.commit().expect("committed");
//
// index_doc(&mut index_writer, &["/top/e", "/top/f"]);
// index_writer.commit().expect("committed");
// }
// index.load_searchers().unwrap();
// let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
// let searcher = index.searcher();
// let mut facet_collector = FacetCollector::for_field(facet_field);
// facet_collector.add_facet(Facet::from("/top"));
// use collector::{CountCollector, MultiCollector};
// let mut count_collector = CountCollector::default();
// {
// let mut multi_collectors = MultiCollector::new();
// multi_collectors.add_collector(&mut count_collector);
// multi_collectors.add_collector(&mut facet_collector);
// searcher.search(&AllQuery, &mut multi_collectors).unwrap();
// }
// assert_eq!(count_collector.count(), expected_num_docs);
// let facet_counts = facet_collector.harvest();
// let facets: Vec<(String, u64)> = facet_counts
// .get("/top")
// .map(|(facet, count)| (facet.to_string(), count))
// .collect();
// assert_eq!(
// facets,
// expected
// .iter()
// .map(|&(facet_str, count)| (String::from(facet_str), count))
// .collect::<Vec<_>>()
// );
// };
// test_searcher(
// 11,
// &[
// ("/top/a", 5),
// ("/top/b", 5),
// ("/top/c", 2),
// ("/top/d", 2),
// ("/top/e", 2),
// ("/top/f", 1),
// ],
// );
//
// // Merging the segments
// {
// let segment_ids = index
// .searchable_segment_ids()
// .expect("Searchable segments failed.");
// let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
// index_writer
// .merge(&segment_ids)
// .wait()
// .expect("Merging failed");
// index_writer.wait_merging_threads().unwrap();
//
// index.load_searchers().unwrap();
// test_searcher(
// 11,
// &[
// ("/top/a", 5),
// ("/top/b", 5),
// ("/top/c", 2),
// ("/top/d", 2),
// ("/top/e", 2),
// ("/top/f", 1),
// ],
// );
// }
//
// // Deleting one term
// {
// let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
// let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
// let facet_term = Term::from_facet(facet_field, &facet);
// index_writer.delete_term(facet_term);
// index_writer.commit().unwrap();
// index.load_searchers().unwrap();
// test_searcher(
// 9,
// &[
// ("/top/a", 3),
// ("/top/b", 3),
// ("/top/c", 1),
// ("/top/d", 2),
// ("/top/e", 2),
// ("/top/f", 1),
// ],
// );
// }
// }
#[test]
fn test_merge_multivalued_int_fields_all_deleted() {

View File

@@ -59,8 +59,7 @@ impl SegmentRegister {
}
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
let mut segment_ids: Vec<SegmentMeta> = self
.segment_states
let mut segment_ids: Vec<SegmentMeta> = self.segment_states
.values()
.map(|segment_entry| segment_entry.meta().clone())
.collect();

View File

@@ -1,8 +1,10 @@
use super::operation::AddOperation;
use core::Segment;
use core::SerializableSegment;
use datastruct::stacker::Heap;
use fastfield::FastFieldsWriter;
use fieldnorm::FieldNormsWriter;
use indexer::index_writer::MARGIN_IN_BYTES;
use indexer::segment_serializer::SegmentSerializer;
use postings::MultiFieldPostingsWriter;
use schema::FieldType;
@@ -22,9 +24,10 @@ use Result;
///
/// They create the postings lists in anonymous memory.
/// The segment is laid out on disk when it gets `finalized`.
pub struct SegmentWriter {
pub struct SegmentWriter<'a> {
heap: &'a Heap,
max_doc: DocId,
multifield_postings: MultiFieldPostingsWriter,
multifield_postings: MultiFieldPostingsWriter<'a>,
segment_serializer: SegmentSerializer,
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FieldNormsWriter,
@@ -32,7 +35,7 @@ pub struct SegmentWriter {
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
}
impl SegmentWriter {
impl<'a> SegmentWriter<'a> {
/// Creates a new `SegmentWriter`
///
/// The arguments are defined as follows
@@ -43,12 +46,13 @@ impl SegmentWriter {
/// - segment: The segment being written
/// - schema
pub fn for_segment(
heap: &'a Heap,
table_bits: usize,
mut segment: Segment,
schema: &Schema,
) -> Result<SegmentWriter> {
) -> Result<SegmentWriter<'a>> {
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits);
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
let tokenizers = schema
.fields()
.iter()
@@ -64,6 +68,7 @@ impl SegmentWriter {
})
.collect();
Ok(SegmentWriter {
heap,
max_doc: 0,
multifield_postings,
fieldnorms_writer: FieldNormsWriter::for_schema(schema),
@@ -89,8 +94,22 @@ impl SegmentWriter {
Ok(self.doc_opstamps)
}
pub fn mem_usage(&self) -> usize {
self.multifield_postings.mem_usage()
/// Returns true iff the segment writer's buffer has reached capacity.
///
/// The limit is defined as `the user-defined heap size - an arbitrary margin of 1MB`.
/// The `Segment` is `finalize`d when the buffer gets full.
///
/// Because we cannot cut through a document, the margin is there to ensure that we rarely
/// exceed the heap size.
pub fn is_buffer_full(&self) -> bool {
self.heap.num_free_bytes() <= MARGIN_IN_BYTES
}
/// Returns true if the term dictionary hashmap is reaching capacity.
/// It is one of the conditions that triggers a `SegmentWriter` to
/// be finalized.
pub(crate) fn is_term_saturated(&self) -> bool {
self.multifield_postings.is_term_saturated()
}
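Taken together, the two predicates above feed the flush decision in `index_documents` (a hedged sketch of that combination, not the actual loop):
// Illustrative only: a segment is closed as soon as either condition holds.
fn should_close_segment(writer: &SegmentWriter) -> bool {
    writer.is_buffer_full() || writer.is_term_saturated()
}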
/// Indexes a new document
@@ -229,7 +248,7 @@ fn write(
Ok(())
}
impl SerializableSegment for SegmentWriter {
impl<'a> SerializableSegment for SegmentWriter<'a> {
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
let max_doc = self.max_doc;
write(

View File

@@ -55,7 +55,7 @@
//!
//! // Indexing documents
//!
//! let index = Index::create_in_dir(index_path, schema.clone())?;
//! let index = Index::create(index_path, schema.clone())?;
//!
//! // Here we use a buffer of 100MB that will be split
//! // between indexing threads.
@@ -136,11 +136,11 @@ extern crate combine;
extern crate crossbeam;
extern crate fnv;
extern crate fst;
extern crate fst_regex;
extern crate futures;
extern crate futures_cpupool;
extern crate itertools;
extern crate levenshtein_automata;
extern crate lz4;
extern crate num_cpus;
extern crate owning_ref;
extern crate regex;
@@ -188,7 +188,8 @@ mod compression;
mod core;
mod indexer;
#[allow(unused_doc_comments)]
mod datastruct;
#[allow(unused_doc_comment)]
mod error;
pub mod tokenizer;

View File

@@ -11,7 +11,6 @@ mod postings_writer;
mod recorder;
mod segment_postings;
mod serializer;
mod stacker;
mod term_info;
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
@@ -22,8 +21,6 @@ pub use self::term_info::TermInfo;
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
pub(crate) use self::stacker::compute_table_size;
pub use common::HasLen;
pub(crate) type UnorderedTermId = u64;
@@ -42,6 +39,7 @@ pub mod tests {
use core::Index;
use core::SegmentComponent;
use core::SegmentReader;
use datastruct::stacker::Heap;
use docset::{DocSet, SkipResult};
use fieldnorm::FieldNormReader;
use indexer::operation::AddOperation;
@@ -162,9 +160,10 @@ pub mod tests {
let index = Index::create_in_ram(schema.clone());
let segment = index.new_segment();
let heap = Heap::with_capacity(10_000_000);
{
let mut segment_writer =
SegmentWriter::for_segment(18, segment.clone(), &schema).unwrap();
SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap();
{
let mut doc = Document::default();
// checking that position works if the field has two values

View File

@@ -1,5 +1,4 @@
use super::stacker::{Addr, MemoryArena, TermHashMap};
use datastruct::stacker::{Heap, TermHashMap};
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
use postings::UnorderedTermId;
use postings::{FieldSerializer, InvertedIndexSerializer};
@@ -15,90 +14,80 @@ use tokenizer::TokenStream;
use DocId;
use Result;
fn posting_from_field_entry<'a>(field_entry: &FieldEntry) -> Box<PostingsWriter> {
fn posting_from_field_entry<'a>(
field_entry: &FieldEntry,
heap: &'a Heap,
) -> Box<PostingsWriter + 'a> {
match *field_entry.field_type() {
FieldType::Str(ref text_options) => text_options
.get_indexing_options()
.map(|indexing_options| match indexing_options.index_option() {
IndexRecordOption::Basic => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
IndexRecordOption::WithFreqs => {
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed()
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
}
IndexRecordOption::WithFreqsAndPositions => {
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
}
})
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)),
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
FieldType::Bytes => {
// FieldType::Bytes cannot actually be indexed.
// TODO fix during the indexer refactoring described in #276
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
}
}
pub struct MultiFieldPostingsWriter {
heap: MemoryArena,
pub struct MultiFieldPostingsWriter<'a> {
heap: &'a Heap,
schema: Schema,
term_index: TermHashMap,
per_field_postings_writers: Vec<Box<PostingsWriter>>,
term_index: TermHashMap<'a>,
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
}
impl MultiFieldPostingsWriter {
impl<'a> MultiFieldPostingsWriter<'a> {
/// Create a new `MultiFieldPostingsWriter` given
/// a schema and a heap.
pub fn new(schema: &Schema, table_bits: usize) -> MultiFieldPostingsWriter {
let term_index = TermHashMap::new(table_bits);
pub fn new(schema: &Schema, table_bits: usize, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
let term_index = TermHashMap::new(table_bits, heap);
let per_field_postings_writers: Vec<_> = schema
.fields()
.iter()
.map(|field_entry| posting_from_field_entry(field_entry))
.map(|field_entry| posting_from_field_entry(field_entry, heap))
.collect();
MultiFieldPostingsWriter {
heap: MemoryArena::new(),
schema: schema.clone(),
heap,
term_index,
per_field_postings_writers,
}
}
pub fn mem_usage(&self) -> usize {
self.term_index.mem_usage() + self.heap.mem_usage()
}
pub fn index_text(&mut self, doc: DocId, field: Field, token_stream: &mut TokenStream) -> u32 {
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
postings_writer.index_text(
&mut self.term_index,
doc,
field,
token_stream,
&mut self.heap,
)
postings_writer.index_text(&mut self.term_index, doc, field, token_stream, self.heap)
}
pub fn subscribe(&mut self, doc: DocId, term: &Term) -> UnorderedTermId {
let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, &mut self.heap)
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, self.heap)
}
/// Serializes the inverted index.
/// It pushes all terms, one field at a time, towards the
/// postings serializer.
#[allow(needless_range_loop)]
pub fn serialize(
&self,
serializer: &mut InvertedIndexSerializer,
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self
.term_index
.iter()
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
.collect();
let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
term_offsets.sort_by_key(|&(k, _, _)| k);
let mut offsets: Vec<(Field, usize)> = vec![];
@@ -153,19 +142,23 @@ impl MultiFieldPostingsWriter {
postings_writer.serialize(
&term_offsets[start..stop],
&mut field_serializer,
&self.term_index.heap,
&self.heap,
self.heap,
)?;
field_serializer.close()?;
}
Ok(unordered_term_mappings)
}
/// Return true iff the term dictionary is saturated.
pub fn is_term_saturated(&self) -> bool {
self.term_index.is_saturated()
}
}
/// The `PostingsWriter` is in charge of receiving documents
/// and building a `Segment` in anonymous memory.
///
/// `PostingsWriter` writes in a `MemoryArena`.
/// `PostingsWriter` writes in a `Heap`.
pub trait PostingsWriter {
/// Record that a document contains a term at a given position.
///
@@ -180,17 +173,16 @@ pub trait PostingsWriter {
doc: DocId,
pos: u32,
term: &Term,
heap: &mut MemoryArena,
heap: &Heap,
) -> UnorderedTermId;
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
term_addrs: &[(&[u8], u32, UnorderedTermId)],
serializer: &mut FieldSerializer,
term_heap: &MemoryArena,
heap: &MemoryArena,
heap: &Heap,
) -> io::Result<()>;
/// Tokenizes a text and subscribes all of its tokens.
@@ -200,7 +192,7 @@ pub trait PostingsWriter {
doc_id: DocId,
field: Field,
token_stream: &mut TokenStream,
heap: &mut MemoryArena,
heap: &Heap,
) -> u32 {
let mut term = Term::for_field(field);
let num_tokens = {
@@ -218,67 +210,61 @@ pub trait PostingsWriter {
/// The `SpecializedPostingsWriter` is just here to remove dynamic
/// dispatch on the recorder type.
pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
heap: &'a Heap,
total_num_tokens: u64,
_recorder_type: PhantomData<Rec>,
}
impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
/// constructor
pub fn new() -> SpecializedPostingsWriter<Rec> {
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
SpecializedPostingsWriter {
heap,
total_num_tokens: 0u64,
_recorder_type: PhantomData,
}
}
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
pub fn new_boxed() -> Box<PostingsWriter> {
Box::new(SpecializedPostingsWriter::<Rec>::new())
pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
}
}
impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec> {
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
fn subscribe(
&mut self,
term_index: &mut TermHashMap,
doc: DocId,
position: u32,
term: &Term,
heap: &mut MemoryArena,
heap: &Heap,
) -> UnorderedTermId {
debug_assert!(term.as_slice().len() >= 4);
self.total_num_tokens += 1;
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
if opt_recorder.is_some() {
let mut recorder = opt_recorder.unwrap();
let current_doc = recorder.current_doc();
if current_doc != doc {
recorder.close_doc(heap);
recorder.new_doc(doc, heap);
}
recorder.record_position(position, heap);
recorder
} else {
let mut recorder = Rec::new(heap);
recorder.new_doc(doc, heap);
recorder.record_position(position, heap);
recorder
let (term_ord, recorder): (UnorderedTermId, &mut Rec) = term_index.get_or_create(term);
let current_doc = recorder.current_doc();
if current_doc != doc {
if current_doc != u32::max_value() {
recorder.close_doc(heap);
}
}) as UnorderedTermId
recorder.new_doc(doc, heap);
}
self.total_num_tokens += 1;
recorder.record_position(position, heap);
term_ord
}
fn serialize(
&self,
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
term_addrs: &[(&[u8], u32, UnorderedTermId)],
serializer: &mut FieldSerializer,
termdict_heap: &MemoryArena,
heap: &MemoryArena,
heap: &Heap,
) -> io::Result<()> {
for &(term_bytes, addr, _) in term_addrs {
let recorder: Rec = unsafe { termdict_heap.read(addr) };
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
serializer.new_term(&term_bytes[4..])?;
recorder.serialize(serializer, heap)?;
recorder.serialize(addr, serializer, heap)?;
serializer.close_term()?;
}
Ok(())

View File

@@ -1,4 +1,4 @@
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
use postings::FieldSerializer;
use std::{self, io};
use DocId;
@@ -15,53 +15,62 @@ const POSITION_END: u32 = std::u32::MAX;
/// * the document id
/// * the term frequency
/// * the term positions
pub trait Recorder: Copy {
///
fn new(heap: &mut MemoryArena) -> Self;
pub trait Recorder: HeapAllocable {
/// Returns the current document
fn current_doc(&self) -> u32;
/// Starts recording information about a new document
/// This method shall only be called if the term is within the document.
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena);
fn new_doc(&mut self, doc: DocId, heap: &Heap);
/// Record the position of a term. For each document,
/// this method will be called `term_freq` times.
fn record_position(&mut self, position: u32, heap: &mut MemoryArena);
fn record_position(&mut self, position: u32, heap: &Heap);
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &mut MemoryArena);
fn close_doc(&mut self, heap: &Heap);
/// Pushes the postings information to the serializer.
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>;
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()>;
}
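// Illustrative sketch (not part of this change): how the indexer is expected to
// drive a `Recorder`, mirroring `SpecializedPostingsWriter::subscribe` above.
// For every (doc, position) occurrence of a term:
//
//     if recorder.current_doc() != doc {
//         // close the previous doc, unless there is none yet (current_doc == u32::max_value())
//         recorder.close_doc(heap);
//         recorder.new_doc(doc, heap);
//     }
//     recorder.record_position(position, heap);
//
// When the segment is flushed, `recorder.serialize(addr, serializer, heap)` is
// then called once per term to push the recorded postings to the `FieldSerializer`.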
/// Only records the doc ids
#[derive(Clone, Copy)]
pub struct NothingRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
}
impl Recorder for NothingRecorder {
fn new(heap: &mut MemoryArena) -> Self {
impl HeapAllocable for NothingRecorder {
fn with_addr(addr: u32) -> NothingRecorder {
NothingRecorder {
stack: ExpUnrolledLinkedList::new(heap),
stack: ExpUnrolledLinkedList::with_addr(addr),
current_doc: u32::max_value(),
}
}
}
impl Recorder for NothingRecorder {
fn current_doc(&self) -> DocId {
self.current_doc
}
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
self.current_doc = doc;
self.stack.push(doc, heap);
}
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
fn record_position(&mut self, _position: u32, _heap: &Heap) {}
fn close_doc(&mut self, _heap: &mut MemoryArena) {}
fn close_doc(&mut self, _heap: &Heap) {}
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
for doc in self.stack.iter(heap) {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
for doc in self.stack.iter(self_addr, heap) {
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
}
Ok(())
@@ -69,47 +78,52 @@ impl Recorder for NothingRecorder {
}
/// Recorder encoding document ids, and term frequencies
#[derive(Clone, Copy)]
pub struct TermFrequencyRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
current_tf: u32,
}
impl Recorder for TermFrequencyRecorder {
fn new(heap: &mut MemoryArena) -> Self {
impl HeapAllocable for TermFrequencyRecorder {
fn with_addr(addr: u32) -> TermFrequencyRecorder {
TermFrequencyRecorder {
stack: ExpUnrolledLinkedList::new(heap),
stack: ExpUnrolledLinkedList::with_addr(addr),
current_doc: u32::max_value(),
current_tf: 0u32,
}
}
}
impl Recorder for TermFrequencyRecorder {
fn current_doc(&self) -> DocId {
self.current_doc
}
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
self.current_doc = doc;
self.stack.push(doc, heap);
}
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
fn record_position(&mut self, _position: u32, _heap: &Heap) {
self.current_tf += 1;
}
fn close_doc(&mut self, heap: &mut MemoryArena) {
fn close_doc(&mut self, heap: &Heap) {
debug_assert!(self.current_tf > 0);
self.stack.push(self.current_tf, heap);
self.current_tf = 0;
}
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
// the last document has not been closed...
// its term freq is self.current_tf.
let mut doc_iter = self
.stack
.iter(heap)
let mut doc_iter = self.stack
.iter(self_addr, heap)
.chain(Some(self.current_tf).into_iter());
while let Some(doc) = doc_iter.next() {
@@ -123,40 +137,46 @@ impl Recorder for TermFrequencyRecorder {
}
/// Recorder encoding term frequencies as well as positions.
#[derive(Clone, Copy)]
pub struct TFAndPositionRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
}
impl Recorder for TFAndPositionRecorder {
fn new(heap: &mut MemoryArena) -> Self {
impl HeapAllocable for TFAndPositionRecorder {
fn with_addr(addr: u32) -> TFAndPositionRecorder {
TFAndPositionRecorder {
stack: ExpUnrolledLinkedList::new(heap),
stack: ExpUnrolledLinkedList::with_addr(addr),
current_doc: u32::max_value(),
}
}
}
impl Recorder for TFAndPositionRecorder {
fn current_doc(&self) -> DocId {
self.current_doc
}
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
self.current_doc = doc;
self.stack.push(doc, heap);
}
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
fn record_position(&mut self, position: u32, heap: &Heap) {
self.stack.push(position, heap);
}
fn close_doc(&mut self, heap: &mut MemoryArena) {
fn close_doc(&mut self, heap: &Heap) {
self.stack.push(POSITION_END, heap);
}
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
let mut doc_positions = Vec::with_capacity(100);
let mut positions_iter = self.stack.iter(heap);
let mut positions_iter = self.stack.iter(self_addr, heap);
while let Some(doc) = positions_iter.next() {
let mut prev_position = 0;
doc_positions.clear();

View File

@@ -399,8 +399,7 @@ impl BlockSegmentPostings {
/// Returns false iff there are no remaining blocks.
pub fn advance(&mut self) -> bool {
if self.num_bitpacked_blocks > 0 {
let num_consumed_bytes = self
.doc_decoder
let num_consumed_bytes = self.doc_decoder
.uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset);
self.remaining_data.advance(num_consumed_bytes);
match self.freq_reading_option {
@@ -410,8 +409,7 @@ impl BlockSegmentPostings {
self.remaining_data.advance(num_bytes_to_skip);
}
FreqReadingOption::ReadFreq => {
let num_consumed_bytes = self
.freq_decoder
let num_consumed_bytes = self.freq_decoder
.uncompress_block_unsorted(self.remaining_data.as_ref());
self.remaining_data.advance(num_consumed_bytes);
}

View File

@@ -160,8 +160,7 @@ impl<'a> FieldSerializer<'a> {
}
fn current_term_info(&self) -> TermInfo {
let (filepos, offset) = self
.positions_serializer_opt
let (filepos, offset) = self.positions_serializer_opt
.as_ref()
.map(|positions_serializer| positions_serializer.addr())
.unwrap_or((0u64, 0u8));
@@ -273,8 +272,7 @@ impl<W: Write> PostingsSerializer<W> {
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
{
// encode the doc ids
let block_encoded: &[u8] = self
.block_encoder
let block_encoded: &[u8] = self.block_encoder
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
self.postings_write.write_all(block_encoded)?;
@@ -300,16 +298,14 @@ impl<W: Write> PostingsSerializer<W> {
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded = self
.block_encoder
let block_encoded = self.block_encoder
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
self.doc_ids.clear();
}
// ... Idem for term frequencies
if self.termfreq_enabled {
let block_encoded = self
.block_encoder
let block_encoded = self.block_encoder
.compress_vint_unsorted(&self.term_freqs[..]);
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();

View File

@@ -1,218 +0,0 @@
use super::{Addr, MemoryArena};
use common::is_power_of_2;
use std::mem;
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
const FIRST_BLOCK: u32 = 4u32;
#[inline]
pub fn jump_needed(len: u32) -> Option<usize> {
match len {
0...3 => None,
4...MAX_BLOCK_LEN => {
if is_power_of_2(len as usize) {
Some(len as usize)
} else {
None
}
}
n => {
if n % MAX_BLOCK_LEN == 0 {
Some(MAX_BLOCK_LEN as usize)
} else {
None
}
}
}
}
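// Worked example (illustration only, not part of the diff): the `len` values for
// which `jump_needed(len)` returns `Some(..)`, given FIRST_BLOCK = 4 and
// MAX_BLOCK_LEN = 32768 as defined above.
//
//   len = 4, 8, 16, ..., 16384, 32768   => Some(len):   allocate a new block of `len` u32s
//   len = 65536, 98304, 131072, ...     => Some(32768): allocate a new block of 32768 u32s
//   any other len                       => None:        keep writing in the current block
//
// Block sizes therefore double until they reach 32768 and stay constant afterwards.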
/// An exponentially unrolled linked list.
///
/// The use case is as follows. Tantivy's indexer conceptually acts like a
/// `HashMap<Term, Vec<u32>>`. As we come across a given term in document
/// `D`, we lookup the term in the map and append the document id to its vector.
///
/// The vector is then only read when it is serialized.
///
/// The `ExpUnrolledLinkedList` offers a more efficient solution to this
/// problem.
///
/// It combines the idea of the unrolled linked list and tries to address the
/// problem of selecting an adequate block size using a strategy similar to
/// that of the `Vec` amortized resize strategy.
///
/// Data is stored in a linked list of blocks. The first block has a size of `4`
/// and each block has a length of twice that of the previous block up to
/// `MAX_BLOCK_LEN = 32768`.
///
/// This strategy is a good trade-off for handling numerous very rare terms
/// and avoid wasting half of the memory for very frequent terms.
#[derive(Debug, Clone, Copy)]
pub struct ExpUnrolledLinkedList {
len: u32,
head: Addr,
tail: Addr,
}
impl ExpUnrolledLinkedList {
pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList {
let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
ExpUnrolledLinkedList {
len: 0u32,
head: addr,
tail: addr,
}
}
pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> {
ExpUnrolledLinkedListIterator {
heap,
addr: self.head,
len: self.len,
consumed: 0,
}
}
/// Appends a new element to the current stack.
///
/// If the current block end is reached, a new block is allocated.
pub fn push(&mut self, val: u32, heap: &mut MemoryArena) {
self.len += 1;
if let Some(new_block_len) = jump_needed(self.len) {
// We need to allocate another block.
// We also allocate an extra `u32` to store the pointer
// to the future next block.
let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
let new_block_addr: Addr = heap.allocate_space(new_block_size);
unsafe {
// logic
heap.write(self.tail, new_block_addr)
};
self.tail = new_block_addr;
}
unsafe {
// logic
heap.write(self.tail, val);
self.tail = self.tail.offset(mem::size_of::<u32>() as u32);
}
}
}
pub struct ExpUnrolledLinkedListIterator<'a> {
heap: &'a MemoryArena,
addr: Addr,
len: u32,
consumed: u32,
}
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
type Item = u32;
fn next(&mut self) -> Option<u32> {
if self.consumed == self.len {
None
} else {
self.consumed += 1;
let addr: Addr = if jump_needed(self.consumed).is_some() {
unsafe {
// logic
self.heap.read(self.addr)
}
} else {
self.addr
};
self.addr = addr.offset(mem::size_of::<u32>() as u32);
Some(unsafe {
// logic
self.heap.read(addr)
})
}
}
}
#[cfg(test)]
mod tests {
use super::super::MemoryArena;
use super::jump_needed;
use super::*;
#[test]
fn test_stack() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
stack.push(1u32, &mut heap);
stack.push(2u32, &mut heap);
stack.push(4u32, &mut heap);
stack.push(8u32, &mut heap);
{
let mut it = stack.iter(&heap);
assert_eq!(it.next().unwrap(), 1u32);
assert_eq!(it.next().unwrap(), 2u32);
assert_eq!(it.next().unwrap(), 4u32);
assert_eq!(it.next().unwrap(), 8u32);
assert!(it.next().is_none());
}
}
#[test]
fn test_jump_if_needed() {
let mut block_len = 4u32;
let mut i = 0;
while i < 10_000_000 {
assert!(jump_needed(i + block_len - 1).is_none());
assert!(jump_needed(i + block_len + 1).is_none());
assert!(jump_needed(i + block_len).is_some());
let new_block_len = jump_needed(i + block_len).unwrap();
i += block_len;
block_len = new_block_len as u32;
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::ExpUnrolledLinkedList;
use tantivy_memory_arena::MemoryArena;
use test::Bencher;
const NUM_STACK: usize = 10_000;
const STACK_SIZE: u32 = 1000;
#[bench]
fn bench_push_vec(bench: &mut Bencher) {
bench.iter(|| {
let mut vecs = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
vecs.push(Vec::new());
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;
vecs[t].push(i);
}
}
});
}
#[bench]
fn bench_push_stack(bench: &mut Bencher) {
let heap = MemoryArena::new();
bench.iter(|| {
let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
stacks.push(stack);
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;
stacks[t].push(i, &heap);
}
}
heap.clear();
});
}
}

View File

@@ -1,291 +0,0 @@
//! 32-bits Memory arena for types implementing `Copy`.
//! This Memory arena has been implemented to fit the use of tantivy's indexer
//! and has *twisted specifications*.
//!
//! - It works on stable rust.
//! - One can get an accurate figure of the memory usage of the arena.
//! - Allocations are very cheap.
//! - Allocations happening consecutively are very likely to have great locality.
//! - Addresses (`Addr`) are 32bits.
//! - Dropping the whole `MemoryArena` is cheap.
//!
//! # Limitations
//!
//! - Your object shall not implement `Drop`.
//! - `Addr` to the `Arena` are 32-bits. The maximum capacity of the arena
//! is 4GB. *(Tantivy's indexer uses one arena per indexing thread.)*
//! - The arena only works for objects much smaller than `1MB`.
//! Allocating more than `1MB` at a time will result in a panic,
//! and allocating a lot of large objects (> 500KB) will result in fragmentation.
//! - Your objects are stored in an unaligned fashion. For this reason,
//! the API does not let you access them as references.
//!
//! Instead, you store and access your data via `.write(...)` and `.read(...)`, which under the hood
//! stores your object using `ptr::write_unaligned` and `ptr::read_unaligned`.
use std::mem;
use std::ptr;
const NUM_BITS_PAGE_ADDR: usize = 20;
const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
/// Represents a pointer into the `MemoryArena`.
/// Pointers are 32 bits and are split into
/// two parts.
///
/// The first 12 bits represent the id of a
/// page of memory.
///
/// The last 20 bits are an address within this page of memory.
#[derive(Clone, Copy, Debug)]
pub struct Addr(u32);
impl Addr {
/// Creates a null pointer.
pub fn null_pointer() -> Addr {
Addr(u32::max_value())
}
/// Returns the `Addr` object for `addr + offset`
pub fn offset(&self, offset: u32) -> Addr {
Addr(self.0.wrapping_add(offset))
}
fn new(page_id: usize, local_addr: usize) -> Addr {
Addr((page_id << NUM_BITS_PAGE_ADDR | local_addr) as u32)
}
fn page_id(&self) -> usize {
(self.0 as usize) >> NUM_BITS_PAGE_ADDR
}
fn page_local_addr(&self) -> usize {
(self.0 as usize) & (PAGE_SIZE - 1)
}
/// Returns true if and only if the `Addr` is null.
pub fn is_null(&self) -> bool {
self.0 == u32::max_value()
}
}
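// Worked example (illustration only): packing and unpacking an `Addr`, assuming
// NUM_BITS_PAGE_ADDR = 20 as defined above.
//
//   Addr::new(3, 0x12)      == Addr((3 << 20) | 0x12) == Addr(0x0030_0012)
//   addr.page_id()          == 0x0030_0012 >> 20              == 3
//   addr.page_local_addr()  == 0x0030_0012 & (PAGE_SIZE - 1)  == 0x12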
/// Trait required for an object to be `storable`.
///
/// # Warning
///
/// Most of the time you should not implement this trait,
/// and only use the `MemoryArena` with object implementing `Copy`.
///
/// `ArenaStorable` is used in `tantivy` to force
/// a `Copy` object and a `slice` of data to be stored contiguously.
pub trait ArenaStorable {
fn num_bytes(&self) -> usize;
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr);
}
impl<V> ArenaStorable for V
where
V: Copy,
{
fn num_bytes(&self) -> usize {
mem::size_of::<V>()
}
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
let dst_ptr = arena.get_mut_ptr(addr) as *mut V;
ptr::write_unaligned(dst_ptr, self);
}
}
/// The `MemoryArena`
pub struct MemoryArena {
pages: Vec<Page>,
}
impl MemoryArena {
/// Creates a new memory arena.
pub fn new() -> MemoryArena {
let first_page = Page::new(0);
MemoryArena {
pages: vec![first_page],
}
}
fn add_page(&mut self) -> &mut Page {
let new_page_id = self.pages.len();
self.pages.push(Page::new(new_page_id));
&mut self.pages[new_page_id]
}
/// Returns an estimate in number of bytes
/// of resident memory consumed by the `MemoryArena`.
///
/// Internally, it counts a number of `1MB` pages
/// and therefore delivers an upper bound.
pub fn mem_usage(&self) -> usize {
self.pages.len() * PAGE_SIZE
}
/// Writes a slice at the given address, assuming the
/// memory was allocated beforehand.
///
/// # Panics
///
/// May panic or corrupt the heap if the space was not
/// properly allocated beforehand.
pub fn write_bytes<B: AsRef<[u8]>>(&mut self, addr: Addr, data: B) {
let bytes = data.as_ref();
self.pages[addr.page_id()]
.get_mut_slice(addr.page_local_addr(), bytes.len())
.copy_from_slice(bytes);
}
/// Returns the `len` bytes starting at `addr`
///
/// # Panics
///
/// Panics if the memory has not been allocated beforehand.
pub fn read_slice(&self, addr: Addr, len: usize) -> &[u8] {
self.pages[addr.page_id()].get_slice(addr.page_local_addr(), len)
}
unsafe fn get_mut_ptr(&mut self, addr: Addr) -> *mut u8 {
self.pages[addr.page_id()].get_mut_ptr(addr.page_local_addr())
}
/// Stores an item's data in the heap
///
/// It allocates the `Item` beforehand.
pub fn store<Item: ArenaStorable>(&mut self, val: Item) -> Addr {
let num_bytes = val.num_bytes();
let addr = self.allocate_space(num_bytes);
unsafe {
self.write(addr, val);
};
addr
}
pub unsafe fn write<Item: ArenaStorable>(&mut self, addr: Addr, val: Item) {
val.write_into(self, addr)
}
/// Read an item in the heap at the given `address`.
///
/// # Panics
///
/// If the address is erroneous
pub unsafe fn read<Item: Copy>(&self, addr: Addr) -> Item {
let ptr = self.pages[addr.page_id()].get_ptr(addr.page_local_addr());
ptr::read_unaligned(ptr as *const Item)
}
/// Allocates `len` bytes and returns the allocated address.
pub fn allocate_space(&mut self, len: usize) -> Addr {
let page_id = self.pages.len() - 1;
if let Some(addr) = self.pages[page_id].allocate_space(len) {
return addr;
}
self.add_page().allocate_space(len).unwrap()
}
}
struct Page {
page_id: usize,
len: usize,
data: Box<[u8]>,
}
impl Page {
fn new(page_id: usize) -> Page {
let mut data: Vec<u8> = Vec::with_capacity(PAGE_SIZE);
unsafe {
data.set_len(PAGE_SIZE);
} // avoid initializing page
Page {
page_id,
len: 0,
data: data.into_boxed_slice(),
}
}
#[inline(always)]
fn is_available(&self, len: usize) -> bool {
len + self.len <= PAGE_SIZE
}
fn get_mut_slice(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
&mut self.data[local_addr..][..len]
}
fn get_slice(&self, local_addr: usize, len: usize) -> &[u8] {
&self.data[local_addr..][..len]
}
fn allocate_space(&mut self, len: usize) -> Option<Addr> {
if self.is_available(len) {
let addr = Addr::new(self.page_id, self.len);
self.len += len;
Some(addr)
} else {
None
}
}
#[inline(always)]
pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
self.data.as_ptr().offset(addr as isize)
}
#[inline(always)]
pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
self.data.as_mut_ptr().offset(addr as isize)
}
}
#[cfg(test)]
mod tests {
use super::MemoryArena;
#[test]
fn test_arena_allocate_slice() {
let mut arena = MemoryArena::new();
let a = b"hello";
let b = b"happy tax payer";
let addr_a = arena.allocate_space(a.len());
arena.write_bytes(addr_a, a);
let addr_b = arena.allocate_space(b.len());
arena.write_bytes(addr_b, b);
assert_eq!(arena.read_slice(addr_a, a.len()), a);
assert_eq!(arena.read_slice(addr_b, b.len()), b);
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct MyTest {
pub a: usize,
pub b: u8,
pub c: u32,
}
#[test]
fn test_store_object() {
let mut arena = MemoryArena::new();
let a = MyTest {
a: 143,
b: 21,
c: 32,
};
let b = MyTest {
a: 113,
b: 221,
c: 12,
};
let addr_a = arena.store(a);
let addr_b = arena.store(b);
assert_eq!(unsafe { arena.read::<MyTest>(addr_a) }, a);
assert_eq!(unsafe { arena.read::<MyTest>(addr_b) }, b);
}
}

View File

@@ -1,9 +0,0 @@
mod expull;
mod memory_arena;
mod murmurhash2;
mod term_hashmap;
pub use self::expull::ExpUnrolledLinkedList;
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
use self::murmurhash2::murmurhash2;
pub use self::term_hashmap::{compute_table_size, TermHashMap};

View File

@@ -1,86 +0,0 @@
use std::ptr;
const SEED: u32 = 3_242_157_231u32;
const M: u32 = 0x5bd1_e995;
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let len = key.len() as u32;
let mut h: u32 = SEED ^ len;
let num_blocks = len >> 2;
for _ in 0..num_blocks {
let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
k = k.wrapping_mul(M);
k ^= k >> 24;
k = k.wrapping_mul(M);
h = h.wrapping_mul(M);
h ^= k;
key_ptr = key_ptr.wrapping_offset(1);
}
// Handle the last few bytes of the input array
let remaining: &[u8] = &key[key.len() & !3..];
match remaining.len() {
3 => {
h ^= u32::from(remaining[2]) << 16;
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
2 => {
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
1 => {
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
_ => {}
}
h ^= h >> 13;
h = h.wrapping_mul(M);
h ^ (h >> 15)
}
#[cfg(test)]
mod test {
use super::murmurhash2;
use std::collections::HashSet;
#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
#[test]
fn test_murmur_against_reference_impl() {
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
}
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
assert_eq!(set.len(), 10_000);
}
}

View File

@@ -1,296 +0,0 @@
use super::murmurhash2;
use super::{Addr, ArenaStorable, MemoryArena};
use std::iter;
use std::mem;
use std::slice;
pub type BucketId = usize;
struct KeyBytesValue<'a, V> {
key: &'a [u8],
value: V,
}
impl<'a, V> KeyBytesValue<'a, V> {
fn new(key: &'a [u8], value: V) -> KeyBytesValue<'a, V> {
KeyBytesValue { key, value }
}
}
impl<'a, V> ArenaStorable for KeyBytesValue<'a, V>
where
V: ArenaStorable,
{
fn num_bytes(&self) -> usize {
0u16.num_bytes() + self.key.len() + self.value.num_bytes()
}
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
arena.write(addr, self.key.len() as u16);
arena.write_bytes(addr.offset(2), self.key);
arena.write(addr.offset(2 + self.key.len() as u32), self.value);
}
}
/// Returns the actual memory size in bytes
/// required to create a table of size `2^num_bits`.
pub fn compute_table_size(num_bits: usize) -> usize {
(1 << num_bits) * mem::size_of::<KeyValue>()
}
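// Worked example (illustration only): `KeyValue` below is an `Addr` (a `u32`
// newtype) plus a `u32` hash, i.e. 8 bytes, so for the `num_bits = 18` used in
// the tests:
//
//   compute_table_size(18) = (1 << 18) * 8 = 262_144 * 8 = 2_097_152 bytes (2 MiB)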
/// `KeyValue` is the item stored in the hash table.
/// The key is actually a `BytesRef` object stored in an external heap.
/// The `value_addr` also points to an address in the heap.
///
/// The key and the value are actually stored contiguously.
/// For this reason, the (start, stop) information is actually redundant
/// and can be simplified in the future
#[derive(Copy, Clone)]
struct KeyValue {
key_value_addr: Addr,
hash: u32,
}
impl Default for KeyValue {
fn default() -> Self {
KeyValue {
key_value_addr: Addr::null_pointer(),
hash: 0u32,
}
}
}
impl KeyValue {
fn is_empty(&self) -> bool {
self.key_value_addr.is_null()
}
}
/// Customized `HashMap` with string keys
///
/// This `HashMap` takes String as keys. Keys are
/// stored in a user defined heap.
///
/// The quirky API has the benefit of avoiding
/// the computation of the hash of the key twice,
/// or copying the key as long as there is no insert.
///
pub struct TermHashMap {
table: Box<[KeyValue]>,
pub heap: MemoryArena,
mask: usize,
occupied: Vec<usize>,
}
struct QuadraticProbing {
hash: usize,
i: usize,
mask: usize,
}
impl QuadraticProbing {
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
QuadraticProbing { hash, i: 0, mask }
}
#[inline]
fn next_probe(&mut self) -> usize {
self.i += 1;
(self.hash + self.i) & self.mask
}
}
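// Worked example (illustration only): with `mask = 0b1111` (a table of 16 buckets)
// and `hash = 13`, successive calls to `next_probe()` yield 14, 15, 0, 1, 2, ...
// i.e. `(13 + i) & 15` for i = 1, 2, 3, ... (note that, despite the struct's name,
// the probe sequence computed here is linear in `i`).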
pub struct Iter<'a> {
hashmap: &'a TermHashMap,
inner: slice::Iter<'a, usize>,
}
impl<'a> Iterator for Iter<'a> {
type Item = (&'a [u8], Addr, BucketId);
fn next(&mut self) -> Option<Self::Item> {
self.inner.next().cloned().map(move |bucket: usize| {
let kv = self.hashmap.table[bucket];
let (key, offset): (&'a [u8], Addr) =
unsafe { self.hashmap.get_key_value(kv.key_value_addr) };
(key, offset, bucket as BucketId)
})
}
}
impl TermHashMap {
pub fn new(num_bucket_power_of_2: usize) -> TermHashMap {
let heap = MemoryArena::new();
let table_size = 1 << num_bucket_power_of_2;
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
TermHashMap {
table: table.into_boxed_slice(),
heap,
mask: table_size - 1,
occupied: Vec::with_capacity(table_size / 2),
}
}
fn probe(&self, hash: u32) -> QuadraticProbing {
QuadraticProbing::compute(hash as usize, self.mask)
}
pub fn mem_usage(&self) -> usize {
self.table.len() * mem::size_of::<KeyValue>()
}
fn is_saturated(&self) -> bool {
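// The table is considered saturated once more than a third of its buckets are occupied.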
self.table.len() < self.occupied.len() * 3
}
unsafe fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
let key_bytes_len = self.heap.read::<u16>(addr) as usize;
let key_addr = addr.offset(2u32);
let key_bytes: &[u8] = self.heap.read_slice(key_addr, key_bytes_len);
let val_addr: Addr = key_addr.offset(key_bytes.len() as u32);
(key_bytes, val_addr)
}
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
self.occupied.push(bucket);
self.table[bucket] = KeyValue {
key_value_addr,
hash,
};
}
pub fn iter(&self) -> Iter {
Iter {
inner: self.occupied.iter(),
hashmap: &self,
}
}
fn resize(&mut self) {
let new_len = self.table.len() * 2;
let mask = new_len - 1;
self.mask = mask;
let new_table = vec![KeyValue::default(); new_len].into_boxed_slice();
let old_table = mem::replace(&mut self.table, new_table);
for old_pos in self.occupied.iter_mut() {
let key_value: KeyValue = old_table[*old_pos];
let mut probe = QuadraticProbing::compute(key_value.hash as usize, mask);
loop {
let bucket = probe.next_probe();
if self.table[bucket].is_empty() {
*old_pos = bucket;
self.table[bucket] = key_value;
break;
}
}
}
}
/// `mutate_or_create` creates a new entry for a given key if it does not exist
/// or updates the existing entry.
///
/// The actual logic for this update is defined in the `updater`
/// argument.
///
/// If the key is not present, `updater` will receive `None` and
/// will be in charge of returning a default value.
/// If the key already has an associated value, then it will be passed
/// `Some(previous_value)`.
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
where
S: AsRef<[u8]>,
V: Copy,
TMutator: FnMut(Option<V>) -> V,
{
if self.is_saturated() {
self.resize();
}
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
let val = updater(None);
let key_addr = self.heap.store(KeyBytesValue::new(key_bytes, val));
self.set_bucket(hash, key_addr, bucket);
return bucket as BucketId;
} else if kv.hash == hash {
let (key_matches, val_addr) = {
let (stored_key, val_addr): (&[u8], Addr) =
unsafe { self.get_key_value(kv.key_value_addr) };
(stored_key == key_bytes, val_addr)
};
if key_matches {
unsafe {
// logic
let v = self.heap.read(val_addr);
let new_v = updater(Some(v));
self.heap.write(val_addr, new_v);
};
return bucket as BucketId;
}
}
}
}
}
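// Usage sketch (illustration only, not part of the change set): counting how many
// times each key occurs, using only the `mutate_or_create` API shown above.
// The helper name `count_occurrences` is hypothetical.
fn count_occurrences(keys: &[&str]) -> TermHashMap {
    let mut map = TermHashMap::new(10);
    for &key in keys {
        // `mutate_or_create` hashes the key once and passes the previous value, if any.
        map.mutate_or_create(key, |opt_count: Option<u32>| opt_count.unwrap_or(0u32) + 1);
    }
    map
}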
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::murmurhash2::murmurhash2;
use test::Bencher;
#[bench]
fn bench_murmurhash2(b: &mut Bencher) {
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
let mut s = 0;
for &key in &keys {
s ^= murmurhash2(key.as_bytes());
}
s
});
}
}
#[cfg(test)]
mod tests {
use super::TermHashMap;
use std::collections::HashMap;
#[test]
fn test_hash_map() {
let mut hash_map: TermHashMap = TermHashMap::new(18);
{
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
assert_eq!(opt_val, None);
3u32
});
}
{
hash_map.mutate_or_create("abcd", |opt_val: Option<u32>| {
assert_eq!(opt_val, None);
4u32
});
}
{
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
assert_eq!(opt_val, Some(3u32));
5u32
});
}
let mut vanilla_hash_map = HashMap::new();
let mut iter_values = hash_map.iter();
while let Some((key, addr, _)) = iter_values.next() {
let val: u32 = unsafe {
// test
hash_map.heap.read(addr)
};
vanilla_hash_map.insert(key.to_owned(), val);
}
assert_eq!(vanilla_hash_map.len(), 2);
}
}

View File

@@ -1,59 +0,0 @@
use common::BitSet;
use core::SegmentReader;
use fst::Automaton;
use query::BitSetDocSet;
use query::ConstScorer;
use query::{Scorer, Weight};
use schema::{Field, IndexRecordOption};
use termdict::{TermDictionary, TermStreamer};
use Result;
/// A weight struct for Fuzzy Term and Regex Queries
pub struct AutomatonWeight<A>
where
A: Automaton,
{
field: Field,
automaton: A,
}
impl<A> AutomatonWeight<A>
where
A: Automaton,
{
/// Create a new AutomatonWeight
pub fn new(field: Field, automaton: A) -> AutomatonWeight<A> {
AutomatonWeight { field, automaton }
}
fn automaton_stream<'a>(&'a self, term_dict: &'a TermDictionary) -> TermStreamer<'a, &'a A> {
let term_stream_builder = term_dict.search(&self.automaton);
term_stream_builder.into_stream()
}
}
impl<A> Weight for AutomatonWeight<A>
where
A: Automaton,
{
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field);
let term_dict = inverted_index.terms();
let mut term_stream = self.automaton_stream(term_dict);
while term_stream.advance() {
let term_info = term_stream.value();
let mut block_segment_postings = inverted_index
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
while block_segment_postings.advance() {
for &doc in block_segment_postings.docs() {
doc_bitset.insert(doc);
}
}
}
let doc_bitset = BitSetDocSet::from(doc_bitset);
Ok(Box::new(ConstScorer::new(doc_bitset)))
}
}

View File

@@ -41,8 +41,7 @@ impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {
impl Query for BooleanQuery {
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
let sub_weights = self
.subqueries
let sub_weights = self.subqueries
.iter()
.map(|&(ref occur, ref subquery)| {
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))

View File

@@ -1,162 +0,0 @@
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
use query::{AutomatonWeight, Query, Weight};
use schema::Term;
use std::collections::HashMap;
use Result;
use Searcher;
lazy_static! {
static ref LEV_BUILDER: HashMap<(u8, bool), LevenshteinAutomatonBuilder> = {
let mut lev_builder_cache = HashMap::new();
// TODO make population lazy on a `(distance, val)` basis
for distance in 0..3 {
for &transposition in [false, true].iter() {
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(distance, transposition);
lev_builder_cache.insert((distance, transposition), lev_automaton_builder);
}
}
lev_builder_cache
};
}
/// A Fuzzy Query matches all of the documents
/// containing a specific term that is within
/// the given Levenshtein distance of the query term.
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result, Term};
/// use tantivy::collector::{CountCollector, TopCollector, chain};
/// use tantivy::query::FuzzyTermQuery;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
/// let term = Term::from_field_text(title, "Diary");
/// let query = FuzzyTermQuery::new(term, 1, true);
/// searcher.search(&query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 2);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
#[derive(Debug, Clone)]
pub struct FuzzyTermQuery {
/// What term are we searching
term: Term,
/// How many changes are we going to allow
distance: u8,
/// Should a transposition cost 1 or 2?
transposition_cost_one: bool,
///
prefix: bool,
}
impl FuzzyTermQuery {
/// Creates a new Fuzzy Query
pub fn new(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
FuzzyTermQuery {
term,
distance,
transposition_cost_one,
prefix: false,
}
}
/// Creates a new Fuzzy Query that matches the term as a prefix (`prefix` is set to `true`)
pub fn new_prefix(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
FuzzyTermQuery {
term,
distance,
transposition_cost_one,
prefix: true,
}
}
fn specialized_weight(&self) -> Result<AutomatonWeight<DFA>> {
let automaton = LEV_BUILDER.get(&(self.distance, false))
.unwrap() // TODO return an error
.build_dfa(self.term.text());
Ok(AutomatonWeight::new(self.term.field(), automaton))
}
}
impl Query for FuzzyTermQuery {
fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<Weight>> {
Ok(Box::new(self.specialized_weight()?))
}
}
#[cfg(test)]
mod test {
use super::FuzzyTermQuery;
use collector::TopCollector;
use schema::SchemaBuilder;
use schema::TEXT;
use tests::assert_nearly_equals;
use Index;
use Term;
#[test]
pub fn test_fuzzy_term() {
let mut schema_builder = SchemaBuilder::new();
let country_field = schema_builder.add_text_field("country", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(
country_field => "japan",
));
index_writer.add_document(doc!(
country_field => "korea",
));
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
{
let mut collector = TopCollector::with_limit(2);
let term = Term::from_field_text(country_field, "japon");
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
searcher.search(&fuzzy_query, &mut collector).unwrap();
let scored_docs = collector.score_docs();
assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
let (score, _) = scored_docs[0];
assert_nearly_equals(1f32, score);
}
}
}

View File

@@ -228,8 +228,7 @@ where
TOtherScorer: Scorer,
{
fn score(&mut self) -> Score {
self.left.score()
+ self.right.score()
self.left.score() + self.right.score()
+ self.others.iter_mut().map(Scorer::score).sum::<Score>()
}
}

View File

@@ -3,19 +3,16 @@ Query
*/
mod all_query;
mod automaton_weight;
mod bitset;
mod bm25;
mod boolean_query;
mod exclude;
mod fuzzy_query;
mod intersection;
mod occur;
mod phrase_query;
mod query;
mod query_parser;
mod range_query;
mod regex_query;
mod reqopt_scorer;
mod scorer;
mod term_query;
@@ -34,11 +31,9 @@ pub use self::union::Union;
pub use self::vec_docset::VecDocSet;
pub use self::all_query::{AllQuery, AllScorer, AllWeight};
pub use self::automaton_weight::AutomatonWeight;
pub use self::bitset::BitSetDocSet;
pub use self::boolean_query::BooleanQuery;
pub use self::exclude::Exclude;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::intersect_scorers;
pub use self::occur::Occur;
pub use self::phrase_query::PhraseQuery;
@@ -46,7 +41,6 @@ pub use self::query::Query;
pub use self::query_parser::QueryParser;
pub use self::query_parser::QueryParserError;
pub use self::range_query::RangeQuery;
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::scorer::ConstScorer;
pub use self::scorer::EmptyScorer;

View File

@@ -1,10 +1,8 @@
use super::Weight;
use collector::Collector;
use core::searcher::Searcher;
use downcast;
use std::fmt;
use Result;
use SegmentLocalId;
/// The `Query` trait defines a set of documents and a scoring method
/// for those documents.
@@ -57,26 +55,6 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug {
}
Ok(result)
}
/// Search works as follows:
///
/// First the weight object associated to the query is created.
///
/// Then, the query loops over the segments and for each segment:
/// - sets up the collector and informs it that the segment being processed has changed.
/// - creates a `Scorer` object associated with this segment
/// - iterates through the matched documents and pushes them to the collector.
///
fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result<()> {
let scoring_enabled = collector.requires_scoring();
let weight = self.weight(searcher, scoring_enabled)?;
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
collector.set_segment(segment_ord as SegmentLocalId, segment_reader)?;
let mut scorer = weight.scorer(segment_reader)?;
scorer.collect(collector, segment_reader.delete_bitset());
}
Ok(())
}
}
pub trait QueryClone {

View File

@@ -41,8 +41,7 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
/// # extern crate tantivy;
/// # use tantivy::Index;
/// # use tantivy::schema::{SchemaBuilder, INT_INDEXED};
/// # use tantivy::collector::CountCollector;
/// # use tantivy::query::Query;
/// # use tantivy::collector::{Collector, CountCollector};
/// # use tantivy::Result;
/// # use tantivy::query::RangeQuery;
/// #
@@ -68,7 +67,7 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
/// let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970);
///
/// let mut count_collector = CountCollector::default();
/// docs_in_the_sixties.search(&*searcher, &mut count_collector)?;
/// count_collector.search(&*searcher, &docs_in_the_sixties)?;
///
/// let num_60s_books = count_collector.count();
///
@@ -195,16 +194,12 @@ impl RangeQuery {
/// Lower bound of range
pub fn left_bound(&self) -> Bound<Term> {
map_bound(&self.left_bound, &|bytes| {
Term::from_field_bytes(self.field, bytes)
})
map_bound(&self.left_bound, &|bytes| Term::from_field_bytes(self.field, bytes))
}
/// Upper bound of range
pub fn right_bound(&self) -> Bound<Term> {
map_bound(&self.right_bound, &|bytes| {
Term::from_field_bytes(self.field, bytes)
})
map_bound(&self.right_bound, &|bytes| Term::from_field_bytes(self.field, bytes))
}
}
@@ -278,8 +273,7 @@ impl Weight for RangeWeight {
mod tests {
use super::RangeQuery;
use collector::CountCollector;
use query::Query;
use collector::{Collector, CountCollector};
use schema::{Document, Field, SchemaBuilder, INT_INDEXED};
use std::collections::Bound;
use Index;
@@ -310,7 +304,7 @@ mod tests {
// ... or `1960..=1969` if inclusive range is enabled.
let mut count_collector = CountCollector::default();
docs_in_the_sixties.search(&*searcher, &mut count_collector)?;
count_collector.search(&*searcher, &docs_in_the_sixties)?;
assert_eq!(count_collector.count(), 2285);
Ok(())
}
@@ -347,9 +341,7 @@ mod tests {
let searcher = index.searcher();
let count_multiples = |range_query: RangeQuery| {
let mut count_collector = CountCollector::default();
range_query
.search(&*searcher, &mut count_collector)
.unwrap();
count_collector.search(&*searcher, &range_query).unwrap();
count_collector.count()
};

View File

@@ -1,143 +0,0 @@
use error::ErrorKind;
use fst_regex::Regex;
use query::{AutomatonWeight, Query, Weight};
use schema::Field;
use std::clone::Clone;
use Result;
use Searcher;
/// A Regex Query matches all of the documents
/// containing a specific term that matches
/// a regex pattern.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result, Term};
/// use tantivy::collector::{CountCollector, TopCollector, chain};
/// use tantivy::query::RegexQuery;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
/// let term = Term::from_field_text(title, "Diary");
/// let query = RegexQuery::new("d[ai]{2}ry".to_string(), title);
/// searcher.search(&query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 3);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
#[derive(Debug, Clone)]
pub struct RegexQuery {
regex_pattern: String,
field: Field,
}
impl RegexQuery {
/// Creates a new Regex Query
pub fn new(regex_pattern: String, field: Field) -> RegexQuery {
RegexQuery {
regex_pattern,
field,
}
}
fn specialized_weight(&self) -> Result<AutomatonWeight<Regex>> {
let automaton = Regex::new(&self.regex_pattern)
.map_err(|_| ErrorKind::InvalidArgument(self.regex_pattern.clone()))?;
Ok(AutomatonWeight::new(self.field.clone(), automaton))
}
}
impl Query for RegexQuery {
fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<Weight>> {
Ok(Box::new(self.specialized_weight()?))
}
}
#[cfg(test)]
mod test {
use super::RegexQuery;
use collector::TopCollector;
use schema::SchemaBuilder;
use schema::TEXT;
use tests::assert_nearly_equals;
use Index;
#[test]
pub fn test_regex_query() {
let mut schema_builder = SchemaBuilder::new();
let country_field = schema_builder.add_text_field("country", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(
country_field => "japan",
));
index_writer.add_document(doc!(
country_field => "korea",
));
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
{
let mut collector = TopCollector::with_limit(2);
let regex_query = RegexQuery::new("jap[ao]n".to_string(), country_field);
searcher.search(&regex_query, &mut collector).unwrap();
let scored_docs = collector.score_docs();
assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
let (score, _) = scored_docs[0];
assert_nearly_equals(1f32, score);
}
let searcher = index.searcher();
{
let mut collector = TopCollector::with_limit(2);
let regex_query = RegexQuery::new("jap[A-Z]n".to_string(), country_field);
searcher.search(&regex_query, &mut collector).unwrap();
let scored_docs = collector.score_docs();
assert_eq!(scored_docs.len(), 0, "Expected ZERO document");
}
}
}

View File

@@ -1,4 +1,4 @@
use collector::Collector;
use collector::SegmentCollector;
use common::BitSet;
use docset::{DocSet, SkipResult};
use downcast;
@@ -18,7 +18,7 @@ pub trait Scorer: downcast::Any + DocSet + 'static {
/// Consumes the complete `DocSet` and
/// push the scored documents to the collector.
fn collect(&mut self, collector: &mut Collector, delete_bitset_opt: Option<&DeleteBitSet>) {
fn collect<T>(&mut self, collector: &mut SegmentCollector<CollectionResult = T>, delete_bitset_opt: Option<&DeleteBitSet>) {
if let Some(delete_bitset) = delete_bitset_opt {
while self.advance() {
let doc = self.doc();
@@ -44,7 +44,7 @@ impl Scorer for Box<Scorer> {
self.deref_mut().score()
}
fn collect(&mut self, collector: &mut Collector, delete_bitset: Option<&DeleteBitSet>) {
fn collect<T>(&mut self, collector: &mut SegmentCollector<CollectionResult = T>, delete_bitset: Option<&DeleteBitSet>) {
let scorer = self.deref_mut();
scorer.collect(collector, delete_bitset);
}

View File

@@ -16,59 +16,6 @@ use Term;
/// * `idf` - inverse document frequency.
/// * `term_freq` - number of occurrences of the term in the field
/// * `field norm` - number of tokens in the field.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT, IndexRecordOption};
/// use tantivy::{Index, Result, Term};
/// use tantivy::collector::{CountCollector, TopCollector, chain};
/// use tantivy::query::TermQuery;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit()?;
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
/// let query = TermQuery::new(
/// Term::from_field_text(title, "diary"),
/// IndexRecordOption::Basic,
/// );
/// searcher.search(&query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 2);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
#[derive(Clone, Debug)]
pub struct TermQuery {
term: Term,

View File

@@ -6,7 +6,7 @@ use Result;
/// for a given set of segments.
///
/// See [`Query`](./trait.Query.html).
pub trait Weight {
pub trait Weight: Send + Sync + 'static {
/// Returns the scorer for the given segment.
/// See [`Query`](./trait.Query.html).
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;

View File

@@ -1,19 +0,0 @@
extern crate lz4;
use std::io::{self, Read, Write};
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = lz4::EncoderBuilder::new().build(compressed)?;
encoder.write_all(&uncompressed)?;
let (_, encoder_result) = encoder.finish();
encoder_result?;
Ok(())
}
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
let mut decoder = lz4::Decoder::new(compressed)?;
decoder.read_to_end(decompressed)?;
Ok(())
}
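// Usage sketch (illustration only): a compress/decompress round-trip through the
// two helpers above. The function name `lz4_roundtrip` is hypothetical.
fn lz4_roundtrip() -> io::Result<()> {
    let mut compressed = Vec::new();
    let mut decompressed = Vec::new();
    compress(b"hello store block", &mut compressed)?;
    decompress(&compressed, &mut decompressed)?;
    assert_eq!(&decompressed[..], b"hello store block");
    Ok(())
}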

View File

@@ -1,17 +0,0 @@
extern crate snap;
use std::io::{self, Read, Write};
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = snap::Writer::new(compressed);
encoder.write_all(&uncompressed)?;
encoder.flush()?;
Ok(())
}
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
snap::Reader::new(compressed).read_to_end(decompressed)?;
Ok(())
}

View File

@@ -34,21 +34,10 @@ and should rely on either
!*/
mod reader;
mod skiplist;
mod writer;
pub use self::reader::StoreReader;
pub use self::writer::StoreWriter;
#[cfg(feature = "lz4")]
mod compression_lz4;
#[cfg(feature = "lz4")]
use self::compression_lz4::*;
#[cfg(not(feature = "lz4"))]
mod compression_snap;
#[cfg(not(feature = "lz4"))]
use self::compression_snap::*;
#[cfg(test)]
pub mod tests {

View File

@@ -1,13 +1,13 @@
use Result;
use super::decompress;
use super::skiplist::SkipList;
use common::BinarySerializable;
use common::VInt;
use datastruct::SkipList;
use directory::ReadOnlySource;
use lz4;
use schema::Document;
use std::cell::RefCell;
use std::io;
use std::io::{self, Read};
use std::mem::size_of;
use DocId;
@@ -61,7 +61,9 @@ impl StoreReader {
let mut current_block_mut = self.current_block.borrow_mut();
current_block_mut.clear();
let compressed_block = self.compressed_block(block_offset);
decompress(compressed_block, &mut current_block_mut)?;
let mut lz4_decoder = lz4::Decoder::new(compressed_block)?;
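// Invalidate the cached block offset while decompressing; it is only set back
// to `block_offset` once the block has been fully read.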
*self.current_block_offset.borrow_mut() = usize::max_value();
lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ())?;
*self.current_block_offset.borrow_mut() = block_offset;
}
Ok(())

View File

@@ -1,9 +1,9 @@
use super::compress;
use super::skiplist::SkipListBuilder;
use super::StoreReader;
use common::CountingWriter;
use common::{BinarySerializable, VInt};
use datastruct::SkipListBuilder;
use directory::WritePtr;
use lz4;
use schema::Document;
use std::io::{self, Write};
use DocId;
@@ -87,7 +87,12 @@ impl StoreWriter {
fn write_and_compress_block(&mut self) -> io::Result<()> {
self.intermediary_buffer.clear();
compress(&self.current_block[..], &mut self.intermediary_buffer)?;
{
let mut encoder = lz4::EncoderBuilder::new().build(&mut self.intermediary_buffer)?;
encoder.write_all(&self.current_block)?;
let (_, encoder_result) = encoder.finish();
encoder_result?;
}
(self.intermediary_buffer.len() as u32).serialize(&mut self.writer)?;
self.writer.write_all(&self.intermediary_buffer)?;
self.offset_index_writer

View File

@@ -94,7 +94,7 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
let bit_shift = (addr_bits % 8) as u64;
assert!(data.len() >= addr_byte + 8);
let val_unshifted_unmasked: u64 = unsafe {
// ok thanks to the 7 byte padding on `.close`
//< ok : check len above
let addr = data.as_ptr().offset(addr_byte as isize) as *const u64;
ptr::read_unaligned(addr)
};

View File

@@ -164,8 +164,7 @@ impl TermDictionary {
let fst = self.fst_index.as_fst();
let mut node = fst.root();
while ord != 0 || !node.is_final() {
if let Some(transition) = node
.transitions()
if let Some(transition) = node.transitions()
.take_while(|transition| transition.out.value() <= ord)
.last()
{
@@ -204,7 +203,7 @@ impl TermDictionary {
/// Returns a search builder, to stream all of the terms
/// within the Automaton
pub fn search<'a, A: Automaton + 'a>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
pub fn search<'a, A: Automaton>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
let stream_builder = self.fst_index.search(automaton);
TermStreamerBuilder::<A>::new(self, stream_builder)
}

View File

@@ -25,7 +25,7 @@ impl Default for Token {
offset_from: 0,
offset_to: 0,
position: usize::max_value(),
text: String::with_capacity(200),
text: String::new(),
}
}
}