Compare commits


36 Commits

Author SHA1 Message Date
Paul Masurel
31655e92d7 Preparing release 0.6.1 2018-07-10 09:12:26 +09:00
Paul Masurel
6b8d76685a Tiny refactoring 2018-07-05 09:11:55 +09:00
Paul Masurel
ce5683fc6a Removed useless counting_writer 2018-07-04 16:13:19 +09:00
Paul Masurel
5205579db6 Merge branch 'master' of github.com:tantivy-search/tantivy 2018-07-04 16:09:59 +09:00
Paul Masurel
d056ae60dc Removed SourceRead. Relying on the new owned-read crate instead (#332) 2018-07-04 16:08:52 +09:00
Paul Masurel
af9280c95f Removed SourceRead. Relying on the new owned-read crate instead 2018-07-04 12:47:25 +09:00
David Hewson
2e538ce6e6 remove extra space in name (#331)
The extra space that appeared in the name breaks using the package
2018-07-02 05:32:19 +09:00
Jason Wolfe
00466d2b08 #328: Support parsing unbounded range queries (#329)
* #328: Support parsing unbounded range queries. Update CHANGELOG.md for query parser changes.

* Set version to 0.7-dev
2018-06-30 13:24:02 +09:00
Paul Masurel
8ebbf6b336 Issue/325 (#330)
* Introducing a SegmentMeta inventory.
* Depending on census=0.1
* Cargo fmt
2018-06-30 13:11:41 +09:00
Paul Masurel
1ce36bb211 Merge branch 'master' of github.com:tantivy-search/tantivy 2018-06-27 16:58:47 +09:00
Jason Wolfe
2ac43bf21b Support parsing RangeQuery and AllQuery in Queryparser (#323)
* (#321) Add support for range query parsing to grammar / parser. Still needs to be wired through the rest of the way.

* (321) Finish wiring RangeQuery parsing through

* (#321) Add logical AST query parser tests for RangeQuery

* (#321) Support parsing AllQuery

* (#321) Update documentation of QueryParser

* (#321) Support negative numbers in range query parsing
2018-06-25 08:29:47 +09:00
Paul Masurel
3fd8c2aa5a Removed one keyword 2018-06-22 14:47:21 +09:00
Paul Masurel
c1022e23d2 Switching to stable rust in AppVeyor. 2018-06-22 14:33:42 +09:00
Paul Masurel
8ccbfdea5d Preparing for release 2018-06-22 14:27:46 +09:00
Paul Masurel
badfce3a23 Preparing for release. 2018-06-22 14:09:14 +09:00
Dru Sellers
e301e0bc87 Add some simple doc tests (#320)
* Add TopCollector doc test

* Add CountCollector Doc Test

* Add Doc Test for MultiCollector

* Add ChainedCollector Doc Test

* Expose Fuzzy Query where it should be

* Add FuzzyTermQuery Doc Test

* Expose RegexQuery

* Regex Query Doc Test

* Add TermQuery Doc Test

* Add doc comments

* fix test 🤦

* Added explanation about the complexity variables

* Fixing unit tests

* Single threads if you check docids
2018-06-19 10:45:20 +09:00
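The doc tests added in #320 double as usage documentation for the collectors touched throughout this compare view. A condensed sketch, adapted from the `ChainedCollector` doc test that appears in the diffs further down:

```rust
#[macro_use]
extern crate tantivy;
use tantivy::collector::{chain, CountCollector, TopCollector};
use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::{Index, Result};

fn example() -> Result<()> {
    let mut schema_builder = SchemaBuilder::new();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    {
        let mut index_writer = index.writer(3_000_000)?;
        index_writer.add_document(doc!(title => "The Diary of Muadib"));
        index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
        index_writer.commit()?;
    }
    index.load_searchers()?;
    let searcher = index.searcher();

    let mut top_collector = TopCollector::with_limit(2);
    let mut count_collector = CountCollector::default();
    {
        // chain() fans each hit out to both inner collectors in one pass.
        let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
        let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
        searcher.search(&*query, &mut collectors)?;
    }
    assert_eq!(count_collector.count(), 2);
    Ok(())
}

fn main() { example().unwrap(); }
```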
Dru Sellers
317baf4e75 Add in simple regex query support (#319)
* Add fst_regex crate in

* Reduce API surface area

This doesn't need to be public

* better test name

* Pull Automaton weight out so it can be shared

* Implement Regex Query
2018-06-16 14:08:30 +09:00
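The commit trail above stops short of showing the new query's public constructor. A hypothetical sketch of how it might be used — `RegexQuery::new(pattern, field)` is an assumed signature, not confirmed by this diff:

```rust
// Hypothetical sketch only: `RegexQuery::new(pattern, field)` is an assumed
// signature; the constructor itself is not part of this diff.
extern crate tantivy;
use tantivy::collector::CountCollector;
use tantivy::query::RegexQuery;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn regex_example() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::new();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    // ... add and commit a few documents here ...
    index.load_searchers()?;
    let searcher = index.searcher();
    // "diar.*" would match terms such as "diary" and "diaries".
    let query = RegexQuery::new("diar.*".to_string(), title); // assumed signature
    let mut count_collector = CountCollector::default();
    searcher.search(&query, &mut count_collector)?;
    println!("{} matching docs", count_collector.count());
    Ok(())
}
```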
Paul Masurel
24398d94e4 Exposing the 2018-06-15 21:40:57 +09:00
Dru Sellers
360f4132eb Standardizes the Index::open_* APIs (#318)
* Relocate `from_directory` closer to its usage

* Specific methods come before the generic method

* Rename open methods to follow the lead of the create methods
2018-06-15 12:16:41 +09:00
Dru Sellers
2b8f02764b Standardizes the Index::create_* APIs (#317)
* Pull all creation methods next to each other

The goal here is to make it clear which methods are performing the
same function, and to assist with standardizing the API calls.

* Make `from_directory` private

This seems to be an internal function, so lets make it internal.

* Rename `create` to `create_in_dir`

This lets the name match the `create_in_ram` pattern and opens up
`create` for the generic implementation.

* Implement the generic create function

All of the create methods now delegate to the common create function
and future `create_in_*` functions now have a clear pattern
to follow as well
2018-06-14 11:08:42 +09:00
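Commits #317 and #318 above converge on one naming scheme for index construction. A sketch of the resulting call sites: `create_in_dir` and `create_in_ram` appear verbatim in the diffs below, while `open_in_dir` is inferred from the rename described in #318 and should be read as an assumption:

```rust
// `open_in_dir` is an assumed name (inferred from #318); `create_in_dir`
// appears verbatim in the updated examples in this diff.
extern crate tantivy;
use std::path::Path;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn open_or_create(index_path: &Path) -> tantivy::Result<Index> {
    let mut schema_builder = SchemaBuilder::new();
    schema_builder.add_text_field("title", TEXT);
    let schema = schema_builder.build();
    if index_path.join("meta.json").exists() {
        // Reuse an existing index directory (assumed API).
        Index::open_in_dir(index_path)
    } else {
        // Creates the directory layout and saves a meta.json with the schema.
        Index::create_in_dir(index_path, schema)
    }
}
```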
Paul Masurel
0465876854 Issue/257 (#310)
* Replaced lz4 by a pure rust implementation of snappy.

Closes #257

* snappy is the default compression. One can use lz4 by enabling the lz4 feature flag.

* Removed Compression trait
2018-06-12 19:02:57 +09:00
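For downstream users, the practical consequence of #310 is that a default build now uses the pure-Rust snappy document store; a project that wants to keep LZ4 would, under the `[features]` layout shown in the Cargo.toml diff below, opt back in with `tantivy = { version = "0.6.1", features = ["lz4-compression"] }` in its own Cargo.toml.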
Dru Sellers
6f7b099370 Add AutomatonWeight to a fuzzy_search module and FuzzyQuery (#300)
* Add AutomatonWeight to a fuzzy_search module

* Hacking around ownership issues

* Working through lifetime issues

* Working through tests

* fix test by lower casing the words (reducing distance)

* code review changes

* Suggestion on how to solve the borrow problem

* clean up
2018-06-11 22:23:03 +09:00
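As with RegexQuery, the FuzzyTermQuery constructor is outside this diff; a hypothetical sketch, assuming a `FuzzyTermQuery::new(term, distance, transposition_cost_one)` signature:

```rust
// Hypothetical sketch only: the FuzzyTermQuery constructor signature is an
// assumption, not shown in this diff.
extern crate tantivy;
use tantivy::collector::CountCollector;
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::{Index, Term};

fn fuzzy_example() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::new();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    // ... add and commit documents such as "A Dairy Cow" here ...
    index.load_searchers()?;
    let searcher = index.searcher();
    // Terms are matched as indexed (lowercased by the default tokenizer),
    // which is why the commit above lowercases words to reduce distance:
    // "dairy" is one transposition away from "diary".
    let term = Term::from_field_text(title, "diary");
    let query = FuzzyTermQuery::new(term, 1, true); // assumed signature
    let mut count_collector = CountCollector::default();
    searcher.search(&query, &mut count_collector)?;
    Ok(())
}
```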
Paul Masurel
84f5cc4388 Added an AUTHORS file. Closes #315 (#316) 2018-06-11 22:21:58 +09:00
Paul Masurel
75aae0d2c2 Update README 2018-06-08 13:05:57 +09:00
Paul Masurel
009a3559be atomicwrites 2.2.0 for ARM compilation 2018-06-06 07:13:09 +09:00
Paul Masurel
7a31669e9d Disabling ARM targets 2018-06-05 12:22:00 +09:00
Paul Masurel
5185eb790b Reduced heap usage in unit test 2018-06-05 10:02:10 +09:00
Paul Masurel
a3dffbf1c6 Added more ARM target. 2018-06-05 09:06:33 +09:00
Paul Masurel
857a5794d8 Updated nix version 2018-06-05 09:02:40 +09:00
Paul Masurel
b0a6fc1448 Reduce RAM usage 2018-06-04 11:20:24 +09:00
Paul Masurel
989d52bea4 Updated atomicwrites version. 2018-06-04 10:00:21 +09:00
Paul Masurel
09661ea7ec Added cross testing on different platforms 2018-06-04 09:47:53 +09:00
Paul Masurel
b59132966f Better heap (#311)
* Changed the heap to a paged memory arena.
* Trying to simplify the indexing term hashmap
* Exploding datastruct
* Removed some complexity in bitpacker
2018-06-04 09:39:18 +09:00
Paul Masurel
863d3411bc Update Cargo.toml 2018-05-31 15:54:34 +09:00
Paul Masurel
8a55d133ab Showing Appveyor CI badge for the master branch
Previously, the badge showed the status of the most recent build, regardless of branch.
2018-05-28 13:44:53 +09:00
Jason Wolfe
432d49d814 Expose parameters of RangeQuery for external usage (#309) 2018-05-19 14:29:25 +09:00
78 changed files with 3192 additions and 2270 deletions

.travis.yml

@@ -1,37 +1,127 @@
# Based on the "trust" template v0.1.2
# https://github.com/japaric/trust/tree/v0.1.2
dist: trusty
language: rust
services: docker
sudo: required
cache: cargo
rust:
- nightly
env:
global:
- CC=gcc-4.8
- CXX=g++-4.8
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
- secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- kalakris-cmake
packages:
- gcc-4.8
- g++-4.8
- libcurl4-openssl-dev
- libelf-dev
- libdw-dev
- binutils-dev
- cmake
before_script:
- export PATH=$HOME/.cargo/bin:$PATH
- cargo install cargo-update || echo "cargo-update already installed"
- cargo install cargo-travis || echo "cargo-travis already installed"
- CRATE_NAME=tantivy
matrix:
include:
# Android
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
- env: TARGET=i686-linux-android DISABLE_TESTS=1
- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
# iOS
#- env: TARGET=aarch64-apple-ios DISABLE_TESTS=1
# os: osx
#- env: TARGET=armv7-apple-ios DISABLE_TESTS=1
# os: osx
#- env: TARGET=armv7s-apple-ios DISABLE_TESTS=1
# os: osx
#- env: TARGET=i386-apple-ios DISABLE_TESTS=1
# os: osx
- env: TARGET=x86_64-apple-ios DISABLE_TESTS=1
os: osx
# Linux
- env: TARGET=aarch64-unknown-linux-gnu
# - env: TARGET=arm-unknown-linux-gnueabi
# - env: TARGET=armv7-unknown-linux-gnueabihf
- env: TARGET=i686-unknown-linux-gnu
#- env: TARGET=i686-unknown-linux-musl
#- env: TARGET=mips-unknown-linux-gnu
#- env: TARGET=mips64-unknown-linux-gnuabi64
#- env: TARGET=mips64el-unknown-linux-gnuabi64
#- env: TARGET=mipsel-unknown-linux-gnu
#- env: TARGET=powerpc-unknown-linux-gnu
#- env: TARGET=powerpc64-unknown-linux-gnu
#- env: TARGET=powerpc64le-unknown-linux-gnu
#- env: TARGET=s390x-unknown-linux-gnu DISABLE_TESTS=1
- env: TARGET=x86_64-unknown-linux-gnu
- env: TARGET=x86_64-unknown-linux-musl
# OSX
#- env: TARGET=i686-apple-darwin
# os: osx
- env: TARGET=x86_64-apple-darwin
os: osx
# *BSD
#- env: TARGET=i686-unknown-freebsd DISABLE_TESTS=1
#- env: TARGET=x86_64-unknown-freebsd DISABLE_TESTS=1
#- env: TARGET=x86_64-unknown-netbsd DISABLE_TESTS=1
# Windows
#- env: TARGET=x86_64-pc-windows-gnu
# Bare metal
# These targets don't support std and as such are likely not suitable for
# most crates.
# - env: TARGET=thumbv6m-none-eabi
# - env: TARGET=thumbv7em-none-eabi
# - env: TARGET=thumbv7em-none-eabihf
# - env: TARGET=thumbv7m-none-eabi
# Testing other channels
#- env: TARGET=x86_64-unknown-linux-gnu
# rust: nightly
#- env: TARGET=x86_64-apple-darwin
# os: osx
# rust: nightly
before_install:
- set -e
- rustup self update
install:
- sh ci/install.sh
- source ~/.cargo/env || true
script:
- cargo build
- cargo test
- cargo test -- --ignored
- cargo run --example simple_search
- cargo doc
after_success:
- cargo coveralls --exclude-pattern src/functional_test.rs
- cargo doc-upload
- bash ci/script.sh
after_script: set +e
before_deploy:
- sh ci/before_deploy.sh
#
#deploy:
# # - Create a `public_repo` GitHub token. Go to: https://github.com/settings/tokens/new
# # - Encrypt it: `travis encrypt 0123456789012345678901234567890123456789
# # - Paste the output down here
# api_key:
# secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
# file_glob: true
# file: $CRATE_NAME-$TRAVIS_TAG-$TARGET.*
# on:
# # TODO Here you can pick which targets will generate binary releases
# # In this example, there are some targets that are tested using the stable
# # and nightly channels. This condition makes sure there is only one release
# # for such targets and that's generated using the stable channel
# condition: $TRAVIS_RUST_VERSION = stable
# tags: true
# provider: releases
# skip_cleanup: true
cache: cargo
before_cache:
# Travis can't cache files that are not readable by "others"
- chmod -R a+r $HOME/.cargo
#branches:
# only:
# # release tags
# - /^v\d+\.\d+\.\d+.*$/
# - master
notifications:
email:
on_success: never

AUTHORS (new file, +11 lines)

@@ -0,0 +1,11 @@
# This is the list of authors of tantivy for copyright purposes.
Paul Masurel
Laurentiu Nicola
Dru Sellers
Ashley Mannix
Michael J. Curry
Jason Wolfe
# As an employee of Google I am required to add Google LLC
# in the list of authors, but this project is not affiliated to Google
# in any other way.
Google LLC

CHANGELOG.md

@@ -1,14 +1,34 @@
Tantivy 0.6.1
=========================
- Bugfix #324. The GC was removing files that were still in use.
- Added support for parsing AllQuery and RangeQuery via QueryParser (see the sketch after this list)
- AllQuery: `*`
- RangeQuery:
- Inclusive `field:[startIncl to endIncl]`
- Exclusive `field:{startExcl to endExcl}`
- Mixed `field:[startIncl to endExcl}` and vice versa
- Unbounded `field:[start to *]`, `field:[* to end]`
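A minimal sketch of the new syntax, built on the `QueryParser` API shown in the doc tests elsewhere in this diff; whether every form below parses against a text field is an assumption based on the grammar listed above:

```rust
extern crate tantivy;
use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn parse_examples() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::new();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let query_parser = QueryParser::for_index(&index, vec![title]);

    // AllQuery: matches every document.
    let _all = query_parser.parse_query("*")?;
    // Inclusive, exclusive, mixed, and unbounded ranges on `title`.
    let _incl = query_parser.parse_query("title:[a to c]")?;
    let _excl = query_parser.parse_query("title:{a to c}")?;
    let _mixed = query_parser.parse_query("title:[a to c}")?;
    let _open = query_parser.parse_query("title:[a to *]")?;
    Ok(())
}
```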
Tantivy 0.6
==========================
- Removed C code. Tantivy is now pure Rust.
- BM25
- Approximate field norms encoded over 1 byte.
- Compiles on stable rust
Special thanks to @drusellers and @jason-wolfe for their contributions
to this release!
- Removed C code. Tantivy is now pure Rust. (@pmasurel)
- BM25 (@pmasurel)
- Approximate field norms encoded over 1 byte. (@pmasurel)
- Compiles on stable rust (@pmasurel)
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
- Completely uncompressed
- Internally: One u64 fast field for indexes, one fast field for the bytes themselves.
- Add NGram token support (@drusellers)
- Add Stopword Filter support (@drusellers)
- Add a FuzzyTermQuery (@drusellers)
- Add a RegexQuery (@drusellers)
- Various performance improvements (@pmasurel)
Tantivy 0.5.2
===========================

Cargo.toml

@@ -1,10 +1,10 @@
[package]
name = "tantivy"
version = "0.6.0-dev"
version = "0.6.1"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Tantivy is a search engine library."""
description = """Search engine library"""
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/tantivy-search/tantivy"
@@ -18,7 +18,10 @@ lazy_static = "0.2.1"
tinysegmenter = "0.1.0"
regex = "0.2"
fst = {version="0.3", default-features=false}
atomicwrites = {version="0.1", optional=true}
fst-regex = { version="0.2" }
lz4 = {version="1.20", optional=true}
snap = {version="0.2"}
atomicwrites = {version="0.2.2", optional=true}
tempfile = "2.1"
log = "0.3.6"
combine = "2.2"
@@ -29,7 +32,6 @@ serde_json = "1.0"
num_cpus = "1.2"
itertools = "0.5.9"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
lz4 = "1.20"
bit-set = "0.4.0"
uuid = { version = "0.6", features = ["v4", "serde"] }
chan = "0.1"
@@ -42,8 +44,10 @@ stable_deref_trait = "1.0.0"
rust-stemmers = "0.1.0"
downcast = { version="0.9" }
matches = "0.1"
bitpacking = "0.4"
bitpacking = "0.5"
census = "0.1"
fnv = "1.0.6"
owned-read = "0.1"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
@@ -60,9 +64,8 @@ debug-assertions = false
[features]
default = ["mmap"]
simd = ["bitpacking/simd"]
mmap = ["fst/mmap", "atomicwrites"]
unstable = ["simd"]
lz4-compression = ["lz4"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

LICENSE

@@ -1,4 +1,4 @@
Copyright (c) 2018 by Paul Masurel, Google LLC
Copyright (c) 2018 by the project authors, as listed in the AUTHORS file.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

README.md

@@ -4,36 +4,50 @@
[![Coverage Status](https://coveralls.io/repos/github/tantivy-search/tantivy/badge.svg?branch=master&refresh1)](https://coveralls.io/github/tantivy-search/tantivy?branch=master)
[![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/master?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
**Tantivy** is a **full text search engine library** written in rust.
It is strongly inspired by Lucene's design.
It is closer to Lucene than to Elastic Search and Solr in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
to build such a search engine.
Tantivy is, in fact, strongly inspired by Lucene's design.
# Features
- Full-text search
- Tiny startup time (<10ms), perfect for command line tools
- tf-idf scoring
- Basic query language
- Phrase queries
- BM25 scoring (the same as lucene)
- Basic query language (`+michael +jackson`)
- Phrase queries (`"michael jackson"`)
- Incremental indexing
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
- Mmap directory
- optional SIMD integer compression
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- LZ4 compressed document store
- Range queries
- Faceting
- configurable indexing (optional term frequency and position indexing)
- Faceted search
- Configurable indexing (optional term frequency and position indexing)
- Cheesy logo with a horse
Tantivy supports Linux, MacOS and Windows.
# Non-features
- Distributed search is not, and will not be, in the scope of tantivy.
# Supported OS and compiler
Tantivy works on stable rust (>= 1.27) and supports Linux, MacOS and Windows.
# Getting started
- [tantivy's usage example](http://fulmicoton.com/tantivy-examples/simple_search.html)
- [tantivy's simple search example](http://fulmicoton.com/tantivy-examples/simple_search.html)
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli).
`tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
index documents and search via the CLI or a small server with a REST API.
It will walk you through getting a wikipedia search engine up and running in a few minutes.
- [reference doc]
- [For the last released version](https://docs.rs/tantivy/)
@@ -43,40 +57,14 @@ It will walk you through getting a wikipedia search engine up and running in a f
## Development
Tantivy now compiles on stable rust.
To check out and run test, you can simply run :
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
To check out and run tests, you can simply run:
git clone git@github.com:tantivy-search/tantivy.git
cd tantivy
cargo build
## Note on release build and performance
If your project depends on `tantivy`, for better performance, make sure to enable
`sse3` instructions using `RUSTFLAGS`. (This instruction set is likely to
be available on most `x86_64` CPUs you will encounter).
For instance,
RUSTFLAGS='-C target-feature=+sse3' cargo build --release
Or, if you are targeting a specific CPU,
RUSTFLAGS='-C target-cpu=native' cargo build --release
Regardless of the flags you pass, by default `tantivy` will contain `SSE3` instructions.
If you want to disable those, you can run the following command:
cargo build --no-default-features
Alternatively, if you are trying to compile `tantivy` without simd compression,
you can disable this functionality. In this case, this submodule is not required
and you can compile tantivy by using the `--no-default-features` flag.
cargo build --no-default-features
# Contribute
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.

appveyor.yml

@@ -4,11 +4,8 @@
os: Visual Studio 2015
environment:
matrix:
- channel: nightly
- channel: stable
target: x86_64-pc-windows-msvc
- channel: nightly
target: x86_64-pc-windows-gnu
msys_bits: 64
install:
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe

ci/before_deploy.ps1 (new file, +23 lines)

@@ -0,0 +1,23 @@
# This script takes care of packaging the build artifacts that will go in the
# release zipfile
$SRC_DIR = $PWD.Path
$STAGE = [System.Guid]::NewGuid().ToString()
Set-Location $ENV:Temp
New-Item -Type Directory -Name $STAGE
Set-Location $STAGE
$ZIP = "$SRC_DIR\$($Env:CRATE_NAME)-$($Env:APPVEYOR_REPO_TAG_NAME)-$($Env:TARGET).zip"
# TODO Update this to package the right artifacts
Copy-Item "$SRC_DIR\target\$($Env:TARGET)\release\hello.exe" '.\'
7z a "$ZIP" *
Push-AppveyorArtifact "$ZIP"
Remove-Item *.* -Force
Set-Location ..
Remove-Item $STAGE
Set-Location $SRC_DIR

ci/before_deploy.sh (new file, +33 lines)

@@ -0,0 +1,33 @@
# This script takes care of building your crate and packaging it for release
set -ex
main() {
local src=$(pwd) \
stage=
case $TRAVIS_OS_NAME in
linux)
stage=$(mktemp -d)
;;
osx)
stage=$(mktemp -d -t tmp)
;;
esac
test -f Cargo.lock || cargo generate-lockfile
# TODO Update this to build the artifacts that matter to you
cross rustc --bin hello --target $TARGET --release -- -C lto
# TODO Update this to package the right artifacts
cp target/$TARGET/release/hello $stage/
cd $stage
tar czf $src/$CRATE_NAME-$TRAVIS_TAG-$TARGET.tar.gz *
cd $src
rm -rf $stage
}
main

ci/install.sh (new file, +47 lines)

@@ -0,0 +1,47 @@
set -ex
main() {
local target=
if [ $TRAVIS_OS_NAME = linux ]; then
target=x86_64-unknown-linux-musl
sort=sort
else
target=x86_64-apple-darwin
sort=gsort # for `sort --sort-version`, from brew's coreutils.
fi
# Builds for iOS are done on OSX, but require the specific target to be
# installed.
case $TARGET in
aarch64-apple-ios)
rustup target install aarch64-apple-ios
;;
armv7-apple-ios)
rustup target install armv7-apple-ios
;;
armv7s-apple-ios)
rustup target install armv7s-apple-ios
;;
i386-apple-ios)
rustup target install i386-apple-ios
;;
x86_64-apple-ios)
rustup target install x86_64-apple-ios
;;
esac
# This fetches latest stable release
local tag=$(git ls-remote --tags --refs --exit-code https://github.com/japaric/cross \
| cut -d/ -f3 \
| grep -E '^v[0.1.0-9.]+$' \
| $sort --version-sort \
| tail -n1)
curl -LSfs https://japaric.github.io/trust/install.sh | \
sh -s -- \
--force \
--git japaric/cross \
--tag $tag \
--target $target
}
main

ci/script.sh (new file, +23 lines)

@@ -0,0 +1,23 @@
# This script takes care of testing your crate
set -ex
main() {
cross build --target $TARGET
cross build --target $TARGET --release
if [ ! -z $DISABLE_TESTS ]; then
return
fi
cross test --target $TARGET
# cross test --target $TARGET --release
# cross run --target $TARGET
# cross run --target $TARGET --release
}
# we don't run the "test phase" when doing deploys
if [ -z $TRAVIS_TAG ]; then
main
fi


@@ -61,7 +61,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
//
// This will actually just save a meta.json
// with our schema in the directory.
let index = Index::create(index_path, schema.clone())?;
let index = Index::create_in_dir(index_path, schema.clone())?;
// here we are registering our custom tokenizer
// this will store tokens of 3 characters each


@@ -64,7 +64,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
//
// This will actually just save a meta.json
// with our schema in the directory.
let index = Index::create(index_path, schema.clone())?;
let index = Index::create_in_dir(index_path, schema.clone())?;
// To insert documents we need an index writer.
// There must be only one writer at a time.

src/collector/chained_collector.rs

@@ -4,87 +4,111 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;
use collector::CollectorWrapper;
/// Collector that does nothing.
/// This is used in the chain Collector and will hopefully
/// be optimized away by the compiler.
pub struct DoNothingCollector;
impl Collector for DoNothingCollector {
type Child = DoNothingCollector;
#[inline]
fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<DoNothingCollector> {
Ok(DoNothingCollector)
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}
#[inline]
fn collect(&mut self, _doc: DocId, _score: Score) {}
#[inline]
fn requires_scoring(&self) -> bool {
false
}
}
impl SegmentCollector for DoNothingCollector {
type CollectionResult = ();
#[inline]
fn collect(&mut self, _doc: DocId, _score: Score) {}
fn finalize(self) -> () {
()
}
}
/// Zero-cost abstraction used to collect on multiple collectors.
/// This contraption is only usable if the types of your collectors
/// are known at compile time.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::{CountCollector, TopCollector, chain};
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 2);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
pub struct ChainedCollector<Left: Collector, Right: Collector> {
left: Left,
right: Right,
}
pub struct ChainedSegmentCollector<Left: SegmentCollector, Right: SegmentCollector> {
left: Left,
right: Right,
}
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
/// Adds a collector
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, CollectorWrapper<C>> {
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
ChainedCollector {
left: self,
right: CollectorWrapper::new(new_collector),
right: new_collector,
}
}
}
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
type Child = ChainedSegmentCollector<Left::Child, Right::Child>;
fn for_segment(
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<Self::Child> {
Ok(ChainedSegmentCollector {
left: self.left.for_segment(segment_local_id, segment)?,
right: self.right.for_segment(segment_local_id, segment)?,
})
) -> Result<()> {
self.left.set_segment(segment_local_id, segment)?;
self.right.set_segment(segment_local_id, segment)?;
Ok(())
}
fn requires_scoring(&self) -> bool {
self.left.requires_scoring() || self.right.requires_scoring()
}
}
impl<Left: SegmentCollector, Right: SegmentCollector> SegmentCollector for ChainedSegmentCollector<Left, Right> {
type CollectionResult = (Left::CollectionResult, Right::CollectionResult);
fn collect(&mut self, doc: DocId, score: Score) {
self.left.collect(doc, score);
self.right.collect(doc, score);
}
fn finalize(self) -> Self::CollectionResult {
(self.left.finalize(), self.right.finalize())
fn requires_scoring(&self) -> bool {
self.left.requires_scoring() || self.right.requires_scoring()
}
}
@@ -98,35 +122,19 @@ pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
#[cfg(test)]
mod tests {
use super::*;
use collector::{CountCollector, SegmentCollector, TopCollector};
use schema::SchemaBuilder;
use Index;
use Document;
use collector::{Collector, CountCollector, TopCollector};
#[test]
fn test_chained_collector() {
let schema_builder = SchemaBuilder::new();
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(3_000_000).unwrap();
let doc = Document::new();
index_writer.add_document(doc);
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_readers = searcher.segment_readers();
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
let mut segment_collector = collectors.for_segment(0, &segment_readers[0]).unwrap();
segment_collector.collect(1, 0.2);
segment_collector.collect(2, 0.1);
segment_collector.collect(3, 0.5);
collectors.merge_children(vec![segment_collector]);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);
}
assert_eq!(count_collector.count(), 3);
assert!(top_collector.at_capacity());

src/collector/count_collector.rs

@@ -4,11 +4,56 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;
use collector::Combinable;
/// The `CountCollector` only counts how many
/// documents match the query.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::CountCollector;
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut count_collector = CountCollector::default();
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut count_collector).unwrap();
///
/// assert_eq!(count_collector.count(), 2);
/// }
///
/// Ok(())
/// }
/// ```
#[derive(Default)]
pub struct CountCollector {
count: usize,
@@ -23,10 +68,12 @@ impl CountCollector {
}
impl Collector for CountCollector {
type Child = CountCollector;
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}
fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<CountCollector> {
Ok(CountCollector::default())
fn collect(&mut self, _: DocId, _: Score) {
self.count += 1;
}
fn requires_scoring(&self) -> bool {
@@ -34,28 +81,10 @@ impl Collector for CountCollector {
}
}
impl Combinable for CountCollector {
fn combine_into(&mut self, other: Self) {
self.count += other.count;
}
}
impl SegmentCollector for CountCollector {
type CollectionResult = CountCollector;
fn collect(&mut self, _: DocId, _: Score) {
self.count += 1;
}
fn finalize(self) -> CountCollector {
self
}
}
#[cfg(test)]
mod tests {
use collector::{Collector, CountCollector, SegmentCollector};
use collector::{Collector, CountCollector};
#[test]
fn test_count_collector() {

src/collector/facet_collector.rs

@@ -3,12 +3,14 @@ use docset::SkipResult;
use fastfield::FacetReader;
use schema::Facet;
use schema::Field;
use std::cell::UnsafeCell;
use std::collections::btree_map;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::BinaryHeap;
use std::collections::Bound;
use std::iter::Peekable;
use std::mem;
use std::{u64, usize};
use termdict::TermMerger;
@@ -18,7 +20,6 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;
struct Hit<'a> {
count: u64,
@@ -193,22 +194,19 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// }
/// ```
pub struct FacetCollector {
facet_ords: Vec<u64>,
field: Field,
ff_reader: Option<UnsafeCell<FacetReader>>,
segment_counters: Vec<SegmentFacetCounter>,
facets: BTreeSet<Facet>,
}
pub struct FacetSegmentCollector {
reader: FacetReader,
facet_ords_buf: Vec<u64>,
// facet_ord -> collapse facet_id
collapse_mapping: Vec<usize>,
current_segment_collapse_mapping: Vec<usize>,
// collapse facet_id -> count
counts: Vec<u64>,
current_segment_counts: Vec<u64>,
// collapse facet_id -> facet_ord
collapse_facet_ords: Vec<u64>,
current_collapse_facet_ords: Vec<u64>,
facets: BTreeSet<Facet>,
}
fn skip<'a, I: Iterator<Item = &'a Facet>>(
@@ -242,9 +240,15 @@ impl FacetCollector {
/// is of the proper type.
pub fn for_field(field: Field) -> FacetCollector {
FacetCollector {
facet_ords: Vec::with_capacity(255),
segment_counters: Vec::new(),
field,
ff_reader: None,
facets: BTreeSet::new(),
current_segment_collapse_mapping: Vec::new(),
current_collapse_facet_ords: Vec::new(),
current_segment_counts: Vec::new(),
}
}
@@ -275,11 +279,69 @@ impl FacetCollector {
self.facets.insert(facet);
}
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
self.current_segment_collapse_mapping.clear();
self.current_collapse_facet_ords.clear();
self.current_segment_counts.clear();
let mut collapse_facet_it = self.facets.iter().peekable();
self.current_collapse_facet_ords.push(0);
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
if !facet_streamer.advance() {
return;
}
'outer: loop {
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Reached => {
// we reach a facet we decided to collapse.
let collapse_depth = facet_depth(facet_streamer.key());
let mut collapsed_id = 0;
self.current_segment_collapse_mapping.push(0);
while facet_streamer.advance() {
let depth = facet_depth(facet_streamer.key());
if depth <= collapse_depth {
continue 'outer;
}
if depth == collapse_depth + 1 {
collapsed_id = self.current_collapse_facet_ords.len();
self.current_collapse_facet_ords
.push(facet_streamer.term_ord());
self.current_segment_collapse_mapping.push(collapsed_id);
} else {
self.current_segment_collapse_mapping.push(collapsed_id);
}
}
break;
}
SkipResult::End | SkipResult::OverStep => {
self.current_segment_collapse_mapping.push(0);
if !facet_streamer.advance() {
break;
}
}
}
}
}
fn finalize_segment(&mut self) {
if self.ff_reader.is_some() {
self.segment_counters.push(SegmentFacetCounter {
facet_reader: self.ff_reader.take().unwrap().into_inner(),
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
});
}
}
/// Returns the results of the collection.
///
/// This method does not just return the counters,
/// it also translates the facet ordinals of the last segment.
pub fn harvest(self) -> FacetCounts {
pub fn harvest(mut self) -> FacetCounts {
self.finalize_segment();
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
.iter()
.map(|segment_counter| &segment_counter.facet_ords[..])
@@ -327,92 +389,30 @@ impl FacetCollector {
}
}
impl FacetSegmentCollector {
fn into_segment_facet_counter(self) -> SegmentFacetCounter {
SegmentFacetCounter {
facet_reader: self.reader,
facet_ords: self.collapse_facet_ords,
facet_counts: self.counts,
}
}
}
impl Collector for FacetCollector {
type Child = FacetSegmentCollector;
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<FacetSegmentCollector> {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.finalize_segment();
let facet_reader = reader.facet_reader(self.field)?;
let mut collapse_mapping = Vec::new();
let mut counts = Vec::new();
let mut collapse_facet_ords = Vec::new();
let mut collapse_facet_it = self.facets.iter().peekable();
collapse_facet_ords.push(0);
{
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
if facet_streamer.advance() {
'outer: loop {
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Reached => {
// we reach a facet we decided to collapse.
let collapse_depth = facet_depth(facet_streamer.key());
let mut collapsed_id = 0;
collapse_mapping.push(0);
while facet_streamer.advance() {
let depth = facet_depth(facet_streamer.key());
if depth <= collapse_depth {
continue 'outer;
}
if depth == collapse_depth + 1 {
collapsed_id = collapse_facet_ords.len();
collapse_facet_ords.push(facet_streamer.term_ord());
collapse_mapping.push(collapsed_id);
} else {
collapse_mapping.push(collapsed_id);
}
}
break;
}
SkipResult::End | SkipResult::OverStep => {
collapse_mapping.push(0);
if !facet_streamer.advance() {
break;
}
}
}
}
}
}
counts.resize(collapse_facet_ords.len(), 0);
Ok(FacetSegmentCollector {
reader: facet_reader,
facet_ords_buf: Vec::with_capacity(255),
collapse_mapping,
counts,
collapse_facet_ords,
})
self.set_collapse_mapping(&facet_reader);
self.current_segment_counts
.resize(self.current_collapse_facet_ords.len(), 0);
self.ff_reader = Some(UnsafeCell::new(facet_reader));
Ok(())
}
fn requires_scoring(&self) -> bool {
false
}
}
impl SegmentCollector for FacetSegmentCollector {
type CollectionResult = Vec<SegmentFacetCounter>;
fn collect(&mut self, doc: DocId, _: Score) {
self.reader.facet_ords(doc, &mut self.facet_ords_buf);
let facet_reader: &mut FacetReader = unsafe {
&mut *self.ff_reader
.as_ref()
.expect("collect() was called before set_segment. This should never happen.")
.get()
};
facet_reader.facet_ords(doc, &mut self.facet_ords);
let mut previous_collapsed_ord: usize = usize::MAX;
for &facet_ord in &self.facet_ords_buf {
let collapsed_ord = self.collapse_mapping[facet_ord as usize];
self.counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord {
for &facet_ord in &self.facet_ords {
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
{
0
} else {
1
@@ -421,8 +421,8 @@ impl SegmentCollector for FacetSegmentCollector {
}
}
fn finalize(self) -> Vec<SegmentFacetCounter> {
vec![self.into_segment_facet_counter()]
fn requires_scoring(&self) -> bool {
false
}
}
@@ -507,7 +507,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| {
@@ -587,7 +587,7 @@ mod tests {
.collect();
thread_rng().shuffle(&mut docs[..]);
let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc);
}
@@ -644,7 +644,7 @@ mod bench {
// 40425 docs
thread_rng().shuffle(&mut docs[..]);
let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc);
}

src/collector/mod.rs

@@ -7,15 +7,12 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use query::Query;
use Searcher;
use downcast;
mod count_collector;
pub use self::count_collector::CountCollector;
//mod multi_collector;
//pub use self::multi_collector::MultiCollector;
mod multi_collector;
pub use self::multi_collector::MultiCollector;
mod top_collector;
pub use self::top_collector::TopCollector;
@@ -24,7 +21,7 @@ mod facet_collector;
pub use self::facet_collector::FacetCollector;
mod chained_collector;
pub use self::chained_collector::chain;
pub use self::chained_collector::{chain, ChainedCollector};
/// Collectors are in charge of collecting and retaining relevant
/// information from the document found and scored by the query.
@@ -56,90 +53,31 @@ pub use self::chained_collector::chain;
///
/// Segments are not guaranteed to be visited in any specific order.
pub trait Collector {
type Child : SegmentCollector + 'static;
/// `set_segment` is called before beginning to enumerate
/// on this segment.
fn for_segment(
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<Self::Child>;
/// Returns true iff the collector requires to compute scores for documents.
fn requires_scoring(&self) -> bool;
/// Search works as follows :
///
/// First the weight object associated to the query is created.
///
/// Then, the query loops over the segments and for each segment :
/// - setup the collector and informs it that the segment being processed has changed.
/// - creates a SegmentCollector for collecting documents associated to the segment
/// - creates a `Scorer` object associated for this segment
/// - iterate through the matched documents and push them to the segment collector.
/// - turn the segment collector into a Combinable segment result
///
/// Combining all of the segment results gives a single Child::CollectionResult, which is returned.
///
/// The result will be Ok(None) in case of having no segments.
fn search(&mut self, searcher: &Searcher, query: &Query) -> Result<Option<<Self::Child as SegmentCollector>::CollectionResult>> {
let scoring_enabled = self.requires_scoring();
let weight = query.weight(searcher, scoring_enabled)?;
let mut results = Vec::new();
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
let mut child: Self::Child = self.for_segment(segment_ord as SegmentLocalId, segment_reader)?;
let mut scorer = weight.scorer(segment_reader)?;
scorer.collect(&mut child, segment_reader.delete_bitset());
results.push(child.finalize());
}
Ok(results.into_iter().fold1(|x,y| {
x.combine_into(y);
x
}))
}
}
pub trait Combinable {
fn combine_into(&mut self, other: Self);
}
impl Combinable for () {
fn combine_into(&mut self, other: Self) {
()
}
}
impl<T> Combinable for Vec<T> {
fn combine_into(&mut self, other: Self) {
self.extend(other.into_iter());
}
}
impl<L: Combinable, R: Combinable> Combinable for (L, R) {
fn combine_into(&mut self, other: Self) {
self.0.combine_into(other.0);
self.1.combine_into(other.1);
}
}
pub trait SegmentCollector: downcast::Any + 'static {
type CollectionResult: Combinable + downcast::Any + 'static;
) -> Result<()>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
/// Turn into the final result
fn finalize(self) -> Self::CollectionResult;
/// Returns true iff the collector requires to compute scores for documents.
fn requires_scoring(&self) -> bool;
}
impl<'a, C: Collector> Collector for &'a mut C {
type Child = C::Child;
fn for_segment(
&mut self, // TODO Ask Jason : why &mut self here!?
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<C::Child> {
(*self).for_segment(segment_local_id, segment)
) -> Result<()> {
(*self).set_segment(segment_local_id, segment)
}
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score) {
C::collect(self, doc, score)
}
fn requires_scoring(&self) -> bool {
@@ -147,61 +85,6 @@ impl<'a, C: Collector> Collector for &'a mut C {
}
}
pub struct CollectorWrapper<'a, TCollector: 'a + Collector>(&'a mut TCollector);
impl<'a, T: 'a + Collector> CollectorWrapper<'a, T> {
pub fn new(collector: &'a mut T) -> CollectorWrapper<'a, T> {
CollectorWrapper(collector)
}
}
impl<'a, T: 'a + Collector> Collector for CollectorWrapper<'a, T> {
type Child = T::Child;
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<T::Child> {
self.0.for_segment(segment_local_id, segment)
}
fn requires_scoring(&self) -> bool {
self.0.requires_scoring()
}
}
trait UntypedCollector {
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<Box<UntypedSegmentCollector>>;
}
impl<'a, TCollector:'a + Collector> UntypedCollector for CollectorWrapper<'a, TCollector> {
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<Box<UntypedSegmentCollector>> {
let segment_collector = self.0.for_segment(segment_local_id, segment)?;
Ok(Box::new(segment_collector))
}
}
trait UntypedSegmentCollector {
fn finalize(self) -> Box<UntypedCombinable>;
}
trait UntypedCombinable {
fn combine_into(&mut self, other: Box<UntypedCombinable>);
}
pub struct CombinableWrapper<'a, T: 'a + Combinable>(&'a mut T);
impl<'a, T: 'a + Combinable> CombinableWrapper<'a, T> {
pub fn new(combinable: &'a mut T) -> CombinableWrapper<'a, T> {
CombinableWrapper(combinable)
}
}
impl<'a, T: 'a + Combinable> Combinable for CombinableWrapper<'a, T> {
fn combine_into(&mut self, other: Self) {
self.0.combine_into(*::downcast::Downcast::<T>::downcast(other).unwrap())
}
}
#[cfg(test)]
pub mod tests {
@@ -219,13 +102,8 @@ pub mod tests {
/// It is unusable in practice, as it does not store
/// the segment ordinals
pub struct TestCollector {
next_offset: DocId,
docs: Vec<DocId>,
scores: Vec<Score>,
}
pub struct TestSegmentCollector {
offset: DocId,
segment_max_doc: DocId,
docs: Vec<DocId>,
scores: Vec<Score>,
}
@@ -244,7 +122,8 @@ pub mod tests {
impl Default for TestCollector {
fn default() -> TestCollector {
TestCollector {
next_offset: 0,
offset: 0,
segment_max_doc: 0,
docs: Vec::new(),
scores: Vec::new(),
}
@@ -252,33 +131,19 @@ pub mod tests {
}
impl Collector for TestCollector {
type Child = TestSegmentCollector;
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<TestSegmentCollector> {
let offset = self.next_offset;
self.next_offset += reader.max_doc();
Ok(TestSegmentCollector {
offset,
docs: Vec::new(),
scores: Vec::new(),
})
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.offset += self.segment_max_doc;
self.segment_max_doc = reader.max_doc();
Ok(())
}
fn requires_scoring(&self) -> bool {
true
}
}
impl SegmentCollector for TestSegmentCollector {
type CollectionResult = Vec<TestSegmentCollector>;
fn collect(&mut self, doc: DocId, score: Score) {
self.docs.push(doc + self.offset);
self.scores.push(score);
}
fn finalize(self) -> Vec<TestSegmentCollector> {
vec![self]
fn requires_scoring(&self) -> bool {
true
}
}
@@ -287,26 +152,17 @@ pub mod tests {
///
/// This collector is mainly useful for tests.
pub struct FastFieldTestCollector {
next_counter: usize,
field: Field,
}
#[derive(Default)]
pub struct FastFieldSegmentCollectorState {
counter: usize,
vals: Vec<u64>,
}
pub struct FastFieldSegmentCollector {
state: FastFieldSegmentCollectorState,
reader: FastFieldReader<u64>,
field: Field,
ff_reader: Option<FastFieldReader<u64>>,
}
impl FastFieldTestCollector {
pub fn for_field(field: Field) -> FastFieldTestCollector {
FastFieldTestCollector {
next_counter: 0,
vals: Vec::new(),
field,
ff_reader: None,
}
}
@@ -316,32 +172,17 @@ pub mod tests {
}
impl Collector for FastFieldTestCollector {
type Child = FastFieldSegmentCollector;
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<FastFieldSegmentCollector> {
let counter = self.next_counter;
self.next_counter += 1;
Ok(FastFieldSegmentCollector {
state: FastFieldSegmentCollectorState::default(),
reader: reader.fast_field_reader(self.field)?,
})
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
Ok(())
}
fn requires_scoring(&self) -> bool {
false
}
}
impl SegmentCollector for FastFieldSegmentCollector {
type CollectionResult = Vec<FastFieldSegmentCollectorState>;
fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.reader.get(doc);
let val = self.ff_reader.as_ref().unwrap().get(doc);
self.vals.push(val);
}
fn finalize(self) -> Vec<FastFieldSegmentCollectorState> {
vec![self.state]
fn requires_scoring(&self) -> bool {
false
}
}
@@ -352,11 +193,7 @@ pub mod tests {
pub struct BytesFastFieldTestCollector {
vals: Vec<u8>,
field: Field,
}
pub struct BytesFastFieldSegmentCollector {
vals: Vec<u8>,
reader: BytesFastFieldReader,
ff_reader: Option<BytesFastFieldReader>,
}
impl BytesFastFieldTestCollector {
@@ -364,6 +201,7 @@ pub mod tests {
BytesFastFieldTestCollector {
vals: Vec::new(),
field,
ff_reader: None,
}
}
@@ -373,32 +211,20 @@ pub mod tests {
}
impl Collector for BytesFastFieldTestCollector {
type Child = BytesFastFieldSegmentCollector;
fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> {
self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?);
Ok(())
}
fn for_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<BytesFastFieldSegmentCollector> {
Ok(BytesFastFieldSegmentCollector {
vals: Vec::new(),
reader: segment.bytes_fast_field_reader(self.field)?,
})
fn collect(&mut self, doc: u32, _score: f32) {
let val = self.ff_reader.as_ref().unwrap().get_val(doc);
self.vals.extend(val);
}
fn requires_scoring(&self) -> bool {
false
}
}
impl SegmentCollector for BytesFastFieldSegmentCollector {
type CollectionResult = Vec<Vec<u8>>;
fn collect(&mut self, doc: u32, _score: f32) {
let val = self.reader.get_val(doc);
self.vals.extend(val);
}
fn finalize(self) -> Vec<Vec<u8>> {
vec![self.vals]
}
}
}
#[cfg(all(test, feature = "unstable"))]
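The net effect of this file's refactor is that `Collector` is again a single self-contained trait (`set_segment` / `collect` / `requires_scoring`), with per-segment state held on the collector itself. A minimal custom collector against the trait as it stands after this diff — a sketch, not code from the PR:

```rust
// Minimal sketch of a custom collector implementing the refactored trait.
// A collector that needs per-segment data (e.g. a fast field reader) would
// fetch it in set_segment, as FastFieldTestCollector does above.
extern crate tantivy;
use tantivy::collector::Collector;
use tantivy::{DocId, Result, Score, SegmentLocalId, SegmentReader};

/// Counts documents and remembers the best score seen across all segments.
#[derive(Default)]
struct BestScoreCollector {
    count: usize,
    best_score: Score,
}

impl Collector for BestScoreCollector {
    fn set_segment(&mut self, _segment_local_id: SegmentLocalId, _segment: &SegmentReader) -> Result<()> {
        // No per-segment state needed for this collector.
        Ok(())
    }

    fn collect(&mut self, _doc: DocId, score: Score) {
        self.count += 1;
        if score > self.best_score {
            self.best_score = score;
        }
    }

    fn requires_scoring(&self) -> bool {
        true
    }
}
```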

src/collector/multi_collector.rs

@@ -1,122 +1,119 @@
use super::Collector;
use super::SegmentCollector;
use DocId;
use Score;
use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use downcast::Downcast;
/// Multicollector makes it possible to collect on more than one collector.
/// It should only be used for use cases where the Collector types is unknown
/// at compile time.
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::{CountCollector, TopCollector, MultiCollector};
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors =
/// MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 2);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
pub struct MultiCollector<'a> {
collector_wrappers: Vec<Box<UntypedCollector + 'a>>
collectors: Vec<&'a mut Collector>,
}
impl<'a> MultiCollector<'a> {
pub fn new() -> MultiCollector<'a> {
MultiCollector {
collector_wrappers: Vec::new()
}
}
pub fn add_collector<TCollector: 'a + Collector>(&mut self, collector: &'a mut TCollector) {
let collector_wrapper = CollectorWrapper(collector);
self.collector_wrappers.push(Box::new(collector_wrapper));
/// Constructor
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
MultiCollector { collectors }
}
}
impl<'a> Collector for MultiCollector<'a> {
type Child = MultiCollectorChild;
fn for_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<MultiCollectorChild> {
let children = self.collector_wrappers
.iter_mut()
.map(|collector_wrapper| {
collector_wrapper.for_segment(segment_local_id, segment)
})
.collect::<Result<Vec<_>>>()?;
Ok(MultiCollectorChild {
children
})
}
fn requires_scoring(&self) -> bool {
self.collector_wrappers
.iter()
.any(|c| c.requires_scoring())
}
fn merge_children(&mut self, children: Vec<MultiCollectorChild>) {
let mut per_collector_children: Vec<Vec<Box<SegmentCollector>>> =
(0..self.collector_wrappers.len())
.map(|_| Vec::with_capacity(children.len()))
.collect::<Vec<_>>();
for child in children {
for (idx, segment_collector) in child.children.into_iter().enumerate() {
per_collector_children[idx].push(segment_collector);
}
}
for (collector, children) in self.collector_wrappers.iter_mut().zip(per_collector_children) {
collector.merge_children_anys(children);
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
for collector in &mut self.collectors {
collector.set_segment(segment_local_id, segment)?;
}
Ok(())
}
}
pub struct MultiCollectorChild {
children: Vec<Box<SegmentCollector>>
}
impl SegmentCollector for MultiCollectorChild {
fn collect(&mut self, doc: DocId, score: Score) {
for child in &mut self.children {
child.collect(doc, score);
for collector in &mut self.collectors {
collector.collect(doc, score);
}
}
fn requires_scoring(&self) -> bool {
self.collectors
.iter()
.any(|collector| collector.requires_scoring())
}
}
#[cfg(test)]
mod tests {
use super::*;
use collector::{Collector, CountCollector, TopCollector};
use schema::{TEXT, SchemaBuilder};
use query::TermQuery;
use Index;
use Term;
use schema::IndexRecordOption;
#[test]
fn test_multi_collector() {
let mut schema_builder = SchemaBuilder::new();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text=>"abc"));
index_writer.add_document(doc!(text=>"abc abc abc"));
index_writer.add_document(doc!(text=>"abc abc"));
index_writer.commit().unwrap();
index_writer.add_document(doc!(text=>""));
index_writer.add_document(doc!(text=>"abc abc abc abc"));
index_writer.add_document(doc!(text=>"abc"));
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_text(text, "abc");
let query = TermQuery::new(term, IndexRecordOption::Basic);
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors = MultiCollector::new();
collectors.add_collector(&mut top_collector);
collectors.add_collector(&mut count_collector);
collectors.search(&*searcher, &query).unwrap();
let mut collectors =
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);
}
assert_eq!(count_collector.count(), 5);
assert_eq!(count_collector.count(), 3);
assert!(top_collector.at_capacity());
}
}

src/collector/top_collector.rs

@@ -7,8 +7,6 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;
use collector::Combinable;
// Rust heap is a max-heap and we need a min heap.
#[derive(Clone, Copy)]
@@ -45,7 +43,61 @@ impl Eq for GlobalScoredDoc {}
/// with the best scores.
///
/// The implementation is based on a `BinaryHeap`.
/// The theoretical complexity is `O(n log K)`.
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result, DocId, Score};
/// use tantivy::collector::TopCollector;
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut top_collector).unwrap();
///
/// let score_docs: Vec<(Score, DocId)> = top_collector
/// .score_docs()
/// .into_iter()
/// .map(|(score, doc_address)| (score, doc_address.doc()))
/// .collect();
///
/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]);
/// }
///
/// Ok(())
/// }
/// ```
pub struct TopCollector {
limit: usize,
heap: BinaryHeap<GlobalScoredDoc>,
@@ -101,34 +153,11 @@ impl TopCollector {
}
impl Collector for TopCollector {
type Child = TopCollector;
fn for_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<TopCollector> {
Ok(TopCollector {
limit: self.limit,
heap: BinaryHeap::new(),
segment_id,
})
}
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
self.segment_id = segment_id;
Ok(())
}
fn requires_scoring(&self) -> bool {
true
}
}
impl Combinable for TopCollector {
// TODO: I think this could be a bit better
fn combine_into(&mut self, other: Self) {
self.segment_id = other.segment_id;
while let Some(doc) = other.heap.pop() {
self.collect(doc.doc_address.doc(), doc.score);
}
}
}
impl SegmentCollector for TopCollector {
type CollectionResult = TopCollector;
fn collect(&mut self, doc: DocId, score: Score) {
if self.at_capacity() {
// It's ok to unwrap as long as a limit of 0 is forbidden.
@@ -151,8 +180,8 @@ impl SegmentCollector for TopCollector {
}
}
fn finalize(self) -> TopCollector {
self
}
fn requires_scoring(&self) -> bool {
true
}
}
@@ -160,6 +189,7 @@ impl SegmentCollector for TopCollector {
mod tests {
use super::*;
use collector::Collector;
use DocId;
use Score;
@@ -210,4 +240,5 @@ mod tests {
fn test_top_0() {
TopCollector::with_limit(0);
}
}

View File

@@ -46,7 +46,7 @@ impl BitPacker {
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
output.write_all(&arr[..num_bytes])?;
self.mini_buffer_written = 0;
}
@@ -98,31 +98,14 @@ where
let addr_in_bits = idx * num_bits;
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
if cfg!(feature = "simdcompression") {
// for simdcompression,
// the bitpacker is only used for fastfields,
// and we expect them to be always padded.
debug_assert!(
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 =
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
} else {
let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }
} else {
let mut buffer = [0u8; 8];
for i in addr..data.len() {
buffer[i - addr] += data[i];
}
unsafe { ptr::read_unaligned(buffer[..].as_ptr() as *const u64) }
};
let val_shifted = val_unshifted_unmasked >> (bit_shift as u64);
val_shifted & mask
}
debug_assert!(
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 =
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
}
/// Reads a range of values from the fast field.

View File

@@ -1,7 +1,8 @@
use compression::compressed_block_size;
use compression::BlockDecoder;
use compression::COMPRESSION_BLOCK_SIZE;
use directory::{ReadOnlySource, SourceRead};
use directory::ReadOnlySource;
use owned_read::OwnedRead;
/// Reads a stream of compressed ints.
///
@@ -10,7 +11,7 @@ use directory::{ReadOnlySource, SourceRead};
/// The `.skip(...)` makes it possible to avoid
/// decompressing blocks that are not required.
pub struct CompressedIntStream {
buffer: SourceRead,
buffer: OwnedRead,
block_decoder: BlockDecoder,
cached_addr: usize, // address of the currently decoded block
@@ -24,7 +25,7 @@ impl CompressedIntStream {
/// Opens a compressed int stream.
pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream {
CompressedIntStream {
buffer: SourceRead::from(source),
buffer: OwnedRead::new(source),
block_decoder: BlockDecoder::new(),
cached_addr: usize::max_value(),
cached_next_addr: usize::max_value(),

View File

@@ -21,6 +21,7 @@ use directory::ManagedDirectory;
use directory::MmapDirectory;
use directory::{Directory, RAMDirectory};
use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN;
use indexer::segment_updater::save_new_metas;
use indexer::DirectoryLock;
use num_cpus;
@@ -51,12 +52,7 @@ impl Index {
/// This should only be used for unit tests.
pub fn create_in_ram(schema: Schema) -> Index {
let ram_directory = RAMDirectory::create();
// unwrap is ok here
let directory = ManagedDirectory::new(ram_directory).expect(
"Creating a managed directory from a brand new RAM directory \
should never fail.",
);
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
Index::create(ram_directory, schema).expect("Creating a RAMDirectory should never fail")
}
/// Creates a new index in a given filepath.
@@ -64,15 +60,9 @@ impl Index {
///
/// If a previous index was in this directory, then its meta file will be destroyed.
#[cfg(feature = "mmap")]
pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
let directory = ManagedDirectory::new(mmap_directory)?;
Index::from_directory(directory, schema)
Index::create(mmap_directory, schema)
}
/// Creates a new index in a temp directory.
@@ -86,10 +76,22 @@ impl Index {
#[cfg(feature = "mmap")]
pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::create_from_tempdir()?;
let directory = ManagedDirectory::new(mmap_directory)?;
Index::create(mmap_directory, schema)
}
/// Creates a new index given an implementation of the trait `Directory`
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
let directory = ManagedDirectory::new(dir)?;
Index::from_directory(directory, schema)
}
/// Create a new index from a directory.
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas)
}
/// Creates a new index given a directory and an `IndexMeta`.
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
let schema = metas.schema.clone();
@@ -103,24 +105,22 @@ impl Index {
Ok(index)
}
/// Open the index using the provided directory
pub fn open_directory<D: Directory>(directory: D) -> Result<Index> {
let directory = ManagedDirectory::new(directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}
/// Accessor for the tokenizer manager.
pub fn tokenizers(&self) -> &TokenizerManager {
&self.tokenizers
}
/// Opens a new directory from an index path.
#[cfg(feature = "mmap")]
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
Index::open_directory(mmap_directory)
Index::open(mmap_directory)
}
/// Open the index using the provided directory
pub fn open<D: Directory>(directory: D) -> Result<Index> {
let directory = ManagedDirectory::new(directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}
@@ -137,9 +137,13 @@ impl Index {
/// `IndexWriter` on the system is accessing the index directory,
/// it is safe to manually delete the lockfile.
///
/// num_threads specifies the number of indexing workers that
/// - `num_threads` defines the number of indexing workers that
/// should work at the same time.
///
/// - `overall_heap_size_in_bytes` sets the amount of memory
/// allocated for all indexing threads.
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
@@ -147,21 +151,35 @@ impl Index {
pub fn writer_with_num_threads(
&self,
num_threads: usize,
heap_size_in_bytes: usize,
overall_heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
open_index_writer(self, num_threads, heap_size_in_bytes, directory_lock)
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
open_index_writer(
self,
num_threads,
heap_size_in_bytes_per_thread,
directory_lock,
)
}
/// Creates a multithreaded writer
/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
///
/// Tantivy will automatically define the number of threads to use.
/// `overall_heap_size_in_bytes` is the total target memory usage that will be split
/// between a given number of threads.
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer(&self, heap_size_in_bytes: usize) -> Result<IndexWriter> {
self.writer_with_num_threads(num_cpus::get(), heap_size_in_bytes)
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> Result<IndexWriter> {
let mut num_threads = num_cpus::get();
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
num_threads = (overall_heap_size_in_bytes / HEAP_SIZE_MIN).max(1);
}
self.writer_with_num_threads(num_threads, overall_heap_size_in_bytes)
}
/// Accessor to the index schema
@@ -186,8 +204,8 @@ impl Index {
/// Creates a new segment.
pub fn new_segment(&self) -> Segment {
let segment_meta = SegmentMeta::new(SegmentId::generate_random());
create_segment(self.clone(), segment_meta)
let segment_meta = SegmentMeta::new(SegmentId::generate_random(), 0);
self.segment(segment_meta)
}
/// Return a reference to the index directory.

View File

@@ -1,6 +1,6 @@
use common::BinarySerializable;
use compression::CompressedIntStream;
use directory::{ReadOnlySource, SourceRead};
use directory::ReadOnlySource;
use postings::FreqReadingOption;
use postings::TermInfo;
use postings::{BlockSegmentPostings, SegmentPostings};
@@ -8,6 +8,7 @@ use schema::FieldType;
use schema::IndexRecordOption;
use schema::Term;
use termdict::TermDictionary;
use owned_read::OwnedRead;
/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.
@@ -92,7 +93,7 @@ impl InvertedIndexReader {
let offset = term_info.postings_offset as usize;
let end_source = self.postings_source.len();
let postings_slice = self.postings_source.slice(offset, end_source);
let postings_reader = SourceRead::from(postings_slice);
let postings_reader = OwnedRead::new(postings_slice);
block_postings.reset(term_info.doc_freq as usize, postings_reader);
}
@@ -114,7 +115,7 @@ impl InvertedIndexReader {
};
BlockSegmentPostings::from_data(
term_info.doc_freq as usize,
SourceRead::from(postings_data),
OwnedRead::new(postings_data),
freq_reading_option,
)
}

View File

@@ -73,7 +73,7 @@ impl Searcher {
/// Runs a query on the segment readers wrapped by the searcher
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
collector.search(self, query)
query.search(self, collector)
}
/// Return the field searcher associated to a `Field`.

View File

@@ -4,7 +4,7 @@ use core::SegmentId;
use core::SegmentMeta;
use directory::error::{OpenReadError, OpenWriteError};
use directory::Directory;
use directory::{FileProtection, ReadOnlySource, WritePtr};
use directory::{ReadOnlySource, WritePtr};
use indexer::segment_serializer::SegmentSerializer;
use schema::Schema;
use std::fmt;
@@ -28,6 +28,7 @@ impl fmt::Debug for Segment {
/// Creates a new segment given an `Index` and a `SegmentId`
///
/// The function is here to make it private outside `tantivy`.
/// #[doc(hidden)]
pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
Segment { index, meta }
}
@@ -49,8 +50,11 @@ impl Segment {
}
#[doc(hidden)]
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
self.meta.set_delete_meta(num_deleted_docs, opstamp);
}
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> Segment {
Segment {
index: self.index,
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),
}
}
/// Returns the segment's id.
@@ -66,16 +70,6 @@ impl Segment {
self.meta.relative_path(component)
}
/// Protects a specific component file from being deleted.
///
/// Returns a FileProtection object. The file is guaranteed
/// to not be garbage collected as long as this `FileProtection` object
/// lives.
pub fn protect_from_delete(&self, component: SegmentComponent) -> FileProtection {
let path = self.relative_path(component);
self.index.directory().protect_file_from_delete(&path)
}
/// Open one of the component file for a *regular* read.
pub fn open_read(
&self,
@@ -105,35 +99,3 @@ pub trait SerializableSegment {
/// The number of documents in the segment.
fn write(&self, serializer: SegmentSerializer) -> Result<u32>;
}
#[cfg(test)]
mod tests {
use core::SegmentComponent;
use directory::Directory;
use schema::SchemaBuilder;
use std::collections::HashSet;
use Index;
#[test]
fn test_segment_protect_component() {
let mut index = Index::create_in_ram(SchemaBuilder::new().build());
let segment = index.new_segment();
let path = segment.relative_path(SegmentComponent::POSTINGS);
let directory = index.directory_mut();
directory.atomic_write(&*path, &vec![0u8]).unwrap();
let living_files = HashSet::new();
{
let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
assert!(directory.exists(&*path));
directory.garbage_collect(|| living_files.clone());
assert!(directory.exists(&*path));
}
directory.garbage_collect(|| living_files);
assert!(!directory.exists(&*path));
}
}

View File

@@ -1,8 +1,15 @@
use super::SegmentComponent;
use census::{Inventory, TrackedObject};
use core::SegmentId;
use serde;
use std::collections::HashSet;
use std::fmt;
use std::path::PathBuf;
lazy_static! {
static ref INVENTORY: Inventory<InnerSegmentMeta> = { Inventory::new() };
}
#[derive(Clone, Debug, Serialize, Deserialize)]
struct DeleteMeta {
num_deleted_docs: u32,
@@ -13,32 +20,72 @@ struct DeleteMeta {
///
/// For instance the number of docs it contains,
/// how many are deleted, etc.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[derive(Clone)]
pub struct SegmentMeta {
segment_id: SegmentId,
max_doc: u32,
deletes: Option<DeleteMeta>,
tracked: TrackedObject<InnerSegmentMeta>,
}
impl fmt::Debug for SegmentMeta {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
self.tracked.fmt(f)
}
}
impl serde::Serialize for SegmentMeta {
fn serialize<S>(
&self,
serializer: S,
) -> Result<<S as serde::Serializer>::Ok, <S as serde::Serializer>::Error>
where
S: serde::Serializer,
{
self.tracked.serialize(serializer)
}
}
impl<'a> serde::Deserialize<'a> for SegmentMeta {
fn deserialize<D>(deserializer: D) -> Result<Self, <D as serde::Deserializer<'a>>::Error>
where
D: serde::Deserializer<'a>,
{
let inner = InnerSegmentMeta::deserialize(deserializer)?;
let tracked = INVENTORY.track(inner);
Ok(SegmentMeta { tracked: tracked })
}
}
impl SegmentMeta {
/// Creates a new segment meta for
/// a segment with no deletes and no documents.
pub fn new(segment_id: SegmentId) -> SegmentMeta {
SegmentMeta {
segment_id,
max_doc: 0,
deletes: None,
}
}
/// Lists all living `SegmentMeta` objects at the time of the call.
pub fn all() -> Vec<SegmentMeta> {
INVENTORY
.list()
.into_iter()
.map(|inner| SegmentMeta { tracked: inner })
.collect::<Vec<_>>()
}
/// Creates a new `SegmentMeta` object.
#[doc(hidden)]
pub fn new(segment_id: SegmentId, max_doc: u32) -> SegmentMeta {
let inner = InnerSegmentMeta {
segment_id,
max_doc,
deletes: None,
};
SegmentMeta {
tracked: INVENTORY.track(inner),
}
}
/// Returns the segment id.
pub fn id(&self) -> SegmentId {
self.segment_id
self.tracked.segment_id
}
/// Returns the number of deleted documents.
pub fn num_deleted_docs(&self) -> u32 {
self.deletes
self.tracked
.deletes
.as_ref()
.map(|delete_meta| delete_meta.num_deleted_docs)
.unwrap_or(0u32)
@@ -80,7 +127,7 @@ impl SegmentMeta {
/// and all the doc ids contained in this segment
/// are exactly (0..max_doc).
pub fn max_doc(&self) -> u32 {
self.max_doc
self.tracked.max_doc
}
/// Return the number of documents in the segment.
@@ -91,25 +138,36 @@ impl SegmentMeta {
/// Returns the opstamp of the last delete operation
/// taken in account in this segment.
pub fn delete_opstamp(&self) -> Option<u64> {
self.deletes.as_ref().map(|delete_meta| delete_meta.opstamp)
self.tracked
.deletes
.as_ref()
.map(|delete_meta| delete_meta.opstamp)
}
/// Returns true iff the segment meta contains
/// delete information.
pub fn has_deletes(&self) -> bool {
self.deletes.is_some()
self.num_deleted_docs() > 0
}
#[doc(hidden)]
pub fn set_max_doc(&mut self, max_doc: u32) {
self.max_doc = max_doc;
}
#[doc(hidden)]
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
self.deletes = Some(DeleteMeta {
num_deleted_docs,
opstamp,
});
}
#[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> SegmentMeta {
let delete_meta = DeleteMeta {
num_deleted_docs,
opstamp,
};
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
segment_id: inner_meta.segment_id,
max_doc: inner_meta.max_doc,
deletes: Some(delete_meta),
});
SegmentMeta { tracked }
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]
struct InnerSegmentMeta {
segment_id: SegmentId,
max_doc: u32,
deletes: Option<DeleteMeta>,
}

View File

@@ -1,4 +0,0 @@
mod skip;
pub mod stacker;
pub use self::skip::{SkipList, SkipListBuilder};

View File

@@ -1,168 +0,0 @@
use super::heap::{Heap, HeapAllocable};
use std::mem;
#[inline]
pub fn is_power_of_2(val: u32) -> bool {
val & (val - 1) == 0
}
#[inline]
pub fn jump_needed(val: u32) -> bool {
val > 3 && is_power_of_2(val)
}
#[derive(Debug, Clone)]
pub struct ExpUnrolledLinkedList {
len: u32,
end: u32,
val0: u32,
val1: u32,
val2: u32,
next: u32, // inline of the first block
}
impl ExpUnrolledLinkedList {
pub fn iter<'a>(&self, addr: u32, heap: &'a Heap) -> ExpUnrolledLinkedListIterator<'a> {
ExpUnrolledLinkedListIterator {
heap,
addr: addr + 2u32 * (mem::size_of::<u32>() as u32),
len: self.len,
consumed: 0,
}
}
pub fn push(&mut self, val: u32, heap: &Heap) {
self.len += 1;
if jump_needed(self.len) {
// we need to allocate another block.
// ... As we want to grow block exponentially
// the next block as a size of (length so far),
// and we need to add 1u32 to store the pointer
// to the next element.
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
let new_block_addr: u32 = heap.allocate_space(new_block_size);
heap.set(self.end, &new_block_addr);
self.end = new_block_addr;
}
heap.set(self.end, &val);
self.end += mem::size_of::<u32>() as u32;
}
}
impl HeapAllocable for u32 {
fn with_addr(_addr: u32) -> u32 {
0u32
}
}
impl HeapAllocable for ExpUnrolledLinkedList {
fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
let last_addr = addr + mem::size_of::<u32>() as u32 * 2u32;
ExpUnrolledLinkedList {
len: 0u32,
end: last_addr,
val0: 0u32,
val1: 0u32,
val2: 0u32,
next: 0u32,
}
}
}
pub struct ExpUnrolledLinkedListIterator<'a> {
heap: &'a Heap,
addr: u32,
len: u32,
consumed: u32,
}
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
type Item = u32;
fn next(&mut self) -> Option<u32> {
if self.consumed == self.len {
None
} else {
let addr: u32;
self.consumed += 1;
if jump_needed(self.consumed) {
addr = *self.heap.get_mut_ref(self.addr);
} else {
addr = self.addr;
}
self.addr = addr + mem::size_of::<u32>() as u32;
Some(*self.heap.get_mut_ref(addr))
}
}
}
#[cfg(test)]
mod tests {
use super::super::heap::Heap;
use super::*;
#[test]
fn test_stack() {
let heap = Heap::with_capacity(1_000_000);
let (addr, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
stack.push(1u32, &heap);
stack.push(2u32, &heap);
stack.push(4u32, &heap);
stack.push(8u32, &heap);
{
let mut it = stack.iter(addr, &heap);
assert_eq!(it.next().unwrap(), 1u32);
assert_eq!(it.next().unwrap(), 2u32);
assert_eq!(it.next().unwrap(), 4u32);
assert_eq!(it.next().unwrap(), 8u32);
assert!(it.next().is_none());
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::ExpUnrolledLinkedList;
use super::Heap;
use test::Bencher;
const NUM_STACK: usize = 10_000;
const STACK_SIZE: u32 = 1000;
#[bench]
fn bench_push_vec(bench: &mut Bencher) {
bench.iter(|| {
let mut vecs = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
vecs.push(Vec::new());
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;
vecs[t].push(i);
}
}
});
}
#[bench]
fn bench_push_stack(bench: &mut Bencher) {
let heap = Heap::with_capacity(64_000_000);
bench.iter(|| {
let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
stacks.push(stack);
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;
stacks[t].push(i, &heap);
}
}
heap.clear();
});
}
}

View File

@@ -1,335 +0,0 @@
use super::heap::{BytesRef, Heap, HeapAllocable};
use postings::UnorderedTermId;
use std::iter;
use std::mem;
use std::slice;
mod murmurhash2 {
const SEED: u32 = 3_242_157_231u32;
const M: u32 = 0x5bd1_e995;
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let len = key.len() as u32;
let mut h: u32 = SEED ^ len;
let num_blocks = len >> 2;
for _ in 0..num_blocks {
let mut k: u32 = unsafe { *key_ptr }; // ok because of num_blocks definition
k = k.wrapping_mul(M);
k ^= k >> 24;
k = k.wrapping_mul(M);
h = h.wrapping_mul(M);
h ^= k;
key_ptr = key_ptr.wrapping_offset(1);
}
// Handle the last few bytes of the input array
let remaining: &[u8] = &key[key.len() & !3..];
match remaining.len() {
3 => {
h ^= u32::from(remaining[2]) << 16;
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
2 => {
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
1 => {
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
_ => {}
}
h ^= h >> 13;
h = h.wrapping_mul(M);
h ^ (h >> 15)
}
}
/// Split the thread memory budget into
/// - the heap size
/// - the hash table "table" itself.
///
/// Returns (the heap size in bytes, the hash table size in number of bits)
pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
let table_size_limit: usize = per_thread_memory_budget / 3;
let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
let table_num_bits: usize = (1..)
.into_iter()
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
.last()
.expect(&format!(
"Per thread memory is too small: {}",
per_thread_memory_budget
));
let table_size = compute_table_size(table_num_bits);
let heap_size = per_thread_memory_budget - table_size;
(heap_size, table_num_bits)
}
/// `KeyValue` is the item stored in the hash table.
/// The key is actually a `BytesRef` object stored in an external heap.
/// The `value_addr` also points to an address in the heap.
///
/// The key and the value are actually stored contiguously.
/// For this reason, the (start, stop) information is actually redundant
/// and can be simplified in the future
#[derive(Copy, Clone, Default)]
struct KeyValue {
key_value_addr: BytesRef,
hash: u32,
}
impl KeyValue {
fn is_empty(&self) -> bool {
self.key_value_addr.is_null()
}
}
/// Customized `HashMap` with string keys
///
/// This `HashMap` takes String as keys. Keys are
/// stored in a user defined heap.
///
/// The quirky API has the benefit of avoiding
/// computing the hash of the key twice,
/// and of copying the key when no insert is needed.
///
pub struct TermHashMap<'a> {
table: Box<[KeyValue]>,
heap: &'a Heap,
mask: usize,
occupied: Vec<usize>,
}
struct QuadraticProbing {
hash: usize,
i: usize,
mask: usize,
}
impl QuadraticProbing {
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
QuadraticProbing { hash, i: 0, mask }
}
#[inline]
fn next_probe(&mut self) -> usize {
self.i += 1;
(self.hash + self.i * self.i) & self.mask
}
}
pub struct Iter<'a: 'b, 'b> {
hashmap: &'b TermHashMap<'a>,
inner: slice::Iter<'a, usize>,
}
impl<'a, 'b> Iterator for Iter<'a, 'b> {
type Item = (&'b [u8], u32, UnorderedTermId);
fn next(&mut self) -> Option<Self::Item> {
self.inner.next().cloned().map(move |bucket: usize| {
let kv = self.hashmap.table[bucket];
let (key, offset): (&'b [u8], u32) = self.hashmap.get_key_value(kv.key_value_addr);
(key, offset, bucket as UnorderedTermId)
})
}
}
impl<'a> TermHashMap<'a> {
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
let table_size = 1 << num_bucket_power_of_2;
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
TermHashMap {
table: table.into_boxed_slice(),
heap,
mask: table_size - 1,
occupied: Vec::with_capacity(table_size / 2),
}
}
fn probe(&self, hash: u32) -> QuadraticProbing {
QuadraticProbing::compute(hash as usize, self.mask)
}
pub fn is_saturated(&self) -> bool {
self.table.len() < self.occupied.len() * 3
}
#[inline(never)]
fn get_key_value(&self, bytes_ref: BytesRef) -> (&[u8], u32) {
let key_bytes: &[u8] = self.heap.get_slice(bytes_ref);
let expull_addr: u32 = bytes_ref.addr() + 2 + key_bytes.len() as u32;
(key_bytes, expull_addr)
}
pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
self.occupied.push(bucket);
self.table[bucket] = KeyValue {
key_value_addr,
hash,
};
}
pub fn iter<'b: 'a>(&'b self) -> Iter<'a, 'b> {
Iter {
inner: self.occupied.iter(),
hashmap: &self,
}
}
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
&mut self,
key: S,
) -> (UnorderedTermId, &mut V) {
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
let key_bytes_ref = self.heap.allocate_and_set(key_bytes);
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
self.set_bucket(hash, key_bytes_ref, bucket);
return (bucket as UnorderedTermId, val);
} else if kv.hash == hash {
let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
if stored_key == key_bytes {
return (
bucket as UnorderedTermId,
self.heap.get_mut_ref(expull_addr),
);
}
}
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::murmurhash2::murmurhash2;
use test::Bencher;
#[bench]
fn bench_murmurhash2(b: &mut Bencher) {
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
let mut s = 0;
for &key in &keys {
s ^= murmurhash2(key.as_bytes());
}
s
});
}
}
#[cfg(test)]
mod tests {
use super::super::heap::{Heap, HeapAllocable};
use super::murmurhash2::murmurhash2;
use super::split_memory;
use super::*;
use std::collections::HashSet;
struct TestValue {
val: u32,
_addr: u32,
}
impl HeapAllocable for TestValue {
fn with_addr(addr: u32) -> TestValue {
TestValue {
val: 0u32,
_addr: addr,
}
}
}
#[test]
fn test_hashmap_size() {
assert_eq!(split_memory(100_000), (67232, 12));
assert_eq!(split_memory(1_000_000), (737856, 15));
assert_eq!(split_memory(10_000_000), (7902848, 18));
}
#[test]
fn test_hash_map() {
let heap = Heap::with_capacity(2_000_000);
let mut hash_map: TermHashMap = TermHashMap::new(18, &heap);
{
let v: &mut TestValue = hash_map.get_or_create("abc").1;
assert_eq!(v.val, 0u32);
v.val = 3u32;
}
{
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
assert_eq!(v.val, 0u32);
v.val = 4u32;
}
{
let v: &mut TestValue = hash_map.get_or_create("abc").1;
assert_eq!(v.val, 3u32);
}
{
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
assert_eq!(v.val, 4u32);
}
let mut iter_values = hash_map.iter();
{
let (_, addr, _) = iter_values.next().unwrap();
let val: &TestValue = heap.get_ref(addr);
assert_eq!(val.val, 3u32);
}
{
let (_, addr, _) = iter_values.next().unwrap();
let val: &TestValue = heap.get_ref(addr);
assert_eq!(val.val, 4u32);
}
assert!(iter_values.next().is_none());
}
#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
#[test]
fn test_murmur_against_reference_impl() {
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
}
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
assert_eq!(set.len(), 10_000);
}
}

View File

@@ -1,233 +0,0 @@
use byteorder::{ByteOrder, NativeEndian};
use std::cell::UnsafeCell;
use std::mem;
use std::ptr;
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
///
/// The slice will encode the length of the `&[u8]` slice
/// on 16-bits, and then the data is encoded.
#[derive(Copy, Clone)]
pub struct BytesRef(u32);
impl BytesRef {
pub fn is_null(&self) -> bool {
self.0 == u32::max_value()
}
pub fn addr(&self) -> u32 {
self.0
}
}
impl Default for BytesRef {
fn default() -> BytesRef {
BytesRef(u32::max_value())
}
}
/// Object that can be allocated in tantivy's custom `Heap`.
pub trait HeapAllocable {
fn with_addr(addr: u32) -> Self;
}
/// Tantivy's custom `Heap`.
pub struct Heap {
inner: UnsafeCell<InnerHeap>,
}
#[cfg_attr(feature = "cargo-clippy", allow(mut_from_ref))]
impl Heap {
/// Creates a new heap with a given capacity
pub fn with_capacity(num_bytes: usize) -> Heap {
Heap {
inner: UnsafeCell::new(InnerHeap::with_capacity(num_bytes)),
}
}
fn inner(&self) -> &mut InnerHeap {
unsafe { &mut *self.inner.get() }
}
/// Clears the heap. All the underlying data is lost.
///
/// This heap does not support deallocation.
/// This method is the only way to free memory.
pub fn clear(&self) {
self.inner().clear();
}
/// Return amount of free space, in bytes.
pub fn num_free_bytes(&self) -> u32 {
self.inner().num_free_bytes()
}
/// Allocate a given amount of space and returns an address
/// in the Heap.
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
self.inner().allocate_space(num_bytes)
}
/// Allocate an object in the heap
pub fn allocate_object<V: HeapAllocable>(&self) -> (u32, &mut V) {
let addr = self.inner().allocate_space(mem::size_of::<V>());
let v: V = V::with_addr(addr);
self.inner().set(addr, &v);
(addr, self.inner().get_mut_ref(addr))
}
/// Stores a `&[u8]` in the heap and returns the destination BytesRef.
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
self.inner().allocate_and_set(data)
}
/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
/// given as argument.
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
self.inner().get_slice(bytes_ref)
}
/// Stores an item's data in the heap, at the given `address`.
pub fn set<Item>(&self, addr: u32, val: &Item) {
self.inner().set(addr, val);
}
/// Returns a mutable reference for an object at a given Item.
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
}
/// Returns a mutable reference to an `Item` at a given `addr`.
#[cfg(test)]
pub fn get_ref<Item>(&self, addr: u32) -> &mut Item {
self.get_mut_ref(addr)
}
}
struct InnerHeap {
buffer: Vec<u8>,
buffer_len: u32,
used: u32,
next_heap: Option<Box<InnerHeap>>,
}
impl InnerHeap {
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
let buffer: Vec<u8> = vec![0u8; num_bytes];
InnerHeap {
buffer,
buffer_len: num_bytes as u32,
next_heap: None,
used: 0u32,
}
}
pub fn clear(&mut self) {
self.used = 0u32;
self.next_heap = None;
}
// Returns the number of free bytes. If the buffer
// has reached its capacity and overflowed to another buffer, returns 0.
pub fn num_free_bytes(&self) -> u32 {
if self.next_heap.is_some() {
0u32
} else {
self.buffer_len - self.used
}
}
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
let addr = self.used;
self.used += num_bytes as u32;
if self.used <= self.buffer_len {
addr
} else {
if self.next_heap.is_none() {
info!(
r#"Exceeded heap size. The segment will be committed right
after indexing this document."#,
);
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
}
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
}
}
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
let start = bytes_ref.0;
if start >= self.buffer_len {
self.next_heap
.as_ref()
.unwrap()
.get_slice(BytesRef(start - self.buffer_len))
} else {
let start = start as usize;
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
&self.buffer[start + 2..start + 2 + len]
}
}
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
if start >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
} else {
&mut self.buffer[start as usize..stop as usize]
}
}
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
assert!(data.len() < u16::max_value() as usize);
let total_len = 2 + data.len();
let start = self.allocate_space(total_len);
let total_buff = self.get_mut_slice(start, start + total_len as u32);
NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16);
total_buff[2..].clone_from_slice(data);
BytesRef(start)
}
fn get_mut(&mut self, addr: u32) -> *mut u8 {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut(addr - self.buffer_len)
} else {
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
}
}
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_ref(addr - self.buffer_len)
} else {
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
unsafe { &mut *v_ptr }
}
}
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.set(addr - self.buffer_len, val);
} else {
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
unsafe {
let dest_ptr: *mut u8 = self.get_mut(addr);
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
}
}
}
}

View File

@@ -1,43 +0,0 @@
mod expull;
pub(crate) mod hashmap;
mod heap;
pub use self::expull::ExpUnrolledLinkedList;
pub use self::hashmap::TermHashMap;
pub use self::heap::{Heap, HeapAllocable};
#[test]
fn test_unrolled_linked_list() {
use std::collections;
let heap = Heap::with_capacity(30_000_000);
{
heap.clear();
let mut ks: Vec<usize> = (1..5).map(|k| k * 100).collect();
ks.push(2);
ks.push(3);
for k in (1..5).map(|k| k * 100) {
let mut hashmap: TermHashMap = TermHashMap::new(10, &heap);
for j in 0..k {
for i in 0..500 {
let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string()).1;
v.push(i * j, &heap);
}
}
let mut map_addr: collections::HashMap<Vec<u8>, u32> = collections::HashMap::new();
for (key, addr, _) in hashmap.iter() {
map_addr.insert(Vec::from(key), addr);
}
for i in 0..500 {
let key: String = i.to_string();
let addr: u32 = *map_addr.get(key.as_bytes()).unwrap();
let exp_pull: &ExpUnrolledLinkedList = heap.get_ref(addr);
let mut it = exp_pull.iter(addr, &heap);
for j in 0..k {
assert_eq!(it.next().unwrap(), i * j);
}
assert!(!it.next().is_some());
}
}
}
}

View File

@@ -173,9 +173,6 @@ pub enum DeleteError {
/// Any kind of IO error that happens when
/// interacting with the underlying IO device.
IOError(IOError),
/// The file may not be deleted because it is
/// protected.
FileProtected(PathBuf),
}
impl From<IOError> for DeleteError {
@@ -190,9 +187,6 @@ impl fmt::Display for DeleteError {
DeleteError::FileDoesNotExist(ref path) => {
write!(f, "the file '{:?}' does not exist", path)
}
DeleteError::FileProtected(ref path) => {
write!(f, "the file '{:?}' is protected and can't be deleted", path)
}
DeleteError::IOError(ref err) => {
write!(f, "an io error occurred while deleting a file: '{}'", err)
}
@@ -207,7 +201,7 @@ impl StdError for DeleteError {
fn cause(&self) -> Option<&StdError> {
match *self {
DeleteError::FileDoesNotExist(_) | DeleteError::FileProtected(_) => None,
DeleteError::FileDoesNotExist(_) => None,
DeleteError::IOError(ref err) => Some(err),
}
}

View File

@@ -3,9 +3,7 @@ use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use error::{ErrorKind, Result, ResultExt};
use serde_json;
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt;
use std::io;
use std::io::Write;
use std::path::{Path, PathBuf};
@@ -32,37 +30,6 @@ pub struct ManagedDirectory {
#[derive(Debug, Default)]
struct MetaInformation {
managed_paths: HashSet<PathBuf>,
protected_files: HashMap<PathBuf, usize>,
}
/// A `FileProtection` prevents the garbage collection of a file.
///
/// See `ManagedDirectory.protect_file_from_delete`.
pub struct FileProtection {
directory: ManagedDirectory,
path: PathBuf,
}
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
let mut meta_informations_wlock = directory
.meta_informations
.write()
.expect("Managed file lock poisoned");
if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) {
(*counter_ref_mut) -= 1;
}
}
impl fmt::Debug for FileProtection {
fn fmt(&self, formatter: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
write!(formatter, "FileProtectionFor({:?})", self.path)
}
}
impl Drop for FileProtection {
fn drop(&mut self) {
unprotect_file_from_delete(&self.directory, &*self.path);
}
}
/// Saves the file containing the list of existing files
@@ -89,7 +56,6 @@ impl ManagedDirectory {
directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
protected_files: HashMap::default(),
})),
})
}
@@ -158,9 +124,6 @@ impl ManagedDirectory {
error!("Failed to delete {:?}", file_to_delete);
}
}
DeleteError::FileProtected(_) => {
// this is expected.
}
}
}
}
@@ -185,28 +148,6 @@ impl ManagedDirectory {
}
}
/// Protects a file from being garbage collected.
///
/// The method returns a `FileProtection` object.
/// The file will not be garbage collected as long as the
/// `FileProtection` object is kept alive.
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
let pathbuf = path.to_owned();
{
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned on protect");
*meta_informations_wlock
.protected_files
.entry(pathbuf.clone())
.or_insert(0) += 1;
}
FileProtection {
directory: self.clone(),
path: pathbuf.clone(),
}
}
/// Registers a file as managed
///
/// This method must be called before the file is
@@ -247,16 +188,6 @@ impl Directory for ManagedDirectory {
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
{
let metas_rlock = self.meta_informations
.read()
.expect("poisoned lock in managed directory meta");
if let Some(counter) = metas_rlock.protected_files.get(path) {
if *counter > 0 {
return Err(DeleteError::FileProtected(path.to_owned()));
}
}
}
self.directory.delete(path)
}
@@ -372,28 +303,4 @@ mod tests {
}
}
#[test]
#[cfg(feature = "mmap")]
fn test_managed_directory_protect() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
{
let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
managed_directory.garbage_collect(|| living_files.clone());
assert!(managed_directory.exists(*TEST_PATH1));
}
managed_directory.garbage_collect(|| living_files.clone());
assert!(!managed_directory.exists(*TEST_PATH1));
}
}

View File

@@ -25,8 +25,7 @@ pub use self::read_only_source::ReadOnlySource;
#[cfg(feature = "mmap")]
pub use self::mmap_directory::MmapDirectory;
pub(crate) use self::managed_directory::{FileProtection, ManagedDirectory};
pub(crate) use self::read_only_source::SourceRead;
pub(crate) use self::managed_directory::ManagedDirectory;
/// Synonym of Seek + Write
pub trait SeekableWrite: Seek + Write {}

View File

@@ -3,9 +3,8 @@ use common::HasLen;
#[cfg(feature = "mmap")]
use fst::raw::MmapReadOnly;
use stable_deref_trait::{CloneStableDeref, StableDeref};
use std::io::{self, Read};
use std::ops::Deref;
use std::slice;
/// Read object that represents files in tantivy.
///
@@ -120,49 +119,3 @@ impl From<Vec<u8>> for ReadOnlySource {
ReadOnlySource::Anonymous(shared_data)
}
}
/// Acts as an owning cursor over the data backed by a `ReadOnlySource`
pub(crate) struct SourceRead {
_data_owner: ReadOnlySource,
cursor: &'static [u8],
}
impl SourceRead {
// Advance the cursor by a given number of bytes.
pub fn advance(&mut self, len: usize) {
self.cursor = &self.cursor[len..];
}
pub fn slice_from(&self, start: usize) -> &[u8] {
&self.cursor[start..]
}
pub fn get(&self, idx: usize) -> u8 {
self.cursor[idx]
}
}
impl AsRef<[u8]> for SourceRead {
fn as_ref(&self) -> &[u8] {
self.cursor
}
}
impl From<ReadOnlySource> for SourceRead {
// Creates a new `SourceRead` from a given `ReadOnlySource`
fn from(source: ReadOnlySource) -> SourceRead {
let len = source.len();
let slice_ptr = source.as_slice().as_ptr();
let static_slice = unsafe { slice::from_raw_parts(slice_ptr, len) };
SourceRead {
_data_owner: source,
cursor: static_slice,
}
}
}
impl Read for SourceRead {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
self.cursor.read(buf)
}
}

View File

@@ -9,9 +9,6 @@ use core::SegmentComponent;
use core::SegmentId;
use core::SegmentMeta;
use core::SegmentReader;
use datastruct::stacker::hashmap::split_memory;
use datastruct::stacker::Heap;
use directory::FileProtection;
use docset::DocSet;
use error::{Error, ErrorKind, Result, ResultExt};
use fastfield::write_delete_bitset;
@@ -24,6 +21,7 @@ use indexer::DirectoryLock;
use indexer::MergePolicy;
use indexer::SegmentEntry;
use indexer::SegmentWriter;
use postings::compute_table_size;
use schema::Document;
use schema::IndexRecordOption;
use schema::Term;
@@ -34,10 +32,11 @@ use std::thread::JoinHandle;
// Size of the margin for the heap. A segment is closed when the remaining memory
// in the heap goes below MARGIN_IN_BYTES.
pub const MARGIN_IN_BYTES: u32 = 1_000_000u32;
pub const MARGIN_IN_BYTES: usize = 1_000_000;
// We impose the memory per thread to be at least 3 MB.
pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32;
pub const HEAP_SIZE_MIN: usize = ((MARGIN_IN_BYTES as u32) * 3u32) as usize;
pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
// Add document will block if the number of docs waiting in the queue to be indexed
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
@@ -46,6 +45,24 @@ const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
type DocumentSender = chan::Sender<AddOperation>;
type DocumentReceiver = chan::Receiver<AddOperation>;
/// Split the thread memory budget into
/// - the heap size
/// - the hash table "table" itself.
///
/// Returns (the heap size in bytes, the hash table size in number of bits)
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
let table_size_limit: usize = per_thread_memory_budget / 3;
(1..)
.into_iter()
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
.last()
.expect(&format!(
"Per thread memory is too small: {}",
per_thread_memory_budget
))
.min(19) // we cap it at 512K
}
/// `IndexWriter` is the user entry-point to add documents to an index.
///
/// It manages a small number of indexing threads, as well as a shared
@@ -100,11 +117,16 @@ pub fn open_index_writer(
heap_size_in_bytes_per_thread: usize,
directory_lock: DirectoryLock,
) -> Result<IndexWriter> {
if heap_size_in_bytes_per_thread < HEAP_SIZE_LIMIT as usize {
panic!(format!(
"The heap size per thread needs to be at least {}.",
HEAP_SIZE_LIMIT
));
}
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
let err_msg = format!(
"The heap size per thread needs to be at least {}.",
HEAP_SIZE_MIN
);
bail!(ErrorKind::InvalidArgument(err_msg));
}
if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX {
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
bail!(ErrorKind::InvalidArgument(err_msg));
}
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
@@ -193,15 +215,13 @@ pub fn advance_deletes(
mut segment: Segment,
segment_entry: &mut SegmentEntry,
target_opstamp: u64,
) -> Result<Option<FileProtection>> {
) -> Result<()> {
let mut file_protect: Option<FileProtection> = None;
{
if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
// We are already up-to-date here.
if target_opstamp == previous_opstamp {
return Ok(file_protect);
}
}
if segment_entry.meta().delete_opstamp() == Some(target_opstamp) {
// We are already up-to-date here.
return Ok(());
}
let segment_reader = SegmentReader::open(&segment)?;
let max_doc = segment_reader.max_doc();
@@ -220,6 +240,7 @@ pub fn advance_deletes(
target_opstamp,
)?;
// TODO optimize
for doc in 0u32..max_doc {
if segment_reader.is_deleted(doc) {
delete_bitset.insert(doc as usize);
@@ -228,54 +249,39 @@ pub fn advance_deletes(
let num_deleted_docs = delete_bitset.len();
if num_deleted_docs > 0 {
segment.set_delete_meta(num_deleted_docs as u32, target_opstamp);
file_protect = Some(segment.protect_from_delete(SegmentComponent::DELETE));
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
write_delete_bitset(&delete_bitset, &mut delete_file)?;
}
}
segment_entry.set_meta(segment.meta().clone());
Ok(file_protect)
segment_entry.set_meta((*segment.meta()).clone());
Ok(())
}
fn index_documents(
heap: &mut Heap,
table_size: usize,
memory_budget: usize,
segment: &Segment,
generation: usize,
document_iterator: &mut Iterator<Item = AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> Result<bool> {
heap.clear();
let schema = segment.schema();
let segment_id = segment.id();
let mut segment_writer =
SegmentWriter::for_segment(heap, table_size, segment.clone(), &schema)?;
let table_size = initial_table_size(memory_budget);
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
for doc in document_iterator {
segment_writer.add_document(doc, &schema)?;
// There are two possible conditions to close the segment.
// One is that the memory arena dedicated to the segment is
// getting full.
if segment_writer.is_buffer_full() {
let mem_usage = segment_writer.mem_usage();
if mem_usage >= memory_budget - MARGIN_IN_BYTES {
info!(
"Buffer limit reached, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
break;
}
// The second is that the term dictionary hash table
// is reaching saturation.
//
// Tantivy does not resize its hashtable. When it reaches
// capacity, we just stop indexing new documents.
if segment_writer.is_term_saturated() {
info!(
"Term dic saturated, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
break;
}
}
if !segment_updater.is_alive() {
@@ -290,8 +296,7 @@ fn index_documents(
let doc_opstamps: Vec<u64> = segment_writer.finalize()?;
let mut segment_meta = SegmentMeta::new(segment_id);
segment_meta.set_max_doc(num_docs);
let segment_meta = SegmentMeta::new(segment_id, num_docs);
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
@@ -367,14 +372,12 @@ impl IndexWriter {
fn add_indexing_worker(&mut self) -> Result<()> {
let document_receiver_clone = self.document_receiver.clone();
let mut segment_updater = self.segment_updater.clone();
let (heap_size, table_size) = split_memory(self.heap_size_in_bytes_per_thread);
info!("heap size {}, table_size {}", heap_size, table_size);
let mut heap = Heap::with_capacity(heap_size);
let generation = self.generation;
let mut delete_cursor = self.delete_queue.cursor();
let mem_budget = self.heap_size_in_bytes_per_thread;
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
.name(format!(
"indexing thread {} for gen {}",
@@ -402,8 +405,7 @@ impl IndexWriter {
}
let segment = segment_updater.new_segment();
index_documents(
&mut heap,
table_size,
mem_budget,
&segment,
generation,
&mut document_iterator,
@@ -441,7 +443,9 @@ impl IndexWriter {
}
/// Merges a given list of segments
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Receiver<SegmentMeta> {
///
/// `segment_ids` is required to be non-empty.
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
self.segment_updater.start_merge(segment_ids)
}
@@ -637,6 +641,7 @@ impl IndexWriter {
#[cfg(test)]
mod tests {
use super::initial_table_size;
use env_logger;
use error::*;
use indexer::NoMergePolicy;
@@ -699,7 +704,7 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(3, 40_000_000).unwrap();
let mut index_writer = index.writer(3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
index_writer.rollback().unwrap();
@@ -732,7 +737,7 @@ mod tests {
};
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
let mut index_writer = index.writer(12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
let mut doc = Document::default();
@@ -766,7 +771,7 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
let mut index_writer = index.writer(12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
@@ -801,7 +806,7 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
@@ -831,4 +836,12 @@ mod tests {
assert_eq!(num_docs_containing("b"), 100);
}
#[test]
fn test_hashmap_size() {
assert_eq!(initial_table_size(100_000), 12);
assert_eq!(initial_table_size(1_000_000), 15);
assert_eq!(initial_table_size(10_000_000), 18);
assert_eq!(initial_table_size(1_000_000_000), 19);
}
}

View File

@@ -116,15 +116,17 @@ mod tests {
assert!(result_list.is_empty());
}
fn seg_meta(num_docs: u32) -> SegmentMeta {
let mut segment_metas = SegmentMeta::new(SegmentId::generate_random());
segment_metas.set_max_doc(num_docs);
segment_metas
fn create_random_segment_meta(num_docs: u32) -> SegmentMeta {
SegmentMeta::new(SegmentId::generate_random(), num_docs)
}
#[test]
fn test_log_merge_policy_pair() {
let test_input = vec![seg_meta(10), seg_meta(10), seg_meta(10)];
let test_input = vec![
create_random_segment_meta(10),
create_random_segment_meta(10),
create_random_segment_meta(10),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 1);
}
@@ -137,17 +139,17 @@ mod tests {
// * one with the 3 * 1000-docs segments
// no MergeCandidate expected for the 2 * 10_000-docs segments as min_merge_size=3
let test_input = vec![
seg_meta(10),
seg_meta(10),
seg_meta(10),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000),
seg_meta(10000),
seg_meta(10000),
seg_meta(10),
seg_meta(10),
seg_meta(10),
create_random_segment_meta(10),
create_random_segment_meta(10),
create_random_segment_meta(10),
create_random_segment_meta(1000),
create_random_segment_meta(1000),
create_random_segment_meta(1000),
create_random_segment_meta(10000),
create_random_segment_meta(10000),
create_random_segment_meta(10),
create_random_segment_meta(10),
create_random_segment_meta(10),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
@@ -157,12 +159,12 @@ mod tests {
fn test_log_merge_policy_within_levels() {
// multiple levels all get merged correctly
let test_input = vec![
seg_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75)
seg_meta(11), // log2(11) = ~3.46
seg_meta(12), // log2(12) = ~3.58
seg_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75)
seg_meta(1000), // log2(1000) = ~9.97
seg_meta(1000),
create_random_segment_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75)
create_random_segment_meta(11), // log2(11) = ~3.46
create_random_segment_meta(12), // log2(12) = ~3.58
create_random_segment_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75)
create_random_segment_meta(1000), // log2(1000) = ~9.97
create_random_segment_meta(1000),
]; // log2(1000) = ~9.97
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
@@ -171,12 +173,12 @@ mod tests {
fn test_log_merge_policy_small_segments() {
// segments under min_layer_size are merged together
let test_input = vec![
seg_meta(1),
seg_meta(1),
seg_meta(1),
seg_meta(2),
seg_meta(2),
seg_meta(2),
create_random_segment_meta(1),
create_random_segment_meta(1),
create_random_segment_meta(1),
create_random_segment_meta(2),
create_random_segment_meta(2),
create_random_segment_meta(2),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 1);

View File

@@ -683,7 +683,7 @@ mod tests {
};
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
// writing the segment
{
@@ -733,9 +733,10 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
@@ -979,6 +980,7 @@ mod tests {
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
@@ -1075,6 +1077,7 @@ mod tests {
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
@@ -1128,6 +1131,7 @@ mod tests {
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
@@ -1138,126 +1142,126 @@ mod tests {
}
}
#[test]
fn test_merge_facets() {
let mut schema_builder = schema::SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet");
let index = Index::create_in_ram(schema_builder.build());
use schema::Facet;
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default();
for facet in doc_facets {
doc.add_facet(facet_field, Facet::from(facet));
}
index_writer.add_document(doc);
};
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
index_doc(&mut index_writer, &["/top/a", "/top/b"]);
index_doc(&mut index_writer, &["/top/a"]);
index_doc(&mut index_writer, &["/top/b", "/top/d"]);
index_doc(&mut index_writer, &["/top/d"]);
index_doc(&mut index_writer, &["/top/e"]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/a"]);
index_doc(&mut index_writer, &["/top/b"]);
index_doc(&mut index_writer, &["/top/c"]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
index_writer.commit().expect("committed");
}
index.load_searchers().unwrap();
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top"));
use collector::{CountCollector, MultiCollector};
let mut count_collector = CountCollector::default();
{
let mut multi_collectors =
MultiCollector::from(vec![&mut count_collector, &mut facet_collector]);
searcher.search(&AllQuery, &mut multi_collectors).unwrap();
}
assert_eq!(count_collector.count(), expected_num_docs);
let facet_counts = facet_collector.harvest();
let facets: Vec<(String, u64)> = facet_counts
.get("/top")
.map(|(facet, count)| (facet.to_string(), count))
.collect();
assert_eq!(
facets,
expected
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
);
};
test_searcher(
11,
&[
("/top/a", 5),
("/top/b", 5),
("/top/c", 2),
("/top/d", 2),
("/top/e", 2),
("/top/f", 1),
],
);
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
index.load_searchers().unwrap();
test_searcher(
11,
&[
("/top/a", 5),
("/top/b", 5),
("/top/c", 2),
("/top/d", 2),
("/top/e", 2),
("/top/f", 1),
],
);
}
// Deleting one term
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term);
index_writer.commit().unwrap();
index.load_searchers().unwrap();
test_searcher(
9,
&[
("/top/a", 3),
("/top/b", 3),
("/top/c", 1),
("/top/d", 2),
("/top/e", 2),
("/top/f", 1),
],
);
}
}
#[test]
fn test_merge_multivalued_int_fields_all_deleted() {
@@ -1290,6 +1294,7 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
@@ -1392,6 +1397,7 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();

View File

@@ -2,6 +2,8 @@ use super::segment_register::SegmentRegister;
use core::SegmentId;
use core::SegmentMeta;
use core::{LOCKFILE_FILEPATH, META_FILEPATH};
use error::ErrorKind;
use error::Result as TantivyResult;
use indexer::delete_queue::DeleteCursor;
use indexer::SegmentEntry;
use std::collections::hash_set::HashSet;
@@ -64,8 +66,9 @@ impl SegmentManager {
/// Returns all of the segment entries (committed or uncommitted)
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
let mut segment_entries = self.read().uncommitted.segment_entries();
segment_entries.extend(self.read().committed.segment_entries());
let registers_lock = self.read();
let mut segment_entries = registers_lock.uncommitted.segment_entries();
segment_entries.extend(registers_lock.committed.segment_entries());
segment_entries
}
@@ -76,32 +79,15 @@ impl SegmentManager {
}
pub fn list_files(&self) -> HashSet<PathBuf> {
let registers_lock = self.read();
let mut files = HashSet::new();
files.insert(META_FILEPATH.clone());
files.insert(LOCKFILE_FILEPATH.clone());
let segment_metas: Vec<SegmentMeta> = registers_lock
.committed
.get_all_segments()
.into_iter()
.chain(registers_lock.uncommitted.get_all_segments().into_iter())
.chain(registers_lock.writing.iter().cloned().map(SegmentMeta::new))
.collect();
for segment_meta in segment_metas {
for segment_meta in SegmentMeta::all() {
files.extend(segment_meta.list_files());
}
files
}
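`SegmentMeta::all()` above relies on the new census-backed inventory: every live `SegmentMeta` registers itself on creation and disappears from the listing when it is dropped, so `list_files` no longer needs to walk each register. A rough sketch of the inventory idea (the census 0.1 API shown here is assumed, not verified):

use census::{Inventory, TrackedObject};

let inventory: Inventory<String> = Inventory::new();
let seg_a: TrackedObject<String> = inventory.track("seg-a".to_string());
// `list()` returns every object still alive; dropping the handle removes it.
assert_eq!(inventory.list().len(), 1);
drop(seg_a);
assert_eq!(inventory.list().len(), 0);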
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
let registers = self.read();
registers
.committed
.segment_entry(segment_id)
.or_else(|| registers.uncommitted.segment_entry(segment_id))
}
// Lock poisoning should never happen:
// The lock is acquired and released within this class,
// and the operations cannot panic.
@@ -126,19 +112,38 @@ impl SegmentManager {
}
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) {
/// Marks a list of segments as being merged.
///
/// Returns an error if some segments are missing, or if
/// the `segment_ids` are not all committed or all
/// uncommitted.
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
let mut registers_lock = self.write();
let mut segment_entries = vec![];
if registers_lock.uncommitted.contains_all(segment_ids) {
for segment_id in segment_ids {
registers_lock.uncommitted.start_merge(segment_id);
let segment_entry = registers_lock.uncommitted
.start_merge(segment_id)
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
segment_entries.push(segment_entry);
}
} else if registers_lock.committed.contains_all(segment_ids) {
for segment_id in segment_ids {
let segment_entry = registers_lock.committed
.start_merge(segment_id)
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
segment_entries.push(segment_entry);
}
for segment_id in segment_ids {
registers_lock.committed.start_merge(segment_id);
}
} else {
error!("Merge operation sent for segments that are not all uncommited or commited.");
let error_msg = "Merge operation sent for segments that are not \
all uncommited or commited."
.to_string();
bail!(ErrorKind::InvalidArgument(error_msg))
}
Ok(segment_entries)
}
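With the new signature, a caller observes the mixed committed/uncommitted case as a regular error instead of a silent no-op. A hypothetical call site (the `segment_manager` and `segment_ids` bindings are assumed to be in scope):

match segment_manager.start_merge(&segment_ids) {
    Ok(segment_entries) => {
        // every requested segment is now flagged as merging
        assert_eq!(segment_entries.len(), segment_ids.len());
    }
    Err(err) => warn!("could not start merge: {:?}", err),
}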
pub fn cancel_merge(

View File

@@ -3,8 +3,7 @@ use core::SegmentMeta;
use indexer::delete_queue::DeleteCursor;
use indexer::segment_entry::SegmentEntry;
use std::collections::HashMap;
use std::fmt;
use std::fmt::{Debug, Formatter};
use std::fmt::{self, Debug, Formatter};
/// The segment register keeps track
/// of the list of segments, their sizes, as well
@@ -39,13 +38,6 @@ impl SegmentRegister {
self.segment_states.len()
}
pub fn get_all_segments(&self) -> Vec<SegmentMeta> {
self.segment_states
.values()
.map(|segment_entry| segment_entry.meta().clone())
.collect()
}
pub fn get_mergeable_segments(&self) -> Vec<SegmentMeta> {
self.segment_states
.values()
@@ -67,10 +59,6 @@ impl SegmentRegister {
segment_ids
}
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
self.segment_states.get(segment_id).cloned()
}
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
segment_ids
.iter()
@@ -93,11 +81,13 @@ impl SegmentRegister {
.cancel_merge();
}
pub fn start_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.start_merge();
pub fn start_merge(&mut self, segment_id: &SegmentId) -> Option<SegmentEntry> {
if let Some(segment_entry) = self.segment_states.get_mut(segment_id) {
segment_entry.start_merge();
Some(segment_entry.clone())
} else {
None
}
}
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister {
@@ -109,6 +99,11 @@ impl SegmentRegister {
}
SegmentRegister { segment_states }
}
#[cfg(test)]
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
self.segment_states.get(segment_id).cloned()
}
}
#[cfg(test)]
@@ -137,7 +132,7 @@ mod tests {
let segment_id_merged = SegmentId::generate_random();
{
let segment_meta = SegmentMeta::new(segment_id_a);
let segment_meta = SegmentMeta::new(segment_id_a, 0u32);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
@@ -150,7 +145,7 @@ mod tests {
);
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
{
let segment_meta = SegmentMeta::new(segment_id_b);
let segment_meta = SegmentMeta::new(segment_id_b, 0u32);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
@@ -180,7 +175,7 @@ mod tests {
segment_register.remove_segment(&segment_id_a);
segment_register.remove_segment(&segment_id_b);
{
let segment_meta_merged = SegmentMeta::new(segment_id_merged);
let segment_meta_merged = SegmentMeta::new(segment_id_merged, 0u32);
let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}

View File

@@ -7,11 +7,11 @@ use core::SegmentMeta;
use core::SerializableSegment;
use core::META_FILEPATH;
use directory::Directory;
use directory::FileProtection;
use error::{Error, ErrorKind, Result};
use error::{Error, ErrorKind, Result, ResultExt};
use futures::oneshot;
use futures::sync::oneshot::Receiver;
use futures::Future;
use futures_cpupool::Builder as CpuPoolBuilder;
use futures_cpupool::CpuFuture;
use futures_cpupool::CpuPool;
use indexer::delete_queue::DeleteCursor;
@@ -29,8 +29,7 @@ use std::collections::HashMap;
use std::io::Write;
use std::mem;
use std::ops::DerefMut;
use std::sync::atomic::Ordering;
use std::sync::atomic::{AtomicBool, AtomicUsize};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::Arc;
use std::sync::RwLock;
use std::thread;
@@ -87,38 +86,19 @@ pub fn save_metas(
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
fn perform_merge(
segment_ids: &[SegmentId],
segment_updater: &SegmentUpdater,
index: &Index,
mut segment_entries: Vec<SegmentEntry>,
mut merged_segment: Segment,
target_opstamp: u64,
) -> Result<SegmentEntry> {
// first we need to apply deletes to our segment.
info!("Start merge: {:?}", segment_ids);
let index = &segment_updater.0.index;
// TODO add logging
let schema = index.schema();
let mut segment_entries = vec![];
let mut file_protections: Vec<FileProtection> = vec![];
for segment_id in segment_ids {
if let Some(mut segment_entry) = segment_updater.0.segment_manager.segment_entry(segment_id)
{
let segment = index.segment(segment_entry.meta().clone());
if let Some(file_protection) =
advance_deletes(segment, &mut segment_entry, target_opstamp)?
{
file_protections.push(file_protection);
}
segment_entries.push(segment_entry);
} else {
error!("Error, had to abort merge as some of the segment is not managed anymore.");
let msg = format!(
"Segment {:?} requested for merge is not managed.",
segment_id
);
bail!(ErrorKind::InvalidArgument(msg));
}
for segment_entry in &mut segment_entries {
let segment = index.segment(segment_entry.meta().clone());
advance_deletes(segment, segment_entry, target_opstamp)?;
}
let delete_cursor = segment_entries[0].delete_cursor().clone();
@@ -135,13 +115,13 @@ fn perform_merge(
// to merge the two segments.
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)
.expect("Creating index serializer failed");
.chain_err(|| "Creating index serializer failed")?;
let num_docs = merger
.write(segment_serializer)
.expect("Serializing merged index failed");
let mut segment_meta = SegmentMeta::new(merged_segment.id());
segment_meta.set_max_doc(num_docs);
.chain_err(|| "Serializing merged index failed")?;
let segment_meta = SegmentMeta::new(merged_segment.id(), num_docs);
let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None);
Ok(after_merge_segment_entry)
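The `.expect(...)` calls above become `.chain_err(...)?`, so a failing serializer now aborts the merge with a contextualized error instead of panicking the merging thread. A minimal sketch of the pattern, assuming the crate's error-chain `Result`/`ResultExt` are in scope as in the imports above:

use std::io::Write;

// `Result` and `chain_err` come from the crate's error-chain setup (assumed
// in scope); the closure only runs on the error path and wraps the cause.
fn flush_segment(writer: &mut Write) -> Result<()> {
    writer
        .flush()
        .chain_err(|| "Flushing the merged segment failed")?;
    Ok(())
}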
@@ -167,8 +147,12 @@ impl SegmentUpdater {
) -> Result<SegmentUpdater> {
let segments = index.searchable_segment_metas()?;
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
let pool = CpuPoolBuilder::new()
.name_prefix("segment_updater")
.pool_size(1)
.create();
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
pool: CpuPool::new(1),
pool,
index,
segment_manager,
merge_policy: RwLock::new(Box::new(DefaultMergePolicy::default())),
@@ -283,69 +267,85 @@ impl SegmentUpdater {
}).wait()
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Receiver<SegmentMeta> {
self.0.segment_manager.start_merge(segment_ids);
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
let segment_ids_vec = segment_ids.to_vec();
self.run_async(move |segment_updater| {
segment_updater.start_merge_impl(&segment_ids_vec[..])
}).wait()?
}
// `segment_ids` is required to be non-empty.
fn start_merge_impl(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty.");
let segment_updater_clone = self.clone();
let segment_entries: Vec<SegmentEntry> = self.0.segment_manager.start_merge(segment_ids)?;
let segment_ids_vec = segment_ids.to_vec();
let merging_thread_id = self.get_merging_thread_id();
info!(
"Starting merge thread #{} - {:?}",
merging_thread_id, segment_ids
);
let (merging_future_send, merging_future_recv) = oneshot();
if segment_ids.is_empty() {
return merging_future_recv;
}
let target_opstamp = self.0.stamper.stamp();
let merging_join_handle = thread::spawn(move || {
// first we need to apply deletes to our segment.
let merged_segment = segment_updater_clone.new_segment();
let merged_segment_id = merged_segment.id();
let merge_result = perform_merge(
&segment_ids_vec,
&segment_updater_clone,
merged_segment,
target_opstamp,
);
match merge_result {
Ok(after_merge_segment_entry) => {
let merged_segment_meta = after_merge_segment_entry.meta().clone();
segment_updater_clone
.end_merge(segment_ids_vec, after_merge_segment_entry)
.expect("Segment updater thread is corrupted.");
// first we need to apply deletes to our segment.
let merging_join_handle = thread::Builder::new()
.name(format!("mergingthread-{}", merging_thread_id))
.spawn(move || {
// first we need to apply deletes to our segment.
let merged_segment = segment_updater_clone.new_segment();
let merged_segment_id = merged_segment.id();
let merge_result = perform_merge(
&segment_updater_clone.0.index,
segment_entries,
merged_segment,
target_opstamp,
);
// the future may fail if the listener of the oneshot future
// has been destroyed.
//
// This is not a problem here, so we just ignore any
// possible error.
let _merging_future_res = merging_future_send.send(merged_segment_meta);
}
Err(e) => {
error!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
match merge_result {
Ok(after_merge_segment_entry) => {
let merged_segment_meta = after_merge_segment_entry.meta().clone();
segment_updater_clone
.end_merge(segment_ids_vec, after_merge_segment_entry)
.expect("Segment updater thread is corrupted.");
// the future may fail if the listener of the oneshot future
// has been destroyed.
//
// This is not a problem here, so we just ignore any
// possible error.
let _merging_future_res = merging_future_send.send(merged_segment_meta);
}
Err(e) => {
warn!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id);
// merging_future_send will be dropped, sending an error to the future.
}
segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id);
// merging_future_send will be dropped, sending an error to the future.
}
}
segment_updater_clone
.0
.merging_threads
.write()
.unwrap()
.remove(&merging_thread_id);
Ok(())
});
segment_updater_clone
.0
.merging_threads
.write()
.unwrap()
.remove(&merging_thread_id);
Ok(())
})
.expect("Failed to spawn a thread.");
self.0
.merging_threads
.write()
.unwrap()
.insert(merging_thread_id, merging_join_handle);
merging_future_recv
Ok(merging_future_recv)
}
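`start_merge` can now fail in two distinct ways: the returned `Result` reports that the merge could not be started at all, while the oneshot `Receiver` errors only if the merging thread drops without publishing. A hedged caller-side sketch (a `segment_updater` handle is assumed):

match segment_updater.start_merge(&segment_ids) {
    Ok(receiver) => {
        // blocks until the merging thread sends the merged SegmentMeta
        let merged_meta = receiver.wait().expect("merge was cancelled");
        info!("merged into {:?}", merged_meta);
    }
    Err(err) => warn!("merge could not be started: {}", err),
}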
fn consider_merge_options(&self) {
@@ -358,8 +358,18 @@ impl SegmentUpdater {
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
for MergeCandidate(segment_metas) in merge_candidates {
if let Err(e) = self.start_merge(&segment_metas).fuse().poll() {
error!("The merge task failed quickly after starting: {:?}", e);
match self.start_merge_impl(&segment_metas) {
Ok(merge_future) => {
if let Err(e) = merge_future.fuse().poll() {
error!("The merge task failed quickly after starting: {:?}", e);
}
}
Err(err) => {
warn!(
"Starting the merge failed for the following reason. This is not fatal. {}",
err
);
}
}
}
}
@@ -382,7 +392,6 @@ impl SegmentUpdater {
self.run_async(move |segment_updater| {
info!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
let mut _file_protection_opt = None;
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater
.0
@@ -393,29 +402,22 @@ impl SegmentUpdater {
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
match advance_deletes(
segment,
&mut after_merge_segment_entry,
committed_opstamp,
) {
Ok(file_protection_opt_res) => {
_file_protection_opt = file_protection_opt_res;
}
Err(e) => {
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
before_merge_segment_ids, e
);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
segment_updater.cancel_merge(
&before_merge_segment_ids,
after_merge_segment_entry.segment_id(),
);
return;
if let Err(e) =
advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp)
{
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
before_merge_segment_ids, e
);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
segment_updater.cancel_merge(
&before_merge_segment_ids,
after_merge_segment_entry.segment_id(),
);
return;
}
}
}

View File

@@ -1,10 +1,8 @@
use super::operation::AddOperation;
use core::Segment;
use core::SerializableSegment;
use datastruct::stacker::Heap;
use fastfield::FastFieldsWriter;
use fieldnorm::FieldNormsWriter;
use indexer::index_writer::MARGIN_IN_BYTES;
use indexer::segment_serializer::SegmentSerializer;
use postings::MultiFieldPostingsWriter;
use schema::FieldType;
@@ -24,10 +22,9 @@ use Result;
///
/// It creates the postings lists in anonymous memory.
/// The segment is laid out on disk when the segment gets `finalized`.
pub struct SegmentWriter<'a> {
heap: &'a Heap,
pub struct SegmentWriter {
max_doc: DocId,
multifield_postings: MultiFieldPostingsWriter<'a>,
multifield_postings: MultiFieldPostingsWriter,
segment_serializer: SegmentSerializer,
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FieldNormsWriter,
@@ -35,7 +32,7 @@ pub struct SegmentWriter<'a> {
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
}
impl<'a> SegmentWriter<'a> {
impl SegmentWriter {
/// Creates a new `SegmentWriter`
///
/// The arguments are defined as follows
@@ -46,13 +43,12 @@ impl<'a> SegmentWriter<'a> {
/// - segment: The segment being written
/// - schema
pub fn for_segment(
heap: &'a Heap,
table_bits: usize,
mut segment: Segment,
schema: &Schema,
) -> Result<SegmentWriter<'a>> {
) -> Result<SegmentWriter> {
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits);
let tokenizers = schema
.fields()
.iter()
@@ -68,7 +64,6 @@ impl<'a> SegmentWriter<'a> {
})
.collect();
Ok(SegmentWriter {
heap,
max_doc: 0,
multifield_postings,
fieldnorms_writer: FieldNormsWriter::for_schema(schema),
@@ -94,22 +89,8 @@ impl<'a> SegmentWriter<'a> {
Ok(self.doc_opstamps)
}
/// Returns true iff the segment writer's buffer has reached capacity.
///
/// The limit is defined as `the user defined heap size - an arbitrary margin of 10MB`
/// The `Segment` is `finalize`d when the buffer gets full.
///
/// Because we cannot cut through a document, the margin is there to ensure that we rarely
/// exceed the heap size.
pub fn is_buffer_full(&self) -> bool {
self.heap.num_free_bytes() <= MARGIN_IN_BYTES
}
/// Return true if the term dictionary hashmap is reaching capacity.
/// It is one of the condition that triggers a `SegmentWriter` to
/// be finalized.
pub(crate) fn is_term_saturated(&self) -> bool {
self.multifield_postings.is_term_saturated()
pub fn mem_usage(&self) -> usize {
self.multifield_postings.mem_usage()
}
/// Indexes a new document
@@ -248,7 +229,7 @@ fn write(
Ok(())
}
impl<'a> SerializableSegment for SegmentWriter<'a> {
impl SerializableSegment for SegmentWriter {
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
let max_doc = self.max_doc;
write(

View File

@@ -55,7 +55,7 @@
//!
//! // Indexing documents
//!
//! let index = Index::create(index_path, schema.clone())?;
//! let index = Index::create_in_dir(index_path, schema.clone())?;
//!
//! // Here we use a buffer of 100MB that will be split
//! // between indexing threads.
@@ -136,11 +136,11 @@ extern crate combine;
extern crate crossbeam;
extern crate fnv;
extern crate fst;
extern crate fst_regex;
extern crate futures;
extern crate futures_cpupool;
extern crate itertools;
extern crate levenshtein_automata;
extern crate lz4;
extern crate num_cpus;
extern crate owning_ref;
extern crate regex;
@@ -180,6 +180,9 @@ mod macros;
pub use error::{Error, ErrorKind, ResultExt};
extern crate census;
extern crate owned_read;
/// Tantivy result.
pub type Result<T> = std::result::Result<T, Error>;
@@ -188,8 +191,7 @@ mod compression;
mod core;
mod indexer;
mod datastruct;
#[allow(unused_doc_comment)]
#[allow(unused_doc_comments)]
mod error;
pub mod tokenizer;

View File

@@ -11,6 +11,7 @@ mod postings_writer;
mod recorder;
mod segment_postings;
mod serializer;
mod stacker;
mod term_info;
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
@@ -21,6 +22,8 @@ pub use self::term_info::TermInfo;
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
pub(crate) use self::stacker::compute_table_size;
pub use common::HasLen;
pub(crate) type UnorderedTermId = u64;
@@ -39,7 +42,6 @@ pub mod tests {
use core::Index;
use core::SegmentComponent;
use core::SegmentReader;
use datastruct::stacker::Heap;
use docset::{DocSet, SkipResult};
use fieldnorm::FieldNormReader;
use indexer::operation::AddOperation;
@@ -160,10 +162,9 @@ pub mod tests {
let index = Index::create_in_ram(schema.clone());
let segment = index.new_segment();
let heap = Heap::with_capacity(10_000_000);
{
let mut segment_writer =
SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap();
SegmentWriter::for_segment(18, segment.clone(), &schema).unwrap();
{
let mut doc = Document::default();
// checking that position works if the field has two values

View File

@@ -1,4 +1,5 @@
use datastruct::stacker::{Heap, TermHashMap};
use super::stacker::{Addr, MemoryArena, TermHashMap};
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
use postings::UnorderedTermId;
use postings::{FieldSerializer, InvertedIndexSerializer};
@@ -14,80 +15,89 @@ use tokenizer::TokenStream;
use DocId;
use Result;
fn posting_from_field_entry<'a>(
field_entry: &FieldEntry,
heap: &'a Heap,
) -> Box<PostingsWriter + 'a> {
fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
match *field_entry.field_type() {
FieldType::Str(ref text_options) => text_options
.get_indexing_options()
.map(|indexing_options| match indexing_options.index_option() {
IndexRecordOption::Basic => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
}
IndexRecordOption::WithFreqs => {
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed()
}
IndexRecordOption::WithFreqsAndPositions => {
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
}
})
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)),
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
}
FieldType::Bytes => {
// FieldType::Bytes cannot actually be indexed.
// TODO fix during the indexer refactoring described in #276
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
}
}
}
pub struct MultiFieldPostingsWriter<'a> {
heap: &'a Heap,
pub struct MultiFieldPostingsWriter {
heap: MemoryArena,
schema: Schema,
term_index: TermHashMap<'a>,
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
term_index: TermHashMap,
per_field_postings_writers: Vec<Box<PostingsWriter>>,
}
impl<'a> MultiFieldPostingsWriter<'a> {
impl MultiFieldPostingsWriter {
/// Creates a new `MultiFieldPostingsWriter` given
/// a schema and a hash table size.
pub fn new(schema: &Schema, table_bits: usize, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
let term_index = TermHashMap::new(table_bits, heap);
pub fn new(schema: &Schema, table_bits: usize) -> MultiFieldPostingsWriter {
let term_index = TermHashMap::new(table_bits);
let per_field_postings_writers: Vec<_> = schema
.fields()
.iter()
.map(|field_entry| posting_from_field_entry(field_entry, heap))
.map(|field_entry| posting_from_field_entry(field_entry))
.collect();
MultiFieldPostingsWriter {
heap: MemoryArena::new(),
schema: schema.clone(),
heap,
term_index,
per_field_postings_writers,
}
}
pub fn mem_usage(&self) -> usize {
self.term_index.mem_usage() + self.heap.mem_usage()
}
pub fn index_text(&mut self, doc: DocId, field: Field, token_stream: &mut TokenStream) -> u32 {
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
postings_writer.index_text(&mut self.term_index, doc, field, token_stream, self.heap)
postings_writer.index_text(
&mut self.term_index,
doc,
field,
token_stream,
&mut self.heap,
)
}
pub fn subscribe(&mut self, doc: DocId, term: &Term) -> UnorderedTermId {
let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, self.heap)
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, &mut self.heap)
}
/// Serialize the inverted index.
/// It pushes all terms, one field at a time, towards the
/// postings serializer.
#[allow(needless_range_loop)]
pub fn serialize(
&self,
serializer: &mut InvertedIndexSerializer,
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self.term_index
.iter()
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
.collect();
term_offsets.sort_by_key(|&(k, _, _)| k);
let mut offsets: Vec<(Field, usize)> = vec![];
@@ -142,23 +152,19 @@ impl<'a> MultiFieldPostingsWriter<'a> {
postings_writer.serialize(
&term_offsets[start..stop],
&mut field_serializer,
self.heap,
&self.term_index.heap,
&self.heap,
)?;
field_serializer.close()?;
}
Ok(unordered_term_mappings)
}
/// Return true iff the term dictionary is saturated.
pub fn is_term_saturated(&self) -> bool {
self.term_index.is_saturated()
}
}
/// The `PostingsWriter` is in charge of receiving documents
/// and building a `Segment` in anonymous memory.
///
/// `PostingsWriter` writes in a `Heap`.
/// `PostingsWriter` writes in a `MemoryArena`.
pub trait PostingsWriter {
/// Record that a document contains a term at a given position.
///
@@ -173,16 +179,17 @@ pub trait PostingsWriter {
doc: DocId,
pos: u32,
term: &Term,
heap: &Heap,
heap: &mut MemoryArena,
) -> UnorderedTermId;
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(&[u8], u32, UnorderedTermId)],
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
serializer: &mut FieldSerializer,
heap: &Heap,
term_heap: &MemoryArena,
heap: &MemoryArena,
) -> io::Result<()>;
/// Tokenizes a text and subscribes all of its tokens.
@@ -192,7 +199,7 @@ pub trait PostingsWriter {
doc_id: DocId,
field: Field,
token_stream: &mut TokenStream,
heap: &Heap,
heap: &mut MemoryArena,
) -> u32 {
let mut term = Term::for_field(field);
let num_tokens = {
@@ -210,61 +217,67 @@ pub trait PostingsWriter {
/// The `SpecializedPostingsWriter` is just here to avoid
/// dynamic dispatch on the recorder type.
pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
heap: &'a Heap,
pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
total_num_tokens: u64,
_recorder_type: PhantomData<Rec>,
}
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
/// constructor
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
pub fn new() -> SpecializedPostingsWriter<Rec> {
SpecializedPostingsWriter {
heap,
total_num_tokens: 0u64,
_recorder_type: PhantomData,
}
}
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
pub fn new_boxed() -> Box<PostingsWriter> {
Box::new(SpecializedPostingsWriter::<Rec>::new())
}
}
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec> {
fn subscribe(
&mut self,
term_index: &mut TermHashMap,
doc: DocId,
position: u32,
term: &Term,
heap: &Heap,
heap: &mut MemoryArena,
) -> UnorderedTermId {
debug_assert!(term.as_slice().len() >= 4);
let (term_ord, recorder): (UnorderedTermId, &mut Rec) = term_index.get_or_create(term);
let current_doc = recorder.current_doc();
if current_doc != doc {
if current_doc != u32::max_value() {
recorder.close_doc(heap);
}
recorder.new_doc(doc, heap);
}
self.total_num_tokens += 1;
recorder.record_position(position, heap);
term_ord
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
if opt_recorder.is_some() {
let mut recorder = opt_recorder.unwrap();
let current_doc = recorder.current_doc();
if current_doc != doc {
recorder.close_doc(heap);
recorder.new_doc(doc, heap);
}
recorder.record_position(position, heap);
recorder
} else {
let mut recorder = Rec::new(heap);
recorder.new_doc(doc, heap);
recorder.record_position(position, heap);
recorder
}
}) as UnorderedTermId
}
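The closure-based `mutate_or_create` replaces the old `get_or_create` reference API: since values now live unaligned in the arena, the map reads a copy, lets the closure produce the next value, and writes it back. A simplified sketch of the contract (value type and call shape assumed from this call site; values must be `Copy`):

// A plain u32 counter instead of a Recorder: the closure sees the
// previous value stored for the term, if any, and returns the value
// to store next; the map hands back the term's unordered id.
let term_ord = term_index.mutate_or_create(term, |opt_count: Option<u32>| {
    opt_count.map(|count| count + 1).unwrap_or(1u32)
});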
fn serialize(
&self,
term_addrs: &[(&[u8], u32, UnorderedTermId)],
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
serializer: &mut FieldSerializer,
heap: &Heap,
termdict_heap: &MemoryArena,
heap: &MemoryArena,
) -> io::Result<()> {
for &(term_bytes, addr, _) in term_addrs {
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
let recorder: Rec = unsafe { termdict_heap.read(addr) };
serializer.new_term(&term_bytes[4..])?;
recorder.serialize(addr, serializer, heap)?;
recorder.serialize(serializer, heap)?;
serializer.close_term()?;
}
Ok(())

View File

@@ -1,4 +1,4 @@
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
use postings::FieldSerializer;
use std::{self, io};
use DocId;
@@ -15,62 +15,53 @@ const POSITION_END: u32 = std::u32::MAX;
/// * the document id
/// * the term frequency
/// * the term positions
pub trait Recorder: HeapAllocable {
pub trait Recorder: Copy {
/// Creates a new recorder.
fn new(heap: &mut MemoryArena) -> Self;
/// Returns the current document
fn current_doc(&self) -> u32;
/// Starts recording information about a new document
/// This method shall only be called if the term is within the document.
fn new_doc(&mut self, doc: DocId, heap: &Heap);
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena);
/// Record the position of a term. For each document,
/// this method will be called `term_freq` times.
fn record_position(&mut self, position: u32, heap: &Heap);
fn record_position(&mut self, position: u32, heap: &mut MemoryArena);
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &Heap);
fn close_doc(&mut self, heap: &mut MemoryArena);
/// Pushes the postings information to the serializer.
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()>;
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>;
}
/// Only records the doc ids
#[derive(Clone, Copy)]
pub struct NothingRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
}
impl HeapAllocable for NothingRecorder {
fn with_addr(addr: u32) -> NothingRecorder {
impl Recorder for NothingRecorder {
fn new(heap: &mut MemoryArena) -> Self {
NothingRecorder {
stack: ExpUnrolledLinkedList::with_addr(addr),
stack: ExpUnrolledLinkedList::new(heap),
current_doc: u32::max_value(),
}
}
}
impl Recorder for NothingRecorder {
fn current_doc(&self) -> DocId {
self.current_doc
}
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc;
self.stack.push(doc, heap);
}
fn record_position(&mut self, _position: u32, _heap: &Heap) {}
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
fn close_doc(&mut self, _heap: &Heap) {}
fn close_doc(&mut self, _heap: &mut MemoryArena) {}
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
for doc in self.stack.iter(self_addr, heap) {
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
for doc in self.stack.iter(heap) {
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
}
Ok(())
@@ -78,52 +69,46 @@ impl Recorder for NothingRecorder {
}
/// Recorder encoding document ids, and term frequencies
#[derive(Clone, Copy)]
pub struct TermFrequencyRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
current_tf: u32,
}
impl HeapAllocable for TermFrequencyRecorder {
fn with_addr(addr: u32) -> TermFrequencyRecorder {
impl Recorder for TermFrequencyRecorder {
fn new(heap: &mut MemoryArena) -> Self {
TermFrequencyRecorder {
stack: ExpUnrolledLinkedList::with_addr(addr),
stack: ExpUnrolledLinkedList::new(heap),
current_doc: u32::max_value(),
current_tf: 0u32,
}
}
}
impl Recorder for TermFrequencyRecorder {
fn current_doc(&self) -> DocId {
self.current_doc
}
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc;
self.stack.push(doc, heap);
}
fn record_position(&mut self, _position: u32, _heap: &Heap) {
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
self.current_tf += 1;
}
fn close_doc(&mut self, heap: &Heap) {
fn close_doc(&mut self, heap: &mut MemoryArena) {
debug_assert!(self.current_tf > 0);
self.stack.push(self.current_tf, heap);
self.current_tf = 0;
}
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
// the last document has not been closed...
// its term freq is self.current_tf.
let mut doc_iter = self.stack
.iter(self_addr, heap)
.iter(heap)
.chain(Some(self.current_tf).into_iter());
while let Some(doc) = doc_iter.next() {
@@ -137,46 +122,40 @@ impl Recorder for TermFrequencyRecorder {
}
/// Recorder encoding term frequencies as well as positions.
#[derive(Clone, Copy)]
pub struct TFAndPositionRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
}
impl HeapAllocable for TFAndPositionRecorder {
fn with_addr(addr: u32) -> TFAndPositionRecorder {
impl Recorder for TFAndPositionRecorder {
fn new(heap: &mut MemoryArena) -> Self {
TFAndPositionRecorder {
stack: ExpUnrolledLinkedList::with_addr(addr),
stack: ExpUnrolledLinkedList::new(heap),
current_doc: u32::max_value(),
}
}
}
impl Recorder for TFAndPositionRecorder {
fn current_doc(&self) -> DocId {
self.current_doc
}
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc;
self.stack.push(doc, heap);
}
fn record_position(&mut self, position: u32, heap: &Heap) {
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
self.stack.push(position, heap);
}
fn close_doc(&mut self, heap: &Heap) {
fn close_doc(&mut self, heap: &mut MemoryArena) {
self.stack.push(POSITION_END, heap);
}
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
let mut doc_positions = Vec::with_capacity(100);
let mut positions_iter = self.stack.iter(self_addr, heap);
let mut positions_iter = self.stack.iter(heap);
while let Some(doc) = positions_iter.next() {
let mut prev_position = 0;
doc_positions.clear();

View File

@@ -2,15 +2,14 @@ use compression::{BlockDecoder, CompressedIntStream, VIntDecoder, COMPRESSION_BL
use DocId;
use common::BitSet;
use common::CountingWriter;
use common::HasLen;
use compression::compressed_block_size;
use directory::{ReadOnlySource, SourceRead};
use docset::{DocSet, SkipResult};
use fst::Streamer;
use postings::serializer::PostingsSerializer;
use postings::FreqReadingOption;
use postings::Postings;
use owned_read::OwnedRead;
struct PositionComputer {
// store the amount of position int
@@ -78,9 +77,9 @@ impl SegmentPostings {
/// and returns a `SegmentPostings` object that embeds a
/// buffer with the serialized data.
pub fn create_from_docs(docs: &[u32]) -> SegmentPostings {
let mut counting_writer = CountingWriter::wrap(Vec::new());
let mut buffer = Vec::new();
{
let mut postings_serializer = PostingsSerializer::new(&mut counting_writer, false);
let mut postings_serializer = PostingsSerializer::new(&mut buffer, false);
for &doc in docs {
postings_serializer.write_doc(doc, 1u32).unwrap();
}
@@ -88,13 +87,9 @@ impl SegmentPostings {
.close_term()
.expect("In memory Serialization should never fail.");
}
let (buffer, _) = counting_writer
.finish()
.expect("Serializing in a buffer should never fail.");
let data = ReadOnlySource::from(buffer);
let block_segment_postings = BlockSegmentPostings::from_data(
docs.len(),
SourceRead::from(data),
OwnedRead::new(buffer),
FreqReadingOption::NoFreq,
);
SegmentPostings::from_block_postings(block_segment_postings, None)
@@ -306,13 +301,13 @@ pub struct BlockSegmentPostings {
doc_offset: DocId,
num_bitpacked_blocks: usize,
num_vint_docs: usize,
remaining_data: SourceRead,
remaining_data: OwnedRead,
}
impl BlockSegmentPostings {
pub(crate) fn from_data(
doc_freq: usize,
data: SourceRead,
data: OwnedRead,
freq_reading_option: FreqReadingOption,
) -> BlockSegmentPostings {
let num_bitpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE;
@@ -339,7 +334,7 @@ impl BlockSegmentPostings {
// # Warning
//
// This does not reset the positions list.
pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: SourceRead) {
pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: OwnedRead) {
let num_bitpacked_blocks: usize = doc_freq / COMPRESSION_BLOCK_SIZE;
let num_vint_docs = doc_freq & (COMPRESSION_BLOCK_SIZE - 1);
self.num_bitpacked_blocks = num_bitpacked_blocks;
@@ -449,7 +444,7 @@ impl BlockSegmentPostings {
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option: FreqReadingOption::NoFreq,
remaining_data: From::from(ReadOnlySource::empty()),
remaining_data: OwnedRead::new(vec![]),
doc_offset: 0,
doc_freq: 0,
}

View File

@@ -239,13 +239,60 @@ impl<'a> FieldSerializer<'a> {
}
}
struct Block {
doc_ids: [DocId; COMPRESSION_BLOCK_SIZE],
term_freqs: [u32; COMPRESSION_BLOCK_SIZE],
len: usize
}
impl Block {
fn new() -> Self {
Block {
doc_ids: [0u32; COMPRESSION_BLOCK_SIZE],
term_freqs: [0u32; COMPRESSION_BLOCK_SIZE],
len: 0
}
}
fn doc_ids(&self) -> &[DocId] {
&self.doc_ids[..self.len]
}
fn term_freqs(&self) -> &[u32] {
&self.term_freqs[..self.len]
}
fn clear(&mut self) {
self.len = 0;
}
fn append_doc(&mut self, doc: DocId, term_freq: u32) {
let len = self.len;
self.doc_ids[len] = doc;
self.term_freqs[len] = term_freq;
self.len = len + 1;
}
fn is_full(&self) -> bool {
self.len == COMPRESSION_BLOCK_SIZE
}
fn is_empty(&self) -> bool {
self.len == 0
}
fn last_doc(&self) -> DocId {
assert_eq!(self.len, COMPRESSION_BLOCK_SIZE);
self.doc_ids[COMPRESSION_BLOCK_SIZE - 1]
}
}
pub struct PostingsSerializer<W: Write> {
postings_write: CountingWriter<W>,
last_doc_id_encoded: u32,
block_encoder: BlockEncoder,
doc_ids: Vec<DocId>,
term_freqs: Vec<u32>,
block: Box<Block>,
termfreq_enabled: bool,
}
@@ -256,41 +303,41 @@ impl<W: Write> PostingsSerializer<W> {
postings_write: CountingWriter::wrap(write),
block_encoder: BlockEncoder::new(),
doc_ids: vec![],
term_freqs: vec![],
block: Box::new(Block::new()),
last_doc_id_encoded: 0u32,
termfreq_enabled,
}
}
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
self.doc_ids.push(doc_id);
if self.termfreq_enabled {
self.term_freqs.push(term_freq as u32);
fn write_block(&mut self) -> io::Result<()> {
{
// encode the doc ids
let block_encoded: &[u8] = self.block_encoder
.compress_block_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
self.last_doc_id_encoded = self.block.last_doc();
self.postings_write.write_all(block_encoded)?;
}
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
{
// encode the doc ids
let block_encoded: &[u8] = self.block_encoder
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
self.postings_write.write_all(block_encoded)?;
}
if self.termfreq_enabled {
// encode the term_freqs
let block_encoded: &[u8] =
self.block_encoder.compress_block_unsorted(&self.term_freqs);
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
self.doc_ids.clear();
if self.termfreq_enabled {
// encode the term_freqs
let block_encoded: &[u8] =
self.block_encoder.compress_block_unsorted(&self.block.term_freqs());
self.postings_write.write_all(block_encoded)?;
}
self.block.clear();
Ok(())
}
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
self.block.append_doc(doc_id, term_freq);
if self.block.is_full() {
self.write_block()?;
}
Ok(())
}
pub fn close_term(&mut self) -> io::Result<()> {
if !self.doc_ids.is_empty() {
if !self.block.is_empty() {
// we have doc ids waiting to be written
// this happens when the number of doc ids is
// not a perfect multiple of our block size.
@@ -299,17 +346,16 @@ impl<W: Write> PostingsSerializer<W> {
// using variable int encoding.
{
let block_encoded = self.block_encoder
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
.compress_vint_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
self.doc_ids.clear();
}
// ... Idem for term frequencies
if self.termfreq_enabled {
let block_encoded = self.block_encoder
.compress_vint_unsorted(&self.term_freqs[..]);
.compress_vint_unsorted(self.block.term_freqs());
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
self.block.clear();
}
Ok(())
}
@@ -323,8 +369,7 @@ impl<W: Write> PostingsSerializer<W> {
}
fn clear(&mut self) {
self.doc_ids.clear();
self.term_freqs.clear();
self.block.clear();
self.last_doc_id_encoded = 0;
}
}
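Usage-wise the serializer is unchanged; only the buffering moved into the fixed-size `Block`. A hedged sketch (assuming the usual `COMPRESSION_BLOCK_SIZE` of 128): full blocks are bit-packed as they fill, and `close_term` vint-encodes the partially filled tail.

let mut buffer: Vec<u8> = Vec::new();
{
    let mut serializer = PostingsSerializer::new(&mut buffer, true);
    for doc in 0u32..300 {
        // full blocks are flushed automatically at docs 128 and 256
        serializer.write_doc(doc, 1u32).unwrap();
    }
    // the remaining 44 docs are written vint-encoded
    serializer.close_term().unwrap();
}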

View File

@@ -0,0 +1,218 @@
use super::{Addr, MemoryArena};
use common::is_power_of_2;
use std::mem;
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
const FIRST_BLOCK: u32 = 4u32;
#[inline]
pub fn jump_needed(len: u32) -> Option<usize> {
match len {
0...3 => None,
4...MAX_BLOCK_LEN => {
if is_power_of_2(len as usize) {
Some(len as usize)
} else {
None
}
}
n => {
if n % MAX_BLOCK_LEN == 0 {
Some(MAX_BLOCK_LEN as usize)
} else {
None
}
}
}
}
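In other words, a new block is allocated when the element count hits 4, 8, 16, … up to 32768, and every 32768 elements thereafter. An illustrative trace of `jump_needed`:

assert_eq!(jump_needed(3), None);                // still inside the first block
assert_eq!(jump_needed(4), Some(4));             // allocate the next block
assert_eq!(jump_needed(12), None);
assert_eq!(jump_needed(16), Some(16));
assert_eq!(jump_needed(1 << 15), Some(1 << 15));
assert_eq!(jump_needed(3 << 15), Some(1 << 15)); // fixed-size blocks from here on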
/// An exponentially unrolled linked list.
///
/// The use case is as follows. Tantivy's indexer conceptually acts like a
/// `HashMap<Term, Vec<u32>>`. As we come across a given term in document
/// `D`, we look up the term in the map and append the document id to its vector.
///
/// The vector is then only read when it is serialized.
///
/// The `ExpUnrolledLinkedList` offers a more efficient solution to this
/// problem.
///
/// It combines the idea of the unrolled linked list and tries to address the
/// problem of selecting an adequate block size using a strategy similar to
/// that of the `Vec` amortized resize strategy.
///
/// Data is stored in a linked list of blocks. The first block has a size of `4`
/// and each block has a length of twice that of the previous block up to
/// `MAX_BLOCK_LEN = 32768`.
///
/// This strategy is a good trade off to handle numerous very rare terms
/// and avoid wasting half of the memory for very frequent terms.
#[derive(Debug, Clone, Copy)]
pub struct ExpUnrolledLinkedList {
len: u32,
head: Addr,
tail: Addr,
}
impl ExpUnrolledLinkedList {
pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList {
let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
ExpUnrolledLinkedList {
len: 0u32,
head: addr,
tail: addr,
}
}
pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> {
ExpUnrolledLinkedListIterator {
heap,
addr: self.head,
len: self.len,
consumed: 0,
}
}
/// Appends a new element to the current stack.
///
/// If the current block end is reached, a new block is allocated.
pub fn push(&mut self, val: u32, heap: &mut MemoryArena) {
self.len += 1;
if let Some(new_block_len) = jump_needed(self.len) {
// We need to allocate another block.
// We also allocate an extra `u32` to store the pointer
// to the future next block.
let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
let new_block_addr: Addr = heap.allocate_space(new_block_size);
unsafe {
// logic
heap.write(self.tail, new_block_addr)
};
self.tail = new_block_addr;
}
unsafe {
// logic
heap.write(self.tail, val);
self.tail = self.tail.offset(mem::size_of::<u32>() as u32);
}
}
}
pub struct ExpUnrolledLinkedListIterator<'a> {
heap: &'a MemoryArena,
addr: Addr,
len: u32,
consumed: u32,
}
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
type Item = u32;
fn next(&mut self) -> Option<u32> {
if self.consumed == self.len {
None
} else {
self.consumed += 1;
let addr: Addr = if jump_needed(self.consumed).is_some() {
unsafe {
// logic
self.heap.read(self.addr)
}
} else {
self.addr
};
self.addr = addr.offset(mem::size_of::<u32>() as u32);
Some(unsafe {
// logic
self.heap.read(addr)
})
}
}
}
#[cfg(test)]
mod tests {
use super::super::MemoryArena;
use super::jump_needed;
use super::*;
#[test]
fn test_stack() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
stack.push(1u32, &mut heap);
stack.push(2u32, &mut heap);
stack.push(4u32, &mut heap);
stack.push(8u32, &mut heap);
{
let mut it = stack.iter(&heap);
assert_eq!(it.next().unwrap(), 1u32);
assert_eq!(it.next().unwrap(), 2u32);
assert_eq!(it.next().unwrap(), 4u32);
assert_eq!(it.next().unwrap(), 8u32);
assert!(it.next().is_none());
}
}
#[test]
fn test_jump_if_needed() {
let mut block_len = 4u32;
let mut i = 0;
while i < 10_000_000 {
assert!(jump_needed(i + block_len - 1).is_none());
assert!(jump_needed(i + block_len + 1).is_none());
assert!(jump_needed(i + block_len).is_some());
let new_block_len = jump_needed(i + block_len).unwrap();
i += block_len;
block_len = new_block_len as u32;
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::ExpUnrolledLinkedList;
use super::super::MemoryArena;
use test::Bencher;
const NUM_STACK: usize = 10_000;
const STACK_SIZE: u32 = 1000;
#[bench]
fn bench_push_vec(bench: &mut Bencher) {
bench.iter(|| {
let mut vecs = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
vecs.push(Vec::new());
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;
vecs[t].push(i);
}
}
});
}
#[bench]
fn bench_push_stack(bench: &mut Bencher) {
bench.iter(|| {
let mut heap = MemoryArena::new();
let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
let stack = ExpUnrolledLinkedList::new(&mut heap);
stacks.push(stack);
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;
stacks[t].push(i, &mut heap);
}
}
});
}
}

View File

@@ -0,0 +1,291 @@
//! 32-bits Memory arena for types implementing `Copy`.
//! This Memory arena has been implemented to fit the use of tantivy's indexer
//! and has *twisted specifications*.
//!
//! - It works on stable rust.
//! - One can get an accurate figure of the memory usage of the arena.
//! - Allocations are very cheap.
//! - Allocations happening consecutively are very likely to have great locality.
//! - Addresses (`Addr`) are 32bits.
//! - Dropping the whole `MemoryArena` is cheap.
//!
//! # Limitations
//!
//! - Your object shall not implement `Drop`.
//! - `Addr`s into the `Arena` are 32 bits. The maximum capacity of the arena
//! is 4GB. *(Tantivy's indexer uses one arena per indexing thread.)*
//! - The arena only works for objects much smaller than `1MB`.
//! Allocating more than `1MB` at a time will result in a panic,
//! and allocating a lot of large objects (> 500KB) will result in fragmentation.
//! - Your objects are stored in an unaligned fashion. For this reason,
//! the API does not let you access them as references.
//!
//! Instead, you store and access your data via `.write(...)` and `.read(...)`, which under the hood
//! stores your object using `ptr::write_unaligned` and `ptr::read_unaligned`.
use std::mem;
use std::ptr;
const NUM_BITS_PAGE_ADDR: usize = 20;
const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
/// Represents a pointer into the `MemoryArena`.
///
/// Pointers are 32 bits and are split into
/// two parts.
///
/// The first 12 bits represent the id of a
/// page of memory.
///
/// The last 20 bits are an address within this page of memory.
#[derive(Clone, Copy, Debug)]
pub struct Addr(u32);
impl Addr {
/// Creates a null pointer.
pub fn null_pointer() -> Addr {
Addr(u32::max_value())
}
/// Returns the `Addr` object for `addr + offset`
pub fn offset(&self, offset: u32) -> Addr {
Addr(self.0.wrapping_add(offset))
}
fn new(page_id: usize, local_addr: usize) -> Addr {
Addr((page_id << NUM_BITS_PAGE_ADDR | local_addr) as u32)
}
fn page_id(&self) -> usize {
(self.0 as usize) >> NUM_BITS_PAGE_ADDR
}
fn page_local_addr(&self) -> usize {
(self.0 as usize) & (PAGE_SIZE - 1)
}
/// Returns true if and only if the `Addr` is null.
pub fn is_null(&self) -> bool {
self.0 == u32::max_value()
}
}
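A worked example of the split, for illustration only (these helpers are private to the module): with 20 bits of local address, page 3 at offset 4 packs as follows.

let addr = Addr::new(3, 4);        // 3 << 20 | 4 == 0x0030_0004
assert_eq!(addr.page_id(), 3);
assert_eq!(addr.page_local_addr(), 4);
assert_eq!(addr.offset(8).page_local_addr(), 12);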
/// Trait required for an object to be `storable`.
///
/// # Warning
///
/// Most of the time you should not implement this trait,
/// and only use the `MemoryArena` with object implementing `Copy`.
///
/// `ArenaStorable` is used in `tantivy` to force
/// a `Copy` object and a `slice` of data to be stored contiguously.
pub trait ArenaStorable {
fn num_bytes(&self) -> usize;
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr);
}
impl<V> ArenaStorable for V
where
V: Copy,
{
fn num_bytes(&self) -> usize {
mem::size_of::<V>()
}
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
let dst_ptr = arena.get_mut_ptr(addr) as *mut V;
ptr::write_unaligned(dst_ptr, self);
}
}
/// The `MemoryArena`
pub struct MemoryArena {
pages: Vec<Page>,
}
impl MemoryArena {
/// Creates a new memory arena.
pub fn new() -> MemoryArena {
let first_page = Page::new(0);
MemoryArena {
pages: vec![first_page],
}
}
fn add_page(&mut self) -> &mut Page {
let new_page_id = self.pages.len();
self.pages.push(Page::new(new_page_id));
&mut self.pages[new_page_id]
}
/// Returns an estimate in number of bytes
/// of resident memory consumed by the `MemoryArena`.
///
/// Internally, it counts a number of `1MB` pages
/// and therefore delivers an upper bound.
pub fn mem_usage(&self) -> usize {
self.pages.len() * PAGE_SIZE
}
/// Writes a slice at the given address, assuming the
/// memory was allocated beforehand.
///
/// # Panics
///
/// May panic or corrupt the heap if the space was not
/// properly allocated beforehand.
pub fn write_bytes<B: AsRef<[u8]>>(&mut self, addr: Addr, data: B) {
let bytes = data.as_ref();
self.pages[addr.page_id()]
.get_mut_slice(addr.page_local_addr(), bytes.len())
.copy_from_slice(bytes);
}
/// Returns the `len` bytes starting at `addr`
///
/// # Panics
///
/// Panics if the memory has not been allocated beforehand.
pub fn read_slice(&self, addr: Addr, len: usize) -> &[u8] {
self.pages[addr.page_id()].get_slice(addr.page_local_addr(), len)
}
unsafe fn get_mut_ptr(&mut self, addr: Addr) -> *mut u8 {
self.pages[addr.page_id()].get_mut_ptr(addr.page_local_addr())
}
/// Stores an item's data in the heap.
///
/// It allocates space for the `Item` beforehand.
pub fn store<Item: ArenaStorable>(&mut self, val: Item) -> Addr {
let num_bytes = val.num_bytes();
let addr = self.allocate_space(num_bytes);
unsafe {
self.write(addr, val);
};
addr
}
pub unsafe fn write<Item: ArenaStorable>(&mut self, addr: Addr, val: Item) {
val.write_into(self, addr)
}
/// Reads an item from the heap at the given address.
///
/// # Panics
///
/// Panics if the address is erroneous.
pub unsafe fn read<Item: Copy>(&self, addr: Addr) -> Item {
let ptr = self.pages[addr.page_id()].get_ptr(addr.page_local_addr());
ptr::read_unaligned(ptr as *const Item)
}
/// Allocates `len` bytes and returns the allocated address.
pub fn allocate_space(&mut self, len: usize) -> Addr {
let page_id = self.pages.len() - 1;
if let Some(addr) = self.pages[page_id].allocate_space(len) {
return addr;
}
self.add_page().allocate_space(len).unwrap()
}
}
struct Page {
page_id: usize,
len: usize,
data: Box<[u8]>,
}
impl Page {
fn new(page_id: usize) -> Page {
let mut data: Vec<u8> = Vec::with_capacity(PAGE_SIZE);
unsafe {
data.set_len(PAGE_SIZE);
} // avoid initializing page
Page {
page_id,
len: 0,
data: data.into_boxed_slice(),
}
}
#[inline(always)]
fn is_available(&self, len: usize) -> bool {
len + self.len <= PAGE_SIZE
}
fn get_mut_slice(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
&mut self.data[local_addr..][..len]
}
fn get_slice(&self, local_addr: usize, len: usize) -> &[u8] {
&self.data[local_addr..][..len]
}
fn allocate_space(&mut self, len: usize) -> Option<Addr> {
if self.is_available(len) {
let addr = Addr::new(self.page_id, self.len);
self.len += len;
Some(addr)
} else {
None
}
}
#[inline(always)]
pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
self.data.as_ptr().offset(addr as isize)
}
#[inline(always)]
pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
self.data.as_mut_ptr().offset(addr as isize)
}
}
#[cfg(test)]
mod tests {
use super::MemoryArena;
#[test]
fn test_arena_allocate_slice() {
let mut arena = MemoryArena::new();
let a = b"hello";
let b = b"happy tax payer";
let addr_a = arena.allocate_space(a.len());
arena.write_bytes(addr_a, a);
let addr_b = arena.allocate_space(b.len());
arena.write_bytes(addr_b, b);
assert_eq!(arena.read_slice(addr_a, a.len()), a);
assert_eq!(arena.read_slice(addr_b, b.len()), b);
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct MyTest {
pub a: usize,
pub b: u8,
pub c: u32,
}
#[test]
fn test_store_object() {
let mut arena = MemoryArena::new();
let a = MyTest {
a: 143,
b: 21,
c: 32,
};
let b = MyTest {
a: 113,
b: 221,
c: 12,
};
let addr_a = arena.store(a);
let addr_b = arena.store(b);
assert_eq!(unsafe { arena.read::<MyTest>(addr_a) }, a);
assert_eq!(unsafe { arena.read::<MyTest>(addr_b) }, b);
}
}
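Because allocation is page-granular, `mem_usage()` moves in whole-page steps; a minimal sketch using only the API above:

    let arena = MemoryArena::new();
    // A single page is pre-allocated, so the estimate is already one full
    // page (1 << 20 bytes) before anything has been stored.
    assert_eq!(arena.mem_usage(), 1 << 20);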


@@ -0,0 +1,9 @@
mod expull;
mod memory_arena;
mod murmurhash2;
mod term_hashmap;
pub use self::expull::ExpUnrolledLinkedList;
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
use self::murmurhash2::murmurhash2;
pub use self::term_hashmap::{compute_table_size, TermHashMap};


@@ -0,0 +1,86 @@
use std::ptr;
const SEED: u32 = 3_242_157_231u32;
const M: u32 = 0x5bd1_e995;
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let len = key.len() as u32;
let mut h: u32 = SEED ^ len;
let num_blocks = len >> 2;
for _ in 0..num_blocks {
let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
k = k.wrapping_mul(M);
k ^= k >> 24;
k = k.wrapping_mul(M);
h = h.wrapping_mul(M);
h ^= k;
key_ptr = key_ptr.wrapping_offset(1);
}
// Handle the last few bytes of the input array
let remaining: &[u8] = &key[key.len() & !3..];
match remaining.len() {
3 => {
h ^= u32::from(remaining[2]) << 16;
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
2 => {
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
1 => {
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
_ => {}
}
h ^= h >> 13;
h = h.wrapping_mul(M);
h ^ (h >> 15)
}
#[cfg(test)]
mod test {
use super::murmurhash2;
use std::collections::HashSet;
#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
#[test]
fn test_murmur_against_reference_impl() {
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
}
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
assert_eq!(set.len(), 10_000);
}
}
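The tail handling above relies on `key.len() & !3` rounding the length down to a multiple of four. A standalone check (hypothetical, not part of the file):

    fn main() {
        let key = b"abcdef";
        // 6 >> 2 == 1 full 4-byte block; 6 & !3 == 4, leaving a 2-byte tail.
        assert_eq!(key.len() >> 2, 1);
        assert_eq!(&key[key.len() & !3..], b"ef");
    }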


@@ -0,0 +1,296 @@
use super::murmurhash2;
use super::{Addr, ArenaStorable, MemoryArena};
use std::iter;
use std::mem;
use std::slice;
pub type BucketId = usize;
struct KeyBytesValue<'a, V> {
key: &'a [u8],
value: V,
}
impl<'a, V> KeyBytesValue<'a, V> {
fn new(key: &'a [u8], value: V) -> KeyBytesValue<'a, V> {
KeyBytesValue { key, value }
}
}
impl<'a, V> ArenaStorable for KeyBytesValue<'a, V>
where
V: ArenaStorable,
{
fn num_bytes(&self) -> usize {
0u16.num_bytes() + self.key.len() + self.value.num_bytes()
}
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
arena.write(addr, self.key.len() as u16);
arena.write_bytes(addr.offset(2), self.key);
arena.write(addr.offset(2 + self.key.len() as u32), self.value);
}
}
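// The resulting layout at `addr` is `[key_len: u16][key bytes][value bytes]`.
// For `key = b"abc"` and a `u32` value, `num_bytes()` is 2 + 3 + 4 = 9.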
/// Returns the actual memory size in bytes
/// required to create a table of `2^num_bits` buckets.
pub fn compute_table_size(num_bits: usize) -> usize {
(1 << num_bits) * mem::size_of::<KeyValue>()
}
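// e.g. `compute_table_size(18)` returns `(1 << 18) * 8` bytes (2 MiB),
// since a `KeyValue` is a `u32` address plus a `u32` hash.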
/// `KeyValue` is the item stored in the hash table.
/// The key is a length-prefixed byte slice stored in an external heap.
/// The value also lives at an address in that heap.
///
/// The key and the value are stored contiguously, which is why a single
/// `key_value_addr` is enough to reach both.
#[derive(Copy, Clone)]
struct KeyValue {
key_value_addr: Addr,
hash: u32,
}
impl Default for KeyValue {
fn default() -> Self {
KeyValue {
key_value_addr: Addr::null_pointer(),
hash: 0u32,
}
}
}
impl KeyValue {
fn is_empty(&self) -> bool {
self.key_value_addr.is_null()
}
}
/// Customized `HashMap` with byte-slice keys
///
/// This `HashMap` takes `&[u8]` keys. Keys are
/// stored in a user-defined heap.
///
/// The quirky API has the benefit of avoiding
/// the computation of the hash of the key twice,
/// and of copying the key as long as there is no insert.
///
pub struct TermHashMap {
table: Box<[KeyValue]>,
pub heap: MemoryArena,
mask: usize,
occupied: Vec<usize>,
}
struct QuadraticProbing {
hash: usize,
i: usize,
mask: usize,
}
impl QuadraticProbing {
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
QuadraticProbing { hash, i: 0, mask }
}
#[inline]
fn next_probe(&mut self) -> usize {
self.i += 1;
(self.hash + self.i) & self.mask
}
}
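// Despite its name, the probe sequence is linear: with `hash = 5` and
// `mask = 7`, successive `next_probe()` calls visit buckets 6, 7, 0, 1, ...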
pub struct Iter<'a> {
hashmap: &'a TermHashMap,
inner: slice::Iter<'a, usize>,
}
impl<'a> Iterator for Iter<'a> {
type Item = (&'a [u8], Addr, BucketId);
fn next(&mut self) -> Option<Self::Item> {
self.inner.next().cloned().map(move |bucket: usize| {
let kv = self.hashmap.table[bucket];
let (key, offset): (&'a [u8], Addr) =
unsafe { self.hashmap.get_key_value(kv.key_value_addr) };
(key, offset, bucket as BucketId)
})
}
}
impl TermHashMap {
pub fn new(num_bucket_power_of_2: usize) -> TermHashMap {
let heap = MemoryArena::new();
let table_size = 1 << num_bucket_power_of_2;
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
TermHashMap {
table: table.into_boxed_slice(),
heap,
mask: table_size - 1,
occupied: Vec::with_capacity(table_size / 2),
}
}
fn probe(&self, hash: u32) -> QuadraticProbing {
QuadraticProbing::compute(hash as usize, self.mask)
}
pub fn mem_usage(&self) -> usize {
self.table.len() * mem::size_of::<KeyValue>()
}
fn is_saturated(&self) -> bool {
self.table.len() < self.occupied.len() * 3
}
unsafe fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
let key_bytes_len = self.heap.read::<u16>(addr) as usize;
let key_addr = addr.offset(2u32);
let key_bytes: &[u8] = self.heap.read_slice(key_addr, key_bytes_len);
let val_addr: Addr = key_addr.offset(key_bytes.len() as u32);
(key_bytes, val_addr)
}
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
self.occupied.push(bucket);
self.table[bucket] = KeyValue {
key_value_addr,
hash,
};
}
pub fn iter(&self) -> Iter {
Iter {
inner: self.occupied.iter(),
hashmap: self,
}
}
fn resize(&mut self) {
let new_len = self.table.len() * 2;
let mask = new_len - 1;
self.mask = mask;
let new_table = vec![KeyValue::default(); new_len].into_boxed_slice();
let old_table = mem::replace(&mut self.table, new_table);
for old_pos in self.occupied.iter_mut() {
let key_value: KeyValue = old_table[*old_pos];
let mut probe = QuadraticProbing::compute(key_value.hash as usize, mask);
loop {
let bucket = probe.next_probe();
if self.table[bucket].is_empty() {
*old_pos = bucket;
self.table[bucket] = key_value;
break;
}
}
}
}
/// `mutate_or_create` creates a new entry for a given key if it does not exist,
/// or updates the existing entry.
///
/// The actual logic for this update is defined in the `updater`
/// argument.
///
/// If the key is not present, `updater` will receive `None` and
/// will be in charge of returning a default value.
/// If the key already has an associated value, then it will be passed
/// `Some(previous_value)`.
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
where
S: AsRef<[u8]>,
V: Copy,
TMutator: FnMut(Option<V>) -> V,
{
if self.is_saturated() {
self.resize();
}
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
let val = updater(None);
let key_addr = self.heap.store(KeyBytesValue::new(key_bytes, val));
self.set_bucket(hash, key_addr, bucket);
return bucket as BucketId;
} else if kv.hash == hash {
let (key_matches, val_addr) = {
let (stored_key, val_addr): (&[u8], Addr) =
unsafe { self.get_key_value(kv.key_value_addr) };
(stored_key == key_bytes, val_addr)
};
if key_matches {
unsafe {
// The key exists: read the current value, apply `updater`, write it back.
let v = self.heap.read(val_addr);
let new_v = updater(Some(v));
self.heap.write(val_addr, new_v);
};
return bucket as BucketId;
}
}
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::murmurhash2::murmurhash2;
use test::Bencher;
#[bench]
fn bench_murmurhash2(b: &mut Bencher) {
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
let mut s = 0;
for &key in &keys {
s ^= murmurhash2(key.as_bytes());
}
s
});
}
}
#[cfg(test)]
mod tests {
use super::TermHashMap;
use std::collections::HashMap;
#[test]
fn test_hash_map() {
let mut hash_map: TermHashMap = TermHashMap::new(18);
{
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
assert_eq!(opt_val, None);
3u32
});
}
{
hash_map.mutate_or_create("abcd", |opt_val: Option<u32>| {
assert_eq!(opt_val, None);
4u32
});
}
{
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
assert_eq!(opt_val, Some(3u32));
5u32
});
}
let mut vanilla_hash_map = HashMap::new();
let mut iter_values = hash_map.iter();
while let Some((key, addr, _)) = iter_values.next() {
let val: u32 = unsafe {
// `addr` points into this map's own heap, written by `mutate_or_create`.
hash_map.heap.read(addr)
};
vanilla_hash_map.insert(key.to_owned(), val);
}
assert_eq!(vanilla_hash_map.len(), 2);
}
}
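A typical use of `mutate_or_create` is term-frequency counting. A minimal sketch, using only the API shown above (`term_counts` is a hypothetical name):

    let mut term_counts = TermHashMap::new(10);
    for token in &["the", "quick", "the"] {
        term_counts.mutate_or_create(*token, |count: Option<u32>| {
            count.map_or(1u32, |c| c + 1)
        });
    }
    // Iterating now yields ("the", 2) and ("quick", 1), the values being
    // read back via `heap.read::<u32>(addr)` as in the test above.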


@@ -0,0 +1,59 @@
use common::BitSet;
use core::SegmentReader;
use fst::Automaton;
use query::BitSetDocSet;
use query::ConstScorer;
use query::{Scorer, Weight};
use schema::{Field, IndexRecordOption};
use termdict::{TermDictionary, TermStreamer};
use Result;
/// A weight struct for Fuzzy Term and Regex Queries
pub struct AutomatonWeight<A>
where
A: Automaton,
{
field: Field,
automaton: A,
}
impl<A> AutomatonWeight<A>
where
A: Automaton,
{
/// Creates a new `AutomatonWeight`
pub fn new(field: Field, automaton: A) -> AutomatonWeight<A> {
AutomatonWeight { field, automaton }
}
fn automaton_stream<'a>(&'a self, term_dict: &'a TermDictionary) -> TermStreamer<'a, &'a A> {
let term_stream_builder = term_dict.search(&self.automaton);
term_stream_builder.into_stream()
}
}
impl<A> Weight for AutomatonWeight<A>
where
A: Automaton,
{
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(self.field);
let term_dict = inverted_index.terms();
let mut term_stream = self.automaton_stream(term_dict);
while term_stream.advance() {
let term_info = term_stream.value();
let mut block_segment_postings = inverted_index
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
while block_segment_postings.advance() {
for &doc in block_segment_postings.docs() {
doc_bitset.insert(doc);
}
}
}
let doc_bitset = BitSetDocSet::from(doc_bitset);
Ok(Box::new(ConstScorer::new(doc_bitset)))
}
}
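Any type implementing the `fst` crate's `Automaton` trait can drive this weight. Below is a minimal sketch of a hypothetical `PrefixAutomaton` (not part of tantivy) that would make `AutomatonWeight` match every term starting with a given byte prefix; it assumes the `use fst::Automaton;` import at the top of this file:

    struct PrefixAutomaton {
        prefix: Vec<u8>,
    }

    impl Automaton for PrefixAutomaton {
        // `Some(n)`: n prefix bytes matched so far; `None`: a byte mismatched.
        type State = Option<usize>;
        fn start(&self) -> Self::State {
            Some(0)
        }
        fn is_match(&self, state: &Self::State) -> bool {
            state.map_or(false, |n| n >= self.prefix.len())
        }
        fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
            match *state {
                // The full prefix has already been seen: keep matching.
                Some(n) if n >= self.prefix.len() => Some(n),
                // The next expected prefix byte matched: advance.
                Some(n) if self.prefix[n] == byte => Some(n + 1),
                _ => None,
            }
        }
    }

    // `AutomatonWeight::new(field, PrefixAutomaton { prefix: b"jap".to_vec() })`
    // would then match every term starting with "jap".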

src/query/fuzzy_query.rs

@@ -0,0 +1,162 @@
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
use query::{AutomatonWeight, Query, Weight};
use schema::Term;
use std::collections::HashMap;
use Result;
use Searcher;
lazy_static! {
static ref LEV_BUILDER: HashMap<(u8, bool), LevenshteinAutomatonBuilder> = {
let mut lev_builder_cache = HashMap::new();
// TODO make population lazy on a `(distance, val)` basis
for distance in 0..3 {
for &transposition in [false, true].iter() {
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(distance, transposition);
lev_builder_cache.insert((distance, transposition), lev_automaton_builder);
}
}
lev_builder_cache
};
}
/// A Fuzzy Query matches all of the documents
/// containing a specific term that is within
/// the given Levenshtein distance of the query term.
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result, Term};
/// use tantivy::collector::{CountCollector, TopCollector, chain};
/// use tantivy::query::FuzzyTermQuery;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
/// let term = Term::from_field_text(title, "Diary");
/// let query = FuzzyTermQuery::new(term, 1, true);
/// searcher.search(&query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 2);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
#[derive(Debug, Clone)]
pub struct FuzzyTermQuery {
/// What term are we searching
term: Term,
/// How many changes are we going to allow
distance: u8,
/// Should a transposition cost 1 or 2?
transposition_cost_one: bool,
/// Should the term be matched as a prefix?
prefix: bool,
}
impl FuzzyTermQuery {
/// Creates a new Fuzzy Query
pub fn new(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
FuzzyTermQuery {
term,
distance,
transposition_cost_one,
prefix: false,
}
}
/// Creates a new Fuzzy Query that matches terms with a prefix within the given distance of `term`
pub fn new_prefix(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
FuzzyTermQuery {
term,
distance,
transposition_cost_one,
prefix: true,
}
}
fn specialized_weight(&self) -> Result<AutomatonWeight<DFA>> {
let automaton_builder = LEV_BUILDER
.get(&(self.distance, self.transposition_cost_one))
.unwrap(); // TODO return an error rather than panicking
// `build_prefix_dfa` matches terms for which some prefix is within the distance.
let automaton = if self.prefix {
automaton_builder.build_prefix_dfa(self.term.text())
} else {
automaton_builder.build_dfa(self.term.text())
};
Ok(AutomatonWeight::new(self.term.field(), automaton))
}
}
impl Query for FuzzyTermQuery {
fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<Weight>> {
Ok(Box::new(self.specialized_weight()?))
}
}
#[cfg(test)]
mod test {
use super::FuzzyTermQuery;
use collector::TopCollector;
use schema::SchemaBuilder;
use schema::TEXT;
use tests::assert_nearly_equals;
use Index;
use Term;
#[test]
pub fn test_fuzzy_term() {
let mut schema_builder = SchemaBuilder::new();
let country_field = schema_builder.add_text_field("country", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(
country_field => "japan",
));
index_writer.add_document(doc!(
country_field => "korea",
));
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
{
let mut collector = TopCollector::with_limit(2);
let term = Term::from_field_text(country_field, "japon");
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
searcher.search(&fuzzy_query, &mut collector).unwrap();
let scored_docs = collector.score_docs();
assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
let (score, _) = scored_docs[0];
assert_nearly_equals(1f32, score);
}
}
}


@@ -228,7 +228,8 @@ where
TOtherScorer: Scorer,
{
fn score(&mut self) -> Score {
self.left.score() + self.right.score()
self.left.score()
+ self.right.score()
+ self.others.iter_mut().map(Scorer::score).sum::<Score>()
}
}


@@ -3,16 +3,19 @@ Query
*/
mod all_query;
mod automaton_weight;
mod bitset;
mod bm25;
mod boolean_query;
mod exclude;
mod fuzzy_query;
mod intersection;
mod occur;
mod phrase_query;
mod query;
mod query_parser;
mod range_query;
mod regex_query;
mod reqopt_scorer;
mod scorer;
mod term_query;
@@ -31,9 +34,11 @@ pub use self::union::Union;
pub use self::vec_docset::VecDocSet;
pub use self::all_query::{AllQuery, AllScorer, AllWeight};
pub use self::automaton_weight::AutomatonWeight;
pub use self::bitset::BitSetDocSet;
pub use self::boolean_query::BooleanQuery;
pub use self::exclude::Exclude;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::intersect_scorers;
pub use self::occur::Occur;
pub use self::phrase_query::PhraseQuery;
@@ -41,6 +46,7 @@ pub use self::query::Query;
pub use self::query_parser::QueryParser;
pub use self::query_parser::QueryParserError;
pub use self::range_query::RangeQuery;
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::scorer::ConstScorer;
pub use self::scorer::EmptyScorer;


@@ -1,8 +1,10 @@
use super::Weight;
use collector::Collector;
use core::searcher::Searcher;
use downcast;
use std::fmt;
use Result;
use SegmentLocalId;
/// The `Query` trait defines a set of documents and a scoring method
/// for those documents.
@@ -55,6 +57,26 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug {
}
Ok(result)
}
/// Search works as follows:
///
/// First, the weight object associated with the query is created.
///
/// Then, the query loops over the segments and, for each segment:
/// - sets up the collector and informs it that the segment being processed has changed.
/// - creates a `Scorer` object associated with this segment.
/// - iterates through the matched documents and pushes them to the collector.
///
fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result<()> {
let scoring_enabled = collector.requires_scoring();
let weight = self.weight(searcher, scoring_enabled)?;
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
collector.set_segment(segment_ord as SegmentLocalId, segment_reader)?;
let mut scorer = weight.scorer(segment_reader)?;
scorer.collect(collector, segment_reader.delete_bitset());
}
Ok(())
}
}
pub trait QueryClone {


@@ -1,11 +1,21 @@
use query::Occur;
use schema::Field;
use schema::Term;
use schema::Type;
use std::fmt;
use std::ops::Bound;
#[derive(Clone)]
pub enum LogicalLiteral {
Term(Term),
Phrase(Vec<Term>),
Range {
field: Field,
value_type: Type,
lower: Bound<Term>,
upper: Bound<Term>,
},
All,
}
#[derive(Clone)]
@@ -54,6 +64,12 @@ impl fmt::Debug for LogicalLiteral {
match *self {
LogicalLiteral::Term(ref term) => write!(formatter, "{:?}", term),
LogicalLiteral::Phrase(ref terms) => write!(formatter, "\"{:?}\"", terms),
LogicalLiteral::Range {
ref lower,
ref upper,
..
} => write!(formatter, "({:?} TO {:?})", lower, upper),
LogicalLiteral::All => write!(formatter, "*"),
}
}
}


@@ -1,29 +1,37 @@
use super::user_input_ast::*;
use combine::char::*;
use combine::*;
use query::query_parser::user_input_ast::UserInputBound;
fn field<I: Stream<Item = char>>() -> impl Parser<Input = I, Output = String> {
(
letter(),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
}
fn word<I: Stream<Item = char>>() -> impl Parser<Input = I, Output = String> {
many1(satisfy(|c: char| c.is_alphanumeric()))
}
fn negative_number<I: Stream<Item = char>>() -> impl Parser<Input = I, Output = String> {
(char('-'), many1(satisfy(|c: char| c.is_numeric())))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
}
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
where
I: Stream<Item = char>,
{
let term_val = || {
let word = many1(satisfy(|c: char| c.is_alphanumeric()));
let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
phrase.or(word)
phrase.or(word())
};
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric())))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let field = (
letter(),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let term_val_with_field = negative_numbers.or(term_val());
let term_val_with_field = negative_number().or(term_val());
let term_query =
(field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral {
(field(), char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral {
field_name: Some(field_name),
phrase,
});
@@ -37,6 +45,36 @@ where
.parse_stream(input)
}
fn range<I: Stream<Item = char>>(input: I) -> ParseResult<UserInputAST, I> {
let term_val = || {
word().or(negative_number()).or(char('*').map(|_| "*".to_string()))
};
let lower_bound = {
let excl = (char('{'), term_val()).map(|(_, w)| UserInputBound::Exclusive(w));
let incl = (char('['), term_val()).map(|(_, w)| UserInputBound::Inclusive(w));
excl.or(incl)
};
let upper_bound = {
let excl = (term_val(), char('}')).map(|(w, _)| UserInputBound::Exclusive(w));
let incl = (term_val(), char(']')).map(|(w, _)| UserInputBound::Inclusive(w));
// TODO: this backtracking should be unnecessary
try(excl).or(incl)
};
(
optional((field(), char(':')).map(|x| x.0)),
lower_bound,
spaces(),
string("TO"),
spaces(),
upper_bound,
).map(|(field, lower, _, _, _, upper)| UserInputAST::Range {
field,
lower,
upper,
})
.parse_stream(input)
}
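// This accepts e.g. `foo:[1 TO 5]`, `[* TO toto}` and `{a TO z}`; the
// `try(...)` around the exclusive upper bound lets the parser backtrack
// after both alternatives have consumed the shared `term_val()` prefix.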
fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
where
I: Stream<Item = char>,
@@ -45,6 +83,8 @@ where
.map(|(_, expr)| UserInputAST::Not(Box::new(expr)))
.or((char('+'), parser(leaf)).map(|(_, expr)| UserInputAST::Must(Box::new(expr))))
.or((char('('), parser(parse_to_ast), char(')')).map(|(_, expr, _)| expr))
.or(char('*').map(|_| UserInputAST::All))
.or(try(parser(range)))
.or(parser(literal))
.parse_stream(input)
}
@@ -91,6 +131,12 @@ mod test {
test_parse_query_to_ast_helper("-abc:toto", "-(abc:\"toto\")");
test_parse_query_to_ast_helper("abc:a b", "(abc:\"a\" \"b\")");
test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]");
test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:[\"*\" TO \"toto\"}");
test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}");
test_is_parse_err("abc + ");
}
}


@@ -2,15 +2,19 @@ use super::logical_ast::*;
use super::query_grammar::parse_to_ast;
use super::user_input_ast::*;
use core::Index;
use query::AllQuery;
use query::BooleanQuery;
use query::Occur;
use query::PhraseQuery;
use query::Query;
use query::RangeQuery;
use query::TermQuery;
use schema::IndexRecordOption;
use schema::{Field, Schema};
use schema::{FieldType, Term};
use std::borrow::Cow;
use std::num::ParseIntError;
use std::ops::Bound;
use std::str::FromStr;
use tokenizer::TokenizerManager;
@@ -39,6 +43,9 @@ pub enum QueryParserError {
/// The tokenizer for the given field is unknown
/// The two argument strings are the name of the field, the name of the tokenizer
UnknownTokenizer(String, String),
/// The query contains a range query with a phrase as one of the bounds.
/// Only terms can be used as bounds.
RangeMustNotHavePhrase,
}
impl From<ParseIntError> for QueryParserError {
@@ -66,8 +73,8 @@ impl From<ParseIntError> for QueryParserError {
/// by relevance : The user typically just scans through the first few
/// documents in order of decreasing relevance and will stop when the documents
/// are not relevant anymore.
/// Making it possible to make this behavior customizable is tracked in
/// [issue #27](https://github.com/fulmicoton/tantivy/issues/27).
///
/// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`.
///
/// * negative terms: By prepending a term by a `-`, a term can be excluded
/// from the search. This is useful for disambiguating a query.
@@ -75,6 +82,17 @@ impl From<ParseIntError> for QueryParserError {
///
/// * must terms: By prepending a term by a `+`, a term can be made required for the search.
///
/// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed.
/// e.g., `title:"Barack Obama"` will only find documents that have "barack" immediately followed
/// by "obama".
///
/// * range terms: Range searches can be done by specifying the start and end bound. These can be
/// inclusive or exclusive. e.g., `title:[a TO c}` will find all documents whose title contains
/// a word lexicographically between `a` and `c` (inclusive lower bound, exclusive upper bound).
/// Inclusive bounds are `[]`, exclusive are `{}`.
///
/// * all docs query: A plain `*` will match all documents in the index.
///
pub struct QueryParser {
schema: Schema,
default_fields: Vec<Field>,
@@ -155,11 +173,12 @@ impl QueryParser {
}
Ok(ast)
}
fn compute_logical_ast_for_leaf(
fn compute_terms_for_string(
&self,
field: Field,
phrase: &str,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
) -> Result<Vec<Term>, QueryParserError> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
if !field_type.is_indexed() {
@@ -170,12 +189,12 @@ impl QueryParser {
FieldType::I64(_) => {
let val: i64 = i64::from_str(phrase)?;
let term = Term::from_field_i64(field, val);
Ok(Some(LogicalLiteral::Term(term)))
Ok(vec![term])
}
FieldType::U64(_) => {
let val: u64 = u64::from_str(phrase)?;
let term = Term::from_field_u64(field, val);
Ok(Some(LogicalLiteral::Term(term)))
Ok(vec![term])
}
FieldType::Str(ref str_options) => {
if let Some(option) = str_options.get_indexing_options() {
@@ -194,17 +213,15 @@ impl QueryParser {
terms.push(term);
});
if terms.is_empty() {
Ok(None)
Ok(vec![])
} else if terms.len() == 1 {
Ok(Some(LogicalLiteral::Term(
terms.into_iter().next().unwrap(),
)))
Ok(terms)
} else {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
if let Some(index_record_option) = field_type.get_index_record_option() {
if index_record_option.has_positions() {
Ok(Some(LogicalLiteral::Phrase(terms)))
Ok(terms)
} else {
let fieldname = self.schema.get_field_name(field).to_string();
Err(QueryParserError::FieldDoesNotHavePositionsIndexed(
@@ -223,10 +240,7 @@ impl QueryParser {
))
}
}
FieldType::HierarchicalFacet => {
let term = Term::from_field_text(field, phrase);
Ok(Some(LogicalLiteral::Term(term)))
}
FieldType::HierarchicalFacet => Ok(vec![Term::from_field_text(field, phrase)]),
FieldType::Bytes => {
let field_name = self.schema.get_field_name(field).to_string();
Err(QueryParserError::FieldNotIndexed(field_name))
@@ -234,6 +248,21 @@ impl QueryParser {
}
}
fn compute_logical_ast_for_leaf(
&self,
field: Field,
phrase: &str,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
let terms = self.compute_terms_for_string(field, phrase)?;
match terms.len() {
0 => Ok(None),
1 => Ok(Some(LogicalLiteral::Term(
terms.into_iter().next().unwrap(),
))),
_ => Ok(Some(LogicalLiteral::Phrase(terms))),
}
}
fn default_occur(&self) -> Occur {
if self.conjunction_by_default {
Occur::Must
@@ -242,6 +271,37 @@ impl QueryParser {
}
}
fn resolve_bound(&self, field: Field, bound: &UserInputBound) -> Result<Bound<Term>, QueryParserError> {
if bound.term_str() == "*" {
return Ok(Bound::Unbounded);
}
let terms = self.compute_terms_for_string(field, bound.term_str())?;
if terms.len() != 1 {
return Err(QueryParserError::RangeMustNotHavePhrase);
}
let term = terms.into_iter().next().unwrap();
match *bound {
UserInputBound::Inclusive(_) => Ok(Bound::Included(term)),
UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)),
}
}
fn resolved_fields(
&self,
given_field: &Option<String>,
) -> Result<Cow<[Field]>, QueryParserError> {
match *given_field {
None => {
if self.default_fields.is_empty() {
Err(QueryParserError::NoDefaultFieldDeclared)
} else {
Ok(Cow::from(&self.default_fields[..]))
}
}
Some(ref field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])),
}
}
fn compute_logical_ast_with_occur(
&self,
user_input_ast: UserInputAST,
@@ -265,6 +325,41 @@ impl QueryParser {
let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
Ok((compose_occur(Occur::Must, occur), logical_sub_queries))
}
UserInputAST::Range {
field,
lower,
upper,
} => {
let fields = self.resolved_fields(&field)?;
let mut clauses = fields
.iter()
.map(|&field| {
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::Range {
field,
value_type,
lower: self.resolve_bound(field, &lower)?,
upper: self.resolve_bound(field, &upper)?,
})))
})
.collect::<Result<Vec<_>, QueryParserError>>()?;
let result_ast = if clauses.len() == 1 {
clauses.pop().unwrap()
} else {
LogicalAST::Clause(
clauses
.into_iter()
.map(|clause| (Occur::Should, clause))
.collect(),
)
};
Ok((Occur::Should, result_ast))
}
UserInputAST::All => Ok((
Occur::Should,
LogicalAST::Leaf(Box::new(LogicalLiteral::All)),
)),
UserInputAST::Leaf(literal) => {
let term_phrases: Vec<(Field, String)> = match literal.field_name {
Some(ref field_name) => {
@@ -327,6 +422,13 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
match logical_literal {
LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
LogicalLiteral::Phrase(terms) => Box::new(PhraseQuery::new(terms)),
LogicalLiteral::Range {
field,
value_type,
lower,
upper,
} => Box::new(RangeQuery::new_term_bounds(field, value_type, lower, upper)),
LogicalLiteral::All => Box::new(AllQuery),
}
}
@@ -511,6 +613,42 @@ mod test {
Term([0, 0, 0, 0, 98])]\"",
false,
);
test_parse_query_to_logical_ast_helper(
"title:[a TO b]",
"(Included(Term([0, 0, 0, 0, 97])) TO \
Included(Term([0, 0, 0, 0, 98])))",
false,
);
test_parse_query_to_logical_ast_helper(
"[a TO b]",
"((Included(Term([0, 0, 0, 0, 97])) TO \
Included(Term([0, 0, 0, 0, 98]))) \
(Included(Term([0, 0, 0, 1, 97])) TO \
Included(Term([0, 0, 0, 1, 98]))))",
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO toto}",
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO \
Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
false,
);
test_parse_query_to_logical_ast_helper(
"title:{* TO toto}",
"(Unbounded TO \
Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
false,
);
test_parse_query_to_logical_ast_helper(
"title:{titi TO *}",
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO Unbounded)",
false,
);
test_parse_query_to_logical_ast_helper(
"*",
"*",
false,
);
}
#[test]


@@ -14,10 +14,44 @@ impl fmt::Debug for UserInputLiteral {
}
}
pub enum UserInputBound {
Inclusive(String),
Exclusive(String),
}
impl UserInputBound {
fn display_lower(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match *self {
UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word),
UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word),
}
}
fn display_upper(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match *self {
UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word),
UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word),
}
}
pub fn term_str(&self) -> &str {
match *self {
UserInputBound::Inclusive(ref contents) => contents,
UserInputBound::Exclusive(ref contents) => contents,
}
}
}
pub enum UserInputAST {
Clause(Vec<Box<UserInputAST>>),
Not(Box<UserInputAST>),
Must(Box<UserInputAST>),
Range {
field: Option<String>,
lower: UserInputBound,
upper: UserInputBound,
},
All,
Leaf(Box<UserInputLiteral>),
}
@@ -45,6 +79,20 @@ impl fmt::Debug for UserInputAST {
Ok(())
}
UserInputAST::Not(ref subquery) => write!(formatter, "-({:?})", subquery),
UserInputAST::Range {
ref field,
ref lower,
ref upper,
} => {
if let &Some(ref field) = field {
write!(formatter, "{}:", field)?;
}
lower.display_lower(formatter)?;
write!(formatter, " TO ")?;
upper.display_upper(formatter)?;
Ok(())
}
UserInputAST::All => write!(formatter, "*"),
UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
}
}


@@ -41,7 +41,8 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
/// # extern crate tantivy;
/// # use tantivy::Index;
/// # use tantivy::schema::{SchemaBuilder, INT_INDEXED};
/// # use tantivy::collector::{Collector, CountCollector};
/// # use tantivy::collector::CountCollector;
/// # use tantivy::query::Query;
/// # use tantivy::Result;
/// # use tantivy::query::RangeQuery;
/// #
@@ -67,7 +68,7 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
/// let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970);
///
/// let mut count_collector = CountCollector::default();
/// count_collector.search(&*searcher, &docs_in_the_sixties)?;
/// docs_in_the_sixties.search(&*searcher, &mut count_collector)?;
///
/// let num_60s_books = count_collector.count();
///
@@ -88,6 +89,28 @@ pub struct RangeQuery {
}
impl RangeQuery {
/// Creates a new `RangeQuery` from bounded start and end terms.
///
/// If the value type is not correct, something may go terribly wrong when
/// the `Weight` object is created.
pub fn new_term_bounds(
field: Field,
value_type: Type,
left_bound: Bound<Term>,
right_bound: Bound<Term>,
) -> RangeQuery {
let verify_and_unwrap_term = |val: &Term| {
assert_eq!(field, val.field());
val.value_bytes().to_owned()
};
RangeQuery {
field,
value_type,
left_bound: map_bound(&left_bound, &verify_and_unwrap_term),
right_bound: map_bound(&right_bound, &verify_and_unwrap_term),
}
}
/// Creates a new `RangeQuery` over a `i64` field.
///
/// If the field is not of the type `i64`, tantivy
@@ -194,12 +217,16 @@ impl RangeQuery {
/// Lower bound of range
pub fn left_bound(&self) -> Bound<Term> {
map_bound(&self.left_bound, &|bytes| Term::from_field_bytes(self.field, bytes))
map_bound(&self.left_bound, &|bytes| {
Term::from_field_bytes(self.field, bytes)
})
}
/// Upper bound of range
pub fn right_bound(&self) -> Bound<Term> {
map_bound(&self.right_bound, &|bytes| Term::from_field_bytes(self.field, bytes))
map_bound(&self.right_bound, &|bytes| {
Term::from_field_bytes(self.field, bytes)
})
}
}
@@ -273,7 +300,8 @@ impl Weight for RangeWeight {
mod tests {
use super::RangeQuery;
use collector::{Collector, CountCollector};
use collector::CountCollector;
use query::Query;
use schema::{Document, Field, SchemaBuilder, INT_INDEXED};
use std::collections::Bound;
use Index;
@@ -304,7 +332,7 @@ mod tests {
// ... or `1960..=1969` if inclusive range is enabled.
let mut count_collector = CountCollector::default();
count_collector.search(&*searcher, &docs_in_the_sixties)?;
docs_in_the_sixties.search(&*searcher, &mut count_collector)?;
assert_eq!(count_collector.count(), 2285);
Ok(())
}
@@ -341,7 +369,9 @@ mod tests {
let searcher = index.searcher();
let count_multiples = |range_query: RangeQuery| {
let mut count_collector = CountCollector::default();
count_collector.search(&*searcher, &range_query).unwrap();
range_query
.search(&*searcher, &mut count_collector)
.unwrap();
count_collector.count()
};
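As a sketch, the docs-in-the-sixties query from the doc example above could equally be built with `new_term_bounds` (assuming `year_field`, `Type` and `Term` in scope as in that example):

    use std::collections::Bound;

    let docs_in_the_sixties = RangeQuery::new_term_bounds(
        year_field,
        Type::U64,
        Bound::Included(Term::from_field_u64(year_field, 1960)),
        Bound::Excluded(Term::from_field_u64(year_field, 1970)),
    );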

src/query/regex_query.rs

@@ -0,0 +1,143 @@
use error::ErrorKind;
use fst_regex::Regex;
use query::{AutomatonWeight, Query, Weight};
use schema::Field;
use std::clone::Clone;
use Result;
use Searcher;
/// A Regex Query matches all of the documents
/// containing a specific term that matches
/// a regex pattern.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result, Term};
/// use tantivy::collector::{CountCollector, TopCollector, chain};
/// use tantivy::query::RegexQuery;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
/// let query = RegexQuery::new("d[ai]{2}ry".to_string(), title);
/// searcher.search(&query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 3);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
#[derive(Debug, Clone)]
pub struct RegexQuery {
regex_pattern: String,
field: Field,
}
impl RegexQuery {
/// Creates a new Regex Query
pub fn new(regex_pattern: String, field: Field) -> RegexQuery {
RegexQuery {
regex_pattern,
field,
}
}
fn specialized_weight(&self) -> Result<AutomatonWeight<Regex>> {
let automaton = Regex::new(&self.regex_pattern)
.map_err(|_| ErrorKind::InvalidArgument(self.regex_pattern.clone()))?;
Ok(AutomatonWeight::new(self.field.clone(), automaton))
}
}
impl Query for RegexQuery {
fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<Weight>> {
Ok(Box::new(self.specialized_weight()?))
}
}
#[cfg(test)]
mod test {
use super::RegexQuery;
use collector::TopCollector;
use schema::SchemaBuilder;
use schema::TEXT;
use tests::assert_nearly_equals;
use Index;
#[test]
pub fn test_regex_query() {
let mut schema_builder = SchemaBuilder::new();
let country_field = schema_builder.add_text_field("country", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(
country_field => "japan",
));
index_writer.add_document(doc!(
country_field => "korea",
));
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
{
let mut collector = TopCollector::with_limit(2);
let regex_query = RegexQuery::new("jap[ao]n".to_string(), country_field);
searcher.search(&regex_query, &mut collector).unwrap();
let scored_docs = collector.score_docs();
assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
let (score, _) = scored_docs[0];
assert_nearly_equals(1f32, score);
}
let searcher = index.searcher();
{
let mut collector = TopCollector::with_limit(2);
let regex_query = RegexQuery::new("jap[A-Z]n".to_string(), country_field);
searcher.search(&regex_query, &mut collector).unwrap();
let scored_docs = collector.score_docs();
assert_eq!(scored_docs.len(), 0, "Expected ZERO document");
}
}
}


@@ -1,4 +1,4 @@
use collector::SegmentCollector;
use collector::Collector;
use common::BitSet;
use docset::{DocSet, SkipResult};
use downcast;
@@ -18,7 +18,7 @@ pub trait Scorer: downcast::Any + DocSet + 'static {
/// Consumes the complete `DocSet` and
/// push the scored documents to the collector.
fn collect<T>(&mut self, collector: &mut SegmentCollector<CollectionResult = T>, delete_bitset_opt: Option<&DeleteBitSet>) {
fn collect(&mut self, collector: &mut Collector, delete_bitset_opt: Option<&DeleteBitSet>) {
if let Some(delete_bitset) = delete_bitset_opt {
while self.advance() {
let doc = self.doc();
@@ -44,7 +44,7 @@ impl Scorer for Box<Scorer> {
self.deref_mut().score()
}
fn collect<T>(&mut self, collector: &mut SegmentCollector<CollectionResult = T>, delete_bitset: Option<&DeleteBitSet>) {
fn collect(&mut self, collector: &mut Collector, delete_bitset: Option<&DeleteBitSet>) {
let scorer = self.deref_mut();
scorer.collect(collector, delete_bitset);
}


@@ -16,6 +16,59 @@ use Term;
/// * `idf` - inverse document frequency.
/// * `term_freq` - number of occurrences of the term in the field
/// * `field norm` - number of tokens in the field.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT, IndexRecordOption};
/// use tantivy::{Index, Result, Term};
/// use tantivy::collector::{CountCollector, TopCollector, chain};
/// use tantivy::query::TermQuery;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit()?;
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
/// let query = TermQuery::new(
/// Term::from_field_text(title, "diary"),
/// IndexRecordOption::Basic,
/// );
/// searcher.search(&query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 2);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
#[derive(Clone, Debug)]
pub struct TermQuery {
term: Term,


@@ -6,7 +6,7 @@ use Result;
/// for a given set of segments.
///
/// See [`Query`](./trait.Query.html).
pub trait Weight: Send + Sync + 'static {
pub trait Weight {
/// Returns the scorer for the given segment.
/// See [`Query`](./trait.Query.html).
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;


@@ -0,0 +1,19 @@
extern crate lz4;
use std::io::{self, Read, Write};
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = lz4::EncoderBuilder::new().build(compressed)?;
encoder.write_all(&uncompressed)?;
let (_, encoder_result) = encoder.finish();
encoder_result?;
Ok(())
}
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
let mut decoder = lz4::Decoder::new(compressed)?;
decoder.read_to_end(decompressed)?;
Ok(())
}


@@ -0,0 +1,17 @@
extern crate snap;
use std::io::{self, Read, Write};
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = snap::Writer::new(compressed);
encoder.write_all(&uncompressed)?;
encoder.flush()?;
Ok(())
}
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
snap::Reader::new(compressed).read_to_end(decompressed)?;
Ok(())
}
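Either backend exposes the same pair of functions, so a round trip looks identical regardless of the `lz4` feature flag. A minimal sketch, inside a function returning `io::Result<()>`:

    let mut compressed = Vec::new();
    let mut decompressed = Vec::new();
    compress(b"tantivy store block", &mut compressed)?;
    decompress(&compressed[..], &mut decompressed)?;
    assert_eq!(&decompressed[..], b"tantivy store block");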


@@ -34,10 +34,21 @@ and should rely on either
!*/
mod reader;
mod skiplist;
mod writer;
pub use self::reader::StoreReader;
pub use self::writer::StoreWriter;
#[cfg(feature = "lz4")]
mod compression_lz4;
#[cfg(feature = "lz4")]
use self::compression_lz4::*;
#[cfg(not(feature = "lz4"))]
mod compression_snap;
#[cfg(not(feature = "lz4"))]
use self::compression_snap::*;
#[cfg(test)]
pub mod tests {


@@ -1,13 +1,13 @@
use Result;
use super::decompress;
use super::skiplist::SkipList;
use common::BinarySerializable;
use common::VInt;
use datastruct::SkipList;
use directory::ReadOnlySource;
use lz4;
use schema::Document;
use std::cell::RefCell;
use std::io::{self, Read};
use std::io;
use std::mem::size_of;
use DocId;
@@ -61,9 +61,7 @@ impl StoreReader {
let mut current_block_mut = self.current_block.borrow_mut();
current_block_mut.clear();
let compressed_block = self.compressed_block(block_offset);
let mut lz4_decoder = lz4::Decoder::new(compressed_block)?;
*self.current_block_offset.borrow_mut() = usize::max_value();
lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ())?;
decompress(compressed_block, &mut current_block_mut)?;
*self.current_block_offset.borrow_mut() = block_offset;
}
Ok(())


@@ -1,9 +1,9 @@
use super::compress;
use super::skiplist::SkipListBuilder;
use super::StoreReader;
use common::CountingWriter;
use common::{BinarySerializable, VInt};
use datastruct::SkipListBuilder;
use directory::WritePtr;
use lz4;
use schema::Document;
use std::io::{self, Write};
use DocId;
@@ -87,12 +87,7 @@ impl StoreWriter {
fn write_and_compress_block(&mut self) -> io::Result<()> {
self.intermediary_buffer.clear();
{
let mut encoder = lz4::EncoderBuilder::new().build(&mut self.intermediary_buffer)?;
encoder.write_all(&self.current_block)?;
let (_, encoder_result) = encoder.finish();
encoder_result?;
}
compress(&self.current_block[..], &mut self.intermediary_buffer)?;
(self.intermediary_buffer.len() as u32).serialize(&mut self.writer)?;
self.writer.write_all(&self.intermediary_buffer)?;
self.offset_index_writer


@@ -94,7 +94,7 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
let bit_shift = (addr_bits % 8) as u64;
assert!(data.len() >= addr_byte + 8);
let val_unshifted_unmasked: u64 = unsafe {
//< ok : check len above
// ok thanks to the 7 byte padding on `.close`
let addr = data.as_ptr().offset(addr_byte as isize) as *const u64;
ptr::read_unaligned(addr)
};


@@ -203,7 +203,7 @@ impl TermDictionary {
/// Returns a search builder, to stream all of the terms
/// within the Automaton
pub fn search<'a, A: Automaton>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
pub fn search<'a, A: Automaton + 'a>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
let stream_builder = self.fst_index.search(automaton);
TermStreamerBuilder::<A>::new(self, stream_builder)
}


@@ -25,7 +25,7 @@ impl Default for Token {
offset_from: 0,
offset_to: 0,
position: usize::max_value(),
text: String::new(),
text: String::with_capacity(200),
}
}
}