Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-04 00:02:55 +00:00)

Compare commits: broken_col...0.6.1 (36 commits)
Commit SHA1s: 31655e92d7, 6b8d76685a, ce5683fc6a, 5205579db6, d056ae60dc, af9280c95f, 2e538ce6e6, 00466d2b08, 8ebbf6b336, 1ce36bb211, 2ac43bf21b, 3fd8c2aa5a, c1022e23d2, 8ccbfdea5d, badfce3a23, e301e0bc87, 317baf4e75, 24398d94e4, 360f4132eb, 2b8f02764b, 0465876854, 6f7b099370, 84f5cc4388, 75aae0d2c2, 009a3559be, 7a31669e9d, 5185eb790b, a3dffbf1c6, 857a5794d8, b0a6fc1448, 989d52bea4, 09661ea7ec, b59132966f, 863d3411bc, 8a55d133ab, 432d49d814

.travis.yml (154)
@@ -1,37 +1,127 @@
# Based on the "trust" template v0.1.2
# https://github.com/japaric/trust/tree/v0.1.2

dist: trusty
language: rust
services: docker
sudo: required
cache: cargo
rust:
- nightly

env:
global:
- CC=gcc-4.8
- CXX=g++-4.8
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
- secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- kalakris-cmake
packages:
- gcc-4.8
- g++-4.8
- libcurl4-openssl-dev
- libelf-dev
- libdw-dev
- binutils-dev
- cmake
before_script:
- export PATH=$HOME/.cargo/bin:$PATH
- cargo install cargo-update || echo "cargo-update already installed"
- cargo install cargo-travis || echo "cargo-travis already installed"
- CRATE_NAME=tantivy

matrix:
include:
# Android
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
- env: TARGET=i686-linux-android DISABLE_TESTS=1
- env: TARGET=x86_64-linux-android DISABLE_TESTS=1

# iOS
#- env: TARGET=aarch64-apple-ios DISABLE_TESTS=1
# os: osx
#- env: TARGET=armv7-apple-ios DISABLE_TESTS=1
# os: osx
#- env: TARGET=armv7s-apple-ios DISABLE_TESTS=1
# os: osx
#- env: TARGET=i386-apple-ios DISABLE_TESTS=1
# os: osx
- env: TARGET=x86_64-apple-ios DISABLE_TESTS=1
os: osx

# Linux
- env: TARGET=aarch64-unknown-linux-gnu
# - env: TARGET=arm-unknown-linux-gnueabi
# - env: TARGET=armv7-unknown-linux-gnueabihf
- env: TARGET=i686-unknown-linux-gnu
#- env: TARGET=i686-unknown-linux-musl
#- env: TARGET=mips-unknown-linux-gnu
#- env: TARGET=mips64-unknown-linux-gnuabi64
#- env: TARGET=mips64el-unknown-linux-gnuabi64
#- env: TARGET=mipsel-unknown-linux-gnu
#- env: TARGET=powerpc-unknown-linux-gnu
#- env: TARGET=powerpc64-unknown-linux-gnu
#- env: TARGET=powerpc64le-unknown-linux-gnu
#- env: TARGET=s390x-unknown-linux-gnu DISABLE_TESTS=1
- env: TARGET=x86_64-unknown-linux-gnu
- env: TARGET=x86_64-unknown-linux-musl

# OSX
#- env: TARGET=i686-apple-darwin
# os: osx
- env: TARGET=x86_64-apple-darwin
os: osx

# *BSD
#- env: TARGET=i686-unknown-freebsd DISABLE_TESTS=1
#- env: TARGET=x86_64-unknown-freebsd DISABLE_TESTS=1
#- env: TARGET=x86_64-unknown-netbsd DISABLE_TESTS=1

# Windows
#- env: TARGET=x86_64-pc-windows-gnu

# Bare metal
# These targets don't support std and as such are likely not suitable for
# most crates.
# - env: TARGET=thumbv6m-none-eabi
# - env: TARGET=thumbv7em-none-eabi
# - env: TARGET=thumbv7em-none-eabihf
# - env: TARGET=thumbv7m-none-eabi

# Testing other channels
#- env: TARGET=x86_64-unknown-linux-gnu
# rust: nightly
#- env: TARGET=x86_64-apple-darwin
# os: osx
# rust: nightly

before_install:
- set -e
- rustup self update

install:
- sh ci/install.sh
- source ~/.cargo/env || true

script:
- cargo build
- cargo test
- cargo test -- --ignored
- cargo run --example simple_search
- cargo doc
after_success:
- cargo coveralls --exclude-pattern src/functional_test.rs
- cargo doc-upload
- bash ci/script.sh

after_script: set +e

before_deploy:
- sh ci/before_deploy.sh
#
#deploy:
# # - Create a `public_repo` GitHub token. Go to: https://github.com/settings/tokens/new
# # - Encrypt it: `travis encrypt 0123456789012345678901234567890123456789
# # - Paste the output down here
# api_key:
# secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
# file_glob: true
# file: $CRATE_NAME-$TRAVIS_TAG-$TARGET.*
# on:
# # TODO Here you can pick which targets will generate binary releases
# # In this example, there are some targets that are tested using the stable
# # and nightly channels. This condition makes sure there is only one release
# # for such targets and that's generated using the stable channel
# condition: $TRAVIS_RUST_VERSION = stable
# tags: true
# provider: releases
# skip_cleanup: true

cache: cargo
before_cache:
# Travis can't cache files that are not readable by "others"
- chmod -R a+r $HOME/.cargo

#branches:
# only:
# # release tags
# - /^v\d+\.\d+\.\d+.*$/
# - master

notifications:
email:
on_success: never

AUTHORS (11, new file)
@@ -0,0 +1,11 @@
# This is the list of authors of tantivy for copyright purposes.
Paul Masurel
Laurentiu Nicola
Dru Sellers
Ashley Mannix
Michael J. Curry
Jason Wolfe
# As an employee of Google I am required to add Google LLC
# in the list of authors, but this project is not affiliated to Google
# in any other way.
Google LLC

CHANGELOG.md (28)
@@ -1,14 +1,34 @@
Tantivy 0.6.1
=========================
- Bugfix #324. GC was removing files that were still in use.
- Added support for parsing AllQuery and RangeQuery via QueryParser
  - AllQuery: `*`
  - RangeQuery:
    - Inclusive `field:[startIncl to endIncl]`
    - Exclusive `field:{startExcl to endExcl}`
    - Mixed `field:[startIncl to endExcl}` and vice versa
    - Unbounded `field:[start to *]`, `field:[* to end]`
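Editorial aside, not part of the diff: a minimal sketch of how the new range syntax could be exercised through QueryParser, modeled on the CountCollector doc-test further down this page. The field name, the documents, and the expected count are illustrative assumptions.

#[macro_use]
extern crate tantivy;
use tantivy::collector::CountCollector;
use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::{Index, Result};

fn main() { example().unwrap(); }

fn example() -> Result<()> {
    let mut schema_builder = SchemaBuilder::new();
    let title = schema_builder.add_text_field("title", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    {
        let mut index_writer = index.writer(3_000_000)?;
        index_writer.add_document(doc!(title => "aardvark"));
        index_writer.add_document(doc!(title => "zebra"));
        index_writer.commit().unwrap();
    }
    index.load_searchers()?;
    let searcher = index.searcher();
    let query_parser = QueryParser::for_index(&index, vec![title]);
    // Inclusive range over terms of the `title` field, using the new `field:[start to end]` syntax.
    let query = query_parser.parse_query("title:[a to b]")?;
    let mut count_collector = CountCollector::default();
    searcher.search(&*query, &mut count_collector)?;
    // Expectation (assumption): only "aardvark" sorts between "a" and "b".
    assert_eq!(count_collector.count(), 1);
    Ok(())
}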

Tantivy 0.6
==========================
- Removed C code. Tantivy is now pure Rust.
- BM25
- Approximate field norms encoded over 1 byte.
- Compiles on stable rust

Special thanks to @drusellers and @jason-wolfe for their contributions
to this release!

- Removed C code. Tantivy is now pure Rust. (@pmasurel)
- BM25 (@pmasurel)
- Approximate field norms encoded over 1 byte. (@pmasurel)
- Compiles on stable rust (@pmasurel)
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
  - Completely uncompressed
  - Internally: one u64 fast field for the indexes, one fast field for the bytes themselves (see the sketch after this list)
- Add NGram token support (@drusellers)
- Add Stopword Filter support (@drusellers)
- Add a FuzzyTermQuery (@drusellers)
- Add a RegexQuery (@drusellers)
- Various performance improvements (@pmasurel)
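Editorial illustration, not tantivy's code: the two-array layout that the bytes fast field bullet above describes (a u64 fast field of start offsets plus a flat byte array) might look roughly like this. The struct and field names are made up.

// Hypothetical stand-in: offsets[doc] .. offsets[doc + 1] delimits the slice of
// `bytes` that belongs to document `doc`; the bytes are stored uncompressed.
struct BytesColumn {
    offsets: Vec<u64>, // one entry per doc, plus a final end offset
    bytes: Vec<u8>,    // all documents' payloads, concatenated
}

impl BytesColumn {
    fn get(&self, doc: usize) -> &[u8] {
        let start = self.offsets[doc] as usize;
        let end = self.offsets[doc + 1] as usize;
        &self.bytes[start..end]
    }
}

fn main() {
    let col = BytesColumn {
        offsets: vec![0, 3, 3, 7],
        bytes: b"abcwxyz".to_vec(),
    };
    assert_eq!(col.get(0), &b"abc"[..]); // doc 0 -> "abc"
    assert_eq!(col.get(1), &b""[..]);    // doc 1 -> empty payload
    assert_eq!(col.get(2), &b"wxyz"[..]);
}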

Tantivy 0.5.2
===========================

Cargo.toml (17)
@@ -1,10 +1,10 @@
[package]
name = "tantivy"
version = "0.6.0-dev"
version = "0.6.1"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Tantivy is a search engine library."""
description = """Search engine library"""
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/tantivy-search/tantivy"
@@ -18,7 +18,10 @@ lazy_static = "0.2.1"
tinysegmenter = "0.1.0"
regex = "0.2"
fst = {version="0.3", default-features=false}
atomicwrites = {version="0.1", optional=true}
fst-regex = { version="0.2" }
lz4 = {version="1.20", optional=true}
snap = {version="0.2"}
atomicwrites = {version="0.2.2", optional=true}
tempfile = "2.1"
log = "0.3.6"
combine = "2.2"
@@ -29,7 +32,6 @@ serde_json = "1.0"
num_cpus = "1.2"
itertools = "0.5.9"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
lz4 = "1.20"
bit-set = "0.4.0"
uuid = { version = "0.6", features = ["v4", "serde"] }
chan = "0.1"
@@ -42,8 +44,10 @@ stable_deref_trait = "1.0.0"
rust-stemmers = "0.1.0"
downcast = { version="0.9" }
matches = "0.1"
bitpacking = "0.4"
bitpacking = "0.5"
census = "0.1"
fnv = "1.0.6"
owned-read = "0.1"

[target.'cfg(windows)'.dependencies]
winapi = "0.2"
@@ -60,9 +64,8 @@ debug-assertions = false

[features]
default = ["mmap"]
simd = ["bitpacking/simd"]
mmap = ["fst/mmap", "atomicwrites"]
unstable = ["simd"]
lz4-compression = ["lz4"]

[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

LICENSE (2)
@@ -1,4 +1,4 @@
Copyright (c) 2018 by Paul Masurel, Google LLC
Copyright (c) 2018 by the project authors, as listed in the AUTHORS file.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

README.md (66)
@@ -4,36 +4,50 @@
[](https://coveralls.io/github/tantivy-search/tantivy?branch=master)
[](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[](https://opensource.org/licenses/MIT)
[](https://ci.appveyor.com/project/fulmicoton/tantivy)
[](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)

**Tantivy** is a **full text search engine library** written in rust.

It is strongly inspired by Lucene's design.
It is closer to Lucene than to Elastic Search and Solr in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
to build such a search engine.

Tantivy is, in fact, strongly inspired by Lucene's design.

# Features

- Full-text search
- Tiny startup time (<10ms), perfect for command line tools
- tf-idf scoring
- Basic query language
- Phrase queries
- BM25 scoring (the same as lucene)
- Basic query language (`+michael +jackson`)
- Phrase queries search (`"michael jackson"`)
- Incremental indexing
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
- Mmap directory
- optional SIMD integer compression
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- LZ4 compressed document store
- Range queries
- Faceting
- configurable indexing (optional term frequency and position indexing)
- Faceted search
- Configurable indexing (optional term frequency and position indexing)
- Cheesy logo with a horse

Tantivy supports Linux, MacOS and Windows.
# Non-features

- Distributed search is not, and will not be, in the scope of tantivy.

# Supported OS and compiler

Tantivy works on stable rust (>= 1.27) and supports Linux, MacOS and Windows.

# Getting started

- [tantivy's usage example](http://fulmicoton.com/tantivy-examples/simple_search.html)
- [tantivy's simple search example](http://fulmicoton.com/tantivy-examples/simple_search.html)
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli).
`tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
index documents and search via the CLI or a small server with a REST API.
It will walk you through getting a wikipedia search engine up and running in a few minutes.
- [reference doc]
- [For the last released version](https://docs.rs/tantivy/)
@@ -43,40 +57,14 @@ It will walk you through getting a wikipedia search engine up and running in a f

## Development

Tantivy now compiles on stable rust.
To check out and run tests, you can simply run:
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
To check out and run tests, you can simply run:

    git clone git@github.com:tantivy-search/tantivy.git
    cd tantivy
    cargo build

## Note on release build and performance

If your project depends on `tantivy`, for better performance, make sure to enable
`sse3` instructions using RUSTFLAGS. (This instruction set is likely to
be available on most `x86_64` CPUs you will encounter).

For instance,

    RUSTFLAGS='-C target-feature=+sse3'

Or, if you are targeting a specific cpu

    RUSTFLAGS='-C target-cpu=native' cargo build --release

Regardless of the flags you pass, by default `tantivy` will contain `SSE3` instructions.
If you want to disable those, you can run the following command:

    cargo build --no-default-features

Alternatively, if you are trying to compile `tantivy` without simd compression,
you can disable this functionality. In this case, this submodule is not required
and you can compile tantivy by using the `--no-default-features` flag.

    cargo build --no-default-features

# Contribute

Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.

@@ -4,11 +4,8 @@
os: Visual Studio 2015
environment:
matrix:
- channel: nightly
- channel: stable
target: x86_64-pc-windows-msvc
- channel: nightly
target: x86_64-pc-windows-gnu
msys_bits: 64

install:
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe

ci/before_deploy.ps1 (23, new file)
@@ -0,0 +1,23 @@
# This script takes care of packaging the build artifacts that will go in the
# release zipfile

$SRC_DIR = $PWD.Path
$STAGE = [System.Guid]::NewGuid().ToString()

Set-Location $ENV:Temp
New-Item -Type Directory -Name $STAGE
Set-Location $STAGE

$ZIP = "$SRC_DIR\$($Env:CRATE_NAME)-$($Env:APPVEYOR_REPO_TAG_NAME)-$($Env:TARGET).zip"

# TODO Update this to package the right artifacts
Copy-Item "$SRC_DIR\target\$($Env:TARGET)\release\hello.exe" '.\'

7z a "$ZIP" *

Push-AppveyorArtifact "$ZIP"

Remove-Item *.* -Force
Set-Location ..
Remove-Item $STAGE
Set-Location $SRC_DIR

ci/before_deploy.sh (33, new file)
@@ -0,0 +1,33 @@
# This script takes care of building your crate and packaging it for release

set -ex

main() {
local src=$(pwd) \
stage=

case $TRAVIS_OS_NAME in
linux)
stage=$(mktemp -d)
;;
osx)
stage=$(mktemp -d -t tmp)
;;
esac

test -f Cargo.lock || cargo generate-lockfile

# TODO Update this to build the artifacts that matter to you
cross rustc --bin hello --target $TARGET --release -- -C lto

# TODO Update this to package the right artifacts
cp target/$TARGET/release/hello $stage/

cd $stage
tar czf $src/$CRATE_NAME-$TRAVIS_TAG-$TARGET.tar.gz *
cd $src

rm -rf $stage
}

main

ci/install.sh (47, new file)
@@ -0,0 +1,47 @@
set -ex

main() {
local target=
if [ $TRAVIS_OS_NAME = linux ]; then
target=x86_64-unknown-linux-musl
sort=sort
else
target=x86_64-apple-darwin
sort=gsort # for `sort --sort-version`, from brew's coreutils.
fi

# Builds for iOS are done on OSX, but require the specific target to be
# installed.
case $TARGET in
aarch64-apple-ios)
rustup target install aarch64-apple-ios
;;
armv7-apple-ios)
rustup target install armv7-apple-ios
;;
armv7s-apple-ios)
rustup target install armv7s-apple-ios
;;
i386-apple-ios)
rustup target install i386-apple-ios
;;
x86_64-apple-ios)
rustup target install x86_64-apple-ios
;;
esac

# This fetches latest stable release
local tag=$(git ls-remote --tags --refs --exit-code https://github.com/japaric/cross \
| cut -d/ -f3 \
| grep -E '^v[0.1.0-9.]+$' \
| $sort --version-sort \
| tail -n1)
curl -LSfs https://japaric.github.io/trust/install.sh | \
sh -s -- \
--force \
--git japaric/cross \
--tag $tag \
--target $target
}

main

ci/script.sh (23, new file)
@@ -0,0 +1,23 @@
# This script takes care of testing your crate

set -ex

main() {
cross build --target $TARGET
cross build --target $TARGET --release

if [ ! -z $DISABLE_TESTS ]; then
return
fi

cross test --target $TARGET
# cross test --target $TARGET --release

# cross run --target $TARGET
# cross run --target $TARGET --release
}

# we don't run the "test phase" when doing deploys
if [ -z $TRAVIS_TAG ]; then
main
fi

@@ -61,7 +61,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
//
// This will actually just save a meta.json
// with our schema in the directory.
let index = Index::create(index_path, schema.clone())?;
let index = Index::create_in_dir(index_path, schema.clone())?;

// here we are registering our custom tokenizer
// this will store tokens of 3 characters each

@@ -64,7 +64,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
//
// This will actually just save a meta.json
// with our schema in the directory.
let index = Index::create(index_path, schema.clone())?;
let index = Index::create_in_dir(index_path, schema.clone())?;

// To insert document we need an index writer.
// There must be only one writer at a time.

@@ -4,87 +4,111 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;
use collector::CollectorWrapper;

/// Collector that does nothing.
/// This is used in the chain Collector and will hopefully
/// be optimized away by the compiler.
pub struct DoNothingCollector;
impl Collector for DoNothingCollector {
type Child = DoNothingCollector;
#[inline]
fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<DoNothingCollector> {
Ok(DoNothingCollector)
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}
#[inline]
fn collect(&mut self, _doc: DocId, _score: Score) {}
#[inline]
fn requires_scoring(&self) -> bool {
false
}
}

impl SegmentCollector for DoNothingCollector {
type CollectionResult = ();

#[inline]
fn collect(&mut self, _doc: DocId, _score: Score) {}

fn finalize(self) -> () {
()
}
}

/// Zero-cost abstraction used to collect on multiple collectors.
/// This contraption is only usable if the type of your collectors
/// are known at compile time.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::{CountCollector, TopCollector, chain};
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 2);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
pub struct ChainedCollector<Left: Collector, Right: Collector> {
left: Left,
right: Right,
}

pub struct ChainedSegmentCollector<Left: SegmentCollector, Right: SegmentCollector> {
left: Left,
right: Right,
}

impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
/// Adds a collector
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, CollectorWrapper<C>> {
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
ChainedCollector {
left: self,
right: CollectorWrapper::new(new_collector),
right: new_collector,
}
}
}

impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
type Child = ChainedSegmentCollector<Left::Child, Right::Child>;
fn for_segment(
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<Self::Child> {
Ok(ChainedSegmentCollector {
left: self.left.for_segment(segment_local_id, segment)?,
right: self.right.for_segment(segment_local_id, segment)?,
})
) -> Result<()> {
self.left.set_segment(segment_local_id, segment)?;
self.right.set_segment(segment_local_id, segment)?;
Ok(())
}

fn requires_scoring(&self) -> bool {
self.left.requires_scoring() || self.right.requires_scoring()
}
}

impl<Left: SegmentCollector, Right: SegmentCollector> SegmentCollector for ChainedSegmentCollector<Left, Right> {
type CollectionResult = (Left::CollectionResult, Right::CollectionResult);

fn collect(&mut self, doc: DocId, score: Score) {
self.left.collect(doc, score);
self.right.collect(doc, score);
}

fn finalize(self) -> Self::CollectionResult {
(self.left.finalize(), self.right.finalize())
fn requires_scoring(&self) -> bool {
self.left.requires_scoring() || self.right.requires_scoring()
}
}

@@ -98,35 +122,19 @@ pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {

#[cfg(test)]
mod tests {

use super::*;
use collector::{CountCollector, SegmentCollector, TopCollector};
use schema::SchemaBuilder;
use Index;
use Document;
use collector::{Collector, CountCollector, TopCollector};

#[test]
fn test_chained_collector() {
let schema_builder = SchemaBuilder::new();
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

let mut index_writer = index.writer(3_000_000).unwrap();
let doc = Document::new();
index_writer.add_document(doc);
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_readers = searcher.segment_readers();

let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
let mut segment_collector = collectors.for_segment(0, &segment_readers[0]).unwrap();
segment_collector.collect(1, 0.2);
segment_collector.collect(2, 0.1);
segment_collector.collect(3, 0.5);
collectors.merge_children(vec![segment_collector]);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);
}
assert_eq!(count_collector.count(), 3);
assert!(top_collector.at_capacity());

@@ -4,11 +4,56 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;
use collector::Combinable;

/// `CountCollector` collector only counts how many
/// documents match the query.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::CountCollector;
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut count_collector = CountCollector::default();
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut count_collector).unwrap();
///
/// assert_eq!(count_collector.count(), 2);
/// }
///
/// Ok(())
/// }
/// ```
#[derive(Default)]
pub struct CountCollector {
count: usize,
@@ -23,10 +68,12 @@ impl CountCollector {
}

impl Collector for CountCollector {
type Child = CountCollector;
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}

fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<CountCollector> {
Ok(CountCollector::default())
fn collect(&mut self, _: DocId, _: Score) {
self.count += 1;
}

fn requires_scoring(&self) -> bool {
@@ -34,28 +81,10 @@ impl Collector for CountCollector {
}
}

impl Combinable for CountCollector {
fn combine_into(&mut self, other: Self) {
self.count += other.count;
}
}

impl SegmentCollector for CountCollector {
type CollectionResult = CountCollector;

fn collect(&mut self, _: DocId, _: Score) {
self.count += 1;
}

fn finalize(self) -> CountCollector {
self
}
}

#[cfg(test)]
mod tests {

use collector::{Collector, CountCollector, SegmentCollector};
use collector::{Collector, CountCollector};

#[test]
fn test_count_collector() {

@@ -3,12 +3,14 @@ use docset::SkipResult;
use fastfield::FacetReader;
use schema::Facet;
use schema::Field;
use std::cell::UnsafeCell;
use std::collections::btree_map;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::BinaryHeap;
use std::collections::Bound;
use std::iter::Peekable;
use std::mem;
use std::{u64, usize};
use termdict::TermMerger;

@@ -18,7 +20,6 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;

struct Hit<'a> {
count: u64,
@@ -193,22 +194,19 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// }
/// ```
pub struct FacetCollector {
facet_ords: Vec<u64>,
field: Field,
ff_reader: Option<UnsafeCell<FacetReader>>,
segment_counters: Vec<SegmentFacetCounter>,
facets: BTreeSet<Facet>,
}

pub struct FacetSegmentCollector {
reader: FacetReader,

facet_ords_buf: Vec<u64>,

// facet_ord -> collapse facet_id
collapse_mapping: Vec<usize>,
current_segment_collapse_mapping: Vec<usize>,
// collapse facet_id -> count
counts: Vec<u64>,
current_segment_counts: Vec<u64>,
// collapse facet_id -> facet_ord
collapse_facet_ords: Vec<u64>,
current_collapse_facet_ords: Vec<u64>,

facets: BTreeSet<Facet>,
}

fn skip<'a, I: Iterator<Item = &'a Facet>>(
@@ -242,9 +240,15 @@ impl FacetCollector {
/// is of the proper type.
pub fn for_field(field: Field) -> FacetCollector {
FacetCollector {
facet_ords: Vec::with_capacity(255),
segment_counters: Vec::new(),
field,
ff_reader: None,
facets: BTreeSet::new(),

current_segment_collapse_mapping: Vec::new(),
current_collapse_facet_ords: Vec::new(),
current_segment_counts: Vec::new(),
}
}

@@ -275,11 +279,69 @@ impl FacetCollector {
self.facets.insert(facet);
}

fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
self.current_segment_collapse_mapping.clear();
self.current_collapse_facet_ords.clear();
self.current_segment_counts.clear();
let mut collapse_facet_it = self.facets.iter().peekable();
self.current_collapse_facet_ords.push(0);
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
if !facet_streamer.advance() {
return;
}
'outer: loop {
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Reached => {
// we reach a facet we decided to collapse.
let collapse_depth = facet_depth(facet_streamer.key());
let mut collapsed_id = 0;
self.current_segment_collapse_mapping.push(0);
while facet_streamer.advance() {
let depth = facet_depth(facet_streamer.key());
if depth <= collapse_depth {
continue 'outer;
}
if depth == collapse_depth + 1 {
collapsed_id = self.current_collapse_facet_ords.len();
self.current_collapse_facet_ords
.push(facet_streamer.term_ord());
self.current_segment_collapse_mapping.push(collapsed_id);
} else {
self.current_segment_collapse_mapping.push(collapsed_id);
}
}
break;
}
SkipResult::End | SkipResult::OverStep => {
self.current_segment_collapse_mapping.push(0);
if !facet_streamer.advance() {
break;
}
}
}
}
}

fn finalize_segment(&mut self) {
if self.ff_reader.is_some() {
self.segment_counters.push(SegmentFacetCounter {
facet_reader: self.ff_reader.take().unwrap().into_inner(),
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
});
}
}

/// Returns the results of the collection.
///
/// This method does not just return the counters,
/// it also translates the facet ordinals of the last segment.
pub fn harvest(self) -> FacetCounts {
pub fn harvest(mut self) -> FacetCounts {
self.finalize_segment();

let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
.iter()
.map(|segment_counter| &segment_counter.facet_ords[..])
@@ -327,92 +389,30 @@ impl FacetCollector {
}
}

impl FacetSegmentCollector {
fn into_segment_facet_counter(self) -> SegmentFacetCounter {
SegmentFacetCounter {
facet_reader: self.reader,
facet_ords: self.collapse_facet_ords,
facet_counts: self.counts,
}
}
}

impl Collector for FacetCollector {
type Child = FacetSegmentCollector;

fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<FacetSegmentCollector> {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.finalize_segment();
let facet_reader = reader.facet_reader(self.field)?;

let mut collapse_mapping = Vec::new();
let mut counts = Vec::new();
let mut collapse_facet_ords = Vec::new();

let mut collapse_facet_it = self.facets.iter().peekable();
collapse_facet_ords.push(0);
{
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
if facet_streamer.advance() {
'outer: loop {
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Reached => {
// we reach a facet we decided to collapse.
let collapse_depth = facet_depth(facet_streamer.key());
let mut collapsed_id = 0;
collapse_mapping.push(0);
while facet_streamer.advance() {
let depth = facet_depth(facet_streamer.key());
if depth <= collapse_depth {
continue 'outer;
}
if depth == collapse_depth + 1 {
collapsed_id = collapse_facet_ords.len();
collapse_facet_ords.push(facet_streamer.term_ord());
collapse_mapping.push(collapsed_id);
} else {
collapse_mapping.push(collapsed_id);
}
}
break;
}
SkipResult::End | SkipResult::OverStep => {
collapse_mapping.push(0);
if !facet_streamer.advance() {
break;
}
}
}
}
}
}

counts.resize(collapse_facet_ords.len(), 0);

Ok(FacetSegmentCollector {
reader: facet_reader,
facet_ords_buf: Vec::with_capacity(255),
collapse_mapping,
counts,
collapse_facet_ords,
})
self.set_collapse_mapping(&facet_reader);
self.current_segment_counts
.resize(self.current_collapse_facet_ords.len(), 0);
self.ff_reader = Some(UnsafeCell::new(facet_reader));
Ok(())
}

fn requires_scoring(&self) -> bool {
false
}
}

impl SegmentCollector for FacetSegmentCollector {
type CollectionResult = Vec<SegmentFacetCounter>;

fn collect(&mut self, doc: DocId, _: Score) {
self.reader.facet_ords(doc, &mut self.facet_ords_buf);
let facet_reader: &mut FacetReader = unsafe {
&mut *self.ff_reader
.as_ref()
.expect("collect() was called before set_segment. This should never happen.")
.get()
};
facet_reader.facet_ords(doc, &mut self.facet_ords);
let mut previous_collapsed_ord: usize = usize::MAX;
for &facet_ord in &self.facet_ords_buf {
let collapsed_ord = self.collapse_mapping[facet_ord as usize];
self.counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord {
for &facet_ord in &self.facet_ords {
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
{
0
} else {
1
@@ -421,8 +421,8 @@ impl SegmentCollector for FacetSegmentCollector {
}
}

fn finalize(self) -> Vec<SegmentFacetCounter> {
vec![self.into_segment_facet_counter()]
fn requires_scoring(&self) -> bool {
false
}
}

@@ -507,7 +507,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| {
@@ -587,7 +587,7 @@ mod tests {
.collect();
thread_rng().shuffle(&mut docs[..]);

let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc);
}
@@ -644,7 +644,7 @@ mod bench {
// 40425 docs
thread_rng().shuffle(&mut docs[..]);

let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc);
}

@@ -7,15 +7,12 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use query::Query;
use Searcher;
use downcast;

mod count_collector;
pub use self::count_collector::CountCollector;

//mod multi_collector;
//pub use self::multi_collector::MultiCollector;
mod multi_collector;
pub use self::multi_collector::MultiCollector;

mod top_collector;
pub use self::top_collector::TopCollector;
@@ -24,7 +21,7 @@ mod facet_collector;
pub use self::facet_collector::FacetCollector;

mod chained_collector;
pub use self::chained_collector::chain;
pub use self::chained_collector::{chain, ChainedCollector};

/// Collectors are in charge of collecting and retaining relevant
/// information from the document found and scored by the query.
@@ -56,90 +53,31 @@ pub use self::chained_collector::chain;
///
/// Segments are not guaranteed to be visited in any specific order.
pub trait Collector {
type Child : SegmentCollector + 'static;
/// `set_segment` is called before beginning to enumerate
/// on this segment.
fn for_segment(
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<Self::Child>;

/// Returns true iff the collector requires to compute scores for documents.
fn requires_scoring(&self) -> bool;

/// Search works as follows :
///
/// First the weight object associated to the query is created.
///
/// Then, the query loops over the segments and for each segment :
/// - setup the collector and informs it that the segment being processed has changed.
/// - creates a SegmentCollector for collecting documents associated to the segment
/// - creates a `Scorer` object associated for this segment
/// - iterate through the matched documents and push them to the segment collector.
/// - turn the segment collector into a Combinable segment result
///
/// Combining all of the segment results gives a single Child::CollectionResult, which is returned.
///
/// The result will be Ok(None) in case of having no segments.
fn search(&mut self, searcher: &Searcher, query: &Query) -> Result<Option<<Self::Child as SegmentCollector>::CollectionResult>> {
let scoring_enabled = self.requires_scoring();
let weight = query.weight(searcher, scoring_enabled)?;
let mut results = Vec::new();
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
let mut child: Self::Child = self.for_segment(segment_ord as SegmentLocalId, segment_reader)?;
let mut scorer = weight.scorer(segment_reader)?;
scorer.collect(&mut child, segment_reader.delete_bitset());
results.push(child.finalize());
}
Ok(results.into_iter().fold1(|x,y| {
x.combine_into(y);
x
}))
}
}

pub trait Combinable {
fn combine_into(&mut self, other: Self);
}

impl Combinable for () {
fn combine_into(&mut self, other: Self) {
()
}
}

impl<T> Combinable for Vec<T> {
fn combine_into(&mut self, other: Self) {
self.extend(other.into_iter());
}
}

impl<L: Combinable, R: Combinable> Combinable for (L, R) {
fn combine_into(&mut self, other: Self) {
self.0.combine_into(other.0);
self.1.combine_into(other.1);
}
}
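Editorial aside, not part of the diff: a self-contained illustration of the combining step described in the `search` doc comment above: per-segment results are folded into a single result via `Combinable::combine_into`. The `Combinable` trait is copied from the hunk above; the `Count` type is a made-up stand-in for a collector's `CollectionResult`.

pub trait Combinable {
    fn combine_into(&mut self, other: Self);
}

// Made-up stand-in for a SegmentCollector::CollectionResult.
struct Count(usize);

impl Combinable for Count {
    fn combine_into(&mut self, other: Self) {
        self.0 += other.0;
    }
}

fn main() {
    // One result per segment, as produced by SegmentCollector::finalize.
    let per_segment = vec![Count(3), Count(0), Count(7)];
    // Fold the results pairwise, merging the right-hand result into the left-hand one.
    let total = per_segment.into_iter().fold(None, |acc: Option<Count>, next| {
        Some(match acc {
            Some(mut current) => {
                current.combine_into(next);
                current
            }
            None => next,
        })
    });
    // With at least one segment the fold yields Some(total); with no segments it stays
    // None, matching the Ok(None) case mentioned in the doc comment.
    assert_eq!(total.map(|count| count.0), Some(10));
}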

pub trait SegmentCollector: downcast::Any + 'static {
type CollectionResult: Combinable + downcast::Any + 'static;
) -> Result<()>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);

/// Turn into the final result
fn finalize(self) -> Self::CollectionResult;
/// Returns true iff the collector requires to compute scores for documents.
fn requires_scoring(&self) -> bool;
}

impl<'a, C: Collector> Collector for &'a mut C {
type Child = C::Child;

fn for_segment(
&mut self, // TODO Ask Jason : why &mut self here!?
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<C::Child> {
(*self).for_segment(segment_local_id, segment)
) -> Result<()> {
(*self).set_segment(segment_local_id, segment)
}
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score) {
C::collect(self, doc, score)
}

fn requires_scoring(&self) -> bool {
@@ -147,61 +85,6 @@ impl<'a, C: Collector> Collector for &'a mut C {
}
}

pub struct CollectorWrapper<'a, TCollector: 'a + Collector>(&'a mut TCollector);

impl<'a, T: 'a + Collector> CollectorWrapper<'a, T> {
pub fn new(collector: &'a mut T) -> CollectorWrapper<'a, T> {
CollectorWrapper(collector)
}
}

impl<'a, T: 'a + Collector> Collector for CollectorWrapper<'a, T> {
type Child = T::Child;

fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<T::Child> {
self.0.for_segment(segment_local_id, segment)
}

fn requires_scoring(&self) -> bool {
self.0.requires_scoring()
}
}

trait UntypedCollector {
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<Box<UntypedSegmentCollector>>;
}

impl<'a, TCollector:'a + Collector> UntypedCollector for CollectorWrapper<'a, TCollector> {
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<Box<UntypedSegmentCollector>> {
let segment_collector = self.0.for_segment(segment_local_id, segment)?;
Ok(Box::new(segment_collector))
}
}

trait UntypedSegmentCollector {
fn finalize(self) -> Box<UntypedCombinable>;
}

trait UntypedCombinable {
fn combine_into(&mut self, other: Box<UntypedCombinable>);
}

pub struct CombinableWrapper<'a, T: 'a + Combinable>(&'a mut T);

impl<'a, T: 'a + Combinable> CombinableWrapper<'a, T> {
pub fn new(combinable: &'a mut T) -> CombinableWrapper<'a, T> {
CombinableWrapper(combinable)
}
}

impl<'a, T: 'a + Combinable> Combinable for CombinableWrapper<'a, T> {
fn combine_into(&mut self, other: Self) {
self.0.combine_into(*::downcast::Downcast::<T>::downcast(other).unwrap())
}
}

#[cfg(test)]
pub mod tests {

@@ -219,13 +102,8 @@ pub mod tests {
/// It is unusable in practice, as it does not store
/// the segment ordinals
pub struct TestCollector {
next_offset: DocId,
docs: Vec<DocId>,
scores: Vec<Score>,
}

pub struct TestSegmentCollector {
offset: DocId,
segment_max_doc: DocId,
docs: Vec<DocId>,
scores: Vec<Score>,
}
@@ -244,7 +122,8 @@ pub mod tests {
impl Default for TestCollector {
fn default() -> TestCollector {
TestCollector {
next_offset: 0,
offset: 0,
segment_max_doc: 0,
docs: Vec::new(),
scores: Vec::new(),
}
@@ -252,33 +131,19 @@ pub mod tests {
}

impl Collector for TestCollector {
type Child = TestSegmentCollector;

fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<TestSegmentCollector> {
let offset = self.next_offset;
self.next_offset += reader.max_doc();
Ok(TestSegmentCollector {
offset,
docs: Vec::new(),
scores: Vec::new(),
})
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.offset += self.segment_max_doc;
self.segment_max_doc = reader.max_doc();
Ok(())
}

fn requires_scoring(&self) -> bool {
true
}
}

impl SegmentCollector for TestSegmentCollector {
type CollectionResult = Vec<TestSegmentCollector>;

fn collect(&mut self, doc: DocId, score: Score) {
self.docs.push(doc + self.offset);
self.scores.push(score);
}

fn finalize(self) -> Vec<TestSegmentCollector> {
vec![self]
fn requires_scoring(&self) -> bool {
true
}
}

@@ -287,26 +152,17 @@ pub mod tests {
///
/// This collector is mainly useful for tests.
pub struct FastFieldTestCollector {
next_counter: usize,
field: Field,
}

#[derive(Default)]
pub struct FastFieldSegmentCollectorState {
counter: usize,
vals: Vec<u64>,
}

pub struct FastFieldSegmentCollector {
state: FastFieldSegmentCollectorState,
reader: FastFieldReader<u64>,
field: Field,
ff_reader: Option<FastFieldReader<u64>>,
}

impl FastFieldTestCollector {
pub fn for_field(field: Field) -> FastFieldTestCollector {
FastFieldTestCollector {
next_counter: 0,
vals: Vec::new(),
field,
ff_reader: None,
}
}

@@ -316,32 +172,17 @@ pub mod tests {
}

impl Collector for FastFieldTestCollector {
type Child = FastFieldSegmentCollector;

fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<FastFieldSegmentCollector> {
let counter = self.next_counter;
self.next_counter += 1;
Ok(FastFieldSegmentCollector {
state: FastFieldSegmentCollectorState::default(),
reader: reader.fast_field_reader(self.field)?,
})
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
Ok(())
}

fn requires_scoring(&self) -> bool {
false
}
}

impl SegmentCollector for FastFieldSegmentCollector {
type CollectionResult = Vec<FastFieldSegmentCollectorState>;

fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.reader.get(doc);
let val = self.ff_reader.as_ref().unwrap().get(doc);
self.vals.push(val);
}

fn finalize(self) -> Vec<FastFieldSegmentCollectorState> {
vec![self.state]
fn requires_scoring(&self) -> bool {
false
}
}

@@ -352,11 +193,7 @@ pub mod tests {
pub struct BytesFastFieldTestCollector {
vals: Vec<u8>,
field: Field,
}

pub struct BytesFastFieldSegmentCollector {
vals: Vec<u8>,
reader: BytesFastFieldReader,
ff_reader: Option<BytesFastFieldReader>,
}

impl BytesFastFieldTestCollector {
@@ -364,6 +201,7 @@ pub mod tests {
BytesFastFieldTestCollector {
vals: Vec::new(),
field,
ff_reader: None,
}
}

@@ -373,32 +211,20 @@ pub mod tests {
}

impl Collector for BytesFastFieldTestCollector {
type Child = BytesFastFieldSegmentCollector;
fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> {
self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?);
Ok(())
}

fn for_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<BytesFastFieldSegmentCollector> {
Ok(BytesFastFieldSegmentCollector {
vals: Vec::new(),
reader: segment.bytes_fast_field_reader(self.field)?,
})
fn collect(&mut self, doc: u32, _score: f32) {
let val = self.ff_reader.as_ref().unwrap().get_val(doc);
self.vals.extend(val);
}

fn requires_scoring(&self) -> bool {
false
}
}

impl SegmentCollector for BytesFastFieldSegmentCollector {
type CollectionResult = Vec<Vec<u8>>;

fn collect(&mut self, doc: u32, _score: f32) {
let val = self.reader.get_val(doc);
self.vals.extend(val);
}

fn finalize(self) -> Vec<Vec<u8>> {
vec![self.vals]
}
}
}

#[cfg(all(test, feature = "unstable"))]

@@ -1,122 +1,119 @@
use super::Collector;
use super::SegmentCollector;
use DocId;
use Score;
use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use downcast::Downcast;

/// Multicollector makes it possible to collect on more than one collector.
/// It should only be used for use cases where the Collector types are unknown
/// at compile time.
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::{CountCollector, TopCollector, MultiCollector};
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors =
/// MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 2);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
pub struct MultiCollector<'a> {
collector_wrappers: Vec<Box<UntypedCollector + 'a>>
collectors: Vec<&'a mut Collector>,
}

impl<'a> MultiCollector<'a> {
pub fn new() -> MultiCollector<'a> {
MultiCollector {
collector_wrappers: Vec::new()
}
}

pub fn add_collector<TCollector: 'a + Collector>(&mut self, collector: &'a mut TCollector) {
let collector_wrapper = CollectorWrapper(collector);
self.collector_wrappers.push(Box::new(collector_wrapper));
/// Constructor
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
MultiCollector { collectors }
}
}

impl<'a> Collector for MultiCollector<'a> {

type Child = MultiCollectorChild;

fn for_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<MultiCollectorChild> {
let children = self.collector_wrappers
.iter_mut()
.map(|collector_wrapper| {
collector_wrapper.for_segment(segment_local_id, segment)
})
.collect::<Result<Vec<_>>>()?;
Ok(MultiCollectorChild {
children
})
}

fn requires_scoring(&self) -> bool {
self.collector_wrappers
.iter()
.any(|c| c.requires_scoring())
}

fn merge_children(&mut self, children: Vec<MultiCollectorChild>) {
let mut per_collector_children: Vec<Vec<Box<SegmentCollector>>> =
(0..self.collector_wrappers.len())
.map(|_| Vec::with_capacity(children.len()))
.collect::<Vec<_>>();
for child in children {
for (idx, segment_collector) in child.children.into_iter().enumerate() {
per_collector_children[idx].push(segment_collector);
}
}
for (collector, children) in self.collector_wrappers.iter_mut().zip(per_collector_children) {
collector.merge_children_anys(children);
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
for collector in &mut self.collectors {
collector.set_segment(segment_local_id, segment)?;
}
Ok(())
}

}

pub struct MultiCollectorChild {
children: Vec<Box<SegmentCollector>>
}

impl SegmentCollector for MultiCollectorChild {
fn collect(&mut self, doc: DocId, score: Score) {
for child in &mut self.children {
child.collect(doc, score);
for collector in &mut self.collectors {
collector.collect(doc, score);
}
}
fn requires_scoring(&self) -> bool {
self.collectors
.iter()
.any(|collector| collector.requires_scoring())
}
}

#[cfg(test)]
mod tests {

use super::*;
use collector::{Collector, CountCollector, TopCollector};
use schema::{TEXT, SchemaBuilder};
use query::TermQuery;
use Index;
use Term;
use schema::IndexRecordOption;

#[test]
fn test_multi_collector() {
let mut schema_builder = SchemaBuilder::new();
let text = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text=>"abc"));
|
||||
index_writer.add_document(doc!(text=>"abc abc abc"));
|
||||
index_writer.add_document(doc!(text=>"abc abc"));
|
||||
index_writer.commit().unwrap();
|
||||
index_writer.add_document(doc!(text=>""));
|
||||
index_writer.add_document(doc!(text=>"abc abc abc abc"));
|
||||
index_writer.add_document(doc!(text=>"abc"));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let term = Term::from_field_text(text, "abc");
|
||||
let query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
let mut top_collector = TopCollector::with_limit(2);
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut collectors = MultiCollector::new();
|
||||
collectors.add_collector(&mut top_collector);
|
||||
collectors.add_collector(&mut count_collector);
|
||||
collectors.search(&*searcher, &query).unwrap();
|
||||
let mut collectors =
|
||||
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
}
|
||||
assert_eq!(count_collector.count(), 5);
|
||||
assert_eq!(count_collector.count(), 3);
|
||||
assert!(top_collector.at_capacity());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,8 +7,6 @@ use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use collector::SegmentCollector;
|
||||
use collector::Combinable;
|
||||
|
||||
// Rust heap is a max-heap and we need a min heap.
|
||||
#[derive(Clone, Copy)]
|
||||
@@ -45,7 +43,61 @@ impl Eq for GlobalScoredDoc {}
|
||||
/// with the best scores.
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theorical complexity is `O(n log K)`.
|
||||
/// The theorical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result, DocId, Score};
|
||||
/// use tantivy::collector::TopCollector;
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut top_collector).unwrap();
|
||||
///
|
||||
/// let score_docs: Vec<(Score, DocId)> = top_collector
|
||||
/// .score_docs()
|
||||
/// .into_iter()
|
||||
/// .map(|(score, doc_address)| (score, doc_address.doc()))
|
||||
/// .collect();
|
||||
///
|
||||
/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct TopCollector {
|
||||
limit: usize,
|
||||
heap: BinaryHeap<GlobalScoredDoc>,
|
||||
@@ -101,34 +153,11 @@ impl TopCollector {
|
||||
}
|
||||
|
||||
impl Collector for TopCollector {
|
||||
type Child = TopCollector;
|
||||
|
||||
fn for_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<TopCollector> {
|
||||
Ok(TopCollector {
|
||||
limit: self.limit,
|
||||
heap: BinaryHeap::new(),
|
||||
segment_id,
|
||||
})
|
||||
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
self.segment_id = segment_id;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl Combinable for TopCollector {
|
||||
// TODO: I think this could be a bit better
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
self.segment_id = other.segment_id;
|
||||
while let Some(doc) = other.heap.pop() {
|
||||
self.collect(doc.doc_address.doc(), doc.score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for TopCollector {
|
||||
type CollectionResult = TopCollector;
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
@@ -151,8 +180,8 @@ impl SegmentCollector for TopCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn finalize(self) -> TopCollector {
|
||||
self
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
@@ -160,6 +189,7 @@ impl SegmentCollector for TopCollector {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::Collector;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
@@ -210,4 +240,5 @@ mod tests {
|
||||
fn test_top_0() {
|
||||
TopCollector::with_limit(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -46,7 +46,7 @@ impl BitPacker {
|
||||
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
if self.mini_buffer_written > 0 {
|
||||
let num_bytes = (self.mini_buffer_written + 7) / 8;
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
|
||||
output.write_all(&arr[..num_bytes])?;
|
||||
self.mini_buffer_written = 0;
|
||||
}
|
||||
@@ -98,31 +98,14 @@ where
|
||||
let addr_in_bits = idx * num_bits;
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
if cfg!(feature = "simdcompression") {
|
||||
// for simdcompression,
|
||||
// the bitpacker is only used for fastfields,
|
||||
// and we expect them to be always padded.
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 =
|
||||
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
} else {
|
||||
let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
|
||||
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }
|
||||
} else {
|
||||
let mut buffer = [0u8; 8];
|
||||
for i in addr..data.len() {
|
||||
buffer[i - addr] += data[i];
|
||||
}
|
||||
unsafe { ptr::read_unaligned(buffer[..].as_ptr() as *const u64) }
|
||||
};
|
||||
let val_shifted = val_unshifted_unmasked >> (bit_shift as u64);
|
||||
val_shifted & mask
|
||||
}
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 =
|
||||
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
}
|
||||
|
||||
/// Reads a range of values from the fast field.
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use compression::compressed_block_size;
|
||||
use compression::BlockDecoder;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use directory::ReadOnlySource;
|
||||
use owned_read::OwnedRead;
|
||||
|
||||
/// Reads a stream of compressed ints.
|
||||
///
|
||||
@@ -10,7 +11,7 @@ use directory::{ReadOnlySource, SourceRead};
|
||||
/// The `.skip(...)` makes it possible to avoid
|
||||
/// decompressing blocks that are not required.
|
||||
pub struct CompressedIntStream {
|
||||
buffer: SourceRead,
|
||||
buffer: OwnedRead,
|
||||
|
||||
block_decoder: BlockDecoder,
|
||||
cached_addr: usize, // address of the currently decoded block
|
||||
@@ -24,7 +25,7 @@ impl CompressedIntStream {
|
||||
/// Opens a compressed int stream.
|
||||
pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream {
|
||||
CompressedIntStream {
|
||||
buffer: SourceRead::from(source),
|
||||
buffer: OwnedRead::new(source),
|
||||
block_decoder: BlockDecoder::new(),
|
||||
cached_addr: usize::max_value(),
|
||||
cached_next_addr: usize::max_value(),
|
||||
|
||||
@@ -21,6 +21,7 @@ use directory::ManagedDirectory;
|
||||
use directory::MmapDirectory;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use indexer::index_writer::open_index_writer;
|
||||
use indexer::index_writer::HEAP_SIZE_MIN;
|
||||
use indexer::segment_updater::save_new_metas;
|
||||
use indexer::DirectoryLock;
|
||||
use num_cpus;
|
||||
@@ -51,12 +52,7 @@ impl Index {
|
||||
/// This should only be used for unit tests.
|
||||
pub fn create_in_ram(schema: Schema) -> Index {
|
||||
let ram_directory = RAMDirectory::create();
|
||||
// unwrap is ok here
|
||||
let directory = ManagedDirectory::new(ram_directory).expect(
|
||||
"Creating a managed directory from a brand new RAM directory \
|
||||
should never fail.",
|
||||
);
|
||||
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
|
||||
Index::create(ram_directory, schema).expect("Creating a RAMDirectory should never fail")
|
||||
}
|
||||
|
||||
/// Creates a new index in a given filepath.
|
||||
@@ -64,15 +60,9 @@ impl Index {
|
||||
///
|
||||
/// If a previous index was in this directory, then its meta file will be destroyed.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
|
||||
pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
/// Accessor for the tokenizer manager.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
&self.tokenizers
|
||||
Index::create(mmap_directory, schema)
|
||||
}
|
||||
|
||||
/// Creates a new index in a temp directory.
|
||||
@@ -86,10 +76,22 @@ impl Index {
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::create_from_tempdir()?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
Index::create(mmap_directory, schema)
|
||||
}
|
||||
|
||||
/// Creates a new index given an implementation of the trait `Directory`
|
||||
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(dir)?;
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
/// Create a new index from a directory.
|
||||
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
/// Creates a new index given a directory and an `IndexMeta`.
|
||||
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
|
||||
let schema = metas.schema.clone();
|
||||
@@ -103,24 +105,22 @@ impl Index {
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
/// Open the index using the provided directory
|
||||
pub fn open_directory<D: Directory>(directory: D) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
/// Accessor for the tokenizer manager.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
&self.tokenizers
|
||||
}
|
||||
|
||||
/// Opens a new directory from an index path.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
Index::open_directory(mmap_directory)
|
||||
Index::open(mmap_directory)
|
||||
}
|
||||
|
||||
/// Create a new index from a directory.
|
||||
pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
/// Open the index using the provided directory
|
||||
pub fn open<D: Directory>(directory: D) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
@@ -137,9 +137,13 @@ impl Index {
|
||||
/// `IndexWriter` on the system is accessing the index directory,
|
||||
/// it is safe to manually delete the lockfile.
|
||||
///
|
||||
/// num_threads specifies the number of indexing workers that
|
||||
/// - `num_threads` defines the number of indexing workers that
|
||||
/// should work at the same time.
|
||||
///
|
||||
/// - `overall_heap_size_in_bytes` sets the amount of memory
|
||||
/// allocated for all indexing thread.
|
||||
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
|
||||
///
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
@@ -147,21 +151,35 @@ impl Index {
|
||||
pub fn writer_with_num_threads(
|
||||
&self,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes: usize,
|
||||
overall_heap_size_in_bytes: usize,
|
||||
) -> Result<IndexWriter> {
|
||||
let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
|
||||
open_index_writer(self, num_threads, heap_size_in_bytes, directory_lock)
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
open_index_writer(
|
||||
self,
|
||||
num_threads,
|
||||
heap_size_in_bytes_per_thread,
|
||||
directory_lock,
|
||||
)
|
||||
}
|
||||
|
||||
/// Creates a multithreaded writer
|
||||
/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
|
||||
///
|
||||
/// Tantivy will automatically define the number of threads to use.
|
||||
/// `overall_heap_size_in_bytes` is the total target memory usage that will be split
|
||||
/// between a given number of threads.
|
||||
///
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn writer(&self, heap_size_in_bytes: usize) -> Result<IndexWriter> {
|
||||
self.writer_with_num_threads(num_cpus::get(), heap_size_in_bytes)
|
||||
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> Result<IndexWriter> {
|
||||
let mut num_threads = num_cpus::get();
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
num_threads = (overall_heap_size_in_bytes / HEAP_SIZE_MIN).max(1);
|
||||
}
|
||||
self.writer_with_num_threads(num_threads, overall_heap_size_in_bytes)
|
||||
}
|
||||
|
||||
/// Accessor to the index schema
|
||||
@@ -186,8 +204,8 @@ impl Index {
|
||||
|
||||
/// Creates a new segment.
|
||||
pub fn new_segment(&self) -> Segment {
|
||||
let segment_meta = SegmentMeta::new(SegmentId::generate_random());
|
||||
create_segment(self.clone(), segment_meta)
|
||||
let segment_meta = SegmentMeta::new(SegmentId::generate_random(), 0);
|
||||
self.segment(segment_meta)
|
||||
}
|
||||
|
||||
/// Return a reference to the index directory.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use common::BinarySerializable;
|
||||
use compression::CompressedIntStream;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use directory::ReadOnlySource;
|
||||
use postings::FreqReadingOption;
|
||||
use postings::TermInfo;
|
||||
use postings::{BlockSegmentPostings, SegmentPostings};
|
||||
@@ -8,6 +8,7 @@ use schema::FieldType;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use termdict::TermDictionary;
|
||||
use owned_read::OwnedRead;
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
@@ -92,7 +93,7 @@ impl InvertedIndexReader {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let end_source = self.postings_source.len();
|
||||
let postings_slice = self.postings_source.slice(offset, end_source);
|
||||
let postings_reader = SourceRead::from(postings_slice);
|
||||
let postings_reader = OwnedRead::new(postings_slice);
|
||||
block_postings.reset(term_info.doc_freq as usize, postings_reader);
|
||||
}
|
||||
|
||||
@@ -114,7 +115,7 @@ impl InvertedIndexReader {
|
||||
};
|
||||
BlockSegmentPostings::from_data(
|
||||
term_info.doc_freq as usize,
|
||||
SourceRead::from(postings_data),
|
||||
OwnedRead::new(postings_data),
|
||||
freq_reading_option,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -73,7 +73,7 @@ impl Searcher {
|
||||
|
||||
/// Runs a query on the segment readers wrapped by the searcher
|
||||
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
|
||||
collector.search(self, query)
|
||||
query.search(self, collector)
|
||||
}
|
||||
|
||||
/// Return the field searcher associated to a `Field`.
|
||||
|
||||
@@ -4,7 +4,7 @@ use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use directory::error::{OpenReadError, OpenWriteError};
|
||||
use directory::Directory;
|
||||
use directory::{FileProtection, ReadOnlySource, WritePtr};
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use schema::Schema;
|
||||
use std::fmt;
|
||||
@@ -28,6 +28,7 @@ impl fmt::Debug for Segment {
|
||||
/// Creates a new segment given an `Index` and a `SegmentId`
|
||||
///
|
||||
/// The function is here to make it private outside `tantivy`.
|
||||
/// #[doc(hidden)]
|
||||
pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
|
||||
Segment { index, meta }
|
||||
}
|
||||
@@ -49,8 +50,11 @@ impl Segment {
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
|
||||
self.meta.set_delete_meta(num_deleted_docs, opstamp);
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> Segment {
|
||||
Segment {
|
||||
index: self.index,
|
||||
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the segment's id.
|
||||
@@ -66,16 +70,6 @@ impl Segment {
|
||||
self.meta.relative_path(component)
|
||||
}
|
||||
|
||||
/// Protects a specific component file from being deleted.
|
||||
///
|
||||
/// Returns a FileProtection object. The file is guaranteed
|
||||
/// to not be garbage collected as long as this `FileProtection` object
|
||||
/// lives.
|
||||
pub fn protect_from_delete(&self, component: SegmentComponent) -> FileProtection {
|
||||
let path = self.relative_path(component);
|
||||
self.index.directory().protect_file_from_delete(&path)
|
||||
}
|
||||
|
||||
/// Open one of the component file for a *regular* read.
|
||||
pub fn open_read(
|
||||
&self,
|
||||
@@ -105,35 +99,3 @@ pub trait SerializableSegment {
|
||||
/// The number of documents in the segment.
|
||||
fn write(&self, serializer: SegmentSerializer) -> Result<u32>;
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use core::SegmentComponent;
|
||||
use directory::Directory;
|
||||
use schema::SchemaBuilder;
|
||||
use std::collections::HashSet;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_segment_protect_component() {
|
||||
let mut index = Index::create_in_ram(SchemaBuilder::new().build());
|
||||
let segment = index.new_segment();
|
||||
let path = segment.relative_path(SegmentComponent::POSTINGS);
|
||||
|
||||
let directory = index.directory_mut();
|
||||
directory.atomic_write(&*path, &vec![0u8]).unwrap();
|
||||
|
||||
let living_files = HashSet::new();
|
||||
{
|
||||
let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
|
||||
assert!(directory.exists(&*path));
|
||||
directory.garbage_collect(|| living_files.clone());
|
||||
assert!(directory.exists(&*path));
|
||||
}
|
||||
|
||||
directory.garbage_collect(|| living_files);
|
||||
assert!(!directory.exists(&*path));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,8 +1,15 @@
|
||||
use super::SegmentComponent;
|
||||
use census::{Inventory, TrackedObject};
|
||||
use core::SegmentId;
|
||||
use serde;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
|
||||
lazy_static! {
|
||||
static ref INVENTORY: Inventory<InnerSegmentMeta> = { Inventory::new() };
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
num_deleted_docs: u32,
|
||||
@@ -13,32 +20,72 @@ struct DeleteMeta {
|
||||
///
|
||||
/// For instance the number of docs it contains,
|
||||
/// how many are deleted, etc.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentMeta {
|
||||
segment_id: SegmentId,
|
||||
max_doc: u32,
|
||||
deletes: Option<DeleteMeta>,
|
||||
tracked: TrackedObject<InnerSegmentMeta>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentMeta {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
self.tracked.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl serde::Serialize for SegmentMeta {
|
||||
fn serialize<S>(
|
||||
&self,
|
||||
serializer: S,
|
||||
) -> Result<<S as serde::Serializer>::Ok, <S as serde::Serializer>::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
self.tracked.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> serde::Deserialize<'a> for SegmentMeta {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, <D as serde::Deserializer<'a>>::Error>
|
||||
where
|
||||
D: serde::Deserializer<'a>,
|
||||
{
|
||||
let inner = InnerSegmentMeta::deserialize(deserializer)?;
|
||||
let tracked = INVENTORY.track(inner);
|
||||
Ok(SegmentMeta { tracked: tracked })
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentMeta {
|
||||
/// Creates a new segment meta for
|
||||
/// a segment with no deletes and no documents.
|
||||
pub fn new(segment_id: SegmentId) -> SegmentMeta {
|
||||
SegmentMeta {
|
||||
/// Lists all living `SegmentMeta` object at the time of the call.
|
||||
pub fn all() -> Vec<SegmentMeta> {
|
||||
INVENTORY
|
||||
.list()
|
||||
.into_iter()
|
||||
.map(|inner| SegmentMeta { tracked: inner })
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
/// Creates a new `SegmentMeta` object.
|
||||
#[doc(hidden)]
|
||||
pub fn new(segment_id: SegmentId, max_doc: u32) -> SegmentMeta {
|
||||
let inner = InnerSegmentMeta {
|
||||
segment_id,
|
||||
max_doc: 0,
|
||||
max_doc,
|
||||
deletes: None,
|
||||
};
|
||||
SegmentMeta {
|
||||
tracked: INVENTORY.track(inner),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the segment id.
|
||||
pub fn id(&self) -> SegmentId {
|
||||
self.segment_id
|
||||
self.tracked.segment_id
|
||||
}
|
||||
|
||||
/// Returns the number of deleted documents.
|
||||
pub fn num_deleted_docs(&self) -> u32 {
|
||||
self.deletes
|
||||
self.tracked
|
||||
.deletes
|
||||
.as_ref()
|
||||
.map(|delete_meta| delete_meta.num_deleted_docs)
|
||||
.unwrap_or(0u32)
|
||||
@@ -80,7 +127,7 @@ impl SegmentMeta {
|
||||
/// and all the doc ids contains in this segment
|
||||
/// are exactly (0..max_doc).
|
||||
pub fn max_doc(&self) -> u32 {
|
||||
self.max_doc
|
||||
self.tracked.max_doc
|
||||
}
|
||||
|
||||
/// Return the number of documents in the segment.
|
||||
@@ -91,25 +138,36 @@ impl SegmentMeta {
|
||||
/// Returns the opstamp of the last delete operation
|
||||
/// taken in account in this segment.
|
||||
pub fn delete_opstamp(&self) -> Option<u64> {
|
||||
self.deletes.as_ref().map(|delete_meta| delete_meta.opstamp)
|
||||
self.tracked
|
||||
.deletes
|
||||
.as_ref()
|
||||
.map(|delete_meta| delete_meta.opstamp)
|
||||
}
|
||||
|
||||
/// Returns true iff the segment meta contains
|
||||
/// delete information.
|
||||
pub fn has_deletes(&self) -> bool {
|
||||
self.deletes.is_some()
|
||||
self.num_deleted_docs() > 0
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn set_max_doc(&mut self, max_doc: u32) {
|
||||
self.max_doc = max_doc;
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
|
||||
self.deletes = Some(DeleteMeta {
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> SegmentMeta {
|
||||
let delete_meta = DeleteMeta {
|
||||
num_deleted_docs,
|
||||
opstamp,
|
||||
};
|
||||
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
|
||||
segment_id: inner_meta.segment_id,
|
||||
max_doc: inner_meta.max_doc,
|
||||
deletes: Some(delete_meta),
|
||||
});
|
||||
SegmentMeta { tracked }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct InnerSegmentMeta {
|
||||
segment_id: SegmentId,
|
||||
max_doc: u32,
|
||||
deletes: Option<DeleteMeta>,
|
||||
}
|
||||
|
||||
@@ -1,4 +0,0 @@
|
||||
mod skip;
|
||||
pub mod stacker;
|
||||
|
||||
pub use self::skip::{SkipList, SkipListBuilder};
|
||||
@@ -1,168 +0,0 @@
|
||||
use super::heap::{Heap, HeapAllocable};
|
||||
use std::mem;
|
||||
|
||||
#[inline]
|
||||
pub fn is_power_of_2(val: u32) -> bool {
|
||||
val & (val - 1) == 0
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn jump_needed(val: u32) -> bool {
|
||||
val > 3 && is_power_of_2(val)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ExpUnrolledLinkedList {
|
||||
len: u32,
|
||||
end: u32,
|
||||
val0: u32,
|
||||
val1: u32,
|
||||
val2: u32,
|
||||
next: u32, // inline of the first block
|
||||
}
|
||||
|
||||
impl ExpUnrolledLinkedList {
|
||||
pub fn iter<'a>(&self, addr: u32, heap: &'a Heap) -> ExpUnrolledLinkedListIterator<'a> {
|
||||
ExpUnrolledLinkedListIterator {
|
||||
heap,
|
||||
addr: addr + 2u32 * (mem::size_of::<u32>() as u32),
|
||||
len: self.len,
|
||||
consumed: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push(&mut self, val: u32, heap: &Heap) {
|
||||
self.len += 1;
|
||||
if jump_needed(self.len) {
|
||||
// we need to allocate another block.
|
||||
// ... As we want to grow block exponentially
|
||||
// the next block as a size of (length so far),
|
||||
// and we need to add 1u32 to store the pointer
|
||||
// to the next element.
|
||||
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
|
||||
let new_block_addr: u32 = heap.allocate_space(new_block_size);
|
||||
heap.set(self.end, &new_block_addr);
|
||||
self.end = new_block_addr;
|
||||
}
|
||||
heap.set(self.end, &val);
|
||||
self.end += mem::size_of::<u32>() as u32;
|
||||
}
|
||||
}
|
||||
|
||||
impl HeapAllocable for u32 {
|
||||
fn with_addr(_addr: u32) -> u32 {
|
||||
0u32
|
||||
}
|
||||
}
|
||||
|
||||
impl HeapAllocable for ExpUnrolledLinkedList {
|
||||
fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
|
||||
let last_addr = addr + mem::size_of::<u32>() as u32 * 2u32;
|
||||
ExpUnrolledLinkedList {
|
||||
len: 0u32,
|
||||
end: last_addr,
|
||||
val0: 0u32,
|
||||
val1: 0u32,
|
||||
val2: 0u32,
|
||||
next: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExpUnrolledLinkedListIterator<'a> {
|
||||
heap: &'a Heap,
|
||||
addr: u32,
|
||||
len: u32,
|
||||
consumed: u32,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.consumed == self.len {
|
||||
None
|
||||
} else {
|
||||
let addr: u32;
|
||||
self.consumed += 1;
|
||||
if jump_needed(self.consumed) {
|
||||
addr = *self.heap.get_mut_ref(self.addr);
|
||||
} else {
|
||||
addr = self.addr;
|
||||
}
|
||||
self.addr = addr + mem::size_of::<u32>() as u32;
|
||||
Some(*self.heap.get_mut_ref(addr))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::super::heap::Heap;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_stack() {
|
||||
let heap = Heap::with_capacity(1_000_000);
|
||||
let (addr, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
|
||||
stack.push(1u32, &heap);
|
||||
stack.push(2u32, &heap);
|
||||
stack.push(4u32, &heap);
|
||||
stack.push(8u32, &heap);
|
||||
{
|
||||
let mut it = stack.iter(addr, &heap);
|
||||
assert_eq!(it.next().unwrap(), 1u32);
|
||||
assert_eq!(it.next().unwrap(), 2u32);
|
||||
assert_eq!(it.next().unwrap(), 4u32);
|
||||
assert_eq!(it.next().unwrap(), 8u32);
|
||||
assert!(it.next().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::ExpUnrolledLinkedList;
|
||||
use super::Heap;
|
||||
use test::Bencher;
|
||||
|
||||
const NUM_STACK: usize = 10_000;
|
||||
const STACK_SIZE: u32 = 1000;
|
||||
|
||||
#[bench]
|
||||
fn bench_push_vec(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut vecs = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
vecs.push(Vec::new());
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
vecs[t].push(i);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_push_stack(bench: &mut Bencher) {
|
||||
let heap = Heap::with_capacity(64_000_000);
|
||||
bench.iter(|| {
|
||||
let mut stacks = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
|
||||
stacks.push(stack);
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
stacks[t].push(i, &heap);
|
||||
}
|
||||
}
|
||||
heap.clear();
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -1,335 +0,0 @@
|
||||
use super::heap::{BytesRef, Heap, HeapAllocable};
|
||||
use postings::UnorderedTermId;
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
use std::slice;
|
||||
|
||||
mod murmurhash2 {
|
||||
|
||||
const SEED: u32 = 3_242_157_231u32;
|
||||
const M: u32 = 0x5bd1_e995;
|
||||
|
||||
#[inline(always)]
|
||||
pub fn murmurhash2(key: &[u8]) -> u32 {
|
||||
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
|
||||
let len = key.len() as u32;
|
||||
let mut h: u32 = SEED ^ len;
|
||||
|
||||
let num_blocks = len >> 2;
|
||||
for _ in 0..num_blocks {
|
||||
let mut k: u32 = unsafe { *key_ptr }; // ok because of num_blocks definition
|
||||
k = k.wrapping_mul(M);
|
||||
k ^= k >> 24;
|
||||
k = k.wrapping_mul(M);
|
||||
h = h.wrapping_mul(M);
|
||||
h ^= k;
|
||||
key_ptr = key_ptr.wrapping_offset(1);
|
||||
}
|
||||
|
||||
// Handle the last few bytes of the input array
|
||||
let remaining: &[u8] = &key[key.len() & !3..];
|
||||
match remaining.len() {
|
||||
3 => {
|
||||
h ^= u32::from(remaining[2]) << 16;
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
2 => {
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
1 => {
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(M);
|
||||
h ^ (h >> 15)
|
||||
}
|
||||
}
|
||||
|
||||
/// Split the thread memory budget into
|
||||
/// - the heap size
|
||||
/// - the hash table "table" itself.
|
||||
///
|
||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||
pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
|
||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||
let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
|
||||
let table_num_bits: usize = (1..)
|
||||
.into_iter()
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.last()
|
||||
.expect(&format!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
));
|
||||
let table_size = compute_table_size(table_num_bits);
|
||||
let heap_size = per_thread_memory_budget - table_size;
|
||||
(heap_size, table_num_bits)
|
||||
}
|
||||
|
||||
/// `KeyValue` is the item stored in the hash table.
|
||||
/// The key is actually a `BytesRef` object stored in an external heap.
|
||||
/// The `value_addr` also points to an address in the heap.
|
||||
///
|
||||
/// The key and the value are actually stored contiguously.
|
||||
/// For this reason, the (start, stop) information is actually redundant
|
||||
/// and can be simplified in the future
|
||||
#[derive(Copy, Clone, Default)]
|
||||
struct KeyValue {
|
||||
key_value_addr: BytesRef,
|
||||
hash: u32,
|
||||
}
|
||||
|
||||
impl KeyValue {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.key_value_addr.is_null()
|
||||
}
|
||||
}
|
||||
|
||||
/// Customized `HashMap` with string keys
|
||||
///
|
||||
/// This `HashMap` takes String as keys. Keys are
|
||||
/// stored in a user defined heap.
|
||||
///
|
||||
/// The quirky API has the benefit of avoiding
|
||||
/// the computation of the hash of the key twice,
|
||||
/// or copying the key as long as there is no insert.
|
||||
///
|
||||
pub struct TermHashMap<'a> {
|
||||
table: Box<[KeyValue]>,
|
||||
heap: &'a Heap,
|
||||
mask: usize,
|
||||
occupied: Vec<usize>,
|
||||
}
|
||||
|
||||
struct QuadraticProbing {
|
||||
hash: usize,
|
||||
i: usize,
|
||||
mask: usize,
|
||||
}
|
||||
|
||||
impl QuadraticProbing {
|
||||
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
|
||||
QuadraticProbing { hash, i: 0, mask }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next_probe(&mut self) -> usize {
|
||||
self.i += 1;
|
||||
(self.hash + self.i * self.i) & self.mask
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Iter<'a: 'b, 'b> {
|
||||
hashmap: &'b TermHashMap<'a>,
|
||||
inner: slice::Iter<'a, usize>,
|
||||
}
|
||||
|
||||
impl<'a, 'b> Iterator for Iter<'a, 'b> {
|
||||
type Item = (&'b [u8], u32, UnorderedTermId);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.inner.next().cloned().map(move |bucket: usize| {
|
||||
let kv = self.hashmap.table[bucket];
|
||||
let (key, offset): (&'b [u8], u32) = self.hashmap.get_key_value(kv.key_value_addr);
|
||||
(key, offset, bucket as UnorderedTermId)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TermHashMap<'a> {
|
||||
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
|
||||
let table_size = 1 << num_bucket_power_of_2;
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
|
||||
TermHashMap {
|
||||
table: table.into_boxed_slice(),
|
||||
heap,
|
||||
mask: table_size - 1,
|
||||
occupied: Vec::with_capacity(table_size / 2),
|
||||
}
|
||||
}
|
||||
|
||||
fn probe(&self, hash: u32) -> QuadraticProbing {
|
||||
QuadraticProbing::compute(hash as usize, self.mask)
|
||||
}
|
||||
|
||||
pub fn is_saturated(&self) -> bool {
|
||||
self.table.len() < self.occupied.len() * 3
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn get_key_value(&self, bytes_ref: BytesRef) -> (&[u8], u32) {
|
||||
let key_bytes: &[u8] = self.heap.get_slice(bytes_ref);
|
||||
let expull_addr: u32 = bytes_ref.addr() + 2 + key_bytes.len() as u32;
|
||||
(key_bytes, expull_addr)
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
|
||||
self.occupied.push(bucket);
|
||||
self.table[bucket] = KeyValue {
|
||||
key_value_addr,
|
||||
hash,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn iter<'b: 'a>(&'b self) -> Iter<'a, 'b> {
|
||||
Iter {
|
||||
inner: self.occupied.iter(),
|
||||
hashmap: &self,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
|
||||
&mut self,
|
||||
key: S,
|
||||
) -> (UnorderedTermId, &mut V) {
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let hash = murmurhash2::murmurhash2(key.as_ref());
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
let key_bytes_ref = self.heap.allocate_and_set(key_bytes);
|
||||
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
|
||||
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
|
||||
self.set_bucket(hash, key_bytes_ref, bucket);
|
||||
return (bucket as UnorderedTermId, val);
|
||||
} else if kv.hash == hash {
|
||||
let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
|
||||
if stored_key == key_bytes {
|
||||
return (
|
||||
bucket as UnorderedTermId,
|
||||
self.heap.get_mut_ref(expull_addr),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn bench_murmurhash2(b: &mut Bencher) {
|
||||
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
||||
b.iter(|| {
|
||||
let mut s = 0;
|
||||
for &key in &keys {
|
||||
s ^= murmurhash2(key.as_bytes());
|
||||
}
|
||||
s
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::super::heap::{Heap, HeapAllocable};
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use super::split_memory;
|
||||
use super::*;
|
||||
use std::collections::HashSet;
|
||||
|
||||
struct TestValue {
|
||||
val: u32,
|
||||
_addr: u32,
|
||||
}
|
||||
|
||||
impl HeapAllocable for TestValue {
|
||||
fn with_addr(addr: u32) -> TestValue {
|
||||
TestValue {
|
||||
val: 0u32,
|
||||
_addr: addr,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(split_memory(100_000), (67232, 12));
|
||||
assert_eq!(split_memory(1_000_000), (737856, 15));
|
||||
assert_eq!(split_memory(10_000_000), (7902848, 18));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hash_map() {
|
||||
let heap = Heap::with_capacity(2_000_000);
|
||||
let mut hash_map: TermHashMap = TermHashMap::new(18, &heap);
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc").1;
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 3u32;
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 4u32;
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc").1;
|
||||
assert_eq!(v.val, 3u32);
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
|
||||
assert_eq!(v.val, 4u32);
|
||||
}
|
||||
let mut iter_values = hash_map.iter();
|
||||
{
|
||||
let (_, addr, _) = iter_values.next().unwrap();
|
||||
let val: &TestValue = heap.get_ref(addr);
|
||||
assert_eq!(val.val, 3u32);
|
||||
}
|
||||
{
|
||||
let (_, addr, _) = iter_values.next().unwrap();
|
||||
let val: &TestValue = heap.get_ref(addr);
|
||||
assert_eq!(val.val, 4u32);
|
||||
}
|
||||
assert!(iter_values.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur() {
|
||||
let s1 = "abcdef";
|
||||
let s2 = "abcdeg";
|
||||
for i in 0..5 {
|
||||
assert_eq!(
|
||||
murmurhash2(&s1[i..5].as_bytes()),
|
||||
murmurhash2(&s2[i..5].as_bytes())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_against_reference_impl() {
|
||||
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
|
||||
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
|
||||
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
|
||||
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
|
||||
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
|
||||
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
|
||||
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_collisions() {
|
||||
let mut set: HashSet<u32> = HashSet::default();
|
||||
for i in 0..10_000 {
|
||||
let s = format!("hash{}", i);
|
||||
let hash = murmurhash2(s.as_bytes());
|
||||
set.insert(hash);
|
||||
}
|
||||
assert_eq!(set.len(), 10_000);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,233 +0,0 @@
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
use std::cell::UnsafeCell;
|
||||
use std::mem;
|
||||
use std::ptr;
|
||||
|
||||
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
|
||||
///
|
||||
/// The slice will encode the length of the `&[u8]` slice
|
||||
/// on 16-bits, and then the data is encoded.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct BytesRef(u32);
|
||||
|
||||
impl BytesRef {
|
||||
pub fn is_null(&self) -> bool {
|
||||
self.0 == u32::max_value()
|
||||
}
|
||||
|
||||
pub fn addr(&self) -> u32 {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BytesRef {
|
||||
fn default() -> BytesRef {
|
||||
BytesRef(u32::max_value())
|
||||
}
|
||||
}
|
||||
|
||||
/// Object that can be allocated in tantivy's custom `Heap`.
|
||||
pub trait HeapAllocable {
|
||||
fn with_addr(addr: u32) -> Self;
|
||||
}
|
||||
|
||||
/// Tantivy's custom `Heap`.
|
||||
pub struct Heap {
|
||||
inner: UnsafeCell<InnerHeap>,
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(mut_from_ref))]
|
||||
impl Heap {
|
||||
/// Creates a new heap with a given capacity
|
||||
pub fn with_capacity(num_bytes: usize) -> Heap {
|
||||
Heap {
|
||||
inner: UnsafeCell::new(InnerHeap::with_capacity(num_bytes)),
|
||||
}
|
||||
}
|
||||
|
||||
fn inner(&self) -> &mut InnerHeap {
|
||||
unsafe { &mut *self.inner.get() }
|
||||
}
|
||||
|
||||
/// Clears the heap. All the underlying data is lost.
|
||||
///
|
||||
/// This heap does not support deallocation.
|
||||
/// This method is the only way to free memory.
|
||||
pub fn clear(&self) {
|
||||
self.inner().clear();
|
||||
}
|
||||
|
||||
/// Return amount of free space, in bytes.
|
||||
pub fn num_free_bytes(&self) -> u32 {
|
||||
self.inner().num_free_bytes()
|
||||
}
|
||||
|
||||
/// Allocate a given amount of space and returns an address
|
||||
/// in the Heap.
|
||||
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
|
||||
self.inner().allocate_space(num_bytes)
|
||||
}
|
||||
|
||||
/// Allocate an object in the heap
|
||||
pub fn allocate_object<V: HeapAllocable>(&self) -> (u32, &mut V) {
|
||||
let addr = self.inner().allocate_space(mem::size_of::<V>());
|
||||
let v: V = V::with_addr(addr);
|
||||
self.inner().set(addr, &v);
|
||||
(addr, self.inner().get_mut_ref(addr))
|
||||
}
|
||||
|
||||
/// Stores a `&[u8]` in the heap and returns the destination BytesRef.
|
||||
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
|
||||
self.inner().allocate_and_set(data)
|
||||
}
|
||||
|
||||
/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
|
||||
/// given as argumetn
|
||||
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
self.inner().get_slice(bytes_ref)
|
||||
}
|
||||
|
||||
/// Stores an item's data in the heap, at the given `address`.
|
||||
pub fn set<Item>(&self, addr: u32, val: &Item) {
|
||||
self.inner().set(addr, val);
|
||||
}
|
||||
|
||||
/// Returns a mutable reference for an object at a given Item.
|
||||
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
|
||||
self.inner().get_mut_ref(addr)
|
||||
}
|
||||
|
||||
/// Returns a mutable reference to an `Item` at a given `addr`.
|
||||
#[cfg(test)]
|
||||
pub fn get_ref<Item>(&self, addr: u32) -> &mut Item {
|
||||
self.get_mut_ref(addr)
|
||||
}
|
||||
}
|
||||
|
||||
struct InnerHeap {
|
||||
buffer: Vec<u8>,
|
||||
buffer_len: u32,
|
||||
used: u32,
|
||||
next_heap: Option<Box<InnerHeap>>,
|
||||
}
|
||||
|
||||
impl InnerHeap {
|
||||
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
|
||||
let buffer: Vec<u8> = vec![0u8; num_bytes];
|
||||
InnerHeap {
|
||||
buffer,
|
||||
buffer_len: num_bytes as u32,
|
||||
next_heap: None,
|
||||
used: 0u32,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.used = 0u32;
|
||||
self.next_heap = None;
|
||||
}
|
||||
|
||||
// Returns the number of free bytes. If the buffer
|
||||
// has reached it's capacity and overflowed to another buffer, return 0.
|
||||
pub fn num_free_bytes(&self) -> u32 {
|
||||
if self.next_heap.is_some() {
|
||||
0u32
|
||||
} else {
|
||||
self.buffer_len - self.used
|
||||
}
|
||||
}
|
||||
|
||||
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
|
||||
let addr = self.used;
|
||||
self.used += num_bytes as u32;
|
||||
if self.used <= self.buffer_len {
|
||||
addr
|
||||
} else {
|
||||
if self.next_heap.is_none() {
|
||||
info!(
|
||||
r#"Exceeded heap size. The segment will be committed right
|
||||
after indexing this document."#,
|
||||
);
|
||||
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
|
||||
}
|
||||
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
|
||||
}
|
||||
}
|
||||
|
||||
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
let start = bytes_ref.0;
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.get_slice(BytesRef(start - self.buffer_len))
|
||||
} else {
|
||||
let start = start as usize;
|
||||
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
|
||||
&self.buffer[start + 2..start + 2 + len]
|
||||
}
|
||||
}
|
||||
|
||||
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
} else {
|
||||
&mut self.buffer[start as usize..stop as usize]
|
||||
}
|
||||
}
|
||||
|
||||
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
|
||||
assert!(data.len() < u16::max_value() as usize);
|
||||
let total_len = 2 + data.len();
|
||||
let start = self.allocate_space(total_len);
|
||||
let total_buff = self.get_mut_slice(start, start + total_len as u32);
|
||||
NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16);
|
||||
total_buff[2..].clone_from_slice(data);
|
||||
BytesRef(start)
|
||||
}
|
||||
|
||||
fn get_mut(&mut self, addr: u32) -> *mut u8 {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut(addr - self.buffer_len)
|
||||
} else {
|
||||
let addr_isize = addr as isize;
|
||||
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
|
||||
}
|
||||
}
|
||||
|
||||
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_ref(addr - self.buffer_len)
|
||||
} else {
|
||||
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
|
||||
let v_ptr = v_ptr_u8 as *mut Item;
|
||||
unsafe { &mut *v_ptr }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.set(addr - self.buffer_len, val);
|
||||
} else {
|
||||
let v_ptr: *const Item = val as *const Item;
|
||||
let v_ptr_u8: *const u8 = v_ptr as *const u8;
|
||||
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
|
||||
unsafe {
|
||||
let dest_ptr: *mut u8 = self.get_mut(addr);
|
||||
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,43 +0,0 @@
|
||||
mod expull;
|
||||
pub(crate) mod hashmap;
|
||||
mod heap;
|
||||
|
||||
pub use self::expull::ExpUnrolledLinkedList;
|
||||
pub use self::hashmap::TermHashMap;
|
||||
pub use self::heap::{Heap, HeapAllocable};
|
||||
|
||||
#[test]
|
||||
fn test_unrolled_linked_list() {
|
||||
use std::collections;
|
||||
let heap = Heap::with_capacity(30_000_000);
|
||||
{
|
||||
heap.clear();
|
||||
let mut ks: Vec<usize> = (1..5).map(|k| k * 100).collect();
|
||||
ks.push(2);
|
||||
ks.push(3);
|
||||
for k in (1..5).map(|k| k * 100) {
|
||||
let mut hashmap: TermHashMap = TermHashMap::new(10, &heap);
|
||||
for j in 0..k {
|
||||
for i in 0..500 {
|
||||
let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string()).1;
|
||||
v.push(i * j, &heap);
|
||||
}
|
||||
}
|
||||
let mut map_addr: collections::HashMap<Vec<u8>, u32> = collections::HashMap::new();
|
||||
for (key, addr, _) in hashmap.iter() {
|
||||
map_addr.insert(Vec::from(key), addr);
|
||||
}
|
||||
|
||||
for i in 0..500 {
|
||||
let key: String = i.to_string();
|
||||
let addr: u32 = *map_addr.get(key.as_bytes()).unwrap();
|
||||
let exp_pull: &ExpUnrolledLinkedList = heap.get_ref(addr);
|
||||
let mut it = exp_pull.iter(addr, &heap);
|
||||
for j in 0..k {
|
||||
assert_eq!(it.next().unwrap(), i * j);
|
||||
}
|
||||
assert!(!it.next().is_some());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -173,9 +173,6 @@ pub enum DeleteError {
|
||||
/// Any kind of IO error that happens when
|
||||
/// interacting with the underlying IO device.
|
||||
IOError(IOError),
|
||||
/// The file may not be deleted because it is
|
||||
/// protected.
|
||||
FileProtected(PathBuf),
|
||||
}
|
||||
|
||||
impl From<IOError> for DeleteError {
|
||||
@@ -190,9 +187,6 @@ impl fmt::Display for DeleteError {
|
||||
DeleteError::FileDoesNotExist(ref path) => {
|
||||
write!(f, "the file '{:?}' does not exist", path)
|
||||
}
|
||||
DeleteError::FileProtected(ref path) => {
|
||||
write!(f, "the file '{:?}' is protected and can't be deleted", path)
|
||||
}
|
||||
DeleteError::IOError(ref err) => {
|
||||
write!(f, "an io error occurred while deleting a file: '{}'", err)
|
||||
}
|
||||
@@ -207,7 +201,7 @@ impl StdError for DeleteError {
|
||||
|
||||
fn cause(&self) -> Option<&StdError> {
|
||||
match *self {
|
||||
DeleteError::FileDoesNotExist(_) | DeleteError::FileProtected(_) => None,
|
||||
DeleteError::FileDoesNotExist(_) => None,
|
||||
DeleteError::IOError(ref err) => Some(err),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,9 +3,7 @@ use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use error::{ErrorKind, Result, ResultExt};
|
||||
use serde_json;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -32,37 +30,6 @@ pub struct ManagedDirectory {
|
||||
#[derive(Debug, Default)]
|
||||
struct MetaInformation {
|
||||
managed_paths: HashSet<PathBuf>,
|
||||
protected_files: HashMap<PathBuf, usize>,
|
||||
}
|
||||
|
||||
/// A `FileProtection` prevents the garbage collection of a file.
|
||||
///
|
||||
/// See `ManagedDirectory.protect_file_from_delete`.
|
||||
pub struct FileProtection {
|
||||
directory: ManagedDirectory,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
|
||||
let mut meta_informations_wlock = directory
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) {
|
||||
(*counter_ref_mut) -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for FileProtection {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
|
||||
write!(formatter, "FileProtectionFor({:?})", self.path)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for FileProtection {
|
||||
fn drop(&mut self) {
|
||||
unprotect_file_from_delete(&self.directory, &*self.path);
|
||||
}
|
||||
}
|
||||
|
||||
/// Saves the file containing the list of existing files
|
||||
@@ -89,7 +56,6 @@ impl ManagedDirectory {
|
||||
directory: Box::new(directory),
|
||||
meta_informations: Arc::new(RwLock::new(MetaInformation {
|
||||
managed_paths: managed_files,
|
||||
protected_files: HashMap::default(),
|
||||
})),
|
||||
})
|
||||
}
|
||||
@@ -158,9 +124,6 @@ impl ManagedDirectory {
|
||||
error!("Failed to delete {:?}", file_to_delete);
|
||||
}
|
||||
}
|
||||
DeleteError::FileProtected(_) => {
|
||||
// this is expected.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -185,28 +148,6 @@ impl ManagedDirectory {
|
||||
}
|
||||
}
|
||||
|
||||
/// Protects a file from being garbage collected.
|
||||
///
|
||||
/// The method returns a `FileProtection` object.
|
||||
/// The file will not be garbage collected as long as the
|
||||
/// `FileProtection` object is kept alive.
|
||||
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
|
||||
let pathbuf = path.to_owned();
|
||||
{
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned on protect");
|
||||
*meta_informations_wlock
|
||||
.protected_files
|
||||
.entry(pathbuf.clone())
|
||||
.or_insert(0) += 1;
|
||||
}
|
||||
FileProtection {
|
||||
directory: self.clone(),
|
||||
path: pathbuf.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Registers a file as managed
|
||||
///
|
||||
/// This method must be called before the file is
|
||||
@@ -247,16 +188,6 @@ impl Directory for ManagedDirectory {
|
||||
}
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
{
|
||||
let metas_rlock = self.meta_informations
|
||||
.read()
|
||||
.expect("poisoned lock in managed directory meta");
|
||||
if let Some(counter) = metas_rlock.protected_files.get(path) {
|
||||
if *counter > 0 {
|
||||
return Err(DeleteError::FileProtected(path.to_owned()));
|
||||
}
|
||||
}
|
||||
}
|
||||
self.directory.delete(path)
|
||||
}
|
||||
|
||||
@@ -372,28 +303,4 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_managed_directory_protect() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let living_files = HashSet::new();
|
||||
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
{
|
||||
let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -25,8 +25,7 @@ pub use self::read_only_source::ReadOnlySource;
|
||||
#[cfg(feature = "mmap")]
|
||||
pub use self::mmap_directory::MmapDirectory;
|
||||
|
||||
pub(crate) use self::managed_directory::{FileProtection, ManagedDirectory};
|
||||
pub(crate) use self::read_only_source::SourceRead;
|
||||
pub(crate) use self::managed_directory::ManagedDirectory;
|
||||
|
||||
/// Synonym of Seek + Write
|
||||
pub trait SeekableWrite: Seek + Write {}
|
||||
|
||||
@@ -3,9 +3,8 @@ use common::HasLen;
|
||||
#[cfg(feature = "mmap")]
|
||||
use fst::raw::MmapReadOnly;
|
||||
use stable_deref_trait::{CloneStableDeref, StableDeref};
|
||||
use std::io::{self, Read};
|
||||
use std::ops::Deref;
|
||||
use std::slice;
|
||||
|
||||
|
||||
/// Read object that represents files in tantivy.
|
||||
///
|
||||
@@ -120,49 +119,3 @@ impl From<Vec<u8>> for ReadOnlySource {
|
||||
ReadOnlySource::Anonymous(shared_data)
|
||||
}
|
||||
}
|
||||
|
||||
/// Acts as an owning cursor over the data backed by a `ReadOnlySource`
|
||||
pub(crate) struct SourceRead {
|
||||
_data_owner: ReadOnlySource,
|
||||
cursor: &'static [u8],
|
||||
}
|
||||
|
||||
impl SourceRead {
|
||||
// Advance the cursor by a given number of bytes.
|
||||
pub fn advance(&mut self, len: usize) {
|
||||
self.cursor = &self.cursor[len..];
|
||||
}
|
||||
|
||||
pub fn slice_from(&self, start: usize) -> &[u8] {
|
||||
&self.cursor[start..]
|
||||
}
|
||||
|
||||
pub fn get(&self, idx: usize) -> u8 {
|
||||
self.cursor[idx]
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for SourceRead {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.cursor
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ReadOnlySource> for SourceRead {
|
||||
// Creates a new `SourceRead` from a given `ReadOnlySource`
|
||||
fn from(source: ReadOnlySource) -> SourceRead {
|
||||
let len = source.len();
|
||||
let slice_ptr = source.as_slice().as_ptr();
|
||||
let static_slice = unsafe { slice::from_raw_parts(slice_ptr, len) };
|
||||
SourceRead {
|
||||
_data_owner: source,
|
||||
cursor: static_slice,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Read for SourceRead {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
self.cursor.read(buf)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,9 +9,6 @@ use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use datastruct::stacker::hashmap::split_memory;
|
||||
use datastruct::stacker::Heap;
|
||||
use directory::FileProtection;
|
||||
use docset::DocSet;
|
||||
use error::{Error, ErrorKind, Result, ResultExt};
|
||||
use fastfield::write_delete_bitset;
|
||||
@@ -24,6 +21,7 @@ use indexer::DirectoryLock;
|
||||
use indexer::MergePolicy;
|
||||
use indexer::SegmentEntry;
|
||||
use indexer::SegmentWriter;
|
||||
use postings::compute_table_size;
|
||||
use schema::Document;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
@@ -34,10 +32,11 @@ use std::thread::JoinHandle;
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
// in the heap goes below MARGIN_IN_BYTES.
|
||||
pub const MARGIN_IN_BYTES: u32 = 1_000_000u32;
|
||||
pub const MARGIN_IN_BYTES: usize = 1_000_000;
|
||||
|
||||
// We require at least 3 MB of memory per thread.
|
||||
pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32;
|
||||
pub const HEAP_SIZE_MIN: usize = ((MARGIN_IN_BYTES as u32) * 3u32) as usize;
|
||||
pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
|
||||
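For reference, a quick sanity check of how the new bounds derive from MARGIN_IN_BYTES (plain arithmetic, not code taken from the crate):

// HEAP_SIZE_MIN = 3 * MARGIN_IN_BYTES, HEAP_SIZE_MAX = u32::MAX - MARGIN_IN_BYTES.
assert_eq!(HEAP_SIZE_MIN, 3_000_000);
assert_eq!(HEAP_SIZE_MAX, u32::max_value() as usize - 1_000_000);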
|
||||
// Add document will block if the number of docs waiting in the queue to be indexed
|
||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||
@@ -46,6 +45,24 @@ const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||
type DocumentSender = chan::Sender<AddOperation>;
|
||||
type DocumentReceiver = chan::Receiver<AddOperation>;
|
||||
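The back-pressure mentioned above comes from the bounded channel; a short sketch (assuming `chan::sync` is the bounded variant used for this pipeline):

let (sender, receiver): (DocumentSender, DocumentReceiver) = chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
// `sender.send(op)` blocks once PIPELINE_MAX_SIZE_IN_DOCS operations are queued,
// which is what throttles `add_document` when the indexing threads fall behind.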
|
||||
/// Splits the per-thread memory budget between
/// - the heap, and
/// - the term hash table itself.
///
/// Returns the hash table size, expressed as a number of bits.
|
||||
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
|
||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||
(1..)
|
||||
.into_iter()
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.last()
|
||||
.expect(&format!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
))
|
||||
.min(19) // we cap it at 512K
|
||||
}
|
||||
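A minimal usage sketch (the budgets are arbitrary; the expected values are the ones asserted in `test_hashmap_size` further down): the number of table bits grows with the per-thread budget and is capped at 19 bits, i.e. roughly 512K buckets.

assert_eq!(initial_table_size(10_000_000), 18);
assert_eq!(initial_table_size(1_000_000_000), 19); // capped at 19 bits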
|
||||
/// `IndexWriter` is the user entry-point to add documents to an index.
///
/// It manages a small number of indexing threads, as well as a shared
|
||||
@@ -100,11 +117,16 @@ pub fn open_index_writer(
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
directory_lock: DirectoryLock,
|
||||
) -> Result<IndexWriter> {
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_LIMIT as usize {
|
||||
panic!(format!(
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
let err_msg = format!(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_LIMIT
|
||||
));
|
||||
HEAP_SIZE_MIN
|
||||
);
|
||||
bail!(ErrorKind::InvalidArgument(err_msg));
|
||||
}
|
||||
if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX {
|
||||
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
||||
bail!(ErrorKind::InvalidArgument(err_msg));
|
||||
}
|
||||
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
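A caller-side sketch of the new validation (assuming an `index: Index` built elsewhere, and that the budget passed to `writer_with_num_threads` is the per-thread heap when a single thread is used): out-of-range budgets now surface as `ErrorKind::InvalidArgument` instead of a panic.

// Below HEAP_SIZE_MIN (3 MB): rejected with an error rather than a panic.
assert!(index.writer_with_num_threads(1, 1_000_000).is_err());
// Within [HEAP_SIZE_MIN, HEAP_SIZE_MAX): the writer is created as before.
let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;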
@@ -193,15 +215,13 @@ pub fn advance_deletes(
|
||||
mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: u64,
|
||||
) -> Result<Option<FileProtection>> {
|
||||
let mut file_protect: Option<FileProtection> = None;
|
||||
) -> Result<()> {
|
||||
{
|
||||
if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
|
||||
if segment_entry.meta().delete_opstamp() == Some(target_opstamp) {
|
||||
// We are already up-to-date here.
|
||||
if target_opstamp == previous_opstamp {
|
||||
return Ok(file_protect);
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let segment_reader = SegmentReader::open(&segment)?;
|
||||
let max_doc = segment_reader.max_doc();
|
||||
|
||||
@@ -220,6 +240,7 @@ pub fn advance_deletes(
|
||||
target_opstamp,
|
||||
)?;
|
||||
|
||||
// TODO optimize
|
||||
for doc in 0u32..max_doc {
|
||||
if segment_reader.is_deleted(doc) {
|
||||
delete_bitset.insert(doc as usize);
|
||||
@@ -228,54 +249,39 @@ pub fn advance_deletes(
|
||||
|
||||
let num_deleted_docs = delete_bitset.len();
|
||||
if num_deleted_docs > 0 {
|
||||
segment.set_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
file_protect = Some(segment.protect_from_delete(SegmentComponent::DELETE));
|
||||
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
|
||||
write_delete_bitset(&delete_bitset, &mut delete_file)?;
|
||||
}
|
||||
}
|
||||
segment_entry.set_meta(segment.meta().clone());
|
||||
Ok(file_protect)
|
||||
segment_entry.set_meta((*segment.meta()).clone());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn index_documents(
|
||||
heap: &mut Heap,
|
||||
table_size: usize,
|
||||
memory_budget: usize,
|
||||
segment: &Segment,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> Result<bool> {
|
||||
heap.clear();
|
||||
let schema = segment.schema();
|
||||
let segment_id = segment.id();
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(heap, table_size, segment.clone(), &schema)?;
|
||||
let table_size = initial_table_size(memory_budget);
|
||||
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
|
||||
for doc in document_iterator {
|
||||
segment_writer.add_document(doc, &schema)?;
|
||||
// There are two possible conditions to close the segment.
// One is that the memory arena dedicated to the segment is
// getting full.
|
||||
if segment_writer.is_buffer_full() {
|
||||
|
||||
let mem_usage = segment_writer.mem_usage();
|
||||
|
||||
if mem_usage >= memory_budget - MARGIN_IN_BYTES {
|
||||
info!(
|
||||
"Buffer limit reached, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
// The second is that the term dictionary hash table
// is reaching saturation.
|
||||
//
|
||||
// Tantivy does not resize its hashtable. When it reaches
|
||||
// capacity, we just stop indexing new documents.
|
||||
if segment_writer.is_term_saturated() {
|
||||
info!(
|
||||
"Term dic saturated, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !segment_updater.is_alive() {
|
||||
@@ -290,8 +296,7 @@ fn index_documents(
|
||||
|
||||
let doc_opstamps: Vec<u64> = segment_writer.finalize()?;
|
||||
|
||||
let mut segment_meta = SegmentMeta::new(segment_id);
|
||||
segment_meta.set_max_doc(num_docs);
|
||||
let segment_meta = SegmentMeta::new(segment_id, num_docs);
|
||||
|
||||
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
|
||||
|
||||
@@ -367,14 +372,12 @@ impl IndexWriter {
|
||||
fn add_indexing_worker(&mut self) -> Result<()> {
|
||||
let document_receiver_clone = self.document_receiver.clone();
|
||||
let mut segment_updater = self.segment_updater.clone();
|
||||
let (heap_size, table_size) = split_memory(self.heap_size_in_bytes_per_thread);
|
||||
info!("heap size {}, table_size {}", heap_size, table_size);
|
||||
let mut heap = Heap::with_capacity(heap_size);
|
||||
|
||||
let generation = self.generation;
|
||||
|
||||
let mut delete_cursor = self.delete_queue.cursor();
|
||||
|
||||
let mem_budget = self.heap_size_in_bytes_per_thread;
|
||||
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
||||
.name(format!(
|
||||
"indexing thread {} for gen {}",
|
||||
@@ -402,8 +405,7 @@ impl IndexWriter {
|
||||
}
|
||||
let segment = segment_updater.new_segment();
|
||||
index_documents(
|
||||
&mut heap,
|
||||
table_size,
|
||||
mem_budget,
|
||||
&segment,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
@@ -441,7 +443,9 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
/// Merges a given list of segments
|
||||
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Receiver<SegmentMeta> {
|
||||
///
|
||||
/// `segment_ids` is required to be non-empty.
|
||||
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
|
||||
self.segment_updater.start_merge(segment_ids)
|
||||
}
|
||||
|
||||
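Usage sketch for the fallible signature (assuming an `index` and `index_writer` as in the tests below): starting the merge can now fail before the returned future is awaited.

let segment_ids = index.searchable_segment_ids()?;
let merged_meta = index_writer
    .merge(&segment_ids)?  // Err if a segment is unmanaged or the set mixes committed/uncommitted
    .wait()
    .expect("merge was cancelled");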
@@ -637,6 +641,7 @@ impl IndexWriter {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::initial_table_size;
|
||||
use env_logger;
|
||||
use error::*;
|
||||
use indexer::NoMergePolicy;
|
||||
@@ -699,7 +704,7 @@ mod tests {
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(3, 40_000_000).unwrap();
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
index_writer.rollback().unwrap();
|
||||
|
||||
@@ -732,7 +737,7 @@ mod tests {
|
||||
};
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
let mut index_writer = index.writer(12_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
let mut doc = Document::default();
|
||||
@@ -766,7 +771,7 @@ mod tests {
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
let mut index_writer = index.writer(12_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
@@ -801,7 +806,7 @@ mod tests {
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
@@ -831,4 +836,12 @@ mod tests {
|
||||
assert_eq!(num_docs_containing("b"), 100);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(initial_table_size(100_000), 12);
|
||||
assert_eq!(initial_table_size(1_000_000), 15);
|
||||
assert_eq!(initial_table_size(10_000_000), 18);
|
||||
assert_eq!(initial_table_size(1_000_000_000), 19);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -116,15 +116,17 @@ mod tests {
|
||||
assert!(result_list.is_empty());
|
||||
}
|
||||
|
||||
fn seg_meta(num_docs: u32) -> SegmentMeta {
|
||||
let mut segment_metas = SegmentMeta::new(SegmentId::generate_random());
|
||||
segment_metas.set_max_doc(num_docs);
|
||||
segment_metas
|
||||
fn create_random_segment_meta(num_docs: u32) -> SegmentMeta {
|
||||
SegmentMeta::new(SegmentId::generate_random(), num_docs)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_log_merge_policy_pair() {
|
||||
let test_input = vec![seg_meta(10), seg_meta(10), seg_meta(10)];
|
||||
let test_input = vec![
|
||||
create_random_segment_meta(10),
|
||||
create_random_segment_meta(10),
|
||||
create_random_segment_meta(10),
|
||||
];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 1);
|
||||
}
|
||||
@@ -137,17 +139,17 @@ mod tests {
|
||||
// * one with the 3 * 1000-docs segments
|
||||
// no MergeCandidate expected for the 2 * 10_000-docs segments as min_merge_size=3
|
||||
let test_input = vec![
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(10000),
|
||||
seg_meta(10000),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
create_random_segment_meta(10),
|
||||
create_random_segment_meta(10),
|
||||
create_random_segment_meta(10),
|
||||
create_random_segment_meta(1000),
|
||||
create_random_segment_meta(1000),
|
||||
create_random_segment_meta(1000),
|
||||
create_random_segment_meta(10000),
|
||||
create_random_segment_meta(10000),
|
||||
create_random_segment_meta(10),
|
||||
create_random_segment_meta(10),
|
||||
create_random_segment_meta(10),
|
||||
];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 2);
|
||||
@@ -157,12 +159,12 @@ mod tests {
|
||||
fn test_log_merge_policy_within_levels() {
|
||||
// multiple levels all get merged correctly
|
||||
let test_input = vec![
|
||||
seg_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75)
|
||||
seg_meta(11), // log2(11) = ~3.46
|
||||
seg_meta(12), // log2(12) = ~3.58
|
||||
seg_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75)
|
||||
seg_meta(1000), // log2(1000) = ~9.97
|
||||
seg_meta(1000),
|
||||
create_random_segment_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75)
|
||||
create_random_segment_meta(11), // log2(11) = ~3.46
|
||||
create_random_segment_meta(12), // log2(12) = ~3.58
|
||||
create_random_segment_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75)
|
||||
create_random_segment_meta(1000), // log2(1000) = ~9.97
|
||||
create_random_segment_meta(1000),
|
||||
]; // log2(1000) = ~9.97
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 2);
|
||||
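The bucketing these comments describe can be summarized as below; this is inferred from the log2 annotations, and the 0.75 band plus the base-2 logarithm are assumptions about `LogMergePolicy`, not code lifted from it.

// Two segments land in the same level when their log2(num_docs) lie within
// 0.75 of the largest segment of that level.
fn same_level(num_docs: u32, largest_in_level: u32) -> bool {
    (largest_in_level as f64).log2() - (num_docs as f64).log2() <= 0.75
}
assert!(same_level(10, 12));    // 3.58 - 3.32 = 0.26
assert!(!same_level(12, 1000)); // 9.97 - 3.58 = 6.39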
@@ -171,12 +173,12 @@ mod tests {
|
||||
fn test_log_merge_policy_small_segments() {
|
||||
// segments under min_layer_size are merged together
|
||||
let test_input = vec![
|
||||
seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(2),
|
||||
seg_meta(2),
|
||||
seg_meta(2),
|
||||
create_random_segment_meta(1),
|
||||
create_random_segment_meta(1),
|
||||
create_random_segment_meta(1),
|
||||
create_random_segment_meta(2),
|
||||
create_random_segment_meta(2),
|
||||
create_random_segment_meta(2),
|
||||
];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 1);
|
||||
|
||||
@@ -683,7 +683,7 @@ mod tests {
|
||||
};
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
{
|
||||
// writing the segment
|
||||
{
|
||||
@@ -733,9 +733,10 @@ mod tests {
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.expect("Failed to initiate merge")
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
@@ -979,6 +980,7 @@ mod tests {
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.expect("Failed to initiate merge")
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index.load_searchers().unwrap();
|
||||
@@ -1075,6 +1077,7 @@ mod tests {
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.expect("Failed to initiate merge")
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index.load_searchers().unwrap();
|
||||
@@ -1128,6 +1131,7 @@ mod tests {
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.expect("Failed to initiate merge")
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index.load_searchers().unwrap();
|
||||
@@ -1138,126 +1142,126 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
// #[test]
|
||||
// fn test_merge_facets() {
|
||||
// let mut schema_builder = schema::SchemaBuilder::default();
|
||||
// let facet_field = schema_builder.add_facet_field("facet");
|
||||
// let index = Index::create_in_ram(schema_builder.build());
|
||||
// use schema::Facet;
|
||||
// {
|
||||
// let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
// let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
|
||||
// let mut doc = Document::default();
|
||||
// for facet in doc_facets {
|
||||
// doc.add_facet(facet_field, Facet::from(facet));
|
||||
// }
|
||||
// index_writer.add_document(doc);
|
||||
// };
|
||||
//
|
||||
// index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
|
||||
// index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
|
||||
// index_doc(&mut index_writer, &["/top/a", "/top/b"]);
|
||||
// index_doc(&mut index_writer, &["/top/a"]);
|
||||
//
|
||||
// index_doc(&mut index_writer, &["/top/b", "/top/d"]);
|
||||
// index_doc(&mut index_writer, &["/top/d"]);
|
||||
// index_doc(&mut index_writer, &["/top/e"]);
|
||||
// index_writer.commit().expect("committed");
|
||||
//
|
||||
// index_doc(&mut index_writer, &["/top/a"]);
|
||||
// index_doc(&mut index_writer, &["/top/b"]);
|
||||
// index_doc(&mut index_writer, &["/top/c"]);
|
||||
// index_writer.commit().expect("committed");
|
||||
//
|
||||
// index_doc(&mut index_writer, &["/top/e", "/top/f"]);
|
||||
// index_writer.commit().expect("committed");
|
||||
// }
|
||||
// index.load_searchers().unwrap();
|
||||
// let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
|
||||
// let searcher = index.searcher();
|
||||
// let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
// facet_collector.add_facet(Facet::from("/top"));
|
||||
// use collector::{CountCollector, MultiCollector};
|
||||
// let mut count_collector = CountCollector::default();
|
||||
// {
|
||||
// let mut multi_collectors = MultiCollector::new();
|
||||
// multi_collectors.add_collector(&mut count_collector);
|
||||
// multi_collectors.add_collector(&mut facet_collector);
|
||||
// searcher.search(&AllQuery, &mut multi_collectors).unwrap();
|
||||
// }
|
||||
// assert_eq!(count_collector.count(), expected_num_docs);
|
||||
// let facet_counts = facet_collector.harvest();
|
||||
// let facets: Vec<(String, u64)> = facet_counts
|
||||
// .get("/top")
|
||||
// .map(|(facet, count)| (facet.to_string(), count))
|
||||
// .collect();
|
||||
// assert_eq!(
|
||||
// facets,
|
||||
// expected
|
||||
// .iter()
|
||||
// .map(|&(facet_str, count)| (String::from(facet_str), count))
|
||||
// .collect::<Vec<_>>()
|
||||
// );
|
||||
// };
|
||||
// test_searcher(
|
||||
// 11,
|
||||
// &[
|
||||
// ("/top/a", 5),
|
||||
// ("/top/b", 5),
|
||||
// ("/top/c", 2),
|
||||
// ("/top/d", 2),
|
||||
// ("/top/e", 2),
|
||||
// ("/top/f", 1),
|
||||
// ],
|
||||
// );
|
||||
//
|
||||
// // Merging the segments
|
||||
// {
|
||||
// let segment_ids = index
|
||||
// .searchable_segment_ids()
|
||||
// .expect("Searchable segments failed.");
|
||||
// let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
// index_writer
|
||||
// .merge(&segment_ids)
|
||||
// .wait()
|
||||
// .expect("Merging failed");
|
||||
// index_writer.wait_merging_threads().unwrap();
|
||||
//
|
||||
// index.load_searchers().unwrap();
|
||||
// test_searcher(
|
||||
// 11,
|
||||
// &[
|
||||
// ("/top/a", 5),
|
||||
// ("/top/b", 5),
|
||||
// ("/top/c", 2),
|
||||
// ("/top/d", 2),
|
||||
// ("/top/e", 2),
|
||||
// ("/top/f", 1),
|
||||
// ],
|
||||
// );
|
||||
// }
|
||||
//
|
||||
// // Deleting one term
|
||||
// {
|
||||
// let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
// let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
|
||||
// let facet_term = Term::from_facet(facet_field, &facet);
|
||||
// index_writer.delete_term(facet_term);
|
||||
// index_writer.commit().unwrap();
|
||||
// index.load_searchers().unwrap();
|
||||
// test_searcher(
|
||||
// 9,
|
||||
// &[
|
||||
// ("/top/a", 3),
|
||||
// ("/top/b", 3),
|
||||
// ("/top/c", 1),
|
||||
// ("/top/d", 2),
|
||||
// ("/top/e", 2),
|
||||
// ("/top/f", 1),
|
||||
// ],
|
||||
// );
|
||||
// }
|
||||
// }
|
||||
#[test]
|
||||
fn test_merge_facets() {
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
use schema::Facet;
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
|
||||
let mut doc = Document::default();
|
||||
for facet in doc_facets {
|
||||
doc.add_facet(facet_field, Facet::from(facet));
|
||||
}
|
||||
index_writer.add_document(doc);
|
||||
};
|
||||
|
||||
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
|
||||
index_doc(&mut index_writer, &["/top/a", "/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/a"]);
|
||||
|
||||
index_doc(&mut index_writer, &["/top/b", "/top/d"]);
|
||||
index_doc(&mut index_writer, &["/top/d"]);
|
||||
index_doc(&mut index_writer, &["/top/e"]);
|
||||
index_writer.commit().expect("committed");
|
||||
|
||||
index_doc(&mut index_writer, &["/top/a"]);
|
||||
index_doc(&mut index_writer, &["/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/c"]);
|
||||
index_writer.commit().expect("committed");
|
||||
|
||||
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
|
||||
index_writer.commit().expect("committed");
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
|
||||
let searcher = index.searcher();
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet(Facet::from("/top"));
|
||||
use collector::{CountCollector, MultiCollector};
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut multi_collectors =
|
||||
MultiCollector::from(vec![&mut count_collector, &mut facet_collector]);
|
||||
searcher.search(&AllQuery, &mut multi_collectors).unwrap();
|
||||
}
|
||||
assert_eq!(count_collector.count(), expected_num_docs);
|
||||
let facet_counts = facet_collector.harvest();
|
||||
let facets: Vec<(String, u64)> = facet_counts
|
||||
.get("/top")
|
||||
.map(|(facet, count)| (facet.to_string(), count))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
facets,
|
||||
expected
|
||||
.iter()
|
||||
.map(|&(facet_str, count)| (String::from(facet_str), count))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
};
|
||||
test_searcher(
|
||||
11,
|
||||
&[
|
||||
("/top/a", 5),
|
||||
("/top/b", 5),
|
||||
("/top/c", 2),
|
||||
("/top/d", 2),
|
||||
("/top/e", 2),
|
||||
("/top/f", 1),
|
||||
],
|
||||
);
|
||||
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.expect("Failed to initiate merge")
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
test_searcher(
|
||||
11,
|
||||
&[
|
||||
("/top/a", 5),
|
||||
("/top/b", 5),
|
||||
("/top/c", 2),
|
||||
("/top/d", 2),
|
||||
("/top/e", 2),
|
||||
("/top/f", 1),
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
// Deleting one term
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
|
||||
let facet_term = Term::from_facet(facet_field, &facet);
|
||||
index_writer.delete_term(facet_term);
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
test_searcher(
|
||||
9,
|
||||
&[
|
||||
("/top/a", 3),
|
||||
("/top/b", 3),
|
||||
("/top/c", 1),
|
||||
("/top/d", 2),
|
||||
("/top/e", 2),
|
||||
("/top/f", 1),
|
||||
],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_multivalued_int_fields_all_deleted() {
|
||||
@@ -1290,6 +1294,7 @@ mod tests {
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.expect("Failed to initiate merge")
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
@@ -1392,6 +1397,7 @@ mod tests {
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.expect("Failed to initiate merge")
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
|
||||
@@ -2,6 +2,8 @@ use super::segment_register::SegmentRegister;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::{LOCKFILE_FILEPATH, META_FILEPATH};
|
||||
use error::ErrorKind;
|
||||
use error::Result as TantivyResult;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::SegmentEntry;
|
||||
use std::collections::hash_set::HashSet;
|
||||
@@ -64,8 +66,9 @@ impl SegmentManager {
|
||||
|
||||
/// Returns all of the segment entries (committed or uncommitted)
|
||||
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
|
||||
let mut segment_entries = self.read().uncommitted.segment_entries();
|
||||
segment_entries.extend(self.read().committed.segment_entries());
|
||||
let registers_lock = self.read();
|
||||
let mut segment_entries = registers_lock.uncommitted.segment_entries();
|
||||
segment_entries.extend(registers_lock.committed.segment_entries());
|
||||
segment_entries
|
||||
}
|
||||
|
||||
@@ -76,32 +79,15 @@ impl SegmentManager {
|
||||
}
|
||||
|
||||
pub fn list_files(&self) -> HashSet<PathBuf> {
|
||||
let registers_lock = self.read();
|
||||
let mut files = HashSet::new();
|
||||
files.insert(META_FILEPATH.clone());
|
||||
files.insert(LOCKFILE_FILEPATH.clone());
|
||||
|
||||
let segment_metas: Vec<SegmentMeta> = registers_lock
|
||||
.committed
|
||||
.get_all_segments()
|
||||
.into_iter()
|
||||
.chain(registers_lock.uncommitted.get_all_segments().into_iter())
|
||||
.chain(registers_lock.writing.iter().cloned().map(SegmentMeta::new))
|
||||
.collect();
|
||||
for segment_meta in segment_metas {
|
||||
for segment_meta in SegmentMeta::all() {
|
||||
files.extend(segment_meta.list_files());
|
||||
}
|
||||
files
|
||||
}
|
||||
|
||||
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
|
||||
let registers = self.read();
|
||||
registers
|
||||
.committed
|
||||
.segment_entry(segment_id)
|
||||
.or_else(|| registers.uncommitted.segment_entry(segment_id))
|
||||
}
|
||||
|
||||
// Lock poisoning should never happen :
|
||||
// The lock is acquired and released within this class,
|
||||
// and the operations cannot panic.
|
||||
@@ -126,19 +112,38 @@ impl SegmentManager {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) {
|
||||
/// Marks a list of segments as being merged.
|
||||
///
|
||||
/// Returns an error if some segments are missing, or if
|
||||
/// the `segment_ids` are not either all committed or all
|
||||
/// uncommitted.
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
|
||||
let mut registers_lock = self.write();
|
||||
let mut segment_entries = vec![];
|
||||
if registers_lock.uncommitted.contains_all(segment_ids) {
|
||||
for segment_id in segment_ids {
|
||||
registers_lock.uncommitted.start_merge(segment_id);
|
||||
let segment_entry = registers_lock.uncommitted
|
||||
.start_merge(segment_id)
|
||||
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
|
||||
segment_entries.push(segment_entry);
|
||||
}
|
||||
} else if registers_lock.committed.contains_all(segment_ids) {
|
||||
for segment_id in segment_ids {
|
||||
let segment_entry = registers_lock.committed
|
||||
.start_merge(segment_id)
|
||||
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
|
||||
segment_entries.push(segment_entry);
|
||||
}
|
||||
for segment_id in segment_ids {
|
||||
registers_lock.committed.start_merge(segment_id);
|
||||
}
|
||||
} else {
|
||||
error!("Merge operation sent for segments that are not all uncommited or commited.");
|
||||
let error_msg = "Merge operation sent for segments that are not \
|
||||
all uncommited or commited."
|
||||
.to_string();
|
||||
bail!(ErrorKind::InvalidArgument(error_msg))
|
||||
}
|
||||
Ok(segment_entries)
|
||||
}
|
||||
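A behavior sketch of this internal API (the `segment_manager` and `segment_ids` bindings are hypothetical): either every requested segment is found in a single register and flagged as merging, or the whole request is rejected up front.

match segment_manager.start_merge(&segment_ids) {
    // All ids were found, all committed or all uncommitted: their entries are returned.
    Ok(entries) => debug_assert_eq!(entries.len(), segment_ids.len()),
    // Unknown ids, or a committed/uncommitted mix: nothing gets marked as merging.
    Err(err) => warn!("merge rejected: {}", err),
}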
|
||||
pub fn cancel_merge(
|
||||
|
||||
@@ -3,8 +3,7 @@ use core::SegmentMeta;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::segment_entry::SegmentEntry;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
|
||||
/// The segment register keeps track
|
||||
/// of the list of segments, their sizes as well
|
||||
@@ -39,13 +38,6 @@ impl SegmentRegister {
|
||||
self.segment_states.len()
|
||||
}
|
||||
|
||||
pub fn get_all_segments(&self) -> Vec<SegmentMeta> {
|
||||
self.segment_states
|
||||
.values()
|
||||
.map(|segment_entry| segment_entry.meta().clone())
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn get_mergeable_segments(&self) -> Vec<SegmentMeta> {
|
||||
self.segment_states
|
||||
.values()
|
||||
@@ -67,10 +59,6 @@ impl SegmentRegister {
|
||||
segment_ids
|
||||
}
|
||||
|
||||
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
|
||||
self.segment_states.get(segment_id).cloned()
|
||||
}
|
||||
|
||||
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
|
||||
segment_ids
|
||||
.iter()
|
||||
@@ -93,11 +81,13 @@ impl SegmentRegister {
|
||||
.cancel_merge();
|
||||
}
|
||||
|
||||
pub fn start_merge(&mut self, segment_id: &SegmentId) {
|
||||
self.segment_states
|
||||
.get_mut(segment_id)
|
||||
.expect("Received a merge notification for a segment that is not registered")
|
||||
.start_merge();
|
||||
pub fn start_merge(&mut self, segment_id: &SegmentId) -> Option<SegmentEntry> {
|
||||
if let Some(segment_entry) = self.segment_states.get_mut(segment_id) {
|
||||
segment_entry.start_merge();
|
||||
Some(segment_entry.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister {
|
||||
@@ -109,6 +99,11 @@ impl SegmentRegister {
|
||||
}
|
||||
SegmentRegister { segment_states }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
|
||||
self.segment_states.get(segment_id).cloned()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -137,7 +132,7 @@ mod tests {
|
||||
let segment_id_merged = SegmentId::generate_random();
|
||||
|
||||
{
|
||||
let segment_meta = SegmentMeta::new(segment_id_a);
|
||||
let segment_meta = SegmentMeta::new(segment_id_a, 0u32);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
@@ -150,7 +145,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
|
||||
{
|
||||
let segment_meta = SegmentMeta::new(segment_id_b);
|
||||
let segment_meta = SegmentMeta::new(segment_id_b, 0u32);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
@@ -180,7 +175,7 @@ mod tests {
|
||||
segment_register.remove_segment(&segment_id_a);
|
||||
segment_register.remove_segment(&segment_id_b);
|
||||
{
|
||||
let segment_meta_merged = SegmentMeta::new(segment_id_merged);
|
||||
let segment_meta_merged = SegmentMeta::new(segment_id_merged, 0u32);
|
||||
let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
|
||||
@@ -7,11 +7,11 @@ use core::SegmentMeta;
|
||||
use core::SerializableSegment;
|
||||
use core::META_FILEPATH;
|
||||
use directory::Directory;
|
||||
use directory::FileProtection;
|
||||
use error::{Error, ErrorKind, Result};
|
||||
use error::{Error, ErrorKind, Result, ResultExt};
|
||||
use futures::oneshot;
|
||||
use futures::sync::oneshot::Receiver;
|
||||
use futures::Future;
|
||||
use futures_cpupool::Builder as CpuPoolBuilder;
|
||||
use futures_cpupool::CpuFuture;
|
||||
use futures_cpupool::CpuPool;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
@@ -29,8 +29,7 @@ use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize};
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::thread;
|
||||
@@ -87,38 +86,19 @@ pub fn save_metas(
|
||||
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
|
||||
|
||||
fn perform_merge(
|
||||
segment_ids: &[SegmentId],
|
||||
segment_updater: &SegmentUpdater,
|
||||
index: &Index,
|
||||
mut segment_entries: Vec<SegmentEntry>,
|
||||
mut merged_segment: Segment,
|
||||
target_opstamp: u64,
|
||||
) -> Result<SegmentEntry> {
|
||||
// first we need to apply deletes to our segment.
|
||||
info!("Start merge: {:?}", segment_ids);
|
||||
|
||||
let index = &segment_updater.0.index;
|
||||
// TODO add logging
|
||||
let schema = index.schema();
|
||||
let mut segment_entries = vec![];
|
||||
|
||||
let mut file_protections: Vec<FileProtection> = vec![];
|
||||
|
||||
for segment_id in segment_ids {
|
||||
if let Some(mut segment_entry) = segment_updater.0.segment_manager.segment_entry(segment_id)
|
||||
{
|
||||
let segment = index.segment(segment_entry.meta().clone());
|
||||
if let Some(file_protection) =
|
||||
advance_deletes(segment, &mut segment_entry, target_opstamp)?
|
||||
{
|
||||
file_protections.push(file_protection);
|
||||
}
|
||||
segment_entries.push(segment_entry);
|
||||
} else {
|
||||
error!("Error, had to abort merge as some of the segment is not managed anymore.");
|
||||
let msg = format!(
|
||||
"Segment {:?} requested for merge is not managed.",
|
||||
segment_id
|
||||
);
|
||||
bail!(ErrorKind::InvalidArgument(msg));
|
||||
}
|
||||
for segment_entry in &mut segment_entries {
|
||||
let segment = index.segment(segment_entry.meta().clone());
|
||||
advance_deletes(segment, segment_entry, target_opstamp)?;
|
||||
}
|
||||
|
||||
let delete_cursor = segment_entries[0].delete_cursor().clone();
|
||||
@@ -135,13 +115,13 @@ fn perform_merge(
|
||||
// to merge the two segments.
|
||||
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)
|
||||
.expect("Creating index serializer failed");
|
||||
.chain_err(|| "Creating index serializer failed")?;
|
||||
|
||||
let num_docs = merger
|
||||
.write(segment_serializer)
|
||||
.expect("Serializing merged index failed");
|
||||
let mut segment_meta = SegmentMeta::new(merged_segment.id());
|
||||
segment_meta.set_max_doc(num_docs);
|
||||
.chain_err(|| "Serializing merged index failed")?;
|
||||
|
||||
let segment_meta = SegmentMeta::new(merged_segment.id(), num_docs);
|
||||
|
||||
let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None);
|
||||
Ok(after_merge_segment_entry)
|
||||
@@ -167,8 +147,12 @@ impl SegmentUpdater {
|
||||
) -> Result<SegmentUpdater> {
|
||||
let segments = index.searchable_segment_metas()?;
|
||||
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
|
||||
let pool = CpuPoolBuilder::new()
|
||||
.name_prefix("segment_updater")
|
||||
.pool_size(1)
|
||||
.create();
|
||||
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
|
||||
pool: CpuPool::new(1),
|
||||
pool,
|
||||
index,
|
||||
segment_manager,
|
||||
merge_policy: RwLock::new(Box::new(DefaultMergePolicy::default())),
|
||||
@@ -283,69 +267,85 @@ impl SegmentUpdater {
|
||||
}).wait()
|
||||
}
|
||||
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Receiver<SegmentMeta> {
|
||||
self.0.segment_manager.start_merge(segment_ids);
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
|
||||
//let future_merged_segment = */
|
||||
let segment_ids_vec = segment_ids.to_vec();
|
||||
self.run_async(move |segment_updater| {
|
||||
segment_updater.start_merge_impl(&segment_ids_vec[..])
|
||||
}).wait()?
|
||||
}
|
||||
|
||||
// `segment_ids` is required to be non-empty.
|
||||
fn start_merge_impl(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
|
||||
assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty.");
|
||||
|
||||
let segment_updater_clone = self.clone();
|
||||
let segment_entries: Vec<SegmentEntry> = self.0.segment_manager.start_merge(segment_ids)?;
|
||||
|
||||
let segment_ids_vec = segment_ids.to_vec();
|
||||
|
||||
let merging_thread_id = self.get_merging_thread_id();
|
||||
info!(
|
||||
"Starting merge thread #{} - {:?}",
|
||||
merging_thread_id, segment_ids
|
||||
);
|
||||
let (merging_future_send, merging_future_recv) = oneshot();
|
||||
|
||||
if segment_ids.is_empty() {
|
||||
return merging_future_recv;
|
||||
}
|
||||
|
||||
let target_opstamp = self.0.stamper.stamp();
|
||||
let merging_join_handle = thread::spawn(move || {
|
||||
// first we need to apply deletes to our segment.
|
||||
let merged_segment = segment_updater_clone.new_segment();
|
||||
let merged_segment_id = merged_segment.id();
|
||||
let merge_result = perform_merge(
|
||||
&segment_ids_vec,
|
||||
&segment_updater_clone,
|
||||
merged_segment,
|
||||
target_opstamp,
|
||||
);
|
||||
|
||||
match merge_result {
|
||||
Ok(after_merge_segment_entry) => {
|
||||
let merged_segment_meta = after_merge_segment_entry.meta().clone();
|
||||
segment_updater_clone
|
||||
.end_merge(segment_ids_vec, after_merge_segment_entry)
|
||||
.expect("Segment updater thread is corrupted.");
|
||||
// first we need to apply deletes to our segment.
|
||||
let merging_join_handle = thread::Builder::new()
|
||||
.name(format!("mergingthread-{}", merging_thread_id))
|
||||
.spawn(move || {
|
||||
// first we need to apply deletes to our segment.
|
||||
let merged_segment = segment_updater_clone.new_segment();
|
||||
let merged_segment_id = merged_segment.id();
|
||||
let merge_result = perform_merge(
|
||||
&segment_updater_clone.0.index,
|
||||
segment_entries,
|
||||
merged_segment,
|
||||
target_opstamp,
|
||||
);
|
||||
|
||||
// the future may fail if the listener of the oneshot future
|
||||
// has been destroyed.
|
||||
//
|
||||
// This is not a problem here, so we just ignore any
|
||||
// possible error.
|
||||
let _merging_future_res = merging_future_send.send(merged_segment_meta);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e);
|
||||
// ... cancel merge
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
match merge_result {
|
||||
Ok(after_merge_segment_entry) => {
|
||||
let merged_segment_meta = after_merge_segment_entry.meta().clone();
|
||||
segment_updater_clone
|
||||
.end_merge(segment_ids_vec, after_merge_segment_entry)
|
||||
.expect("Segment updater thread is corrupted.");
|
||||
|
||||
// the future may fail if the listener of the oneshot future
|
||||
// has been destroyed.
|
||||
//
|
||||
// This is not a problem here, so we just ignore any
|
||||
// possible error.
|
||||
let _merging_future_res = merging_future_send.send(merged_segment_meta);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e);
|
||||
// ... cancel merge
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
}
|
||||
segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id);
|
||||
// merging_future_send will be dropped, sending an error to the future.
|
||||
}
|
||||
segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id);
|
||||
// merging_future_send will be dropped, sending an error to the future.
|
||||
}
|
||||
}
|
||||
segment_updater_clone
|
||||
.0
|
||||
.merging_threads
|
||||
.write()
|
||||
.unwrap()
|
||||
.remove(&merging_thread_id);
|
||||
Ok(())
|
||||
});
|
||||
segment_updater_clone
|
||||
.0
|
||||
.merging_threads
|
||||
.write()
|
||||
.unwrap()
|
||||
.remove(&merging_thread_id);
|
||||
Ok(())
|
||||
})
|
||||
.expect("Failed to spawn a thread.");
|
||||
self.0
|
||||
.merging_threads
|
||||
.write()
|
||||
.unwrap()
|
||||
.insert(merging_thread_id, merging_join_handle);
|
||||
merging_future_recv
|
||||
Ok(merging_future_recv)
|
||||
}
|
||||
|
||||
fn consider_merge_options(&self) {
|
||||
@@ -358,8 +358,18 @@ impl SegmentUpdater {
|
||||
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
|
||||
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
|
||||
for MergeCandidate(segment_metas) in merge_candidates {
|
||||
if let Err(e) = self.start_merge(&segment_metas).fuse().poll() {
|
||||
error!("The merge task failed quickly after starting: {:?}", e);
|
||||
match self.start_merge_impl(&segment_metas) {
|
||||
Ok(merge_future) => {
|
||||
if let Err(e) = merge_future.fuse().poll() {
|
||||
error!("The merge task failed quickly after starting: {:?}", e);
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
warn!(
|
||||
"Starting the merge failed for the following reason. This is not fatal. {}",
|
||||
err
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -382,7 +392,6 @@ impl SegmentUpdater {
|
||||
self.run_async(move |segment_updater| {
|
||||
info!("End merge {:?}", after_merge_segment_entry.meta());
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||
let mut _file_protection_opt = None;
|
||||
if let Some(delete_operation) = delete_cursor.get() {
|
||||
let committed_opstamp = segment_updater
|
||||
.0
|
||||
@@ -393,29 +402,22 @@ impl SegmentUpdater {
|
||||
if delete_operation.opstamp < committed_opstamp {
|
||||
let index = &segment_updater.0.index;
|
||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||
match advance_deletes(
|
||||
segment,
|
||||
&mut after_merge_segment_entry,
|
||||
committed_opstamp,
|
||||
) {
|
||||
Ok(file_protection_opt_res) => {
|
||||
_file_protection_opt = file_protection_opt_res;
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
before_merge_segment_ids, e
|
||||
);
|
||||
// ... cancel merge
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
}
|
||||
segment_updater.cancel_merge(
|
||||
&before_merge_segment_ids,
|
||||
after_merge_segment_entry.segment_id(),
|
||||
);
|
||||
return;
|
||||
if let Err(e) =
|
||||
advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp)
|
||||
{
|
||||
error!(
|
||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
before_merge_segment_ids, e
|
||||
);
|
||||
// ... cancel merge
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
}
|
||||
segment_updater.cancel_merge(
|
||||
&before_merge_segment_ids,
|
||||
after_merge_segment_entry.segment_id(),
|
||||
);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
use super::operation::AddOperation;
|
||||
use core::Segment;
|
||||
use core::SerializableSegment;
|
||||
use datastruct::stacker::Heap;
|
||||
use fastfield::FastFieldsWriter;
|
||||
use fieldnorm::FieldNormsWriter;
|
||||
use indexer::index_writer::MARGIN_IN_BYTES;
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use postings::MultiFieldPostingsWriter;
|
||||
use schema::FieldType;
|
||||
@@ -24,10 +22,9 @@ use Result;
|
||||
///
|
||||
/// It creates the postings lists in anonymous memory.
/// The segment is laid out on disk when the segment gets `finalized`.
|
||||
pub struct SegmentWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
pub struct SegmentWriter {
|
||||
max_doc: DocId,
|
||||
multifield_postings: MultiFieldPostingsWriter<'a>,
|
||||
multifield_postings: MultiFieldPostingsWriter,
|
||||
segment_serializer: SegmentSerializer,
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FieldNormsWriter,
|
||||
@@ -35,7 +32,7 @@ pub struct SegmentWriter<'a> {
|
||||
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
|
||||
}
|
||||
|
||||
impl<'a> SegmentWriter<'a> {
|
||||
impl SegmentWriter {
|
||||
/// Creates a new `SegmentWriter`
|
||||
///
|
||||
/// The arguments are defined as follows
|
||||
@@ -46,13 +43,12 @@ impl<'a> SegmentWriter<'a> {
|
||||
/// - segment: The segment being written
|
||||
/// - schema
|
||||
pub fn for_segment(
|
||||
heap: &'a Heap,
|
||||
table_bits: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema,
|
||||
) -> Result<SegmentWriter<'a>> {
|
||||
) -> Result<SegmentWriter> {
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits);
|
||||
let tokenizers = schema
|
||||
.fields()
|
||||
.iter()
|
||||
@@ -68,7 +64,6 @@ impl<'a> SegmentWriter<'a> {
|
||||
})
|
||||
.collect();
|
||||
Ok(SegmentWriter {
|
||||
heap,
|
||||
max_doc: 0,
|
||||
multifield_postings,
|
||||
fieldnorms_writer: FieldNormsWriter::for_schema(schema),
|
||||
@@ -94,22 +89,8 @@ impl<'a> SegmentWriter<'a> {
|
||||
Ok(self.doc_opstamps)
|
||||
}
|
||||
|
||||
/// Returns true iff the segment writer's buffer has reached capacity.
|
||||
///
|
||||
/// The limit is defined as `the user defined heap size - an arbitrary margin of 10MB`
|
||||
/// The `Segment` is `finalize`d when the buffer gets full.
|
||||
///
|
||||
/// Because we cannot cut through a document, the margin is there to ensure that we rarely
/// exceed the heap size.
|
||||
pub fn is_buffer_full(&self) -> bool {
|
||||
self.heap.num_free_bytes() <= MARGIN_IN_BYTES
|
||||
}
|
||||
|
||||
/// Returns true if the term dictionary hashmap is reaching capacity.
/// It is one of the conditions that triggers a `SegmentWriter` to
|
||||
/// be finalized.
|
||||
pub(crate) fn is_term_saturated(&self) -> bool {
|
||||
self.multifield_postings.is_term_saturated()
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.multifield_postings.mem_usage()
|
||||
}
|
||||
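This accounting replaces the old `is_buffer_full` check; a sketch of the caller side (mirroring the loop in `index_documents` above, with `memory_budget` standing for the per-thread budget):

// Close the segment once actual usage (arena + term hash table) comes within
// MARGIN_IN_BYTES of the per-thread budget.
if segment_writer.mem_usage() >= memory_budget - MARGIN_IN_BYTES {
    // finalize the segment and start a new one
}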
|
||||
/// Indexes a new document
|
||||
@@ -248,7 +229,7 @@ fn write(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl<'a> SerializableSegment for SegmentWriter<'a> {
|
||||
impl SerializableSegment for SegmentWriter {
|
||||
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
|
||||
let max_doc = self.max_doc;
|
||||
write(
|
||||
|
||||
10
src/lib.rs
@@ -55,7 +55,7 @@
|
||||
//!
|
||||
//! // Indexing documents
|
||||
//!
|
||||
//! let index = Index::create(index_path, schema.clone())?;
|
||||
//! let index = Index::create_in_dir(index_path, schema.clone())?;
|
||||
//!
|
||||
//! // Here we use a buffer of 100MB that will be split
|
||||
//! // between indexing threads.
|
||||
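A minimal end-to-end sketch with the renamed constructor (`index_path`, `schema`, and `title_field` are placeholders; the schema is assumed to declare `title_field` as a text field):

let index = Index::create_in_dir(index_path, schema.clone())?;
let mut index_writer = index.writer(50_000_000)?; // budget split between indexing threads
index_writer.add_document(doc!(title_field => "Of Mice and Men"));
index_writer.commit()?;
index.load_searchers()?;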
@@ -136,11 +136,11 @@ extern crate combine;
|
||||
extern crate crossbeam;
|
||||
extern crate fnv;
|
||||
extern crate fst;
|
||||
extern crate fst_regex;
|
||||
extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
extern crate itertools;
|
||||
extern crate levenshtein_automata;
|
||||
extern crate lz4;
|
||||
extern crate num_cpus;
|
||||
extern crate owning_ref;
|
||||
extern crate regex;
|
||||
@@ -180,6 +180,9 @@ mod macros;
|
||||
|
||||
pub use error::{Error, ErrorKind, ResultExt};
|
||||
|
||||
extern crate census;
|
||||
extern crate owned_read;
|
||||
|
||||
/// Tantivy result.
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
@@ -188,8 +191,7 @@ mod compression;
|
||||
mod core;
|
||||
mod indexer;
|
||||
|
||||
mod datastruct;
|
||||
#[allow(unused_doc_comment)]
|
||||
#[allow(unused_doc_comments)]
|
||||
mod error;
|
||||
pub mod tokenizer;
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ mod postings_writer;
|
||||
mod recorder;
|
||||
mod segment_postings;
|
||||
mod serializer;
|
||||
mod stacker;
|
||||
mod term_info;
|
||||
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
@@ -21,6 +22,8 @@ pub use self::term_info::TermInfo;
|
||||
|
||||
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
|
||||
|
||||
pub(crate) use self::stacker::compute_table_size;
|
||||
|
||||
pub use common::HasLen;
|
||||
|
||||
pub(crate) type UnorderedTermId = u64;
|
||||
@@ -39,7 +42,6 @@ pub mod tests {
|
||||
use core::Index;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentReader;
|
||||
use datastruct::stacker::Heap;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fieldnorm::FieldNormReader;
|
||||
use indexer::operation::AddOperation;
|
||||
@@ -160,10 +162,9 @@ pub mod tests {
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let segment = index.new_segment();
|
||||
|
||||
let heap = Heap::with_capacity(10_000_000);
|
||||
{
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap();
|
||||
SegmentWriter::for_segment(18, segment.clone(), &schema).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
// checking that position works if the field has two values
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use datastruct::stacker::{Heap, TermHashMap};
|
||||
use super::stacker::{Addr, MemoryArena, TermHashMap};
|
||||
|
||||
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
use postings::UnorderedTermId;
|
||||
use postings::{FieldSerializer, InvertedIndexSerializer};
|
||||
@@ -14,80 +15,89 @@ use tokenizer::TokenStream;
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
fn posting_from_field_entry<'a>(
|
||||
field_entry: &FieldEntry,
|
||||
heap: &'a Heap,
|
||||
) -> Box<PostingsWriter + 'a> {
|
||||
fn posting_from_field_entry<'a>(field_entry: &FieldEntry) -> Box<PostingsWriter> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|indexing_options| match indexing_options.index_option() {
|
||||
IndexRecordOption::Basic => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed()
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)),
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
// FieldType::Bytes cannot actually be indexed.
|
||||
// TODO fix during the indexer refactoring described in #276
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MultiFieldPostingsWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
pub struct MultiFieldPostingsWriter {
|
||||
heap: MemoryArena,
|
||||
schema: Schema,
|
||||
term_index: TermHashMap<'a>,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
|
||||
term_index: TermHashMap,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter>>,
|
||||
}
|
||||
|
||||
impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
impl MultiFieldPostingsWriter {
|
||||
/// Creates a new `MultiFieldPostingsWriter` given
/// a schema and a hash table size (in bits).
|
||||
pub fn new(schema: &Schema, table_bits: usize, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
|
||||
let term_index = TermHashMap::new(table_bits, heap);
|
||||
pub fn new(schema: &Schema, table_bits: usize) -> MultiFieldPostingsWriter {
|
||||
let term_index = TermHashMap::new(table_bits);
|
||||
let per_field_postings_writers: Vec<_> = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field_entry| posting_from_field_entry(field_entry, heap))
|
||||
.map(|field_entry| posting_from_field_entry(field_entry))
|
||||
.collect();
|
||||
MultiFieldPostingsWriter {
|
||||
heap: MemoryArena::new(),
|
||||
schema: schema.clone(),
|
||||
heap,
|
||||
term_index,
|
||||
per_field_postings_writers,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.term_index.mem_usage() + self.heap.mem_usage()
|
||||
}
|
||||
|
||||
pub fn index_text(&mut self, doc: DocId, field: Field, token_stream: &mut TokenStream) -> u32 {
|
||||
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
|
||||
postings_writer.index_text(&mut self.term_index, doc, field, token_stream, self.heap)
|
||||
postings_writer.index_text(
|
||||
&mut self.term_index,
|
||||
doc,
|
||||
field,
|
||||
token_stream,
|
||||
&mut self.heap,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn subscribe(&mut self, doc: DocId, term: &Term) -> UnorderedTermId {
|
||||
let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
|
||||
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, self.heap)
|
||||
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, &mut self.heap)
|
||||
}
|
||||
|
||||
/// Serialize the inverted index.
|
||||
/// It pushes all term, one field at a time, towards the
|
||||
/// postings serializer.
|
||||
#[allow(needless_range_loop)]
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
|
||||
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self.term_index
|
||||
.iter()
|
||||
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
|
||||
.collect();
|
||||
term_offsets.sort_by_key(|&(k, _, _)| k);
|
||||
|
||||
let mut offsets: Vec<(Field, usize)> = vec![];
|
||||
@@ -142,23 +152,19 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
postings_writer.serialize(
|
||||
&term_offsets[start..stop],
|
||||
&mut field_serializer,
|
||||
self.heap,
|
||||
&self.term_index.heap,
|
||||
&self.heap,
|
||||
)?;
|
||||
field_serializer.close()?;
|
||||
}
|
||||
Ok(unordered_term_mappings)
|
||||
}
|
||||
|
||||
/// Return true iff the term dictionary is saturated.
|
||||
pub fn is_term_saturated(&self) -> bool {
|
||||
self.term_index.is_saturated()
|
||||
}
|
||||
}
|
||||
|
||||
/// The `PostingsWriter` is in charge of receiving documenting
|
||||
/// and building a `Segment` in anonymous memory.
|
||||
///
|
||||
/// `PostingsWriter` writes in a `Heap`.
|
||||
/// `PostingsWriter` writes in a `MemoryArena`.
|
||||
pub trait PostingsWriter {
|
||||
/// Record that a document contains a term at a given position.
|
||||
///
|
||||
@@ -173,16 +179,17 @@ pub trait PostingsWriter {
|
||||
doc: DocId,
|
||||
pos: u32,
|
||||
term: &Term,
|
||||
heap: &Heap,
|
||||
heap: &mut MemoryArena,
|
||||
) -> UnorderedTermId;
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], u32, UnorderedTermId)],
|
||||
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
term_heap: &MemoryArena,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()>;
|
||||
|
||||
/// Tokenize a text and subscribe all of its token.
|
||||
@@ -192,7 +199,7 @@ pub trait PostingsWriter {
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
token_stream: &mut TokenStream,
|
||||
heap: &Heap,
|
||||
heap: &mut MemoryArena,
|
||||
) -> u32 {
|
||||
let mut term = Term::for_field(field);
|
||||
let num_tokens = {
|
||||
@@ -210,61 +217,67 @@ pub trait PostingsWriter {
|
||||
|
||||
/// The `SpecializedPostingsWriter` is just here to remove dynamic
|
||||
/// dispatch to the recorder information.
|
||||
pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
|
||||
heap: &'a Heap,
|
||||
pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
|
||||
total_num_tokens: u64,
|
||||
_recorder_type: PhantomData<Rec>,
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
|
||||
/// constructor
|
||||
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
|
||||
pub fn new() -> SpecializedPostingsWriter<Rec> {
|
||||
SpecializedPostingsWriter {
|
||||
heap,
|
||||
total_num_tokens: 0u64,
|
||||
_recorder_type: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
|
||||
pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
|
||||
pub fn new_boxed() -> Box<PostingsWriter> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
|
||||
impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
term_index: &mut TermHashMap,
|
||||
doc: DocId,
|
||||
position: u32,
|
||||
term: &Term,
|
||||
heap: &Heap,
|
||||
heap: &mut MemoryArena,
|
||||
) -> UnorderedTermId {
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
let (term_ord, recorder): (UnorderedTermId, &mut Rec) = term_index.get_or_create(term);
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
if current_doc != u32::max_value() {
|
||||
recorder.close_doc(heap);
|
||||
}
|
||||
recorder.new_doc(doc, heap);
|
||||
}
|
||||
self.total_num_tokens += 1;
|
||||
recorder.record_position(position, heap);
|
||||
term_ord
|
||||
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
|
||||
if opt_recorder.is_some() {
|
||||
let mut recorder = opt_recorder.unwrap();
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
recorder.close_doc(heap);
|
||||
recorder.new_doc(doc, heap);
|
||||
}
|
||||
recorder.record_position(position, heap);
|
||||
recorder
|
||||
} else {
|
||||
let mut recorder = Rec::new(heap);
|
||||
recorder.new_doc(doc, heap);
|
||||
recorder.record_position(position, heap);
|
||||
recorder
|
||||
}
|
||||
}) as UnorderedTermId
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], u32, UnorderedTermId)],
|
||||
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
termdict_heap: &MemoryArena,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()> {
|
||||
for &(term_bytes, addr, _) in term_addrs {
|
||||
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
|
||||
let recorder: Rec = unsafe { termdict_heap.read(addr) };
|
||||
serializer.new_term(&term_bytes[4..])?;
|
||||
recorder.serialize(addr, serializer, heap)?;
|
||||
recorder.serialize(serializer, heap)?;
|
||||
serializer.close_term()?;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
|
||||
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
|
||||
use postings::FieldSerializer;
|
||||
use std::{self, io};
|
||||
use DocId;
|
||||
@@ -15,62 +15,53 @@ const POSITION_END: u32 = std::u32::MAX;
|
||||
/// * the document id
|
||||
/// * the term frequency
|
||||
/// * the term positions
|
||||
pub trait Recorder: HeapAllocable {
|
||||
pub trait Recorder: Copy {
|
||||
///
|
||||
fn new(heap: &mut MemoryArena) -> Self;
|
||||
/// Returns the current document
|
||||
fn current_doc(&self) -> u32;
|
||||
/// Starts recording information about a new document
|
||||
/// This method shall only be called if the term is within the document.
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap);
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena);
|
||||
/// Record the position of a term. For each document,
|
||||
/// this method will be called `term_freq` times.
|
||||
fn record_position(&mut self, position: u32, heap: &Heap);
|
||||
fn record_position(&mut self, position: u32, heap: &mut MemoryArena);
|
||||
/// Close the document. It will help record the term frequency.
|
||||
fn close_doc(&mut self, heap: &Heap);
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena);
|
||||
/// Pushes the postings information to the serializer.
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()>;
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>;
|
||||
}
|
||||
|
||||
/// Only records the doc ids
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct NothingRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
}
|
||||
|
||||
impl HeapAllocable for NothingRecorder {
|
||||
fn with_addr(addr: u32) -> NothingRecorder {
|
||||
impl Recorder for NothingRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
NothingRecorder {
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
current_doc: u32::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _heap: &Heap) {}
|
||||
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
|
||||
|
||||
fn close_doc(&mut self, _heap: &Heap) {}
|
||||
fn close_doc(&mut self, _heap: &mut MemoryArena) {}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
for doc in self.stack.iter(self_addr, heap) {
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
for doc in self.stack.iter(heap) {
|
||||
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -78,52 +69,46 @@ impl Recorder for NothingRecorder {
|
||||
}
|
||||
|
||||
/// Recorder encoding document ids, and term frequencies
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
current_tf: u32,
|
||||
}
|
||||
|
||||
impl HeapAllocable for TermFrequencyRecorder {
|
||||
fn with_addr(addr: u32) -> TermFrequencyRecorder {
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
current_doc: u32::max_value(),
|
||||
current_tf: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _heap: &Heap) {
|
||||
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
|
||||
self.current_tf += 1;
|
||||
}
|
||||
|
||||
fn close_doc(&mut self, heap: &Heap) {
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
||||
debug_assert!(self.current_tf > 0);
|
||||
self.stack.push(self.current_tf, heap);
|
||||
self.current_tf = 0;
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
// the last document has not been closed...
|
||||
// its term freq is self.current_tf.
|
||||
let mut doc_iter = self.stack
|
||||
.iter(self_addr, heap)
|
||||
.iter(heap)
|
||||
.chain(Some(self.current_tf).into_iter());
|
||||
|
||||
while let Some(doc) = doc_iter.next() {
|
||||
@@ -137,46 +122,40 @@ impl Recorder for TermFrequencyRecorder {
|
||||
}
|
||||
|
||||
/// Recorder encoding term frequencies as well as positions.
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct TFAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
}
|
||||
|
||||
impl HeapAllocable for TFAndPositionRecorder {
|
||||
fn with_addr(addr: u32) -> TFAndPositionRecorder {
|
||||
impl Recorder for TFAndPositionRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
TFAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
current_doc: u32::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TFAndPositionRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, position: u32, heap: &Heap) {
|
||||
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
|
||||
self.stack.push(position, heap);
|
||||
}
|
||||
|
||||
fn close_doc(&mut self, heap: &Heap) {
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
||||
self.stack.push(POSITION_END, heap);
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
let mut doc_positions = Vec::with_capacity(100);
|
||||
let mut positions_iter = self.stack.iter(self_addr, heap);
|
||||
let mut positions_iter = self.stack.iter(heap);
|
||||
while let Some(doc) = positions_iter.next() {
|
||||
let mut prev_position = 0;
|
||||
doc_positions.clear();
|
||||
|
||||
@@ -2,15 +2,14 @@ use compression::{BlockDecoder, CompressedIntStream, VIntDecoder, COMPRESSION_BL
|
||||
use DocId;
|
||||
|
||||
use common::BitSet;
|
||||
use common::CountingWriter;
|
||||
use common::HasLen;
|
||||
use compression::compressed_block_size;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fst::Streamer;
|
||||
use postings::serializer::PostingsSerializer;
|
||||
use postings::FreqReadingOption;
|
||||
use postings::Postings;
|
||||
use owned_read::OwnedRead;
|
||||
|
||||
struct PositionComputer {
|
||||
// store the amount of position int
|
||||
@@ -78,9 +77,9 @@ impl SegmentPostings {
|
||||
/// and returns a `SegmentPostings` object that embeds a
|
||||
/// buffer with the serialized data.
|
||||
pub fn create_from_docs(docs: &[u32]) -> SegmentPostings {
|
||||
let mut counting_writer = CountingWriter::wrap(Vec::new());
|
||||
let mut buffer = Vec::new();
|
||||
{
|
||||
let mut postings_serializer = PostingsSerializer::new(&mut counting_writer, false);
|
||||
let mut postings_serializer = PostingsSerializer::new(&mut buffer, false);
|
||||
for &doc in docs {
|
||||
postings_serializer.write_doc(doc, 1u32).unwrap();
|
||||
}
|
||||
@@ -88,13 +87,9 @@ impl SegmentPostings {
|
||||
.close_term()
|
||||
.expect("In memory Serialization should never fail.");
|
||||
}
|
||||
let (buffer, _) = counting_writer
|
||||
.finish()
|
||||
.expect("Serializing in a buffer should never fail.");
|
||||
let data = ReadOnlySource::from(buffer);
|
||||
let block_segment_postings = BlockSegmentPostings::from_data(
|
||||
docs.len(),
|
||||
SourceRead::from(data),
|
||||
OwnedRead::new(buffer),
|
||||
FreqReadingOption::NoFreq,
|
||||
);
|
||||
SegmentPostings::from_block_postings(block_segment_postings, None)
|
||||
@@ -306,13 +301,13 @@ pub struct BlockSegmentPostings {
|
||||
doc_offset: DocId,
|
||||
num_bitpacked_blocks: usize,
|
||||
num_vint_docs: usize,
|
||||
remaining_data: SourceRead,
|
||||
remaining_data: OwnedRead,
|
||||
}
|
||||
|
||||
impl BlockSegmentPostings {
|
||||
pub(crate) fn from_data(
|
||||
doc_freq: usize,
|
||||
data: SourceRead,
|
||||
data: OwnedRead,
|
||||
freq_reading_option: FreqReadingOption,
|
||||
) -> BlockSegmentPostings {
|
||||
let num_bitpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE;
|
||||
@@ -339,7 +334,7 @@ impl BlockSegmentPostings {
|
||||
// # Warning
|
||||
//
|
||||
// This does not reset the positions list.
|
||||
pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: SourceRead) {
|
||||
pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: OwnedRead) {
|
||||
let num_binpacked_blocks: usize = doc_freq / COMPRESSION_BLOCK_SIZE;
|
||||
let num_vint_docs = doc_freq & (COMPRESSION_BLOCK_SIZE - 1);
|
||||
self.num_bitpacked_blocks = num_binpacked_blocks;
|
||||
@@ -449,7 +444,7 @@ impl BlockSegmentPostings {
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
freq_reading_option: FreqReadingOption::NoFreq,
|
||||
|
||||
remaining_data: From::from(ReadOnlySource::empty()),
|
||||
remaining_data: OwnedRead::new(vec![]),
|
||||
doc_offset: 0,
|
||||
doc_freq: 0,
|
||||
}
|
||||
|
||||
@@ -239,13 +239,60 @@ impl<'a> FieldSerializer<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
struct Block {
|
||||
doc_ids: [DocId; COMPRESSION_BLOCK_SIZE],
|
||||
term_freqs: [u32; COMPRESSION_BLOCK_SIZE],
|
||||
len: usize
|
||||
}
|
||||
|
||||
impl Block {
|
||||
fn new() -> Self {
|
||||
Block {
|
||||
doc_ids: [0u32; COMPRESSION_BLOCK_SIZE],
|
||||
term_freqs: [0u32; COMPRESSION_BLOCK_SIZE],
|
||||
len: 0
|
||||
}
|
||||
}
|
||||
|
||||
fn doc_ids(&self) -> &[DocId] {
|
||||
&self.doc_ids[..self.len]
|
||||
}
|
||||
|
||||
fn term_freqs(&self) -> &[u32] {
|
||||
&self.term_freqs[..self.len]
|
||||
}
|
||||
|
||||
fn clear(&mut self) {
|
||||
self.len = 0;
|
||||
}
|
||||
|
||||
fn append_doc(&mut self, doc: DocId, term_freq: u32) {
|
||||
let len = self.len;
|
||||
self.doc_ids[len] = doc;
|
||||
self.term_freqs[len] = term_freq;
|
||||
self.len = len + 1;
|
||||
}
|
||||
|
||||
fn is_full(&self) -> bool {
|
||||
self.len == COMPRESSION_BLOCK_SIZE
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.len == 0
|
||||
}
|
||||
|
||||
fn last_doc(&self) -> DocId {
|
||||
assert_eq!(self.len, COMPRESSION_BLOCK_SIZE);
|
||||
self.doc_ids[COMPRESSION_BLOCK_SIZE - 1]
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostingsSerializer<W: Write> {
|
||||
postings_write: CountingWriter<W>,
|
||||
last_doc_id_encoded: u32,
|
||||
|
||||
block_encoder: BlockEncoder,
|
||||
doc_ids: Vec<DocId>,
|
||||
term_freqs: Vec<u32>,
|
||||
block: Box<Block>,
|
||||
|
||||
termfreq_enabled: bool,
|
||||
}
|
||||
@@ -256,41 +303,41 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
postings_write: CountingWriter::wrap(write),
|
||||
|
||||
block_encoder: BlockEncoder::new(),
|
||||
doc_ids: vec![],
|
||||
term_freqs: vec![],
|
||||
block: Box::new(Block::new()),
|
||||
|
||||
last_doc_id_encoded: 0u32,
|
||||
termfreq_enabled,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
|
||||
self.doc_ids.push(doc_id);
|
||||
if self.termfreq_enabled {
|
||||
self.term_freqs.push(term_freq as u32);
|
||||
fn write_block(&mut self) -> io::Result<()> {
|
||||
{
|
||||
// encode the doc ids
|
||||
let block_encoded: &[u8] = self.block_encoder
|
||||
.compress_block_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
|
||||
self.last_doc_id_encoded = self.block.last_doc();
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
}
|
||||
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
|
||||
{
|
||||
// encode the doc ids
|
||||
let block_encoded: &[u8] = self.block_encoder
|
||||
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
}
|
||||
if self.termfreq_enabled {
|
||||
// encode the term_freqs
|
||||
let block_encoded: &[u8] =
|
||||
self.block_encoder.compress_block_unsorted(&self.term_freqs);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.term_freqs.clear();
|
||||
}
|
||||
self.doc_ids.clear();
|
||||
if self.termfreq_enabled {
|
||||
// encode the term_freqs
|
||||
let block_encoded: &[u8] =
|
||||
self.block_encoder.compress_block_unsorted(&self.block.term_freqs());
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
}
|
||||
self.block.clear();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
|
||||
self.block.append_doc(doc_id, term_freq);
|
||||
if self.block.is_full() {
|
||||
self.write_block()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn close_term(&mut self) -> io::Result<()> {
|
||||
if !self.doc_ids.is_empty() {
|
||||
if !self.block.is_empty() {
|
||||
// we have doc ids waiting to be written
|
||||
// this happens when the number of doc ids is
|
||||
// not a perfect multiple of our block size.
|
||||
@@ -299,17 +346,16 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
// using variable int encoding.
|
||||
{
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
.compress_vint_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.doc_ids.clear();
|
||||
}
|
||||
// ... Idem for term frequencies
|
||||
if self.termfreq_enabled {
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_unsorted(&self.term_freqs[..]);
|
||||
.compress_vint_unsorted(self.block.term_freqs());
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.term_freqs.clear();
|
||||
}
|
||||
self.block.clear();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -323,8 +369,7 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
}
|
||||
|
||||
fn clear(&mut self) {
|
||||
self.doc_ids.clear();
|
||||
self.term_freqs.clear();
|
||||
self.block.clear();
|
||||
self.last_doc_id_encoded = 0;
|
||||
}
|
||||
}
|
||||
|
||||
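The serializer hunks above replace the growable `doc_ids` / `term_freqs` vectors with a fixed-size `Block`. As a rough standalone illustration of that buffering pattern (the names `SketchSerializer`, `BLOCK` and `flushed` are made up here and are not tantivy APIs): a full block would be flushed through the bit-packed block encoder, and the partial tail is flushed with variable-int encoding when the term is closed.

const BLOCK: usize = 128; // stands in for COMPRESSION_BLOCK_SIZE

struct SketchSerializer {
    doc_ids: [u32; BLOCK],
    len: usize,
    flushed: Vec<Vec<u32>>, // stands in for the encoded output stream
}

impl SketchSerializer {
    fn new() -> SketchSerializer {
        SketchSerializer { doc_ids: [0u32; BLOCK], len: 0, flushed: Vec::new() }
    }

    fn write_doc(&mut self, doc: u32) {
        self.doc_ids[self.len] = doc;
        self.len += 1;
        if self.len == BLOCK {
            // a full block would go through the bit-packed encoder here
            self.flushed.push(self.doc_ids.to_vec());
            self.len = 0;
        }
    }

    fn close_term(&mut self) {
        if self.len > 0 {
            // the partial tail would be vint-encoded here
            self.flushed.push(self.doc_ids[..self.len].to_vec());
            self.len = 0;
        }
    }
}

fn main() {
    let mut serializer = SketchSerializer::new();
    for doc in 0u32..300 {
        serializer.write_doc(doc);
    }
    serializer.close_term();
    assert_eq!(serializer.flushed.len(), 3); // two full blocks + one partial block
}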
218 src/postings/stacker/expull.rs Normal file
@@ -0,0 +1,218 @@
use super::{Addr, MemoryArena};

use common::is_power_of_2;
use std::mem;

const MAX_BLOCK_LEN: u32 = 1u32 << 15;

const FIRST_BLOCK: u32 = 4u32;

#[inline]
pub fn jump_needed(len: u32) -> Option<usize> {
    match len {
        0...3 => None,
        4...MAX_BLOCK_LEN => {
            if is_power_of_2(len as usize) {
                Some(len as usize)
            } else {
                None
            }
        }
        n => {
            if n % MAX_BLOCK_LEN == 0 {
                Some(MAX_BLOCK_LEN as usize)
            } else {
                None
            }
        }
    }
}

/// An exponential unrolled link.
///
/// The use case is as follows. Tantivy's indexer conceptually acts like a
/// `HashMap<Term, Vec<u32>>`. As we come accross a given term in document
/// `D`, we lookup the term in the map and append the document id to its vector.
///
/// The vector is then only read when it is serialized.
///
/// The `ExpUnrolledLinkedList` offers a more efficient solution to this
/// problem.
///
/// It combines the idea of the unrolled linked list and tries to address the
/// problem of selecting an adequate block size using a strategy similar to
/// that of the `Vec` amortized resize strategy.
///
/// Data is stored in a linked list of blocks. The first block has a size of `4`
/// and each block has a length of twice that of the previous block up to
/// `MAX_BLOCK_LEN = 32768`.
///
/// This strategy is a good trade off to handle numerous very rare terms
/// and avoid wasting half of the memory for very frequent terms.
#[derive(Debug, Clone, Copy)]
pub struct ExpUnrolledLinkedList {
    len: u32,
    head: Addr,
    tail: Addr,
}

impl ExpUnrolledLinkedList {
    pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList {
        let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
        ExpUnrolledLinkedList {
            len: 0u32,
            head: addr,
            tail: addr,
        }
    }

    pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> {
        ExpUnrolledLinkedListIterator {
            heap,
            addr: self.head,
            len: self.len,
            consumed: 0,
        }
    }

    /// Appends a new element to the current stack.
    ///
    /// If the current block end is reached, a new block is allocated.
    pub fn push(&mut self, val: u32, heap: &mut MemoryArena) {
        self.len += 1;
        if let Some(new_block_len) = jump_needed(self.len) {
            // We need to allocate another block.
            // We also allocate an extra `u32` to store the pointer
            // to the future next block.
            let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
            let new_block_addr: Addr = heap.allocate_space(new_block_size);
            unsafe {
                // logic
                heap.write(self.tail, new_block_addr)
            };
            self.tail = new_block_addr;
        }
        unsafe {
            // logic
            heap.write(self.tail, val);
            self.tail = self.tail.offset(mem::size_of::<u32>() as u32);
        }
    }
}

pub struct ExpUnrolledLinkedListIterator<'a> {
    heap: &'a MemoryArena,
    addr: Addr,
    len: u32,
    consumed: u32,
}

impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
    type Item = u32;

    fn next(&mut self) -> Option<u32> {
        if self.consumed == self.len {
            None
        } else {
            self.consumed += 1;
            let addr: Addr = if jump_needed(self.consumed).is_some() {
                unsafe {
                    // logic
                    self.heap.read(self.addr)
                }
            } else {
                self.addr
            };
            self.addr = addr.offset(mem::size_of::<u32>() as u32);
            Some(unsafe {
                // logic
                self.heap.read(addr)
            })
        }
    }
}

#[cfg(test)]
mod tests {

    use super::super::MemoryArena;
    use super::jump_needed;
    use super::*;

    #[test]
    fn test_stack() {
        let mut heap = MemoryArena::new();
        let mut stack = ExpUnrolledLinkedList::new(&mut heap);
        stack.push(1u32, &mut heap);
        stack.push(2u32, &mut heap);
        stack.push(4u32, &mut heap);
        stack.push(8u32, &mut heap);
        {
            let mut it = stack.iter(&heap);
            assert_eq!(it.next().unwrap(), 1u32);
            assert_eq!(it.next().unwrap(), 2u32);
            assert_eq!(it.next().unwrap(), 4u32);
            assert_eq!(it.next().unwrap(), 8u32);
            assert!(it.next().is_none());
        }
    }

    #[test]
    fn test_jump_if_needed() {
        let mut block_len = 4u32;
        let mut i = 0;
        while i < 10_000_000 {
            assert!(jump_needed(i + block_len - 1).is_none());
            assert!(jump_needed(i + block_len + 1).is_none());
            assert!(jump_needed(i + block_len).is_some());
            let new_block_len = jump_needed(i + block_len).unwrap();
            i += block_len;
            block_len = new_block_len as u32;
        }
    }
}

#[cfg(all(test, feature = "unstable"))]
mod bench {
    use super::ExpUnrolledLinkedList;
    use tantivy_memory_arena::MemoryArena;
    use test::Bencher;

    const NUM_STACK: usize = 10_000;
    const STACK_SIZE: u32 = 1000;

    #[bench]
    fn bench_push_vec(bench: &mut Bencher) {
        bench.iter(|| {
            let mut vecs = Vec::with_capacity(100);
            for _ in 0..NUM_STACK {
                vecs.push(Vec::new());
            }
            for s in 0..NUM_STACK {
                for i in 0u32..STACK_SIZE {
                    let t = s * 392017 % NUM_STACK;
                    vecs[t].push(i);
                }
            }
        });
    }

    #[bench]
    fn bench_push_stack(bench: &mut Bencher) {
        let heap = MemoryArena::new();
        bench.iter(|| {
            let mut stacks = Vec::with_capacity(100);
            for _ in 0..NUM_STACK {
                let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
                stacks.push(stack);
            }
            for s in 0..NUM_STACK {
                for i in 0u32..STACK_SIZE {
                    let t = s * 392017 % NUM_STACK;
                    stacks[t].push(i, &heap);
                }
            }
            heap.clear();
        });
    }
}
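The block-growth policy documented in the `ExpUnrolledLinkedList` listing above (a first block of 4 `u32`s, doubling until `MAX_BLOCK_LEN = 32768`, then fixed-size 32768-value blocks) can be sanity-checked with a small standalone sketch; `block_sizes` below is a hypothetical helper written for illustration only, not tantivy code.

// Illustrative only: reproduces the growth schedule described in the
// ExpUnrolledLinkedList doc comment (4, 8, 16, ..., 32768, 32768, ...).
fn block_sizes(num_values: u32) -> Vec<u32> {
    let mut sizes = Vec::new();
    let mut remaining = num_values;
    let mut block = 4u32;
    while remaining > 0 {
        sizes.push(block);
        remaining = remaining.saturating_sub(block);
        block = (block * 2).min(1 << 15);
    }
    sizes
}

fn main() {
    // A list holding 100_000 values is spread over blocks of
    // 4, 8, 16, ..., 32768 and then fixed-size 32768 blocks.
    println!("{:?}", block_sizes(100_000));
}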
291 src/postings/stacker/memory_arena.rs Normal file
@@ -0,0 +1,291 @@
|
||||
//! 32-bits Memory arena for types implementing `Copy`.
|
||||
//! This Memory arena has been implemented to fit the use of tantivy's indexer
|
||||
//! and has *twisted specifications*.
|
||||
//!
|
||||
//! - It works on stable rust.
|
||||
//! - One can get an accurate figure of the memory usage of the arena.
|
||||
//! - Allocation are very cheap.
|
||||
//! - Allocation happening consecutively are very likely to have great locality.
|
||||
//! - Addresses (`Addr`) are 32bits.
|
||||
//! - Dropping the whole `MemoryArena` is cheap.
|
||||
//!
|
||||
//! # Limitations
|
||||
//!
|
||||
//! - Your object shall not implement `Drop`.
|
||||
//! - `Addr` to the `Arena` are 32-bits. The maximum capacity of the arena
|
||||
//! is 4GB. *(Tantivy's indexer uses one arena per indexing thread.)*
|
||||
//! - The arena only works for objects much smaller than `1MB`.
|
||||
//! Allocating more than `1MB` at a time will result in a panic,
|
||||
//! and allocating a lot of large object (> 500KB) will result in a fragmentation.
|
||||
//! - Your objects are store in an unaligned fashion. For this reason,
|
||||
//! the API does not let you access them as references.
|
||||
//!
|
||||
//! Instead, you store and access your data via `.write(...)` and `.read(...)`, which under the hood
|
||||
//! stores your object using `ptr::write_unaligned` and `ptr::read_unaligned`.
|
||||
use std::mem;
|
||||
use std::ptr;
|
||||
|
||||
const NUM_BITS_PAGE_ADDR: usize = 20;
|
||||
const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
|
||||
|
||||
/// Represents a pointer into the `MemoryArena`
|
||||
/// .
|
||||
/// Pointer are 32-bits and are split into
|
||||
/// two parts.
|
||||
///
|
||||
/// The first 12 bits represent the id of a
|
||||
/// page of memory.
|
||||
///
|
||||
/// The last 20 bits are an address within this page of memory.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct Addr(u32);
|
||||
|
||||
impl Addr {
|
||||
/// Creates a null pointer.
|
||||
pub fn null_pointer() -> Addr {
|
||||
Addr(u32::max_value())
|
||||
}
|
||||
|
||||
/// Returns the `Addr` object for `addr + offset`
|
||||
pub fn offset(&self, offset: u32) -> Addr {
|
||||
Addr(self.0.wrapping_add(offset))
|
||||
}
|
||||
|
||||
fn new(page_id: usize, local_addr: usize) -> Addr {
|
||||
Addr((page_id << NUM_BITS_PAGE_ADDR | local_addr) as u32)
|
||||
}
|
||||
|
||||
fn page_id(&self) -> usize {
|
||||
(self.0 as usize) >> NUM_BITS_PAGE_ADDR
|
||||
}
|
||||
|
||||
fn page_local_addr(&self) -> usize {
|
||||
(self.0 as usize) & (PAGE_SIZE - 1)
|
||||
}
|
||||
|
||||
/// Returns true if and only if the `Addr` is null.
|
||||
pub fn is_null(&self) -> bool {
|
||||
self.0 == u32::max_value()
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait required for an object to be `storable`.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// Most of the time you should not implement this trait,
|
||||
/// and only use the `MemoryArena` with object implementing `Copy`.
|
||||
///
|
||||
/// `ArenaStorable` is used in `tantivy` to force
|
||||
/// a `Copy` object and a `slice` of data to be stored contiguously.
|
||||
pub trait ArenaStorable {
|
||||
fn num_bytes(&self) -> usize;
|
||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr);
|
||||
}
|
||||
|
||||
impl<V> ArenaStorable for V
|
||||
where
|
||||
V: Copy,
|
||||
{
|
||||
fn num_bytes(&self) -> usize {
|
||||
mem::size_of::<V>()
|
||||
}
|
||||
|
||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
|
||||
let dst_ptr = arena.get_mut_ptr(addr) as *mut V;
|
||||
ptr::write_unaligned(dst_ptr, self);
|
||||
}
|
||||
}
|
||||
|
||||
/// The `MemoryArena`
|
||||
pub struct MemoryArena {
|
||||
pages: Vec<Page>,
|
||||
}
|
||||
|
||||
impl MemoryArena {
|
||||
/// Creates a new memory arena.
|
||||
pub fn new() -> MemoryArena {
|
||||
let first_page = Page::new(0);
|
||||
MemoryArena {
|
||||
pages: vec![first_page],
|
||||
}
|
||||
}
|
||||
|
||||
fn add_page(&mut self) -> &mut Page {
|
||||
let new_page_id = self.pages.len();
|
||||
self.pages.push(Page::new(new_page_id));
|
||||
&mut self.pages[new_page_id]
|
||||
}
|
||||
|
||||
/// Returns an estimate in number of bytes
|
||||
/// of resident memory consumed by the `MemoryArena`.
|
||||
///
|
||||
/// Internally, it counts a number of `1MB` pages
|
||||
/// and therefore delivers an upperbound.
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.pages.len() * PAGE_SIZE
|
||||
}
|
||||
|
||||
/// Writes a slice at the given address, assuming the
|
||||
/// memory was allocated beforehands.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic or corrupt the heap if he space was not
|
||||
/// properly allocated beforehands.
|
||||
pub fn write_bytes<B: AsRef<[u8]>>(&mut self, addr: Addr, data: B) {
|
||||
let bytes = data.as_ref();
|
||||
self.pages[addr.page_id()]
|
||||
.get_mut_slice(addr.page_local_addr(), bytes.len())
|
||||
.copy_from_slice(bytes);
|
||||
}
|
||||
|
||||
/// Returns the `len` bytes starting at `addr`
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the memory has not been allocated beforehands.
|
||||
pub fn read_slice(&self, addr: Addr, len: usize) -> &[u8] {
|
||||
self.pages[addr.page_id()].get_slice(addr.page_local_addr(), len)
|
||||
}
|
||||
|
||||
unsafe fn get_mut_ptr(&mut self, addr: Addr) -> *mut u8 {
|
||||
self.pages[addr.page_id()].get_mut_ptr(addr.page_local_addr())
|
||||
}
|
||||
|
||||
/// Stores an item's data in the heap
|
||||
///
|
||||
/// It allocates the `Item` beforehands.
|
||||
pub fn store<Item: ArenaStorable>(&mut self, val: Item) -> Addr {
|
||||
let num_bytes = val.num_bytes();
|
||||
let addr = self.allocate_space(num_bytes);
|
||||
unsafe {
|
||||
self.write(addr, val);
|
||||
};
|
||||
addr
|
||||
}
|
||||
|
||||
pub unsafe fn write<Item: ArenaStorable>(&mut self, addr: Addr, val: Item) {
|
||||
val.write_into(self, addr)
|
||||
}
|
||||
|
||||
/// Read an item in the heap at the given `address`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If the address is erroneous
|
||||
pub unsafe fn read<Item: Copy>(&self, addr: Addr) -> Item {
|
||||
let ptr = self.pages[addr.page_id()].get_ptr(addr.page_local_addr());
|
||||
ptr::read_unaligned(ptr as *const Item)
|
||||
}
|
||||
|
||||
/// Allocates `len` bytes and returns the allocated address.
|
||||
pub fn allocate_space(&mut self, len: usize) -> Addr {
|
||||
let page_id = self.pages.len() - 1;
|
||||
if let Some(addr) = self.pages[page_id].allocate_space(len) {
|
||||
return addr;
|
||||
}
|
||||
self.add_page().allocate_space(len).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
struct Page {
|
||||
page_id: usize,
|
||||
len: usize,
|
||||
data: Box<[u8]>,
|
||||
}
|
||||
|
||||
impl Page {
|
||||
fn new(page_id: usize) -> Page {
|
||||
let mut data: Vec<u8> = Vec::with_capacity(PAGE_SIZE);
|
||||
unsafe {
|
||||
data.set_len(PAGE_SIZE);
|
||||
} // avoid initializing page
|
||||
Page {
|
||||
page_id,
|
||||
len: 0,
|
||||
data: data.into_boxed_slice(),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_available(&self, len: usize) -> bool {
|
||||
len + self.len <= PAGE_SIZE
|
||||
}
|
||||
|
||||
fn get_mut_slice(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
|
||||
&mut self.data[local_addr..][..len]
|
||||
}
|
||||
|
||||
fn get_slice(&self, local_addr: usize, len: usize) -> &[u8] {
|
||||
&self.data[local_addr..][..len]
|
||||
}
|
||||
|
||||
fn allocate_space(&mut self, len: usize) -> Option<Addr> {
|
||||
if self.is_available(len) {
|
||||
let addr = Addr::new(self.page_id, self.len);
|
||||
self.len += len;
|
||||
Some(addr)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
|
||||
self.data.as_ptr().offset(addr as isize)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
|
||||
self.data.as_mut_ptr().offset(addr as isize)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::MemoryArena;
|
||||
|
||||
#[test]
|
||||
fn test_arena_allocate_slice() {
|
||||
let mut arena = MemoryArena::new();
|
||||
let a = b"hello";
|
||||
let b = b"happy tax payer";
|
||||
|
||||
let addr_a = arena.allocate_space(a.len());
|
||||
arena.write_bytes(addr_a, a);
|
||||
|
||||
let addr_b = arena.allocate_space(b.len());
|
||||
arena.write_bytes(addr_b, b);
|
||||
|
||||
assert_eq!(arena.read_slice(addr_a, a.len()), a);
|
||||
assert_eq!(arena.read_slice(addr_b, b.len()), b);
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
struct MyTest {
|
||||
pub a: usize,
|
||||
pub b: u8,
|
||||
pub c: u32,
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_store_object() {
|
||||
let mut arena = MemoryArena::new();
|
||||
let a = MyTest {
|
||||
a: 143,
|
||||
b: 21,
|
||||
c: 32,
|
||||
};
|
||||
let b = MyTest {
|
||||
a: 113,
|
||||
b: 221,
|
||||
c: 12,
|
||||
};
|
||||
let addr_a = arena.store(a);
|
||||
let addr_b = arena.store(b);
|
||||
assert_eq!(unsafe { arena.read::<MyTest>(addr_a) }, a);
|
||||
assert_eq!(unsafe { arena.read::<MyTest>(addr_b) }, b);
|
||||
}
|
||||
}
|
||||
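The `MemoryArena` listing above packs a page id and a page-local offset into a single 32-bit `Addr` (upper 12 bits for the page, lower 20 bits for the offset, with 1 MB pages). A minimal standalone sketch of that address arithmetic, mirroring the constants in the listing but not using tantivy's own types:

// Illustrative only: the 32-bit address split described in memory_arena.rs.
const NUM_BITS_PAGE_ADDR: u32 = 20;
const PAGE_SIZE: u32 = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large

fn split(addr: u32) -> (u32, u32) {
    // (page id, offset within the page)
    (addr >> NUM_BITS_PAGE_ADDR, addr & (PAGE_SIZE - 1))
}

fn main() {
    // page 3, byte 1_234 within that page
    let addr = (3u32 << NUM_BITS_PAGE_ADDR) | 1_234;
    assert_eq!(split(addr), (3, 1_234));
}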
9 src/postings/stacker/mod.rs Normal file
@@ -0,0 +1,9 @@
mod expull;
mod memory_arena;
mod murmurhash2;
mod term_hashmap;

pub use self::expull::ExpUnrolledLinkedList;
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
use self::murmurhash2::murmurhash2;
pub use self::term_hashmap::{compute_table_size, TermHashMap};
86 src/postings/stacker/murmurhash2.rs Normal file
@@ -0,0 +1,86 @@
use std::ptr;
const SEED: u32 = 3_242_157_231u32;
const M: u32 = 0x5bd1_e995;

#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
    let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
    let len = key.len() as u32;
    let mut h: u32 = SEED ^ len;

    let num_blocks = len >> 2;
    for _ in 0..num_blocks {
        let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
        k = k.wrapping_mul(M);
        k ^= k >> 24;
        k = k.wrapping_mul(M);
        h = h.wrapping_mul(M);
        h ^= k;
        key_ptr = key_ptr.wrapping_offset(1);
    }

    // Handle the last few bytes of the input array
    let remaining: &[u8] = &key[key.len() & !3..];
    match remaining.len() {
        3 => {
            h ^= u32::from(remaining[2]) << 16;
            h ^= u32::from(remaining[1]) << 8;
            h ^= u32::from(remaining[0]);
            h = h.wrapping_mul(M);
        }
        2 => {
            h ^= u32::from(remaining[1]) << 8;
            h ^= u32::from(remaining[0]);
            h = h.wrapping_mul(M);
        }
        1 => {
            h ^= u32::from(remaining[0]);
            h = h.wrapping_mul(M);
        }
        _ => {}
    }
    h ^= h >> 13;
    h = h.wrapping_mul(M);
    h ^ (h >> 15)
}

#[cfg(test)]
mod test {

    use super::murmurhash2;
    use std::collections::HashSet;

    #[test]
    fn test_murmur() {
        let s1 = "abcdef";
        let s2 = "abcdeg";
        for i in 0..5 {
            assert_eq!(
                murmurhash2(&s1[i..5].as_bytes()),
                murmurhash2(&s2[i..5].as_bytes())
            );
        }
    }

    #[test]
    fn test_murmur_against_reference_impl() {
        assert_eq!(murmurhash2("".as_bytes()), 3632506080);
        assert_eq!(murmurhash2("a".as_bytes()), 455683869);
        assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
        assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
        assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
        assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
        assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
    }

    #[test]
    fn test_murmur_collisions() {
        let mut set: HashSet<u32> = HashSet::default();
        for i in 0..10_000 {
            let s = format!("hash{}", i);
            let hash = murmurhash2(s.as_bytes());
            set.insert(hash);
        }
        assert_eq!(set.len(), 10_000);
    }
}
296 src/postings/stacker/term_hashmap.rs Normal file
@@ -0,0 +1,296 @@
|
||||
use super::murmurhash2;
|
||||
use super::{Addr, ArenaStorable, MemoryArena};
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
use std::slice;
|
||||
|
||||
pub type BucketId = usize;
|
||||
|
||||
struct KeyBytesValue<'a, V> {
|
||||
key: &'a [u8],
|
||||
value: V,
|
||||
}
|
||||
|
||||
impl<'a, V> KeyBytesValue<'a, V> {
|
||||
fn new(key: &'a [u8], value: V) -> KeyBytesValue<'a, V> {
|
||||
KeyBytesValue { key, value }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V> ArenaStorable for KeyBytesValue<'a, V>
|
||||
where
|
||||
V: ArenaStorable,
|
||||
{
|
||||
fn num_bytes(&self) -> usize {
|
||||
0u16.num_bytes() + self.key.len() + self.value.num_bytes()
|
||||
}
|
||||
|
||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
|
||||
arena.write(addr, self.key.len() as u16);
|
||||
arena.write_bytes(addr.offset(2), self.key);
|
||||
arena.write(addr.offset(2 + self.key.len() as u32), self.value);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the actual memory size in bytes
|
||||
/// required to create a table of size $2^num_bits$.
|
||||
pub fn compute_table_size(num_bits: usize) -> usize {
|
||||
(1 << num_bits) * mem::size_of::<KeyValue>()
|
||||
}
|
||||
|
||||
/// `KeyValue` is the item stored in the hash table.
|
||||
/// The key is actually a `BytesRef` object stored in an external heap.
|
||||
/// The `value_addr` also points to an address in the heap.
|
||||
///
|
||||
/// The key and the value are actually stored contiguously.
|
||||
/// For this reason, the (start, stop) information is actually redundant
|
||||
/// and can be simplified in the future
|
||||
#[derive(Copy, Clone)]
|
||||
struct KeyValue {
|
||||
key_value_addr: Addr,
|
||||
hash: u32,
|
||||
}
|
||||
|
||||
impl Default for KeyValue {
|
||||
fn default() -> Self {
|
||||
KeyValue {
|
||||
key_value_addr: Addr::null_pointer(),
|
||||
hash: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl KeyValue {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.key_value_addr.is_null()
|
||||
}
|
||||
}
|
||||
|
||||
/// Customized `HashMap` with string keys
|
||||
///
|
||||
/// This `HashMap` takes String as keys. Keys are
|
||||
/// stored in a user defined heap.
|
||||
///
|
||||
/// The quirky API has the benefit of avoiding
|
||||
/// the computation of the hash of the key twice,
|
||||
/// or copying the key as long as there is no insert.
|
||||
///
|
||||
pub struct TermHashMap {
|
||||
table: Box<[KeyValue]>,
|
||||
pub heap: MemoryArena,
|
||||
mask: usize,
|
||||
occupied: Vec<usize>,
|
||||
}
|
||||
|
||||
struct QuadraticProbing {
|
||||
hash: usize,
|
||||
i: usize,
|
||||
mask: usize,
|
||||
}
|
||||
|
||||
impl QuadraticProbing {
|
||||
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
|
||||
QuadraticProbing { hash, i: 0, mask }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next_probe(&mut self) -> usize {
|
||||
self.i += 1;
|
||||
(self.hash + self.i) & self.mask
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Iter<'a> {
|
||||
hashmap: &'a TermHashMap,
|
||||
inner: slice::Iter<'a, usize>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Iter<'a> {
|
||||
type Item = (&'a [u8], Addr, BucketId);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.inner.next().cloned().map(move |bucket: usize| {
|
||||
let kv = self.hashmap.table[bucket];
|
||||
let (key, offset): (&'a [u8], Addr) =
|
||||
unsafe { self.hashmap.get_key_value(kv.key_value_addr) };
|
||||
(key, offset, bucket as BucketId)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TermHashMap {
|
||||
pub fn new(num_bucket_power_of_2: usize) -> TermHashMap {
|
||||
let heap = MemoryArena::new();
|
||||
let table_size = 1 << num_bucket_power_of_2;
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
|
||||
TermHashMap {
|
||||
table: table.into_boxed_slice(),
|
||||
heap,
|
||||
mask: table_size - 1,
|
||||
occupied: Vec::with_capacity(table_size / 2),
|
||||
}
|
||||
}
|
||||
|
||||
fn probe(&self, hash: u32) -> QuadraticProbing {
|
||||
QuadraticProbing::compute(hash as usize, self.mask)
|
||||
}
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.table.len() * mem::size_of::<KeyValue>()
|
||||
}
|
||||
|
||||
fn is_saturated(&self) -> bool {
|
||||
self.table.len() < self.occupied.len() * 3
|
||||
}
|
||||
|
||||
unsafe fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
|
||||
let key_bytes_len = self.heap.read::<u16>(addr) as usize;
|
||||
let key_addr = addr.offset(2u32);
|
||||
let key_bytes: &[u8] = self.heap.read_slice(key_addr, key_bytes_len);
|
||||
let val_addr: Addr = key_addr.offset(key_bytes.len() as u32);
|
||||
(key_bytes, val_addr)
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
|
||||
self.occupied.push(bucket);
|
||||
self.table[bucket] = KeyValue {
|
||||
key_value_addr,
|
||||
hash,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Iter {
|
||||
Iter {
|
||||
inner: self.occupied.iter(),
|
||||
hashmap: &self,
|
||||
}
|
||||
}
|
||||
|
||||
fn resize(&mut self) {
|
||||
let new_len = self.table.len() * 2;
|
||||
let mask = new_len - 1;
|
||||
self.mask = mask;
|
||||
let new_table = vec![KeyValue::default(); new_len].into_boxed_slice();
|
||||
let old_table = mem::replace(&mut self.table, new_table);
|
||||
for old_pos in self.occupied.iter_mut() {
|
||||
let key_value: KeyValue = old_table[*old_pos];
|
||||
let mut probe = QuadraticProbing::compute(key_value.hash as usize, mask);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
if self.table[bucket].is_empty() {
|
||||
*old_pos = bucket;
|
||||
self.table[bucket] = key_value;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// `update` create a new entry for a given key if it does not exists
|
||||
/// or updates the existing entry.
|
||||
///
|
||||
/// The actual logic for this update is define in the the `updater`
|
||||
/// argument.
|
||||
///
|
||||
/// If the key is not present, `updater` will receive `None` and
|
||||
/// will be in charge of returning a default value.
|
||||
/// If the key already as an associated value, then it will be passed
|
||||
/// `Some(previous_value)`.
|
||||
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
|
||||
where
|
||||
S: AsRef<[u8]>,
|
||||
V: Copy,
|
||||
TMutator: FnMut(Option<V>) -> V,
|
||||
{
|
||||
if self.is_saturated() {
|
||||
self.resize();
|
||||
}
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let hash = murmurhash2::murmurhash2(key.as_ref());
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
let val = updater(None);
|
||||
let key_addr = self.heap.store(KeyBytesValue::new(key_bytes, val));
|
||||
self.set_bucket(hash, key_addr, bucket);
|
||||
return bucket as BucketId;
|
||||
} else if kv.hash == hash {
|
||||
let (key_matches, val_addr) = {
|
||||
let (stored_key, val_addr): (&[u8], Addr) =
|
||||
unsafe { self.get_key_value(kv.key_value_addr) };
|
||||
(stored_key == key_bytes, val_addr)
|
||||
};
|
||||
if key_matches {
|
||||
unsafe {
|
||||
// logic
|
||||
let v = self.heap.read(val_addr);
|
||||
let new_v = updater(Some(v));
|
||||
self.heap.write(val_addr, new_v);
|
||||
};
|
||||
return bucket as BucketId;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn bench_murmurhash2(b: &mut Bencher) {
|
||||
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
||||
b.iter(|| {
|
||||
let mut s = 0;
|
||||
for &key in &keys {
|
||||
s ^= murmurhash2(key.as_bytes());
|
||||
}
|
||||
s
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::TermHashMap;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[test]
|
||||
fn test_hash_map() {
|
||||
let mut hash_map: TermHashMap = TermHashMap::new(18);
|
||||
{
|
||||
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
3u32
|
||||
});
|
||||
}
|
||||
{
|
||||
hash_map.mutate_or_create("abcd", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
4u32
|
||||
});
|
||||
}
|
||||
{
|
||||
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, Some(3u32));
|
||||
5u32
|
||||
});
|
||||
}
|
||||
|
||||
let mut vanilla_hash_map = HashMap::new();
|
||||
let mut iter_values = hash_map.iter();
|
||||
while let Some((key, addr, _)) = iter_values.next() {
|
||||
let val: u32 = unsafe {
|
||||
// test
|
||||
hash_map.heap.read(addr)
|
||||
};
|
||||
vanilla_hash_map.insert(key.to_owned(), val);
|
||||
}
|
||||
assert_eq!(vanilla_hash_map.len(), 2);
|
||||
}
|
||||
}
|
||||
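The `mutate_or_create` contract described in the `TermHashMap` listing above (the updater closure receives `None` the first time a key is seen and `Some(previous_value)` afterwards) can be modeled with std's `HashMap` in a standalone sketch; in tantivy the value instead lives unaligned in the arena and is rewritten in place.

// Illustrative only: the mutate_or_create calling convention, modeled on
// std::collections::HashMap rather than tantivy's arena-backed table.
use std::collections::HashMap;

fn mutate_or_create<F>(map: &mut HashMap<Vec<u8>, u32>, key: &[u8], mut updater: F)
where
    F: FnMut(Option<u32>) -> u32,
{
    let new_val = updater(map.get(key).copied());
    map.insert(key.to_vec(), new_val);
}

fn main() {
    let mut counts = HashMap::new();
    // first sighting of "abc": the updater receives None
    mutate_or_create(&mut counts, b"abc", |prev| prev.unwrap_or(0) + 1);
    // second sighting: the updater receives Some(1)
    mutate_or_create(&mut counts, b"abc", |prev| prev.unwrap_or(0) + 1);
    assert_eq!(counts[&b"abc".to_vec()], 2);
}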
59 src/query/automaton_weight.rs Normal file
@@ -0,0 +1,59 @@
use common::BitSet;
use core::SegmentReader;
use fst::Automaton;
use query::BitSetDocSet;
use query::ConstScorer;
use query::{Scorer, Weight};
use schema::{Field, IndexRecordOption};
use termdict::{TermDictionary, TermStreamer};
use Result;

/// A weight struct for Fuzzy Term and Regex Queries
pub struct AutomatonWeight<A>
where
    A: Automaton,
{
    field: Field,
    automaton: A,
}

impl<A> AutomatonWeight<A>
where
    A: Automaton,
{
    /// Create a new AutomationWeight
    pub fn new(field: Field, automaton: A) -> AutomatonWeight<A> {
        AutomatonWeight { field, automaton }
    }

    fn automaton_stream<'a>(&'a self, term_dict: &'a TermDictionary) -> TermStreamer<'a, &'a A> {
        let term_stream_builder = term_dict.search(&self.automaton);
        term_stream_builder.into_stream()
    }
}

impl<A> Weight for AutomatonWeight<A>
where
    A: Automaton,
{
    fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
        let max_doc = reader.max_doc();
        let mut doc_bitset = BitSet::with_max_value(max_doc);

        let inverted_index = reader.inverted_index(self.field);
        let term_dict = inverted_index.terms();
        let mut term_stream = self.automaton_stream(term_dict);
        while term_stream.advance() {
            let term_info = term_stream.value();
            let mut block_segment_postings = inverted_index
                .read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
            while block_segment_postings.advance() {
                for &doc in block_segment_postings.docs() {
                    doc_bitset.insert(doc);
                }
            }
        }
        let doc_bitset = BitSetDocSet::from(doc_bitset);
        Ok(Box::new(ConstScorer::new(doc_bitset)))
    }
}
162 src/query/fuzzy_query.rs Normal file
@@ -0,0 +1,162 @@
|
||||
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
|
||||
use query::{AutomatonWeight, Query, Weight};
|
||||
use schema::Term;
|
||||
use std::collections::HashMap;
|
||||
use Result;
|
||||
use Searcher;
|
||||
|
||||
lazy_static! {
|
||||
static ref LEV_BUILDER: HashMap<(u8, bool), LevenshteinAutomatonBuilder> = {
|
||||
let mut lev_builder_cache = HashMap::new();
|
||||
// TODO make population lazy on a `(distance, val)` basis
|
||||
for distance in 0..3 {
|
||||
for &transposition in [false, true].iter() {
|
||||
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(distance, transposition);
|
||||
lev_builder_cache.insert((distance, transposition), lev_automaton_builder);
|
||||
}
|
||||
}
|
||||
lev_builder_cache
|
||||
};
|
||||
}
|
||||
|
||||
/// A Fuzzy Query matches all of the documents
|
||||
/// containing a specific term that is within
|
||||
/// Levenshtein distance
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result, Term};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, chain};
|
||||
/// use tantivy::query::FuzzyTermQuery;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
/// let term = Term::from_field_text(title, "Diary");
|
||||
/// let query = FuzzyTermQuery::new(term, 1, true);
|
||||
/// searcher.search(&query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FuzzyTermQuery {
|
||||
/// What term are we searching
|
||||
term: Term,
|
||||
/// How many changes are we going to allow
|
||||
distance: u8,
|
||||
/// Should a transposition cost 1 or 2?
|
||||
transposition_cost_one: bool,
|
||||
///
|
||||
prefix: bool,
|
||||
}

impl FuzzyTermQuery {
    /// Creates a new Fuzzy Query
    pub fn new(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
        FuzzyTermQuery {
            term,
            distance,
            transposition_cost_one,
            prefix: false,
        }
    }

    /// Creates a new Fuzzy Query that matches the given term as a prefix
    pub fn new_prefix(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
        FuzzyTermQuery {
            term,
            distance,
            transposition_cost_one,
            prefix: true,
        }
    }

    fn specialized_weight(&self) -> Result<AutomatonWeight<DFA>> {
        // pick the automaton builder that matches the configured transposition cost
        let automaton = LEV_BUILDER.get(&(self.distance, self.transposition_cost_one))
            .unwrap() // TODO return an error
            .build_dfa(self.term.text());
        Ok(AutomatonWeight::new(self.term.field(), automaton))
    }
}

impl Query for FuzzyTermQuery {
    fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<Weight>> {
        Ok(Box::new(self.specialized_weight()?))
    }
}

#[cfg(test)]
mod test {
    use super::FuzzyTermQuery;
    use collector::TopCollector;
    use schema::SchemaBuilder;
    use schema::TEXT;
    use tests::assert_nearly_equals;
    use Index;
    use Term;

    #[test]
    pub fn test_fuzzy_term() {
        let mut schema_builder = SchemaBuilder::new();
        let country_field = schema_builder.add_text_field("country", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        {
            let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
            index_writer.add_document(doc!(
                country_field => "japan",
            ));
            index_writer.add_document(doc!(
                country_field => "korea",
            ));
            index_writer.commit().unwrap();
        }
        index.load_searchers().unwrap();
        let searcher = index.searcher();
        {
            let mut collector = TopCollector::with_limit(2);
            let term = Term::from_field_text(country_field, "japon");

            let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
            searcher.search(&fuzzy_query, &mut collector).unwrap();
            let scored_docs = collector.score_docs();
            assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
            let (score, _) = scored_docs[0];
            assert_nearly_equals(1f32, score);
        }
    }
}
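The `prefix` flag set by `new_prefix` above points at a prefix variant of the fuzzy query. A hedged usage sketch, reusing the index from the test above (the exact matching semantics of the prefix variant are an assumption, not spelled out in this diff):

    // Hypothetical: look for terms starting within edit distance 1 of "jap".
    let term = Term::from_field_text(country_field, "jap");
    let fuzzy_prefix_query = FuzzyTermQuery::new_prefix(term, 1, true);
    searcher.search(&fuzzy_prefix_query, &mut collector).unwrap();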

@@ -228,7 +228,8 @@ where
    TOtherScorer: Scorer,
{
    fn score(&mut self) -> Score {
        self.left.score() + self.right.score()
        self.left.score()
            + self.right.score()
            + self.others.iter_mut().map(Scorer::score).sum::<Score>()
    }
}

@@ -3,16 +3,19 @@ Query
*/

mod all_query;
mod automaton_weight;
mod bitset;
mod bm25;
mod boolean_query;
mod exclude;
mod fuzzy_query;
mod intersection;
mod occur;
mod phrase_query;
mod query;
mod query_parser;
mod range_query;
mod regex_query;
mod reqopt_scorer;
mod scorer;
mod term_query;
@@ -31,9 +34,11 @@ pub use self::union::Union;
pub use self::vec_docset::VecDocSet;

pub use self::all_query::{AllQuery, AllScorer, AllWeight};
pub use self::automaton_weight::AutomatonWeight;
pub use self::bitset::BitSetDocSet;
pub use self::boolean_query::BooleanQuery;
pub use self::exclude::Exclude;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::intersect_scorers;
pub use self::occur::Occur;
pub use self::phrase_query::PhraseQuery;
@@ -41,6 +46,7 @@ pub use self::query::Query;
pub use self::query_parser::QueryParser;
pub use self::query_parser::QueryParserError;
pub use self::range_query::RangeQuery;
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::scorer::ConstScorer;
pub use self::scorer::EmptyScorer;

@@ -1,8 +1,10 @@
use super::Weight;
use collector::Collector;
use core::searcher::Searcher;
use downcast;
use std::fmt;
use Result;
use SegmentLocalId;

/// The `Query` trait defines a set of documents and a scoring method
/// for those documents.
@@ -55,6 +57,26 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug {
        }
        Ok(result)
    }

    /// Search works as follows:
    ///
    /// First, the weight object associated with the query is created.
    ///
    /// Then, the query loops over the segments and, for each segment:
    /// - sets up the collector and informs it that the segment being processed has changed.
    /// - creates a `Scorer` object associated with this segment
    /// - iterates through the matched documents and pushes them to the collector.
    ///
    fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result<()> {
        let scoring_enabled = collector.requires_scoring();
        let weight = self.weight(searcher, scoring_enabled)?;
        for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
            collector.set_segment(segment_ord as SegmentLocalId, segment_reader)?;
            let mut scorer = weight.scorer(segment_reader)?;
            scorer.collect(collector, segment_reader.delete_bitset());
        }
        Ok(())
    }
}
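From the caller's side, the loop documented above is hidden behind a single call. A minimal sketch, assuming an `index` and a `query` built as in the doc examples elsewhere in this diff:

    // `Query::search` creates the weight, then scores and collects segment by segment.
    let searcher = index.searcher();
    let mut count_collector = CountCollector::default();
    query.search(&*searcher, &mut count_collector)?;
    let num_hits = count_collector.count();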

pub trait QueryClone {

@@ -1,11 +1,21 @@
use query::Occur;
use schema::Field;
use schema::Term;
use schema::Type;
use std::fmt;
use std::ops::Bound;

#[derive(Clone)]
pub enum LogicalLiteral {
    Term(Term),
    Phrase(Vec<Term>),
    Range {
        field: Field,
        value_type: Type,
        lower: Bound<Term>,
        upper: Bound<Term>,
    },
    All,
}

#[derive(Clone)]
@@ -54,6 +64,12 @@ impl fmt::Debug for LogicalLiteral {
        match *self {
            LogicalLiteral::Term(ref term) => write!(formatter, "{:?}", term),
            LogicalLiteral::Phrase(ref terms) => write!(formatter, "\"{:?}\"", terms),
            LogicalLiteral::Range {
                ref lower,
                ref upper,
                ..
            } => write!(formatter, "({:?} TO {:?})", lower, upper),
            LogicalLiteral::All => write!(formatter, "*"),
        }
    }
}
|
||||
|
||||
@@ -1,29 +1,37 @@
|
||||
use super::user_input_ast::*;
|
||||
use combine::char::*;
|
||||
use combine::*;
|
||||
use query::query_parser::user_input_ast::UserInputBound;
|
||||
|
||||
fn field<I: Stream<Item = char>>() -> impl Parser<Input = I, Output = String> {
|
||||
(
|
||||
letter(),
|
||||
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
|
||||
).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
}
|
||||
|
||||
fn word<I: Stream<Item = char>>() -> impl Parser<Input = I, Output = String> {
|
||||
many1(satisfy(|c: char| c.is_alphanumeric()))
|
||||
}
|
||||
|
||||
fn negative_number<I: Stream<Item = char>>() -> impl Parser<Input = I, Output = String> {
|
||||
(char('-'), many1(satisfy(|c: char| c.is_numeric())))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
}
|
||||
|
||||
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where
|
||||
I: Stream<Item = char>,
|
||||
{
|
||||
let term_val = || {
|
||||
let word = many1(satisfy(|c: char| c.is_alphanumeric()));
|
||||
let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
|
||||
phrase.or(word)
|
||||
phrase.or(word())
|
||||
};
|
||||
|
||||
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric())))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
|
||||
|
||||
let field = (
|
||||
letter(),
|
||||
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
|
||||
).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
|
||||
|
||||
let term_val_with_field = negative_numbers.or(term_val());
|
||||
let term_val_with_field = negative_number().or(term_val());
|
||||
|
||||
let term_query =
|
||||
(field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral {
|
||||
(field(), char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase,
|
||||
});
|
||||
@@ -37,6 +45,36 @@ where
|
||||
.parse_stream(input)
|
||||
}
|
||||
|
||||
fn range<I: Stream<Item = char>>(input: I) -> ParseResult<UserInputAST, I> {
|
||||
let term_val = || {
|
||||
word().or(negative_number()).or(char('*').map(|_| "*".to_string()))
|
||||
};
|
||||
let lower_bound = {
|
||||
let excl = (char('{'), term_val()).map(|(_, w)| UserInputBound::Exclusive(w));
|
||||
let incl = (char('['), term_val()).map(|(_, w)| UserInputBound::Inclusive(w));
|
||||
excl.or(incl)
|
||||
};
|
||||
let upper_bound = {
|
||||
let excl = (term_val(), char('}')).map(|(w, _)| UserInputBound::Exclusive(w));
|
||||
let incl = (term_val(), char(']')).map(|(w, _)| UserInputBound::Inclusive(w));
|
||||
// TODO: this backtracking should be unnecessary
|
||||
try(excl).or(incl)
|
||||
};
|
||||
(
|
||||
optional((field(), char(':')).map(|x| x.0)),
|
||||
lower_bound,
|
||||
spaces(),
|
||||
string("TO"),
|
||||
spaces(),
|
||||
upper_bound,
|
||||
).map(|(field, lower, _, _, _, upper)| UserInputAST::Range {
|
||||
field,
|
||||
lower,
|
||||
upper,
|
||||
})
|
||||
.parse_stream(input)
|
||||
}
|
||||
|
||||
fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where
|
||||
I: Stream<Item = char>,
|
||||
@@ -45,6 +83,8 @@ where
|
||||
.map(|(_, expr)| UserInputAST::Not(Box::new(expr)))
|
||||
.or((char('+'), parser(leaf)).map(|(_, expr)| UserInputAST::Must(Box::new(expr))))
|
||||
.or((char('('), parser(parse_to_ast), char(')')).map(|(_, expr, _)| expr))
|
||||
.or(char('*').map(|_| UserInputAST::All))
|
||||
.or(try(parser(range)))
|
||||
.or(parser(literal))
|
||||
.parse_stream(input)
|
||||
}
|
||||
@@ -91,6 +131,12 @@ mod test {
|
||||
test_parse_query_to_ast_helper("-abc:toto", "-(abc:\"toto\")");
|
||||
test_parse_query_to_ast_helper("abc:a b", "(abc:\"a\" \"b\")");
|
||||
test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]");
|
||||
test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
|
||||
test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
|
||||
test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:[\"*\" TO \"toto\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}");
|
||||
test_is_parse_err("abc + ");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,15 +2,19 @@ use super::logical_ast::*;
|
||||
use super::query_grammar::parse_to_ast;
|
||||
use super::user_input_ast::*;
|
||||
use core::Index;
|
||||
use query::AllQuery;
|
||||
use query::BooleanQuery;
|
||||
use query::Occur;
|
||||
use query::PhraseQuery;
|
||||
use query::Query;
|
||||
use query::RangeQuery;
|
||||
use query::TermQuery;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Field, Schema};
|
||||
use schema::{FieldType, Term};
|
||||
use std::borrow::Cow;
|
||||
use std::num::ParseIntError;
|
||||
use std::ops::Bound;
|
||||
use std::str::FromStr;
|
||||
use tokenizer::TokenizerManager;
|
||||
|
||||
@@ -39,6 +43,9 @@ pub enum QueryParserError {
    /// The tokenizer for the given field is unknown
    /// The two argument strings are the name of the field, the name of the tokenizer
    UnknownTokenizer(String, String),
    /// The query contains a range query with a phrase as one of the bounds.
    /// Only terms can be used as bounds.
    RangeMustNotHavePhrase,
}

impl From<ParseIntError> for QueryParserError {
@@ -66,8 +73,8 @@ impl From<ParseIntError> for QueryParserError {
/// by relevance: The user typically just scans through the first few
/// documents in order of decreasing relevance and will stop when the documents
/// are not relevant anymore.
/// Making it possible to make this behavior customizable is tracked in
/// [issue #27](https://github.com/fulmicoton/tantivy/issues/27).
///
/// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`.
///
/// * negative terms: By prepending a term with a `-`, a term can be excluded
///   from the search. This is useful for disambiguating a query.
@@ -75,6 +82,17 @@ impl From<ParseIntError> for QueryParserError {
///
/// * must terms: By prepending a term with a `+`, a term can be made required for the search.
///
/// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed.
///   e.g., `title:"Barack Obama"` will only find documents that have "barack" immediately followed
///   by "obama".
///
/// * range terms: Range searches can be done by specifying the start and end bound. These can be
///   inclusive or exclusive. e.g., `title:[a TO c}` will find all documents whose title contains
///   a word lexicographically between `a` and `c` (inclusive lower bound, exclusive upper bound).
///   Inclusive bounds are `[]`, exclusive are `{}`.
///
/// * all docs query: A plain `*` will match all documents in the index.
///
pub struct QueryParser {
    schema: Schema,
    default_fields: Vec<Field>,
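A hedged sketch of the syntax bullets above in use; the `for_index` constructor, the `title`/`body` fields, and the query string are illustrative assumptions, not part of this diff:

    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    // With a mutable parser, `.set_conjunction_by_default()` switches the default from OR to AND.
    let query = query_parser.parse_query("+title:\"barack obama\" -body:spam year:[2000 TO 2010}")?;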
@@ -155,11 +173,12 @@ impl QueryParser {
|
||||
}
|
||||
Ok(ast)
|
||||
}
|
||||
fn compute_logical_ast_for_leaf(
|
||||
|
||||
fn compute_terms_for_string(
|
||||
&self,
|
||||
field: Field,
|
||||
phrase: &str,
|
||||
) -> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
) -> Result<Vec<Term>, QueryParserError> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
if !field_type.is_indexed() {
|
||||
@@ -170,12 +189,12 @@ impl QueryParser {
|
||||
FieldType::I64(_) => {
|
||||
let val: i64 = i64::from_str(phrase)?;
|
||||
let term = Term::from_field_i64(field, val);
|
||||
Ok(Some(LogicalLiteral::Term(term)))
|
||||
Ok(vec![term])
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
let val: u64 = u64::from_str(phrase)?;
|
||||
let term = Term::from_field_u64(field, val);
|
||||
Ok(Some(LogicalLiteral::Term(term)))
|
||||
Ok(vec![term])
|
||||
}
|
||||
FieldType::Str(ref str_options) => {
|
||||
if let Some(option) = str_options.get_indexing_options() {
|
||||
@@ -194,17 +213,15 @@ impl QueryParser {
|
||||
terms.push(term);
|
||||
});
|
||||
if terms.is_empty() {
|
||||
Ok(None)
|
||||
Ok(vec![])
|
||||
} else if terms.len() == 1 {
|
||||
Ok(Some(LogicalLiteral::Term(
|
||||
terms.into_iter().next().unwrap(),
|
||||
)))
|
||||
Ok(terms)
|
||||
} else {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
if let Some(index_record_option) = field_type.get_index_record_option() {
|
||||
if index_record_option.has_positions() {
|
||||
Ok(Some(LogicalLiteral::Phrase(terms)))
|
||||
Ok(terms)
|
||||
} else {
|
||||
let fieldname = self.schema.get_field_name(field).to_string();
|
||||
Err(QueryParserError::FieldDoesNotHavePositionsIndexed(
|
||||
@@ -223,10 +240,7 @@ impl QueryParser {
|
||||
))
|
||||
}
|
||||
}
|
||||
FieldType::HierarchicalFacet => {
|
||||
let term = Term::from_field_text(field, phrase);
|
||||
Ok(Some(LogicalLiteral::Term(term)))
|
||||
}
|
||||
FieldType::HierarchicalFacet => Ok(vec![Term::from_field_text(field, phrase)]),
|
||||
FieldType::Bytes => {
|
||||
let field_name = self.schema.get_field_name(field).to_string();
|
||||
Err(QueryParserError::FieldNotIndexed(field_name))
|
||||
@@ -234,6 +248,21 @@ impl QueryParser {
|
||||
}
|
||||
}
|
||||
|
||||
fn compute_logical_ast_for_leaf(
|
||||
&self,
|
||||
field: Field,
|
||||
phrase: &str,
|
||||
) -> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
let terms = self.compute_terms_for_string(field, phrase)?;
|
||||
match terms.len() {
|
||||
0 => Ok(None),
|
||||
1 => Ok(Some(LogicalLiteral::Term(
|
||||
terms.into_iter().next().unwrap(),
|
||||
))),
|
||||
_ => Ok(Some(LogicalLiteral::Phrase(terms))),
|
||||
}
|
||||
}
|
||||
|
||||
fn default_occur(&self) -> Occur {
|
||||
if self.conjunction_by_default {
|
||||
Occur::Must
|
||||
@@ -242,6 +271,37 @@ impl QueryParser {
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_bound(&self, field: Field, bound: &UserInputBound) -> Result<Bound<Term>, QueryParserError> {
|
||||
if bound.term_str() == "*" {
|
||||
return Ok(Bound::Unbounded);
|
||||
}
|
||||
let terms = self.compute_terms_for_string(field, bound.term_str())?;
|
||||
if terms.len() != 1 {
|
||||
return Err(QueryParserError::RangeMustNotHavePhrase);
|
||||
}
|
||||
let term = terms.into_iter().next().unwrap();
|
||||
match *bound {
|
||||
UserInputBound::Inclusive(_) => Ok(Bound::Included(term)),
|
||||
UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)),
|
||||
}
|
||||
}
|
||||
|
||||
fn resolved_fields(
|
||||
&self,
|
||||
given_field: &Option<String>,
|
||||
) -> Result<Cow<[Field]>, QueryParserError> {
|
||||
match *given_field {
|
||||
None => {
|
||||
if self.default_fields.is_empty() {
|
||||
Err(QueryParserError::NoDefaultFieldDeclared)
|
||||
} else {
|
||||
Ok(Cow::from(&self.default_fields[..]))
|
||||
}
|
||||
}
|
||||
Some(ref field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])),
|
||||
}
|
||||
}
|
||||
|
||||
fn compute_logical_ast_with_occur(
|
||||
&self,
|
||||
user_input_ast: UserInputAST,
|
||||
@@ -265,6 +325,41 @@ impl QueryParser {
|
||||
let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?;
|
||||
Ok((compose_occur(Occur::Must, occur), logical_sub_queries))
|
||||
}
|
||||
UserInputAST::Range {
|
||||
field,
|
||||
lower,
|
||||
upper,
|
||||
} => {
|
||||
let fields = self.resolved_fields(&field)?;
|
||||
let mut clauses = fields
|
||||
.iter()
|
||||
.map(|&field| {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let value_type = field_entry.field_type().value_type();
|
||||
Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::Range {
|
||||
field,
|
||||
value_type,
|
||||
lower: self.resolve_bound(field, &lower)?,
|
||||
upper: self.resolve_bound(field, &upper)?,
|
||||
})))
|
||||
})
|
||||
.collect::<Result<Vec<_>, QueryParserError>>()?;
|
||||
let result_ast = if clauses.len() == 1 {
|
||||
clauses.pop().unwrap()
|
||||
} else {
|
||||
LogicalAST::Clause(
|
||||
clauses
|
||||
.into_iter()
|
||||
.map(|clause| (Occur::Should, clause))
|
||||
.collect(),
|
||||
)
|
||||
};
|
||||
Ok((Occur::Should, result_ast))
|
||||
}
|
||||
UserInputAST::All => Ok((
|
||||
Occur::Should,
|
||||
LogicalAST::Leaf(Box::new(LogicalLiteral::All)),
|
||||
)),
|
||||
UserInputAST::Leaf(literal) => {
|
||||
let term_phrases: Vec<(Field, String)> = match literal.field_name {
|
||||
Some(ref field_name) => {
|
||||
@@ -327,6 +422,13 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
|
||||
match logical_literal {
|
||||
LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
|
||||
LogicalLiteral::Phrase(terms) => Box::new(PhraseQuery::new(terms)),
|
||||
LogicalLiteral::Range {
|
||||
field,
|
||||
value_type,
|
||||
lower,
|
||||
upper,
|
||||
} => Box::new(RangeQuery::new_term_bounds(field, value_type, lower, upper)),
|
||||
LogicalLiteral::All => Box::new(AllQuery),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -511,6 +613,42 @@ mod test {
|
||||
Term([0, 0, 0, 0, 98])]\"",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:[a TO b]",
|
||||
"(Included(Term([0, 0, 0, 0, 97])) TO \
|
||||
Included(Term([0, 0, 0, 0, 98])))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"[a TO b]",
|
||||
"((Included(Term([0, 0, 0, 0, 97])) TO \
|
||||
Included(Term([0, 0, 0, 0, 98]))) \
|
||||
(Included(Term([0, 0, 0, 1, 97])) TO \
|
||||
Included(Term([0, 0, 0, 1, 98]))))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:{titi TO toto}",
|
||||
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO \
|
||||
Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:{* TO toto}",
|
||||
"(Unbounded TO \
|
||||
Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:{titi TO *}",
|
||||
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO Unbounded)",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"*",
|
||||
"*",
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -14,10 +14,44 @@ impl fmt::Debug for UserInputLiteral {
|
||||
}
|
||||
}
|
||||
|
||||
pub enum UserInputBound {
|
||||
Inclusive(String),
|
||||
Exclusive(String),
|
||||
}
|
||||
|
||||
impl UserInputBound {
|
||||
fn display_lower(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word),
|
||||
UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word),
|
||||
}
|
||||
}
|
||||
|
||||
fn display_upper(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word),
|
||||
UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn term_str(&self) -> &str {
|
||||
match *self {
|
||||
UserInputBound::Inclusive(ref contents) => contents,
|
||||
UserInputBound::Exclusive(ref contents) => contents,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum UserInputAST {
|
||||
Clause(Vec<Box<UserInputAST>>),
|
||||
Not(Box<UserInputAST>),
|
||||
Must(Box<UserInputAST>),
|
||||
Range {
|
||||
field: Option<String>,
|
||||
lower: UserInputBound,
|
||||
upper: UserInputBound,
|
||||
},
|
||||
All,
|
||||
Leaf(Box<UserInputLiteral>),
|
||||
}
|
||||
|
||||
@@ -45,6 +79,20 @@ impl fmt::Debug for UserInputAST {
|
||||
Ok(())
|
||||
}
|
||||
UserInputAST::Not(ref subquery) => write!(formatter, "-({:?})", subquery),
|
||||
UserInputAST::Range {
|
||||
ref field,
|
||||
ref lower,
|
||||
ref upper,
|
||||
} => {
|
||||
if let &Some(ref field) = field {
|
||||
write!(formatter, "{}:", field)?;
|
||||
}
|
||||
lower.display_lower(formatter)?;
|
||||
write!(formatter, " TO ")?;
|
||||
upper.display_upper(formatter)?;
|
||||
Ok(())
|
||||
}
|
||||
UserInputAST::All => write!(formatter, "*"),
|
||||
UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,7 +41,8 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
|
||||
/// # extern crate tantivy;
|
||||
/// # use tantivy::Index;
|
||||
/// # use tantivy::schema::{SchemaBuilder, INT_INDEXED};
|
||||
/// # use tantivy::collector::{Collector, CountCollector};
|
||||
/// # use tantivy::collector::CountCollector;
|
||||
/// # use tantivy::query::Query;
|
||||
/// # use tantivy::Result;
|
||||
/// # use tantivy::query::RangeQuery;
|
||||
/// #
|
||||
@@ -67,7 +68,7 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
|
||||
/// let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970);
|
||||
///
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// count_collector.search(&*searcher, &docs_in_the_sixties)?;
|
||||
/// docs_in_the_sixties.search(&*searcher, &mut count_collector)?;
|
||||
///
|
||||
/// let num_60s_books = count_collector.count();
|
||||
///
|
||||
@@ -88,6 +89,28 @@ pub struct RangeQuery {
}

impl RangeQuery {
    /// Creates a new `RangeQuery` from bounded start and end terms.
    ///
    /// If the value type is not correct, something may go terribly wrong when
    /// the `Weight` object is created.
    pub fn new_term_bounds(
        field: Field,
        value_type: Type,
        left_bound: Bound<Term>,
        right_bound: Bound<Term>,
    ) -> RangeQuery {
        let verify_and_unwrap_term = |val: &Term| {
            assert_eq!(field, val.field());
            val.value_bytes().to_owned()
        };
        RangeQuery {
            field,
            value_type,
            left_bound: map_bound(&left_bound, &verify_and_unwrap_term),
            right_bound: map_bound(&right_bound, &verify_and_unwrap_term),
        }
    }
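A hedged sketch of calling `new_term_bounds` directly; the `title` field and the terms are illustrative assumptions, not part of this diff:

    use std::ops::Bound;
    // Matches titles with a term in ["apple", "cherry"): inclusive lower bound, exclusive upper bound.
    let lower = Bound::Included(Term::from_field_text(title, "apple"));
    let upper = Bound::Excluded(Term::from_field_text(title, "cherry"));
    let range_query = RangeQuery::new_term_bounds(title, Type::Str, lower, upper);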

    /// Creates a new `RangeQuery` over a `i64` field.
    ///
    /// If the field is not of the type `i64`, tantivy
|
||||
@@ -194,12 +217,16 @@ impl RangeQuery {
|
||||
|
||||
/// Lower bound of range
|
||||
pub fn left_bound(&self) -> Bound<Term> {
|
||||
map_bound(&self.left_bound, &|bytes| Term::from_field_bytes(self.field, bytes))
|
||||
map_bound(&self.left_bound, &|bytes| {
|
||||
Term::from_field_bytes(self.field, bytes)
|
||||
})
|
||||
}
|
||||
|
||||
/// Upper bound of range
|
||||
pub fn right_bound(&self) -> Bound<Term> {
|
||||
map_bound(&self.right_bound, &|bytes| Term::from_field_bytes(self.field, bytes))
|
||||
map_bound(&self.right_bound, &|bytes| {
|
||||
Term::from_field_bytes(self.field, bytes)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -273,7 +300,8 @@ impl Weight for RangeWeight {
|
||||
mod tests {
|
||||
|
||||
use super::RangeQuery;
|
||||
use collector::{Collector, CountCollector};
|
||||
use collector::CountCollector;
|
||||
use query::Query;
|
||||
use schema::{Document, Field, SchemaBuilder, INT_INDEXED};
|
||||
use std::collections::Bound;
|
||||
use Index;
|
||||
@@ -304,7 +332,7 @@ mod tests {
|
||||
|
||||
// ... or `1960..=1969` if inclusive range is enabled.
|
||||
let mut count_collector = CountCollector::default();
|
||||
count_collector.search(&*searcher, &docs_in_the_sixties)?;
|
||||
docs_in_the_sixties.search(&*searcher, &mut count_collector)?;
|
||||
assert_eq!(count_collector.count(), 2285);
|
||||
Ok(())
|
||||
}
|
||||
@@ -341,7 +369,9 @@ mod tests {
|
||||
let searcher = index.searcher();
|
||||
let count_multiples = |range_query: RangeQuery| {
|
||||
let mut count_collector = CountCollector::default();
|
||||
count_collector.search(&*searcher, &range_query).unwrap();
|
||||
range_query
|
||||
.search(&*searcher, &mut count_collector)
|
||||
.unwrap();
|
||||
count_collector.count()
|
||||
};
|
||||
|
||||
|
||||
src/query/regex_query.rs (new file, 143 lines)
@@ -0,0 +1,143 @@
use error::ErrorKind;
use fst_regex::Regex;
use query::{AutomatonWeight, Query, Weight};
use schema::Field;
use std::clone::Clone;
use Result;
use Searcher;

/// A Regex Query matches all of the documents
/// containing a specific term that matches
/// a regex pattern.
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result, Term};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, chain};
|
||||
/// use tantivy::query::RegexQuery;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
/// let term = Term::from_field_text(title, "Diary");
|
||||
/// let query = RegexQuery::new("d[ai]{2}ry".to_string(), title);
|
||||
/// searcher.search(&query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 3);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RegexQuery {
|
||||
regex_pattern: String,
|
||||
field: Field,
|
||||
}
|
||||
|
||||
impl RegexQuery {
|
||||
/// Creates a new RegexQuery
|
||||
pub fn new(regex_pattern: String, field: Field) -> RegexQuery {
|
||||
RegexQuery {
|
||||
regex_pattern,
|
||||
field,
|
||||
}
|
||||
}
|
||||
|
||||
fn specialized_weight(&self) -> Result<AutomatonWeight<Regex>> {
|
||||
let automaton = Regex::new(&self.regex_pattern)
|
||||
.map_err(|_| ErrorKind::InvalidArgument(self.regex_pattern.clone()))?;
|
||||
|
||||
Ok(AutomatonWeight::new(self.field.clone(), automaton))
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for RegexQuery {
|
||||
fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<Weight>> {
|
||||
Ok(Box::new(self.specialized_weight()?))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::RegexQuery;
|
||||
use collector::TopCollector;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::TEXT;
|
||||
use tests::assert_nearly_equals;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
pub fn test_regex_query() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let country_field = schema_builder.add_text_field("country", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
|
||||
index_writer.add_document(doc!(
|
||||
country_field => "japan",
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
country_field => "korea",
|
||||
));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
{
|
||||
let mut collector = TopCollector::with_limit(2);
|
||||
|
||||
let regex_query = RegexQuery::new("jap[ao]n".to_string(), country_field);
|
||||
searcher.search(&regex_query, &mut collector).unwrap();
|
||||
let scored_docs = collector.score_docs();
|
||||
assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
|
||||
let (score, _) = scored_docs[0];
|
||||
assert_nearly_equals(1f32, score);
|
||||
}
|
||||
|
||||
let searcher = index.searcher();
|
||||
{
|
||||
let mut collector = TopCollector::with_limit(2);
|
||||
|
||||
let regex_query = RegexQuery::new("jap[A-Z]n".to_string(), country_field);
|
||||
searcher.search(&regex_query, &mut collector).unwrap();
|
||||
let scored_docs = collector.score_docs();
|
||||
assert_eq!(scored_docs.len(), 0, "Expected ZERO document");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
use collector::SegmentCollector;
use collector::Collector;
use common::BitSet;
use docset::{DocSet, SkipResult};
use downcast;
@@ -18,7 +18,7 @@ pub trait Scorer: downcast::Any + DocSet + 'static {

    /// Consumes the complete `DocSet` and
    /// pushes the scored documents to the collector.
    fn collect<T>(&mut self, collector: &mut SegmentCollector<CollectionResult = T>, delete_bitset_opt: Option<&DeleteBitSet>) {
    fn collect(&mut self, collector: &mut Collector, delete_bitset_opt: Option<&DeleteBitSet>) {
        if let Some(delete_bitset) = delete_bitset_opt {
            while self.advance() {
                let doc = self.doc();
@@ -44,7 +44,7 @@ impl Scorer for Box<Scorer> {
        self.deref_mut().score()
    }

    fn collect<T>(&mut self, collector: &mut SegmentCollector<CollectionResult = T>, delete_bitset: Option<&DeleteBitSet>) {
    fn collect(&mut self, collector: &mut Collector, delete_bitset: Option<&DeleteBitSet>) {
        let scorer = self.deref_mut();
        scorer.collect(collector, delete_bitset);
    }
|
||||
|
||||
@@ -16,6 +16,59 @@ use Term;
|
||||
/// * `idf` - inverse document frequency.
|
||||
/// * `term_freq` - number of occurrences of the term in the field
|
||||
/// * `field norm` - number of tokens in the field.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT, IndexRecordOption};
|
||||
/// use tantivy::{Index, Result, Term};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, chain};
|
||||
/// use tantivy::query::TermQuery;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit()?;
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
/// let query = TermQuery::new(
|
||||
/// Term::from_field_text(title, "diary"),
|
||||
/// IndexRecordOption::Basic,
|
||||
/// );
|
||||
/// searcher.search(&query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct TermQuery {
|
||||
term: Term,
|
||||
|
||||
@@ -6,7 +6,7 @@ use Result;
/// for a given set of segments.
///
/// See [`Query`](./trait.Query.html).
pub trait Weight: Send + Sync + 'static {
pub trait Weight {
    /// Returns the scorer for the given segment.
    /// See [`Query`](./trait.Query.html).
    fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;
|
||||
|
||||
src/store/compression_lz4.rs (new file, 19 lines)
@@ -0,0 +1,19 @@
extern crate lz4;

use std::io::{self, Read, Write};

pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
    compressed.clear();
    let mut encoder = lz4::EncoderBuilder::new().build(compressed)?;
    encoder.write_all(&uncompressed)?;
    let (_, encoder_result) = encoder.finish();
    encoder_result?;
    Ok(())
}

pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
    decompressed.clear();
    let mut decoder = lz4::Decoder::new(compressed)?;
    decoder.read_to_end(decompressed)?;
    Ok(())
}
|
||||
src/store/compression_snap.rs (new file, 17 lines)
@@ -0,0 +1,17 @@
extern crate snap;

use std::io::{self, Read, Write};

pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
    compressed.clear();
    let mut encoder = snap::Writer::new(compressed);
    encoder.write_all(&uncompressed)?;
    encoder.flush()?;
    Ok(())
}

pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
    decompressed.clear();
    snap::Reader::new(compressed).read_to_end(decompressed)?;
    Ok(())
}
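Both compression backends expose the same `compress`/`decompress` pair, so the store writer and reader can call them without caring which feature is enabled. A minimal round-trip sketch based only on the signatures shown above (buffer contents are illustrative):

    let original = b"a block of stored documents".to_vec();
    let mut compressed = Vec::new();
    compress(&original[..], &mut compressed)?;
    let mut restored = Vec::new();
    decompress(&compressed[..], &mut restored)?;
    assert_eq!(original, restored);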
@@ -34,10 +34,21 @@ and should rely on either
!*/

mod reader;
mod skiplist;
mod writer;
pub use self::reader::StoreReader;
pub use self::writer::StoreWriter;

#[cfg(feature = "lz4")]
mod compression_lz4;
#[cfg(feature = "lz4")]
use self::compression_lz4::*;

#[cfg(not(feature = "lz4"))]
mod compression_snap;
#[cfg(not(feature = "lz4"))]
use self::compression_snap::*;

#[cfg(test)]
pub mod tests {
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use Result;
|
||||
|
||||
use super::decompress;
|
||||
use super::skiplist::SkipList;
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use datastruct::SkipList;
|
||||
use directory::ReadOnlySource;
|
||||
use lz4;
|
||||
use schema::Document;
|
||||
use std::cell::RefCell;
|
||||
use std::io::{self, Read};
|
||||
use std::io;
|
||||
use std::mem::size_of;
|
||||
use DocId;
|
||||
|
||||
@@ -61,9 +61,7 @@ impl StoreReader {
|
||||
let mut current_block_mut = self.current_block.borrow_mut();
|
||||
current_block_mut.clear();
|
||||
let compressed_block = self.compressed_block(block_offset);
|
||||
let mut lz4_decoder = lz4::Decoder::new(compressed_block)?;
|
||||
*self.current_block_offset.borrow_mut() = usize::max_value();
|
||||
lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ())?;
|
||||
decompress(compressed_block, &mut current_block_mut)?;
|
||||
*self.current_block_offset.borrow_mut() = block_offset;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use super::compress;
|
||||
use super::skiplist::SkipListBuilder;
|
||||
use super::StoreReader;
|
||||
use common::CountingWriter;
|
||||
use common::{BinarySerializable, VInt};
|
||||
use datastruct::SkipListBuilder;
|
||||
use directory::WritePtr;
|
||||
use lz4;
|
||||
use schema::Document;
|
||||
use std::io::{self, Write};
|
||||
use DocId;
|
||||
@@ -87,12 +87,7 @@ impl StoreWriter {
|
||||
|
||||
fn write_and_compress_block(&mut self) -> io::Result<()> {
|
||||
self.intermediary_buffer.clear();
|
||||
{
|
||||
let mut encoder = lz4::EncoderBuilder::new().build(&mut self.intermediary_buffer)?;
|
||||
encoder.write_all(&self.current_block)?;
|
||||
let (_, encoder_result) = encoder.finish();
|
||||
encoder_result?;
|
||||
}
|
||||
compress(&self.current_block[..], &mut self.intermediary_buffer)?;
|
||||
(self.intermediary_buffer.len() as u32).serialize(&mut self.writer)?;
|
||||
self.writer.write_all(&self.intermediary_buffer)?;
|
||||
self.offset_index_writer
|
||||
|
||||
@@ -94,7 +94,7 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
|
||||
let bit_shift = (addr_bits % 8) as u64;
|
||||
assert!(data.len() >= addr_byte + 8);
|
||||
let val_unshifted_unmasked: u64 = unsafe {
|
||||
//< ok : check len above
|
||||
// ok thanks to the 7 byte padding on `.close`
|
||||
let addr = data.as_ptr().offset(addr_byte as isize) as *const u64;
|
||||
ptr::read_unaligned(addr)
|
||||
};
|
||||
|
||||
@@ -203,7 +203,7 @@ impl TermDictionary {
|
||||
|
||||
/// Returns a search builder, to stream all of the terms
|
||||
/// within the Automaton
|
||||
pub fn search<'a, A: Automaton>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
|
||||
pub fn search<'a, A: Automaton + 'a>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
|
||||
let stream_builder = self.fst_index.search(automaton);
|
||||
TermStreamerBuilder::<A>::new(self, stream_builder)
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@ impl Default for Token {
|
||||
offset_from: 0,
|
||||
offset_to: 0,
|
||||
position: usize::max_value(),
|
||||
text: String::new(),
|
||||
text: String::with_capacity(200),
|
||||
}
|
||||
}
|
||||
}