mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 01:52:54 +00:00
Compare commits
11 Commits
0.6.0
...
broken_col
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c9d8031664 | ||
|
|
9628413386 | ||
|
|
b2ce65f52d | ||
|
|
ad81e131ec | ||
|
|
327ca2ab02 | ||
|
|
16ca6a0e5c | ||
|
|
8c07ae653d | ||
|
|
3d483f8711 | ||
|
|
56b2e9731f | ||
|
|
c85668cabe | ||
|
|
8a33ddaca7 |
154
.travis.yml
154
.travis.yml
@@ -1,127 +1,37 @@
|
||||
# Based on the "trust" template v0.1.2
|
||||
# https://github.com/japaric/trust/tree/v0.1.2
|
||||
|
||||
dist: trusty
|
||||
language: rust
|
||||
services: docker
|
||||
sudo: required
|
||||
|
||||
cache: cargo
|
||||
rust:
|
||||
- nightly
|
||||
env:
|
||||
global:
|
||||
- CRATE_NAME=tantivy
|
||||
|
||||
matrix:
|
||||
include:
|
||||
# Android
|
||||
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
|
||||
- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
|
||||
- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
|
||||
- env: TARGET=i686-linux-android DISABLE_TESTS=1
|
||||
- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
|
||||
|
||||
# iOS
|
||||
#- env: TARGET=aarch64-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
#- env: TARGET=armv7-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
#- env: TARGET=armv7s-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
#- env: TARGET=i386-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
- env: TARGET=x86_64-apple-ios DISABLE_TESTS=1
|
||||
os: osx
|
||||
|
||||
# Linux
|
||||
- env: TARGET=aarch64-unknown-linux-gnu
|
||||
# - env: TARGET=arm-unknown-linux-gnueabi
|
||||
# - env: TARGET=armv7-unknown-linux-gnueabihf
|
||||
- env: TARGET=i686-unknown-linux-gnu
|
||||
#- env: TARGET=i686-unknown-linux-musl
|
||||
#- env: TARGET=mips-unknown-linux-gnu
|
||||
#- env: TARGET=mips64-unknown-linux-gnuabi64
|
||||
#- env: TARGET=mips64el-unknown-linux-gnuabi64
|
||||
#- env: TARGET=mipsel-unknown-linux-gnu
|
||||
#- env: TARGET=powerpc-unknown-linux-gnu
|
||||
#- env: TARGET=powerpc64-unknown-linux-gnu
|
||||
#- env: TARGET=powerpc64le-unknown-linux-gnu
|
||||
#- env: TARGET=s390x-unknown-linux-gnu DISABLE_TESTS=1
|
||||
- env: TARGET=x86_64-unknown-linux-gnu
|
||||
- env: TARGET=x86_64-unknown-linux-musl
|
||||
|
||||
# OSX
|
||||
#- env: TARGET=i686-apple-darwin
|
||||
# os: osx
|
||||
- env: TARGET=x86_64-apple-darwin
|
||||
os: osx
|
||||
|
||||
# *BSD
|
||||
#- env: TARGET=i686-unknown-freebsd DISABLE_TESTS=1
|
||||
#- env: TARGET=x86_64-unknown-freebsd DISABLE_TESTS=1
|
||||
#- env: TARGET=x86_64-unknown-netbsd DISABLE_TESTS=1
|
||||
|
||||
# Windows
|
||||
#- env: TARGET=x86_64-pc-windows-gnu
|
||||
|
||||
# Bare metal
|
||||
# These targets don't support std and as such are likely not suitable for
|
||||
# most crates.
|
||||
# - env: TARGET=thumbv6m-none-eabi
|
||||
# - env: TARGET=thumbv7em-none-eabi
|
||||
# - env: TARGET=thumbv7em-none-eabihf
|
||||
# - env: TARGET=thumbv7m-none-eabi
|
||||
|
||||
# Testing other channels
|
||||
#- env: TARGET=x86_64-unknown-linux-gnu
|
||||
# rust: nightly
|
||||
#- env: TARGET=x86_64-apple-darwin
|
||||
# os: osx
|
||||
# rust: nightly
|
||||
|
||||
before_install:
|
||||
- set -e
|
||||
- rustup self update
|
||||
|
||||
install:
|
||||
- sh ci/install.sh
|
||||
- source ~/.cargo/env || true
|
||||
|
||||
- CC=gcc-4.8
|
||||
- CXX=g++-4.8
|
||||
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
|
||||
- secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
addons:
|
||||
apt:
|
||||
sources:
|
||||
- ubuntu-toolchain-r-test
|
||||
- kalakris-cmake
|
||||
packages:
|
||||
- gcc-4.8
|
||||
- g++-4.8
|
||||
- libcurl4-openssl-dev
|
||||
- libelf-dev
|
||||
- libdw-dev
|
||||
- binutils-dev
|
||||
- cmake
|
||||
before_script:
|
||||
- export PATH=$HOME/.cargo/bin:$PATH
|
||||
- cargo install cargo-update || echo "cargo-update already installed"
|
||||
- cargo install cargo-travis || echo "cargo-travis already installed"
|
||||
script:
|
||||
- bash ci/script.sh
|
||||
|
||||
after_script: set +e
|
||||
|
||||
before_deploy:
|
||||
- sh ci/before_deploy.sh
|
||||
#
|
||||
#deploy:
|
||||
# # - Create a `public_repo` GitHub token. Go to: https://github.com/settings/tokens/new
|
||||
# # - Encrypt it: `travis encrypt 0123456789012345678901234567890123456789
|
||||
# # - Paste the output down here
|
||||
# api_key:
|
||||
# secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
# file_glob: true
|
||||
# file: $CRATE_NAME-$TRAVIS_TAG-$TARGET.*
|
||||
# on:
|
||||
# # TODO Here you can pick which targets will generate binary releases
|
||||
# # In this example, there are some targets that are tested using the stable
|
||||
# # and nightly channels. This condition makes sure there is only one release
|
||||
# # for such targets and that's generated using the stable channel
|
||||
# condition: $TRAVIS_RUST_VERSION = stable
|
||||
# tags: true
|
||||
# provider: releases
|
||||
# skip_cleanup: true
|
||||
|
||||
cache: cargo
|
||||
before_cache:
|
||||
# Travis can't cache files that are not readable by "others"
|
||||
- chmod -R a+r $HOME/.cargo
|
||||
|
||||
#branches:
|
||||
# only:
|
||||
# # release tags
|
||||
# - /^v\d+\.\d+\.\d+.*$/
|
||||
# - master
|
||||
|
||||
notifications:
|
||||
email:
|
||||
on_success: never
|
||||
- cargo build
|
||||
- cargo test
|
||||
- cargo test -- --ignored
|
||||
- cargo run --example simple_search
|
||||
- cargo doc
|
||||
after_success:
|
||||
- cargo coveralls --exclude-pattern src/functional_test.rs
|
||||
- cargo doc-upload
|
||||
|
||||
11
AUTHORS
11
AUTHORS
@@ -1,11 +0,0 @@
|
||||
# This is the list of authors of tantivy for copyright purposes.
|
||||
Paul Masurel
|
||||
Laurentiu Nicola
|
||||
Dru Sellers
|
||||
Ashley Mannix
|
||||
Michael J. Curry
|
||||
Jason Wolfe
|
||||
# As an employee of Google I am required to add Google LLC
|
||||
# in the list of authors, but this project is not affiliated to Google
|
||||
# in any other way.
|
||||
Google LLC
|
||||
17
CHANGELOG.md
17
CHANGELOG.md
@@ -1,23 +1,14 @@
|
||||
Tantivy 0.6
|
||||
==========================
|
||||
|
||||
|
||||
Special thanks to @drusellers and @jason-wolfe for their contributions
|
||||
to this release!
|
||||
|
||||
- Removed C code. Tantivy is now pure Rust. (@pmasurel)
|
||||
- BM25 (@pmasurel)
|
||||
- Approximate field norms encoded over 1 byte. (@pmasurel)
|
||||
- Compiles on stable rust (@pmasurel)
|
||||
- Removed C code. Tantivy is now pure Rust.
|
||||
- BM25
|
||||
- Approximate field norms encoded over 1 byte.
|
||||
- Compiles on stable rust
|
||||
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
|
||||
- Completely uncompressed
|
||||
- Internally: One u64 fast field for indexes, one fast field for the bytes themselves.
|
||||
- Add NGram token support (@drusellers)
|
||||
- Add Stopword Filter support (@drusellers)
|
||||
- Add a FuzzyTermQuery (@drusellers)
|
||||
- Add a RegexQuery (@drusellers)
|
||||
- Various performance improvements (@pmasurel)_
|
||||
|
||||
|
||||
Tantivy 0.5.2
|
||||
===========================
|
||||
|
||||
17
Cargo.toml
17
Cargo.toml
@@ -1,15 +1,15 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.6.0"
|
||||
version = "0.6.0-dev"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
description = """Search engine library"""
|
||||
description = """Tantivy is a search engine library."""
|
||||
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
|
||||
homepage = "https://github.com/tantivy-search/tantivy"
|
||||
repository = "https://github.com/tantivy-search/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "search engine", "information", "retrieval"]
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.9.1"
|
||||
@@ -18,10 +18,7 @@ lazy_static = "0.2.1"
|
||||
tinysegmenter = "0.1.0"
|
||||
regex = "0.2"
|
||||
fst = {version="0.3", default-features=false}
|
||||
fst-regex = { version="0.2" }
|
||||
lz4 = {version="1.20", optional=true}
|
||||
snap = {version="0.2"}
|
||||
atomicwrites = {version="0.2.2", optional=true}
|
||||
atomicwrites = {version="0.1", optional=true}
|
||||
tempfile = "2.1"
|
||||
log = "0.3.6"
|
||||
combine = "2.2"
|
||||
@@ -32,6 +29,7 @@ serde_json = "1.0"
|
||||
num_cpus = "1.2"
|
||||
itertools = "0.5.9"
|
||||
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
||||
lz4 = "1.20"
|
||||
bit-set = "0.4.0"
|
||||
uuid = { version = "0.6", features = ["v4", "serde"] }
|
||||
chan = "0.1"
|
||||
@@ -44,7 +42,7 @@ stable_deref_trait = "1.0.0"
|
||||
rust-stemmers = "0.1.0"
|
||||
downcast = { version="0.9" }
|
||||
matches = "0.1"
|
||||
bitpacking = "0.5"
|
||||
bitpacking = "0.4"
|
||||
fnv = "1.0.6"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
@@ -62,8 +60,9 @@ debug-assertions = false
|
||||
|
||||
[features]
|
||||
default = ["mmap"]
|
||||
simd = ["bitpacking/simd"]
|
||||
mmap = ["fst/mmap", "atomicwrites"]
|
||||
lz4-compression = ["lz4"]
|
||||
unstable = ["simd"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,4 +1,4 @@
|
||||
Copyright (c) 2018 by the project authors, as listed in the AUTHORS file.
|
||||
Copyright (c) 2018 by Paul Masurel, Google LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
|
||||
66
README.md
66
README.md
@@ -4,50 +4,36 @@
|
||||
[](https://coveralls.io/github/tantivy-search/tantivy?branch=master)
|
||||
[](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
|
||||
[](https://ci.appveyor.com/project/fulmicoton/tantivy)
|
||||
|
||||
**Tantivy** is a **full text search engine library** written in rust.
|
||||
|
||||
It is closer to Lucene than to Elastic Search and Solr in the sense it is not
|
||||
an off-the-shelf search engine server, but rather a crate that can be used
|
||||
to build such a search engine.
|
||||
|
||||
Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
It is strongly inspired by Lucene's design.
|
||||
|
||||
# Features
|
||||
|
||||
- Full-text search
|
||||
- Tiny startup time (<10ms), perfect for command line tools
|
||||
- BM25 scoring (the same as lucene)
|
||||
- Basic query language (`+michael +jackson`)
|
||||
- Phrase queries search (\"michael jackson\"`)
|
||||
- tf-idf scoring
|
||||
- Basic query language
|
||||
- Phrase queries
|
||||
- Incremental indexing
|
||||
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
|
||||
- Mmap directory
|
||||
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
|
||||
- optional SIMD integer compression
|
||||
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
|
||||
- `&[u8]` fast fields
|
||||
- LZ4 compressed document store
|
||||
- Range queries
|
||||
- Faceted search
|
||||
- Configurable indexing (optional term frequency and position indexing
|
||||
- Faceting
|
||||
- configurable indexing (optional term frequency and position indexing
|
||||
- Cheesy logo with a horse
|
||||
|
||||
# Non-features
|
||||
Tantivy supports Linux, MacOS and Windows.
|
||||
|
||||
- Distributed search and will not be in the scope of tantivy.
|
||||
|
||||
|
||||
# Supported OS and compiler
|
||||
|
||||
Tantivy works on stable rust (>= 1.27) and supports Linux, MacOS and Windows.
|
||||
|
||||
# Getting started
|
||||
|
||||
- [tantivy's simple search example](http://fulmicoton.com/tantivy-examples/simple_search.html)
|
||||
- [tantivy's usage example](http://fulmicoton.com/tantivy-examples/simple_search.html)
|
||||
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli).
|
||||
`tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
|
||||
index documents and search via the CLI or a small server with a REST API.
|
||||
It will walk you through getting a wikipedia search engine up and running in a few minutes.
|
||||
- [reference doc]
|
||||
- [For the last released version](https://docs.rs/tantivy/)
|
||||
@@ -57,14 +43,40 @@ It will walk you through getting a wikipedia search engine up and running in a f
|
||||
|
||||
## Development
|
||||
|
||||
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
|
||||
To check out and run tests, you can simply run :
|
||||
Tantivy now compiles on stable rust.
|
||||
To check out and run test, you can simply run :
|
||||
|
||||
git clone git@github.com:tantivy-search/tantivy.git
|
||||
cd tantivy
|
||||
cargo build
|
||||
|
||||
|
||||
## Note on release build and performance
|
||||
|
||||
If your project depends on `tantivy`, for better performance, make sure to enable
|
||||
`sse3` instructions using a RUSTFLAGS. (This instruction set is likely to
|
||||
be available on most `x86_64` CPUs you will encounter).
|
||||
|
||||
For instance,
|
||||
|
||||
RUSTFLAGS='-C target-feature=+sse3'
|
||||
|
||||
Or, if you are targetting a specific cpu
|
||||
|
||||
RUSTFLAGS='-C target-cpu=native' build --release
|
||||
|
||||
Regardless of the flags you pass, by default `tantivy` will contain `SSE3` instructions.
|
||||
If you want to disable those, you can run the following command :
|
||||
|
||||
cargo build --no-default-features
|
||||
|
||||
Alternatively, if you are trying to compile `tantivy` without simd compression,
|
||||
you can disable this functionality. In this case, this submodule is not required
|
||||
and you can compile tantivy by using the `--no-default-features` flag.
|
||||
|
||||
cargo build --no-default-features
|
||||
|
||||
|
||||
# Contribute
|
||||
|
||||
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
|
||||
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
|
||||
@@ -6,6 +6,9 @@ environment:
|
||||
matrix:
|
||||
- channel: nightly
|
||||
target: x86_64-pc-windows-msvc
|
||||
- channel: nightly
|
||||
target: x86_64-pc-windows-gnu
|
||||
msys_bits: 64
|
||||
|
||||
install:
|
||||
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
# This script takes care of packaging the build artifacts that will go in the
|
||||
# release zipfile
|
||||
|
||||
$SRC_DIR = $PWD.Path
|
||||
$STAGE = [System.Guid]::NewGuid().ToString()
|
||||
|
||||
Set-Location $ENV:Temp
|
||||
New-Item -Type Directory -Name $STAGE
|
||||
Set-Location $STAGE
|
||||
|
||||
$ZIP = "$SRC_DIR\$($Env:CRATE_NAME)-$($Env:APPVEYOR_REPO_TAG_NAME)-$($Env:TARGET).zip"
|
||||
|
||||
# TODO Update this to package the right artifacts
|
||||
Copy-Item "$SRC_DIR\target\$($Env:TARGET)\release\hello.exe" '.\'
|
||||
|
||||
7z a "$ZIP" *
|
||||
|
||||
Push-AppveyorArtifact "$ZIP"
|
||||
|
||||
Remove-Item *.* -Force
|
||||
Set-Location ..
|
||||
Remove-Item $STAGE
|
||||
Set-Location $SRC_DIR
|
||||
@@ -1,33 +0,0 @@
|
||||
# This script takes care of building your crate and packaging it for release
|
||||
|
||||
set -ex
|
||||
|
||||
main() {
|
||||
local src=$(pwd) \
|
||||
stage=
|
||||
|
||||
case $TRAVIS_OS_NAME in
|
||||
linux)
|
||||
stage=$(mktemp -d)
|
||||
;;
|
||||
osx)
|
||||
stage=$(mktemp -d -t tmp)
|
||||
;;
|
||||
esac
|
||||
|
||||
test -f Cargo.lock || cargo generate-lockfile
|
||||
|
||||
# TODO Update this to build the artifacts that matter to you
|
||||
cross rustc --bin hello --target $TARGET --release -- -C lto
|
||||
|
||||
# TODO Update this to package the right artifacts
|
||||
cp target/$TARGET/release/hello $stage/
|
||||
|
||||
cd $stage
|
||||
tar czf $src/$CRATE_NAME-$TRAVIS_TAG-$TARGET.tar.gz *
|
||||
cd $src
|
||||
|
||||
rm -rf $stage
|
||||
}
|
||||
|
||||
main
|
||||
@@ -1,47 +0,0 @@
|
||||
set -ex
|
||||
|
||||
main() {
|
||||
local target=
|
||||
if [ $TRAVIS_OS_NAME = linux ]; then
|
||||
target=x86_64-unknown-linux-musl
|
||||
sort=sort
|
||||
else
|
||||
target=x86_64-apple-darwin
|
||||
sort=gsort # for `sort --sort-version`, from brew's coreutils.
|
||||
fi
|
||||
|
||||
# Builds for iOS are done on OSX, but require the specific target to be
|
||||
# installed.
|
||||
case $TARGET in
|
||||
aarch64-apple-ios)
|
||||
rustup target install aarch64-apple-ios
|
||||
;;
|
||||
armv7-apple-ios)
|
||||
rustup target install armv7-apple-ios
|
||||
;;
|
||||
armv7s-apple-ios)
|
||||
rustup target install armv7s-apple-ios
|
||||
;;
|
||||
i386-apple-ios)
|
||||
rustup target install i386-apple-ios
|
||||
;;
|
||||
x86_64-apple-ios)
|
||||
rustup target install x86_64-apple-ios
|
||||
;;
|
||||
esac
|
||||
|
||||
# This fetches latest stable release
|
||||
local tag=$(git ls-remote --tags --refs --exit-code https://github.com/japaric/cross \
|
||||
| cut -d/ -f3 \
|
||||
| grep -E '^v[0.1.0-9.]+$' \
|
||||
| $sort --version-sort \
|
||||
| tail -n1)
|
||||
curl -LSfs https://japaric.github.io/trust/install.sh | \
|
||||
sh -s -- \
|
||||
--force \
|
||||
--git japaric/cross \
|
||||
--tag $tag \
|
||||
--target $target
|
||||
}
|
||||
|
||||
main
|
||||
23
ci/script.sh
23
ci/script.sh
@@ -1,23 +0,0 @@
|
||||
# This script takes care of testing your crate
|
||||
|
||||
set -ex
|
||||
|
||||
main() {
|
||||
cross build --target $TARGET
|
||||
cross build --target $TARGET --release
|
||||
|
||||
if [ ! -z $DISABLE_TESTS ]; then
|
||||
return
|
||||
fi
|
||||
|
||||
cross test --target $TARGET
|
||||
# cross test --target $TARGET --release
|
||||
|
||||
# cross run --target $TARGET
|
||||
# cross run --target $TARGET --release
|
||||
}
|
||||
|
||||
# we don't run the "test phase" when doing deploys
|
||||
if [ -z $TRAVIS_TAG ]; then
|
||||
main
|
||||
fi
|
||||
@@ -61,7 +61,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
//
|
||||
// This will actually just save a meta.json
|
||||
// with our schema in the directory.
|
||||
let index = Index::create_in_dir(index_path, schema.clone())?;
|
||||
let index = Index::create(index_path, schema.clone())?;
|
||||
|
||||
// here we are registering our custome tokenizer
|
||||
// this will store tokens of 3 characters each
|
||||
|
||||
@@ -64,7 +64,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
//
|
||||
// This will actually just save a meta.json
|
||||
// with our schema in the directory.
|
||||
let index = Index::create_in_dir(index_path, schema.clone())?;
|
||||
let index = Index::create(index_path, schema.clone())?;
|
||||
|
||||
// To insert document we need an index writer.
|
||||
// There must be only one writer at a time.
|
||||
|
||||
@@ -4,111 +4,87 @@ use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use collector::SegmentCollector;
|
||||
use collector::CollectorWrapper;
|
||||
|
||||
/// Collector that does nothing.
|
||||
/// This is used in the chain Collector and will hopefully
|
||||
/// be optimized away by the compiler.
|
||||
pub struct DoNothingCollector;
|
||||
impl Collector for DoNothingCollector {
|
||||
type Child = DoNothingCollector;
|
||||
#[inline]
|
||||
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
Ok(())
|
||||
fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<DoNothingCollector> {
|
||||
Ok(DoNothingCollector)
|
||||
}
|
||||
#[inline]
|
||||
fn collect(&mut self, _doc: DocId, _score: Score) {}
|
||||
#[inline]
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for DoNothingCollector {
|
||||
type CollectionResult = ();
|
||||
|
||||
#[inline]
|
||||
fn collect(&mut self, _doc: DocId, _score: Score) {}
|
||||
|
||||
fn finalize(self) -> () {
|
||||
()
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero-cost abstraction used to collect on multiple collectors.
|
||||
/// This contraption is only usable if the type of your collectors
|
||||
/// are known at compile time.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, chain};
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct ChainedCollector<Left: Collector, Right: Collector> {
|
||||
left: Left,
|
||||
right: Right,
|
||||
}
|
||||
|
||||
pub struct ChainedSegmentCollector<Left: SegmentCollector, Right: SegmentCollector> {
|
||||
left: Left,
|
||||
right: Right,
|
||||
}
|
||||
|
||||
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
|
||||
/// Adds a collector
|
||||
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
|
||||
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, CollectorWrapper<C>> {
|
||||
ChainedCollector {
|
||||
left: self,
|
||||
right: new_collector,
|
||||
right: CollectorWrapper::new(new_collector),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
|
||||
fn set_segment(
|
||||
type Child = ChainedSegmentCollector<Left::Child, Right::Child>;
|
||||
fn for_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
self.left.set_segment(segment_local_id, segment)?;
|
||||
self.right.set_segment(segment_local_id, segment)?;
|
||||
Ok(())
|
||||
) -> Result<Self::Child> {
|
||||
Ok(ChainedSegmentCollector {
|
||||
left: self.left.for_segment(segment_local_id, segment)?,
|
||||
right: self.right.for_segment(segment_local_id, segment)?,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.left.requires_scoring() || self.right.requires_scoring()
|
||||
}
|
||||
}
|
||||
|
||||
impl<Left: SegmentCollector, Right: SegmentCollector> SegmentCollector for ChainedSegmentCollector<Left, Right> {
|
||||
type CollectionResult = (Left::CollectionResult, Right::CollectionResult);
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.left.collect(doc, score);
|
||||
self.right.collect(doc, score);
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.left.requires_scoring() || self.right.requires_scoring()
|
||||
fn finalize(self) -> Self::CollectionResult {
|
||||
(self.left.finalize(), self.right.finalize())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -122,19 +98,35 @@ pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::{Collector, CountCollector, TopCollector};
|
||||
use collector::{CountCollector, SegmentCollector, TopCollector};
|
||||
use schema::SchemaBuilder;
|
||||
use Index;
|
||||
use Document;
|
||||
|
||||
#[test]
|
||||
fn test_chained_collector() {
|
||||
let schema_builder = SchemaBuilder::new();
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let doc = Document::new();
|
||||
index_writer.add_document(doc);
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let segment_readers = searcher.segment_readers();
|
||||
|
||||
let mut top_collector = TopCollector::with_limit(2);
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
let mut segment_collector = collectors.for_segment(0, &segment_readers[0]).unwrap();
|
||||
segment_collector.collect(1, 0.2);
|
||||
segment_collector.collect(2, 0.1);
|
||||
segment_collector.collect(3, 0.5);
|
||||
collectors.merge_children(vec![segment_collector]);
|
||||
}
|
||||
assert_eq!(count_collector.count(), 3);
|
||||
assert!(top_collector.at_capacity());
|
||||
|
||||
@@ -4,56 +4,11 @@ use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use collector::SegmentCollector;
|
||||
use collector::Combinable;
|
||||
|
||||
/// `CountCollector` collector only counts how many
|
||||
/// documents match the query.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::CountCollector;
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut count_collector).unwrap();
|
||||
///
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Default)]
|
||||
pub struct CountCollector {
|
||||
count: usize,
|
||||
@@ -68,12 +23,10 @@ impl CountCollector {
|
||||
}
|
||||
|
||||
impl Collector for CountCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
type Child = CountCollector;
|
||||
|
||||
fn collect(&mut self, _: DocId, _: Score) {
|
||||
self.count += 1;
|
||||
fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<CountCollector> {
|
||||
Ok(CountCollector::default())
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
@@ -81,10 +34,28 @@ impl Collector for CountCollector {
|
||||
}
|
||||
}
|
||||
|
||||
impl Combinable for CountCollector {
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
self.count += other.count;
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for CountCollector {
|
||||
type CollectionResult = CountCollector;
|
||||
|
||||
fn collect(&mut self, _: DocId, _: Score) {
|
||||
self.count += 1;
|
||||
}
|
||||
|
||||
fn finalize(self) -> CountCollector {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use collector::{Collector, CountCollector};
|
||||
use collector::{Collector, CountCollector, SegmentCollector};
|
||||
|
||||
#[test]
|
||||
fn test_count_collector() {
|
||||
|
||||
@@ -3,14 +3,12 @@ use docset::SkipResult;
|
||||
use fastfield::FacetReader;
|
||||
use schema::Facet;
|
||||
use schema::Field;
|
||||
use std::cell::UnsafeCell;
|
||||
use std::collections::btree_map;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::Bound;
|
||||
use std::iter::Peekable;
|
||||
use std::mem;
|
||||
use std::{u64, usize};
|
||||
use termdict::TermMerger;
|
||||
|
||||
@@ -20,6 +18,7 @@ use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use collector::SegmentCollector;
|
||||
|
||||
struct Hit<'a> {
|
||||
count: u64,
|
||||
@@ -194,19 +193,22 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// }
|
||||
/// ```
|
||||
pub struct FacetCollector {
|
||||
facet_ords: Vec<u64>,
|
||||
field: Field,
|
||||
ff_reader: Option<UnsafeCell<FacetReader>>,
|
||||
segment_counters: Vec<SegmentFacetCounter>,
|
||||
facets: BTreeSet<Facet>,
|
||||
}
|
||||
|
||||
pub struct FacetSegmentCollector {
|
||||
reader: FacetReader,
|
||||
|
||||
facet_ords_buf: Vec<u64>,
|
||||
|
||||
// facet_ord -> collapse facet_id
|
||||
current_segment_collapse_mapping: Vec<usize>,
|
||||
collapse_mapping: Vec<usize>,
|
||||
// collapse facet_id -> count
|
||||
current_segment_counts: Vec<u64>,
|
||||
counts: Vec<u64>,
|
||||
// collapse facet_id -> facet_ord
|
||||
current_collapse_facet_ords: Vec<u64>,
|
||||
|
||||
facets: BTreeSet<Facet>,
|
||||
collapse_facet_ords: Vec<u64>,
|
||||
}
|
||||
|
||||
fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
||||
@@ -240,15 +242,9 @@ impl FacetCollector {
|
||||
/// is of the proper type.
|
||||
pub fn for_field(field: Field) -> FacetCollector {
|
||||
FacetCollector {
|
||||
facet_ords: Vec::with_capacity(255),
|
||||
segment_counters: Vec::new(),
|
||||
field,
|
||||
ff_reader: None,
|
||||
facets: BTreeSet::new(),
|
||||
|
||||
current_segment_collapse_mapping: Vec::new(),
|
||||
current_collapse_facet_ords: Vec::new(),
|
||||
current_segment_counts: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -279,82 +275,21 @@ impl FacetCollector {
|
||||
self.facets.insert(facet);
|
||||
}
|
||||
|
||||
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
|
||||
self.current_segment_collapse_mapping.clear();
|
||||
self.current_collapse_facet_ords.clear();
|
||||
self.current_segment_counts.clear();
|
||||
let mut collapse_facet_it = self.facets.iter().peekable();
|
||||
self.current_collapse_facet_ords.push(0);
|
||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
||||
if !facet_streamer.advance() {
|
||||
return;
|
||||
}
|
||||
'outer: loop {
|
||||
// at the begining of this loop, facet_streamer
|
||||
// is positionned on a term that has not been processed yet.
|
||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||
match skip_result {
|
||||
SkipResult::Reached => {
|
||||
// we reach a facet we decided to collapse.
|
||||
let collapse_depth = facet_depth(facet_streamer.key());
|
||||
let mut collapsed_id = 0;
|
||||
self.current_segment_collapse_mapping.push(0);
|
||||
while facet_streamer.advance() {
|
||||
let depth = facet_depth(facet_streamer.key());
|
||||
if depth <= collapse_depth {
|
||||
continue 'outer;
|
||||
}
|
||||
if depth == collapse_depth + 1 {
|
||||
collapsed_id = self.current_collapse_facet_ords.len();
|
||||
self.current_collapse_facet_ords
|
||||
.push(facet_streamer.term_ord());
|
||||
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||
} else {
|
||||
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
SkipResult::End | SkipResult::OverStep => {
|
||||
self.current_segment_collapse_mapping.push(0);
|
||||
if !facet_streamer.advance() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn finalize_segment(&mut self) {
|
||||
if self.ff_reader.is_some() {
|
||||
self.segment_counters.push(SegmentFacetCounter {
|
||||
facet_reader: self.ff_reader.take().unwrap().into_inner(),
|
||||
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
|
||||
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the results of the collection.
|
||||
///
|
||||
/// This method does not just return the counters,
|
||||
/// it also translates the facet ordinals of the last segment.
|
||||
pub fn harvest(mut self) -> FacetCounts {
|
||||
self.finalize_segment();
|
||||
|
||||
let collapsed_facet_ords: Vec<&[u64]> = self
|
||||
.segment_counters
|
||||
pub fn harvest(self) -> FacetCounts {
|
||||
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_ords[..])
|
||||
.collect();
|
||||
let collapsed_facet_counts: Vec<&[u64]> = self
|
||||
.segment_counters
|
||||
let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_counts[..])
|
||||
.collect();
|
||||
|
||||
let facet_streams = self
|
||||
.segment_counters
|
||||
let facet_streams = self.segment_counters
|
||||
.iter()
|
||||
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
|
||||
.collect::<Vec<_>>();
|
||||
@@ -392,31 +327,92 @@ impl FacetCollector {
|
||||
}
|
||||
}
|
||||
|
||||
impl FacetSegmentCollector {
|
||||
fn into_segment_facet_counter(self) -> SegmentFacetCounter {
|
||||
SegmentFacetCounter {
|
||||
facet_reader: self.reader,
|
||||
facet_ords: self.collapse_facet_ords,
|
||||
facet_counts: self.counts,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for FacetCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.finalize_segment();
|
||||
type Child = FacetSegmentCollector;
|
||||
|
||||
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<FacetSegmentCollector> {
|
||||
let facet_reader = reader.facet_reader(self.field)?;
|
||||
self.set_collapse_mapping(&facet_reader);
|
||||
self.current_segment_counts
|
||||
.resize(self.current_collapse_facet_ords.len(), 0);
|
||||
self.ff_reader = Some(UnsafeCell::new(facet_reader));
|
||||
Ok(())
|
||||
|
||||
let mut collapse_mapping = Vec::new();
|
||||
let mut counts = Vec::new();
|
||||
let mut collapse_facet_ords = Vec::new();
|
||||
|
||||
let mut collapse_facet_it = self.facets.iter().peekable();
|
||||
collapse_facet_ords.push(0);
|
||||
{
|
||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
||||
if facet_streamer.advance() {
|
||||
'outer: loop {
|
||||
// at the begining of this loop, facet_streamer
|
||||
// is positionned on a term that has not been processed yet.
|
||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||
match skip_result {
|
||||
SkipResult::Reached => {
|
||||
// we reach a facet we decided to collapse.
|
||||
let collapse_depth = facet_depth(facet_streamer.key());
|
||||
let mut collapsed_id = 0;
|
||||
collapse_mapping.push(0);
|
||||
while facet_streamer.advance() {
|
||||
let depth = facet_depth(facet_streamer.key());
|
||||
if depth <= collapse_depth {
|
||||
continue 'outer;
|
||||
}
|
||||
if depth == collapse_depth + 1 {
|
||||
collapsed_id = collapse_facet_ords.len();
|
||||
collapse_facet_ords.push(facet_streamer.term_ord());
|
||||
collapse_mapping.push(collapsed_id);
|
||||
} else {
|
||||
collapse_mapping.push(collapsed_id);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
SkipResult::End | SkipResult::OverStep => {
|
||||
collapse_mapping.push(0);
|
||||
if !facet_streamer.advance() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
counts.resize(collapse_facet_ords.len(), 0);
|
||||
|
||||
Ok(FacetSegmentCollector {
|
||||
reader: facet_reader,
|
||||
facet_ords_buf: Vec::with_capacity(255),
|
||||
collapse_mapping,
|
||||
counts,
|
||||
collapse_facet_ords,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for FacetSegmentCollector {
|
||||
type CollectionResult = Vec<SegmentFacetCounter>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _: Score) {
|
||||
let facet_reader: &mut FacetReader = unsafe {
|
||||
&mut *self
|
||||
.ff_reader
|
||||
.as_ref()
|
||||
.expect("collect() was called before set_segment. This should never happen.")
|
||||
.get()
|
||||
};
|
||||
facet_reader.facet_ords(doc, &mut self.facet_ords);
|
||||
self.reader.facet_ords(doc, &mut self.facet_ords_buf);
|
||||
let mut previous_collapsed_ord: usize = usize::MAX;
|
||||
for &facet_ord in &self.facet_ords {
|
||||
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
|
||||
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
|
||||
{
|
||||
for &facet_ord in &self.facet_ords_buf {
|
||||
let collapsed_ord = self.collapse_mapping[facet_ord as usize];
|
||||
self.counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
@@ -425,8 +421,8 @@ impl Collector for FacetCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
fn finalize(self) -> Vec<SegmentFacetCounter> {
|
||||
vec![self.into_segment_facet_counter()]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -511,7 +507,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let num_facets: usize = 3 * 4 * 5;
|
||||
let facets: Vec<Facet> = (0..num_facets)
|
||||
.map(|mut n| {
|
||||
@@ -591,7 +587,7 @@ mod tests {
|
||||
.collect();
|
||||
thread_rng().shuffle(&mut docs[..]);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
@@ -648,7 +644,7 @@ mod bench {
|
||||
// 40425 docs
|
||||
thread_rng().shuffle(&mut docs[..]);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
|
||||
@@ -7,12 +7,15 @@ use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use query::Query;
|
||||
use Searcher;
|
||||
use downcast;
|
||||
|
||||
mod count_collector;
|
||||
pub use self::count_collector::CountCollector;
|
||||
|
||||
mod multi_collector;
|
||||
pub use self::multi_collector::MultiCollector;
|
||||
//mod multi_collector;
|
||||
//pub use self::multi_collector::MultiCollector;
|
||||
|
||||
mod top_collector;
|
||||
pub use self::top_collector::TopCollector;
|
||||
@@ -21,7 +24,7 @@ mod facet_collector;
|
||||
pub use self::facet_collector::FacetCollector;
|
||||
|
||||
mod chained_collector;
|
||||
pub use self::chained_collector::{chain, ChainedCollector};
|
||||
pub use self::chained_collector::chain;
|
||||
|
||||
/// Collectors are in charge of collecting and retaining relevant
|
||||
/// information from the document found and scored by the query.
|
||||
@@ -53,31 +56,90 @@ pub use self::chained_collector::{chain, ChainedCollector};
|
||||
///
|
||||
/// Segments are not guaranteed to be visited in any specific order.
|
||||
pub trait Collector {
|
||||
type Child : SegmentCollector + 'static;
|
||||
/// `set_segment` is called before beginning to enumerate
|
||||
/// on this segment.
|
||||
fn set_segment(
|
||||
fn for_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()>;
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score);
|
||||
) -> Result<Self::Child>;
|
||||
|
||||
/// Returns true iff the collector requires to compute scores for documents.
|
||||
fn requires_scoring(&self) -> bool;
|
||||
|
||||
/// Search works as follows :
|
||||
///
|
||||
/// First the weight object associated to the query is created.
|
||||
///
|
||||
/// Then, the query loops over the segments and for each segment :
|
||||
/// - setup the collector and informs it that the segment being processed has changed.
|
||||
/// - creates a SegmentCollector for collecting documents associated to the segment
|
||||
/// - creates a `Scorer` object associated for this segment
|
||||
/// - iterate through the matched documents and push them to the segment collector.
|
||||
/// - turn the segment collector into a Combinable segment result
|
||||
///
|
||||
/// Combining all of the segment results gives a single Child::CollectionResult, which is returned.
|
||||
///
|
||||
/// The result will be Ok(None) in case of having no segments.
|
||||
fn search(&mut self, searcher: &Searcher, query: &Query) -> Result<Option<<Self::Child as SegmentCollector>::CollectionResult>> {
|
||||
let scoring_enabled = self.requires_scoring();
|
||||
let weight = query.weight(searcher, scoring_enabled)?;
|
||||
let mut results = Vec::new();
|
||||
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
|
||||
let mut child: Self::Child = self.for_segment(segment_ord as SegmentLocalId, segment_reader)?;
|
||||
let mut scorer = weight.scorer(segment_reader)?;
|
||||
scorer.collect(&mut child, segment_reader.delete_bitset());
|
||||
results.push(child.finalize());
|
||||
}
|
||||
Ok(results.into_iter().fold1(|x,y| {
|
||||
x.combine_into(y);
|
||||
x
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
pub trait Combinable {
|
||||
fn combine_into(&mut self, other: Self);
|
||||
}
|
||||
|
||||
impl Combinable for () {
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Combinable for Vec<T> {
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
self.extend(other.into_iter());
|
||||
}
|
||||
}
|
||||
|
||||
impl<L: Combinable, R: Combinable> Combinable for (L, R) {
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
self.0.combine_into(other.0);
|
||||
self.1.combine_into(other.1);
|
||||
}
|
||||
}
|
||||
|
||||
pub trait SegmentCollector: downcast::Any + 'static {
|
||||
type CollectionResult: Combinable + downcast::Any + 'static;
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score);
|
||||
|
||||
/// Turn into the final result
|
||||
fn finalize(self) -> Self::CollectionResult;
|
||||
}
|
||||
|
||||
impl<'a, C: Collector> Collector for &'a mut C {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
type Child = C::Child;
|
||||
|
||||
fn for_segment(
|
||||
&mut self, // TODO Ask Jason : why &mut self here!?
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
(*self).set_segment(segment_local_id, segment)
|
||||
}
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
C::collect(self, doc, score)
|
||||
) -> Result<C::Child> {
|
||||
(*self).for_segment(segment_local_id, segment)
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
@@ -85,6 +147,61 @@ impl<'a, C: Collector> Collector for &'a mut C {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CollectorWrapper<'a, TCollector: 'a + Collector>(&'a mut TCollector);
|
||||
|
||||
impl<'a, T: 'a + Collector> CollectorWrapper<'a, T> {
|
||||
pub fn new(collector: &'a mut T) -> CollectorWrapper<'a, T> {
|
||||
CollectorWrapper(collector)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: 'a + Collector> Collector for CollectorWrapper<'a, T> {
|
||||
type Child = T::Child;
|
||||
|
||||
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<T::Child> {
|
||||
self.0.for_segment(segment_local_id, segment)
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.0.requires_scoring()
|
||||
}
|
||||
}
|
||||
|
||||
trait UntypedCollector {
|
||||
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<Box<UntypedSegmentCollector>>;
|
||||
}
|
||||
|
||||
|
||||
impl<'a, TCollector:'a + Collector> UntypedCollector for CollectorWrapper<'a, TCollector> {
|
||||
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<Box<UntypedSegmentCollector>> {
|
||||
let segment_collector = self.0.for_segment(segment_local_id, segment)?;
|
||||
Ok(Box::new(segment_collector))
|
||||
}
|
||||
}
|
||||
|
||||
trait UntypedSegmentCollector {
|
||||
fn finalize(self) -> Box<UntypedCombinable>;
|
||||
}
|
||||
|
||||
trait UntypedCombinable {
|
||||
fn combine_into(&mut self, other: Box<UntypedCombinable>);
|
||||
}
|
||||
|
||||
pub struct CombinableWrapper<'a, T: 'a + Combinable>(&'a mut T);
|
||||
|
||||
impl<'a, T: 'a + Combinable> CombinableWrapper<'a, T> {
|
||||
pub fn new(combinable: &'a mut T) -> CombinableWrapper<'a, T> {
|
||||
CombinableWrapper(combinable)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: 'a + Combinable> Combinable for CombinableWrapper<'a, T> {
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
self.0.combine_into(*::downcast::Downcast::<T>::downcast(other).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
@@ -102,8 +219,13 @@ pub mod tests {
|
||||
/// It is unusable in practise, as it does not store
|
||||
/// the segment ordinals
|
||||
pub struct TestCollector {
|
||||
next_offset: DocId,
|
||||
docs: Vec<DocId>,
|
||||
scores: Vec<Score>,
|
||||
}
|
||||
|
||||
pub struct TestSegmentCollector {
|
||||
offset: DocId,
|
||||
segment_max_doc: DocId,
|
||||
docs: Vec<DocId>,
|
||||
scores: Vec<Score>,
|
||||
}
|
||||
@@ -122,8 +244,7 @@ pub mod tests {
|
||||
impl Default for TestCollector {
|
||||
fn default() -> TestCollector {
|
||||
TestCollector {
|
||||
offset: 0,
|
||||
segment_max_doc: 0,
|
||||
next_offset: 0,
|
||||
docs: Vec::new(),
|
||||
scores: Vec::new(),
|
||||
}
|
||||
@@ -131,19 +252,33 @@ pub mod tests {
|
||||
}
|
||||
|
||||
impl Collector for TestCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.offset += self.segment_max_doc;
|
||||
self.segment_max_doc = reader.max_doc();
|
||||
Ok(())
|
||||
type Child = TestSegmentCollector;
|
||||
|
||||
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<TestSegmentCollector> {
|
||||
let offset = self.next_offset;
|
||||
self.next_offset += reader.max_doc();
|
||||
Ok(TestSegmentCollector {
|
||||
offset,
|
||||
docs: Vec::new(),
|
||||
scores: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for TestSegmentCollector {
|
||||
type CollectionResult = Vec<TestSegmentCollector>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.docs.push(doc + self.offset);
|
||||
self.scores.push(score);
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
fn finalize(self) -> Vec<TestSegmentCollector> {
|
||||
vec![self]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,17 +287,26 @@ pub mod tests {
|
||||
///
|
||||
/// This collector is mainly useful for tests.
|
||||
pub struct FastFieldTestCollector {
|
||||
vals: Vec<u64>,
|
||||
next_counter: usize,
|
||||
field: Field,
|
||||
ff_reader: Option<FastFieldReader<u64>>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct FastFieldSegmentCollectorState {
|
||||
counter: usize,
|
||||
vals: Vec<u64>,
|
||||
}
|
||||
|
||||
pub struct FastFieldSegmentCollector {
|
||||
state: FastFieldSegmentCollectorState,
|
||||
reader: FastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
pub fn for_field(field: Field) -> FastFieldTestCollector {
|
||||
FastFieldTestCollector {
|
||||
vals: Vec::new(),
|
||||
next_counter: 0,
|
||||
field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -172,20 +316,35 @@ pub mod tests {
|
||||
}
|
||||
|
||||
impl Collector for FastFieldTestCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
type Child = FastFieldSegmentCollector;
|
||||
|
||||
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<FastFieldSegmentCollector> {
|
||||
let counter = self.next_counter;
|
||||
self.next_counter += 1;
|
||||
Ok(FastFieldSegmentCollector {
|
||||
state: FastFieldSegmentCollectorState::default(),
|
||||
reader: reader.fast_field_reader(self.field)?,
|
||||
})
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
let val = self.ff_reader.as_ref().unwrap().get(doc);
|
||||
self.vals.push(val);
|
||||
}
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for FastFieldSegmentCollector {
|
||||
type CollectionResult = Vec<FastFieldSegmentCollectorState>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
let val = self.reader.get(doc);
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
fn finalize(self) -> Vec<FastFieldSegmentCollectorState> {
|
||||
vec![self.state]
|
||||
}
|
||||
}
|
||||
|
||||
/// Collects in order all of the fast field bytes for all of the
|
||||
/// docs in the `DocSet`
|
||||
///
|
||||
@@ -193,7 +352,11 @@ pub mod tests {
|
||||
pub struct BytesFastFieldTestCollector {
|
||||
vals: Vec<u8>,
|
||||
field: Field,
|
||||
ff_reader: Option<BytesFastFieldReader>,
|
||||
}
|
||||
|
||||
pub struct BytesFastFieldSegmentCollector {
|
||||
vals: Vec<u8>,
|
||||
reader: BytesFastFieldReader,
|
||||
}
|
||||
|
||||
impl BytesFastFieldTestCollector {
|
||||
@@ -201,7 +364,6 @@ pub mod tests {
|
||||
BytesFastFieldTestCollector {
|
||||
vals: Vec::new(),
|
||||
field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -211,20 +373,32 @@ pub mod tests {
|
||||
}
|
||||
|
||||
impl Collector for BytesFastFieldTestCollector {
|
||||
fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
}
|
||||
type Child = BytesFastFieldSegmentCollector;
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let val = self.ff_reader.as_ref().unwrap().get_val(doc);
|
||||
self.vals.extend(val);
|
||||
fn for_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<BytesFastFieldSegmentCollector> {
|
||||
Ok(BytesFastFieldSegmentCollector {
|
||||
vals: Vec::new(),
|
||||
reader: segment.bytes_fast_field_reader(self.field)?,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for BytesFastFieldSegmentCollector {
|
||||
type CollectionResult = Vec<Vec<u8>>;
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let val = self.reader.get_val(doc);
|
||||
self.vals.extend(val);
|
||||
}
|
||||
|
||||
fn finalize(self) -> Vec<Vec<u8>> {
|
||||
vec![self.vals]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
|
||||
@@ -1,119 +1,122 @@
|
||||
use super::Collector;
|
||||
use super::SegmentCollector;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use Result;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use downcast::Downcast;
|
||||
|
||||
/// Multicollector makes it possible to collect on more than one collector.
|
||||
/// It should only be used for use cases where the Collector types is unknown
|
||||
/// at compile time.
|
||||
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, MultiCollector};
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors =
|
||||
/// MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct MultiCollector<'a> {
|
||||
collectors: Vec<&'a mut Collector>,
|
||||
collector_wrappers: Vec<Box<UntypedCollector + 'a>>
|
||||
}
|
||||
|
||||
impl<'a> MultiCollector<'a> {
|
||||
/// Constructor
|
||||
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
|
||||
MultiCollector { collectors }
|
||||
pub fn new() -> MultiCollector<'a> {
|
||||
MultiCollector {
|
||||
collector_wrappers: Vec::new()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_collector<TCollector: 'a + Collector>(&mut self, collector: &'a mut TCollector) {
|
||||
let collector_wrapper = CollectorWrapper(collector);
|
||||
self.collector_wrappers.push(Box::new(collector_wrapper));
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Collector for MultiCollector<'a> {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
for collector in &mut self.collectors {
|
||||
collector.set_segment(segment_local_id, segment)?;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
type Child = MultiCollectorChild;
|
||||
|
||||
fn for_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<MultiCollectorChild> {
|
||||
let children = self.collector_wrappers
|
||||
.iter_mut()
|
||||
.map(|collector_wrapper| {
|
||||
collector_wrapper.for_segment(segment_local_id, segment)
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
Ok(MultiCollectorChild {
|
||||
children
|
||||
})
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
for collector in &mut self.collectors {
|
||||
collector.collect(doc, score);
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collector_wrappers
|
||||
.iter()
|
||||
.any(|c| c.requires_scoring())
|
||||
}
|
||||
|
||||
fn merge_children(&mut self, children: Vec<MultiCollectorChild>) {
|
||||
let mut per_collector_children: Vec<Vec<Box<SegmentCollector>>> =
|
||||
(0..self.collector_wrappers.len())
|
||||
.map(|_| Vec::with_capacity(children.len()))
|
||||
.collect::<Vec<_>>();
|
||||
for child in children {
|
||||
for (idx, segment_collector) in child.children.into_iter().enumerate() {
|
||||
per_collector_children[idx].push(segment_collector);
|
||||
}
|
||||
}
|
||||
for (collector, children) in self.collector_wrappers.iter_mut().zip(per_collector_children) {
|
||||
collector.merge_children_anys(children);
|
||||
}
|
||||
}
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collectors
|
||||
.iter()
|
||||
.any(|collector| collector.requires_scoring())
|
||||
|
||||
}
|
||||
|
||||
pub struct MultiCollectorChild {
|
||||
children: Vec<Box<SegmentCollector>>
|
||||
}
|
||||
|
||||
impl SegmentCollector for MultiCollectorChild {
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
for child in &mut self.children {
|
||||
child.collect(doc, score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::{Collector, CountCollector, TopCollector};
|
||||
use schema::{TEXT, SchemaBuilder};
|
||||
use query::TermQuery;
|
||||
use Index;
|
||||
use Term;
|
||||
use schema::IndexRecordOption;
|
||||
|
||||
#[test]
|
||||
fn test_multi_collector() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text=>"abc"));
|
||||
index_writer.add_document(doc!(text=>"abc abc abc"));
|
||||
index_writer.add_document(doc!(text=>"abc abc"));
|
||||
index_writer.commit().unwrap();
|
||||
index_writer.add_document(doc!(text=>""));
|
||||
index_writer.add_document(doc!(text=>"abc abc abc abc"));
|
||||
index_writer.add_document(doc!(text=>"abc"));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let term = Term::from_field_text(text, "abc");
|
||||
let query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
let mut top_collector = TopCollector::with_limit(2);
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut collectors =
|
||||
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
let mut collectors = MultiCollector::new();
|
||||
collectors.add_collector(&mut top_collector);
|
||||
collectors.add_collector(&mut count_collector);
|
||||
collectors.search(&*searcher, &query).unwrap();
|
||||
}
|
||||
assert_eq!(count_collector.count(), 3);
|
||||
assert!(top_collector.at_capacity());
|
||||
assert_eq!(count_collector.count(), 5);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,8 @@ use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use collector::SegmentCollector;
|
||||
use collector::Combinable;
|
||||
|
||||
// Rust heap is a max-heap and we need a min heap.
|
||||
#[derive(Clone, Copy)]
|
||||
@@ -43,61 +45,7 @@ impl Eq for GlobalScoredDoc {}
|
||||
/// with the best scores.
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theorical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result, DocId, Score};
|
||||
/// use tantivy::collector::TopCollector;
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut top_collector).unwrap();
|
||||
///
|
||||
/// let score_docs: Vec<(Score, DocId)> = top_collector
|
||||
/// .score_docs()
|
||||
/// .into_iter()
|
||||
/// .map(|(score, doc_address)| (score, doc_address.doc()))
|
||||
/// .collect();
|
||||
///
|
||||
/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
/// The theorical complexity is `O(n log K)`.
|
||||
pub struct TopCollector {
|
||||
limit: usize,
|
||||
heap: BinaryHeap<GlobalScoredDoc>,
|
||||
@@ -153,21 +101,42 @@ impl TopCollector {
|
||||
}
|
||||
|
||||
impl Collector for TopCollector {
|
||||
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
self.segment_id = segment_id;
|
||||
Ok(())
|
||||
type Child = TopCollector;
|
||||
|
||||
fn for_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<TopCollector> {
|
||||
Ok(TopCollector {
|
||||
limit: self.limit,
|
||||
heap: BinaryHeap::new(),
|
||||
segment_id,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl Combinable for TopCollector {
|
||||
// TODO: I think this could be a bit better
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
self.segment_id = other.segment_id;
|
||||
while let Some(doc) = other.heap.pop() {
|
||||
self.collect(doc.doc_address.doc(), doc.score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for TopCollector {
|
||||
type CollectionResult = TopCollector;
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
let limit_doc: GlobalScoredDoc = *self
|
||||
.heap
|
||||
let limit_doc: GlobalScoredDoc = *self.heap
|
||||
.peek()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
if limit_doc.score < score {
|
||||
let mut mut_head = self
|
||||
.heap
|
||||
let mut mut_head = self.heap
|
||||
.peek_mut()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
mut_head.score = score;
|
||||
@@ -182,8 +151,8 @@ impl Collector for TopCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
fn finalize(self) -> TopCollector {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
@@ -191,7 +160,6 @@ impl Collector for TopCollector {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::Collector;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
@@ -242,5 +210,4 @@ mod tests {
|
||||
fn test_top_0() {
|
||||
TopCollector::with_limit(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -46,7 +46,7 @@ impl BitPacker {
|
||||
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
if self.mini_buffer_written > 0 {
|
||||
let num_bytes = (self.mini_buffer_written + 7) / 8;
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
|
||||
output.write_all(&arr[..num_bytes])?;
|
||||
self.mini_buffer_written = 0;
|
||||
}
|
||||
@@ -98,14 +98,31 @@ where
|
||||
let addr_in_bits = idx * num_bits;
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 =
|
||||
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
if cfg!(feature = "simdcompression") {
|
||||
// for simdcompression,
|
||||
// the bitpacker is only used for fastfields,
|
||||
// and we expect them to be always padded.
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 =
|
||||
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
} else {
|
||||
let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
|
||||
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }
|
||||
} else {
|
||||
let mut buffer = [0u8; 8];
|
||||
for i in addr..data.len() {
|
||||
buffer[i - addr] += data[i];
|
||||
}
|
||||
unsafe { ptr::read_unaligned(buffer[..].as_ptr() as *const u64) }
|
||||
};
|
||||
let val_shifted = val_unshifted_unmasked >> (bit_shift as u64);
|
||||
val_shifted & mask
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads a range of values from the fast field.
|
||||
|
||||
@@ -72,8 +72,7 @@ impl<W: Write> CompositeWrite<W> {
|
||||
let footer_offset = self.write.written_bytes();
|
||||
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
|
||||
|
||||
let mut offset_fields: Vec<_> = self
|
||||
.offsets
|
||||
let mut offset_fields: Vec<_> = self.offsets
|
||||
.iter()
|
||||
.map(|(file_addr, offset)| (*offset, *file_addr))
|
||||
.collect();
|
||||
|
||||
@@ -34,8 +34,7 @@ impl BlockEncoder {
|
||||
let num_bits = self.bitpacker.num_bits_sorted(offset, block);
|
||||
self.output[0] = num_bits;
|
||||
let written_size =
|
||||
1 + self
|
||||
.bitpacker
|
||||
1 + self.bitpacker
|
||||
.compress_sorted(offset, block, &mut self.output[1..], num_bits);
|
||||
&self.output[..written_size]
|
||||
}
|
||||
@@ -43,8 +42,7 @@ impl BlockEncoder {
|
||||
pub fn compress_block_unsorted(&mut self, block: &[u32]) -> &[u8] {
|
||||
let num_bits = self.bitpacker.num_bits(block);
|
||||
self.output[0] = num_bits;
|
||||
let written_size = 1 + self
|
||||
.bitpacker
|
||||
let written_size = 1 + self.bitpacker
|
||||
.compress(block, &mut self.output[1..], num_bits);
|
||||
&self.output[..written_size]
|
||||
}
|
||||
@@ -85,8 +83,7 @@ impl BlockDecoder {
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
|
||||
let num_bits = compressed_data[0];
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
1 + self
|
||||
.bitpacker
|
||||
1 + self.bitpacker
|
||||
.decompress(&compressed_data[1..], &mut self.output, num_bits)
|
||||
}
|
||||
|
||||
|
||||
@@ -42,8 +42,7 @@ impl CompressedIntStream {
|
||||
// no need to read.
|
||||
self.cached_next_addr
|
||||
} else {
|
||||
let next_addr = addr + self
|
||||
.block_decoder
|
||||
let next_addr = addr + self.block_decoder
|
||||
.uncompress_block_unsorted(self.buffer.slice_from(addr));
|
||||
self.cached_addr = addr;
|
||||
self.cached_next_addr = next_addr;
|
||||
|
||||
@@ -21,7 +21,6 @@ use directory::ManagedDirectory;
|
||||
use directory::MmapDirectory;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use indexer::index_writer::open_index_writer;
|
||||
use indexer::index_writer::HEAP_SIZE_MIN;
|
||||
use indexer::segment_updater::save_new_metas;
|
||||
use indexer::DirectoryLock;
|
||||
use num_cpus;
|
||||
@@ -52,7 +51,12 @@ impl Index {
|
||||
/// This should only be used for unit tests.
|
||||
pub fn create_in_ram(schema: Schema) -> Index {
|
||||
let ram_directory = RAMDirectory::create();
|
||||
Index::create(ram_directory, schema).expect("Creating a RAMDirectory should never fail")
|
||||
// unwrap is ok here
|
||||
let directory = ManagedDirectory::new(ram_directory).expect(
|
||||
"Creating a managed directory from a brand new RAM directory \
|
||||
should never fail.",
|
||||
);
|
||||
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
|
||||
}
|
||||
|
||||
/// Creates a new index in a given filepath.
|
||||
@@ -60,9 +64,15 @@ impl Index {
|
||||
///
|
||||
/// If a previous index was in this directory, then its meta file will be destroyed.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
|
||||
pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
Index::create(mmap_directory, schema)
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
/// Accessor for the tokenizer manager.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
&self.tokenizers
|
||||
}
|
||||
|
||||
/// Creates a new index in a temp directory.
|
||||
@@ -76,22 +86,10 @@ impl Index {
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::create_from_tempdir()?;
|
||||
Index::create(mmap_directory, schema)
|
||||
}
|
||||
|
||||
/// Creates a new index given an implementation of the trait `Directory`
|
||||
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(dir)?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
/// Create a new index from a directory.
|
||||
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
/// Creates a new index given a directory and an `IndexMeta`.
|
||||
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
|
||||
let schema = metas.schema.clone();
|
||||
@@ -105,22 +103,24 @@ impl Index {
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
/// Accessor for the tokenizer manager.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
&self.tokenizers
|
||||
/// Open the index using the provided directory
|
||||
pub fn open_directory<D: Directory>(directory: D) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
/// Opens a new directory from an index path.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
Index::open(mmap_directory)
|
||||
Index::open_directory(mmap_directory)
|
||||
}
|
||||
|
||||
/// Open the index using the provided directory
|
||||
pub fn open<D: Directory>(directory: D) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
/// Create a new index from a directory.
|
||||
pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
@@ -137,13 +137,9 @@ impl Index {
|
||||
/// `IndexWriter` on the system is accessing the index directory,
|
||||
/// it is safe to manually delete the lockfile.
|
||||
///
|
||||
/// - `num_threads` defines the number of indexing workers that
|
||||
/// num_threads specifies the number of indexing workers that
|
||||
/// should work at the same time.
|
||||
///
|
||||
/// - `overall_heap_size_in_bytes` sets the amount of memory
|
||||
/// allocated for all indexing thread.
|
||||
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
|
||||
///
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
@@ -151,35 +147,21 @@ impl Index {
|
||||
pub fn writer_with_num_threads(
|
||||
&self,
|
||||
num_threads: usize,
|
||||
overall_heap_size_in_bytes: usize,
|
||||
heap_size_in_bytes: usize,
|
||||
) -> Result<IndexWriter> {
|
||||
let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
open_index_writer(
|
||||
self,
|
||||
num_threads,
|
||||
heap_size_in_bytes_per_thread,
|
||||
directory_lock,
|
||||
)
|
||||
open_index_writer(self, num_threads, heap_size_in_bytes, directory_lock)
|
||||
}
|
||||
|
||||
/// Creates a multithreaded writer
|
||||
///
|
||||
/// Tantivy will automatically define the number of threads to use.
|
||||
/// `overall_heap_size_in_bytes` is the total target memory usage that will be split
|
||||
/// between a given number of threads.
|
||||
/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
|
||||
///
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> Result<IndexWriter> {
|
||||
let mut num_threads = num_cpus::get();
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
num_threads = (overall_heap_size_in_bytes / HEAP_SIZE_MIN).max(1);
|
||||
}
|
||||
self.writer_with_num_threads(num_threads, overall_heap_size_in_bytes)
|
||||
pub fn writer(&self, heap_size_in_bytes: usize) -> Result<IndexWriter> {
|
||||
self.writer_with_num_threads(num_cpus::get(), heap_size_in_bytes)
|
||||
}
|
||||
|
||||
/// Accessor to the index schema
|
||||
@@ -191,8 +173,7 @@ impl Index {
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
Ok(self.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect())
|
||||
@@ -227,8 +208,7 @@ impl Index {
|
||||
|
||||
/// Returns the list of segment ids that are searchable.
|
||||
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
Ok(self.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect())
|
||||
|
||||
@@ -87,8 +87,7 @@ impl<T> Deref for LeasedItem<T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &T {
|
||||
&self
|
||||
.gen_item
|
||||
&self.gen_item
|
||||
.as_ref()
|
||||
.expect("Unwrapping a leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
@@ -97,8 +96,7 @@ impl<T> Deref for LeasedItem<T> {
|
||||
|
||||
impl<T> DerefMut for LeasedItem<T> {
|
||||
fn deref_mut(&mut self) -> &mut T {
|
||||
&mut self
|
||||
.gen_item
|
||||
&mut self.gen_item
|
||||
.as_mut()
|
||||
.expect("Unwrapping a mut leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
|
||||
@@ -73,13 +73,12 @@ impl Searcher {
|
||||
|
||||
/// Runs a query on the segment readers wrapped by the searcher
|
||||
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
|
||||
query.search(self, collector)
|
||||
collector.search(self, query)
|
||||
}
|
||||
|
||||
/// Return the field searcher associated to a `Field`.
|
||||
pub fn field(&self, field: Field) -> FieldSearcher {
|
||||
let inv_index_readers = self
|
||||
.segment_readers
|
||||
let inv_index_readers = self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.inverted_index(field))
|
||||
.collect::<Vec<_>>();
|
||||
@@ -99,8 +98,7 @@ impl FieldSearcher {
|
||||
/// Returns a Stream over all of the sorted unique terms of
|
||||
/// for the given field.
|
||||
pub fn terms(&self) -> TermMerger {
|
||||
let term_streamers: Vec<_> = self
|
||||
.inv_index_readers
|
||||
let term_streamers: Vec<_> = self.inv_index_readers
|
||||
.iter()
|
||||
.map(|inverted_index| inverted_index.terms().stream())
|
||||
.collect();
|
||||
@@ -110,8 +108,7 @@ impl FieldSearcher {
|
||||
|
||||
impl fmt::Debug for Searcher {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let segment_ids = self
|
||||
.segment_readers
|
||||
let segment_ids = self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.segment_id())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
@@ -156,13 +156,11 @@ impl SegmentReader {
|
||||
&FieldType::Bytes => {}
|
||||
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
|
||||
}
|
||||
let idx_reader = self
|
||||
.fast_fields_composite
|
||||
let idx_reader = self.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let values = self
|
||||
.fast_fields_composite
|
||||
let values = self.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||
Ok(BytesFastFieldReader::open(idx_reader, values))
|
||||
@@ -274,8 +272,7 @@ impl SegmentReader {
|
||||
/// term dictionary associated to a specific field,
|
||||
/// and opening the posting list associated to any term.
|
||||
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
|
||||
if let Some(inv_idx_reader) = self
|
||||
.inv_idx_reader_cache
|
||||
if let Some(inv_idx_reader) = self.inv_idx_reader_cache
|
||||
.read()
|
||||
.expect("Lock poisoned. This should never happen")
|
||||
.get(&field)
|
||||
@@ -304,13 +301,11 @@ impl SegmentReader {
|
||||
|
||||
let postings_source = postings_source_opt.unwrap();
|
||||
|
||||
let termdict_source = self
|
||||
.termdict_composite
|
||||
let termdict_source = self.termdict_composite
|
||||
.open_read(field)
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
|
||||
|
||||
let positions_source = self
|
||||
.positions_composite
|
||||
let positions_source = self.positions_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
|
||||
|
||||
4
src/datastruct/mod.rs
Normal file
4
src/datastruct/mod.rs
Normal file
@@ -0,0 +1,4 @@
|
||||
mod skip;
|
||||
pub mod stacker;
|
||||
|
||||
pub use self::skip::{SkipList, SkipListBuilder};
|
||||
@@ -72,8 +72,7 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
let mut skip_pointer = self.data_layer.insert(key, dest)?;
|
||||
loop {
|
||||
skip_pointer = match skip_pointer {
|
||||
Some((skip_doc_id, skip_offset)) => self
|
||||
.get_skip_layer(layer_id)
|
||||
Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id)
|
||||
.insert(skip_doc_id, &skip_offset)?,
|
||||
None => {
|
||||
return Ok(());
|
||||
168
src/datastruct/stacker/expull.rs
Normal file
168
src/datastruct/stacker/expull.rs
Normal file
@@ -0,0 +1,168 @@
|
||||
use super::heap::{Heap, HeapAllocable};
|
||||
use std::mem;
|
||||
|
||||
#[inline]
|
||||
pub fn is_power_of_2(val: u32) -> bool {
|
||||
val & (val - 1) == 0
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn jump_needed(val: u32) -> bool {
|
||||
val > 3 && is_power_of_2(val)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ExpUnrolledLinkedList {
|
||||
len: u32,
|
||||
end: u32,
|
||||
val0: u32,
|
||||
val1: u32,
|
||||
val2: u32,
|
||||
next: u32, // inline of the first block
|
||||
}
|
||||
|
||||
impl ExpUnrolledLinkedList {
|
||||
pub fn iter<'a>(&self, addr: u32, heap: &'a Heap) -> ExpUnrolledLinkedListIterator<'a> {
|
||||
ExpUnrolledLinkedListIterator {
|
||||
heap,
|
||||
addr: addr + 2u32 * (mem::size_of::<u32>() as u32),
|
||||
len: self.len,
|
||||
consumed: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push(&mut self, val: u32, heap: &Heap) {
|
||||
self.len += 1;
|
||||
if jump_needed(self.len) {
|
||||
// we need to allocate another block.
|
||||
// ... As we want to grow block exponentially
|
||||
// the next block as a size of (length so far),
|
||||
// and we need to add 1u32 to store the pointer
|
||||
// to the next element.
|
||||
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
|
||||
let new_block_addr: u32 = heap.allocate_space(new_block_size);
|
||||
heap.set(self.end, &new_block_addr);
|
||||
self.end = new_block_addr;
|
||||
}
|
||||
heap.set(self.end, &val);
|
||||
self.end += mem::size_of::<u32>() as u32;
|
||||
}
|
||||
}
|
||||
|
||||
impl HeapAllocable for u32 {
|
||||
fn with_addr(_addr: u32) -> u32 {
|
||||
0u32
|
||||
}
|
||||
}
|
||||
|
||||
impl HeapAllocable for ExpUnrolledLinkedList {
|
||||
fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
|
||||
let last_addr = addr + mem::size_of::<u32>() as u32 * 2u32;
|
||||
ExpUnrolledLinkedList {
|
||||
len: 0u32,
|
||||
end: last_addr,
|
||||
val0: 0u32,
|
||||
val1: 0u32,
|
||||
val2: 0u32,
|
||||
next: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExpUnrolledLinkedListIterator<'a> {
|
||||
heap: &'a Heap,
|
||||
addr: u32,
|
||||
len: u32,
|
||||
consumed: u32,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.consumed == self.len {
|
||||
None
|
||||
} else {
|
||||
let addr: u32;
|
||||
self.consumed += 1;
|
||||
if jump_needed(self.consumed) {
|
||||
addr = *self.heap.get_mut_ref(self.addr);
|
||||
} else {
|
||||
addr = self.addr;
|
||||
}
|
||||
self.addr = addr + mem::size_of::<u32>() as u32;
|
||||
Some(*self.heap.get_mut_ref(addr))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::super::heap::Heap;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_stack() {
|
||||
let heap = Heap::with_capacity(1_000_000);
|
||||
let (addr, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
|
||||
stack.push(1u32, &heap);
|
||||
stack.push(2u32, &heap);
|
||||
stack.push(4u32, &heap);
|
||||
stack.push(8u32, &heap);
|
||||
{
|
||||
let mut it = stack.iter(addr, &heap);
|
||||
assert_eq!(it.next().unwrap(), 1u32);
|
||||
assert_eq!(it.next().unwrap(), 2u32);
|
||||
assert_eq!(it.next().unwrap(), 4u32);
|
||||
assert_eq!(it.next().unwrap(), 8u32);
|
||||
assert!(it.next().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::ExpUnrolledLinkedList;
|
||||
use super::Heap;
|
||||
use test::Bencher;
|
||||
|
||||
const NUM_STACK: usize = 10_000;
|
||||
const STACK_SIZE: u32 = 1000;
|
||||
|
||||
#[bench]
|
||||
fn bench_push_vec(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut vecs = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
vecs.push(Vec::new());
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
vecs[t].push(i);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_push_stack(bench: &mut Bencher) {
|
||||
let heap = Heap::with_capacity(64_000_000);
|
||||
bench.iter(|| {
|
||||
let mut stacks = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
|
||||
stacks.push(stack);
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
stacks[t].push(i, &heap);
|
||||
}
|
||||
}
|
||||
heap.clear();
|
||||
});
|
||||
}
|
||||
}
|
||||
335
src/datastruct/stacker/hashmap.rs
Normal file
335
src/datastruct/stacker/hashmap.rs
Normal file
@@ -0,0 +1,335 @@
|
||||
use super::heap::{BytesRef, Heap, HeapAllocable};
|
||||
use postings::UnorderedTermId;
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
use std::slice;
|
||||
|
||||
mod murmurhash2 {
|
||||
|
||||
const SEED: u32 = 3_242_157_231u32;
|
||||
const M: u32 = 0x5bd1_e995;
|
||||
|
||||
#[inline(always)]
|
||||
pub fn murmurhash2(key: &[u8]) -> u32 {
|
||||
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
|
||||
let len = key.len() as u32;
|
||||
let mut h: u32 = SEED ^ len;
|
||||
|
||||
let num_blocks = len >> 2;
|
||||
for _ in 0..num_blocks {
|
||||
let mut k: u32 = unsafe { *key_ptr }; // ok because of num_blocks definition
|
||||
k = k.wrapping_mul(M);
|
||||
k ^= k >> 24;
|
||||
k = k.wrapping_mul(M);
|
||||
h = h.wrapping_mul(M);
|
||||
h ^= k;
|
||||
key_ptr = key_ptr.wrapping_offset(1);
|
||||
}
|
||||
|
||||
// Handle the last few bytes of the input array
|
||||
let remaining: &[u8] = &key[key.len() & !3..];
|
||||
match remaining.len() {
|
||||
3 => {
|
||||
h ^= u32::from(remaining[2]) << 16;
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
2 => {
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
1 => {
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(M);
|
||||
h ^ (h >> 15)
|
||||
}
|
||||
}
|
||||
|
||||
/// Split the thread memory budget into
|
||||
/// - the heap size
|
||||
/// - the hash table "table" itself.
|
||||
///
|
||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||
pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
|
||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||
let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
|
||||
let table_num_bits: usize = (1..)
|
||||
.into_iter()
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.last()
|
||||
.expect(&format!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
));
|
||||
let table_size = compute_table_size(table_num_bits);
|
||||
let heap_size = per_thread_memory_budget - table_size;
|
||||
(heap_size, table_num_bits)
|
||||
}
|
||||
|
||||
/// `KeyValue` is the item stored in the hash table.
|
||||
/// The key is actually a `BytesRef` object stored in an external heap.
|
||||
/// The `value_addr` also points to an address in the heap.
|
||||
///
|
||||
/// The key and the value are actually stored contiguously.
|
||||
/// For this reason, the (start, stop) information is actually redundant
|
||||
/// and can be simplified in the future
|
||||
#[derive(Copy, Clone, Default)]
|
||||
struct KeyValue {
|
||||
key_value_addr: BytesRef,
|
||||
hash: u32,
|
||||
}
|
||||
|
||||
impl KeyValue {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.key_value_addr.is_null()
|
||||
}
|
||||
}
|
||||
|
||||
/// Customized `HashMap` with string keys
|
||||
///
|
||||
/// This `HashMap` takes String as keys. Keys are
|
||||
/// stored in a user defined heap.
|
||||
///
|
||||
/// The quirky API has the benefit of avoiding
|
||||
/// the computation of the hash of the key twice,
|
||||
/// or copying the key as long as there is no insert.
|
||||
///
|
||||
pub struct TermHashMap<'a> {
|
||||
table: Box<[KeyValue]>,
|
||||
heap: &'a Heap,
|
||||
mask: usize,
|
||||
occupied: Vec<usize>,
|
||||
}
|
||||
|
||||
struct QuadraticProbing {
|
||||
hash: usize,
|
||||
i: usize,
|
||||
mask: usize,
|
||||
}
|
||||
|
||||
impl QuadraticProbing {
|
||||
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
|
||||
QuadraticProbing { hash, i: 0, mask }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next_probe(&mut self) -> usize {
|
||||
self.i += 1;
|
||||
(self.hash + self.i * self.i) & self.mask
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Iter<'a: 'b, 'b> {
|
||||
hashmap: &'b TermHashMap<'a>,
|
||||
inner: slice::Iter<'a, usize>,
|
||||
}
|
||||
|
||||
impl<'a, 'b> Iterator for Iter<'a, 'b> {
|
||||
type Item = (&'b [u8], u32, UnorderedTermId);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.inner.next().cloned().map(move |bucket: usize| {
|
||||
let kv = self.hashmap.table[bucket];
|
||||
let (key, offset): (&'b [u8], u32) = self.hashmap.get_key_value(kv.key_value_addr);
|
||||
(key, offset, bucket as UnorderedTermId)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TermHashMap<'a> {
|
||||
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
|
||||
let table_size = 1 << num_bucket_power_of_2;
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
|
||||
TermHashMap {
|
||||
table: table.into_boxed_slice(),
|
||||
heap,
|
||||
mask: table_size - 1,
|
||||
occupied: Vec::with_capacity(table_size / 2),
|
||||
}
|
||||
}
|
||||
|
||||
fn probe(&self, hash: u32) -> QuadraticProbing {
|
||||
QuadraticProbing::compute(hash as usize, self.mask)
|
||||
}
|
||||
|
||||
pub fn is_saturated(&self) -> bool {
|
||||
self.table.len() < self.occupied.len() * 3
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn get_key_value(&self, bytes_ref: BytesRef) -> (&[u8], u32) {
|
||||
let key_bytes: &[u8] = self.heap.get_slice(bytes_ref);
|
||||
let expull_addr: u32 = bytes_ref.addr() + 2 + key_bytes.len() as u32;
|
||||
(key_bytes, expull_addr)
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
|
||||
self.occupied.push(bucket);
|
||||
self.table[bucket] = KeyValue {
|
||||
key_value_addr,
|
||||
hash,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn iter<'b: 'a>(&'b self) -> Iter<'a, 'b> {
|
||||
Iter {
|
||||
inner: self.occupied.iter(),
|
||||
hashmap: &self,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
|
||||
&mut self,
|
||||
key: S,
|
||||
) -> (UnorderedTermId, &mut V) {
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let hash = murmurhash2::murmurhash2(key.as_ref());
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
let key_bytes_ref = self.heap.allocate_and_set(key_bytes);
|
||||
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
|
||||
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
|
||||
self.set_bucket(hash, key_bytes_ref, bucket);
|
||||
return (bucket as UnorderedTermId, val);
|
||||
} else if kv.hash == hash {
|
||||
let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
|
||||
if stored_key == key_bytes {
|
||||
return (
|
||||
bucket as UnorderedTermId,
|
||||
self.heap.get_mut_ref(expull_addr),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn bench_murmurhash2(b: &mut Bencher) {
|
||||
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
||||
b.iter(|| {
|
||||
let mut s = 0;
|
||||
for &key in &keys {
|
||||
s ^= murmurhash2(key.as_bytes());
|
||||
}
|
||||
s
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::super::heap::{Heap, HeapAllocable};
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use super::split_memory;
|
||||
use super::*;
|
||||
use std::collections::HashSet;
|
||||
|
||||
struct TestValue {
|
||||
val: u32,
|
||||
_addr: u32,
|
||||
}
|
||||
|
||||
impl HeapAllocable for TestValue {
|
||||
fn with_addr(addr: u32) -> TestValue {
|
||||
TestValue {
|
||||
val: 0u32,
|
||||
_addr: addr,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(split_memory(100_000), (67232, 12));
|
||||
assert_eq!(split_memory(1_000_000), (737856, 15));
|
||||
assert_eq!(split_memory(10_000_000), (7902848, 18));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hash_map() {
|
||||
let heap = Heap::with_capacity(2_000_000);
|
||||
let mut hash_map: TermHashMap = TermHashMap::new(18, &heap);
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc").1;
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 3u32;
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 4u32;
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc").1;
|
||||
assert_eq!(v.val, 3u32);
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
|
||||
assert_eq!(v.val, 4u32);
|
||||
}
|
||||
let mut iter_values = hash_map.iter();
|
||||
{
|
||||
let (_, addr, _) = iter_values.next().unwrap();
|
||||
let val: &TestValue = heap.get_ref(addr);
|
||||
assert_eq!(val.val, 3u32);
|
||||
}
|
||||
{
|
||||
let (_, addr, _) = iter_values.next().unwrap();
|
||||
let val: &TestValue = heap.get_ref(addr);
|
||||
assert_eq!(val.val, 4u32);
|
||||
}
|
||||
assert!(iter_values.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur() {
|
||||
let s1 = "abcdef";
|
||||
let s2 = "abcdeg";
|
||||
for i in 0..5 {
|
||||
assert_eq!(
|
||||
murmurhash2(&s1[i..5].as_bytes()),
|
||||
murmurhash2(&s2[i..5].as_bytes())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_against_reference_impl() {
|
||||
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
|
||||
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
|
||||
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
|
||||
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
|
||||
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
|
||||
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
|
||||
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_collisions() {
|
||||
let mut set: HashSet<u32> = HashSet::default();
|
||||
for i in 0..10_000 {
|
||||
let s = format!("hash{}", i);
|
||||
let hash = murmurhash2(s.as_bytes());
|
||||
set.insert(hash);
|
||||
}
|
||||
assert_eq!(set.len(), 10_000);
|
||||
}
|
||||
|
||||
}
|
||||
233
src/datastruct/stacker/heap.rs
Normal file
233
src/datastruct/stacker/heap.rs
Normal file
@@ -0,0 +1,233 @@
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
use std::cell::UnsafeCell;
|
||||
use std::mem;
|
||||
use std::ptr;
|
||||
|
||||
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
|
||||
///
|
||||
/// The slice will encode the length of the `&[u8]` slice
|
||||
/// on 16-bits, and then the data is encoded.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct BytesRef(u32);
|
||||
|
||||
impl BytesRef {
|
||||
pub fn is_null(&self) -> bool {
|
||||
self.0 == u32::max_value()
|
||||
}
|
||||
|
||||
pub fn addr(&self) -> u32 {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BytesRef {
|
||||
fn default() -> BytesRef {
|
||||
BytesRef(u32::max_value())
|
||||
}
|
||||
}
|
||||
|
||||
/// Object that can be allocated in tantivy's custom `Heap`.
|
||||
pub trait HeapAllocable {
|
||||
fn with_addr(addr: u32) -> Self;
|
||||
}
|
||||
|
||||
/// Tantivy's custom `Heap`.
|
||||
pub struct Heap {
|
||||
inner: UnsafeCell<InnerHeap>,
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(mut_from_ref))]
|
||||
impl Heap {
|
||||
/// Creates a new heap with a given capacity
|
||||
pub fn with_capacity(num_bytes: usize) -> Heap {
|
||||
Heap {
|
||||
inner: UnsafeCell::new(InnerHeap::with_capacity(num_bytes)),
|
||||
}
|
||||
}
|
||||
|
||||
fn inner(&self) -> &mut InnerHeap {
|
||||
unsafe { &mut *self.inner.get() }
|
||||
}
|
||||
|
||||
/// Clears the heap. All the underlying data is lost.
|
||||
///
|
||||
/// This heap does not support deallocation.
|
||||
/// This method is the only way to free memory.
|
||||
pub fn clear(&self) {
|
||||
self.inner().clear();
|
||||
}
|
||||
|
||||
/// Return amount of free space, in bytes.
|
||||
pub fn num_free_bytes(&self) -> u32 {
|
||||
self.inner().num_free_bytes()
|
||||
}
|
||||
|
||||
/// Allocate a given amount of space and returns an address
|
||||
/// in the Heap.
|
||||
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
|
||||
self.inner().allocate_space(num_bytes)
|
||||
}
|
||||
|
||||
/// Allocate an object in the heap
|
||||
pub fn allocate_object<V: HeapAllocable>(&self) -> (u32, &mut V) {
|
||||
let addr = self.inner().allocate_space(mem::size_of::<V>());
|
||||
let v: V = V::with_addr(addr);
|
||||
self.inner().set(addr, &v);
|
||||
(addr, self.inner().get_mut_ref(addr))
|
||||
}
|
||||
|
||||
/// Stores a `&[u8]` in the heap and returns the destination BytesRef.
|
||||
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
|
||||
self.inner().allocate_and_set(data)
|
||||
}
|
||||
|
||||
/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
|
||||
/// given as argumetn
|
||||
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
self.inner().get_slice(bytes_ref)
|
||||
}
|
||||
|
||||
/// Stores an item's data in the heap, at the given `address`.
|
||||
pub fn set<Item>(&self, addr: u32, val: &Item) {
|
||||
self.inner().set(addr, val);
|
||||
}
|
||||
|
||||
/// Returns a mutable reference for an object at a given Item.
|
||||
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
|
||||
self.inner().get_mut_ref(addr)
|
||||
}
|
||||
|
||||
/// Returns a mutable reference to an `Item` at a given `addr`.
|
||||
#[cfg(test)]
|
||||
pub fn get_ref<Item>(&self, addr: u32) -> &mut Item {
|
||||
self.get_mut_ref(addr)
|
||||
}
|
||||
}
|
||||
|
||||
struct InnerHeap {
|
||||
buffer: Vec<u8>,
|
||||
buffer_len: u32,
|
||||
used: u32,
|
||||
next_heap: Option<Box<InnerHeap>>,
|
||||
}
|
||||
|
||||
impl InnerHeap {
|
||||
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
|
||||
let buffer: Vec<u8> = vec![0u8; num_bytes];
|
||||
InnerHeap {
|
||||
buffer,
|
||||
buffer_len: num_bytes as u32,
|
||||
next_heap: None,
|
||||
used: 0u32,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.used = 0u32;
|
||||
self.next_heap = None;
|
||||
}
|
||||
|
||||
// Returns the number of free bytes. If the buffer
|
||||
// has reached it's capacity and overflowed to another buffer, return 0.
|
||||
pub fn num_free_bytes(&self) -> u32 {
|
||||
if self.next_heap.is_some() {
|
||||
0u32
|
||||
} else {
|
||||
self.buffer_len - self.used
|
||||
}
|
||||
}
|
||||
|
||||
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
|
||||
let addr = self.used;
|
||||
self.used += num_bytes as u32;
|
||||
if self.used <= self.buffer_len {
|
||||
addr
|
||||
} else {
|
||||
if self.next_heap.is_none() {
|
||||
info!(
|
||||
r#"Exceeded heap size. The segment will be committed right
|
||||
after indexing this document."#,
|
||||
);
|
||||
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
|
||||
}
|
||||
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
|
||||
}
|
||||
}
|
||||
|
||||
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
let start = bytes_ref.0;
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.get_slice(BytesRef(start - self.buffer_len))
|
||||
} else {
|
||||
let start = start as usize;
|
||||
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
|
||||
&self.buffer[start + 2..start + 2 + len]
|
||||
}
|
||||
}
|
||||
|
||||
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
} else {
|
||||
&mut self.buffer[start as usize..stop as usize]
|
||||
}
|
||||
}
|
||||
|
||||
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
|
||||
assert!(data.len() < u16::max_value() as usize);
|
||||
let total_len = 2 + data.len();
|
||||
let start = self.allocate_space(total_len);
|
||||
let total_buff = self.get_mut_slice(start, start + total_len as u32);
|
||||
NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16);
|
||||
total_buff[2..].clone_from_slice(data);
|
||||
BytesRef(start)
|
||||
}
|
||||
|
||||
fn get_mut(&mut self, addr: u32) -> *mut u8 {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut(addr - self.buffer_len)
|
||||
} else {
|
||||
let addr_isize = addr as isize;
|
||||
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
|
||||
}
|
||||
}
|
||||
|
||||
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_ref(addr - self.buffer_len)
|
||||
} else {
|
||||
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
|
||||
let v_ptr = v_ptr_u8 as *mut Item;
|
||||
unsafe { &mut *v_ptr }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.set(addr - self.buffer_len, val);
|
||||
} else {
|
||||
let v_ptr: *const Item = val as *const Item;
|
||||
let v_ptr_u8: *const u8 = v_ptr as *const u8;
|
||||
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
|
||||
unsafe {
|
||||
let dest_ptr: *mut u8 = self.get_mut(addr);
|
||||
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
43
src/datastruct/stacker/mod.rs
Normal file
43
src/datastruct/stacker/mod.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
mod expull;
|
||||
pub(crate) mod hashmap;
|
||||
mod heap;
|
||||
|
||||
pub use self::expull::ExpUnrolledLinkedList;
|
||||
pub use self::hashmap::TermHashMap;
|
||||
pub use self::heap::{Heap, HeapAllocable};
|
||||
|
||||
#[test]
|
||||
fn test_unrolled_linked_list() {
|
||||
use std::collections;
|
||||
let heap = Heap::with_capacity(30_000_000);
|
||||
{
|
||||
heap.clear();
|
||||
let mut ks: Vec<usize> = (1..5).map(|k| k * 100).collect();
|
||||
ks.push(2);
|
||||
ks.push(3);
|
||||
for k in (1..5).map(|k| k * 100) {
|
||||
let mut hashmap: TermHashMap = TermHashMap::new(10, &heap);
|
||||
for j in 0..k {
|
||||
for i in 0..500 {
|
||||
let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string()).1;
|
||||
v.push(i * j, &heap);
|
||||
}
|
||||
}
|
||||
let mut map_addr: collections::HashMap<Vec<u8>, u32> = collections::HashMap::new();
|
||||
for (key, addr, _) in hashmap.iter() {
|
||||
map_addr.insert(Vec::from(key), addr);
|
||||
}
|
||||
|
||||
for i in 0..500 {
|
||||
let key: String = i.to_string();
|
||||
let addr: u32 = *map_addr.get(key.as_bytes()).unwrap();
|
||||
let exp_pull: &ExpUnrolledLinkedList = heap.get_ref(addr);
|
||||
let mut it = exp_pull.iter(addr, &heap);
|
||||
for j in 0..k {
|
||||
assert_eq!(it.next().unwrap(), i * j);
|
||||
}
|
||||
assert!(!it.next().is_some());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -117,8 +117,7 @@ impl ManagedDirectory {
|
||||
let mut files_to_delete = vec![];
|
||||
{
|
||||
// releasing the lock as .delete() will use it too.
|
||||
let meta_informations_rlock = self
|
||||
.meta_informations
|
||||
let meta_informations_rlock = self.meta_informations
|
||||
.read()
|
||||
.expect("Managed directory rlock poisoned in garbage collect.");
|
||||
|
||||
@@ -171,8 +170,7 @@ impl ManagedDirectory {
|
||||
if !deleted_files.is_empty() {
|
||||
// update the list of managed files by removing
|
||||
// the file that were removed.
|
||||
let mut meta_informations_wlock = self
|
||||
.meta_informations
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed directory wlock poisoned (2).");
|
||||
{
|
||||
@@ -195,8 +193,7 @@ impl ManagedDirectory {
|
||||
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
|
||||
let pathbuf = path.to_owned();
|
||||
{
|
||||
let mut meta_informations_wlock = self
|
||||
.meta_informations
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned on protect");
|
||||
*meta_informations_wlock
|
||||
@@ -218,8 +215,7 @@ impl ManagedDirectory {
|
||||
/// will not lead to garbage files that will
|
||||
/// never get removed.
|
||||
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
|
||||
let mut meta_wlock = self
|
||||
.meta_informations
|
||||
let mut meta_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
|
||||
@@ -252,8 +248,7 @@ impl Directory for ManagedDirectory {
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
{
|
||||
let metas_rlock = self
|
||||
.meta_informations
|
||||
let metas_rlock = self.meta_informations
|
||||
.read()
|
||||
.expect("poisoned lock in managed directory meta");
|
||||
if let Some(counter) = metas_rlock.protected_files.get(path) {
|
||||
|
||||
@@ -32,8 +32,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
|
||||
}
|
||||
})?;
|
||||
|
||||
let meta_data = file
|
||||
.metadata()
|
||||
let meta_data = file.metadata()
|
||||
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
|
||||
if meta_data.len() == 0 {
|
||||
// if the file size is 0, it will not be possible
|
||||
@@ -310,8 +309,7 @@ impl Directory for MmapDirectory {
|
||||
// when the last reference is gone.
|
||||
mmap_cache.cache.remove(&full_path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => self
|
||||
.sync_directory()
|
||||
Ok(_) => self.sync_directory()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
|
||||
@@ -170,8 +170,7 @@ impl Directory for RAMDirectory {
|
||||
let path_buf = PathBuf::from(path);
|
||||
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
|
||||
let exists = self
|
||||
.fs
|
||||
let exists = self.fs
|
||||
.write(path_buf.clone(), &Vec::new())
|
||||
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
|
||||
|
||||
|
||||
@@ -41,8 +41,7 @@ pub struct DeleteBitSet {
|
||||
impl DeleteBitSet {
|
||||
/// Opens a delete bitset given its data source.
|
||||
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
|
||||
let num_deleted: usize = data
|
||||
.as_slice()
|
||||
let num_deleted: usize = data.as_slice()
|
||||
.iter()
|
||||
.map(|b| b.count_ones() as usize)
|
||||
.sum();
|
||||
|
||||
@@ -56,8 +56,7 @@ impl FacetReader {
|
||||
|
||||
/// Given a term ordinal returns the term associated to it.
|
||||
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
|
||||
let found_term = self
|
||||
.term_dict
|
||||
let found_term = self.term_dict
|
||||
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
|
||||
assert!(found_term, "Term ordinal {} no found.", facet_ord);
|
||||
}
|
||||
|
||||
@@ -52,8 +52,7 @@ impl DeleteQueue {
|
||||
//
|
||||
// Past delete operations are not accessible.
|
||||
pub fn cursor(&self) -> DeleteCursor {
|
||||
let last_block = self
|
||||
.inner
|
||||
let last_block = self.inner
|
||||
.read()
|
||||
.expect("Read lock poisoned when opening delete queue cursor")
|
||||
.last_block
|
||||
@@ -93,8 +92,7 @@ impl DeleteQueue {
|
||||
// be some unflushed operations.
|
||||
//
|
||||
fn flush(&self) -> Option<Arc<Block>> {
|
||||
let mut self_wlock = self
|
||||
.inner
|
||||
let mut self_wlock = self.inner
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on delete queue writer");
|
||||
|
||||
@@ -134,8 +132,7 @@ impl From<DeleteQueue> for NextBlock {
|
||||
impl NextBlock {
|
||||
fn next_block(&self) -> Option<Arc<Block>> {
|
||||
{
|
||||
let next_read_lock = self
|
||||
.0
|
||||
let next_read_lock = self.0
|
||||
.read()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
|
||||
@@ -144,8 +141,7 @@ impl NextBlock {
|
||||
}
|
||||
let next_block;
|
||||
{
|
||||
let mut next_write_lock = self
|
||||
.0
|
||||
let mut next_write_lock = self.0
|
||||
.write()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
match *next_write_lock {
|
||||
|
||||
@@ -9,6 +9,8 @@ use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use datastruct::stacker::hashmap::split_memory;
|
||||
use datastruct::stacker::Heap;
|
||||
use directory::FileProtection;
|
||||
use docset::DocSet;
|
||||
use error::{Error, ErrorKind, Result, ResultExt};
|
||||
@@ -22,7 +24,6 @@ use indexer::DirectoryLock;
|
||||
use indexer::MergePolicy;
|
||||
use indexer::SegmentEntry;
|
||||
use indexer::SegmentWriter;
|
||||
use postings::compute_table_size;
|
||||
use schema::Document;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
@@ -33,11 +34,10 @@ use std::thread::JoinHandle;
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
// in the heap goes below MARGIN_IN_BYTES.
|
||||
pub const MARGIN_IN_BYTES: usize = 1_000_000;
|
||||
pub const MARGIN_IN_BYTES: u32 = 1_000_000u32;
|
||||
|
||||
// We impose the memory per thread to be at least 3 MB.
|
||||
pub const HEAP_SIZE_MIN: usize = ((MARGIN_IN_BYTES as u32) * 3u32) as usize;
|
||||
pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
|
||||
pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32;
|
||||
|
||||
// Add document will block if the number of docs waiting in the queue to be indexed
|
||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||
@@ -46,24 +46,6 @@ const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||
type DocumentSender = chan::Sender<AddOperation>;
|
||||
type DocumentReceiver = chan::Receiver<AddOperation>;
|
||||
|
||||
/// Split the thread memory budget into
|
||||
/// - the heap size
|
||||
/// - the hash table "table" itself.
|
||||
///
|
||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
|
||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||
(1..)
|
||||
.into_iter()
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.last()
|
||||
.expect(&format!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
))
|
||||
.min(19) // we cap it at 512K
|
||||
}
|
||||
|
||||
/// `IndexWriter` is the user entry-point to add document to an index.
|
||||
///
|
||||
/// It manages a small number of indexing thread, as well as a shared
|
||||
@@ -118,16 +100,11 @@ pub fn open_index_writer(
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
directory_lock: DirectoryLock,
|
||||
) -> Result<IndexWriter> {
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
let err_msg = format!(
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_LIMIT as usize {
|
||||
panic!(format!(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_MIN
|
||||
);
|
||||
bail!(ErrorKind::InvalidArgument(err_msg));
|
||||
}
|
||||
if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX {
|
||||
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
||||
bail!(ErrorKind::InvalidArgument(err_msg));
|
||||
HEAP_SIZE_LIMIT
|
||||
));
|
||||
}
|
||||
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
@@ -262,29 +239,43 @@ pub fn advance_deletes(
|
||||
}
|
||||
|
||||
fn index_documents(
|
||||
memory_budget: usize,
|
||||
heap: &mut Heap,
|
||||
table_size: usize,
|
||||
segment: &Segment,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> Result<bool> {
|
||||
heap.clear();
|
||||
let schema = segment.schema();
|
||||
let segment_id = segment.id();
|
||||
let table_size = initial_table_size(memory_budget);
|
||||
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(heap, table_size, segment.clone(), &schema)?;
|
||||
for doc in document_iterator {
|
||||
segment_writer.add_document(doc, &schema)?;
|
||||
|
||||
let mem_usage = segment_writer.mem_usage();
|
||||
|
||||
if mem_usage >= memory_budget - MARGIN_IN_BYTES {
|
||||
// There is two possible conditions to close the segment.
|
||||
// One is the memory arena dedicated to the segment is
|
||||
// getting full.
|
||||
if segment_writer.is_buffer_full() {
|
||||
info!(
|
||||
"Buffer limit reached, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
// The second is the term dictionary hash table
|
||||
// is reaching saturation.
|
||||
//
|
||||
// Tantivy does not resize its hashtable. When it reaches
|
||||
// capacity, we just stop indexing new document.
|
||||
if segment_writer.is_term_saturated() {
|
||||
info!(
|
||||
"Term dic saturated, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !segment_updater.is_alive() {
|
||||
@@ -342,8 +333,7 @@ impl IndexWriter {
|
||||
}
|
||||
drop(self.workers_join_handle);
|
||||
|
||||
let result = self
|
||||
.segment_updater
|
||||
let result = self.segment_updater
|
||||
.wait_merging_thread()
|
||||
.chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into()));
|
||||
|
||||
@@ -377,12 +367,14 @@ impl IndexWriter {
|
||||
fn add_indexing_worker(&mut self) -> Result<()> {
|
||||
let document_receiver_clone = self.document_receiver.clone();
|
||||
let mut segment_updater = self.segment_updater.clone();
|
||||
let (heap_size, table_size) = split_memory(self.heap_size_in_bytes_per_thread);
|
||||
info!("heap size {}, table_size {}", heap_size, table_size);
|
||||
let mut heap = Heap::with_capacity(heap_size);
|
||||
|
||||
let generation = self.generation;
|
||||
|
||||
let mut delete_cursor = self.delete_queue.cursor();
|
||||
|
||||
let mem_budget = self.heap_size_in_bytes_per_thread;
|
||||
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
||||
.name(format!(
|
||||
"indexing thread {} for gen {}",
|
||||
@@ -410,7 +402,8 @@ impl IndexWriter {
|
||||
}
|
||||
let segment = segment_updater.new_segment();
|
||||
index_documents(
|
||||
mem_budget,
|
||||
&mut heap,
|
||||
table_size,
|
||||
&segment,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
@@ -488,8 +481,7 @@ impl IndexWriter {
|
||||
let document_receiver = self.document_receiver.clone();
|
||||
|
||||
// take the directory lock to create a new index_writer.
|
||||
let directory_lock = self
|
||||
._directory_lock
|
||||
let directory_lock = self._directory_lock
|
||||
.take()
|
||||
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
|
||||
|
||||
@@ -645,7 +637,6 @@ impl IndexWriter {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::initial_table_size;
|
||||
use env_logger;
|
||||
use error::*;
|
||||
use indexer::NoMergePolicy;
|
||||
@@ -708,7 +699,7 @@ mod tests {
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(3, 40_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
index_writer.rollback().unwrap();
|
||||
|
||||
@@ -741,7 +732,7 @@ mod tests {
|
||||
};
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer(12_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
let mut doc = Document::default();
|
||||
@@ -775,7 +766,7 @@ mod tests {
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer(12_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
@@ -810,7 +801,7 @@ mod tests {
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
@@ -840,12 +831,4 @@ mod tests {
|
||||
assert_eq!(num_docs_containing("b"), 100);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(initial_table_size(100_000), 12);
|
||||
assert_eq!(initial_table_size(1_000_000), 15);
|
||||
assert_eq!(initial_table_size(10_000_000), 18);
|
||||
assert_eq!(initial_table_size(1_000_000_000), 19);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -440,8 +440,7 @@ impl IndexMerger {
|
||||
) -> Result<Option<TermOrdinalMapping>> {
|
||||
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
|
||||
let mut delta_computer = DeltaComputer::new();
|
||||
let field_readers = self
|
||||
.readers
|
||||
let field_readers = self.readers
|
||||
.iter()
|
||||
.map(|reader| reader.inverted_index(indexed_field))
|
||||
.collect::<Vec<_>>();
|
||||
@@ -684,7 +683,7 @@ mod tests {
|
||||
};
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
// writing the segment
|
||||
{
|
||||
@@ -734,7 +733,7 @@ mod tests {
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
@@ -1139,125 +1138,126 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_facets() {
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
use schema::Facet;
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
|
||||
let mut doc = Document::default();
|
||||
for facet in doc_facets {
|
||||
doc.add_facet(facet_field, Facet::from(facet));
|
||||
}
|
||||
index_writer.add_document(doc);
|
||||
};
|
||||
|
||||
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
|
||||
index_doc(&mut index_writer, &["/top/a", "/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/a"]);
|
||||
|
||||
index_doc(&mut index_writer, &["/top/b", "/top/d"]);
|
||||
index_doc(&mut index_writer, &["/top/d"]);
|
||||
index_doc(&mut index_writer, &["/top/e"]);
|
||||
index_writer.commit().expect("committed");
|
||||
|
||||
index_doc(&mut index_writer, &["/top/a"]);
|
||||
index_doc(&mut index_writer, &["/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/c"]);
|
||||
index_writer.commit().expect("committed");
|
||||
|
||||
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
|
||||
index_writer.commit().expect("committed");
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
|
||||
let searcher = index.searcher();
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet(Facet::from("/top"));
|
||||
use collector::{CountCollector, MultiCollector};
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut multi_collectors =
|
||||
MultiCollector::from(vec![&mut count_collector, &mut facet_collector]);
|
||||
searcher.search(&AllQuery, &mut multi_collectors).unwrap();
|
||||
}
|
||||
assert_eq!(count_collector.count(), expected_num_docs);
|
||||
let facet_counts = facet_collector.harvest();
|
||||
let facets: Vec<(String, u64)> = facet_counts
|
||||
.get("/top")
|
||||
.map(|(facet, count)| (facet.to_string(), count))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
facets,
|
||||
expected
|
||||
.iter()
|
||||
.map(|&(facet_str, count)| (String::from(facet_str), count))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
};
|
||||
test_searcher(
|
||||
11,
|
||||
&[
|
||||
("/top/a", 5),
|
||||
("/top/b", 5),
|
||||
("/top/c", 2),
|
||||
("/top/d", 2),
|
||||
("/top/e", 2),
|
||||
("/top/f", 1),
|
||||
],
|
||||
);
|
||||
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
test_searcher(
|
||||
11,
|
||||
&[
|
||||
("/top/a", 5),
|
||||
("/top/b", 5),
|
||||
("/top/c", 2),
|
||||
("/top/d", 2),
|
||||
("/top/e", 2),
|
||||
("/top/f", 1),
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
// Deleting one term
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
|
||||
let facet_term = Term::from_facet(facet_field, &facet);
|
||||
index_writer.delete_term(facet_term);
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
test_searcher(
|
||||
9,
|
||||
&[
|
||||
("/top/a", 3),
|
||||
("/top/b", 3),
|
||||
("/top/c", 1),
|
||||
("/top/d", 2),
|
||||
("/top/e", 2),
|
||||
("/top/f", 1),
|
||||
],
|
||||
);
|
||||
}
|
||||
}
|
||||
// #[test]
|
||||
// fn test_merge_facets() {
|
||||
// let mut schema_builder = schema::SchemaBuilder::default();
|
||||
// let facet_field = schema_builder.add_facet_field("facet");
|
||||
// let index = Index::create_in_ram(schema_builder.build());
|
||||
// use schema::Facet;
|
||||
// {
|
||||
// let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
// let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
|
||||
// let mut doc = Document::default();
|
||||
// for facet in doc_facets {
|
||||
// doc.add_facet(facet_field, Facet::from(facet));
|
||||
// }
|
||||
// index_writer.add_document(doc);
|
||||
// };
|
||||
//
|
||||
// index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
|
||||
// index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
|
||||
// index_doc(&mut index_writer, &["/top/a", "/top/b"]);
|
||||
// index_doc(&mut index_writer, &["/top/a"]);
|
||||
//
|
||||
// index_doc(&mut index_writer, &["/top/b", "/top/d"]);
|
||||
// index_doc(&mut index_writer, &["/top/d"]);
|
||||
// index_doc(&mut index_writer, &["/top/e"]);
|
||||
// index_writer.commit().expect("committed");
|
||||
//
|
||||
// index_doc(&mut index_writer, &["/top/a"]);
|
||||
// index_doc(&mut index_writer, &["/top/b"]);
|
||||
// index_doc(&mut index_writer, &["/top/c"]);
|
||||
// index_writer.commit().expect("committed");
|
||||
//
|
||||
// index_doc(&mut index_writer, &["/top/e", "/top/f"]);
|
||||
// index_writer.commit().expect("committed");
|
||||
// }
|
||||
// index.load_searchers().unwrap();
|
||||
// let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
|
||||
// let searcher = index.searcher();
|
||||
// let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
// facet_collector.add_facet(Facet::from("/top"));
|
||||
// use collector::{CountCollector, MultiCollector};
|
||||
// let mut count_collector = CountCollector::default();
|
||||
// {
|
||||
// let mut multi_collectors = MultiCollector::new();
|
||||
// multi_collectors.add_collector(&mut count_collector);
|
||||
// multi_collectors.add_collector(&mut facet_collector);
|
||||
// searcher.search(&AllQuery, &mut multi_collectors).unwrap();
|
||||
// }
|
||||
// assert_eq!(count_collector.count(), expected_num_docs);
|
||||
// let facet_counts = facet_collector.harvest();
|
||||
// let facets: Vec<(String, u64)> = facet_counts
|
||||
// .get("/top")
|
||||
// .map(|(facet, count)| (facet.to_string(), count))
|
||||
// .collect();
|
||||
// assert_eq!(
|
||||
// facets,
|
||||
// expected
|
||||
// .iter()
|
||||
// .map(|&(facet_str, count)| (String::from(facet_str), count))
|
||||
// .collect::<Vec<_>>()
|
||||
// );
|
||||
// };
|
||||
// test_searcher(
|
||||
// 11,
|
||||
// &[
|
||||
// ("/top/a", 5),
|
||||
// ("/top/b", 5),
|
||||
// ("/top/c", 2),
|
||||
// ("/top/d", 2),
|
||||
// ("/top/e", 2),
|
||||
// ("/top/f", 1),
|
||||
// ],
|
||||
// );
|
||||
//
|
||||
// // Merging the segments
|
||||
// {
|
||||
// let segment_ids = index
|
||||
// .searchable_segment_ids()
|
||||
// .expect("Searchable segments failed.");
|
||||
// let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
// index_writer
|
||||
// .merge(&segment_ids)
|
||||
// .wait()
|
||||
// .expect("Merging failed");
|
||||
// index_writer.wait_merging_threads().unwrap();
|
||||
//
|
||||
// index.load_searchers().unwrap();
|
||||
// test_searcher(
|
||||
// 11,
|
||||
// &[
|
||||
// ("/top/a", 5),
|
||||
// ("/top/b", 5),
|
||||
// ("/top/c", 2),
|
||||
// ("/top/d", 2),
|
||||
// ("/top/e", 2),
|
||||
// ("/top/f", 1),
|
||||
// ],
|
||||
// );
|
||||
// }
|
||||
//
|
||||
// // Deleting one term
|
||||
// {
|
||||
// let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
// let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
|
||||
// let facet_term = Term::from_facet(facet_field, &facet);
|
||||
// index_writer.delete_term(facet_term);
|
||||
// index_writer.commit().unwrap();
|
||||
// index.load_searchers().unwrap();
|
||||
// test_searcher(
|
||||
// 9,
|
||||
// &[
|
||||
// ("/top/a", 3),
|
||||
// ("/top/b", 3),
|
||||
// ("/top/c", 1),
|
||||
// ("/top/d", 2),
|
||||
// ("/top/e", 2),
|
||||
// ("/top/f", 1),
|
||||
// ],
|
||||
// );
|
||||
// }
|
||||
// }
|
||||
|
||||
#[test]
|
||||
fn test_merge_multivalued_int_fields_all_deleted() {
|
||||
|
||||
@@ -59,8 +59,7 @@ impl SegmentRegister {
|
||||
}
|
||||
|
||||
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
|
||||
let mut segment_ids: Vec<SegmentMeta> = self
|
||||
.segment_states
|
||||
let mut segment_ids: Vec<SegmentMeta> = self.segment_states
|
||||
.values()
|
||||
.map(|segment_entry| segment_entry.meta().clone())
|
||||
.collect();
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
use super::operation::AddOperation;
|
||||
use core::Segment;
|
||||
use core::SerializableSegment;
|
||||
use datastruct::stacker::Heap;
|
||||
use fastfield::FastFieldsWriter;
|
||||
use fieldnorm::FieldNormsWriter;
|
||||
use indexer::index_writer::MARGIN_IN_BYTES;
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use postings::MultiFieldPostingsWriter;
|
||||
use schema::FieldType;
|
||||
@@ -22,9 +24,10 @@ use Result;
|
||||
///
|
||||
/// They creates the postings list in anonymous memory.
|
||||
/// The segment is layed on disk when the segment gets `finalized`.
|
||||
pub struct SegmentWriter {
|
||||
pub struct SegmentWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
max_doc: DocId,
|
||||
multifield_postings: MultiFieldPostingsWriter,
|
||||
multifield_postings: MultiFieldPostingsWriter<'a>,
|
||||
segment_serializer: SegmentSerializer,
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FieldNormsWriter,
|
||||
@@ -32,7 +35,7 @@ pub struct SegmentWriter {
|
||||
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
|
||||
}
|
||||
|
||||
impl SegmentWriter {
|
||||
impl<'a> SegmentWriter<'a> {
|
||||
/// Creates a new `SegmentWriter`
|
||||
///
|
||||
/// The arguments are defined as follows
|
||||
@@ -43,12 +46,13 @@ impl SegmentWriter {
|
||||
/// - segment: The segment being written
|
||||
/// - schema
|
||||
pub fn for_segment(
|
||||
heap: &'a Heap,
|
||||
table_bits: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema,
|
||||
) -> Result<SegmentWriter> {
|
||||
) -> Result<SegmentWriter<'a>> {
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits);
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
|
||||
let tokenizers = schema
|
||||
.fields()
|
||||
.iter()
|
||||
@@ -64,6 +68,7 @@ impl SegmentWriter {
|
||||
})
|
||||
.collect();
|
||||
Ok(SegmentWriter {
|
||||
heap,
|
||||
max_doc: 0,
|
||||
multifield_postings,
|
||||
fieldnorms_writer: FieldNormsWriter::for_schema(schema),
|
||||
@@ -89,8 +94,22 @@ impl SegmentWriter {
|
||||
Ok(self.doc_opstamps)
|
||||
}
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.multifield_postings.mem_usage()
|
||||
/// Returns true iff the segment writer's buffer has reached capacity.
|
||||
///
|
||||
/// The limit is defined as `the user defined heap size - an arbitrary margin of 10MB`
|
||||
/// The `Segment` is `finalize`d when the buffer gets full.
|
||||
///
|
||||
/// Because, we cannot cut through a document, the margin is there to ensure that we rarely
|
||||
/// exceeds the heap size.
|
||||
pub fn is_buffer_full(&self) -> bool {
|
||||
self.heap.num_free_bytes() <= MARGIN_IN_BYTES
|
||||
}
|
||||
|
||||
/// Return true if the term dictionary hashmap is reaching capacity.
|
||||
/// It is one of the condition that triggers a `SegmentWriter` to
|
||||
/// be finalized.
|
||||
pub(crate) fn is_term_saturated(&self) -> bool {
|
||||
self.multifield_postings.is_term_saturated()
|
||||
}
|
||||
|
||||
/// Indexes a new document
|
||||
@@ -229,7 +248,7 @@ fn write(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl SerializableSegment for SegmentWriter {
|
||||
impl<'a> SerializableSegment for SegmentWriter<'a> {
|
||||
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
|
||||
let max_doc = self.max_doc;
|
||||
write(
|
||||
|
||||
@@ -55,7 +55,7 @@
|
||||
//!
|
||||
//! // Indexing documents
|
||||
//!
|
||||
//! let index = Index::create_in_dir(index_path, schema.clone())?;
|
||||
//! let index = Index::create(index_path, schema.clone())?;
|
||||
//!
|
||||
//! // Here we use a buffer of 100MB that will be split
|
||||
//! // between indexing threads.
|
||||
@@ -136,11 +136,11 @@ extern crate combine;
|
||||
extern crate crossbeam;
|
||||
extern crate fnv;
|
||||
extern crate fst;
|
||||
extern crate fst_regex;
|
||||
extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
extern crate itertools;
|
||||
extern crate levenshtein_automata;
|
||||
extern crate lz4;
|
||||
extern crate num_cpus;
|
||||
extern crate owning_ref;
|
||||
extern crate regex;
|
||||
@@ -188,7 +188,8 @@ mod compression;
|
||||
mod core;
|
||||
mod indexer;
|
||||
|
||||
#[allow(unused_doc_comments)]
|
||||
mod datastruct;
|
||||
#[allow(unused_doc_comment)]
|
||||
mod error;
|
||||
pub mod tokenizer;
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@ mod postings_writer;
|
||||
mod recorder;
|
||||
mod segment_postings;
|
||||
mod serializer;
|
||||
mod stacker;
|
||||
mod term_info;
|
||||
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
@@ -22,8 +21,6 @@ pub use self::term_info::TermInfo;
|
||||
|
||||
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
|
||||
|
||||
pub(crate) use self::stacker::compute_table_size;
|
||||
|
||||
pub use common::HasLen;
|
||||
|
||||
pub(crate) type UnorderedTermId = u64;
|
||||
@@ -42,6 +39,7 @@ pub mod tests {
|
||||
use core::Index;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentReader;
|
||||
use datastruct::stacker::Heap;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fieldnorm::FieldNormReader;
|
||||
use indexer::operation::AddOperation;
|
||||
@@ -162,9 +160,10 @@ pub mod tests {
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let segment = index.new_segment();
|
||||
|
||||
let heap = Heap::with_capacity(10_000_000);
|
||||
{
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(18, segment.clone(), &schema).unwrap();
|
||||
SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
// checking that position works if the field has two values
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use super::stacker::{Addr, MemoryArena, TermHashMap};
|
||||
|
||||
use datastruct::stacker::{Heap, TermHashMap};
|
||||
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
use postings::UnorderedTermId;
|
||||
use postings::{FieldSerializer, InvertedIndexSerializer};
|
||||
@@ -15,90 +14,80 @@ use tokenizer::TokenStream;
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
fn posting_from_field_entry<'a>(field_entry: &FieldEntry) -> Box<PostingsWriter> {
|
||||
fn posting_from_field_entry<'a>(
|
||||
field_entry: &FieldEntry,
|
||||
heap: &'a Heap,
|
||||
) -> Box<PostingsWriter + 'a> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|indexing_options| match indexing_options.index_option() {
|
||||
IndexRecordOption::Basic => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
// FieldType::Bytes cannot actually be indexed.
|
||||
// TODO fix during the indexer refactoring described in #276
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MultiFieldPostingsWriter {
|
||||
heap: MemoryArena,
|
||||
pub struct MultiFieldPostingsWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
schema: Schema,
|
||||
term_index: TermHashMap,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter>>,
|
||||
term_index: TermHashMap<'a>,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
|
||||
}
|
||||
|
||||
impl MultiFieldPostingsWriter {
|
||||
impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
/// Create a new `MultiFieldPostingsWriter` given
|
||||
/// a schema and a heap.
|
||||
pub fn new(schema: &Schema, table_bits: usize) -> MultiFieldPostingsWriter {
|
||||
let term_index = TermHashMap::new(table_bits);
|
||||
pub fn new(schema: &Schema, table_bits: usize, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
|
||||
let term_index = TermHashMap::new(table_bits, heap);
|
||||
let per_field_postings_writers: Vec<_> = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field_entry| posting_from_field_entry(field_entry))
|
||||
.map(|field_entry| posting_from_field_entry(field_entry, heap))
|
||||
.collect();
|
||||
MultiFieldPostingsWriter {
|
||||
heap: MemoryArena::new(),
|
||||
schema: schema.clone(),
|
||||
heap,
|
||||
term_index,
|
||||
per_field_postings_writers,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.term_index.mem_usage() + self.heap.mem_usage()
|
||||
}
|
||||
|
||||
pub fn index_text(&mut self, doc: DocId, field: Field, token_stream: &mut TokenStream) -> u32 {
|
||||
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
|
||||
postings_writer.index_text(
|
||||
&mut self.term_index,
|
||||
doc,
|
||||
field,
|
||||
token_stream,
|
||||
&mut self.heap,
|
||||
)
|
||||
postings_writer.index_text(&mut self.term_index, doc, field, token_stream, self.heap)
|
||||
}
|
||||
|
||||
pub fn subscribe(&mut self, doc: DocId, term: &Term) -> UnorderedTermId {
|
||||
let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
|
||||
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, &mut self.heap)
|
||||
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, self.heap)
|
||||
}
|
||||
|
||||
/// Serialize the inverted index.
|
||||
/// It pushes all term, one field at a time, towards the
|
||||
/// postings serializer.
|
||||
#[allow(needless_range_loop)]
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self
|
||||
.term_index
|
||||
.iter()
|
||||
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
|
||||
.collect();
|
||||
let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
|
||||
term_offsets.sort_by_key(|&(k, _, _)| k);
|
||||
|
||||
let mut offsets: Vec<(Field, usize)> = vec![];
|
||||
@@ -153,19 +142,23 @@ impl MultiFieldPostingsWriter {
|
||||
postings_writer.serialize(
|
||||
&term_offsets[start..stop],
|
||||
&mut field_serializer,
|
||||
&self.term_index.heap,
|
||||
&self.heap,
|
||||
self.heap,
|
||||
)?;
|
||||
field_serializer.close()?;
|
||||
}
|
||||
Ok(unordered_term_mappings)
|
||||
}
|
||||
|
||||
/// Return true iff the term dictionary is saturated.
|
||||
pub fn is_term_saturated(&self) -> bool {
|
||||
self.term_index.is_saturated()
|
||||
}
|
||||
}
|
||||
|
||||
/// The `PostingsWriter` is in charge of receiving documenting
|
||||
/// and building a `Segment` in anonymous memory.
|
||||
///
|
||||
/// `PostingsWriter` writes in a `MemoryArena`.
|
||||
/// `PostingsWriter` writes in a `Heap`.
|
||||
pub trait PostingsWriter {
|
||||
/// Record that a document contains a term at a given position.
|
||||
///
|
||||
@@ -180,17 +173,16 @@ pub trait PostingsWriter {
|
||||
doc: DocId,
|
||||
pos: u32,
|
||||
term: &Term,
|
||||
heap: &mut MemoryArena,
|
||||
heap: &Heap,
|
||||
) -> UnorderedTermId;
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
|
||||
term_addrs: &[(&[u8], u32, UnorderedTermId)],
|
||||
serializer: &mut FieldSerializer,
|
||||
term_heap: &MemoryArena,
|
||||
heap: &MemoryArena,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()>;
|
||||
|
||||
/// Tokenize a text and subscribe all of its token.
|
||||
@@ -200,7 +192,7 @@ pub trait PostingsWriter {
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
token_stream: &mut TokenStream,
|
||||
heap: &mut MemoryArena,
|
||||
heap: &Heap,
|
||||
) -> u32 {
|
||||
let mut term = Term::for_field(field);
|
||||
let num_tokens = {
|
||||
@@ -218,67 +210,61 @@ pub trait PostingsWriter {
|
||||
|
||||
/// The `SpecializedPostingsWriter` is just here to remove dynamic
|
||||
/// dispatch to the recorder information.
|
||||
pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
|
||||
pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
|
||||
heap: &'a Heap,
|
||||
total_num_tokens: u64,
|
||||
_recorder_type: PhantomData<Rec>,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
|
||||
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
/// constructor
|
||||
pub fn new() -> SpecializedPostingsWriter<Rec> {
|
||||
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
|
||||
SpecializedPostingsWriter {
|
||||
heap,
|
||||
total_num_tokens: 0u64,
|
||||
_recorder_type: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
|
||||
pub fn new_boxed() -> Box<PostingsWriter> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new())
|
||||
pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
term_index: &mut TermHashMap,
|
||||
doc: DocId,
|
||||
position: u32,
|
||||
term: &Term,
|
||||
heap: &mut MemoryArena,
|
||||
heap: &Heap,
|
||||
) -> UnorderedTermId {
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
self.total_num_tokens += 1;
|
||||
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
|
||||
if opt_recorder.is_some() {
|
||||
let mut recorder = opt_recorder.unwrap();
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
recorder.close_doc(heap);
|
||||
recorder.new_doc(doc, heap);
|
||||
}
|
||||
recorder.record_position(position, heap);
|
||||
recorder
|
||||
} else {
|
||||
let mut recorder = Rec::new(heap);
|
||||
recorder.new_doc(doc, heap);
|
||||
recorder.record_position(position, heap);
|
||||
recorder
|
||||
let (term_ord, recorder): (UnorderedTermId, &mut Rec) = term_index.get_or_create(term);
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
if current_doc != u32::max_value() {
|
||||
recorder.close_doc(heap);
|
||||
}
|
||||
}) as UnorderedTermId
|
||||
recorder.new_doc(doc, heap);
|
||||
}
|
||||
self.total_num_tokens += 1;
|
||||
recorder.record_position(position, heap);
|
||||
term_ord
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
|
||||
term_addrs: &[(&[u8], u32, UnorderedTermId)],
|
||||
serializer: &mut FieldSerializer,
|
||||
termdict_heap: &MemoryArena,
|
||||
heap: &MemoryArena,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
for &(term_bytes, addr, _) in term_addrs {
|
||||
let recorder: Rec = unsafe { termdict_heap.read(addr) };
|
||||
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
|
||||
serializer.new_term(&term_bytes[4..])?;
|
||||
recorder.serialize(serializer, heap)?;
|
||||
recorder.serialize(addr, serializer, heap)?;
|
||||
serializer.close_term()?;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
|
||||
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
|
||||
use postings::FieldSerializer;
|
||||
use std::{self, io};
|
||||
use DocId;
|
||||
@@ -15,53 +15,62 @@ const POSITION_END: u32 = std::u32::MAX;
|
||||
/// * the document id
|
||||
/// * the term frequency
|
||||
/// * the term positions
|
||||
pub trait Recorder: Copy {
|
||||
///
|
||||
fn new(heap: &mut MemoryArena) -> Self;
|
||||
pub trait Recorder: HeapAllocable {
|
||||
/// Returns the current document
|
||||
fn current_doc(&self) -> u32;
|
||||
/// Starts recording information about a new document
|
||||
/// This method shall only be called if the term is within the document.
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena);
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap);
|
||||
/// Record the position of a term. For each document,
|
||||
/// this method will be called `term_freq` times.
|
||||
fn record_position(&mut self, position: u32, heap: &mut MemoryArena);
|
||||
fn record_position(&mut self, position: u32, heap: &Heap);
|
||||
/// Close the document. It will help record the term frequency.
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena);
|
||||
fn close_doc(&mut self, heap: &Heap);
|
||||
/// Pushes the postings information to the serializer.
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>;
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()>;
|
||||
}
|
||||
|
||||
/// Only records the doc ids
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct NothingRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
impl HeapAllocable for NothingRecorder {
|
||||
fn with_addr(addr: u32) -> NothingRecorder {
|
||||
NothingRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
current_doc: u32::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
|
||||
fn record_position(&mut self, _position: u32, _heap: &Heap) {}
|
||||
|
||||
fn close_doc(&mut self, _heap: &mut MemoryArena) {}
|
||||
fn close_doc(&mut self, _heap: &Heap) {}
|
||||
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
for doc in self.stack.iter(heap) {
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
for doc in self.stack.iter(self_addr, heap) {
|
||||
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -69,47 +78,52 @@ impl Recorder for NothingRecorder {
|
||||
}
|
||||
|
||||
/// Recorder encoding document ids, and term frequencies
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
current_tf: u32,
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
impl HeapAllocable for TermFrequencyRecorder {
|
||||
fn with_addr(addr: u32) -> TermFrequencyRecorder {
|
||||
TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
current_doc: u32::max_value(),
|
||||
current_tf: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
|
||||
fn record_position(&mut self, _position: u32, _heap: &Heap) {
|
||||
self.current_tf += 1;
|
||||
}
|
||||
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
||||
fn close_doc(&mut self, heap: &Heap) {
|
||||
debug_assert!(self.current_tf > 0);
|
||||
self.stack.push(self.current_tf, heap);
|
||||
self.current_tf = 0;
|
||||
}
|
||||
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
// the last document has not been closed...
|
||||
// its term freq is self.current_tf.
|
||||
let mut doc_iter = self
|
||||
.stack
|
||||
.iter(heap)
|
||||
let mut doc_iter = self.stack
|
||||
.iter(self_addr, heap)
|
||||
.chain(Some(self.current_tf).into_iter());
|
||||
|
||||
while let Some(doc) = doc_iter.next() {
|
||||
@@ -123,40 +137,46 @@ impl Recorder for TermFrequencyRecorder {
|
||||
}
|
||||
|
||||
/// Recorder encoding term frequencies as well as positions.
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct TFAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
}
|
||||
|
||||
impl Recorder for TFAndPositionRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
impl HeapAllocable for TFAndPositionRecorder {
|
||||
fn with_addr(addr: u32) -> TFAndPositionRecorder {
|
||||
TFAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
current_doc: u32::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TFAndPositionRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
|
||||
fn record_position(&mut self, position: u32, heap: &Heap) {
|
||||
self.stack.push(position, heap);
|
||||
}
|
||||
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
||||
fn close_doc(&mut self, heap: &Heap) {
|
||||
self.stack.push(POSITION_END, heap);
|
||||
}
|
||||
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
let mut doc_positions = Vec::with_capacity(100);
|
||||
let mut positions_iter = self.stack.iter(heap);
|
||||
let mut positions_iter = self.stack.iter(self_addr, heap);
|
||||
while let Some(doc) = positions_iter.next() {
|
||||
let mut prev_position = 0;
|
||||
doc_positions.clear();
|
||||
|
||||
@@ -399,8 +399,7 @@ impl BlockSegmentPostings {
|
||||
/// Returns false iff there was no remaining blocks.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.num_bitpacked_blocks > 0 {
|
||||
let num_consumed_bytes = self
|
||||
.doc_decoder
|
||||
let num_consumed_bytes = self.doc_decoder
|
||||
.uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
match self.freq_reading_option {
|
||||
@@ -410,8 +409,7 @@ impl BlockSegmentPostings {
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self
|
||||
.freq_decoder
|
||||
let num_consumed_bytes = self.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref());
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
|
||||
@@ -160,8 +160,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
}
|
||||
|
||||
fn current_term_info(&self) -> TermInfo {
|
||||
let (filepos, offset) = self
|
||||
.positions_serializer_opt
|
||||
let (filepos, offset) = self.positions_serializer_opt
|
||||
.as_ref()
|
||||
.map(|positions_serializer| positions_serializer.addr())
|
||||
.unwrap_or((0u64, 0u8));
|
||||
@@ -273,8 +272,7 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
|
||||
{
|
||||
// encode the doc ids
|
||||
let block_encoded: &[u8] = self
|
||||
.block_encoder
|
||||
let block_encoded: &[u8] = self.block_encoder
|
||||
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
@@ -300,16 +298,14 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
// In that case, the remaining part is encoded
|
||||
// using variable int encoding.
|
||||
{
|
||||
let block_encoded = self
|
||||
.block_encoder
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.doc_ids.clear();
|
||||
}
|
||||
// ... Idem for term frequencies
|
||||
if self.termfreq_enabled {
|
||||
let block_encoded = self
|
||||
.block_encoder
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_unsorted(&self.term_freqs[..]);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.term_freqs.clear();
|
||||
|
||||
@@ -1,218 +0,0 @@
|
||||
use super::{Addr, MemoryArena};
|
||||
|
||||
use common::is_power_of_2;
|
||||
use std::mem;
|
||||
|
||||
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
|
||||
|
||||
const FIRST_BLOCK: u32 = 4u32;
|
||||
|
||||
#[inline]
|
||||
pub fn jump_needed(len: u32) -> Option<usize> {
|
||||
match len {
|
||||
0...3 => None,
|
||||
4...MAX_BLOCK_LEN => {
|
||||
if is_power_of_2(len as usize) {
|
||||
Some(len as usize)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
n => {
|
||||
if n % MAX_BLOCK_LEN == 0 {
|
||||
Some(MAX_BLOCK_LEN as usize)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An exponential unrolled link.
|
||||
///
|
||||
/// The use case is as follows. Tantivy's indexer conceptually acts like a
|
||||
/// `HashMap<Term, Vec<u32>>`. As we come accross a given term in document
|
||||
/// `D`, we lookup the term in the map and append the document id to its vector.
|
||||
///
|
||||
/// The vector is then only read when it is serialized.
|
||||
///
|
||||
/// The `ExpUnrolledLinkedList` offers a more efficient solution to this
|
||||
/// problem.
|
||||
///
|
||||
/// It combines the idea of the unrolled linked list and tries to address the
|
||||
/// problem of selecting an adequate block size using a strategy similar to
|
||||
/// that of the `Vec` amortized resize strategy.
|
||||
///
|
||||
/// Data is stored in a linked list of blocks. The first block has a size of `4`
|
||||
/// and each block has a length of twice that of the previous block up to
|
||||
/// `MAX_BLOCK_LEN = 32768`.
|
||||
///
|
||||
/// This strategy is a good trade off to handle numerous very rare terms
|
||||
/// and avoid wasting half of the memory for very frequent terms.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct ExpUnrolledLinkedList {
|
||||
len: u32,
|
||||
head: Addr,
|
||||
tail: Addr,
|
||||
}
|
||||
|
||||
impl ExpUnrolledLinkedList {
|
||||
pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList {
|
||||
let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
|
||||
ExpUnrolledLinkedList {
|
||||
len: 0u32,
|
||||
head: addr,
|
||||
tail: addr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> {
|
||||
ExpUnrolledLinkedListIterator {
|
||||
heap,
|
||||
addr: self.head,
|
||||
len: self.len,
|
||||
consumed: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Appends a new element to the current stack.
|
||||
///
|
||||
/// If the current block end is reached, a new block is allocated.
|
||||
pub fn push(&mut self, val: u32, heap: &mut MemoryArena) {
|
||||
self.len += 1;
|
||||
if let Some(new_block_len) = jump_needed(self.len) {
|
||||
// We need to allocate another block.
|
||||
// We also allocate an extra `u32` to store the pointer
|
||||
// to the future next block.
|
||||
let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
|
||||
let new_block_addr: Addr = heap.allocate_space(new_block_size);
|
||||
unsafe {
|
||||
// logic
|
||||
heap.write(self.tail, new_block_addr)
|
||||
};
|
||||
self.tail = new_block_addr;
|
||||
}
|
||||
unsafe {
|
||||
// logic
|
||||
heap.write(self.tail, val);
|
||||
self.tail = self.tail.offset(mem::size_of::<u32>() as u32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExpUnrolledLinkedListIterator<'a> {
|
||||
heap: &'a MemoryArena,
|
||||
addr: Addr,
|
||||
len: u32,
|
||||
consumed: u32,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.consumed == self.len {
|
||||
None
|
||||
} else {
|
||||
self.consumed += 1;
|
||||
let addr: Addr = if jump_needed(self.consumed).is_some() {
|
||||
unsafe {
|
||||
// logic
|
||||
self.heap.read(self.addr)
|
||||
}
|
||||
} else {
|
||||
self.addr
|
||||
};
|
||||
self.addr = addr.offset(mem::size_of::<u32>() as u32);
|
||||
Some(unsafe {
|
||||
// logic
|
||||
self.heap.read(addr)
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::super::MemoryArena;
|
||||
use super::jump_needed;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_stack() {
|
||||
let mut heap = MemoryArena::new();
|
||||
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
|
||||
stack.push(1u32, &mut heap);
|
||||
stack.push(2u32, &mut heap);
|
||||
stack.push(4u32, &mut heap);
|
||||
stack.push(8u32, &mut heap);
|
||||
{
|
||||
let mut it = stack.iter(&heap);
|
||||
assert_eq!(it.next().unwrap(), 1u32);
|
||||
assert_eq!(it.next().unwrap(), 2u32);
|
||||
assert_eq!(it.next().unwrap(), 4u32);
|
||||
assert_eq!(it.next().unwrap(), 8u32);
|
||||
assert!(it.next().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_jump_if_needed() {
|
||||
let mut block_len = 4u32;
|
||||
let mut i = 0;
|
||||
while i < 10_000_000 {
|
||||
assert!(jump_needed(i + block_len - 1).is_none());
|
||||
assert!(jump_needed(i + block_len + 1).is_none());
|
||||
assert!(jump_needed(i + block_len).is_some());
|
||||
let new_block_len = jump_needed(i + block_len).unwrap();
|
||||
i += block_len;
|
||||
block_len = new_block_len as u32;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::ExpUnrolledLinkedList;
|
||||
use tantivy_memory_arena::MemoryArena;
|
||||
use test::Bencher;
|
||||
|
||||
const NUM_STACK: usize = 10_000;
|
||||
const STACK_SIZE: u32 = 1000;
|
||||
|
||||
#[bench]
|
||||
fn bench_push_vec(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut vecs = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
vecs.push(Vec::new());
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
vecs[t].push(i);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_push_stack(bench: &mut Bencher) {
|
||||
let heap = MemoryArena::new();
|
||||
bench.iter(|| {
|
||||
let mut stacks = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
|
||||
stacks.push(stack);
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
stacks[t].push(i, &heap);
|
||||
}
|
||||
}
|
||||
heap.clear();
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -1,291 +0,0 @@
|
||||
//! 32-bits Memory arena for types implementing `Copy`.
|
||||
//! This Memory arena has been implemented to fit the use of tantivy's indexer
|
||||
//! and has *twisted specifications*.
|
||||
//!
|
||||
//! - It works on stable rust.
|
||||
//! - One can get an accurate figure of the memory usage of the arena.
|
||||
//! - Allocation are very cheap.
|
||||
//! - Allocation happening consecutively are very likely to have great locality.
|
||||
//! - Addresses (`Addr`) are 32bits.
|
||||
//! - Dropping the whole `MemoryArena` is cheap.
|
||||
//!
|
||||
//! # Limitations
|
||||
//!
|
||||
//! - Your object shall not implement `Drop`.
|
||||
//! - `Addr` to the `Arena` are 32-bits. The maximum capacity of the arena
|
||||
//! is 4GB. *(Tantivy's indexer uses one arena per indexing thread.)*
|
||||
//! - The arena only works for objects much smaller than `1MB`.
|
||||
//! Allocating more than `1MB` at a time will result in a panic,
|
||||
//! and allocating a lot of large object (> 500KB) will result in a fragmentation.
|
||||
//! - Your objects are store in an unaligned fashion. For this reason,
|
||||
//! the API does not let you access them as references.
|
||||
//!
|
||||
//! Instead, you store and access your data via `.write(...)` and `.read(...)`, which under the hood
|
||||
//! stores your object using `ptr::write_unaligned` and `ptr::read_unaligned`.
|
||||
use std::mem;
|
||||
use std::ptr;
|
||||
|
||||
const NUM_BITS_PAGE_ADDR: usize = 20;
|
||||
const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
|
||||
|
||||
/// Represents a pointer into the `MemoryArena`
|
||||
/// .
|
||||
/// Pointer are 32-bits and are split into
|
||||
/// two parts.
|
||||
///
|
||||
/// The first 12 bits represent the id of a
|
||||
/// page of memory.
|
||||
///
|
||||
/// The last 20 bits are an address within this page of memory.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct Addr(u32);
|
||||
|
||||
impl Addr {
|
||||
/// Creates a null pointer.
|
||||
pub fn null_pointer() -> Addr {
|
||||
Addr(u32::max_value())
|
||||
}
|
||||
|
||||
/// Returns the `Addr` object for `addr + offset`
|
||||
pub fn offset(&self, offset: u32) -> Addr {
|
||||
Addr(self.0.wrapping_add(offset))
|
||||
}
|
||||
|
||||
fn new(page_id: usize, local_addr: usize) -> Addr {
|
||||
Addr((page_id << NUM_BITS_PAGE_ADDR | local_addr) as u32)
|
||||
}
|
||||
|
||||
fn page_id(&self) -> usize {
|
||||
(self.0 as usize) >> NUM_BITS_PAGE_ADDR
|
||||
}
|
||||
|
||||
fn page_local_addr(&self) -> usize {
|
||||
(self.0 as usize) & (PAGE_SIZE - 1)
|
||||
}
|
||||
|
||||
/// Returns true if and only if the `Addr` is null.
|
||||
pub fn is_null(&self) -> bool {
|
||||
self.0 == u32::max_value()
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait required for an object to be `storable`.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// Most of the time you should not implement this trait,
|
||||
/// and only use the `MemoryArena` with object implementing `Copy`.
|
||||
///
|
||||
/// `ArenaStorable` is used in `tantivy` to force
|
||||
/// a `Copy` object and a `slice` of data to be stored contiguously.
|
||||
pub trait ArenaStorable {
|
||||
fn num_bytes(&self) -> usize;
|
||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr);
|
||||
}
|
||||
|
||||
impl<V> ArenaStorable for V
|
||||
where
|
||||
V: Copy,
|
||||
{
|
||||
fn num_bytes(&self) -> usize {
|
||||
mem::size_of::<V>()
|
||||
}
|
||||
|
||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
|
||||
let dst_ptr = arena.get_mut_ptr(addr) as *mut V;
|
||||
ptr::write_unaligned(dst_ptr, self);
|
||||
}
|
||||
}
|
||||
|
||||
/// The `MemoryArena`
|
||||
pub struct MemoryArena {
|
||||
pages: Vec<Page>,
|
||||
}
|
||||
|
||||
impl MemoryArena {
|
||||
/// Creates a new memory arena.
|
||||
pub fn new() -> MemoryArena {
|
||||
let first_page = Page::new(0);
|
||||
MemoryArena {
|
||||
pages: vec![first_page],
|
||||
}
|
||||
}
|
||||
|
||||
fn add_page(&mut self) -> &mut Page {
|
||||
let new_page_id = self.pages.len();
|
||||
self.pages.push(Page::new(new_page_id));
|
||||
&mut self.pages[new_page_id]
|
||||
}
|
||||
|
||||
/// Returns an estimate in number of bytes
|
||||
/// of resident memory consumed by the `MemoryArena`.
|
||||
///
|
||||
/// Internally, it counts a number of `1MB` pages
|
||||
/// and therefore delivers an upperbound.
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.pages.len() * PAGE_SIZE
|
||||
}
|
||||
|
||||
/// Writes a slice at the given address, assuming the
|
||||
/// memory was allocated beforehands.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic or corrupt the heap if he space was not
|
||||
/// properly allocated beforehands.
|
||||
pub fn write_bytes<B: AsRef<[u8]>>(&mut self, addr: Addr, data: B) {
|
||||
let bytes = data.as_ref();
|
||||
self.pages[addr.page_id()]
|
||||
.get_mut_slice(addr.page_local_addr(), bytes.len())
|
||||
.copy_from_slice(bytes);
|
||||
}
|
||||
|
||||
/// Returns the `len` bytes starting at `addr`
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the memory has not been allocated beforehands.
|
||||
pub fn read_slice(&self, addr: Addr, len: usize) -> &[u8] {
|
||||
self.pages[addr.page_id()].get_slice(addr.page_local_addr(), len)
|
||||
}
|
||||
|
||||
unsafe fn get_mut_ptr(&mut self, addr: Addr) -> *mut u8 {
|
||||
self.pages[addr.page_id()].get_mut_ptr(addr.page_local_addr())
|
||||
}
|
||||
|
||||
/// Stores an item's data in the heap
|
||||
///
|
||||
/// It allocates the `Item` beforehands.
|
||||
pub fn store<Item: ArenaStorable>(&mut self, val: Item) -> Addr {
|
||||
let num_bytes = val.num_bytes();
|
||||
let addr = self.allocate_space(num_bytes);
|
||||
unsafe {
|
||||
self.write(addr, val);
|
||||
};
|
||||
addr
|
||||
}
|
||||
|
||||
pub unsafe fn write<Item: ArenaStorable>(&mut self, addr: Addr, val: Item) {
|
||||
val.write_into(self, addr)
|
||||
}
|
||||
|
||||
/// Read an item in the heap at the given `address`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If the address is erroneous
|
||||
pub unsafe fn read<Item: Copy>(&self, addr: Addr) -> Item {
|
||||
let ptr = self.pages[addr.page_id()].get_ptr(addr.page_local_addr());
|
||||
ptr::read_unaligned(ptr as *const Item)
|
||||
}
|
||||
|
||||
/// Allocates `len` bytes and returns the allocated address.
|
||||
pub fn allocate_space(&mut self, len: usize) -> Addr {
|
||||
let page_id = self.pages.len() - 1;
|
||||
if let Some(addr) = self.pages[page_id].allocate_space(len) {
|
||||
return addr;
|
||||
}
|
||||
self.add_page().allocate_space(len).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
struct Page {
|
||||
page_id: usize,
|
||||
len: usize,
|
||||
data: Box<[u8]>,
|
||||
}
|
||||
|
||||
impl Page {
|
||||
fn new(page_id: usize) -> Page {
|
||||
let mut data: Vec<u8> = Vec::with_capacity(PAGE_SIZE);
|
||||
unsafe {
|
||||
data.set_len(PAGE_SIZE);
|
||||
} // avoid initializing page
|
||||
Page {
|
||||
page_id,
|
||||
len: 0,
|
||||
data: data.into_boxed_slice(),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_available(&self, len: usize) -> bool {
|
||||
len + self.len <= PAGE_SIZE
|
||||
}
|
||||
|
||||
fn get_mut_slice(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
|
||||
&mut self.data[local_addr..][..len]
|
||||
}
|
||||
|
||||
fn get_slice(&self, local_addr: usize, len: usize) -> &[u8] {
|
||||
&self.data[local_addr..][..len]
|
||||
}
|
||||
|
||||
fn allocate_space(&mut self, len: usize) -> Option<Addr> {
|
||||
if self.is_available(len) {
|
||||
let addr = Addr::new(self.page_id, self.len);
|
||||
self.len += len;
|
||||
Some(addr)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
|
||||
self.data.as_ptr().offset(addr as isize)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
|
||||
self.data.as_mut_ptr().offset(addr as isize)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::MemoryArena;
|
||||
|
||||
#[test]
|
||||
fn test_arena_allocate_slice() {
|
||||
let mut arena = MemoryArena::new();
|
||||
let a = b"hello";
|
||||
let b = b"happy tax payer";
|
||||
|
||||
let addr_a = arena.allocate_space(a.len());
|
||||
arena.write_bytes(addr_a, a);
|
||||
|
||||
let addr_b = arena.allocate_space(b.len());
|
||||
arena.write_bytes(addr_b, b);
|
||||
|
||||
assert_eq!(arena.read_slice(addr_a, a.len()), a);
|
||||
assert_eq!(arena.read_slice(addr_b, b.len()), b);
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
struct MyTest {
|
||||
pub a: usize,
|
||||
pub b: u8,
|
||||
pub c: u32,
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_store_object() {
|
||||
let mut arena = MemoryArena::new();
|
||||
let a = MyTest {
|
||||
a: 143,
|
||||
b: 21,
|
||||
c: 32,
|
||||
};
|
||||
let b = MyTest {
|
||||
a: 113,
|
||||
b: 221,
|
||||
c: 12,
|
||||
};
|
||||
let addr_a = arena.store(a);
|
||||
let addr_b = arena.store(b);
|
||||
assert_eq!(unsafe { arena.read::<MyTest>(addr_a) }, a);
|
||||
assert_eq!(unsafe { arena.read::<MyTest>(addr_b) }, b);
|
||||
}
|
||||
}
|
||||
@@ -1,9 +0,0 @@
|
||||
mod expull;
|
||||
mod memory_arena;
|
||||
mod murmurhash2;
|
||||
mod term_hashmap;
|
||||
|
||||
pub use self::expull::ExpUnrolledLinkedList;
|
||||
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
|
||||
use self::murmurhash2::murmurhash2;
|
||||
pub use self::term_hashmap::{compute_table_size, TermHashMap};
|
||||
@@ -1,86 +0,0 @@
|
||||
use std::ptr;
|
||||
const SEED: u32 = 3_242_157_231u32;
|
||||
const M: u32 = 0x5bd1_e995;
|
||||
|
||||
#[inline(always)]
|
||||
pub fn murmurhash2(key: &[u8]) -> u32 {
|
||||
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
|
||||
let len = key.len() as u32;
|
||||
let mut h: u32 = SEED ^ len;
|
||||
|
||||
let num_blocks = len >> 2;
|
||||
for _ in 0..num_blocks {
|
||||
let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
|
||||
k = k.wrapping_mul(M);
|
||||
k ^= k >> 24;
|
||||
k = k.wrapping_mul(M);
|
||||
h = h.wrapping_mul(M);
|
||||
h ^= k;
|
||||
key_ptr = key_ptr.wrapping_offset(1);
|
||||
}
|
||||
|
||||
// Handle the last few bytes of the input array
|
||||
let remaining: &[u8] = &key[key.len() & !3..];
|
||||
match remaining.len() {
|
||||
3 => {
|
||||
h ^= u32::from(remaining[2]) << 16;
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
2 => {
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
1 => {
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(M);
|
||||
h ^ (h >> 15)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::murmurhash2;
|
||||
use std::collections::HashSet;
|
||||
|
||||
#[test]
|
||||
fn test_murmur() {
|
||||
let s1 = "abcdef";
|
||||
let s2 = "abcdeg";
|
||||
for i in 0..5 {
|
||||
assert_eq!(
|
||||
murmurhash2(&s1[i..5].as_bytes()),
|
||||
murmurhash2(&s2[i..5].as_bytes())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_against_reference_impl() {
|
||||
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
|
||||
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
|
||||
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
|
||||
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
|
||||
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
|
||||
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
|
||||
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_collisions() {
|
||||
let mut set: HashSet<u32> = HashSet::default();
|
||||
for i in 0..10_000 {
|
||||
let s = format!("hash{}", i);
|
||||
let hash = murmurhash2(s.as_bytes());
|
||||
set.insert(hash);
|
||||
}
|
||||
assert_eq!(set.len(), 10_000);
|
||||
}
|
||||
}
|
||||
@@ -1,296 +0,0 @@
|
||||
use super::murmurhash2;
|
||||
use super::{Addr, ArenaStorable, MemoryArena};
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
use std::slice;
|
||||
|
||||
pub type BucketId = usize;
|
||||
|
||||
struct KeyBytesValue<'a, V> {
|
||||
key: &'a [u8],
|
||||
value: V,
|
||||
}
|
||||
|
||||
impl<'a, V> KeyBytesValue<'a, V> {
|
||||
fn new(key: &'a [u8], value: V) -> KeyBytesValue<'a, V> {
|
||||
KeyBytesValue { key, value }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V> ArenaStorable for KeyBytesValue<'a, V>
|
||||
where
|
||||
V: ArenaStorable,
|
||||
{
|
||||
fn num_bytes(&self) -> usize {
|
||||
0u16.num_bytes() + self.key.len() + self.value.num_bytes()
|
||||
}
|
||||
|
||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
|
||||
arena.write(addr, self.key.len() as u16);
|
||||
arena.write_bytes(addr.offset(2), self.key);
|
||||
arena.write(addr.offset(2 + self.key.len() as u32), self.value);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the actual memory size in bytes
|
||||
/// required to create a table of size $2^num_bits$.
|
||||
pub fn compute_table_size(num_bits: usize) -> usize {
|
||||
(1 << num_bits) * mem::size_of::<KeyValue>()
|
||||
}
|
||||
|
||||
/// `KeyValue` is the item stored in the hash table.
|
||||
/// The key is actually a `BytesRef` object stored in an external heap.
|
||||
/// The `value_addr` also points to an address in the heap.
|
||||
///
|
||||
/// The key and the value are actually stored contiguously.
|
||||
/// For this reason, the (start, stop) information is actually redundant
|
||||
/// and can be simplified in the future
|
||||
#[derive(Copy, Clone)]
|
||||
struct KeyValue {
|
||||
key_value_addr: Addr,
|
||||
hash: u32,
|
||||
}
|
||||
|
||||
impl Default for KeyValue {
|
||||
fn default() -> Self {
|
||||
KeyValue {
|
||||
key_value_addr: Addr::null_pointer(),
|
||||
hash: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl KeyValue {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.key_value_addr.is_null()
|
||||
}
|
||||
}
|
||||
|
||||
/// Customized `HashMap` with string keys
|
||||
///
|
||||
/// This `HashMap` takes String as keys. Keys are
|
||||
/// stored in a user defined heap.
|
||||
///
|
||||
/// The quirky API has the benefit of avoiding
|
||||
/// the computation of the hash of the key twice,
|
||||
/// or copying the key as long as there is no insert.
|
||||
///
|
||||
pub struct TermHashMap {
|
||||
table: Box<[KeyValue]>,
|
||||
pub heap: MemoryArena,
|
||||
mask: usize,
|
||||
occupied: Vec<usize>,
|
||||
}
|
||||
|
||||
struct QuadraticProbing {
|
||||
hash: usize,
|
||||
i: usize,
|
||||
mask: usize,
|
||||
}
|
||||
|
||||
impl QuadraticProbing {
|
||||
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
|
||||
QuadraticProbing { hash, i: 0, mask }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next_probe(&mut self) -> usize {
|
||||
self.i += 1;
|
||||
(self.hash + self.i) & self.mask
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Iter<'a> {
|
||||
hashmap: &'a TermHashMap,
|
||||
inner: slice::Iter<'a, usize>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Iter<'a> {
|
||||
type Item = (&'a [u8], Addr, BucketId);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.inner.next().cloned().map(move |bucket: usize| {
|
||||
let kv = self.hashmap.table[bucket];
|
||||
let (key, offset): (&'a [u8], Addr) =
|
||||
unsafe { self.hashmap.get_key_value(kv.key_value_addr) };
|
||||
(key, offset, bucket as BucketId)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TermHashMap {
|
||||
pub fn new(num_bucket_power_of_2: usize) -> TermHashMap {
|
||||
let heap = MemoryArena::new();
|
||||
let table_size = 1 << num_bucket_power_of_2;
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
|
||||
TermHashMap {
|
||||
table: table.into_boxed_slice(),
|
||||
heap,
|
||||
mask: table_size - 1,
|
||||
occupied: Vec::with_capacity(table_size / 2),
|
||||
}
|
||||
}
|
||||
|
||||
fn probe(&self, hash: u32) -> QuadraticProbing {
|
||||
QuadraticProbing::compute(hash as usize, self.mask)
|
||||
}
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.table.len() * mem::size_of::<KeyValue>()
|
||||
}
|
||||
|
||||
fn is_saturated(&self) -> bool {
|
||||
self.table.len() < self.occupied.len() * 3
|
||||
}
|
||||
|
||||
unsafe fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
|
||||
let key_bytes_len = self.heap.read::<u16>(addr) as usize;
|
||||
let key_addr = addr.offset(2u32);
|
||||
let key_bytes: &[u8] = self.heap.read_slice(key_addr, key_bytes_len);
|
||||
let val_addr: Addr = key_addr.offset(key_bytes.len() as u32);
|
||||
(key_bytes, val_addr)
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
|
||||
self.occupied.push(bucket);
|
||||
self.table[bucket] = KeyValue {
|
||||
key_value_addr,
|
||||
hash,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Iter {
|
||||
Iter {
|
||||
inner: self.occupied.iter(),
|
||||
hashmap: &self,
|
||||
}
|
||||
}
|
||||
|
||||
fn resize(&mut self) {
|
||||
let new_len = self.table.len() * 2;
|
||||
let mask = new_len - 1;
|
||||
self.mask = mask;
|
||||
let new_table = vec![KeyValue::default(); new_len].into_boxed_slice();
|
||||
let old_table = mem::replace(&mut self.table, new_table);
|
||||
for old_pos in self.occupied.iter_mut() {
|
||||
let key_value: KeyValue = old_table[*old_pos];
|
||||
let mut probe = QuadraticProbing::compute(key_value.hash as usize, mask);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
if self.table[bucket].is_empty() {
|
||||
*old_pos = bucket;
|
||||
self.table[bucket] = key_value;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// `update` create a new entry for a given key if it does not exists
|
||||
/// or updates the existing entry.
|
||||
///
|
||||
/// The actual logic for this update is define in the the `updater`
|
||||
/// argument.
|
||||
///
|
||||
/// If the key is not present, `updater` will receive `None` and
|
||||
/// will be in charge of returning a default value.
|
||||
/// If the key already as an associated value, then it will be passed
|
||||
/// `Some(previous_value)`.
|
||||
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
|
||||
where
|
||||
S: AsRef<[u8]>,
|
||||
V: Copy,
|
||||
TMutator: FnMut(Option<V>) -> V,
|
||||
{
|
||||
if self.is_saturated() {
|
||||
self.resize();
|
||||
}
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let hash = murmurhash2::murmurhash2(key.as_ref());
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
let val = updater(None);
|
||||
let key_addr = self.heap.store(KeyBytesValue::new(key_bytes, val));
|
||||
self.set_bucket(hash, key_addr, bucket);
|
||||
return bucket as BucketId;
|
||||
} else if kv.hash == hash {
|
||||
let (key_matches, val_addr) = {
|
||||
let (stored_key, val_addr): (&[u8], Addr) =
|
||||
unsafe { self.get_key_value(kv.key_value_addr) };
|
||||
(stored_key == key_bytes, val_addr)
|
||||
};
|
||||
if key_matches {
|
||||
unsafe {
|
||||
// logic
|
||||
let v = self.heap.read(val_addr);
|
||||
let new_v = updater(Some(v));
|
||||
self.heap.write(val_addr, new_v);
|
||||
};
|
||||
return bucket as BucketId;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn bench_murmurhash2(b: &mut Bencher) {
|
||||
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
||||
b.iter(|| {
|
||||
let mut s = 0;
|
||||
for &key in &keys {
|
||||
s ^= murmurhash2(key.as_bytes());
|
||||
}
|
||||
s
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::TermHashMap;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[test]
|
||||
fn test_hash_map() {
|
||||
let mut hash_map: TermHashMap = TermHashMap::new(18);
|
||||
{
|
||||
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
3u32
|
||||
});
|
||||
}
|
||||
{
|
||||
hash_map.mutate_or_create("abcd", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
4u32
|
||||
});
|
||||
}
|
||||
{
|
||||
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, Some(3u32));
|
||||
5u32
|
||||
});
|
||||
}
|
||||
|
||||
let mut vanilla_hash_map = HashMap::new();
|
||||
let mut iter_values = hash_map.iter();
|
||||
while let Some((key, addr, _)) = iter_values.next() {
|
||||
let val: u32 = unsafe {
|
||||
// test
|
||||
hash_map.heap.read(addr)
|
||||
};
|
||||
vanilla_hash_map.insert(key.to_owned(), val);
|
||||
}
|
||||
assert_eq!(vanilla_hash_map.len(), 2);
|
||||
}
|
||||
}
|
||||
@@ -1,59 +0,0 @@
|
||||
use common::BitSet;
|
||||
use core::SegmentReader;
|
||||
use fst::Automaton;
|
||||
use query::BitSetDocSet;
|
||||
use query::ConstScorer;
|
||||
use query::{Scorer, Weight};
|
||||
use schema::{Field, IndexRecordOption};
|
||||
use termdict::{TermDictionary, TermStreamer};
|
||||
use Result;
|
||||
|
||||
/// A weight struct for Fuzzy Term and Regex Queries
|
||||
pub struct AutomatonWeight<A>
|
||||
where
|
||||
A: Automaton,
|
||||
{
|
||||
field: Field,
|
||||
automaton: A,
|
||||
}
|
||||
|
||||
impl<A> AutomatonWeight<A>
|
||||
where
|
||||
A: Automaton,
|
||||
{
|
||||
/// Create a new AutomationWeight
|
||||
pub fn new(field: Field, automaton: A) -> AutomatonWeight<A> {
|
||||
AutomatonWeight { field, automaton }
|
||||
}
|
||||
|
||||
fn automaton_stream<'a>(&'a self, term_dict: &'a TermDictionary) -> TermStreamer<'a, &'a A> {
|
||||
let term_stream_builder = term_dict.search(&self.automaton);
|
||||
term_stream_builder.into_stream()
|
||||
}
|
||||
}
|
||||
|
||||
impl<A> Weight for AutomatonWeight<A>
|
||||
where
|
||||
A: Automaton,
|
||||
{
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
let max_doc = reader.max_doc();
|
||||
let mut doc_bitset = BitSet::with_max_value(max_doc);
|
||||
|
||||
let inverted_index = reader.inverted_index(self.field);
|
||||
let term_dict = inverted_index.terms();
|
||||
let mut term_stream = self.automaton_stream(term_dict);
|
||||
while term_stream.advance() {
|
||||
let term_info = term_stream.value();
|
||||
let mut block_segment_postings = inverted_index
|
||||
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic);
|
||||
while block_segment_postings.advance() {
|
||||
for &doc in block_segment_postings.docs() {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
let doc_bitset = BitSetDocSet::from(doc_bitset);
|
||||
Ok(Box::new(ConstScorer::new(doc_bitset)))
|
||||
}
|
||||
}
|
||||
@@ -41,8 +41,7 @@ impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {
|
||||
|
||||
impl Query for BooleanQuery {
|
||||
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
|
||||
let sub_weights = self
|
||||
.subqueries
|
||||
let sub_weights = self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref occur, ref subquery)| {
|
||||
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
|
||||
|
||||
@@ -1,162 +0,0 @@
|
||||
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
|
||||
use query::{AutomatonWeight, Query, Weight};
|
||||
use schema::Term;
|
||||
use std::collections::HashMap;
|
||||
use Result;
|
||||
use Searcher;
|
||||
|
||||
lazy_static! {
|
||||
static ref LEV_BUILDER: HashMap<(u8, bool), LevenshteinAutomatonBuilder> = {
|
||||
let mut lev_builder_cache = HashMap::new();
|
||||
// TODO make population lazy on a `(distance, val)` basis
|
||||
for distance in 0..3 {
|
||||
for &transposition in [false, true].iter() {
|
||||
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(distance, transposition);
|
||||
lev_builder_cache.insert((distance, transposition), lev_automaton_builder);
|
||||
}
|
||||
}
|
||||
lev_builder_cache
|
||||
};
|
||||
}
|
||||
|
||||
/// A Fuzzy Query matches all of the documents
|
||||
/// containing a specific term that is within
|
||||
/// Levenshtein distance
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result, Term};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, chain};
|
||||
/// use tantivy::query::FuzzyTermQuery;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
/// let term = Term::from_field_text(title, "Diary");
|
||||
/// let query = FuzzyTermQuery::new(term, 1, true);
|
||||
/// searcher.search(&query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FuzzyTermQuery {
|
||||
/// What term are we searching
|
||||
term: Term,
|
||||
/// How many changes are we going to allow
|
||||
distance: u8,
|
||||
/// Should a transposition cost 1 or 2?
|
||||
transposition_cost_one: bool,
|
||||
///
|
||||
prefix: bool,
|
||||
}
|
||||
|
||||
impl FuzzyTermQuery {
|
||||
/// Creates a new Fuzzy Query
|
||||
pub fn new(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
|
||||
FuzzyTermQuery {
|
||||
term,
|
||||
distance,
|
||||
transposition_cost_one,
|
||||
prefix: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new Fuzzy Query that treats transpositions as cost one rather than two
|
||||
pub fn new_prefix(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
|
||||
FuzzyTermQuery {
|
||||
term,
|
||||
distance,
|
||||
transposition_cost_one,
|
||||
prefix: true,
|
||||
}
|
||||
}
|
||||
|
||||
fn specialized_weight(&self) -> Result<AutomatonWeight<DFA>> {
|
||||
let automaton = LEV_BUILDER.get(&(self.distance, false))
|
||||
.unwrap() // TODO return an error
|
||||
.build_dfa(self.term.text());
|
||||
Ok(AutomatonWeight::new(self.term.field(), automaton))
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for FuzzyTermQuery {
|
||||
fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<Weight>> {
|
||||
Ok(Box::new(self.specialized_weight()?))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::FuzzyTermQuery;
|
||||
use collector::TopCollector;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::TEXT;
|
||||
use tests::assert_nearly_equals;
|
||||
use Index;
|
||||
use Term;
|
||||
|
||||
#[test]
|
||||
pub fn test_fuzzy_term() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let country_field = schema_builder.add_text_field("country", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
|
||||
index_writer.add_document(doc!(
|
||||
country_field => "japan",
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
country_field => "korea",
|
||||
));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
{
|
||||
let mut collector = TopCollector::with_limit(2);
|
||||
let term = Term::from_field_text(country_field, "japon");
|
||||
|
||||
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
|
||||
searcher.search(&fuzzy_query, &mut collector).unwrap();
|
||||
let scored_docs = collector.score_docs();
|
||||
assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
|
||||
let (score, _) = scored_docs[0];
|
||||
assert_nearly_equals(1f32, score);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -228,8 +228,7 @@ where
|
||||
TOtherScorer: Scorer,
|
||||
{
|
||||
fn score(&mut self) -> Score {
|
||||
self.left.score()
|
||||
+ self.right.score()
|
||||
self.left.score() + self.right.score()
|
||||
+ self.others.iter_mut().map(Scorer::score).sum::<Score>()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,19 +3,16 @@ Query
|
||||
*/
|
||||
|
||||
mod all_query;
|
||||
mod automaton_weight;
|
||||
mod bitset;
|
||||
mod bm25;
|
||||
mod boolean_query;
|
||||
mod exclude;
|
||||
mod fuzzy_query;
|
||||
mod intersection;
|
||||
mod occur;
|
||||
mod phrase_query;
|
||||
mod query;
|
||||
mod query_parser;
|
||||
mod range_query;
|
||||
mod regex_query;
|
||||
mod reqopt_scorer;
|
||||
mod scorer;
|
||||
mod term_query;
|
||||
@@ -34,11 +31,9 @@ pub use self::union::Union;
|
||||
pub use self::vec_docset::VecDocSet;
|
||||
|
||||
pub use self::all_query::{AllQuery, AllScorer, AllWeight};
|
||||
pub use self::automaton_weight::AutomatonWeight;
|
||||
pub use self::bitset::BitSetDocSet;
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
pub use self::exclude::Exclude;
|
||||
pub use self::fuzzy_query::FuzzyTermQuery;
|
||||
pub use self::intersection::intersect_scorers;
|
||||
pub use self::occur::Occur;
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
@@ -46,7 +41,6 @@ pub use self::query::Query;
|
||||
pub use self::query_parser::QueryParser;
|
||||
pub use self::query_parser::QueryParserError;
|
||||
pub use self::range_query::RangeQuery;
|
||||
pub use self::regex_query::RegexQuery;
|
||||
pub use self::reqopt_scorer::RequiredOptionalScorer;
|
||||
pub use self::scorer::ConstScorer;
|
||||
pub use self::scorer::EmptyScorer;
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
use super::Weight;
|
||||
use collector::Collector;
|
||||
use core::searcher::Searcher;
|
||||
use downcast;
|
||||
use std::fmt;
|
||||
use Result;
|
||||
use SegmentLocalId;
|
||||
|
||||
/// The `Query` trait defines a set of documents and a scoring method
|
||||
/// for those documents.
|
||||
@@ -57,26 +55,6 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug {
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Search works as follows :
|
||||
///
|
||||
/// First the weight object associated to the query is created.
|
||||
///
|
||||
/// Then, the query loops over the segments and for each segment :
|
||||
/// - setup the collector and informs it that the segment being processed has changed.
|
||||
/// - creates a `Scorer` object associated for this segment
|
||||
/// - iterate throw the matched documents and push them to the collector.
|
||||
///
|
||||
fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result<()> {
|
||||
let scoring_enabled = collector.requires_scoring();
|
||||
let weight = self.weight(searcher, scoring_enabled)?;
|
||||
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
|
||||
collector.set_segment(segment_ord as SegmentLocalId, segment_reader)?;
|
||||
let mut scorer = weight.scorer(segment_reader)?;
|
||||
scorer.collect(collector, segment_reader.delete_bitset());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub trait QueryClone {
|
||||
|
||||
@@ -41,8 +41,7 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
|
||||
/// # extern crate tantivy;
|
||||
/// # use tantivy::Index;
|
||||
/// # use tantivy::schema::{SchemaBuilder, INT_INDEXED};
|
||||
/// # use tantivy::collector::CountCollector;
|
||||
/// # use tantivy::query::Query;
|
||||
/// # use tantivy::collector::{Collector, CountCollector};
|
||||
/// # use tantivy::Result;
|
||||
/// # use tantivy::query::RangeQuery;
|
||||
/// #
|
||||
@@ -68,7 +67,7 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
|
||||
/// let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970);
|
||||
///
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// docs_in_the_sixties.search(&*searcher, &mut count_collector)?;
|
||||
/// count_collector.search(&*searcher, &docs_in_the_sixties)?;
|
||||
///
|
||||
/// let num_60s_books = count_collector.count();
|
||||
///
|
||||
@@ -195,16 +194,12 @@ impl RangeQuery {
|
||||
|
||||
/// Lower bound of range
|
||||
pub fn left_bound(&self) -> Bound<Term> {
|
||||
map_bound(&self.left_bound, &|bytes| {
|
||||
Term::from_field_bytes(self.field, bytes)
|
||||
})
|
||||
map_bound(&self.left_bound, &|bytes| Term::from_field_bytes(self.field, bytes))
|
||||
}
|
||||
|
||||
/// Upper bound of range
|
||||
pub fn right_bound(&self) -> Bound<Term> {
|
||||
map_bound(&self.right_bound, &|bytes| {
|
||||
Term::from_field_bytes(self.field, bytes)
|
||||
})
|
||||
map_bound(&self.right_bound, &|bytes| Term::from_field_bytes(self.field, bytes))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -278,8 +273,7 @@ impl Weight for RangeWeight {
|
||||
mod tests {
|
||||
|
||||
use super::RangeQuery;
|
||||
use collector::CountCollector;
|
||||
use query::Query;
|
||||
use collector::{Collector, CountCollector};
|
||||
use schema::{Document, Field, SchemaBuilder, INT_INDEXED};
|
||||
use std::collections::Bound;
|
||||
use Index;
|
||||
@@ -310,7 +304,7 @@ mod tests {
|
||||
|
||||
// ... or `1960..=1969` if inclusive range is enabled.
|
||||
let mut count_collector = CountCollector::default();
|
||||
docs_in_the_sixties.search(&*searcher, &mut count_collector)?;
|
||||
count_collector.search(&*searcher, &docs_in_the_sixties)?;
|
||||
assert_eq!(count_collector.count(), 2285);
|
||||
Ok(())
|
||||
}
|
||||
@@ -347,9 +341,7 @@ mod tests {
|
||||
let searcher = index.searcher();
|
||||
let count_multiples = |range_query: RangeQuery| {
|
||||
let mut count_collector = CountCollector::default();
|
||||
range_query
|
||||
.search(&*searcher, &mut count_collector)
|
||||
.unwrap();
|
||||
count_collector.search(&*searcher, &range_query).unwrap();
|
||||
count_collector.count()
|
||||
};
|
||||
|
||||
|
||||
@@ -1,143 +0,0 @@
|
||||
use error::ErrorKind;
|
||||
use fst_regex::Regex;
|
||||
use query::{AutomatonWeight, Query, Weight};
|
||||
use schema::Field;
|
||||
use std::clone::Clone;
|
||||
use Result;
|
||||
use Searcher;
|
||||
|
||||
// A Regex Query matches all of the documents
|
||||
/// containing a specific term that matches
|
||||
/// a regex pattern
|
||||
/// A Fuzzy Query matches all of the documents
|
||||
/// containing a specific term that is within
|
||||
/// Levenshtein distance
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result, Term};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, chain};
|
||||
/// use tantivy::query::RegexQuery;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
/// let term = Term::from_field_text(title, "Diary");
|
||||
/// let query = RegexQuery::new("d[ai]{2}ry".to_string(), title);
|
||||
/// searcher.search(&query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 3);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RegexQuery {
|
||||
regex_pattern: String,
|
||||
field: Field,
|
||||
}
|
||||
|
||||
impl RegexQuery {
|
||||
/// Creates a new Fuzzy Query
|
||||
pub fn new(regex_pattern: String, field: Field) -> RegexQuery {
|
||||
RegexQuery {
|
||||
regex_pattern,
|
||||
field,
|
||||
}
|
||||
}
|
||||
|
||||
fn specialized_weight(&self) -> Result<AutomatonWeight<Regex>> {
|
||||
let automaton = Regex::new(&self.regex_pattern)
|
||||
.map_err(|_| ErrorKind::InvalidArgument(self.regex_pattern.clone()))?;
|
||||
|
||||
Ok(AutomatonWeight::new(self.field.clone(), automaton))
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for RegexQuery {
|
||||
fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<Weight>> {
|
||||
Ok(Box::new(self.specialized_weight()?))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::RegexQuery;
|
||||
use collector::TopCollector;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::TEXT;
|
||||
use tests::assert_nearly_equals;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
pub fn test_regex_query() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let country_field = schema_builder.add_text_field("country", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
|
||||
index_writer.add_document(doc!(
|
||||
country_field => "japan",
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
country_field => "korea",
|
||||
));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
{
|
||||
let mut collector = TopCollector::with_limit(2);
|
||||
|
||||
let regex_query = RegexQuery::new("jap[ao]n".to_string(), country_field);
|
||||
searcher.search(®ex_query, &mut collector).unwrap();
|
||||
let scored_docs = collector.score_docs();
|
||||
assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
|
||||
let (score, _) = scored_docs[0];
|
||||
assert_nearly_equals(1f32, score);
|
||||
}
|
||||
|
||||
let searcher = index.searcher();
|
||||
{
|
||||
let mut collector = TopCollector::with_limit(2);
|
||||
|
||||
let regex_query = RegexQuery::new("jap[A-Z]n".to_string(), country_field);
|
||||
searcher.search(®ex_query, &mut collector).unwrap();
|
||||
let scored_docs = collector.score_docs();
|
||||
assert_eq!(scored_docs.len(), 0, "Expected ZERO document");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
use collector::Collector;
|
||||
use collector::SegmentCollector;
|
||||
use common::BitSet;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use downcast;
|
||||
@@ -18,7 +18,7 @@ pub trait Scorer: downcast::Any + DocSet + 'static {
|
||||
|
||||
/// Consumes the complete `DocSet` and
|
||||
/// push the scored documents to the collector.
|
||||
fn collect(&mut self, collector: &mut Collector, delete_bitset_opt: Option<&DeleteBitSet>) {
|
||||
fn collect<T>(&mut self, collector: &mut SegmentCollector<CollectionResult = T>, delete_bitset_opt: Option<&DeleteBitSet>) {
|
||||
if let Some(delete_bitset) = delete_bitset_opt {
|
||||
while self.advance() {
|
||||
let doc = self.doc();
|
||||
@@ -44,7 +44,7 @@ impl Scorer for Box<Scorer> {
|
||||
self.deref_mut().score()
|
||||
}
|
||||
|
||||
fn collect(&mut self, collector: &mut Collector, delete_bitset: Option<&DeleteBitSet>) {
|
||||
fn collect<T>(&mut self, collector: &mut SegmentCollector<CollectionResult = T>, delete_bitset: Option<&DeleteBitSet>) {
|
||||
let scorer = self.deref_mut();
|
||||
scorer.collect(collector, delete_bitset);
|
||||
}
|
||||
|
||||
@@ -16,59 +16,6 @@ use Term;
|
||||
/// * `idf` - inverse document frequency.
|
||||
/// * `term_freq` - number of occurrences of the term in the field
|
||||
/// * `field norm` - number of tokens in the field.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT, IndexRecordOption};
|
||||
/// use tantivy::{Index, Result, Term};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, chain};
|
||||
/// use tantivy::query::TermQuery;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit()?;
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
/// let query = TermQuery::new(
|
||||
/// Term::from_field_text(title, "diary"),
|
||||
/// IndexRecordOption::Basic,
|
||||
/// );
|
||||
/// searcher.search(&query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct TermQuery {
|
||||
term: Term,
|
||||
|
||||
@@ -6,7 +6,7 @@ use Result;
|
||||
/// for a given set of segments.
|
||||
///
|
||||
/// See [`Query`](./trait.Query.html).
|
||||
pub trait Weight {
|
||||
pub trait Weight: Send + Sync + 'static {
|
||||
/// Returns the scorer for the given segment.
|
||||
/// See [`Query`](./trait.Query.html).
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
extern crate lz4;
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
compressed.clear();
|
||||
let mut encoder = lz4::EncoderBuilder::new().build(compressed)?;
|
||||
encoder.write_all(&uncompressed)?;
|
||||
let (_, encoder_result) = encoder.finish();
|
||||
encoder_result?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
decompressed.clear();
|
||||
let mut decoder = lz4::Decoder::new(compressed)?;
|
||||
decoder.read_to_end(decompressed)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
extern crate snap;
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
compressed.clear();
|
||||
let mut encoder = snap::Writer::new(compressed);
|
||||
encoder.write_all(&uncompressed)?;
|
||||
encoder.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
decompressed.clear();
|
||||
snap::Reader::new(compressed).read_to_end(decompressed)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -34,21 +34,10 @@ and should rely on either
|
||||
!*/
|
||||
|
||||
mod reader;
|
||||
mod skiplist;
|
||||
mod writer;
|
||||
pub use self::reader::StoreReader;
|
||||
pub use self::writer::StoreWriter;
|
||||
|
||||
#[cfg(feature = "lz4")]
|
||||
mod compression_lz4;
|
||||
#[cfg(feature = "lz4")]
|
||||
use self::compression_lz4::*;
|
||||
|
||||
#[cfg(not(feature = "lz4"))]
|
||||
mod compression_snap;
|
||||
#[cfg(not(feature = "lz4"))]
|
||||
use self::compression_snap::*;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use Result;
|
||||
|
||||
use super::decompress;
|
||||
use super::skiplist::SkipList;
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use datastruct::SkipList;
|
||||
use directory::ReadOnlySource;
|
||||
use lz4;
|
||||
use schema::Document;
|
||||
use std::cell::RefCell;
|
||||
use std::io;
|
||||
use std::io::{self, Read};
|
||||
use std::mem::size_of;
|
||||
use DocId;
|
||||
|
||||
@@ -61,7 +61,9 @@ impl StoreReader {
|
||||
let mut current_block_mut = self.current_block.borrow_mut();
|
||||
current_block_mut.clear();
|
||||
let compressed_block = self.compressed_block(block_offset);
|
||||
decompress(compressed_block, &mut current_block_mut)?;
|
||||
let mut lz4_decoder = lz4::Decoder::new(compressed_block)?;
|
||||
*self.current_block_offset.borrow_mut() = usize::max_value();
|
||||
lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ())?;
|
||||
*self.current_block_offset.borrow_mut() = block_offset;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use super::compress;
|
||||
use super::skiplist::SkipListBuilder;
|
||||
use super::StoreReader;
|
||||
use common::CountingWriter;
|
||||
use common::{BinarySerializable, VInt};
|
||||
use datastruct::SkipListBuilder;
|
||||
use directory::WritePtr;
|
||||
use lz4;
|
||||
use schema::Document;
|
||||
use std::io::{self, Write};
|
||||
use DocId;
|
||||
@@ -87,7 +87,12 @@ impl StoreWriter {
|
||||
|
||||
fn write_and_compress_block(&mut self) -> io::Result<()> {
|
||||
self.intermediary_buffer.clear();
|
||||
compress(&self.current_block[..], &mut self.intermediary_buffer)?;
|
||||
{
|
||||
let mut encoder = lz4::EncoderBuilder::new().build(&mut self.intermediary_buffer)?;
|
||||
encoder.write_all(&self.current_block)?;
|
||||
let (_, encoder_result) = encoder.finish();
|
||||
encoder_result?;
|
||||
}
|
||||
(self.intermediary_buffer.len() as u32).serialize(&mut self.writer)?;
|
||||
self.writer.write_all(&self.intermediary_buffer)?;
|
||||
self.offset_index_writer
|
||||
|
||||
@@ -94,7 +94,7 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
|
||||
let bit_shift = (addr_bits % 8) as u64;
|
||||
assert!(data.len() >= addr_byte + 8);
|
||||
let val_unshifted_unmasked: u64 = unsafe {
|
||||
// ok thanks to the 7 byte padding on `.close`
|
||||
//< ok : check len above
|
||||
let addr = data.as_ptr().offset(addr_byte as isize) as *const u64;
|
||||
ptr::read_unaligned(addr)
|
||||
};
|
||||
|
||||
@@ -164,8 +164,7 @@ impl TermDictionary {
|
||||
let fst = self.fst_index.as_fst();
|
||||
let mut node = fst.root();
|
||||
while ord != 0 || !node.is_final() {
|
||||
if let Some(transition) = node
|
||||
.transitions()
|
||||
if let Some(transition) = node.transitions()
|
||||
.take_while(|transition| transition.out.value() <= ord)
|
||||
.last()
|
||||
{
|
||||
@@ -204,7 +203,7 @@ impl TermDictionary {
|
||||
|
||||
/// Returns a search builder, to stream all of the terms
|
||||
/// within the Automaton
|
||||
pub fn search<'a, A: Automaton + 'a>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
|
||||
pub fn search<'a, A: Automaton>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
|
||||
let stream_builder = self.fst_index.search(automaton);
|
||||
TermStreamerBuilder::<A>::new(self, stream_builder)
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@ impl Default for Token {
|
||||
offset_from: 0,
|
||||
offset_to: 0,
|
||||
position: usize::max_value(),
|
||||
text: String::with_capacity(200),
|
||||
text: String::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user