mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-29 21:42:55 +00:00
Compare commits
59 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c1022e23d2 | ||
|
|
8ccbfdea5d | ||
|
|
badfce3a23 | ||
|
|
e301e0bc87 | ||
|
|
317baf4e75 | ||
|
|
24398d94e4 | ||
|
|
360f4132eb | ||
|
|
2b8f02764b | ||
|
|
0465876854 | ||
|
|
6f7b099370 | ||
|
|
84f5cc4388 | ||
|
|
75aae0d2c2 | ||
|
|
009a3559be | ||
|
|
7a31669e9d | ||
|
|
5185eb790b | ||
|
|
a3dffbf1c6 | ||
|
|
857a5794d8 | ||
|
|
b0a6fc1448 | ||
|
|
989d52bea4 | ||
|
|
09661ea7ec | ||
|
|
b59132966f | ||
|
|
863d3411bc | ||
|
|
8a55d133ab | ||
|
|
432d49d814 | ||
|
|
0cea706f10 | ||
|
|
71d41ca209 | ||
|
|
bc69dab822 | ||
|
|
72acad0921 | ||
|
|
c9459f74e8 | ||
|
|
08d2cc6c7b | ||
|
|
82d87416c2 | ||
|
|
96b2c2971e | ||
|
|
162afd73f6 | ||
|
|
ddfd87fa59 | ||
|
|
24050d0eb5 | ||
|
|
89eb209ece | ||
|
|
9a0b7f9855 | ||
|
|
8e343b1ca3 | ||
|
|
99c0b84036 | ||
|
|
ca74c14647 | ||
|
|
68ee18e4e8 | ||
|
|
5637657c2f | ||
|
|
2e3c9a8878 | ||
|
|
78673172d0 | ||
|
|
175b76f119 | ||
|
|
9b79e21bd7 | ||
|
|
5e38ae336f | ||
|
|
8604351f59 | ||
|
|
6a48953d8a | ||
|
|
0804b42afa | ||
|
|
8083bc6eef | ||
|
|
0156f88265 | ||
|
|
a1c07bf457 | ||
|
|
9de74b68d1 | ||
|
|
57c7073867 | ||
|
|
121374b89b | ||
|
|
e44782bf14 | ||
|
|
dfafb24fa6 | ||
|
|
4c6f9541e9 |
154
.travis.yml
154
.travis.yml
@@ -1,37 +1,127 @@
|
||||
# Based on the "trust" template v0.1.2
|
||||
# https://github.com/japaric/trust/tree/v0.1.2
|
||||
|
||||
dist: trusty
|
||||
language: rust
|
||||
services: docker
|
||||
sudo: required
|
||||
cache: cargo
|
||||
rust:
|
||||
- nightly
|
||||
|
||||
env:
|
||||
global:
|
||||
- CC=gcc-4.8
|
||||
- CXX=g++-4.8
|
||||
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
|
||||
- secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
addons:
|
||||
apt:
|
||||
sources:
|
||||
- ubuntu-toolchain-r-test
|
||||
- kalakris-cmake
|
||||
packages:
|
||||
- gcc-4.8
|
||||
- g++-4.8
|
||||
- libcurl4-openssl-dev
|
||||
- libelf-dev
|
||||
- libdw-dev
|
||||
- binutils-dev
|
||||
- cmake
|
||||
before_script:
|
||||
- export PATH=$HOME/.cargo/bin:$PATH
|
||||
- cargo install cargo-update || echo "cargo-update already installed"
|
||||
- cargo install cargo-travis || echo "cargo-travis already installed"
|
||||
- CRATE_NAME=tantivy
|
||||
|
||||
matrix:
|
||||
include:
|
||||
# Android
|
||||
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
|
||||
- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
|
||||
- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
|
||||
- env: TARGET=i686-linux-android DISABLE_TESTS=1
|
||||
- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
|
||||
|
||||
# iOS
|
||||
#- env: TARGET=aarch64-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
#- env: TARGET=armv7-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
#- env: TARGET=armv7s-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
#- env: TARGET=i386-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
- env: TARGET=x86_64-apple-ios DISABLE_TESTS=1
|
||||
os: osx
|
||||
|
||||
# Linux
|
||||
- env: TARGET=aarch64-unknown-linux-gnu
|
||||
# - env: TARGET=arm-unknown-linux-gnueabi
|
||||
# - env: TARGET=armv7-unknown-linux-gnueabihf
|
||||
- env: TARGET=i686-unknown-linux-gnu
|
||||
#- env: TARGET=i686-unknown-linux-musl
|
||||
#- env: TARGET=mips-unknown-linux-gnu
|
||||
#- env: TARGET=mips64-unknown-linux-gnuabi64
|
||||
#- env: TARGET=mips64el-unknown-linux-gnuabi64
|
||||
#- env: TARGET=mipsel-unknown-linux-gnu
|
||||
#- env: TARGET=powerpc-unknown-linux-gnu
|
||||
#- env: TARGET=powerpc64-unknown-linux-gnu
|
||||
#- env: TARGET=powerpc64le-unknown-linux-gnu
|
||||
#- env: TARGET=s390x-unknown-linux-gnu DISABLE_TESTS=1
|
||||
- env: TARGET=x86_64-unknown-linux-gnu
|
||||
- env: TARGET=x86_64-unknown-linux-musl
|
||||
|
||||
# OSX
|
||||
#- env: TARGET=i686-apple-darwin
|
||||
# os: osx
|
||||
- env: TARGET=x86_64-apple-darwin
|
||||
os: osx
|
||||
|
||||
# *BSD
|
||||
#- env: TARGET=i686-unknown-freebsd DISABLE_TESTS=1
|
||||
#- env: TARGET=x86_64-unknown-freebsd DISABLE_TESTS=1
|
||||
#- env: TARGET=x86_64-unknown-netbsd DISABLE_TESTS=1
|
||||
|
||||
# Windows
|
||||
#- env: TARGET=x86_64-pc-windows-gnu
|
||||
|
||||
# Bare metal
|
||||
# These targets don't support std and as such are likely not suitable for
|
||||
# most crates.
|
||||
# - env: TARGET=thumbv6m-none-eabi
|
||||
# - env: TARGET=thumbv7em-none-eabi
|
||||
# - env: TARGET=thumbv7em-none-eabihf
|
||||
# - env: TARGET=thumbv7m-none-eabi
|
||||
|
||||
# Testing other channels
|
||||
#- env: TARGET=x86_64-unknown-linux-gnu
|
||||
# rust: nightly
|
||||
#- env: TARGET=x86_64-apple-darwin
|
||||
# os: osx
|
||||
# rust: nightly
|
||||
|
||||
before_install:
|
||||
- set -e
|
||||
- rustup self update
|
||||
|
||||
install:
|
||||
- sh ci/install.sh
|
||||
- source ~/.cargo/env || true
|
||||
|
||||
script:
|
||||
- cargo build
|
||||
- cargo test
|
||||
- cargo test -- --ignored
|
||||
- cargo run --example simple_search
|
||||
- cargo doc
|
||||
after_success:
|
||||
- cargo coveralls --exclude-pattern src/functional_test.rs
|
||||
- cargo doc-upload
|
||||
- bash ci/script.sh
|
||||
|
||||
after_script: set +e
|
||||
|
||||
before_deploy:
|
||||
- sh ci/before_deploy.sh
|
||||
#
|
||||
#deploy:
|
||||
# # - Create a `public_repo` GitHub token. Go to: https://github.com/settings/tokens/new
|
||||
# # - Encrypt it: `travis encrypt 0123456789012345678901234567890123456789
|
||||
# # - Paste the output down here
|
||||
# api_key:
|
||||
# secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
# file_glob: true
|
||||
# file: $CRATE_NAME-$TRAVIS_TAG-$TARGET.*
|
||||
# on:
|
||||
# # TODO Here you can pick which targets will generate binary releases
|
||||
# # In this example, there are some targets that are tested using the stable
|
||||
# # and nightly channels. This condition makes sure there is only one release
|
||||
# # for such targets and that's generated using the stable channel
|
||||
# condition: $TRAVIS_RUST_VERSION = stable
|
||||
# tags: true
|
||||
# provider: releases
|
||||
# skip_cleanup: true
|
||||
|
||||
cache: cargo
|
||||
before_cache:
|
||||
# Travis can't cache files that are not readable by "others"
|
||||
- chmod -R a+r $HOME/.cargo
|
||||
|
||||
#branches:
|
||||
# only:
|
||||
# # release tags
|
||||
# - /^v\d+\.\d+\.\d+.*$/
|
||||
# - master
|
||||
|
||||
notifications:
|
||||
email:
|
||||
on_success: never
|
||||
|
||||
11
AUTHORS
Normal file
11
AUTHORS
Normal file
@@ -0,0 +1,11 @@
|
||||
# This is the list of authors of tantivy for copyright purposes.
|
||||
Paul Masurel
|
||||
Laurentiu Nicola
|
||||
Dru Sellers
|
||||
Ashley Mannix
|
||||
Michael J. Curry
|
||||
Jason Wolfe
|
||||
# As an employee of Google I am required to add Google LLC
|
||||
# in the list of authors, but this project is not affiliated to Google
|
||||
# in any other way.
|
||||
Google LLC
|
||||
37
CHANGELOG.md
37
CHANGELOG.md
@@ -1,9 +1,30 @@
|
||||
Tantivy 0.5.2
|
||||
Tantivy 0.6
|
||||
==========================
|
||||
|
||||
- Removed C code. Tantivy is now pure Rust.
|
||||
- BM25
|
||||
- Approximate field norms encoded over 1 byte.
|
||||
|
||||
Special thanks to @drusellers and @jason-wolfe for their contributions
|
||||
to this release!
|
||||
|
||||
- Removed C code. Tantivy is now pure Rust. (@pmasurel)
|
||||
- BM25 (@pmasurel)
|
||||
- Approximate field norms encoded over 1 byte. (@pmasurel)
|
||||
- Compiles on stable rust (@pmasurel)
|
||||
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
|
||||
- Completely uncompressed
|
||||
- Internally: One u64 fast field for indexes, one fast field for the bytes themselves.
|
||||
- Add NGram token support (@drusellers)
|
||||
- Add Stopword Filter support (@drusellers)
|
||||
- Add a FuzzyTermQuery (@drusellers)
|
||||
- Add a RegexQuery (@drusellers)
|
||||
- Various performance improvements (@pmasurel)_
|
||||
|
||||
|
||||
Tantivy 0.5.2
|
||||
===========================
|
||||
- bugfix #274
|
||||
- bugfix #280
|
||||
- bugfix #289
|
||||
|
||||
|
||||
Tantivy 0.5.1
|
||||
==========================
|
||||
@@ -81,7 +102,7 @@ Tantivy 0.3
|
||||
Special thanks to @Kodraus @lnicola @Ameobea @manuel-woelker @celaus
|
||||
for their contribution to this release.
|
||||
|
||||
Thanks also to everyone in tantivy gitter chat
|
||||
Thanks also to everyone in tantivy gitter chat
|
||||
for their advise and company :)
|
||||
|
||||
https://gitter.im/tantivy-search/tantivy
|
||||
@@ -89,9 +110,9 @@ https://gitter.im/tantivy-search/tantivy
|
||||
|
||||
Warning:
|
||||
|
||||
Tantivy 0.3 is NOT backward compatible with tantivy 0.2
|
||||
Tantivy 0.3 is NOT backward compatible with tantivy 0.2
|
||||
code and index format.
|
||||
You should not expect backward compatibility before
|
||||
You should not expect backward compatibility before
|
||||
tantivy 1.0.
|
||||
|
||||
|
||||
@@ -117,7 +138,7 @@ Thanks to @KodrAus ! (#108)
|
||||
the natural ordering.
|
||||
- Building binary targets for tantivy-cli (Thanks to @KodrAus)
|
||||
- Misc invisible bug fixes, and code cleanup.
|
||||
- Use
|
||||
- Use
|
||||
|
||||
|
||||
|
||||
|
||||
28
Cargo.toml
28
Cargo.toml
@@ -1,23 +1,27 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.5.1"
|
||||
version = "0.6.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
description = """Tantivy is a search engine library."""
|
||||
description = """Search engine library"""
|
||||
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
|
||||
homepage = "https://github.com/tantivy-search/tantivy"
|
||||
repository = "https://github.com/tantivy-search/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
keywords = ["search", "search engine", "information", "retrieval"]
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.9.1"
|
||||
byteorder = "1.0"
|
||||
lazy_static = "0.2.1"
|
||||
tinysegmenter = "0.1.0"
|
||||
regex = "0.2"
|
||||
fst = {version="0.2", default-features=false}
|
||||
atomicwrites = {version="0.1", optional=true}
|
||||
fst = {version="0.3", default-features=false}
|
||||
fst-regex = { version="0.2" }
|
||||
lz4 = {version="1.20", optional=true}
|
||||
snap = {version="0.2"}
|
||||
atomicwrites = {version="0.2.2", optional=true}
|
||||
tempfile = "2.1"
|
||||
log = "0.3.6"
|
||||
combine = "2.2"
|
||||
@@ -27,7 +31,7 @@ serde_derive = "1.0"
|
||||
serde_json = "1.0"
|
||||
num_cpus = "1.2"
|
||||
itertools = "0.5.9"
|
||||
lz4 = "1.20"
|
||||
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
||||
bit-set = "0.4.0"
|
||||
uuid = { version = "0.6", features = ["v4", "serde"] }
|
||||
chan = "0.1"
|
||||
@@ -38,9 +42,10 @@ error-chain = "0.8"
|
||||
owning_ref = "0.3"
|
||||
stable_deref_trait = "1.0.0"
|
||||
rust-stemmers = "0.1.0"
|
||||
downcast = { version="0.9", features = ["nightly"]}
|
||||
downcast = { version="0.9" }
|
||||
matches = "0.1"
|
||||
bitpacking = "0.3"
|
||||
bitpacking = "0.5"
|
||||
fnv = "1.0.6"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.2"
|
||||
@@ -55,12 +60,10 @@ debug = false
|
||||
lto = true
|
||||
debug-assertions = false
|
||||
|
||||
|
||||
[features]
|
||||
default = ["mmap"]
|
||||
streamdict = []
|
||||
mmap = ["fst/mmap", "atomicwrites"]
|
||||
|
||||
lz4-compression = ["lz4"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
@@ -68,3 +71,6 @@ travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
[[example]]
|
||||
name = "simple_search"
|
||||
required-features = ["mmap"]
|
||||
|
||||
[[example]]
|
||||
name = "custom_tokenizer"
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,4 +1,4 @@
|
||||
Copyright (c) 2016 Paul Masurel
|
||||
Copyright (c) 2018 by the project authors, as listed in the AUTHORS file.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
|
||||
71
README.md
71
README.md
@@ -4,36 +4,50 @@
|
||||
[](https://coveralls.io/github/tantivy-search/tantivy?branch=master)
|
||||
[](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://ci.appveyor.com/project/fulmicoton/tantivy)
|
||||
[](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
|
||||
|
||||
**Tantivy** is a **full text search engine library** written in rust.
|
||||
|
||||
It is strongly inspired by Lucene's design.
|
||||
It is closer to Lucene than to Elastic Search and Solr in the sense it is not
|
||||
an off-the-shelf search engine server, but rather a crate that can be used
|
||||
to build such a search engine.
|
||||
|
||||
Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
|
||||
# Features
|
||||
|
||||
- Full-text search
|
||||
- Tiny startup time (<10ms), perfect for command line tools
|
||||
- tf-idf scoring
|
||||
- Basic query language
|
||||
- Phrase queries
|
||||
- BM25 scoring (the same as lucene)
|
||||
- Basic query language (`+michael +jackson`)
|
||||
- Phrase queries search (\"michael jackson\"`)
|
||||
- Incremental indexing
|
||||
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
|
||||
- Mmap directory
|
||||
- optional SIMD integer compression
|
||||
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
|
||||
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
|
||||
- `&[u8]` fast fields
|
||||
- LZ4 compressed document store
|
||||
- Range queries
|
||||
- Faceting
|
||||
- configurable indexing (optional term frequency and position indexing
|
||||
- Faceted search
|
||||
- Configurable indexing (optional term frequency and position indexing
|
||||
- Cheesy logo with a horse
|
||||
|
||||
Tantivy supports Linux, MacOS and Windows.
|
||||
# Non-features
|
||||
|
||||
- Distributed search and will not be in the scope of tantivy.
|
||||
|
||||
|
||||
# Supported OS and compiler
|
||||
|
||||
Tantivy works on stable rust (>= 1.27) and supports Linux, MacOS and Windows.
|
||||
|
||||
# Getting started
|
||||
|
||||
- [tantivy's usage example](http://fulmicoton.com/tantivy-examples/simple_search.html)
|
||||
- [tantivy's simple search example](http://fulmicoton.com/tantivy-examples/simple_search.html)
|
||||
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli).
|
||||
`tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
|
||||
index documents and search via the CLI or a small server with a REST API.
|
||||
It will walk you through getting a wikipedia search engine up and running in a few minutes.
|
||||
- [reference doc]
|
||||
- [For the last released version](https://docs.rs/tantivy/)
|
||||
@@ -43,43 +57,14 @@ It will walk you through getting a wikipedia search engine up and running in a f
|
||||
|
||||
## Development
|
||||
|
||||
Tantivy requires Rust Nightly because it uses requires the features [`box_syntax`](https://doc.rust-lang.org/stable/unstable-book/language-features/box-syntax.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md),
|
||||
and [simd](https://github.com/rust-lang/rust/issues/27731).
|
||||
|
||||
|
||||
To check out and run test, you can simply run :
|
||||
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
|
||||
To check out and run tests, you can simply run :
|
||||
|
||||
git clone git@github.com:tantivy-search/tantivy.git
|
||||
cd tantivy
|
||||
cargo +nightly build
|
||||
|
||||
|
||||
## Note on release build and performance
|
||||
|
||||
If your project depends on `tantivy`, for better performance, make sure to enable
|
||||
`sse3` instructions using a RUSTFLAGS. (This instruction set is likely to
|
||||
be available on most `x86_64` CPUs you will encounter).
|
||||
|
||||
For instance,
|
||||
|
||||
RUSTFLAGS='-C target-feature=+sse3'
|
||||
|
||||
Or, if you are targetting a specific cpu
|
||||
|
||||
RUSTFLAGS='-C target-cpu=native' build --release
|
||||
|
||||
Regardless of the flags you pass, by default `tantivy` will contain `SSE3` instructions.
|
||||
If you want to disable those, you can run the following command :
|
||||
|
||||
cargo build --no-default-features
|
||||
|
||||
Alternatively, if you are trying to compile `tantivy` without simd compression,
|
||||
you can disable this functionality. In this case, this submodule is not required
|
||||
and you can compile tantivy by using the `--no-default-features` flag.
|
||||
|
||||
cargo build --no-default-features
|
||||
cargo build
|
||||
|
||||
|
||||
# Contribute
|
||||
|
||||
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
|
||||
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
|
||||
|
||||
@@ -6,9 +6,6 @@ environment:
|
||||
matrix:
|
||||
- channel: nightly
|
||||
target: x86_64-pc-windows-msvc
|
||||
- channel: nightly
|
||||
target: x86_64-pc-windows-gnu
|
||||
msys_bits: 64
|
||||
|
||||
install:
|
||||
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
|
||||
|
||||
23
ci/before_deploy.ps1
Normal file
23
ci/before_deploy.ps1
Normal file
@@ -0,0 +1,23 @@
|
||||
# This script takes care of packaging the build artifacts that will go in the
|
||||
# release zipfile
|
||||
|
||||
$SRC_DIR = $PWD.Path
|
||||
$STAGE = [System.Guid]::NewGuid().ToString()
|
||||
|
||||
Set-Location $ENV:Temp
|
||||
New-Item -Type Directory -Name $STAGE
|
||||
Set-Location $STAGE
|
||||
|
||||
$ZIP = "$SRC_DIR\$($Env:CRATE_NAME)-$($Env:APPVEYOR_REPO_TAG_NAME)-$($Env:TARGET).zip"
|
||||
|
||||
# TODO Update this to package the right artifacts
|
||||
Copy-Item "$SRC_DIR\target\$($Env:TARGET)\release\hello.exe" '.\'
|
||||
|
||||
7z a "$ZIP" *
|
||||
|
||||
Push-AppveyorArtifact "$ZIP"
|
||||
|
||||
Remove-Item *.* -Force
|
||||
Set-Location ..
|
||||
Remove-Item $STAGE
|
||||
Set-Location $SRC_DIR
|
||||
33
ci/before_deploy.sh
Normal file
33
ci/before_deploy.sh
Normal file
@@ -0,0 +1,33 @@
|
||||
# This script takes care of building your crate and packaging it for release
|
||||
|
||||
set -ex
|
||||
|
||||
main() {
|
||||
local src=$(pwd) \
|
||||
stage=
|
||||
|
||||
case $TRAVIS_OS_NAME in
|
||||
linux)
|
||||
stage=$(mktemp -d)
|
||||
;;
|
||||
osx)
|
||||
stage=$(mktemp -d -t tmp)
|
||||
;;
|
||||
esac
|
||||
|
||||
test -f Cargo.lock || cargo generate-lockfile
|
||||
|
||||
# TODO Update this to build the artifacts that matter to you
|
||||
cross rustc --bin hello --target $TARGET --release -- -C lto
|
||||
|
||||
# TODO Update this to package the right artifacts
|
||||
cp target/$TARGET/release/hello $stage/
|
||||
|
||||
cd $stage
|
||||
tar czf $src/$CRATE_NAME-$TRAVIS_TAG-$TARGET.tar.gz *
|
||||
cd $src
|
||||
|
||||
rm -rf $stage
|
||||
}
|
||||
|
||||
main
|
||||
47
ci/install.sh
Normal file
47
ci/install.sh
Normal file
@@ -0,0 +1,47 @@
|
||||
set -ex
|
||||
|
||||
main() {
|
||||
local target=
|
||||
if [ $TRAVIS_OS_NAME = linux ]; then
|
||||
target=x86_64-unknown-linux-musl
|
||||
sort=sort
|
||||
else
|
||||
target=x86_64-apple-darwin
|
||||
sort=gsort # for `sort --sort-version`, from brew's coreutils.
|
||||
fi
|
||||
|
||||
# Builds for iOS are done on OSX, but require the specific target to be
|
||||
# installed.
|
||||
case $TARGET in
|
||||
aarch64-apple-ios)
|
||||
rustup target install aarch64-apple-ios
|
||||
;;
|
||||
armv7-apple-ios)
|
||||
rustup target install armv7-apple-ios
|
||||
;;
|
||||
armv7s-apple-ios)
|
||||
rustup target install armv7s-apple-ios
|
||||
;;
|
||||
i386-apple-ios)
|
||||
rustup target install i386-apple-ios
|
||||
;;
|
||||
x86_64-apple-ios)
|
||||
rustup target install x86_64-apple-ios
|
||||
;;
|
||||
esac
|
||||
|
||||
# This fetches latest stable release
|
||||
local tag=$(git ls-remote --tags --refs --exit-code https://github.com/japaric/cross \
|
||||
| cut -d/ -f3 \
|
||||
| grep -E '^v[0.1.0-9.]+$' \
|
||||
| $sort --version-sort \
|
||||
| tail -n1)
|
||||
curl -LSfs https://japaric.github.io/trust/install.sh | \
|
||||
sh -s -- \
|
||||
--force \
|
||||
--git japaric/cross \
|
||||
--tag $tag \
|
||||
--target $target
|
||||
}
|
||||
|
||||
main
|
||||
23
ci/script.sh
Normal file
23
ci/script.sh
Normal file
@@ -0,0 +1,23 @@
|
||||
# This script takes care of testing your crate
|
||||
|
||||
set -ex
|
||||
|
||||
main() {
|
||||
cross build --target $TARGET
|
||||
cross build --target $TARGET --release
|
||||
|
||||
if [ ! -z $DISABLE_TESTS ]; then
|
||||
return
|
||||
fi
|
||||
|
||||
cross test --target $TARGET
|
||||
# cross test --target $TARGET --release
|
||||
|
||||
# cross run --target $TARGET
|
||||
# cross run --target $TARGET --release
|
||||
}
|
||||
|
||||
# we don't run the "test phase" when doing deploys
|
||||
if [ -z $TRAVIS_TAG ]; then
|
||||
main
|
||||
fi
|
||||
226
examples/custom_tokenizer.rs
Normal file
226
examples/custom_tokenizer.rs
Normal file
@@ -0,0 +1,226 @@
|
||||
extern crate tantivy;
|
||||
extern crate tempdir;
|
||||
|
||||
#[macro_use]
|
||||
extern crate serde_json;
|
||||
|
||||
use std::path::Path;
|
||||
use tantivy::collector::TopCollector;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::tokenizer::NgramTokenizer;
|
||||
use tantivy::Index;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn main() {
|
||||
// Let's create a temporary directory for the
|
||||
// sake of this example
|
||||
if let Ok(dir) = TempDir::new("tantivy_token_example_dir") {
|
||||
run_example(dir.path()).unwrap();
|
||||
dir.close().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
//
|
||||
// The Tantivy index requires a very strict schema.
|
||||
// The schema declares which fields are in the index,
|
||||
// and for each field, its type and "the way it should
|
||||
// be indexed".
|
||||
|
||||
// first we need to define a schema ...
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
// Our first field is title.
|
||||
// In this example we want to use NGram searching
|
||||
// we will set that to 3 characters, so any three
|
||||
// char in the title should be findable.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("ngram3")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("title", text_options);
|
||||
|
||||
// Our second field is body.
|
||||
// We want full-text search for it, but we do not
|
||||
// need to be able to be able to retrieve it
|
||||
// for our application.
|
||||
//
|
||||
// We can make our index lighter and
|
||||
// by omitting `STORED` flag.
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// # Indexing documents
|
||||
//
|
||||
// Let's create a brand new index.
|
||||
//
|
||||
// This will actually just save a meta.json
|
||||
// with our schema in the directory.
|
||||
let index = Index::create_in_dir(index_path, schema.clone())?;
|
||||
|
||||
// here we are registering our custome tokenizer
|
||||
// this will store tokens of 3 characters each
|
||||
index
|
||||
.tokenizers()
|
||||
.register("ngram3", NgramTokenizer::new(3, 3, false));
|
||||
|
||||
// To insert document we need an index writer.
|
||||
// There must be only one writer at a time.
|
||||
// This single `IndexWriter` is already
|
||||
// multithreaded.
|
||||
//
|
||||
// Here we use a buffer of 50MB per thread. Using a bigger
|
||||
// heap for the indexer can increase its throughput.
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
// Let's index our documents!
|
||||
// We first need a handle on the title and the body field.
|
||||
|
||||
// ### Create a document "manually".
|
||||
//
|
||||
// We can create a document manually, by setting the fields
|
||||
// one by one in a Document object.
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
|
||||
let mut old_man_doc = Document::default();
|
||||
old_man_doc.add_text(title, "The Old Man and the Sea");
|
||||
old_man_doc.add_text(
|
||||
body,
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish.",
|
||||
);
|
||||
|
||||
// ... and add it to the `IndexWriter`.
|
||||
index_writer.add_document(old_man_doc);
|
||||
|
||||
// ### Create a document directly from json.
|
||||
//
|
||||
// Alternatively, we can use our schema to parse a
|
||||
// document object directly from json.
|
||||
// The document is a string, but we use the `json` macro
|
||||
// from `serde_json` for the convenience of multi-line support.
|
||||
let json = json!({
|
||||
"title": "Of Mice and Men",
|
||||
"body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
});
|
||||
let mice_and_men_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(mice_and_men_doc);
|
||||
|
||||
// Multi-valued field are allowed, they are
|
||||
// expressed in JSON by an array.
|
||||
// The following document has two titles.
|
||||
let json = json!({
|
||||
"title": ["Frankenstein", "The Modern Prometheus"],
|
||||
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
});
|
||||
let frankenstein_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(frankenstein_doc);
|
||||
|
||||
// This is an example, so we will only index 3 documents
|
||||
// here. You can check out tantivy's tutorial to index
|
||||
// the English wikipedia. Tantivy's indexing is rather fast.
|
||||
// Indexing 5 million articles of the English wikipedia takes
|
||||
// around 4 minutes on my computer!
|
||||
|
||||
// ### Committing
|
||||
//
|
||||
// At this point our documents are not searchable.
|
||||
//
|
||||
//
|
||||
// We need to call .commit() explicitly to force the
|
||||
// index_writer to finish processing the documents in the queue,
|
||||
// flush the current index to the disk, and advertise
|
||||
// the existence of new documents.
|
||||
//
|
||||
// This call is blocking.
|
||||
index_writer.commit()?;
|
||||
|
||||
// If `.commit()` returns correctly, then all of the
|
||||
// documents that have been added are guaranteed to be
|
||||
// persistently indexed.
|
||||
//
|
||||
// In the scenario of a crash or a power failure,
|
||||
// tantivy behaves as if has rolled back to its last
|
||||
// commit.
|
||||
|
||||
// # Searching
|
||||
//
|
||||
// Let's search our index. Start by reloading
|
||||
// searchers in the index. This should be done
|
||||
// after every commit().
|
||||
index.load_searchers()?;
|
||||
|
||||
// Afterwards create one (or more) searchers.
|
||||
//
|
||||
// You should create a searcher
|
||||
// every time you start a "search query".
|
||||
let searcher = index.searcher();
|
||||
|
||||
// The query parser can interpret human queries.
|
||||
// Here, if the user does not specify which
|
||||
// field they want to search, tantivy will search
|
||||
// in both title and body.
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
|
||||
// here we want to get a hit on the 'ken' in Frankenstein
|
||||
let query = query_parser.parse_query("ken")?;
|
||||
|
||||
// A query defines a set of documents, as
|
||||
// well as the way they should be scored.
|
||||
//
|
||||
// A query created by the query parser is scored according
|
||||
// to a metric called Tf-Idf, and will consider
|
||||
// any document matching at least one of our terms.
|
||||
|
||||
// ### Collectors
|
||||
//
|
||||
// We are not interested in all of the documents but
|
||||
// only in the top 10. Keeping track of our top 10 best documents
|
||||
// is the role of the TopCollector.
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
|
||||
// We can now perform our query.
|
||||
searcher.search(&*query, &mut top_collector)?;
|
||||
|
||||
// Our top collector now contains the 10
|
||||
// most relevant doc ids...
|
||||
let doc_addresses = top_collector.docs();
|
||||
|
||||
// The actual documents still need to be
|
||||
// retrieved from Tantivy's store.
|
||||
//
|
||||
// Since the body field was not configured as stored,
|
||||
// the document returned will only contain
|
||||
// a title.
|
||||
|
||||
for doc_address in doc_addresses {
|
||||
let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
|
||||
// Wait for indexing and merging threads to shut down.
|
||||
// Usually this isn't needed, but in `main` we try to
|
||||
// delete the temporary directory and that fails on
|
||||
// Windows if the files are still open.
|
||||
index_writer.wait_merging_threads()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -5,11 +5,11 @@ extern crate tempdir;
|
||||
extern crate serde_json;
|
||||
|
||||
use std::path::Path;
|
||||
use tempdir::TempDir;
|
||||
use tantivy::Index;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::collector::TopCollector;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn main() {
|
||||
// Let's create a temporary directory for the
|
||||
@@ -64,7 +64,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
//
|
||||
// This will actually just save a meta.json
|
||||
// with our schema in the directory.
|
||||
let index = Index::create(index_path, schema.clone())?;
|
||||
let index = Index::create_in_dir(index_path, schema.clone())?;
|
||||
|
||||
// To insert document we need an index writer.
|
||||
// There must be only one writer at a time.
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use Result;
|
||||
use collector::Collector;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
/// Collector that does nothing.
|
||||
/// This is used in the chain Collector and will hopefully
|
||||
@@ -25,6 +25,57 @@ impl Collector for DoNothingCollector {
|
||||
/// Zero-cost abstraction used to collect on multiple collectors.
|
||||
/// This contraption is only usable if the type of your collectors
|
||||
/// are known at compile time.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, chain};
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct ChainedCollector<Left: Collector, Right: Collector> {
|
||||
left: Left,
|
||||
right: Right,
|
||||
|
||||
@@ -1,12 +1,59 @@
|
||||
use super::Collector;
|
||||
use DocId;
|
||||
use Score;
|
||||
use Result;
|
||||
use SegmentReader;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
|
||||
/// `CountCollector` collector only counts how many
|
||||
/// documents match the query.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::CountCollector;
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut count_collector).unwrap();
|
||||
///
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Default)]
|
||||
pub struct CountCollector {
|
||||
count: usize,
|
||||
|
||||
@@ -1,27 +1,25 @@
|
||||
use std::mem;
|
||||
use collector::Collector;
|
||||
use docset::SkipResult;
|
||||
use fastfield::FacetReader;
|
||||
use schema::Facet;
|
||||
use schema::Field;
|
||||
use std::cell::UnsafeCell;
|
||||
use schema::Facet;
|
||||
use std::collections::btree_map;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::Bound;
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermStreamer;
|
||||
use termdict::TermStreamerBuilder;
|
||||
use std::collections::BTreeSet;
|
||||
use termdict::TermMerger;
|
||||
use docset::SkipResult;
|
||||
use std::{usize, u64};
|
||||
use std::iter::Peekable;
|
||||
use std::mem;
|
||||
use std::{u64, usize};
|
||||
use termdict::TermMerger;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use std::cmp::Ordering;
|
||||
use SegmentReader;
|
||||
|
||||
struct Hit<'a> {
|
||||
count: u64,
|
||||
@@ -344,16 +342,19 @@ impl FacetCollector {
|
||||
pub fn harvest(mut self) -> FacetCounts {
|
||||
self.finalize_segment();
|
||||
|
||||
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
|
||||
let collapsed_facet_ords: Vec<&[u64]> = self
|
||||
.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_ords[..])
|
||||
.collect();
|
||||
let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters
|
||||
let collapsed_facet_counts: Vec<&[u64]> = self
|
||||
.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_counts[..])
|
||||
.collect();
|
||||
|
||||
let facet_streams = self.segment_counters
|
||||
let facet_streams = self
|
||||
.segment_counters
|
||||
.iter()
|
||||
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
|
||||
.collect::<Vec<_>>();
|
||||
@@ -381,8 +382,10 @@ impl FacetCollector {
|
||||
})
|
||||
.sum();
|
||||
if count > 0u64 {
|
||||
let bytes = facet_merger.key().to_owned();
|
||||
facet_counts.insert(Facet::from_encoded(bytes), count);
|
||||
let bytes: Vec<u8> = facet_merger.key().to_owned();
|
||||
// may create an corrupted facet if the term dicitonary is corrupted
|
||||
let facet = unsafe { Facet::from_encoded(bytes) };
|
||||
facet_counts.insert(facet, count);
|
||||
}
|
||||
}
|
||||
FacetCounts { facet_counts }
|
||||
@@ -402,7 +405,8 @@ impl Collector for FacetCollector {
|
||||
|
||||
fn collect(&mut self, doc: DocId, _: Score) {
|
||||
let facet_reader: &mut FacetReader = unsafe {
|
||||
&mut *self.ff_reader
|
||||
&mut *self
|
||||
.ff_reader
|
||||
.as_ref()
|
||||
.expect("collect() was called before set_segment. This should never happen.")
|
||||
.get()
|
||||
@@ -432,9 +436,20 @@ pub struct FacetCounts {
|
||||
facet_counts: BTreeMap<Facet, u64>,
|
||||
}
|
||||
|
||||
pub struct FacetChildIterator<'a> {
|
||||
underlying: btree_map::Range<'a, Facet, u64>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for FacetChildIterator<'a> {
|
||||
type Item = (&'a Facet, u64);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.underlying.next().map(|(facet, count)| (facet, *count))
|
||||
}
|
||||
}
|
||||
|
||||
impl FacetCounts {
|
||||
#[allow(needless_lifetimes)] //< compiler fails if we remove the lifetime
|
||||
pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator<Item = (&'a Facet, u64)>
|
||||
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
@@ -443,15 +458,13 @@ impl FacetCounts {
|
||||
let right_bound = if facet.is_root() {
|
||||
Bound::Unbounded
|
||||
} else {
|
||||
let mut facet_after_bytes = facet.encoded_bytes().to_owned();
|
||||
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
|
||||
facet_after_bytes.push(1u8);
|
||||
let facet_after = Facet::from_encoded(facet_after_bytes);
|
||||
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
|
||||
Bound::Excluded(facet_after)
|
||||
};
|
||||
|
||||
self.facet_counts
|
||||
.range((left_bound, right_bound))
|
||||
.map(|(facet, count)| (facet, *count))
|
||||
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
|
||||
FacetChildIterator { underlying }
|
||||
}
|
||||
|
||||
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
|
||||
@@ -483,14 +496,13 @@ impl FacetCounts {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use test::Bencher;
|
||||
use core::Index;
|
||||
use schema::{Document, Facet, SchemaBuilder};
|
||||
use query::AllQuery;
|
||||
use super::{FacetCollector, FacetCounts};
|
||||
use std::iter;
|
||||
use schema::Field;
|
||||
use core::Index;
|
||||
use query::AllQuery;
|
||||
use rand::{thread_rng, Rng};
|
||||
use schema::Field;
|
||||
use schema::{Document, Facet, SchemaBuilder};
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_drilldown() {
|
||||
@@ -499,7 +511,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let num_facets: usize = 3 * 4 * 5;
|
||||
let facets: Vec<Facet> = (0..num_facets)
|
||||
.map(|mut n| {
|
||||
@@ -545,8 +557,10 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Tried to add a facet which is a descendant of \
|
||||
an already added facet.")]
|
||||
#[should_panic(
|
||||
expected = "Tried to add a facet which is a descendant of \
|
||||
an already added facet."
|
||||
)]
|
||||
fn test_misused_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||
facet_collector.add_facet(Facet::from("/country"));
|
||||
@@ -577,7 +591,7 @@ mod tests {
|
||||
.collect();
|
||||
thread_rng().shuffle(&mut docs[..]);
|
||||
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
@@ -604,6 +618,19 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use collector::FacetCollector;
|
||||
use query::AllQuery;
|
||||
use rand::{thread_rng, Rng};
|
||||
use schema::Facet;
|
||||
use schema::SchemaBuilder;
|
||||
use test::Bencher;
|
||||
use Index;
|
||||
|
||||
#[bench]
|
||||
fn bench_facet_collector(b: &mut Bencher) {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
@@ -621,7 +648,7 @@ mod tests {
|
||||
// 40425 docs
|
||||
thread_rng().shuffle(&mut docs[..]);
|
||||
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
|
||||
@@ -2,11 +2,11 @@
|
||||
Defines how the documents matching a search query should be processed.
|
||||
*/
|
||||
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use DocId;
|
||||
use Score;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
|
||||
mod count_collector;
|
||||
pub use self::count_collector::CountCollector;
|
||||
@@ -21,7 +21,7 @@ mod facet_collector;
|
||||
pub use self::facet_collector::FacetCollector;
|
||||
|
||||
mod chained_collector;
|
||||
pub use self::chained_collector::chain;
|
||||
pub use self::chained_collector::{chain, ChainedCollector};
|
||||
|
||||
/// Collectors are in charge of collecting and retaining relevant
|
||||
/// information from the document found and scored by the query.
|
||||
@@ -89,13 +89,13 @@ impl<'a, C: Collector> Collector for &'a mut C {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use test::Bencher;
|
||||
use DocId;
|
||||
use Score;
|
||||
use core::SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use fastfield::BytesFastFieldReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
use DocId;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
|
||||
/// Stores all of the doc ids.
|
||||
/// This collector is only used for tests.
|
||||
@@ -186,6 +186,52 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// Collects in order all of the fast field bytes for all of the
|
||||
/// docs in the `DocSet`
|
||||
///
|
||||
/// This collector is mainly useful for tests.
|
||||
pub struct BytesFastFieldTestCollector {
|
||||
vals: Vec<u8>,
|
||||
field: Field,
|
||||
ff_reader: Option<BytesFastFieldReader>,
|
||||
}
|
||||
|
||||
impl BytesFastFieldTestCollector {
|
||||
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
|
||||
BytesFastFieldTestCollector {
|
||||
vals: Vec::new(),
|
||||
field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn vals(self) -> Vec<u8> {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for BytesFastFieldTestCollector {
|
||||
fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let val = self.ff_reader.as_ref().unwrap().get_val(doc);
|
||||
self.vals.extend(val);
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use collector::{Collector, CountCollector};
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
|
||||
@@ -1,14 +1,66 @@
|
||||
use super::Collector;
|
||||
use DocId;
|
||||
use Score;
|
||||
use Result;
|
||||
use SegmentReader;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
|
||||
/// Multicollector makes it possible to collect on more than one collector.
|
||||
/// It should only be used for use cases where the Collector types is unknown
|
||||
/// at compile time.
|
||||
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, MultiCollector};
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors =
|
||||
/// MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct MultiCollector<'a> {
|
||||
collectors: Vec<&'a mut Collector>,
|
||||
}
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use super::Collector;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use DocAddress;
|
||||
use Result;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use DocAddress;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
|
||||
// Rust heap is a max-heap and we need a min heap.
|
||||
#[derive(Clone, Copy)]
|
||||
@@ -43,7 +43,61 @@ impl Eq for GlobalScoredDoc {}
|
||||
/// with the best scores.
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theorical complexity is `O(n log K)`.
|
||||
/// The theorical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result, DocId, Score};
|
||||
/// use tantivy::collector::TopCollector;
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut top_collector).unwrap();
|
||||
///
|
||||
/// let score_docs: Vec<(Score, DocId)> = top_collector
|
||||
/// .score_docs()
|
||||
/// .into_iter()
|
||||
/// .map(|(score, doc_address)| (score, doc_address.doc()))
|
||||
/// .collect();
|
||||
///
|
||||
/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct TopCollector {
|
||||
limit: usize,
|
||||
heap: BinaryHeap<GlobalScoredDoc>,
|
||||
@@ -107,11 +161,13 @@ impl Collector for TopCollector {
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
let limit_doc: GlobalScoredDoc = *self.heap
|
||||
let limit_doc: GlobalScoredDoc = *self
|
||||
.heap
|
||||
.peek()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
if limit_doc.score < score {
|
||||
let mut mut_head = self.heap
|
||||
let mut mut_head = self
|
||||
.heap
|
||||
.peek_mut()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
mut_head.score = score;
|
||||
@@ -135,9 +191,9 @@ impl Collector for TopCollector {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::Collector;
|
||||
use DocId;
|
||||
use Score;
|
||||
use collector::Collector;
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity() {
|
||||
@@ -186,4 +242,5 @@ mod tests {
|
||||
fn test_top_0() {
|
||||
TopCollector::with_limit(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::io::Write;
|
||||
use std::io;
|
||||
use common::serialize::BinarySerializable;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::ops::Deref;
|
||||
use std::ptr;
|
||||
@@ -46,7 +46,7 @@ impl BitPacker {
|
||||
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
if self.mini_buffer_written > 0 {
|
||||
let num_bytes = (self.mini_buffer_written + 7) / 8;
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
|
||||
output.write_all(&arr[..num_bytes])?;
|
||||
self.mini_buffer_written = 0;
|
||||
}
|
||||
@@ -98,30 +98,14 @@ where
|
||||
let addr_in_bits = idx * num_bits;
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
if cfg!(feature = "simdcompression") {
|
||||
// for simdcompression,
|
||||
// the bitpacker is only used for fastfields,
|
||||
// and we expect them to be always padded.
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
} else {
|
||||
let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
|
||||
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }
|
||||
} else {
|
||||
let mut buffer = [0u8; 8];
|
||||
for i in addr..data.len() {
|
||||
buffer[i - addr] += data[i];
|
||||
}
|
||||
unsafe { ptr::read_unaligned(buffer[..].as_ptr() as *const u64) }
|
||||
};
|
||||
let val_shifted = val_unshifted_unmasked >> (bit_shift as u64);
|
||||
val_shifted & mask
|
||||
}
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 =
|
||||
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
}
|
||||
|
||||
/// Reads a range of values from the fast field.
|
||||
@@ -141,7 +125,8 @@ where
|
||||
for output_val in output.iter_mut() {
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_unshifted_unmasked: u64 =
|
||||
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
*output_val = val_shifted & mask;
|
||||
addr_in_bits += num_bits;
|
||||
|
||||
@@ -202,15 +202,14 @@ impl BitSet {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
extern crate test;
|
||||
use tests;
|
||||
use std::collections::HashSet;
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use tests::generate_nonunique_unsorted;
|
||||
use std::collections::BTreeSet;
|
||||
use query::BitSetDocSet;
|
||||
use docset::DocSet;
|
||||
use query::BitSetDocSet;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashSet;
|
||||
use tests;
|
||||
use tests::generate_nonunique_unsorted;
|
||||
|
||||
#[test]
|
||||
fn test_tiny_set() {
|
||||
@@ -353,6 +352,14 @@ mod tests {
|
||||
assert!(!bitset.contains(el));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use test;
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_pop(b: &mut test::Bencher) {
|
||||
@@ -385,5 +392,4 @@ mod tests {
|
||||
fn bench_bitset_initialize(b: &mut test::Bencher) {
|
||||
b.iter(|| BitSet::with_max_value(1_000_000));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use std::io::Write;
|
||||
use common::CountingWriter;
|
||||
use std::collections::HashMap;
|
||||
use schema::Field;
|
||||
use common::VInt;
|
||||
use directory::WritePtr;
|
||||
use std::io::{self, Read};
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
use common::VInt;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::io::{self, Read};
|
||||
|
||||
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
|
||||
pub struct FileAddr {
|
||||
@@ -30,10 +30,7 @@ impl BinarySerializable for FileAddr {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let field = Field::deserialize(reader)?;
|
||||
let idx = VInt::deserialize(reader)?.0 as usize;
|
||||
Ok(FileAddr {
|
||||
field,
|
||||
idx,
|
||||
})
|
||||
Ok(FileAddr { field, idx })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -75,7 +72,8 @@ impl<W: Write> CompositeWrite<W> {
|
||||
let footer_offset = self.write.written_bytes();
|
||||
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
|
||||
|
||||
let mut offset_fields: Vec<_> = self.offsets
|
||||
let mut offset_fields: Vec<_> = self
|
||||
.offsets
|
||||
.iter()
|
||||
.map(|(file_addr, offset)| (*offset, *file_addr))
|
||||
.collect();
|
||||
@@ -166,7 +164,7 @@ impl CompositeFile {
|
||||
/// to a given `Field` and stored in a `CompositeFile`.
|
||||
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<ReadOnlySource> {
|
||||
self.offsets_index
|
||||
.get(&FileAddr { field, idx, })
|
||||
.get(&FileAddr { field, idx })
|
||||
.map(|&(from, to)| self.data.slice(from, to))
|
||||
}
|
||||
}
|
||||
@@ -174,12 +172,12 @@ impl CompositeFile {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use std::io::Write;
|
||||
use super::{CompositeFile, CompositeWrite};
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use schema::Field;
|
||||
use common::VInt;
|
||||
use common::BinarySerializable;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::io::Write;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
pub struct CountingWriter<W> {
|
||||
underlying: W,
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
mod serialize;
|
||||
mod vint;
|
||||
mod counting_writer;
|
||||
mod composite_file;
|
||||
pub mod bitpacker;
|
||||
mod bitset;
|
||||
mod composite_file;
|
||||
mod counting_writer;
|
||||
mod serialize;
|
||||
mod vint;
|
||||
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::serialize::{BinarySerializable, FixedSize};
|
||||
pub use self::vint::VInt;
|
||||
pub use self::counting_writer::CountingWriter;
|
||||
pub use self::bitset::BitSet;
|
||||
pub(crate) use self::bitset::TinySet;
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::counting_writer::CountingWriter;
|
||||
pub use self::serialize::{BinarySerializable, FixedSize};
|
||||
pub use self::vint::VInt;
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
use std::io;
|
||||
@@ -104,8 +104,8 @@ pub fn u64_to_i64(val: u64) -> i64 {
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
|
||||
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
|
||||
pub use super::serialize::test::fixed_size_test;
|
||||
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
|
||||
|
||||
fn test_i64_converter_helper(val: i64) {
|
||||
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||
use common::Endianness;
|
||||
use std::fmt;
|
||||
use std::io::Write;
|
||||
use std::io::Read;
|
||||
use std::io;
|
||||
use common::VInt;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
|
||||
/// Trait for a simple binary serialization.
|
||||
pub trait BinarySerializable: fmt::Debug + Sized {
|
||||
@@ -135,8 +135,8 @@ impl BinarySerializable for String {
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
|
||||
use common::VInt;
|
||||
use super::*;
|
||||
use common::VInt;
|
||||
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::BinarySerializable;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
|
||||
/// Wrapper over a `u64` that serializes as a variable int.
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
|
||||
@@ -8,10 +8,8 @@ const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
|
||||
|
||||
pub use self::stream::CompressedIntStream;
|
||||
|
||||
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
|
||||
|
||||
/// Returns the size in bytes of a compressed block, given `num_bits`.
|
||||
pub fn compressed_block_size(num_bits: u8) -> usize {
|
||||
1 + (num_bits as usize) * COMPRESSION_BLOCK_SIZE / 8
|
||||
@@ -35,19 +33,23 @@ impl BlockEncoder {
|
||||
pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> &[u8] {
|
||||
let num_bits = self.bitpacker.num_bits_sorted(offset, block);
|
||||
self.output[0] = num_bits;
|
||||
let written_size = 1 + self.bitpacker.compress_sorted(offset, block, &mut self.output[1..], num_bits);
|
||||
let written_size =
|
||||
1 + self
|
||||
.bitpacker
|
||||
.compress_sorted(offset, block, &mut self.output[1..], num_bits);
|
||||
&self.output[..written_size]
|
||||
}
|
||||
|
||||
pub fn compress_block_unsorted(&mut self, block: &[u32]) -> &[u8] {
|
||||
let num_bits = self.bitpacker.num_bits(block);
|
||||
self.output[0] = num_bits;
|
||||
let written_size = 1 + self.bitpacker.compress(block, &mut self.output[1..], num_bits);
|
||||
let written_size = 1 + self
|
||||
.bitpacker
|
||||
.compress(block, &mut self.output[1..], num_bits);
|
||||
&self.output[..written_size]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct BlockDecoder {
|
||||
bitpacker: BitPacker4x,
|
||||
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
|
||||
@@ -68,17 +70,24 @@ impl BlockDecoder {
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
|
||||
let num_bits = compressed_data[0];
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
1 + self.bitpacker.decompress_sorted(offset, &compressed_data[1..], &mut self.output, num_bits)
|
||||
1 + self.bitpacker.decompress_sorted(
|
||||
offset,
|
||||
&compressed_data[1..],
|
||||
&mut self.output,
|
||||
num_bits,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
|
||||
let num_bits = compressed_data[0];
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
1 + self.bitpacker.decompress(&compressed_data[1..], &mut self.output, num_bits)
|
||||
1 + self
|
||||
.bitpacker
|
||||
.decompress(&compressed_data[1..], &mut self.output, num_bits)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -174,8 +183,6 @@ impl VIntDecoder for BlockDecoder {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use tests;
|
||||
use test::Bencher;
|
||||
|
||||
#[test]
|
||||
fn test_encode_sorted_block() {
|
||||
@@ -264,11 +271,34 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::*;
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use rand::XorShiftRng;
|
||||
use test::Bencher;
|
||||
|
||||
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
(0..u32::max_value())
|
||||
.filter(|_| rng.next_f32() < ratio)
|
||||
.take(n)
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
generate_array_with_seed(n, ratio, 4)
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_compress(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
b.iter(|| {
|
||||
encoder.compress_block_sorted(&data, 0u32);
|
||||
});
|
||||
@@ -277,7 +307,7 @@ pub mod tests {
|
||||
#[bench]
|
||||
fn bench_uncompress(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
let compressed = encoder.compress_block_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| {
|
||||
@@ -304,7 +334,7 @@ pub mod tests {
|
||||
#[bench]
|
||||
fn bench_compress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
b.iter(|| {
|
||||
encoder.compress_vint_sorted(&data, 0u32);
|
||||
});
|
||||
@@ -313,12 +343,11 @@ pub mod tests {
|
||||
#[bench]
|
||||
fn bench_uncompress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let compressed = encoder.compress_vint_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| {
|
||||
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use compression::compressed_block_size;
|
||||
use compression::BlockDecoder;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use compression::compressed_block_size;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
|
||||
/// Reads a stream of compressed ints.
|
||||
@@ -13,7 +13,7 @@ pub struct CompressedIntStream {
|
||||
buffer: SourceRead,
|
||||
|
||||
block_decoder: BlockDecoder,
|
||||
cached_addr: usize, // address of the currently decoded block
|
||||
cached_addr: usize, // address of the currently decoded block
|
||||
cached_next_addr: usize, // address following the currently decoded block
|
||||
|
||||
addr: usize, // address of the block associated to the current position
|
||||
@@ -42,7 +42,9 @@ impl CompressedIntStream {
|
||||
// no need to read.
|
||||
self.cached_next_addr
|
||||
} else {
|
||||
let next_addr = addr + self.block_decoder.uncompress_block_unsorted(self.buffer.slice_from(addr));
|
||||
let next_addr = addr + self
|
||||
.block_decoder
|
||||
.uncompress_block_unsorted(self.buffer.slice_from(addr));
|
||||
self.cached_addr = addr;
|
||||
self.cached_next_addr = next_addr;
|
||||
next_addr
|
||||
@@ -101,8 +103,8 @@ pub mod tests {
|
||||
|
||||
use super::CompressedIntStream;
|
||||
use compression::compressed_block_size;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use compression::BlockEncoder;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use directory::ReadOnlySource;
|
||||
|
||||
fn create_stream_buffer() -> ReadOnlySource {
|
||||
|
||||
@@ -1,34 +1,33 @@
|
||||
use Result;
|
||||
use core::SegmentId;
|
||||
use error::{ErrorKind, ResultExt};
|
||||
use serde_json;
|
||||
use schema::Schema;
|
||||
use std::sync::Arc;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
use core::SegmentId;
|
||||
use std::sync::Arc;
|
||||
use Result;
|
||||
|
||||
|
||||
#[cfg(feature="mmap")]
|
||||
use super::pool::LeasedItem;
|
||||
use super::pool::Pool;
|
||||
use super::segment::create_segment;
|
||||
use super::segment::Segment;
|
||||
use core::searcher::Searcher;
|
||||
use core::IndexMeta;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use core::META_FILEPATH;
|
||||
use directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
|
||||
use directory::MmapDirectory;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use indexer::index_writer::open_index_writer;
|
||||
use core::searcher::Searcher;
|
||||
use std::convert::From;
|
||||
use num_cpus;
|
||||
use super::segment::Segment;
|
||||
use core::SegmentReader;
|
||||
use super::pool::Pool;
|
||||
use core::SegmentMeta;
|
||||
use super::pool::LeasedItem;
|
||||
use std::path::Path;
|
||||
use core::IndexMeta;
|
||||
use indexer::DirectoryLock;
|
||||
use IndexWriter;
|
||||
use directory::ManagedDirectory;
|
||||
use core::META_FILEPATH;
|
||||
use super::segment::create_segment;
|
||||
use indexer::index_writer::HEAP_SIZE_MIN;
|
||||
use indexer::segment_updater::save_new_metas;
|
||||
use indexer::DirectoryLock;
|
||||
use num_cpus;
|
||||
use std::path::Path;
|
||||
use tokenizer::TokenizerManager;
|
||||
use IndexWriter;
|
||||
|
||||
const NUM_SEARCHERS: usize = 12;
|
||||
|
||||
@@ -53,28 +52,17 @@ impl Index {
|
||||
/// This should only be used for unit tests.
|
||||
pub fn create_in_ram(schema: Schema) -> Index {
|
||||
let ram_directory = RAMDirectory::create();
|
||||
// unwrap is ok here
|
||||
let directory = ManagedDirectory::new(ram_directory).expect(
|
||||
"Creating a managed directory from a brand new RAM directory \
|
||||
should never fail.",
|
||||
);
|
||||
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
|
||||
Index::create(ram_directory, schema).expect("Creating a RAMDirectory should never fail")
|
||||
}
|
||||
|
||||
/// Creates a new index in a given filepath.
|
||||
/// The index will use the `MMapDirectory`.
|
||||
///
|
||||
/// If a previous index was in this directory, then its meta file will be destroyed.
|
||||
#[cfg(feature="mmap")]
|
||||
pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
/// Accessor for the tokenizer manager.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
&self.tokenizers
|
||||
Index::create(mmap_directory, schema)
|
||||
}
|
||||
|
||||
/// Creates a new index in a temp directory.
|
||||
@@ -85,13 +73,25 @@ impl Index {
|
||||
///
|
||||
/// The temp directory is only used for testing the `MmapDirectory`.
|
||||
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::create_from_tempdir()?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
Index::create(mmap_directory, schema)
|
||||
}
|
||||
|
||||
/// Creates a new index given an implementation of the trait `Directory`
|
||||
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(dir)?;
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
/// Create a new index from a directory.
|
||||
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
/// Creates a new index given a directory and an `IndexMeta`.
|
||||
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
|
||||
let schema = metas.schema.clone();
|
||||
@@ -105,18 +105,21 @@ impl Index {
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
/// Create a new index from a directory.
|
||||
pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
Index::create_from_metas(directory, &metas)
|
||||
/// Accessor for the tokenizer manager.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
&self.tokenizers
|
||||
}
|
||||
|
||||
/// Opens a new directory from an index path.
|
||||
#[cfg(feature="mmap")]
|
||||
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
Index::open(mmap_directory)
|
||||
}
|
||||
|
||||
/// Open the index using the provided directory
|
||||
pub fn open<D: Directory>(directory: D) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
@@ -134,9 +137,13 @@ impl Index {
|
||||
/// `IndexWriter` on the system is accessing the index directory,
|
||||
/// it is safe to manually delete the lockfile.
|
||||
///
|
||||
/// num_threads specifies the number of indexing workers that
|
||||
/// - `num_threads` defines the number of indexing workers that
|
||||
/// should work at the same time.
|
||||
///
|
||||
/// - `overall_heap_size_in_bytes` sets the amount of memory
|
||||
/// allocated for all indexing thread.
|
||||
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
|
||||
///
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
@@ -144,21 +151,35 @@ impl Index {
|
||||
pub fn writer_with_num_threads(
|
||||
&self,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes: usize,
|
||||
overall_heap_size_in_bytes: usize,
|
||||
) -> Result<IndexWriter> {
|
||||
let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
|
||||
open_index_writer(self, num_threads, heap_size_in_bytes, directory_lock)
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
open_index_writer(
|
||||
self,
|
||||
num_threads,
|
||||
heap_size_in_bytes_per_thread,
|
||||
directory_lock,
|
||||
)
|
||||
}
|
||||
|
||||
/// Creates a multithreaded writer
|
||||
/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
|
||||
///
|
||||
/// Tantivy will automatically define the number of threads to use.
|
||||
/// `overall_heap_size_in_bytes` is the total target memory usage that will be split
|
||||
/// between a given number of threads.
|
||||
///
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn writer(&self, heap_size_in_bytes: usize) -> Result<IndexWriter> {
|
||||
self.writer_with_num_threads(num_cpus::get(), heap_size_in_bytes)
|
||||
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> Result<IndexWriter> {
|
||||
let mut num_threads = num_cpus::get();
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
num_threads = (overall_heap_size_in_bytes / HEAP_SIZE_MIN).max(1);
|
||||
}
|
||||
self.writer_with_num_threads(num_threads, overall_heap_size_in_bytes)
|
||||
}
|
||||
|
||||
/// Accessor to the index schema
|
||||
@@ -170,7 +191,8 @@ impl Index {
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
|
||||
Ok(self.searchable_segment_metas()?
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect())
|
||||
@@ -205,7 +227,8 @@ impl Index {
|
||||
|
||||
/// Returns the list of segment ids that are searchable.
|
||||
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
|
||||
Ok(self.searchable_segment_metas()?
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect())
|
||||
@@ -223,8 +246,9 @@ impl Index {
|
||||
.iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect::<Result<_>>()?;
|
||||
let schema = self.schema();
|
||||
let searchers = (0..NUM_SEARCHERS)
|
||||
.map(|_| Searcher::from(segment_readers.clone()))
|
||||
.map(|_| Searcher::new(schema.clone(), segment_readers.clone()))
|
||||
.collect();
|
||||
self.searcher_pool.publish_new_generation(searchers);
|
||||
Ok(())
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use schema::Schema;
|
||||
use core::SegmentMeta;
|
||||
use std::fmt;
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::fmt;
|
||||
|
||||
/// Meta information about the `Index`.
|
||||
///
|
||||
@@ -45,9 +45,9 @@ impl fmt::Debug for IndexMeta {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use serde_json;
|
||||
use super::IndexMeta;
|
||||
use schema::{SchemaBuilder, TEXT};
|
||||
use serde_json;
|
||||
|
||||
#[test]
|
||||
fn test_serialize_metas() {
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use common::BinarySerializable;
|
||||
use compression::CompressedIntStream;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use termdict::{TermDictionary, TermDictionaryImpl};
|
||||
use postings::{BlockSegmentPostings, SegmentPostings};
|
||||
use postings::FreqReadingOption;
|
||||
use postings::TermInfo;
|
||||
use postings::{BlockSegmentPostings, SegmentPostings};
|
||||
use schema::FieldType;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use compression::CompressedIntStream;
|
||||
use postings::FreqReadingOption;
|
||||
use common::BinarySerializable;
|
||||
use schema::FieldType;
|
||||
use termdict::TermDictionary;
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
@@ -23,16 +23,16 @@ use schema::FieldType;
|
||||
/// `InvertedIndexReader` are created by calling
|
||||
/// the `SegmentReader`'s [`.inverted_index(...)`] method
|
||||
pub struct InvertedIndexReader {
|
||||
termdict: TermDictionaryImpl,
|
||||
termdict: TermDictionary,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
record_option: IndexRecordOption,
|
||||
total_num_tokens: u64
|
||||
total_num_tokens: u64,
|
||||
}
|
||||
|
||||
impl InvertedIndexReader {
|
||||
pub(crate) fn new(
|
||||
termdict: TermDictionaryImpl,
|
||||
termdict: TermDictionary,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
record_option: IndexRecordOption,
|
||||
@@ -45,7 +45,7 @@ impl InvertedIndexReader {
|
||||
postings_source: postings_source.slice_from(8),
|
||||
positions_source,
|
||||
record_option,
|
||||
total_num_tokens
|
||||
total_num_tokens,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,11 +56,11 @@ impl InvertedIndexReader {
|
||||
.get_index_record_option()
|
||||
.unwrap_or(IndexRecordOption::Basic);
|
||||
InvertedIndexReader {
|
||||
termdict: TermDictionaryImpl::empty(field_type),
|
||||
termdict: TermDictionary::empty(field_type),
|
||||
postings_source: ReadOnlySource::empty(),
|
||||
positions_source: ReadOnlySource::empty(),
|
||||
record_option,
|
||||
total_num_tokens: 0u64
|
||||
total_num_tokens: 0u64,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,7 +70,7 @@ impl InvertedIndexReader {
|
||||
}
|
||||
|
||||
/// Return the term dictionary datastructure.
|
||||
pub fn terms(&self) -> &TermDictionaryImpl {
|
||||
pub fn terms(&self) -> &TermDictionary {
|
||||
&self.termdict
|
||||
}
|
||||
|
||||
@@ -149,8 +149,6 @@ impl InvertedIndexReader {
|
||||
self.total_num_tokens
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encountered and indexed.
|
||||
///
|
||||
@@ -166,12 +164,15 @@ impl InvertedIndexReader {
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
pub(crate) fn read_postings_no_deletes(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
|
||||
pub(crate) fn read_postings_no_deletes(
|
||||
&self,
|
||||
term: &Term,
|
||||
option: IndexRecordOption,
|
||||
) -> Option<SegmentPostings> {
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
self.get_term_info(term)
|
||||
@@ -179,6 +180,3 @@ impl InvertedIndexReader {
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,24 +1,24 @@
|
||||
pub mod searcher;
|
||||
pub mod index;
|
||||
mod segment_reader;
|
||||
mod segment_id;
|
||||
mod segment_component;
|
||||
mod segment;
|
||||
mod index_meta;
|
||||
mod pool;
|
||||
mod segment_meta;
|
||||
mod inverted_index_reader;
|
||||
mod pool;
|
||||
pub mod searcher;
|
||||
mod segment;
|
||||
mod segment_component;
|
||||
mod segment_id;
|
||||
mod segment_meta;
|
||||
mod segment_reader;
|
||||
|
||||
pub use self::index::Index;
|
||||
pub use self::index_meta::IndexMeta;
|
||||
pub use self::inverted_index_reader::InvertedIndexReader;
|
||||
pub use self::searcher::Searcher;
|
||||
pub use self::segment_component::SegmentComponent;
|
||||
pub use self::segment_id::SegmentId;
|
||||
pub use self::segment_reader::SegmentReader;
|
||||
pub use self::segment::Segment;
|
||||
pub use self::segment::SerializableSegment;
|
||||
pub use self::index::Index;
|
||||
pub use self::segment_component::SegmentComponent;
|
||||
pub use self::segment_id::SegmentId;
|
||||
pub use self::segment_meta::SegmentMeta;
|
||||
pub use self::index_meta::IndexMeta;
|
||||
pub use self::segment_reader::SegmentReader;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::sync::atomic::Ordering;
|
||||
use crossbeam::sync::MsQueue;
|
||||
use std::mem;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use crossbeam::sync::MsQueue;
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct GenerationItem<T> {
|
||||
@@ -87,7 +87,8 @@ impl<T> Deref for LeasedItem<T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &T {
|
||||
&self.gen_item
|
||||
&self
|
||||
.gen_item
|
||||
.as_ref()
|
||||
.expect("Unwrapping a leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
@@ -96,7 +97,8 @@ impl<T> Deref for LeasedItem<T> {
|
||||
|
||||
impl<T> DerefMut for LeasedItem<T> {
|
||||
fn deref_mut(&mut self) -> &mut T {
|
||||
&mut self.gen_item
|
||||
&mut self
|
||||
.gen_item
|
||||
.as_mut()
|
||||
.expect("Unwrapping a mut leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
@@ -114,8 +116,8 @@ impl<T> Drop for LeasedItem<T> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::iter;
|
||||
use super::Pool;
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
fn test_pool() {
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
use Result;
|
||||
use core::SegmentReader;
|
||||
use schema::Document;
|
||||
use collector::Collector;
|
||||
use query::Query;
|
||||
use DocAddress;
|
||||
use schema::{Field, Term};
|
||||
use termdict::{TermDictionary, TermMerger};
|
||||
use std::sync::Arc;
|
||||
use std::fmt;
|
||||
use core::InvertedIndexReader;
|
||||
use core::SegmentReader;
|
||||
use query::Query;
|
||||
use schema::Document;
|
||||
use schema::Schema;
|
||||
use schema::{Field, Term};
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use termdict::TermMerger;
|
||||
use DocAddress;
|
||||
use Result;
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
///
|
||||
@@ -16,10 +17,18 @@ use core::InvertedIndexReader;
|
||||
/// the destruction of the `Searcher`.
|
||||
///
|
||||
pub struct Searcher {
|
||||
schema: Schema,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
}
|
||||
|
||||
impl Searcher {
|
||||
/// Creates a new `Searcher`
|
||||
pub(crate) fn new(schema: Schema, segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
Searcher {
|
||||
schema,
|
||||
segment_readers,
|
||||
}
|
||||
}
|
||||
/// Fetches a document from tantivy's store given a `DocAddress`.
|
||||
///
|
||||
/// The searcher uses the segment ordinal to route the
|
||||
@@ -30,6 +39,11 @@ impl Searcher {
|
||||
segment_reader.doc(doc_id)
|
||||
}
|
||||
|
||||
/// Access the schema associated to the index of this searcher.
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
/// Returns the overall number of documents in the index.
|
||||
pub fn num_docs(&self) -> u64 {
|
||||
self.segment_readers
|
||||
@@ -64,7 +78,8 @@ impl Searcher {
|
||||
|
||||
/// Return the field searcher associated to a `Field`.
|
||||
pub fn field(&self, field: Field) -> FieldSearcher {
|
||||
let inv_index_readers = self.segment_readers
|
||||
let inv_index_readers = self
|
||||
.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.inverted_index(field))
|
||||
.collect::<Vec<_>>();
|
||||
@@ -84,7 +99,8 @@ impl FieldSearcher {
|
||||
/// Returns a Stream over all of the sorted unique terms of
|
||||
/// for the given field.
|
||||
pub fn terms(&self) -> TermMerger {
|
||||
let term_streamers: Vec<_> = self.inv_index_readers
|
||||
let term_streamers: Vec<_> = self
|
||||
.inv_index_readers
|
||||
.iter()
|
||||
.map(|inverted_index| inverted_index.terms().stream())
|
||||
.collect();
|
||||
@@ -92,15 +108,10 @@ impl FieldSearcher {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<SegmentReader>> for Searcher {
|
||||
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
Searcher { segment_readers }
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Searcher {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let segment_ids = self.segment_readers
|
||||
let segment_ids = self
|
||||
.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.segment_id())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
use Result;
|
||||
use std::path::PathBuf;
|
||||
use schema::Schema;
|
||||
use std::fmt;
|
||||
use core::SegmentId;
|
||||
use directory::{FileProtection, ReadOnlySource, WritePtr};
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use super::SegmentComponent;
|
||||
use core::Index;
|
||||
use std::result;
|
||||
use directory::Directory;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use directory::error::{OpenReadError, OpenWriteError};
|
||||
use directory::Directory;
|
||||
use directory::{FileProtection, ReadOnlySource, WritePtr};
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use schema::Schema;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::result;
|
||||
use Result;
|
||||
|
||||
/// A segment is a piece of the index.
|
||||
#[derive(Clone)]
|
||||
@@ -111,8 +111,8 @@ mod tests {
|
||||
|
||||
use core::SegmentComponent;
|
||||
use directory::Directory;
|
||||
use std::collections::HashSet;
|
||||
use schema::SchemaBuilder;
|
||||
use std::collections::HashSet;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::slice;
|
||||
|
||||
/// Enum describing each component of a tantivy segment.
|
||||
/// Each component is stored in its own file,
|
||||
/// using the pattern `segment_uuid`.`component_extension`,
|
||||
@@ -26,7 +28,7 @@ pub enum SegmentComponent {
|
||||
|
||||
impl SegmentComponent {
|
||||
/// Iterates through the components.
|
||||
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
|
||||
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
|
||||
SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use uuid::Uuid;
|
||||
use std::fmt;
|
||||
use std::cmp::{Ord, Ordering};
|
||||
use std::fmt;
|
||||
use uuid::Uuid;
|
||||
|
||||
#[cfg(test)]
|
||||
use std::sync::atomic;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use core::SegmentId;
|
||||
use super::SegmentComponent;
|
||||
use std::path::PathBuf;
|
||||
use core::SegmentId;
|
||||
use std::collections::HashSet;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
|
||||
@@ -1,31 +1,30 @@
|
||||
use Result;
|
||||
use core::Segment;
|
||||
use core::SegmentId;
|
||||
use core::SegmentComponent;
|
||||
use std::sync::RwLock;
|
||||
use common::HasLen;
|
||||
use core::SegmentMeta;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::DeleteBitSet;
|
||||
use store::StoreReader;
|
||||
use schema::Document;
|
||||
use DocId;
|
||||
use std::sync::Arc;
|
||||
use std::collections::HashMap;
|
||||
use common::CompositeFile;
|
||||
use std::fmt;
|
||||
use common::HasLen;
|
||||
use core::InvertedIndexReader;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use error::ErrorKind;
|
||||
use termdict::TermDictionaryImpl;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fastfield::FacetReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Schema;
|
||||
use termdict::TermDictionary;
|
||||
use fastfield::{FastValue, MultiValueIntFastFieldReader};
|
||||
use schema::Cardinality;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
|
||||
use fieldnorm::FieldNormReader;
|
||||
use schema::Cardinality;
|
||||
use schema::Document;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use store::StoreReader;
|
||||
use termdict::TermDictionary;
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
/// Entry point to access all of the datastructures of the `Segment`
|
||||
///
|
||||
@@ -76,6 +75,11 @@ impl SegmentReader {
|
||||
self.segment_meta.num_docs()
|
||||
}
|
||||
|
||||
/// Returns the schema of the index this segment belongs to.
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
/// Return the number of documents that have been
|
||||
/// deleted in the segment.
|
||||
pub fn num_deleted_docs(&self) -> DocId {
|
||||
@@ -105,12 +109,25 @@ impl SegmentReader {
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
|
||||
{
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)
|
||||
} else {
|
||||
{
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
|
||||
Ok(FastFieldReader::open(ff_source))
|
||||
} else {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
@@ -123,21 +140,34 @@ impl SegmentReader {
|
||||
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
|
||||
{
|
||||
let idx_reader = self.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let vals_reader = self.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
|
||||
} else {
|
||||
{
|
||||
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
|
||||
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
|
||||
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
|
||||
pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
match field_entry.field_type() {
|
||||
&FieldType::Bytes => {}
|
||||
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
|
||||
}
|
||||
let idx_reader = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let values = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||
Ok(BytesFastFieldReader::open(idx_reader, values))
|
||||
}
|
||||
|
||||
/// Accessor to the `FacetReader` associated to a given `Field`.
|
||||
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
@@ -157,7 +187,7 @@ impl SegmentReader {
|
||||
field_entry.name()
|
||||
))
|
||||
})?;
|
||||
let termdict = TermDictionaryImpl::from_source(termdict_source);
|
||||
let termdict = TermDictionary::from_source(termdict_source);
|
||||
let facet_reader = FacetReader::new(term_ords_reader, termdict);
|
||||
Ok(facet_reader)
|
||||
}
|
||||
@@ -171,12 +201,14 @@ impl SegmentReader {
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
|
||||
if let Some(fieldnorm_source) = self.fieldnorms_composite
|
||||
.open_read(field) {
|
||||
if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) {
|
||||
FieldNormReader::open(fieldnorm_source)
|
||||
} else {
|
||||
let field_name = self.schema.get_field_name(field);
|
||||
let err_msg= format!("Field norm not found for field {:?}. Was it market as indexed during indexing.", field_name);
|
||||
let err_msg = format!(
|
||||
"Field norm not found for field {:?}. Was it market as indexed during indexing.",
|
||||
field_name
|
||||
);
|
||||
panic!(err_msg);
|
||||
}
|
||||
}
|
||||
@@ -211,13 +243,12 @@ impl SegmentReader {
|
||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
||||
|
||||
let delete_bitset_opt =
|
||||
if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
Some(DeleteBitSet::open(delete_data))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let delete_bitset_opt = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
Some(DeleteBitSet::open(delete_data))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
@@ -243,7 +274,8 @@ impl SegmentReader {
|
||||
/// term dictionary associated to a specific field,
|
||||
/// and opening the posting list associated to any term.
|
||||
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
|
||||
if let Some(inv_idx_reader) = self.inv_idx_reader_cache
|
||||
if let Some(inv_idx_reader) = self
|
||||
.inv_idx_reader_cache
|
||||
.read()
|
||||
.expect("Lock poisoned. This should never happen")
|
||||
.get(&field)
|
||||
@@ -272,16 +304,18 @@ impl SegmentReader {
|
||||
|
||||
let postings_source = postings_source_opt.unwrap();
|
||||
|
||||
let termdict_source = self.termdict_composite
|
||||
let termdict_source = self
|
||||
.termdict_composite
|
||||
.open_read(field)
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
|
||||
|
||||
let positions_source = self.positions_composite
|
||||
let positions_source = self
|
||||
.positions_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
|
||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||
TermDictionaryImpl::from_source(termdict_source),
|
||||
TermDictionary::from_source(termdict_source),
|
||||
postings_source,
|
||||
positions_source,
|
||||
record_option,
|
||||
@@ -323,6 +357,11 @@ impl SegmentReader {
|
||||
.map(|delete_set| delete_set.is_deleted(doc))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Returns an iterator that will iterate over the alive document ids
|
||||
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
|
||||
SegmentReaderAliveDocsIterator::new(&self)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentReader {
|
||||
@@ -330,3 +369,90 @@ impl fmt::Debug for SegmentReader {
|
||||
write!(f, "SegmentReader({:?})", self.segment_id)
|
||||
}
|
||||
}
|
||||
|
||||
/// Implements the iterator trait to allow easy iteration
|
||||
/// over non-deleted ("alive") DocIds in a SegmentReader
|
||||
pub struct SegmentReaderAliveDocsIterator<'a> {
|
||||
reader: &'a SegmentReader,
|
||||
max_doc: DocId,
|
||||
current: DocId,
|
||||
}
|
||||
|
||||
impl<'a> SegmentReaderAliveDocsIterator<'a> {
|
||||
pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
|
||||
SegmentReaderAliveDocsIterator {
|
||||
reader: reader,
|
||||
max_doc: reader.max_doc(),
|
||||
current: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
|
||||
type Item = DocId;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// TODO: Use TinySet (like in BitSetDocSet) to speed this process up
|
||||
if self.current >= self.max_doc {
|
||||
return None;
|
||||
}
|
||||
|
||||
// find the next alive doc id
|
||||
while self.reader.is_deleted(self.current) {
|
||||
self.current += 1;
|
||||
|
||||
if self.current >= self.max_doc {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// capture the current alive DocId
|
||||
let result = Some(self.current);
|
||||
|
||||
// move down the chain
|
||||
self.current += 1;
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use core::Index;
|
||||
use schema::{SchemaBuilder, Term, STORED, TEXT};
|
||||
use DocId;
|
||||
|
||||
#[test]
|
||||
fn test_alive_docs_iterator() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
schema_builder.add_text_field("name", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let name = schema.get_field("name").unwrap();
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(name => "tantivy"));
|
||||
index_writer.add_document(doc!(name => "horse"));
|
||||
index_writer.add_document(doc!(name => "jockey"));
|
||||
index_writer.add_document(doc!(name => "cap"));
|
||||
|
||||
// we should now have one segment with two docs
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
|
||||
{
|
||||
let mut index_writer2 = index.writer(50_000_000).unwrap();
|
||||
index_writer2.delete_term(Term::from_field_text(name, "horse"));
|
||||
index_writer2.delete_term(Term::from_field_text(name, "cap"));
|
||||
|
||||
// ok, now we should have a deleted doc
|
||||
index_writer2.commit().unwrap();
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let docs: Vec<DocId> = searcher.segment_reader(0).doc_ids_alive().collect();
|
||||
assert_eq!(vec![0u32, 2u32], docs);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +0,0 @@
|
||||
mod skip;
|
||||
pub mod stacker;
|
||||
|
||||
pub use self::skip::{SkipList, SkipListBuilder};
|
||||
@@ -1,161 +0,0 @@
|
||||
use std::mem;
|
||||
use super::heap::{Heap, HeapAllocable};
|
||||
|
||||
#[inline]
|
||||
pub fn is_power_of_2(val: u32) -> bool {
|
||||
val & (val - 1) == 0
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn jump_needed(val: u32) -> bool {
|
||||
val > 3 && is_power_of_2(val)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ExpUnrolledLinkedList {
|
||||
len: u32,
|
||||
end: u32,
|
||||
val0: u32,
|
||||
val1: u32,
|
||||
val2: u32,
|
||||
next: u32, // inline of the first block
|
||||
}
|
||||
|
||||
impl ExpUnrolledLinkedList {
|
||||
pub fn iter<'a>(&self, addr: u32, heap: &'a Heap) -> ExpUnrolledLinkedListIterator<'a> {
|
||||
ExpUnrolledLinkedListIterator {
|
||||
heap,
|
||||
addr: addr + 2u32 * (mem::size_of::<u32>() as u32),
|
||||
len: self.len,
|
||||
consumed: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push(&mut self, val: u32, heap: &Heap) {
|
||||
self.len += 1;
|
||||
if jump_needed(self.len) {
|
||||
// we need to allocate another block.
|
||||
// ... As we want to grow block exponentially
|
||||
// the next block as a size of (length so far),
|
||||
// and we need to add 1u32 to store the pointer
|
||||
// to the next element.
|
||||
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
|
||||
let new_block_addr: u32 = heap.allocate_space(new_block_size);
|
||||
heap.set(self.end, &new_block_addr);
|
||||
self.end = new_block_addr;
|
||||
}
|
||||
heap.set(self.end, &val);
|
||||
self.end += mem::size_of::<u32>() as u32;
|
||||
}
|
||||
}
|
||||
|
||||
impl HeapAllocable for u32 {
|
||||
fn with_addr(_addr: u32) -> u32 {
|
||||
0u32
|
||||
}
|
||||
}
|
||||
|
||||
impl HeapAllocable for ExpUnrolledLinkedList {
|
||||
fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
|
||||
let last_addr = addr + mem::size_of::<u32>() as u32 * 2u32;
|
||||
ExpUnrolledLinkedList {
|
||||
len: 0u32,
|
||||
end: last_addr,
|
||||
val0: 0u32,
|
||||
val1: 0u32,
|
||||
val2: 0u32,
|
||||
next: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExpUnrolledLinkedListIterator<'a> {
|
||||
heap: &'a Heap,
|
||||
addr: u32,
|
||||
len: u32,
|
||||
consumed: u32,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.consumed == self.len {
|
||||
None
|
||||
} else {
|
||||
let addr: u32;
|
||||
self.consumed += 1;
|
||||
if jump_needed(self.consumed) {
|
||||
addr = *self.heap.get_mut_ref(self.addr);
|
||||
} else {
|
||||
addr = self.addr;
|
||||
}
|
||||
self.addr = addr + mem::size_of::<u32>() as u32;
|
||||
Some(*self.heap.get_mut_ref(addr))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use super::super::heap::Heap;
|
||||
use test::Bencher;
|
||||
|
||||
const NUM_STACK: usize = 10_000;
|
||||
const STACK_SIZE: u32 = 1000;
|
||||
|
||||
#[test]
|
||||
fn test_stack() {
|
||||
let heap = Heap::with_capacity(1_000_000);
|
||||
let (addr, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
|
||||
stack.push(1u32, &heap);
|
||||
stack.push(2u32, &heap);
|
||||
stack.push(4u32, &heap);
|
||||
stack.push(8u32, &heap);
|
||||
{
|
||||
let mut it = stack.iter(addr, &heap);
|
||||
assert_eq!(it.next().unwrap(), 1u32);
|
||||
assert_eq!(it.next().unwrap(), 2u32);
|
||||
assert_eq!(it.next().unwrap(), 4u32);
|
||||
assert_eq!(it.next().unwrap(), 8u32);
|
||||
assert!(it.next().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_push_vec(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut vecs = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
vecs.push(Vec::new());
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
vecs[t].push(i);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_push_stack(bench: &mut Bencher) {
|
||||
let heap = Heap::with_capacity(64_000_000);
|
||||
bench.iter(|| {
|
||||
let mut stacks = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
|
||||
stacks.push(stack);
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
stacks[t].push(i, &heap);
|
||||
}
|
||||
}
|
||||
heap.clear();
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -1,309 +0,0 @@
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
use postings::UnorderedTermId;
|
||||
use super::heap::{BytesRef, Heap, HeapAllocable};
|
||||
|
||||
mod murmurhash2 {
|
||||
|
||||
const SEED: u32 = 3_242_157_231u32;
|
||||
|
||||
#[inline(always)]
|
||||
pub fn murmurhash2(key: &[u8]) -> u32 {
|
||||
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
|
||||
let m: u32 = 0x5bd1_e995;
|
||||
let r = 24;
|
||||
let len = key.len() as u32;
|
||||
|
||||
let mut h: u32 = SEED ^ len;
|
||||
let num_blocks = len >> 2;
|
||||
for _ in 0..num_blocks {
|
||||
let mut k: u32 = unsafe { *key_ptr };
|
||||
k = k.wrapping_mul(m);
|
||||
k ^= k >> r;
|
||||
k = k.wrapping_mul(m);
|
||||
k = k.wrapping_mul(m);
|
||||
h ^= k;
|
||||
key_ptr = key_ptr.wrapping_offset(1);
|
||||
}
|
||||
|
||||
// Handle the last few bytes of the input array
|
||||
let remaining = len & 3;
|
||||
let key_ptr_u8: *const u8 = key_ptr as *const u8;
|
||||
match remaining {
|
||||
3 => {
|
||||
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(2)) } << 16;
|
||||
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8;
|
||||
h ^= unsafe { u32::from(*key_ptr_u8) };
|
||||
h = h.wrapping_mul(m);
|
||||
}
|
||||
2 => {
|
||||
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8;
|
||||
h ^= unsafe { u32::from(*key_ptr_u8) };
|
||||
h = h.wrapping_mul(m);
|
||||
}
|
||||
1 => {
|
||||
h ^= unsafe { u32::from(*key_ptr_u8) };
|
||||
h = h.wrapping_mul(m);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(m);
|
||||
h ^ (h >> 15)
|
||||
}
|
||||
}
|
||||
|
||||
/// Split the thread memory budget into
|
||||
/// - the heap size
|
||||
/// - the hash table "table" itself.
|
||||
///
|
||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||
pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
|
||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||
let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
|
||||
let table_num_bits: usize = (1..)
|
||||
.into_iter()
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.last()
|
||||
.expect(&format!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
));
|
||||
let table_size = compute_table_size(table_num_bits);
|
||||
let heap_size = per_thread_memory_budget - table_size;
|
||||
(heap_size, table_num_bits)
|
||||
}
|
||||
|
||||
/// `KeyValue` is the item stored in the hash table.
|
||||
/// The key is actually a `BytesRef` object stored in an external heap.
|
||||
/// The `value_addr` also points to an address in the heap.
|
||||
///
|
||||
/// The key and the value are actually stored contiguously.
|
||||
/// For this reason, the (start, stop) information is actually redundant
|
||||
/// and can be simplified in the future
|
||||
#[derive(Copy, Clone, Default)]
|
||||
struct KeyValue {
|
||||
key_value_addr: BytesRef,
|
||||
hash: u32,
|
||||
}
|
||||
|
||||
impl KeyValue {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.key_value_addr.is_null()
|
||||
}
|
||||
}
|
||||
|
||||
/// Customized `HashMap` with string keys
|
||||
///
|
||||
/// This `HashMap` takes String as keys. Keys are
|
||||
/// stored in a user defined heap.
|
||||
///
|
||||
/// The quirky API has the benefit of avoiding
|
||||
/// the computation of the hash of the key twice,
|
||||
/// or copying the key as long as there is no insert.
|
||||
///
|
||||
pub struct TermHashMap<'a> {
|
||||
table: Box<[KeyValue]>,
|
||||
heap: &'a Heap,
|
||||
mask: usize,
|
||||
occupied: Vec<usize>,
|
||||
}
|
||||
|
||||
struct QuadraticProbing {
|
||||
hash: usize,
|
||||
i: usize,
|
||||
mask: usize,
|
||||
}
|
||||
|
||||
impl QuadraticProbing {
|
||||
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
|
||||
QuadraticProbing {
|
||||
hash,
|
||||
i: 0,
|
||||
mask,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next_probe(&mut self) -> usize {
|
||||
self.i += 1;
|
||||
(self.hash + self.i * self.i) & self.mask
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TermHashMap<'a> {
|
||||
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
|
||||
let table_size = 1 << num_bucket_power_of_2;
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
|
||||
TermHashMap {
|
||||
table: table.into_boxed_slice(),
|
||||
heap,
|
||||
mask: table_size - 1,
|
||||
occupied: Vec::with_capacity(table_size / 2),
|
||||
}
|
||||
}
|
||||
|
||||
fn probe(&self, hash: u32) -> QuadraticProbing {
|
||||
QuadraticProbing::compute(hash as usize, self.mask)
|
||||
}
|
||||
|
||||
pub fn is_saturated(&self) -> bool {
|
||||
self.table.len() < self.occupied.len() * 3
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn get_key_value(&self, bytes_ref: BytesRef) -> (&[u8], u32) {
|
||||
let key_bytes: &[u8] = self.heap.get_slice(bytes_ref);
|
||||
let expull_addr: u32 = bytes_ref.addr() + 2 + key_bytes.len() as u32;
|
||||
(key_bytes, expull_addr)
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
|
||||
self.occupied.push(bucket);
|
||||
self.table[bucket] = KeyValue {
|
||||
key_value_addr, hash
|
||||
};
|
||||
}
|
||||
|
||||
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32, UnorderedTermId)> + 'b {
|
||||
self.occupied.iter().cloned().map(move |bucket: usize| {
|
||||
let kv = self.table[bucket];
|
||||
let (key, offset) = self.get_key_value(kv.key_value_addr);
|
||||
(key, offset, bucket as UnorderedTermId)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
|
||||
&mut self,
|
||||
key: S,
|
||||
) -> (UnorderedTermId, &mut V) {
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let hash = murmurhash2::murmurhash2(key.as_ref());
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
let key_bytes_ref = self.heap.allocate_and_set(key_bytes);
|
||||
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
|
||||
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
|
||||
self.set_bucket(hash, key_bytes_ref, bucket);
|
||||
return (bucket as UnorderedTermId, val);
|
||||
} else if kv.hash == hash {
|
||||
let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
|
||||
if stored_key == key_bytes {
|
||||
return (
|
||||
bucket as UnorderedTermId,
|
||||
self.heap.get_mut_ref(expull_addr),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use super::super::heap::{Heap, HeapAllocable};
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use test::Bencher;
|
||||
use std::collections::HashSet;
|
||||
use super::split_memory;
|
||||
|
||||
struct TestValue {
|
||||
val: u32,
|
||||
_addr: u32,
|
||||
}
|
||||
|
||||
impl HeapAllocable for TestValue {
|
||||
fn with_addr(addr: u32) -> TestValue {
|
||||
TestValue {
|
||||
val: 0u32,
|
||||
_addr: addr,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(split_memory(100_000), (67232, 12));
|
||||
assert_eq!(split_memory(1_000_000), (737856, 15));
|
||||
assert_eq!(split_memory(10_000_000), (7902848, 18));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hash_map() {
|
||||
let heap = Heap::with_capacity(2_000_000);
|
||||
let mut hash_map: TermHashMap = TermHashMap::new(18, &heap);
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc").1;
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 3u32;
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 4u32;
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc").1;
|
||||
assert_eq!(v.val, 3u32);
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
|
||||
assert_eq!(v.val, 4u32);
|
||||
}
|
||||
let mut iter_values = hash_map.iter();
|
||||
{
|
||||
let (_, addr, _) = iter_values.next().unwrap();
|
||||
let val: &TestValue = heap.get_ref(addr);
|
||||
assert_eq!(val.val, 3u32);
|
||||
}
|
||||
{
|
||||
let (_, addr, _) = iter_values.next().unwrap();
|
||||
let val: &TestValue = heap.get_ref(addr);
|
||||
assert_eq!(val.val, 4u32);
|
||||
}
|
||||
assert!(iter_values.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur() {
|
||||
let s1 = "abcdef";
|
||||
let s2 = "abcdeg";
|
||||
for i in 0..5 {
|
||||
assert_eq!(
|
||||
murmurhash2(&s1[i..5].as_bytes()),
|
||||
murmurhash2(&s2[i..5].as_bytes())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_collisions() {
|
||||
let mut set: HashSet<u32> = HashSet::default();
|
||||
for i in 0..10_000 {
|
||||
let s = format!("hash{}", i);
|
||||
let hash = murmurhash2(s.as_bytes());
|
||||
set.insert(hash);
|
||||
}
|
||||
assert_eq!(set.len(), 10_000);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_murmurhash_2(b: &mut Bencher) {
|
||||
let keys: Vec<&'static str> =
|
||||
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
||||
b.iter(|| {
|
||||
keys.iter()
|
||||
.map(|&s| s.as_bytes())
|
||||
.map(murmurhash2::murmurhash2)
|
||||
.map(|h| h as u64)
|
||||
.last()
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,233 +0,0 @@
|
||||
use std::cell::UnsafeCell;
|
||||
use std::mem;
|
||||
use std::ptr;
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
|
||||
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
|
||||
///
|
||||
/// The slice will encode the length of the `&[u8]` slice
|
||||
/// on 16-bits, and then the data is encoded.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct BytesRef(u32);
|
||||
|
||||
impl BytesRef {
|
||||
pub fn is_null(&self) -> bool {
|
||||
self.0 == u32::max_value()
|
||||
}
|
||||
|
||||
pub fn addr(&self) -> u32 {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BytesRef {
|
||||
fn default() -> BytesRef {
|
||||
BytesRef(u32::max_value())
|
||||
}
|
||||
}
|
||||
|
||||
/// Object that can be allocated in tantivy's custom `Heap`.
|
||||
pub trait HeapAllocable {
|
||||
fn with_addr(addr: u32) -> Self;
|
||||
}
|
||||
|
||||
/// Tantivy's custom `Heap`.
|
||||
pub struct Heap {
|
||||
inner: UnsafeCell<InnerHeap>,
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(mut_from_ref))]
|
||||
impl Heap {
|
||||
/// Creates a new heap with a given capacity
|
||||
pub fn with_capacity(num_bytes: usize) -> Heap {
|
||||
Heap {
|
||||
inner: UnsafeCell::new(InnerHeap::with_capacity(num_bytes)),
|
||||
}
|
||||
}
|
||||
|
||||
fn inner(&self) -> &mut InnerHeap {
|
||||
unsafe { &mut *self.inner.get() }
|
||||
}
|
||||
|
||||
/// Clears the heap. All the underlying data is lost.
|
||||
///
|
||||
/// This heap does not support deallocation.
|
||||
/// This method is the only way to free memory.
|
||||
pub fn clear(&self) {
|
||||
self.inner().clear();
|
||||
}
|
||||
|
||||
/// Return amount of free space, in bytes.
|
||||
pub fn num_free_bytes(&self) -> u32 {
|
||||
self.inner().num_free_bytes()
|
||||
}
|
||||
|
||||
/// Allocate a given amount of space and returns an address
|
||||
/// in the Heap.
|
||||
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
|
||||
self.inner().allocate_space(num_bytes)
|
||||
}
|
||||
|
||||
/// Allocate an object in the heap
|
||||
pub fn allocate_object<V: HeapAllocable>(&self) -> (u32, &mut V) {
|
||||
let addr = self.inner().allocate_space(mem::size_of::<V>());
|
||||
let v: V = V::with_addr(addr);
|
||||
self.inner().set(addr, &v);
|
||||
(addr, self.inner().get_mut_ref(addr))
|
||||
}
|
||||
|
||||
/// Stores a `&[u8]` in the heap and returns the destination BytesRef.
|
||||
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
|
||||
self.inner().allocate_and_set(data)
|
||||
}
|
||||
|
||||
/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
|
||||
/// given as argumetn
|
||||
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
self.inner().get_slice(bytes_ref)
|
||||
}
|
||||
|
||||
/// Stores an item's data in the heap, at the given `address`.
|
||||
pub fn set<Item>(&self, addr: u32, val: &Item) {
|
||||
self.inner().set(addr, val);
|
||||
}
|
||||
|
||||
/// Returns a mutable reference for an object at a given Item.
|
||||
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
|
||||
self.inner().get_mut_ref(addr)
|
||||
}
|
||||
|
||||
/// Returns a mutable reference to an `Item` at a given `addr`.
|
||||
#[cfg(test)]
|
||||
pub fn get_ref<Item>(&self, addr: u32) -> &mut Item {
|
||||
self.get_mut_ref(addr)
|
||||
}
|
||||
}
|
||||
|
||||
struct InnerHeap {
|
||||
buffer: Vec<u8>,
|
||||
buffer_len: u32,
|
||||
used: u32,
|
||||
next_heap: Option<Box<InnerHeap>>,
|
||||
}
|
||||
|
||||
impl InnerHeap {
|
||||
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
|
||||
let buffer: Vec<u8> = vec![0u8; num_bytes];
|
||||
InnerHeap {
|
||||
buffer,
|
||||
buffer_len: num_bytes as u32,
|
||||
next_heap: None,
|
||||
used: 0u32,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.used = 0u32;
|
||||
self.next_heap = None;
|
||||
}
|
||||
|
||||
// Returns the number of free bytes. If the buffer
|
||||
// has reached it's capacity and overflowed to another buffer, return 0.
|
||||
pub fn num_free_bytes(&self) -> u32 {
|
||||
if self.next_heap.is_some() {
|
||||
0u32
|
||||
} else {
|
||||
self.buffer_len - self.used
|
||||
}
|
||||
}
|
||||
|
||||
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
|
||||
let addr = self.used;
|
||||
self.used += num_bytes as u32;
|
||||
if self.used <= self.buffer_len {
|
||||
addr
|
||||
} else {
|
||||
if self.next_heap.is_none() {
|
||||
info!(
|
||||
r#"Exceeded heap size. The segment will be committed right
|
||||
after indexing this document."#,
|
||||
);
|
||||
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
|
||||
}
|
||||
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
|
||||
}
|
||||
}
|
||||
|
||||
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
let start = bytes_ref.0;
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.get_slice(BytesRef(start - self.buffer_len))
|
||||
} else {
|
||||
let start = start as usize;
|
||||
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
|
||||
&self.buffer[start + 2..start + 2 + len]
|
||||
}
|
||||
}
|
||||
|
||||
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
} else {
|
||||
&mut self.buffer[start as usize..stop as usize]
|
||||
}
|
||||
}
|
||||
|
||||
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
|
||||
assert!(data.len() < u16::max_value() as usize);
|
||||
let total_len = 2 + data.len();
|
||||
let start = self.allocate_space(total_len);
|
||||
let total_buff = self.get_mut_slice(start, start + total_len as u32);
|
||||
NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16);
|
||||
total_buff[2..].clone_from_slice(data);
|
||||
BytesRef(start)
|
||||
}
|
||||
|
||||
fn get_mut(&mut self, addr: u32) -> *mut u8 {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut(addr - self.buffer_len)
|
||||
} else {
|
||||
let addr_isize = addr as isize;
|
||||
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
|
||||
}
|
||||
}
|
||||
|
||||
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_ref(addr - self.buffer_len)
|
||||
} else {
|
||||
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
|
||||
let v_ptr = v_ptr_u8 as *mut Item;
|
||||
unsafe { &mut *v_ptr }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.set(addr - self.buffer_len, val);
|
||||
} else {
|
||||
let v_ptr: *const Item = val as *const Item;
|
||||
let v_ptr_u8: *const u8 = v_ptr as *const u8;
|
||||
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
|
||||
unsafe {
|
||||
let dest_ptr: *mut u8 = self.get_mut(addr);
|
||||
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,43 +0,0 @@
|
||||
pub(crate) mod hashmap;
|
||||
mod heap;
|
||||
mod expull;
|
||||
|
||||
pub use self::heap::{Heap, HeapAllocable};
|
||||
pub use self::expull::ExpUnrolledLinkedList;
|
||||
pub use self::hashmap::TermHashMap;
|
||||
|
||||
#[test]
|
||||
fn test_unrolled_linked_list() {
|
||||
use std::collections;
|
||||
let heap = Heap::with_capacity(30_000_000);
|
||||
{
|
||||
heap.clear();
|
||||
let mut ks: Vec<usize> = (1..5).map(|k| k * 100).collect();
|
||||
ks.push(2);
|
||||
ks.push(3);
|
||||
for k in (1..5).map(|k| k * 100) {
|
||||
let mut hashmap: TermHashMap = TermHashMap::new(10, &heap);
|
||||
for j in 0..k {
|
||||
for i in 0..500 {
|
||||
let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string()).1;
|
||||
v.push(i * j, &heap);
|
||||
}
|
||||
}
|
||||
let mut map_addr: collections::HashMap<Vec<u8>, u32> = collections::HashMap::new();
|
||||
for (key, addr, _) in hashmap.iter() {
|
||||
map_addr.insert(Vec::from(key), addr);
|
||||
}
|
||||
|
||||
for i in 0..500 {
|
||||
let key: String = i.to_string();
|
||||
let addr: u32 = *map_addr.get(key.as_bytes()).unwrap();
|
||||
let exp_pull: &ExpUnrolledLinkedList = heap.get_ref(addr);
|
||||
let mut it = exp_pull.iter(addr, &heap);
|
||||
for j in 0..k {
|
||||
assert_eq!(it.next().unwrap(), i * j);
|
||||
}
|
||||
assert!(!it.next().is_some());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,11 +1,11 @@
|
||||
use std::marker::Send;
|
||||
use std::fmt;
|
||||
use std::path::Path;
|
||||
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use std::result;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::marker::Send;
|
||||
use std::marker::Sync;
|
||||
use std::path::Path;
|
||||
use std::result;
|
||||
|
||||
/// Write-once read many (WORM) abstraction for where
|
||||
/// tantivy's data should be stored.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::error::Error as StdError;
|
||||
use std::path::PathBuf;
|
||||
use std::io;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// General IO error with an optional path to the offending file.
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
use std::path::{Path, PathBuf};
|
||||
use serde_json;
|
||||
use core::MANAGED_FILEPATH;
|
||||
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use std::result;
|
||||
use std::io;
|
||||
use Directory;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::collections::HashSet;
|
||||
use std::sync::RwLockWriteGuard;
|
||||
use std::io::Write;
|
||||
use core::MANAGED_FILEPATH;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use error::{ErrorKind, Result, ResultExt};
|
||||
use serde_json;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::RwLockWriteGuard;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use Directory;
|
||||
|
||||
/// Wrapper of directories that keeps track of files created by Tantivy.
|
||||
///
|
||||
@@ -86,7 +86,7 @@ impl ManagedDirectory {
|
||||
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
|
||||
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
|
||||
Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
directory: Box::new(directory),
|
||||
meta_informations: Arc::new(RwLock::new(MetaInformation {
|
||||
managed_paths: managed_files,
|
||||
protected_files: HashMap::default(),
|
||||
@@ -94,7 +94,7 @@ impl ManagedDirectory {
|
||||
})
|
||||
}
|
||||
Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
directory: Box::new(directory),
|
||||
meta_informations: Arc::default(),
|
||||
}),
|
||||
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
|
||||
@@ -117,7 +117,8 @@ impl ManagedDirectory {
|
||||
let mut files_to_delete = vec![];
|
||||
{
|
||||
// releasing the lock as .delete() will use it too.
|
||||
let meta_informations_rlock = self.meta_informations
|
||||
let meta_informations_rlock = self
|
||||
.meta_informations
|
||||
.read()
|
||||
.expect("Managed directory rlock poisoned in garbage collect.");
|
||||
|
||||
@@ -170,7 +171,8 @@ impl ManagedDirectory {
|
||||
if !deleted_files.is_empty() {
|
||||
// update the list of managed files by removing
|
||||
// the file that were removed.
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
let mut meta_informations_wlock = self
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed directory wlock poisoned (2).");
|
||||
{
|
||||
@@ -193,7 +195,8 @@ impl ManagedDirectory {
|
||||
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
|
||||
let pathbuf = path.to_owned();
|
||||
{
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
let mut meta_informations_wlock = self
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned on protect");
|
||||
*meta_informations_wlock
|
||||
@@ -215,7 +218,8 @@ impl ManagedDirectory {
|
||||
/// will not lead to garbage files that will
|
||||
/// never get removed.
|
||||
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
|
||||
let mut meta_wlock = self.meta_informations
|
||||
let mut meta_wlock = self
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
|
||||
@@ -248,7 +252,8 @@ impl Directory for ManagedDirectory {
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
{
|
||||
let metas_rlock = self.meta_informations
|
||||
let metas_rlock = self
|
||||
.meta_informations
|
||||
.read()
|
||||
.expect("poisoned lock in managed directory meta");
|
||||
if let Some(counter) = metas_rlock.protected_files.get(path) {
|
||||
@@ -265,7 +270,7 @@ impl Directory for ManagedDirectory {
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
box self.clone()
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -282,10 +287,10 @@ impl Clone for ManagedDirectory {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
use directory::MmapDirectory;
|
||||
use std::path::Path;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use tempdir::TempDir;
|
||||
|
||||
lazy_static! {
|
||||
@@ -294,7 +299,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_managed_directory() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
@@ -343,7 +348,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature="mmap ")]
|
||||
#[cfg(feature = "mmap ")]
|
||||
fn test_managed_directory_gc_while_mmapped() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
@@ -373,7 +378,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_managed_directory_protect() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
use atomicwrites;
|
||||
use common::make_io_err;
|
||||
use directory::Directory;
|
||||
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
use directory::ReadOnlySource;
|
||||
use directory::shared_vec_slice::SharedVecSlice;
|
||||
use directory::Directory;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WritePtr;
|
||||
use fst::raw::MmapReadOnly;
|
||||
use std::collections::hash_map::Entry as HashMapEntry;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::From;
|
||||
use std::fmt;
|
||||
use std::fs::{self, File};
|
||||
use std::fs::OpenOptions;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, Seek, SeekFrom};
|
||||
use std::io::{BufWriter, Read, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -32,7 +32,8 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
|
||||
}
|
||||
})?;
|
||||
|
||||
let meta_data = file.metadata()
|
||||
let meta_data = file
|
||||
.metadata()
|
||||
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
|
||||
if meta_data.len() == 0 {
|
||||
// if the file size is 0, it will not be possible
|
||||
@@ -40,9 +41,11 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
|
||||
// instead.
|
||||
return Ok(None);
|
||||
}
|
||||
MmapReadOnly::open(&file)
|
||||
.map(Some)
|
||||
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
|
||||
unsafe {
|
||||
MmapReadOnly::open(&file)
|
||||
.map(Some)
|
||||
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
|
||||
@@ -307,7 +310,8 @@ impl Directory for MmapDirectory {
|
||||
// when the last reference is gone.
|
||||
mmap_cache.cache.remove(&full_path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => self.sync_directory()
|
||||
Ok(_) => self
|
||||
.sync_directory()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
|
||||
@@ -4,29 +4,29 @@ WORM directory abstraction.
|
||||
|
||||
*/
|
||||
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
mod mmap_directory;
|
||||
|
||||
mod ram_directory;
|
||||
mod directory;
|
||||
mod managed_directory;
|
||||
mod ram_directory;
|
||||
mod read_only_source;
|
||||
mod shared_vec_slice;
|
||||
mod managed_directory;
|
||||
|
||||
/// Errors specific to the directory module.
|
||||
pub mod error;
|
||||
|
||||
use std::io::{BufWriter, Seek, Write};
|
||||
|
||||
pub use self::read_only_source::ReadOnlySource;
|
||||
pub use self::directory::Directory;
|
||||
pub use self::ram_directory::RAMDirectory;
|
||||
pub use self::read_only_source::ReadOnlySource;
|
||||
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
pub use self::mmap_directory::MmapDirectory;
|
||||
|
||||
pub(crate) use self::read_only_source::SourceRead;
|
||||
pub(crate) use self::managed_directory::{FileProtection, ManagedDirectory};
|
||||
pub(crate) use self::read_only_source::SourceRead;
|
||||
|
||||
/// Synonym of Seek + Write
|
||||
pub trait SeekableWrite: Seek + Write {}
|
||||
@@ -42,8 +42,8 @@ pub type WritePtr = BufWriter<Box<SeekableWrite>>;
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use std::path::Path;
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::path::Path;
|
||||
|
||||
lazy_static! {
|
||||
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
|
||||
@@ -56,7 +56,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_mmap_directory() {
|
||||
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
|
||||
test_directory(&mut mmap_directory);
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use super::shared_vec_slice::SharedVecSlice;
|
||||
use common::make_io_err;
|
||||
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||
use directory::WritePtr;
|
||||
use directory::{Directory, ReadOnlySource};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use common::make_io_err;
|
||||
use directory::{Directory, ReadOnlySource};
|
||||
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||
use directory::WritePtr;
|
||||
use super::shared_vec_slice::SharedVecSlice;
|
||||
|
||||
/// Writer associated with the `RAMDirectory`
|
||||
///
|
||||
@@ -170,7 +170,8 @@ impl Directory for RAMDirectory {
|
||||
let path_buf = PathBuf::from(path);
|
||||
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
|
||||
let exists = self.fs
|
||||
let exists = self
|
||||
.fs
|
||||
.write(path_buf.clone(), &Vec::new())
|
||||
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
|
||||
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#[cfg(feature="mmap")]
|
||||
use fst::raw::MmapReadOnly;
|
||||
use std::ops::Deref;
|
||||
use super::shared_vec_slice::SharedVecSlice;
|
||||
use common::HasLen;
|
||||
use std::slice;
|
||||
use std::io::{self, Read};
|
||||
#[cfg(feature = "mmap")]
|
||||
use fst::raw::MmapReadOnly;
|
||||
use stable_deref_trait::{CloneStableDeref, StableDeref};
|
||||
use std::io::{self, Read};
|
||||
use std::ops::Deref;
|
||||
use std::slice;
|
||||
|
||||
/// Read object that represents files in tantivy.
|
||||
///
|
||||
@@ -15,7 +15,7 @@ use stable_deref_trait::{CloneStableDeref, StableDeref};
|
||||
/// hold by this object should never be altered or destroyed.
|
||||
pub enum ReadOnlySource {
|
||||
/// Mmap source of data
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
Mmap(MmapReadOnly),
|
||||
/// Wrapping a `Vec<u8>`
|
||||
Anonymous(SharedVecSlice),
|
||||
@@ -41,8 +41,8 @@ impl ReadOnlySource {
|
||||
/// Returns the data underlying the ReadOnlySource object.
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
match *self {
|
||||
#[cfg(feature="mmap")]
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
|
||||
#[cfg(feature = "mmap")]
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
|
||||
}
|
||||
}
|
||||
@@ -66,9 +66,14 @@ impl ReadOnlySource {
|
||||
/// 1KB slice is remaining, the whole `500MBs`
|
||||
/// are retained in memory.
|
||||
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
|
||||
assert!(from_offset <= to_offset, "Requested negative slice [{}..{}]", from_offset, to_offset);
|
||||
assert!(
|
||||
from_offset <= to_offset,
|
||||
"Requested negative slice [{}..{}]",
|
||||
from_offset,
|
||||
to_offset
|
||||
);
|
||||
match *self {
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => {
|
||||
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
|
||||
ReadOnlySource::Mmap(sliced_mmap)
|
||||
@@ -130,13 +135,11 @@ impl SourceRead {
|
||||
|
||||
pub fn slice_from(&self, start: usize) -> &[u8] {
|
||||
&self.cursor[start..]
|
||||
|
||||
}
|
||||
|
||||
pub fn get(&self, idx: usize) -> u8 {
|
||||
self.cursor[idx]
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for SourceRead {
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use DocId;
|
||||
use common::BitSet;
|
||||
use std::borrow::Borrow;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::cmp::Ordering;
|
||||
use common::BitSet;
|
||||
use DocId;
|
||||
|
||||
/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
|
||||
14
src/error.rs
14
src/error.rs
@@ -2,13 +2,13 @@
|
||||
|
||||
use std::io;
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::PoisonError;
|
||||
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
use fastfield::FastFieldNotAvailableError;
|
||||
use query;
|
||||
use schema;
|
||||
use fastfield::FastFieldNotAvailableError;
|
||||
use serde_json;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::PoisonError;
|
||||
|
||||
error_chain!(
|
||||
errors {
|
||||
@@ -48,10 +48,10 @@ error_chain!(
|
||||
description("an error occurred in a thread")
|
||||
display("an error occurred in a thread: '{}'", err)
|
||||
}
|
||||
/// An Error appeared related to the lack of a field.
|
||||
SchemaError(field: String) {
|
||||
description("a schema field is missing")
|
||||
display("a schema field is missing: '{}'", field)
|
||||
/// An Error appeared related to the schema.
|
||||
SchemaError(message: String) {
|
||||
description("the schema is not matching expectations.")
|
||||
display("Schema error: '{}'", message)
|
||||
}
|
||||
/// Tried to access a fastfield reader for a field not configured accordingly.
|
||||
FastFieldError(err: FastFieldNotAvailableError) {
|
||||
|
||||
38
src/fastfield/bytes/mod.rs
Normal file
38
src/fastfield/bytes/mod.rs
Normal file
@@ -0,0 +1,38 @@
|
||||
mod reader;
|
||||
mod writer;
|
||||
|
||||
pub use self::reader::BytesFastFieldReader;
|
||||
pub use self::writer::BytesFastFieldWriter;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use schema::SchemaBuilder;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_bytes() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_bytes_field("bytesfield");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(field=>vec![0u8, 1, 2, 3]));
|
||||
index_writer.add_document(doc!(field=>vec![]));
|
||||
index_writer.add_document(doc!(field=>vec![255u8]));
|
||||
index_writer.add_document(doc!(field=>vec![1u8, 3, 5, 7, 9]));
|
||||
index_writer.add_document(doc!(field=>vec![0u8; 1000]));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let bytes_reader = reader.bytes_fast_field_reader(field).unwrap();
|
||||
|
||||
assert_eq!(bytes_reader.get_val(0), &[0u8, 1, 2, 3]);
|
||||
assert!(bytes_reader.get_val(1).is_empty());
|
||||
assert_eq!(bytes_reader.get_val(2), &[255u8]);
|
||||
assert_eq!(bytes_reader.get_val(3), &[1u8, 3, 5, 7, 9]);
|
||||
let long = vec![0u8; 1000];
|
||||
assert_eq!(bytes_reader.get_val(4), long.as_slice());
|
||||
}
|
||||
}
|
||||
37
src/fastfield/bytes/reader.rs
Normal file
37
src/fastfield/bytes/reader.rs
Normal file
@@ -0,0 +1,37 @@
|
||||
use owning_ref::OwningRef;
|
||||
|
||||
use directory::ReadOnlySource;
|
||||
use fastfield::FastFieldReader;
|
||||
use DocId;
|
||||
|
||||
/// Reader for byte array fast fields
|
||||
///
|
||||
/// The reader is implemented as a `u64` fast field and a separate collection of bytes.
|
||||
///
|
||||
/// The `vals_reader` will access the concatenated list of all values for all documents.
|
||||
///
|
||||
/// The `idx_reader` associates, for each document, the index of its first value.
|
||||
///
|
||||
/// Reading the value for a document is done by reading the start index for it,
|
||||
/// and the start index for the next document, and keeping the bytes in between.
|
||||
pub struct BytesFastFieldReader {
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
values: OwningRef<ReadOnlySource, [u8]>,
|
||||
}
|
||||
|
||||
impl BytesFastFieldReader {
|
||||
pub(crate) fn open(
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
values_source: ReadOnlySource,
|
||||
) -> BytesFastFieldReader {
|
||||
let values = OwningRef::new(values_source).map(|source| &source[..]);
|
||||
BytesFastFieldReader { idx_reader, values }
|
||||
}
|
||||
|
||||
/// Returns the bytes associated to the given `doc`
|
||||
pub fn get_val(&self, doc: DocId) -> &[u8] {
|
||||
let start = self.idx_reader.get(doc) as usize;
|
||||
let stop = self.idx_reader.get(doc + 1) as usize;
|
||||
&self.values[start..stop]
|
||||
}
|
||||
}
|
||||
96
src/fastfield/bytes/writer.rs
Normal file
96
src/fastfield/bytes/writer.rs
Normal file
@@ -0,0 +1,96 @@
|
||||
use std::io;
|
||||
|
||||
use fastfield::serializer::FastFieldSerializer;
|
||||
use schema::{Document, Field, Value};
|
||||
use DocId;
|
||||
|
||||
/// Writer for byte array (as in, any number of bytes per document) fast fields
|
||||
///
|
||||
/// This `BytesFastFieldWriter` is only useful for advanced user.
|
||||
/// The normal way to get your associated bytes in your index
|
||||
/// is to
|
||||
/// - declare your field with fast set to `Cardinality::SingleValue`
|
||||
/// in your schema
|
||||
/// - add your document simply by calling `.add_document(...)` with associating bytes to the field.
|
||||
///
|
||||
/// The `BytesFastFieldWriter` can be acquired from the
|
||||
/// fast field writer by calling
|
||||
/// [`.get_bytes_writer(...)`](./struct.FastFieldsWriter.html#method.get_bytes_writer).
|
||||
///
|
||||
/// Once acquired, writing is done by calling `.add_document_val(&[u8])`
|
||||
/// once per document, even if there are no bytes associated to it.
|
||||
pub struct BytesFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u8>,
|
||||
doc_index: Vec<u64>,
|
||||
}
|
||||
|
||||
impl BytesFastFieldWriter {
|
||||
/// Creates a new `BytesFastFieldWriter`
|
||||
pub fn new(field: Field) -> Self {
|
||||
BytesFastFieldWriter {
|
||||
field,
|
||||
vals: Vec::new(),
|
||||
doc_index: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Access the field associated to the `BytesFastFieldWriter`
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
|
||||
/// Finalize the current document.
|
||||
pub(crate) fn next_doc(&mut self) {
|
||||
self.doc_index.push(self.vals.len() as u64);
|
||||
}
|
||||
|
||||
/// Shift to the next document and add all of the
|
||||
/// matching field values present in the document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
self.next_doc();
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field() == self.field {
|
||||
if let &Value::Bytes(ref bytes) = field_value.value() {
|
||||
self.vals.extend_from_slice(bytes);
|
||||
} else {
|
||||
panic!(
|
||||
"Bytes field contained non-Bytes Value!. Field {:?} = {:?}",
|
||||
self.field, field_value
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Register the bytes associated to a document.
|
||||
///
|
||||
/// The method returns the `DocId` of the document that was
|
||||
/// just written.
|
||||
pub fn add_document_val(&mut self, val: &[u8]) -> DocId {
|
||||
let doc = self.doc_index.len() as DocId;
|
||||
self.next_doc();
|
||||
self.vals.extend_from_slice(val);
|
||||
doc
|
||||
}
|
||||
|
||||
/// Serializes the fast field values by pushing them to the `FastFieldSerializer`.
|
||||
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
|
||||
{
|
||||
// writing the offset index
|
||||
let mut doc_index_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
|
||||
for &offset in &self.doc_index {
|
||||
doc_index_serializer.add_val(offset)?;
|
||||
}
|
||||
doc_index_serializer.add_val(self.vals.len() as u64)?;
|
||||
doc_index_serializer.close_field()?;
|
||||
}
|
||||
{
|
||||
// writing the values themselves
|
||||
let mut value_serializer = serializer.new_bytes_fast_field_with_idx(self.field, 1)?;
|
||||
value_serializer.write_all(&self.vals)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,10 @@
|
||||
use bit_set::BitSet;
|
||||
use directory::WritePtr;
|
||||
use std::io::Write;
|
||||
use std::io;
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
use common::HasLen;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WritePtr;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use DocId;
|
||||
|
||||
/// Write a delete `BitSet`
|
||||
///
|
||||
@@ -41,7 +41,8 @@ pub struct DeleteBitSet {
|
||||
impl DeleteBitSet {
|
||||
/// Opens a delete bitset given its data source.
|
||||
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
|
||||
let num_deleted: usize = data.as_slice()
|
||||
let num_deleted: usize = data
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|b| b.count_ones() as usize)
|
||||
.sum();
|
||||
@@ -62,10 +63,8 @@ impl DeleteBitSet {
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
impl HasLen for DeleteBitSet {
|
||||
fn len(&self) -> usize {
|
||||
self.len
|
||||
@@ -74,10 +73,10 @@ impl HasLen for DeleteBitSet {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::path::PathBuf;
|
||||
use super::*;
|
||||
use bit_set::BitSet;
|
||||
use directory::*;
|
||||
use super::*;
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn test_delete_bitset_helper(bitset: &BitSet) {
|
||||
let test_path = PathBuf::from("test");
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::result;
|
||||
use schema::FieldEntry;
|
||||
use std::result;
|
||||
|
||||
/// `FastFieldNotAvailableError` is returned when the
|
||||
/// user requested for a fast field reader, and the field was not
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use super::MultiValueIntFastFieldReader;
|
||||
use DocId;
|
||||
use termdict::TermOrdinal;
|
||||
use schema::Facet;
|
||||
use termdict::{TermDictionary, TermDictionaryImpl};
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermOrdinal;
|
||||
use DocId;
|
||||
|
||||
/// The facet reader makes it possible to access the list of
|
||||
/// facets associated to a given document in a specific
|
||||
@@ -19,7 +19,7 @@ use termdict::{TermDictionary, TermDictionaryImpl};
|
||||
/// only makes sense for a given segment.
|
||||
pub struct FacetReader {
|
||||
term_ords: MultiValueIntFastFieldReader<u64>,
|
||||
term_dict: TermDictionaryImpl,
|
||||
term_dict: TermDictionary,
|
||||
}
|
||||
|
||||
impl FacetReader {
|
||||
@@ -28,11 +28,11 @@ impl FacetReader {
|
||||
/// A facet reader just wraps :
|
||||
/// - a `MultiValueIntFastFieldReader` that makes it possible to
|
||||
/// access the list of facet ords for a given document.
|
||||
/// - a `TermDictionaryImpl` that helps associating a facet to
|
||||
/// - a `TermDictionary` that helps associating a facet to
|
||||
/// an ordinal and vice versa.
|
||||
pub fn new(
|
||||
term_ords: MultiValueIntFastFieldReader<u64>,
|
||||
term_dict: TermDictionaryImpl,
|
||||
term_dict: TermDictionary,
|
||||
) -> FacetReader {
|
||||
FacetReader {
|
||||
term_ords,
|
||||
@@ -50,13 +50,14 @@ impl FacetReader {
|
||||
}
|
||||
|
||||
/// Accessor for the facet term dictionary.
|
||||
pub fn facet_dict(&self) -> &TermDictionaryImpl {
|
||||
pub fn facet_dict(&self) -> &TermDictionary {
|
||||
&self.term_dict
|
||||
}
|
||||
|
||||
/// Given a term ordinal returns the term associated to it.
|
||||
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
|
||||
let found_term = self.term_dict
|
||||
let found_term = self
|
||||
.term_dict
|
||||
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
|
||||
assert!(found_term, "Term ordinal {} no found.", facet_ord);
|
||||
}
|
||||
|
||||
@@ -23,26 +23,28 @@ values stored.
|
||||
Read access performance is comparable to that of an array lookup.
|
||||
*/
|
||||
|
||||
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
|
||||
pub use self::delete::write_delete_bitset;
|
||||
pub use self::delete::DeleteBitSet;
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use common;
|
||||
use schema::Cardinality;
|
||||
use schema::FieldType;
|
||||
use schema::Value;
|
||||
pub use self::delete::DeleteBitSet;
|
||||
pub use self::delete::write_delete_bitset;
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::MultiValueIntFastFieldReader;
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
|
||||
mod reader;
|
||||
mod writer;
|
||||
mod serializer;
|
||||
mod error;
|
||||
mod bytes;
|
||||
mod delete;
|
||||
mod error;
|
||||
mod facet_reader;
|
||||
mod multivalued;
|
||||
mod reader;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
|
||||
/// Trait for types that are allowed for fast fields: (u64 or i64).
|
||||
pub trait FastValue: Default + Clone + Copy {
|
||||
@@ -121,31 +123,27 @@ fn value_to_u64(value: &Value) -> u64 {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use common::CompositeFile;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use fastfield::FastFieldReader;
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use rand::XorShiftRng;
|
||||
use schema::{Schema, SchemaBuilder};
|
||||
use schema::Document;
|
||||
use schema::FAST;
|
||||
use schema::Field;
|
||||
use schema::FAST;
|
||||
use schema::{Schema, SchemaBuilder};
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use super::*;
|
||||
use test;
|
||||
use test::Bencher;
|
||||
|
||||
lazy_static! {
|
||||
static ref SCHEMA: Schema = {
|
||||
pub static ref SCHEMA: Schema = {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
schema_builder.add_u64_field("field", FAST);
|
||||
schema_builder.build()
|
||||
};
|
||||
static ref FIELD: Field = {
|
||||
SCHEMA.get_field("field").unwrap()
|
||||
};
|
||||
pub static ref FIELD: Field = { SCHEMA.get_field("field").unwrap() };
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -369,7 +367,7 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_permutation() -> Vec<u64> {
|
||||
pub fn generate_permutation() -> Vec<u64> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, 4];
|
||||
let mut rng = XorShiftRng::from_seed(*seed);
|
||||
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
|
||||
@@ -409,13 +407,27 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::tests::FIELD;
|
||||
use super::tests::{generate_permutation, SCHEMA};
|
||||
use super::*;
|
||||
use common::CompositeFile;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use fastfield::FastFieldReader;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use test::{self, Bencher};
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in Iterator::step_by(0u32..n, 7) {
|
||||
for i in (0u32..n / 7).map(|v| v * 7) {
|
||||
a ^= permutation[i as usize];
|
||||
}
|
||||
a
|
||||
@@ -461,7 +473,7 @@ mod tests {
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in Iterator::step_by(0u32..n, 7) {
|
||||
for i in (0u32..n / 7).map(|val| val * 7) {
|
||||
a ^= fast_field_reader.get(i);
|
||||
}
|
||||
a
|
||||
@@ -502,4 +514,5 @@ mod tests {
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
mod writer;
|
||||
mod reader;
|
||||
mod writer;
|
||||
|
||||
pub use self::writer::MultiValueIntFastFieldWriter;
|
||||
pub use self::reader::MultiValueIntFastFieldReader;
|
||||
pub use self::writer::MultiValueIntFastFieldWriter;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Cardinality;
|
||||
use schema::IntOptions;
|
||||
use schema::SchemaBuilder;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use DocId;
|
||||
use fastfield::{FastFieldReader, FastValue};
|
||||
use DocId;
|
||||
|
||||
/// Reader for a multivalued `u64` fast field.
|
||||
///
|
||||
@@ -26,13 +26,20 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `(start, stop)`, such that the values associated
|
||||
/// to the given document are `start..stop`.
|
||||
fn range(&self, doc: DocId) -> (u64, u64) {
|
||||
let start = self.idx_reader.get(doc);
|
||||
let stop = self.idx_reader.get(doc + 1);
|
||||
(start, stop)
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
|
||||
let start = self.idx_reader.get(doc) as u32;
|
||||
let stop = self.idx_reader.get(doc + 1) as u32;
|
||||
let (start, stop) = self.range(doc);
|
||||
let len = (stop - start) as usize;
|
||||
vals.resize(len, Item::default());
|
||||
self.vals_reader.get_range(start, &mut vals[..]);
|
||||
self.vals_reader.get_range(start as u32, &mut vals[..]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,12 +1,35 @@
|
||||
use fastfield::FastFieldSerializer;
|
||||
use fastfield::serializer::FastSingleFieldSerializer;
|
||||
use fastfield::value_to_u64;
|
||||
use std::collections::HashMap;
|
||||
use fastfield::FastFieldSerializer;
|
||||
use itertools::Itertools;
|
||||
use postings::UnorderedTermId;
|
||||
use schema::{Document, Field};
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use itertools::Itertools;
|
||||
use termdict::TermOrdinal;
|
||||
use DocId;
|
||||
|
||||
/// Writer for multi-valued (as in, more than one value per document)
|
||||
/// int fast field.
|
||||
///
|
||||
/// This `Writer` is only useful for advanced user.
|
||||
/// The normal way to get your multivalued int in your index
|
||||
/// is to
|
||||
/// - declare your field with fast set to `Cardinality::MultiValues`
|
||||
/// in your schema
|
||||
/// - add your document simply by calling `.add_document(...)`.
|
||||
///
|
||||
/// The `MultiValueIntFastFieldWriter` can be acquired from the
|
||||
/// fastfield writer, by calling [`.get_multivalue_writer(...)`](./struct.FastFieldsWriter.html#method.get_multivalue_writer).
|
||||
///
|
||||
/// Once acquired, writing is done by calling calls to
|
||||
/// `.add_document_vals(&[u64])` once per document.
|
||||
///
|
||||
/// The serializer makes it possible to remap all of the values
|
||||
/// that were pushed to the writer using a mapping.
|
||||
/// This makes it possible to push unordered term ids,
|
||||
/// during indexing and remap them to their respective
|
||||
/// term ids when the segment is getting serialized.
|
||||
pub struct MultiValueIntFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u64>,
|
||||
@@ -16,7 +39,7 @@ pub struct MultiValueIntFastFieldWriter {
|
||||
|
||||
impl MultiValueIntFastFieldWriter {
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub fn new(field: Field, is_facet: bool) -> Self {
|
||||
pub(crate) fn new(field: Field, is_facet: bool) -> Self {
|
||||
MultiValueIntFastFieldWriter {
|
||||
field,
|
||||
vals: Vec::new(),
|
||||
@@ -25,24 +48,26 @@ impl MultiValueIntFastFieldWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// Access the field associated to the `MultiValueIntFastFieldWriter`
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
|
||||
pub fn next_doc(&mut self) {
|
||||
/// Finalize the current document.
|
||||
pub(crate) fn next_doc(&mut self) {
|
||||
self.doc_index.push(self.vals.len() as u64);
|
||||
}
|
||||
|
||||
/// Records a new value.
|
||||
///
|
||||
/// The n-th value being recorded is implicitely
|
||||
/// associated to the document with the `DocId` n.
|
||||
/// (Well, `n-1` actually because of 0-indexing)
|
||||
pub fn add_val(&mut self, val: UnorderedTermId) {
|
||||
/// Pushes a new value to the current document.
|
||||
pub(crate) fn add_val(&mut self, val: UnorderedTermId) {
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
/// Shift to the next document and adds
|
||||
/// all of the matching field values present in the document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
self.next_doc();
|
||||
// facets are indexed in the `SegmentWriter` as we encode their unordered id.
|
||||
if !self.is_facet {
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field() == self.field {
|
||||
@@ -52,6 +77,17 @@ impl MultiValueIntFastFieldWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// Register all of the values associated to a document.
|
||||
///
|
||||
/// The method returns the `DocId` of the document that was
|
||||
/// just written.
|
||||
pub fn add_document_vals(&mut self, vals: &[UnorderedTermId]) -> DocId {
|
||||
let doc = self.doc_index.len() as DocId;
|
||||
self.next_doc();
|
||||
self.vals.extend_from_slice(vals);
|
||||
doc
|
||||
}
|
||||
|
||||
/// Serializes fast field values by pushing them to the `FastFieldSerializer`.
|
||||
///
|
||||
/// HashMap makes it possible to remap them before serializing.
|
||||
@@ -66,7 +102,7 @@ impl MultiValueIntFastFieldWriter {
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
mapping_opt: Option<&HashMap<UnorderedTermId, usize>>,
|
||||
mapping_opt: Option<&HashMap<UnorderedTermId, TermOrdinal>>,
|
||||
) -> io::Result<()> {
|
||||
{
|
||||
// writing the offset index
|
||||
@@ -90,13 +126,13 @@ impl MultiValueIntFastFieldWriter {
|
||||
1,
|
||||
)?;
|
||||
for val in &self.vals {
|
||||
let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64;
|
||||
let remapped_val = *mapping.get(val).expect("Missing term ordinal");
|
||||
value_serializer.add_val(remapped_val)?;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let val_min_max = self.vals.iter().cloned().minmax();
|
||||
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0));
|
||||
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0u64));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
for &val in &self.vals {
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
use common::BinarySerializable;
|
||||
use super::FastValue;
|
||||
use common::bitpacker::BitUnpacker;
|
||||
use common::CompositeFile;
|
||||
use common::compute_num_bits;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use common::BinarySerializable;
|
||||
use common::CompositeFile;
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use owning_ref::OwningRef;
|
||||
use schema::FAST;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::FAST;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::mem;
|
||||
use std::path::Path;
|
||||
use super::FastValue;
|
||||
use DocId;
|
||||
|
||||
/// Trait for accessing a fastfield.
|
||||
///
|
||||
@@ -67,12 +67,20 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// associated with the `DocId` going from
|
||||
/// `start` to `start + output.len()`.
|
||||
///
|
||||
/// Regardless of the type of `Item`, this method works
|
||||
/// - transmuting the output array
|
||||
/// - extracting the `Item`s as if they were `u64`
|
||||
/// - possibly converting the `u64` value to the right type.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
///
|
||||
// TODO change start to `u64`.
|
||||
// For multifastfield, start is an index in a second fastfield, not a `DocId`
|
||||
pub fn get_range(&self, start: u32, output: &mut [Item]) {
|
||||
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
|
||||
let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; // ok: Item is either `u64` or `i64`
|
||||
self.bit_unpacker.get_range(start, output_u64);
|
||||
for out in output_u64.iter_mut() {
|
||||
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use common::BinarySerializable;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use common::bitpacker::BitPacker;
|
||||
use common::compute_num_bits;
|
||||
use common::CountingWriter;
|
||||
use common::BinarySerializable;
|
||||
use common::CompositeWrite;
|
||||
use common::CountingWriter;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// `FastFieldSerializer` is in charge of serializing
|
||||
@@ -61,6 +61,16 @@ impl FastFieldSerializer {
|
||||
FastSingleFieldSerializer::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
/// Start serializing a new [u8] fast field
|
||||
pub fn new_bytes_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> io::Result<FastBytesFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
FastBytesFieldSerializer::open(field_write)
|
||||
}
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently save on disk.
|
||||
@@ -77,11 +87,20 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
min_value.serialize(write)?;
|
||||
let amplitude = max_value - min_value;
|
||||
amplitude.serialize(write)?;
|
||||
@@ -107,3 +126,21 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
self.bit_packer.close(&mut self.write)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastBytesFieldSerializer<'a, W: Write + 'a> {
|
||||
write: &'a mut W,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
|
||||
fn open(write: &'a mut W) -> io::Result<FastBytesFieldSerializer<'a, W>> {
|
||||
Ok(FastBytesFieldSerializer { write })
|
||||
}
|
||||
|
||||
pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> {
|
||||
self.write.write_all(vals)
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) -> io::Result<()> {
|
||||
self.write.flush()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
use schema::{Cardinality, Document, Field, Schema};
|
||||
use fastfield::FastFieldSerializer;
|
||||
use std::io;
|
||||
use schema::FieldType;
|
||||
use common;
|
||||
use common::VInt;
|
||||
use std::collections::HashMap;
|
||||
use postings::UnorderedTermId;
|
||||
use super::multivalued::MultiValueIntFastFieldWriter;
|
||||
use common;
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use fastfield::{BytesFastFieldWriter, FastFieldSerializer};
|
||||
use postings::UnorderedTermId;
|
||||
use schema::{Cardinality, Document, Field, FieldType, Schema};
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use termdict::TermOrdinal;
|
||||
|
||||
/// The fastfieldswriter regroup all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
single_value_writers: Vec<IntFastFieldWriter>,
|
||||
multi_values_writers: Vec<MultiValueIntFastFieldWriter>,
|
||||
bytes_value_writers: Vec<BytesFastFieldWriter>,
|
||||
}
|
||||
|
||||
impl FastFieldsWriter {
|
||||
@@ -20,6 +21,7 @@ impl FastFieldsWriter {
|
||||
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
|
||||
let mut single_value_writers = Vec::new();
|
||||
let mut multi_values_writers = Vec::new();
|
||||
let mut bytes_value_writers = Vec::new();
|
||||
|
||||
for (field_id, field_entry) in schema.fields().iter().enumerate() {
|
||||
let field = Field(field_id as u32);
|
||||
@@ -47,12 +49,17 @@ impl FastFieldsWriter {
|
||||
let fast_field_writer = MultiValueIntFastFieldWriter::new(field, true);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
let fast_field_writer = BytesFastFieldWriter::new(field);
|
||||
bytes_value_writers.push(fast_field_writer);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
FastFieldsWriter {
|
||||
single_value_writers,
|
||||
multi_values_writers,
|
||||
bytes_value_writers,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,24 +75,36 @@ impl FastFieldsWriter {
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
/// configured as a multivalued fastfield in the schema.
|
||||
pub(crate) fn get_multivalue_writer(
|
||||
pub fn get_multivalue_writer(
|
||||
&mut self,
|
||||
field: Field,
|
||||
) -> Option<&mut MultiValueIntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
// TODO expose for users
|
||||
self.multi_values_writers
|
||||
.iter_mut()
|
||||
.find(|multivalue_writer| multivalue_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Returns the bytes fast field writer for the given field.
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
/// configured as a bytes fastfield in the schema.
|
||||
pub fn get_bytes_writer(&mut self, field: Field) -> Option<&mut BytesFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.bytes_value_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Indexes all of the fastfields of a new document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
for field_writer in &mut self.single_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.multi_values_writers {
|
||||
field_writer.next_doc();
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.bytes_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
}
|
||||
@@ -95,7 +114,7 @@ impl FastFieldsWriter {
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
mapping: &HashMap<Field, HashMap<UnorderedTermId, usize>>,
|
||||
mapping: &HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>,
|
||||
) -> io::Result<()> {
|
||||
for field_writer in &self.single_value_writers {
|
||||
field_writer.serialize(serializer)?;
|
||||
@@ -104,6 +123,9 @@ impl FastFieldsWriter {
|
||||
let field = field_writer.field();
|
||||
field_writer.serialize(serializer, mapping.get(&field))?;
|
||||
}
|
||||
for field_writer in &self.bytes_value_writers {
|
||||
field_writer.serialize(serializer)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
|
||||
#[inline(always)]
|
||||
pub fn id_to_fieldnorm(id: u8) -> u32 {
|
||||
FIELD_NORMS_TABLE[id as usize]
|
||||
}
|
||||
|
||||
|
||||
#[inline(always)]
|
||||
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
FIELD_NORMS_TABLE
|
||||
@@ -12,45 +10,34 @@ pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
.unwrap_or_else(|idx| idx - 1) as u8
|
||||
}
|
||||
|
||||
|
||||
pub const FIELD_NORMS_TABLE: [u32; 256] = [
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54,
|
||||
56, 60, 64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144,
|
||||
152, 168, 184, 200, 216, 232, 248, 264, 280, 312, 344, 376, 408, 440, 472, 504,
|
||||
536, 600, 664, 728, 792, 856, 920, 984,
|
||||
1048, 1176, 1304, 1432, 1560, 1688, 1816, 1944,
|
||||
2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120,
|
||||
4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240,
|
||||
10264, 11288, 12312, 13336, 14360, 15384,
|
||||
16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744,
|
||||
32792, 36888, 40984, 45080, 49176, 53272, 57368, 61464,
|
||||
65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480,
|
||||
163864, 180248, 196632, 213016, 229400, 245784, 262168,
|
||||
294936, 327704, 360472, 393240, 426008, 458776,
|
||||
491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528,
|
||||
983064, 1048600, 1179672, 1310744, 1441816, 1572888, 1703960, 1835032,
|
||||
1966104, 2097176, 2359320, 2621464, 2883608, 3145752, 3407896, 3670040, 3932184,
|
||||
4194328, 4718616, 5242904, 5767192, 6291480, 6815768, 7340056, 7864344, 8388632, 9437208,
|
||||
10485784, 11534360, 12582936, 13631512, 14680088, 15728664, 16777240, 18874392, 20971544,
|
||||
23068696, 25165848, 27263000, 29360152, 31457304, 33554456, 37748760, 41943064,
|
||||
46137368, 50331672, 54525976, 58720280, 62914584, 67108888, 75497496, 83886104,
|
||||
92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968, 167772184,
|
||||
184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912, 335544344,
|
||||
369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800, 671088664,
|
||||
738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576, 1342177304,
|
||||
1476395032, 1610612760, 1744830488, 1879048216, 2013265944
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
|
||||
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54, 56, 60,
|
||||
64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144, 152, 168, 184, 200, 216, 232,
|
||||
248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984, 1048,
|
||||
1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120,
|
||||
4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336, 14360, 15384,
|
||||
16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984, 45080, 49176,
|
||||
53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480,
|
||||
163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472, 393240, 426008,
|
||||
458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064, 1048600,
|
||||
1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320, 2621464,
|
||||
2883608, 3145752, 3407896, 3670040, 3932184, 4194328, 4718616, 5242904, 5767192, 6291480,
|
||||
6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512, 14680088,
|
||||
15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152, 31457304,
|
||||
33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584, 67108888,
|
||||
75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968,
|
||||
167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912,
|
||||
335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800,
|
||||
671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576,
|
||||
1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944,
|
||||
];
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::{fieldnorm_to_id, id_to_fieldnorm, FIELD_NORMS_TABLE};
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_decode_code() {
|
||||
assert_eq!(fieldnorm_to_id(0), 0);
|
||||
@@ -103,4 +90,4 @@ mod tests {
|
||||
assert_eq!(FIELD_NORMS_TABLE[i], decode_fieldnorm_byte(i as u8));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,13 +17,12 @@
|
||||
//!
|
||||
//! This trick is used by the [BM25 similarity]().
|
||||
mod code;
|
||||
mod reader;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
mod reader;
|
||||
|
||||
pub use self::reader::FieldNormReader;
|
||||
pub use self::writer::FieldNormsWriter;
|
||||
pub use self::serializer::FieldNormsSerializer;
|
||||
pub use self::writer::FieldNormsWriter;
|
||||
|
||||
use self::code::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use super::{id_to_fieldnorm, fieldnorm_to_id};
|
||||
use super::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
|
||||
|
||||
/// Reads the fieldnorm associated to a document.
|
||||
/// The fieldnorm represents the length associated to
|
||||
/// a given Field of a given document.
|
||||
@@ -21,16 +20,13 @@ use DocId;
|
||||
/// precompute computationally expensive functions of the fieldnorm
|
||||
/// in a very short array.
|
||||
pub struct FieldNormReader {
|
||||
data: ReadOnlySource
|
||||
data: ReadOnlySource,
|
||||
}
|
||||
|
||||
impl FieldNormReader {
|
||||
|
||||
/// Opens a field norm reader given its data source.
|
||||
pub fn open(data: ReadOnlySource) -> Self {
|
||||
FieldNormReader {
|
||||
data
|
||||
}
|
||||
FieldNormReader { data }
|
||||
}
|
||||
|
||||
/// Returns the `fieldnorm` associated to a doc id.
|
||||
@@ -71,12 +67,13 @@ impl FieldNormReader {
|
||||
#[cfg(test)]
|
||||
impl From<Vec<u32>> for FieldNormReader {
|
||||
fn from(field_norms: Vec<u32>) -> FieldNormReader {
|
||||
let field_norms_id = field_norms.into_iter()
|
||||
let field_norms_id = field_norms
|
||||
.into_iter()
|
||||
.map(FieldNormReader::fieldnorm_to_id)
|
||||
.collect::<Vec<u8>>();
|
||||
let field_norms_data = ReadOnlySource::from(field_norms_id);
|
||||
FieldNormReader {
|
||||
data: field_norms_data
|
||||
data: field_norms_data,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,26 +1,24 @@
|
||||
use directory::WritePtr;
|
||||
use std::io;
|
||||
use common::CompositeWrite;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
|
||||
/// The fieldnorms serializer is in charge of
|
||||
/// the serialization of field norms for all fields.
|
||||
pub struct FieldNormsSerializer {
|
||||
composite_write: CompositeWrite,
|
||||
}
|
||||
|
||||
impl FieldNormsSerializer {
|
||||
|
||||
/// Constructor
|
||||
pub fn from_write(write: WritePtr) -> io::Result<FieldNormsSerializer> {
|
||||
// just making room for the pointer to header.
|
||||
let composite_write = CompositeWrite::wrap(write);
|
||||
Ok(FieldNormsSerializer {
|
||||
composite_write
|
||||
})
|
||||
Ok(FieldNormsSerializer { composite_write })
|
||||
}
|
||||
|
||||
|
||||
/// Serialize the given field
|
||||
pub fn serialize_field(&mut self, field: Field, fieldnorms_data: &[u8]) -> io::Result<()> {
|
||||
let write = self.composite_write.for_field(field);
|
||||
write.write_all(fieldnorms_data)?;
|
||||
@@ -28,10 +26,9 @@ impl FieldNormsSerializer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clean up / flush / close
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.composite_write.close()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -1,30 +1,36 @@
|
||||
use DocId;
|
||||
|
||||
use schema::Field;
|
||||
use super::FieldNormsSerializer;
|
||||
use std::io;
|
||||
use schema::Schema;
|
||||
use super::fieldnorm_to_id;
|
||||
use super::FieldNormsSerializer;
|
||||
use schema::Field;
|
||||
use schema::Schema;
|
||||
use std::io;
|
||||
|
||||
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
|
||||
/// of each document for each field with field norms.
|
||||
///
|
||||
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
|
||||
/// byte per document per field.
|
||||
pub struct FieldNormsWriter {
|
||||
fields: Vec<Field>,
|
||||
fieldnorms_buffer: Vec<Vec<u8>>
|
||||
fieldnorms_buffer: Vec<Vec<u8>>,
|
||||
}
|
||||
|
||||
impl FieldNormsWriter {
|
||||
|
||||
pub fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
|
||||
/// Returns the fields that should have field norms computed
|
||||
/// according to the given schema.
|
||||
pub(crate) fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
|
||||
schema
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| {
|
||||
field_entry.is_indexed()
|
||||
})
|
||||
.filter(|&(_, field_entry)| field_entry.is_indexed())
|
||||
.map(|(field, _)| Field(field as u32))
|
||||
.collect::<Vec<Field>>()
|
||||
}
|
||||
|
||||
/// Initialize with state for tracking the field norm fields
|
||||
/// specified in the schema.
|
||||
pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
|
||||
let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
|
||||
let max_field = fields
|
||||
@@ -35,26 +41,40 @@ impl FieldNormsWriter {
|
||||
.unwrap_or(0);
|
||||
FieldNormsWriter {
|
||||
fields,
|
||||
fieldnorms_buffer: (0..max_field)
|
||||
.map(|_| Vec::new())
|
||||
.collect::<Vec<_>>()
|
||||
fieldnorms_buffer: (0..max_field).map(|_| Vec::new()).collect::<Vec<_>>(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Ensure that all documents in 0..max_doc have a byte associated with them
|
||||
/// in each of the fieldnorm vectors.
|
||||
///
|
||||
/// Will extend with 0-bytes for documents that have not been seen.
|
||||
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
|
||||
for &field in self.fields.iter() {
|
||||
self.fieldnorms_buffer[field.0 as usize].resize(max_doc as usize, 0u8);
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the fieldnorm byte for the given document for the given field.
|
||||
///
|
||||
/// Will internally convert the u32 `fieldnorm` value to the appropriate byte
|
||||
/// to approximate the field norm in less space.
|
||||
///
|
||||
/// * doc - the document id
|
||||
/// * field - the field being set
|
||||
/// * fieldnorm - the number of terms present in document `doc` in field `field`
|
||||
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
|
||||
let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.0 as usize];
|
||||
assert!(fieldnorm_buffer.len() <= doc as usize, "Cannot register a given fieldnorm twice");
|
||||
assert!(
|
||||
fieldnorm_buffer.len() <= doc as usize,
|
||||
"Cannot register a given fieldnorm twice"
|
||||
);
|
||||
// we fill intermediary `DocId` as having a fieldnorm of 0.
|
||||
fieldnorm_buffer.resize(doc as usize + 1, 0u8);
|
||||
fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
|
||||
}
|
||||
|
||||
/// Serialize the seen fieldnorm values to the serializer for all fields.
|
||||
pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
|
||||
for &field in self.fields.iter() {
|
||||
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.0 as usize][..];
|
||||
@@ -62,4 +82,4 @@ impl FieldNormsWriter {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use std::collections::HashSet;
|
||||
use rand::thread_rng;
|
||||
use std::collections::HashSet;
|
||||
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
use schema::*;
|
||||
use Index;
|
||||
use Searcher;
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
|
||||
fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
assert!(searcher.segment_readers().len() < 20);
|
||||
@@ -13,7 +13,7 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_indexing() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::operation::DeleteOperation;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
// The DeleteQueue is similar in conceptually to a multiple
|
||||
// consumer single producer broadcast channel.
|
||||
@@ -52,7 +52,8 @@ impl DeleteQueue {
|
||||
//
|
||||
// Past delete operations are not accessible.
|
||||
pub fn cursor(&self) -> DeleteCursor {
|
||||
let last_block = self.inner
|
||||
let last_block = self
|
||||
.inner
|
||||
.read()
|
||||
.expect("Read lock poisoned when opening delete queue cursor")
|
||||
.last_block
|
||||
@@ -92,7 +93,8 @@ impl DeleteQueue {
|
||||
// be some unflushed operations.
|
||||
//
|
||||
fn flush(&self) -> Option<Arc<Block>> {
|
||||
let mut self_wlock = self.inner
|
||||
let mut self_wlock = self
|
||||
.inner
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on delete queue writer");
|
||||
|
||||
@@ -132,7 +134,8 @@ impl From<DeleteQueue> for NextBlock {
|
||||
impl NextBlock {
|
||||
fn next_block(&self) -> Option<Arc<Block>> {
|
||||
{
|
||||
let next_read_lock = self.0
|
||||
let next_read_lock = self
|
||||
.0
|
||||
.read()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
|
||||
@@ -141,7 +144,8 @@ impl NextBlock {
|
||||
}
|
||||
let next_block;
|
||||
{
|
||||
let mut next_write_lock = self.0
|
||||
let mut next_write_lock = self
|
||||
.0
|
||||
.write()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
match *next_write_lock {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use Directory;
|
||||
use directory::error::OpenWriteError;
|
||||
use core::LOCKFILE_FILEPATH;
|
||||
use directory::error::OpenWriteError;
|
||||
use Directory;
|
||||
|
||||
/// The directory lock is a mechanism used to
|
||||
/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
use super::operation::AddOperation;
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use bit_set::BitSet;
|
||||
use chan;
|
||||
use core::Index;
|
||||
@@ -6,39 +9,35 @@ use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use indexer::stamper::Stamper;
|
||||
use datastruct::stacker::Heap;
|
||||
use directory::FileProtection;
|
||||
use docset::DocSet;
|
||||
use error::{Error, ErrorKind, Result, ResultExt};
|
||||
use fastfield::write_delete_bitset;
|
||||
use futures::sync::oneshot::Receiver;
|
||||
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||
use futures::Canceled;
|
||||
use datastruct::stacker::hashmap::split_memory;
|
||||
use futures::Future;
|
||||
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
||||
use indexer::MergePolicy;
|
||||
use indexer::operation::DeleteOperation;
|
||||
use indexer::stamper::Stamper;
|
||||
use indexer::DirectoryLock;
|
||||
use indexer::MergePolicy;
|
||||
use indexer::SegmentEntry;
|
||||
use indexer::SegmentWriter;
|
||||
use docset::DocSet;
|
||||
use schema::IndexRecordOption;
|
||||
use postings::compute_table_size;
|
||||
use schema::Document;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use std::mem;
|
||||
use std::mem::swap;
|
||||
use std::thread::JoinHandle;
|
||||
use indexer::DirectoryLock;
|
||||
use super::operation::AddOperation;
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
// in the heap goes below MARGIN_IN_BYTES.
|
||||
pub const MARGIN_IN_BYTES: u32 = 1_000_000u32;
|
||||
pub const MARGIN_IN_BYTES: usize = 1_000_000;
|
||||
|
||||
// We impose the memory per thread to be at least 3 MB.
|
||||
pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32;
|
||||
pub const HEAP_SIZE_MIN: usize = ((MARGIN_IN_BYTES as u32) * 3u32) as usize;
|
||||
pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
|
||||
|
||||
// Add document will block if the number of docs waiting in the queue to be indexed
|
||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||
@@ -47,6 +46,24 @@ const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||
type DocumentSender = chan::Sender<AddOperation>;
|
||||
type DocumentReceiver = chan::Receiver<AddOperation>;
|
||||
|
||||
/// Split the thread memory budget into
|
||||
/// - the heap size
|
||||
/// - the hash table "table" itself.
|
||||
///
|
||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
|
||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||
(1..)
|
||||
.into_iter()
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.last()
|
||||
.expect(&format!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
))
|
||||
.min(19) // we cap it at 512K
|
||||
}
|
||||
|
||||
/// `IndexWriter` is the user entry-point to add document to an index.
|
||||
///
|
||||
/// It manages a small number of indexing thread, as well as a shared
|
||||
@@ -81,10 +98,6 @@ pub struct IndexWriter {
|
||||
committed_opstamp: u64,
|
||||
}
|
||||
|
||||
// IndexWriter cannot be sent to another thread.
|
||||
impl !Send for IndexWriter {}
|
||||
impl !Sync for IndexWriter {}
|
||||
|
||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||
///
|
||||
/// The lockfile should be deleted on drop, but it is possible
|
||||
@@ -105,11 +118,16 @@ pub fn open_index_writer(
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
directory_lock: DirectoryLock,
|
||||
) -> Result<IndexWriter> {
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_LIMIT as usize {
|
||||
panic!(format!(
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
let err_msg = format!(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_LIMIT
|
||||
));
|
||||
HEAP_SIZE_MIN
|
||||
);
|
||||
bail!(ErrorKind::InvalidArgument(err_msg));
|
||||
}
|
||||
if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX {
|
||||
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
||||
bail!(ErrorKind::InvalidArgument(err_msg));
|
||||
}
|
||||
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
@@ -200,7 +218,6 @@ pub fn advance_deletes(
|
||||
target_opstamp: u64,
|
||||
) -> Result<Option<FileProtection>> {
|
||||
let mut file_protect: Option<FileProtection> = None;
|
||||
|
||||
{
|
||||
if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
|
||||
// We are already up-to-date here.
|
||||
@@ -241,48 +258,33 @@ pub fn advance_deletes(
|
||||
}
|
||||
}
|
||||
segment_entry.set_meta(segment.meta().clone());
|
||||
|
||||
Ok(file_protect)
|
||||
}
|
||||
|
||||
fn index_documents(
|
||||
heap: &mut Heap,
|
||||
table_size: usize,
|
||||
memory_budget: usize,
|
||||
segment: &Segment,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> Result<bool> {
|
||||
heap.clear();
|
||||
let schema = segment.schema();
|
||||
let segment_id = segment.id();
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(heap, table_size, segment.clone(), &schema)?;
|
||||
let table_size = initial_table_size(memory_budget);
|
||||
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
|
||||
for doc in document_iterator {
|
||||
segment_writer.add_document(doc, &schema)?;
|
||||
// There is two possible conditions to close the segment.
|
||||
// One is the memory arena dedicated to the segment is
|
||||
// getting full.
|
||||
if segment_writer.is_buffer_full() {
|
||||
|
||||
let mem_usage = segment_writer.mem_usage();
|
||||
|
||||
if mem_usage >= memory_budget - MARGIN_IN_BYTES {
|
||||
info!(
|
||||
"Buffer limit reached, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
// The second is the term dictionary hash table
|
||||
// is reaching saturation.
|
||||
//
|
||||
// Tantivy does not resize its hashtable. When it reaches
|
||||
// capacity, we just stop indexing new document.
|
||||
if segment_writer.is_term_saturated() {
|
||||
info!(
|
||||
"Term dic saturated, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !segment_updater.is_alive() {
|
||||
@@ -340,7 +342,8 @@ impl IndexWriter {
|
||||
}
|
||||
drop(self.workers_join_handle);
|
||||
|
||||
let result = self.segment_updater
|
||||
let result = self
|
||||
.segment_updater
|
||||
.wait_merging_thread()
|
||||
.chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into()));
|
||||
|
||||
@@ -374,14 +377,12 @@ impl IndexWriter {
|
||||
fn add_indexing_worker(&mut self) -> Result<()> {
|
||||
let document_receiver_clone = self.document_receiver.clone();
|
||||
let mut segment_updater = self.segment_updater.clone();
|
||||
let (heap_size, table_size) = split_memory(self.heap_size_in_bytes_per_thread);
|
||||
info!("heap size {}, table_size {}", heap_size, table_size);
|
||||
let mut heap = Heap::with_capacity(heap_size);
|
||||
|
||||
let generation = self.generation;
|
||||
|
||||
let mut delete_cursor = self.delete_queue.cursor();
|
||||
|
||||
let mem_budget = self.heap_size_in_bytes_per_thread;
|
||||
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
||||
.name(format!(
|
||||
"indexing thread {} for gen {}",
|
||||
@@ -409,8 +410,7 @@ impl IndexWriter {
|
||||
}
|
||||
let segment = segment_updater.new_segment();
|
||||
index_documents(
|
||||
&mut heap,
|
||||
table_size,
|
||||
mem_budget,
|
||||
&segment,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
@@ -448,10 +448,7 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
/// Merges a given list of segments
|
||||
pub fn merge(
|
||||
&mut self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Receiver<SegmentMeta> {
|
||||
self.segment_updater.start_merge(segment_ids)
|
||||
}
|
||||
|
||||
@@ -491,7 +488,8 @@ impl IndexWriter {
|
||||
let document_receiver = self.document_receiver.clone();
|
||||
|
||||
// take the directory lock to create a new index_writer.
|
||||
let directory_lock = self._directory_lock
|
||||
let directory_lock = self
|
||||
._directory_lock
|
||||
.take()
|
||||
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
|
||||
|
||||
@@ -647,12 +645,13 @@ impl IndexWriter {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::initial_table_size;
|
||||
use env_logger;
|
||||
use error::*;
|
||||
use indexer::NoMergePolicy;
|
||||
use schema::{self, Document};
|
||||
use Index;
|
||||
use Term;
|
||||
use error::*;
|
||||
use env_logger;
|
||||
|
||||
#[test]
|
||||
fn test_lockfile_stops_duplicates() {
|
||||
@@ -675,7 +674,7 @@ mod tests {
|
||||
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
|
||||
level_log_size: 0.75 }"
|
||||
);
|
||||
let merge_policy = box NoMergePolicy::default();
|
||||
let merge_policy = Box::new(NoMergePolicy::default());
|
||||
index_writer.set_merge_policy(merge_policy);
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
@@ -709,7 +708,7 @@ mod tests {
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(3, 40_000_000).unwrap();
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
index_writer.rollback().unwrap();
|
||||
|
||||
@@ -742,7 +741,7 @@ mod tests {
|
||||
};
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
let mut index_writer = index.writer(12_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
let mut doc = Document::default();
|
||||
@@ -776,7 +775,7 @@ mod tests {
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
let mut index_writer = index.writer(12_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
@@ -811,7 +810,7 @@ mod tests {
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
@@ -841,4 +840,12 @@ mod tests {
|
||||
assert_eq!(num_docs_containing("b"), 100);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(initial_table_size(100_000), 12);
|
||||
assert_eq!(initial_table_size(1_000_000), 15);
|
||||
assert_eq!(initial_table_size(10_000_000), 18);
|
||||
assert_eq!(initial_table_size(1_000_000_000), 19);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -82,7 +82,7 @@ impl MergePolicy for LogMergePolicy {
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
box self.clone()
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,8 +99,8 @@ impl Default for LogMergePolicy {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use indexer::merge_policy::MergePolicy;
|
||||
use core::{SegmentId, SegmentMeta};
|
||||
use indexer::merge_policy::MergePolicy;
|
||||
|
||||
fn test_merge_policy() -> LogMergePolicy {
|
||||
let mut log_merge_policy = LogMergePolicy::default();
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use std::marker;
|
||||
use std::fmt::Debug;
|
||||
use std::marker;
|
||||
|
||||
/// Set of segment suggested for a merge.
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -37,7 +37,7 @@ impl MergePolicy for NoMergePolicy {
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
box NoMergePolicy
|
||||
Box::new(NoMergePolicy)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,7 +69,7 @@ pub mod tests {
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
box MergeWheneverPossible
|
||||
Box::new(MergeWheneverPossible)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,29 +1,29 @@
|
||||
pub mod index_writer;
|
||||
pub mod segment_serializer;
|
||||
pub mod merger;
|
||||
pub mod merge_policy;
|
||||
mod log_merge_policy;
|
||||
mod segment_register;
|
||||
mod segment_writer;
|
||||
mod segment_manager;
|
||||
pub mod delete_queue;
|
||||
pub mod segment_updater;
|
||||
mod directory_lock;
|
||||
mod segment_entry;
|
||||
mod doc_opstamp_mapping;
|
||||
pub mod index_writer;
|
||||
mod log_merge_policy;
|
||||
pub mod merge_policy;
|
||||
pub mod merger;
|
||||
pub mod operation;
|
||||
mod stamper;
|
||||
mod prepared_commit;
|
||||
mod segment_entry;
|
||||
mod segment_manager;
|
||||
mod segment_register;
|
||||
pub mod segment_serializer;
|
||||
pub mod segment_updater;
|
||||
mod segment_writer;
|
||||
mod stamper;
|
||||
|
||||
pub use self::prepared_commit::PreparedCommit;
|
||||
pub use self::segment_entry::{SegmentEntry, SegmentState};
|
||||
pub use self::segment_serializer::SegmentSerializer;
|
||||
pub use self::segment_writer::SegmentWriter;
|
||||
pub(crate) use self::directory_lock::DirectoryLock;
|
||||
pub use self::index_writer::IndexWriter;
|
||||
pub use self::log_merge_policy::LogMergePolicy;
|
||||
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||
pub use self::prepared_commit::PreparedCommit;
|
||||
pub use self::segment_entry::{SegmentEntry, SegmentState};
|
||||
pub use self::segment_manager::SegmentManager;
|
||||
pub(crate) use self::directory_lock::DirectoryLock;
|
||||
pub use self::segment_serializer::SegmentSerializer;
|
||||
pub use self::segment_writer::SegmentWriter;
|
||||
|
||||
/// Alias for the default merge policy, which is the `LogMergePolicy`.
|
||||
pub type DefaultMergePolicy = LogMergePolicy;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use Result;
|
||||
use super::IndexWriter;
|
||||
use Result;
|
||||
|
||||
/// A prepared commit
|
||||
pub struct PreparedCommit<'a> {
|
||||
@@ -13,7 +13,7 @@ impl<'a> PreparedCommit<'a> {
|
||||
PreparedCommit {
|
||||
index_writer,
|
||||
payload: None,
|
||||
opstamp
|
||||
opstamp,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use core::SegmentMeta;
|
||||
use bit_set::BitSet;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use super::segment_register::SegmentRegister;
|
||||
use std::sync::RwLock;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::{LOCKFILE_FILEPATH, META_FILEPATH};
|
||||
use core::SegmentId;
|
||||
use indexer::SegmentEntry;
|
||||
use std::path::PathBuf;
|
||||
use std::collections::hash_set::HashSet;
|
||||
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::SegmentEntry;
|
||||
use std::collections::hash_set::HashSet;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
|
||||
|
||||
#[derive(Default)]
|
||||
struct SegmentRegisters {
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use core::SegmentId;
|
||||
use std::collections::HashMap;
|
||||
use core::SegmentMeta;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::segment_entry::SegmentEntry;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use indexer::segment_entry::SegmentEntry;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
|
||||
/// The segment register keeps track
|
||||
/// of the list of segment, their size as well
|
||||
@@ -59,7 +59,8 @@ impl SegmentRegister {
|
||||
}
|
||||
|
||||
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
|
||||
let mut segment_ids: Vec<SegmentMeta> = self.segment_states
|
||||
let mut segment_ids: Vec<SegmentMeta> = self
|
||||
.segment_states
|
||||
.values()
|
||||
.map(|segment_entry| segment_entry.meta().clone())
|
||||
.collect();
|
||||
@@ -113,11 +114,11 @@ impl SegmentRegister {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use indexer::SegmentState;
|
||||
use super::*;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use indexer::delete_queue::*;
|
||||
use super::*;
|
||||
use indexer::SegmentState;
|
||||
|
||||
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
|
||||
segment_register
|
||||
|
||||
@@ -3,9 +3,9 @@ use Result;
|
||||
use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use fastfield::FastFieldSerializer;
|
||||
use store::StoreWriter;
|
||||
use fieldnorm::FieldNormsSerializer;
|
||||
use postings::InvertedIndexSerializer;
|
||||
use store::StoreWriter;
|
||||
|
||||
/// Segment serializer is in charge of laying out on disk
|
||||
/// the data accumulated and sorted by the `SegmentWriter`.
|
||||
@@ -47,7 +47,7 @@ impl SegmentSerializer {
|
||||
}
|
||||
|
||||
/// Accessor to the field norm serializer.
|
||||
pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
|
||||
pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
|
||||
&mut self.fieldnorms_serializer
|
||||
}
|
||||
|
||||
|
||||
@@ -1,40 +1,40 @@
|
||||
use super::segment_manager::{get_mergeable_segments, SegmentManager};
|
||||
use core::Index;
|
||||
use core::IndexMeta;
|
||||
use core::META_FILEPATH;
|
||||
use core::Segment;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SerializableSegment;
|
||||
use core::META_FILEPATH;
|
||||
use directory::Directory;
|
||||
use indexer::stamper::Stamper;
|
||||
use error::{Error, ErrorKind, Result};
|
||||
use futures_cpupool::CpuPool;
|
||||
use futures::Future;
|
||||
use futures::Canceled;
|
||||
use futures::oneshot;
|
||||
use directory::FileProtection;
|
||||
use indexer::{DefaultMergePolicy, MergePolicy};
|
||||
use error::{Error, ErrorKind, Result};
|
||||
use futures::oneshot;
|
||||
use futures::sync::oneshot::Receiver;
|
||||
use futures::Future;
|
||||
use futures_cpupool::CpuFuture;
|
||||
use futures_cpupool::CpuPool;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::index_writer::advance_deletes;
|
||||
use indexer::MergeCandidate;
|
||||
use indexer::merger::IndexMerger;
|
||||
use indexer::stamper::Stamper;
|
||||
use indexer::MergeCandidate;
|
||||
use indexer::SegmentEntry;
|
||||
use indexer::SegmentSerializer;
|
||||
use futures_cpupool::CpuFuture;
|
||||
use serde_json;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::{DefaultMergePolicy, MergePolicy};
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize};
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize};
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use super::segment_manager::{get_mergeable_segments, SegmentManager};
|
||||
|
||||
/// Save the index meta file.
|
||||
/// This operation is atomic :
|
||||
@@ -171,7 +171,7 @@ impl SegmentUpdater {
|
||||
pool: CpuPool::new(1),
|
||||
index,
|
||||
segment_manager,
|
||||
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
|
||||
merge_policy: RwLock::new(Box::new(DefaultMergePolicy::default())),
|
||||
merging_thread_id: AtomicUsize::default(),
|
||||
merging_threads: RwLock::new(HashMap::new()),
|
||||
generation: AtomicUsize::default(),
|
||||
@@ -283,10 +283,7 @@ impl SegmentUpdater {
|
||||
}).wait()
|
||||
}
|
||||
|
||||
pub fn start_merge(
|
||||
&self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Receiver<SegmentMeta> {
|
||||
self.0.segment_manager.start_merge(segment_ids);
|
||||
let segment_updater_clone = self.clone();
|
||||
|
||||
@@ -361,7 +358,9 @@ impl SegmentUpdater {
|
||||
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
|
||||
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
|
||||
for MergeCandidate(segment_metas) in merge_candidates {
|
||||
self.start_merge(&segment_metas);
|
||||
if let Err(e) = self.start_merge(&segment_metas).fuse().poll() {
|
||||
error!("The merge task failed quickly after starting: {:?}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -480,9 +479,9 @@ impl SegmentUpdater {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use Index;
|
||||
use schema::*;
|
||||
use indexer::merge_policy::tests::MergeWheneverPossible;
|
||||
use schema::*;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_delete_during_merge() {
|
||||
@@ -494,7 +493,7 @@ mod tests {
|
||||
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer.set_merge_policy(box MergeWheneverPossible);
|
||||
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
|
||||
|
||||
{
|
||||
for _ in 0..100 {
|
||||
|
||||
@@ -1,33 +1,30 @@
|
||||
use Result;
|
||||
use DocId;
|
||||
use std::io;
|
||||
use std::str;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use super::operation::AddOperation;
|
||||
use core::Segment;
|
||||
use core::SerializableSegment;
|
||||
use fastfield::FastFieldsWriter;
|
||||
use schema::FieldType;
|
||||
use fieldnorm::FieldNormsWriter;
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use datastruct::stacker::Heap;
|
||||
use indexer::index_writer::MARGIN_IN_BYTES;
|
||||
use super::operation::AddOperation;
|
||||
use postings::MultiFieldPostingsWriter;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use schema::Value;
|
||||
use std::io;
|
||||
use std::str;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::FacetTokenizer;
|
||||
use tokenizer::{TokenStream, Tokenizer};
|
||||
use schema::Value;
|
||||
use fieldnorm::FieldNormsWriter;
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
/// A `SegmentWriter` is in charge of creating segment index from a
|
||||
/// documents.
|
||||
///
|
||||
/// They creates the postings list in anonymous memory.
|
||||
/// The segment is layed on disk when the segment gets `finalized`.
|
||||
pub struct SegmentWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
pub struct SegmentWriter {
|
||||
max_doc: DocId,
|
||||
multifield_postings: MultiFieldPostingsWriter<'a>,
|
||||
multifield_postings: MultiFieldPostingsWriter,
|
||||
segment_serializer: SegmentSerializer,
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FieldNormsWriter,
|
||||
@@ -35,8 +32,7 @@ pub struct SegmentWriter<'a> {
|
||||
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
|
||||
}
|
||||
|
||||
|
||||
impl<'a> SegmentWriter<'a> {
|
||||
impl SegmentWriter {
|
||||
/// Creates a new `SegmentWriter`
|
||||
///
|
||||
/// The arguments are defined as follows
|
||||
@@ -47,13 +43,12 @@ impl<'a> SegmentWriter<'a> {
|
||||
/// - segment: The segment being written
|
||||
/// - schema
|
||||
pub fn for_segment(
|
||||
heap: &'a Heap,
|
||||
table_bits: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema,
|
||||
) -> Result<SegmentWriter<'a>> {
|
||||
) -> Result<SegmentWriter> {
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits);
|
||||
let tokenizers = schema
|
||||
.fields()
|
||||
.iter()
|
||||
@@ -69,7 +64,6 @@ impl<'a> SegmentWriter<'a> {
|
||||
})
|
||||
.collect();
|
||||
Ok(SegmentWriter {
|
||||
heap,
|
||||
max_doc: 0,
|
||||
multifield_postings,
|
||||
fieldnorms_writer: FieldNormsWriter::for_schema(schema),
|
||||
@@ -95,22 +89,8 @@ impl<'a> SegmentWriter<'a> {
|
||||
Ok(self.doc_opstamps)
|
||||
}
|
||||
|
||||
/// Returns true iff the segment writer's buffer has reached capacity.
|
||||
///
|
||||
/// The limit is defined as `the user defined heap size - an arbitrary margin of 10MB`
|
||||
/// The `Segment` is `finalize`d when the buffer gets full.
|
||||
///
|
||||
/// Because, we cannot cut through a document, the margin is there to ensure that we rarely
|
||||
/// exceeds the heap size.
|
||||
pub fn is_buffer_full(&self) -> bool {
|
||||
self.heap.num_free_bytes() <= MARGIN_IN_BYTES
|
||||
}
|
||||
|
||||
/// Return true if the term dictionary hashmap is reaching capacity.
|
||||
/// It is one of the condition that triggers a `SegmentWriter` to
|
||||
/// be finalized.
|
||||
pub(crate) fn is_term_saturated(&self) -> bool {
|
||||
self.multifield_postings.is_term_saturated()
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.multifield_postings.mem_usage()
|
||||
}
|
||||
|
||||
/// Indexes a new document
|
||||
@@ -139,8 +119,7 @@ impl<'a> SegmentWriter<'a> {
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
term.set_field(field);
|
||||
let mut term = Term::for_field(field); // we set the Term
|
||||
for facet_bytes in facets {
|
||||
let mut unordered_term_id_opt = None;
|
||||
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
|
||||
@@ -179,8 +158,7 @@ impl<'a> SegmentWriter<'a> {
|
||||
} else {
|
||||
0
|
||||
};
|
||||
self.fieldnorms_writer
|
||||
.record(doc_id, field, num_tokens);
|
||||
self.fieldnorms_writer.record(doc_id, field, num_tokens);
|
||||
}
|
||||
FieldType::U64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
@@ -204,6 +182,9 @@ impl<'a> SegmentWriter<'a> {
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
// Do nothing. Bytes only supports fast fields.
|
||||
}
|
||||
}
|
||||
}
|
||||
doc.filter_fields(|field| schema.get_field_entry(field).is_stored());
|
||||
@@ -248,7 +229,7 @@ fn write(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl<'a> SerializableSegment for SegmentWriter<'a> {
|
||||
impl SerializableSegment for SegmentWriter {
|
||||
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
|
||||
let max_doc = self.max_doc;
|
||||
write(
|
||||
|
||||
@@ -1,15 +1,66 @@
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
// AtomicU64 have not landed in stable.
|
||||
// For the moment let's just use AtomicUsize on
|
||||
// x86/64 bit platform, and a mutex on other platform.
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicU64>);
|
||||
#[cfg(target = "x86_64")]
|
||||
mod archicture_impl {
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
|
||||
}
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
self.0.fetch_add(1u64, Ordering::SeqCst)
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicU64>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(target = "x86_64"))]
|
||||
mod archicture_impl {
|
||||
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<Mutex<u64>>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(Mutex::new(first_opstamp)))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
let mut guard = self.0.lock().expect("Failed to lock the stamper");
|
||||
let previous_val = *guard;
|
||||
*guard = previous_val + 1;
|
||||
previous_val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub use self::archicture_impl::Stamper;
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::Stamper;
|
||||
|
||||
#[test]
|
||||
fn test_stamper() {
|
||||
let stamper = Stamper::new(7u64);
|
||||
assert_eq!(stamper.stamp(), 7u64);
|
||||
assert_eq!(stamper.stamp(), 8u64);
|
||||
|
||||
let stamper_clone = stamper.clone();
|
||||
assert_eq!(stamper.stamp(), 9u64);
|
||||
|
||||
assert_eq!(stamper.stamp(), 10u64);
|
||||
assert_eq!(stamper_clone.stamp(), 11u64);
|
||||
}
|
||||
}
|
||||
|
||||
99
src/lib.rs
99
src/lib.rs
@@ -1,14 +1,7 @@
|
||||
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(module_inception))]
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(inline_always))]
|
||||
#![feature(box_syntax)]
|
||||
#![feature(optin_builtin_traits)]
|
||||
#![feature(conservative_impl_trait)]
|
||||
#![feature(collections_range)]
|
||||
#![feature(integer_atomics)]
|
||||
#![feature(drain_filter)]
|
||||
#![cfg_attr(test, feature(test))]
|
||||
#![cfg_attr(test, feature(iterator_step_by))]
|
||||
#![cfg_attr(all(feature = "unstable", test), feature(test))]
|
||||
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
|
||||
#![allow(unknown_lints)]
|
||||
#![allow(new_without_default)]
|
||||
@@ -62,7 +55,7 @@
|
||||
//!
|
||||
//! // Indexing documents
|
||||
//!
|
||||
//! let index = Index::create(index_path, schema.clone())?;
|
||||
//! let index = Index::create_in_dir(index_path, schema.clone())?;
|
||||
//!
|
||||
//! // Here we use a buffer of 100MB that will be split
|
||||
//! // between indexing threads.
|
||||
@@ -123,35 +116,40 @@ extern crate lazy_static;
|
||||
#[macro_use]
|
||||
extern crate serde_derive;
|
||||
|
||||
#[cfg_attr(test, macro_use)]
|
||||
extern crate serde_json;
|
||||
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
|
||||
#[macro_use]
|
||||
extern crate error_chain;
|
||||
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
extern crate atomicwrites;
|
||||
extern crate base64;
|
||||
extern crate bit_set;
|
||||
extern crate bitpacking;
|
||||
extern crate byteorder;
|
||||
extern crate chan;
|
||||
extern crate combine;
|
||||
extern crate crossbeam;
|
||||
extern crate fnv;
|
||||
extern crate fst;
|
||||
extern crate fst_regex;
|
||||
extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
extern crate itertools;
|
||||
extern crate lz4;
|
||||
extern crate levenshtein_automata;
|
||||
extern crate num_cpus;
|
||||
extern crate owning_ref;
|
||||
extern crate regex;
|
||||
extern crate rust_stemmers;
|
||||
extern crate serde;
|
||||
extern crate serde_json;
|
||||
extern crate stable_deref_trait;
|
||||
extern crate tempdir;
|
||||
extern crate tempfile;
|
||||
extern crate uuid;
|
||||
extern crate bitpacking;
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
@@ -165,7 +163,8 @@ extern crate winapi;
|
||||
|
||||
#[cfg(test)]
|
||||
extern crate rand;
|
||||
#[cfg(test)]
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
extern crate test;
|
||||
|
||||
extern crate tinysegmenter;
|
||||
@@ -184,36 +183,35 @@ pub use error::{Error, ErrorKind, ResultExt};
|
||||
/// Tantivy result.
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
mod core;
|
||||
mod compression;
|
||||
mod indexer;
|
||||
mod common;
|
||||
mod compression;
|
||||
mod core;
|
||||
mod indexer;
|
||||
|
||||
#[allow(unused_doc_comment)]
|
||||
#[allow(unused_doc_comments)]
|
||||
mod error;
|
||||
pub mod tokenizer;
|
||||
mod datastruct;
|
||||
|
||||
pub mod termdict;
|
||||
pub mod store;
|
||||
pub mod query;
|
||||
pub mod directory;
|
||||
pub mod collector;
|
||||
pub mod postings;
|
||||
pub mod schema;
|
||||
pub mod directory;
|
||||
pub mod fastfield;
|
||||
pub mod fieldnorm;
|
||||
pub mod postings;
|
||||
pub mod query;
|
||||
pub mod schema;
|
||||
pub mod store;
|
||||
pub mod termdict;
|
||||
|
||||
mod docset;
|
||||
pub use self::docset::{DocSet, SkipResult};
|
||||
|
||||
pub use directory::Directory;
|
||||
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use indexer::IndexWriter;
|
||||
pub use schema::{Document, Term};
|
||||
pub use core::{InvertedIndexReader, SegmentReader};
|
||||
pub use postings::Postings;
|
||||
pub use core::SegmentComponent;
|
||||
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use core::{InvertedIndexReader, SegmentReader};
|
||||
pub use directory::Directory;
|
||||
pub use indexer::IndexWriter;
|
||||
pub use postings::Postings;
|
||||
pub use schema::{Document, Term};
|
||||
|
||||
pub use common::{i64_to_u64, u64_to_i64};
|
||||
|
||||
@@ -229,10 +227,10 @@ pub fn version() -> &'static str {
|
||||
|
||||
/// Defines tantivy's merging strategy
|
||||
pub mod merge_policy {
|
||||
pub use indexer::MergePolicy;
|
||||
pub use indexer::LogMergePolicy;
|
||||
pub use indexer::NoMergePolicy;
|
||||
pub use indexer::DefaultMergePolicy;
|
||||
pub use indexer::LogMergePolicy;
|
||||
pub use indexer::MergePolicy;
|
||||
pub use indexer::NoMergePolicy;
|
||||
}
|
||||
|
||||
/// A `u32` identifying a document within a segment.
|
||||
@@ -281,33 +279,29 @@ pub struct DocAddress(pub SegmentLocalId, pub DocId);
|
||||
mod tests {
|
||||
|
||||
use collector::tests::TestCollector;
|
||||
use Index;
|
||||
use core::SegmentReader;
|
||||
use query::BooleanQuery;
|
||||
use schema::*;
|
||||
use docset::DocSet;
|
||||
use query::BooleanQuery;
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use schema::*;
|
||||
use Index;
|
||||
use IndexWriter;
|
||||
use Postings;
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
|
||||
pub fn assert_nearly_equals(expected: f32, val: f32) {
|
||||
assert!(nearly_equals(val, expected), "Got {}, expected {}.", val, expected);
|
||||
assert!(
|
||||
nearly_equals(val, expected),
|
||||
"Got {}, expected {}.",
|
||||
val,
|
||||
expected
|
||||
);
|
||||
}
|
||||
|
||||
pub fn nearly_equals(a: f32, b: f32) -> bool {
|
||||
(a - b).abs() < 0.0005 * (a + b).abs()
|
||||
}
|
||||
|
||||
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
(0..u32::max_value())
|
||||
.filter(|_| rng.next_f32() < ratio)
|
||||
.take(n)
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, 4];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
@@ -317,10 +311,6 @@ mod tests {
|
||||
.collect::<Vec<u32>>()
|
||||
}
|
||||
|
||||
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
generate_array_with_seed(n, ratio, 4)
|
||||
}
|
||||
|
||||
pub fn sample_with_seed(n: u32, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
@@ -332,7 +322,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_indexing() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
@@ -458,7 +448,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn advance_undeleted(docset: &mut DocSet, reader: &SegmentReader) -> bool {
|
||||
while docset.advance() {
|
||||
if !reader.is_deleted(docset.doc()) {
|
||||
|
||||
@@ -6,23 +6,24 @@ Postings module (also called inverted index)
|
||||
///
|
||||
/// Postings, also called inverted lists, is the key datastructure
|
||||
/// to full-text search.
|
||||
|
||||
mod postings;
|
||||
mod recorder;
|
||||
mod serializer;
|
||||
mod postings_writer;
|
||||
mod term_info;
|
||||
mod recorder;
|
||||
mod segment_postings;
|
||||
mod serializer;
|
||||
mod stacker;
|
||||
mod term_info;
|
||||
|
||||
use self::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
|
||||
pub use self::term_info::TermInfo;
|
||||
pub use self::postings::Postings;
|
||||
pub use self::term_info::TermInfo;
|
||||
|
||||
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
|
||||
|
||||
pub(crate) use self::stacker::compute_table_size;
|
||||
|
||||
pub use common::HasLen;
|
||||
|
||||
pub(crate) type UnorderedTermId = u64;
|
||||
@@ -38,25 +39,21 @@ pub(crate) enum FreqReadingOption {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use core::Index;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentReader;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fieldnorm::FieldNormReader;
|
||||
use indexer::operation::AddOperation;
|
||||
use indexer::SegmentWriter;
|
||||
use query::Scorer;
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use schema::Field;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
|
||||
use std::iter;
|
||||
use DocId;
|
||||
use Score;
|
||||
use query::Intersection;
|
||||
use query::Scorer;
|
||||
use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
|
||||
use core::SegmentComponent;
|
||||
use indexer::SegmentWriter;
|
||||
use core::SegmentReader;
|
||||
use core::Index;
|
||||
use schema::IndexRecordOption;
|
||||
use std::iter;
|
||||
use datastruct::stacker::Heap;
|
||||
use schema::Field;
|
||||
use test::{self, Bencher};
|
||||
use indexer::operation::AddOperation;
|
||||
use tests;
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use fieldnorm::FieldNormReader;
|
||||
|
||||
#[test]
|
||||
pub fn test_position_write() {
|
||||
@@ -127,7 +124,6 @@ pub mod tests {
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
}
|
||||
{
|
||||
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
@@ -166,10 +162,9 @@ pub mod tests {
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let segment = index.new_segment();
|
||||
|
||||
let heap = Heap::with_capacity(10_000_000);
|
||||
{
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap();
|
||||
SegmentWriter::for_segment(18, segment.clone(), &schema).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
// checking that position works if the field has two values
|
||||
@@ -206,13 +201,14 @@ pub mod tests {
|
||||
{
|
||||
let segment_reader = SegmentReader::open(&segment).unwrap();
|
||||
{
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field) ;
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
|
||||
for i in 2..1000 {
|
||||
assert_eq!(
|
||||
fieldnorm_reader.fieldnorm_id(i),
|
||||
FieldNormReader::fieldnorm_to_id(i + 1) );
|
||||
FieldNormReader::fieldnorm_to_id(i + 1)
|
||||
);
|
||||
}
|
||||
}
|
||||
{
|
||||
@@ -449,7 +445,7 @@ pub mod tests {
|
||||
// delete everything else
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer.delete_term(term_1);
|
||||
index_writer.delete_term(term_1);
|
||||
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
@@ -479,23 +475,23 @@ pub mod tests {
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref TERM_A: Term = {
|
||||
pub static ref TERM_A: Term = {
|
||||
let field = Field(0);
|
||||
Term::from_field_text(field, "a")
|
||||
};
|
||||
static ref TERM_B: Term = {
|
||||
pub static ref TERM_B: Term = {
|
||||
let field = Field(0);
|
||||
Term::from_field_text(field, "b")
|
||||
};
|
||||
static ref TERM_C: Term = {
|
||||
pub static ref TERM_C: Term = {
|
||||
let field = Field(0);
|
||||
Term::from_field_text(field, "c")
|
||||
};
|
||||
static ref TERM_D: Term = {
|
||||
pub static ref TERM_D: Term = {
|
||||
let field = Field(0);
|
||||
Term::from_field_text(field, "d")
|
||||
};
|
||||
static ref INDEX: Index = {
|
||||
pub static ref INDEX: Index = {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", STRING);
|
||||
let schema = schema_builder.build();
|
||||
@@ -507,7 +503,7 @@ pub mod tests {
|
||||
let posting_list_size = 1_000_000;
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
for _ in 0 .. posting_list_size {
|
||||
for _ in 0..posting_list_size {
|
||||
let mut doc = Document::default();
|
||||
if rng.gen_weighted_bool(15) {
|
||||
doc.add_text(text_field, "a");
|
||||
@@ -530,6 +526,85 @@ pub mod tests {
|
||||
};
|
||||
}
|
||||
|
||||
/// Wraps a given docset, and forward alls call but the
|
||||
/// `.skip_next(...)`. This is useful to test that a specialized
|
||||
/// implementation of `.skip_next(...)` is consistent
|
||||
/// with the default implementation.
|
||||
pub(crate) struct UnoptimizedDocSet<TDocSet: DocSet>(TDocSet);
|
||||
|
||||
impl<TDocSet: DocSet> UnoptimizedDocSet<TDocSet> {
|
||||
pub fn wrap(docset: TDocSet) -> UnoptimizedDocSet<TDocSet> {
|
||||
UnoptimizedDocSet(docset)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for UnoptimizedDocSet<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.0.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.0.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.0.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> Scorer for UnoptimizedDocSet<TScorer> {
|
||||
fn score(&mut self) -> Score {
|
||||
self.0.score()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn test_skip_against_unoptimized<F: Fn() -> Box<DocSet>>(
|
||||
postings_factory: F,
|
||||
targets: Vec<u32>,
|
||||
) {
|
||||
for target in targets {
|
||||
let mut postings_opt = postings_factory();
|
||||
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
|
||||
let skip_result_opt = postings_opt.skip_next(target);
|
||||
let skip_result_unopt = postings_unopt.skip_next(target);
|
||||
assert_eq!(
|
||||
skip_result_unopt, skip_result_opt,
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
match skip_result_opt {
|
||||
SkipResult::Reached => assert_eq!(postings_opt.doc(), target),
|
||||
SkipResult::OverStep => assert!(postings_opt.doc() > target),
|
||||
SkipResult::End => {
|
||||
return;
|
||||
}
|
||||
}
|
||||
while postings_opt.advance() {
|
||||
assert!(postings_unopt.advance());
|
||||
assert_eq!(
|
||||
postings_opt.doc(),
|
||||
postings_unopt.doc(),
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
}
|
||||
assert!(!postings_unopt.advance());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::tests::*;
|
||||
use docset::SkipResult;
|
||||
use query::Intersection;
|
||||
use schema::IndexRecordOption;
|
||||
use test::{self, Bencher};
|
||||
use tests;
|
||||
use DocSet;
|
||||
|
||||
#[bench]
|
||||
fn bench_segment_postings(b: &mut Bencher) {
|
||||
let searcher = INDEX.searcher();
|
||||
@@ -646,71 +721,4 @@ pub mod tests {
|
||||
s
|
||||
});
|
||||
}
|
||||
|
||||
/// Wraps a given docset, and forward alls call but the
|
||||
/// `.skip_next(...)`. This is useful to test that a specialized
|
||||
/// implementation of `.skip_next(...)` is consistent
|
||||
/// with the default implementation.
|
||||
pub(crate) struct UnoptimizedDocSet<TDocSet: DocSet>(TDocSet);
|
||||
|
||||
impl<TDocSet: DocSet> UnoptimizedDocSet<TDocSet> {
|
||||
pub fn wrap(docset: TDocSet) -> UnoptimizedDocSet<TDocSet> {
|
||||
UnoptimizedDocSet(docset)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for UnoptimizedDocSet<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.0.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.0.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.0.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> Scorer for UnoptimizedDocSet<TScorer> {
|
||||
fn score(&mut self) -> Score {
|
||||
self.0.score()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn test_skip_against_unoptimized<F: Fn() -> Box<DocSet>>(
|
||||
postings_factory: F,
|
||||
targets: Vec<u32>,
|
||||
) {
|
||||
for target in targets {
|
||||
let mut postings_opt = postings_factory();
|
||||
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
|
||||
let skip_result_opt = postings_opt.skip_next(target);
|
||||
let skip_result_unopt = postings_unopt.skip_next(target);
|
||||
assert_eq!(
|
||||
skip_result_unopt, skip_result_opt,
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
match skip_result_opt {
|
||||
SkipResult::Reached => assert_eq!(postings_opt.doc(), target),
|
||||
SkipResult::OverStep => assert!(postings_opt.doc() > target),
|
||||
SkipResult::End => {
|
||||
return;
|
||||
}
|
||||
}
|
||||
while postings_opt.advance() {
|
||||
assert!(postings_unopt.advance());
|
||||
assert_eq!(
|
||||
postings_opt.doc(),
|
||||
postings_unopt.doc(),
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
}
|
||||
assert!(!postings_unopt.advance());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,90 +1,104 @@
|
||||
use DocId;
|
||||
use schema::Term;
|
||||
use super::stacker::{Addr, MemoryArena, TermHashMap};
|
||||
|
||||
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
use postings::UnorderedTermId;
|
||||
use postings::{FieldSerializer, InvertedIndexSerializer};
|
||||
use std::io;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Field, FieldEntry, FieldType, Schema, Term};
|
||||
use std::collections::HashMap;
|
||||
use postings::Recorder;
|
||||
use Result;
|
||||
use schema::{Field, Schema};
|
||||
use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::DerefMut;
|
||||
use datastruct::stacker::{Heap, TermHashMap};
|
||||
use postings::{NothingRecorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use termdict::TermOrdinal;
|
||||
use tokenizer::Token;
|
||||
use tokenizer::TokenStream;
|
||||
use schema::IndexRecordOption;
|
||||
use postings::UnorderedTermId;
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
fn posting_from_field_entry<'a>(
|
||||
field_entry: &FieldEntry,
|
||||
heap: &'a Heap,
|
||||
) -> Box<PostingsWriter + 'a> {
|
||||
fn posting_from_field_entry<'a>(field_entry: &FieldEntry) -> Box<PostingsWriter> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|indexing_options| match indexing_options.index_option() {
|
||||
IndexRecordOption::Basic => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed()
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)),
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
// FieldType::Bytes cannot actually be indexed.
|
||||
// TODO fix during the indexer refactoring described in #276
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MultiFieldPostingsWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
term_index: TermHashMap<'a>,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
|
||||
pub struct MultiFieldPostingsWriter {
|
||||
heap: MemoryArena,
|
||||
schema: Schema,
|
||||
term_index: TermHashMap,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter>>,
|
||||
}
|
||||
|
||||
impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
impl MultiFieldPostingsWriter {
|
||||
/// Create a new `MultiFieldPostingsWriter` given
|
||||
/// a schema and a heap.
|
||||
pub fn new(schema: &Schema, table_bits: usize, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
|
||||
let term_index = TermHashMap::new(table_bits, heap);
|
||||
pub fn new(schema: &Schema, table_bits: usize) -> MultiFieldPostingsWriter {
|
||||
let term_index = TermHashMap::new(table_bits);
|
||||
let per_field_postings_writers: Vec<_> = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field_entry| posting_from_field_entry(field_entry, heap))
|
||||
.map(|field_entry| posting_from_field_entry(field_entry))
|
||||
.collect();
|
||||
|
||||
MultiFieldPostingsWriter {
|
||||
heap,
|
||||
heap: MemoryArena::new(),
|
||||
schema: schema.clone(),
|
||||
term_index,
|
||||
per_field_postings_writers,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.term_index.mem_usage() + self.heap.mem_usage()
|
||||
}
|
||||
|
||||
pub fn index_text(&mut self, doc: DocId, field: Field, token_stream: &mut TokenStream) -> u32 {
|
||||
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
|
||||
postings_writer.index_text(&mut self.term_index, doc, field, token_stream, self.heap)
|
||||
postings_writer.index_text(
|
||||
&mut self.term_index,
|
||||
doc,
|
||||
field,
|
||||
token_stream,
|
||||
&mut self.heap,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn subscribe(&mut self, doc: DocId, term: &Term) -> UnorderedTermId {
|
||||
let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
|
||||
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, self.heap)
|
||||
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, &mut self.heap)
|
||||
}
|
||||
|
||||
/// Serialize the inverted index.
|
||||
/// It pushes all term, one field at a time, towards the
|
||||
/// postings serializer.
|
||||
#[allow(needless_range_loop)]
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> Result<HashMap<Field, HashMap<UnorderedTermId, usize>>> {
|
||||
let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
|
||||
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self
|
||||
.term_index
|
||||
.iter()
|
||||
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
|
||||
.collect();
|
||||
term_offsets.sort_by_key(|&(k, _, _)| k);
|
||||
|
||||
let mut offsets: Vec<(Field, usize)> = vec![];
|
||||
@@ -94,8 +108,10 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
.map(|(key, _, _)| Term::wrap(key).field())
|
||||
.enumerate();
|
||||
|
||||
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, usize>> =
|
||||
HashMap::new();
|
||||
let mut unordered_term_mappings: HashMap<
|
||||
Field,
|
||||
HashMap<UnorderedTermId, TermOrdinal>,
|
||||
> = HashMap::new();
|
||||
|
||||
let mut prev_field = Field(u32::max_value());
|
||||
for (offset, field) in term_offsets_it {
|
||||
@@ -110,40 +126,46 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
let (field, start) = offsets[i];
|
||||
let (_, stop) = offsets[i + 1];
|
||||
|
||||
// populating the unordered term ord -> ordered term ord mapping
|
||||
// for the field.
|
||||
let mut mapping = HashMap::new();
|
||||
for (term_ord, term_unord_id) in term_offsets[start..stop]
|
||||
.iter()
|
||||
.map(|&(_, _, bucket)| bucket)
|
||||
.enumerate()
|
||||
{
|
||||
mapping.insert(term_unord_id, term_ord);
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
|
||||
match field_entry.field_type() {
|
||||
&FieldType::Str(_) | &FieldType::HierarchicalFacet => {
|
||||
// populating the (unordered term ord) -> (ordered term ord) mapping
|
||||
// for the field.
|
||||
let mut unordered_term_ids = term_offsets[start..stop]
|
||||
.iter()
|
||||
.map(|&(_, _, bucket)| bucket);
|
||||
let mut mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
|
||||
.enumerate()
|
||||
.map(|(term_ord, unord_term_id)| {
|
||||
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
|
||||
})
|
||||
.collect();
|
||||
unordered_term_mappings.insert(field, mapping);
|
||||
}
|
||||
&FieldType::U64(_) | &FieldType::I64(_) => {}
|
||||
&FieldType::Bytes => {}
|
||||
}
|
||||
unordered_term_mappings.insert(field, mapping);
|
||||
|
||||
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
|
||||
let mut field_serializer = serializer.new_field(field, postings_writer.total_num_tokens())?;
|
||||
let mut field_serializer =
|
||||
serializer.new_field(field, postings_writer.total_num_tokens())?;
|
||||
postings_writer.serialize(
|
||||
&term_offsets[start..stop],
|
||||
&mut field_serializer,
|
||||
self.heap,
|
||||
&self.term_index.heap,
|
||||
&self.heap,
|
||||
)?;
|
||||
field_serializer.close()?;
|
||||
}
|
||||
Ok(unordered_term_mappings)
|
||||
}
|
||||
|
||||
/// Return true iff the term dictionary is saturated.
|
||||
pub fn is_term_saturated(&self) -> bool {
|
||||
self.term_index.is_saturated()
|
||||
}
|
||||
}
|
||||
|
||||
/// The `PostingsWriter` is in charge of receiving documenting
|
||||
/// and building a `Segment` in anonymous memory.
|
||||
///
|
||||
/// `PostingsWriter` writes in a `Heap`.
|
||||
/// `PostingsWriter` writes in a `MemoryArena`.
|
||||
pub trait PostingsWriter {
|
||||
/// Record that a document contains a term at a given position.
|
||||
///
|
||||
@@ -158,16 +180,17 @@ pub trait PostingsWriter {
|
||||
doc: DocId,
|
||||
pos: u32,
|
||||
term: &Term,
|
||||
heap: &Heap,
|
||||
heap: &mut MemoryArena,
|
||||
) -> UnorderedTermId;
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], u32, UnorderedTermId)],
|
||||
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
term_heap: &MemoryArena,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()>;
|
||||
|
||||
/// Tokenize a text and subscribe all of its token.
|
||||
@@ -177,10 +200,9 @@ pub trait PostingsWriter {
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
token_stream: &mut TokenStream,
|
||||
heap: &Heap,
|
||||
heap: &mut MemoryArena,
|
||||
) -> u32 {
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
term.set_field(field);
|
||||
let mut term = Term::for_field(field);
|
||||
let num_tokens = {
|
||||
let mut sink = |token: &Token| {
|
||||
term.set_text(token.text.as_str());
|
||||
@@ -196,61 +218,67 @@ pub trait PostingsWriter {
|
||||
|
||||
/// The `SpecializedPostingsWriter` is just here to remove dynamic
|
||||
/// dispatch to the recorder information.
|
||||
pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
|
||||
heap: &'a Heap,
|
||||
pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
|
||||
total_num_tokens: u64,
|
||||
_recorder_type: PhantomData<Rec>,
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
|
||||
/// constructor
|
||||
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
|
||||
pub fn new() -> SpecializedPostingsWriter<Rec> {
|
||||
SpecializedPostingsWriter {
|
||||
heap,
|
||||
total_num_tokens: 0u64,
|
||||
_recorder_type: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
|
||||
pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
|
||||
pub fn new_boxed() -> Box<PostingsWriter> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
|
||||
impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
term_index: &mut TermHashMap,
|
||||
doc: DocId,
|
||||
position: u32,
|
||||
term: &Term,
|
||||
heap: &Heap,
|
||||
heap: &mut MemoryArena,
|
||||
) -> UnorderedTermId {
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
let (term_ord, recorder): (UnorderedTermId, &mut Rec) = term_index.get_or_create(term);
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
if current_doc != u32::max_value() {
|
||||
recorder.close_doc(heap);
|
||||
}
|
||||
recorder.new_doc(doc, heap);
|
||||
}
|
||||
self.total_num_tokens += 1;
|
||||
recorder.record_position(position, heap);
|
||||
term_ord
|
||||
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
|
||||
if opt_recorder.is_some() {
|
||||
let mut recorder = opt_recorder.unwrap();
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
recorder.close_doc(heap);
|
||||
recorder.new_doc(doc, heap);
|
||||
}
|
||||
recorder.record_position(position, heap);
|
||||
recorder
|
||||
} else {
|
||||
let mut recorder = Rec::new(heap);
|
||||
recorder.new_doc(doc, heap);
|
||||
recorder.record_position(position, heap);
|
||||
recorder
|
||||
}
|
||||
}) as UnorderedTermId
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], u32, UnorderedTermId)],
|
||||
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
termdict_heap: &MemoryArena,
|
||||
heap: &MemoryArena,
|
||||
) -> io::Result<()> {
|
||||
for &(term_bytes, addr, _) in term_addrs {
|
||||
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
|
||||
let recorder: Rec = unsafe { termdict_heap.read(addr) };
|
||||
serializer.new_term(&term_bytes[4..])?;
|
||||
recorder.serialize(addr, serializer, heap)?;
|
||||
recorder.serialize(serializer, heap)?;
|
||||
serializer.close_term()?;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use DocId;
|
||||
use std::{self, io};
|
||||
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
|
||||
use postings::FieldSerializer;
|
||||
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
|
||||
use std::{self, io};
|
||||
use DocId;
|
||||
|
||||
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
|
||||
const POSITION_END: u32 = std::u32::MAX;
|
||||
@@ -15,62 +15,53 @@ const POSITION_END: u32 = std::u32::MAX;
|
||||
/// * the document id
|
||||
/// * the term frequency
|
||||
/// * the term positions
|
||||
pub trait Recorder: HeapAllocable {
|
||||
pub trait Recorder: Copy {
|
||||
///
|
||||
fn new(heap: &mut MemoryArena) -> Self;
|
||||
/// Returns the current document
|
||||
fn current_doc(&self) -> u32;
|
||||
/// Starts recording information about a new document
|
||||
/// This method shall only be called if the term is within the document.
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap);
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena);
|
||||
/// Record the position of a term. For each document,
|
||||
/// this method will be called `term_freq` times.
|
||||
fn record_position(&mut self, position: u32, heap: &Heap);
|
||||
fn record_position(&mut self, position: u32, heap: &mut MemoryArena);
|
||||
/// Close the document. It will help record the term frequency.
|
||||
fn close_doc(&mut self, heap: &Heap);
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena);
|
||||
/// Pushes the postings information to the serializer.
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()>;
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>;
|
||||
}
|
||||
|
||||
/// Only records the doc ids
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct NothingRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
}
|
||||
|
||||
impl HeapAllocable for NothingRecorder {
|
||||
fn with_addr(addr: u32) -> NothingRecorder {
|
||||
impl Recorder for NothingRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
NothingRecorder {
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
current_doc: u32::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _heap: &Heap) {}
|
||||
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
|
||||
|
||||
fn close_doc(&mut self, _heap: &Heap) {}
|
||||
fn close_doc(&mut self, _heap: &mut MemoryArena) {}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
for doc in self.stack.iter(self_addr, heap) {
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
for doc in self.stack.iter(heap) {
|
||||
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -78,52 +69,47 @@ impl Recorder for NothingRecorder {
|
||||
}
|
||||
|
||||
/// Recorder encoding document ids, and term frequencies
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
current_tf: u32,
|
||||
}
|
||||
|
||||
impl HeapAllocable for TermFrequencyRecorder {
|
||||
fn with_addr(addr: u32) -> TermFrequencyRecorder {
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
current_doc: u32::max_value(),
|
||||
current_tf: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _heap: &Heap) {
|
||||
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
|
||||
self.current_tf += 1;
|
||||
}
|
||||
|
||||
fn close_doc(&mut self, heap: &Heap) {
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
||||
debug_assert!(self.current_tf > 0);
|
||||
self.stack.push(self.current_tf, heap);
|
||||
self.current_tf = 0;
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
// the last document has not been closed...
|
||||
// its term freq is self.current_tf.
|
||||
let mut doc_iter = self.stack
|
||||
.iter(self_addr, heap)
|
||||
let mut doc_iter = self
|
||||
.stack
|
||||
.iter(heap)
|
||||
.chain(Some(self.current_tf).into_iter());
|
||||
|
||||
while let Some(doc) = doc_iter.next() {
|
||||
@@ -137,46 +123,40 @@ impl Recorder for TermFrequencyRecorder {
|
||||
}
|
||||
|
||||
/// Recorder encoding term frequencies as well as positions.
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct TFAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
}
|
||||
|
||||
impl HeapAllocable for TFAndPositionRecorder {
|
||||
fn with_addr(addr: u32) -> TFAndPositionRecorder {
|
||||
impl Recorder for TFAndPositionRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
TFAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
current_doc: u32::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TFAndPositionRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, position: u32, heap: &Heap) {
|
||||
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
|
||||
self.stack.push(position, heap);
|
||||
}
|
||||
|
||||
fn close_doc(&mut self, heap: &Heap) {
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
||||
self.stack.push(POSITION_END, heap);
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
let mut doc_positions = Vec::with_capacity(100);
|
||||
let mut positions_iter = self.stack.iter(self_addr, heap);
|
||||
let mut positions_iter = self.stack.iter(heap);
|
||||
while let Some(doc) = positions_iter.next() {
|
||||
let mut prev_position = 0;
|
||||
doc_positions.clear();
|
||||
|
||||
@@ -2,15 +2,15 @@ use compression::{BlockDecoder, CompressedIntStream, VIntDecoder, COMPRESSION_BL
|
||||
use DocId;
|
||||
|
||||
use common::BitSet;
|
||||
use common::CountingWriter;
|
||||
use common::HasLen;
|
||||
use postings::Postings;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fst::Streamer;
|
||||
use compression::compressed_block_size;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use postings::FreqReadingOption;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fst::Streamer;
|
||||
use postings::serializer::PostingsSerializer;
|
||||
use common::CountingWriter;
|
||||
use postings::FreqReadingOption;
|
||||
use postings::Postings;
|
||||
|
||||
struct PositionComputer {
|
||||
// store the amount of position int
|
||||
@@ -84,9 +84,13 @@ impl SegmentPostings {
|
||||
for &doc in docs {
|
||||
postings_serializer.write_doc(doc, 1u32).unwrap();
|
||||
}
|
||||
postings_serializer.close_term().expect("In memory Serialization should never fail.");
|
||||
postings_serializer
|
||||
.close_term()
|
||||
.expect("In memory Serialization should never fail.");
|
||||
}
|
||||
let (buffer , _) = counting_writer.finish().expect("Serializing in a buffer should never fail.");
|
||||
let (buffer, _) = counting_writer
|
||||
.finish()
|
||||
.expect("Serializing in a buffer should never fail.");
|
||||
let data = ReadOnlySource::from(buffer);
|
||||
let block_segment_postings = BlockSegmentPostings::from_data(
|
||||
docs.len(),
|
||||
@@ -98,7 +102,6 @@ impl SegmentPostings {
|
||||
}
|
||||
|
||||
impl SegmentPostings {
|
||||
|
||||
/// Reads a Segment postings from an &[u8]
|
||||
///
|
||||
/// * `len` - number of document in the posting lists.
|
||||
@@ -125,7 +128,7 @@ fn exponential_search(target: u32, mut start: usize, arr: &[u32]) -> (usize, usi
|
||||
loop {
|
||||
let new = start + jump;
|
||||
if new >= end {
|
||||
return (start, end)
|
||||
return (start, end);
|
||||
}
|
||||
if arr[new] > target {
|
||||
return (start, new);
|
||||
@@ -163,7 +166,8 @@ impl DocSet for SegmentPostings {
|
||||
if self.position_computer.is_some() {
|
||||
let freqs_skipped = &self.block_cursor.freqs()[self.cur..];
|
||||
let sum_freq: u32 = freqs_skipped.iter().sum();
|
||||
self.position_computer.as_mut()
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.add_skip(sum_freq as usize);
|
||||
}
|
||||
@@ -198,7 +202,8 @@ impl DocSet for SegmentPostings {
|
||||
if self.position_computer.is_some() {
|
||||
let freqs_skipped = &self.block_cursor.freqs()[self.cur..start];
|
||||
let sum_freqs: u32 = freqs_skipped.iter().sum();
|
||||
self.position_computer.as_mut()
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.add_skip(sum_freqs as usize);
|
||||
}
|
||||
@@ -211,7 +216,6 @@ impl DocSet for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
@@ -262,7 +266,6 @@ impl DocSet for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl HasLen for SegmentPostings {
|
||||
fn len(&self) -> usize {
|
||||
self.block_cursor.doc_freq()
|
||||
@@ -276,16 +279,11 @@ impl Postings for SegmentPostings {
|
||||
|
||||
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
if self.position_computer.is_some() {
|
||||
let prev_capacity = output.capacity();
|
||||
let term_freq = self.term_freq() as usize;
|
||||
if term_freq > prev_capacity {
|
||||
let additional_len = term_freq - output.len();
|
||||
output.reserve(additional_len);
|
||||
}
|
||||
unsafe {
|
||||
output.set_len(term_freq);
|
||||
self.position_computer.as_mut().unwrap().positions_with_offset(offset, &mut output[..])
|
||||
}
|
||||
output.resize(self.term_freq() as usize, 0u32);
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.positions_with_offset(offset, &mut output[..])
|
||||
} else {
|
||||
output.clear();
|
||||
}
|
||||
@@ -401,7 +399,8 @@ impl BlockSegmentPostings {
|
||||
/// Returns false iff there was no remaining blocks.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.num_bitpacked_blocks > 0 {
|
||||
let num_consumed_bytes = self.doc_decoder
|
||||
let num_consumed_bytes = self
|
||||
.doc_decoder
|
||||
.uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
match self.freq_reading_option {
|
||||
@@ -411,7 +410,8 @@ impl BlockSegmentPostings {
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self.freq_decoder
|
||||
let num_consumed_bytes = self
|
||||
.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref());
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
@@ -473,16 +473,16 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use docset::DocSet;
|
||||
use super::BlockSegmentPostings;
|
||||
use super::SegmentPostings;
|
||||
use schema::SchemaBuilder;
|
||||
use common::HasLen;
|
||||
use core::Index;
|
||||
use schema::INT_INDEXED;
|
||||
use schema::Term;
|
||||
use docset::DocSet;
|
||||
use fst::Streamer;
|
||||
use schema::IndexRecordOption;
|
||||
use common::HasLen;
|
||||
use super::BlockSegmentPostings;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Term;
|
||||
use schema::INT_INDEXED;
|
||||
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
@@ -570,4 +570,3 @@ mod tests {
|
||||
assert_eq!(block_segments.docs(), &[1, 3, 5]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,20 +1,16 @@
|
||||
use Result;
|
||||
use termdict::TermDictionaryBuilderImpl;
|
||||
use super::TermInfo;
|
||||
use schema::Field;
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use directory::WritePtr;
|
||||
use compression::{BlockEncoder, COMPRESSION_BLOCK_SIZE};
|
||||
use DocId;
|
||||
use core::Segment;
|
||||
use std::io::{self, Write};
|
||||
use compression::VIntEncoder;
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
use common::CompositeWrite;
|
||||
use termdict::TermDictionaryBuilder;
|
||||
use common::{CompositeWrite, CountingWriter};
|
||||
use compression::VIntEncoder;
|
||||
use compression::{BlockEncoder, COMPRESSION_BLOCK_SIZE};
|
||||
use core::Segment;
|
||||
use directory::WritePtr;
|
||||
use schema::Schema;
|
||||
use schema::{Field, FieldEntry, FieldType};
|
||||
use std::io::{self, Write};
|
||||
use termdict::{TermDictionaryBuilder, TermOrdinal};
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
/// `PostingsSerializer` is in charge of serializing
|
||||
/// postings on disk, in the
|
||||
@@ -85,7 +81,11 @@ impl InvertedIndexSerializer {
|
||||
/// a given field.
|
||||
///
|
||||
/// Loads the indexing options for the given field.
|
||||
pub fn new_field(&mut self, field: Field, total_num_tokens: u64) -> io::Result<FieldSerializer> {
|
||||
pub fn new_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
total_num_tokens: u64,
|
||||
) -> io::Result<FieldSerializer> {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
let term_dictionary_write = self.terms_write.for_field(field);
|
||||
let postings_write = self.postings_write.for_field(field);
|
||||
@@ -111,11 +111,12 @@ impl InvertedIndexSerializer {
|
||||
/// The field serializer is in charge of
|
||||
/// the serialization of a specific field.
|
||||
pub struct FieldSerializer<'a> {
|
||||
term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter<WritePtr>>,
|
||||
term_dictionary_builder: TermDictionaryBuilder<&'a mut CountingWriter<WritePtr>>,
|
||||
postings_serializer: PostingsSerializer<&'a mut CountingWriter<WritePtr>>,
|
||||
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
|
||||
current_term_info: TermInfo,
|
||||
term_open: bool,
|
||||
num_terms: TermOrdinal,
|
||||
}
|
||||
|
||||
impl<'a> FieldSerializer<'a> {
|
||||
@@ -125,7 +126,6 @@ impl<'a> FieldSerializer<'a> {
|
||||
postings_write: &'a mut CountingWriter<WritePtr>,
|
||||
positions_write: &'a mut CountingWriter<WritePtr>,
|
||||
) -> io::Result<FieldSerializer<'a>> {
|
||||
|
||||
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
|
||||
FieldType::Str(ref text_options) => {
|
||||
if let Some(text_indexing_options) = text_options.get_indexing_options() {
|
||||
@@ -141,7 +141,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
_ => (false, false),
|
||||
};
|
||||
let term_dictionary_builder =
|
||||
TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?;
|
||||
TermDictionaryBuilder::new(term_dictionary_write, field_type)?;
|
||||
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled);
|
||||
let positions_serializer_opt = if position_enabled {
|
||||
Some(PositionSerializer::new(positions_write))
|
||||
@@ -155,11 +155,13 @@ impl<'a> FieldSerializer<'a> {
|
||||
positions_serializer_opt,
|
||||
current_term_info: TermInfo::default(),
|
||||
term_open: false,
|
||||
num_terms: TermOrdinal::default(),
|
||||
})
|
||||
}
|
||||
|
||||
fn current_term_info(&self) -> TermInfo {
|
||||
let (filepos, offset) = self.positions_serializer_opt
|
||||
let (filepos, offset) = self
|
||||
.positions_serializer_opt
|
||||
.as_ref()
|
||||
.map(|positions_serializer| positions_serializer.addr())
|
||||
.unwrap_or((0u64, 0u8));
|
||||
@@ -175,7 +177,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
/// * term - the term. It needs to come after the previous term according
|
||||
/// to the lexicographical order.
|
||||
/// * doc_freq - return the number of document containing the term.
|
||||
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
|
||||
pub fn new_term(&mut self, term: &[u8]) -> io::Result<TermOrdinal> {
|
||||
assert!(
|
||||
!self.term_open,
|
||||
"Called new_term, while the previous term was not closed."
|
||||
@@ -183,7 +185,10 @@ impl<'a> FieldSerializer<'a> {
|
||||
self.term_open = true;
|
||||
self.postings_serializer.clear();
|
||||
self.current_term_info = self.current_term_info();
|
||||
self.term_dictionary_builder.insert_key(term)
|
||||
self.term_dictionary_builder.insert_key(term)?;
|
||||
let term_ordinal = self.num_terms;
|
||||
self.num_terms += 1;
|
||||
Ok(term_ordinal)
|
||||
}
|
||||
|
||||
/// Serialize the information that a document contains the current term,
|
||||
@@ -268,7 +273,8 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
|
||||
{
|
||||
// encode the doc ids
|
||||
let block_encoded: &[u8] = self.block_encoder
|
||||
let block_encoded: &[u8] = self
|
||||
.block_encoder
|
||||
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
@@ -294,14 +300,16 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
// In that case, the remaining part is encoded
|
||||
// using variable int encoding.
|
||||
{
|
||||
let block_encoded = self.block_encoder
|
||||
let block_encoded = self
|
||||
.block_encoder
|
||||
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.doc_ids.clear();
|
||||
}
|
||||
// ... Idem for term frequencies
|
||||
if self.termfreq_enabled {
|
||||
let block_encoded = self.block_encoder
|
||||
let block_encoded = self
|
||||
.block_encoder
|
||||
.compress_vint_unsorted(&self.term_freqs[..]);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.term_freqs.clear();
|
||||
|
||||
218
src/postings/stacker/expull.rs
Normal file
218
src/postings/stacker/expull.rs
Normal file
@@ -0,0 +1,218 @@
|
||||
use super::{Addr, MemoryArena};
|
||||
|
||||
use common::is_power_of_2;
|
||||
use std::mem;
|
||||
|
||||
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
|
||||
|
||||
const FIRST_BLOCK: u32 = 4u32;
|
||||
|
||||
#[inline]
|
||||
pub fn jump_needed(len: u32) -> Option<usize> {
|
||||
match len {
|
||||
0...3 => None,
|
||||
4...MAX_BLOCK_LEN => {
|
||||
if is_power_of_2(len as usize) {
|
||||
Some(len as usize)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
n => {
|
||||
if n % MAX_BLOCK_LEN == 0 {
|
||||
Some(MAX_BLOCK_LEN as usize)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An exponential unrolled link.
|
||||
///
|
||||
/// The use case is as follows. Tantivy's indexer conceptually acts like a
|
||||
/// `HashMap<Term, Vec<u32>>`. As we come accross a given term in document
|
||||
/// `D`, we lookup the term in the map and append the document id to its vector.
|
||||
///
|
||||
/// The vector is then only read when it is serialized.
|
||||
///
|
||||
/// The `ExpUnrolledLinkedList` offers a more efficient solution to this
|
||||
/// problem.
|
||||
///
|
||||
/// It combines the idea of the unrolled linked list and tries to address the
|
||||
/// problem of selecting an adequate block size using a strategy similar to
|
||||
/// that of the `Vec` amortized resize strategy.
|
||||
///
|
||||
/// Data is stored in a linked list of blocks. The first block has a size of `4`
|
||||
/// and each block has a length of twice that of the previous block up to
|
||||
/// `MAX_BLOCK_LEN = 32768`.
|
||||
///
|
||||
/// This strategy is a good trade off to handle numerous very rare terms
|
||||
/// and avoid wasting half of the memory for very frequent terms.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct ExpUnrolledLinkedList {
|
||||
len: u32,
|
||||
head: Addr,
|
||||
tail: Addr,
|
||||
}
|
||||
|
||||
impl ExpUnrolledLinkedList {
|
||||
pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList {
|
||||
let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
|
||||
ExpUnrolledLinkedList {
|
||||
len: 0u32,
|
||||
head: addr,
|
||||
tail: addr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> {
|
||||
ExpUnrolledLinkedListIterator {
|
||||
heap,
|
||||
addr: self.head,
|
||||
len: self.len,
|
||||
consumed: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Appends a new element to the current stack.
|
||||
///
|
||||
/// If the current block end is reached, a new block is allocated.
|
||||
pub fn push(&mut self, val: u32, heap: &mut MemoryArena) {
|
||||
self.len += 1;
|
||||
if let Some(new_block_len) = jump_needed(self.len) {
|
||||
// We need to allocate another block.
|
||||
// We also allocate an extra `u32` to store the pointer
|
||||
// to the future next block.
|
||||
let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
|
||||
let new_block_addr: Addr = heap.allocate_space(new_block_size);
|
||||
unsafe {
|
||||
// logic
|
||||
heap.write(self.tail, new_block_addr)
|
||||
};
|
||||
self.tail = new_block_addr;
|
||||
}
|
||||
unsafe {
|
||||
// logic
|
||||
heap.write(self.tail, val);
|
||||
self.tail = self.tail.offset(mem::size_of::<u32>() as u32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExpUnrolledLinkedListIterator<'a> {
|
||||
heap: &'a MemoryArena,
|
||||
addr: Addr,
|
||||
len: u32,
|
||||
consumed: u32,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.consumed == self.len {
|
||||
None
|
||||
} else {
|
||||
self.consumed += 1;
|
||||
let addr: Addr = if jump_needed(self.consumed).is_some() {
|
||||
unsafe {
|
||||
// logic
|
||||
self.heap.read(self.addr)
|
||||
}
|
||||
} else {
|
||||
self.addr
|
||||
};
|
||||
self.addr = addr.offset(mem::size_of::<u32>() as u32);
|
||||
Some(unsafe {
|
||||
// logic
|
||||
self.heap.read(addr)
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::super::MemoryArena;
|
||||
use super::jump_needed;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_stack() {
|
||||
let mut heap = MemoryArena::new();
|
||||
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
|
||||
stack.push(1u32, &mut heap);
|
||||
stack.push(2u32, &mut heap);
|
||||
stack.push(4u32, &mut heap);
|
||||
stack.push(8u32, &mut heap);
|
||||
{
|
||||
let mut it = stack.iter(&heap);
|
||||
assert_eq!(it.next().unwrap(), 1u32);
|
||||
assert_eq!(it.next().unwrap(), 2u32);
|
||||
assert_eq!(it.next().unwrap(), 4u32);
|
||||
assert_eq!(it.next().unwrap(), 8u32);
|
||||
assert!(it.next().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_jump_if_needed() {
|
||||
let mut block_len = 4u32;
|
||||
let mut i = 0;
|
||||
while i < 10_000_000 {
|
||||
assert!(jump_needed(i + block_len - 1).is_none());
|
||||
assert!(jump_needed(i + block_len + 1).is_none());
|
||||
assert!(jump_needed(i + block_len).is_some());
|
||||
let new_block_len = jump_needed(i + block_len).unwrap();
|
||||
i += block_len;
|
||||
block_len = new_block_len as u32;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::ExpUnrolledLinkedList;
|
||||
use tantivy_memory_arena::MemoryArena;
|
||||
use test::Bencher;
|
||||
|
||||
const NUM_STACK: usize = 10_000;
|
||||
const STACK_SIZE: u32 = 1000;
|
||||
|
||||
#[bench]
|
||||
fn bench_push_vec(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut vecs = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
vecs.push(Vec::new());
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
vecs[t].push(i);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_push_stack(bench: &mut Bencher) {
|
||||
let heap = MemoryArena::new();
|
||||
bench.iter(|| {
|
||||
let mut stacks = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
|
||||
stacks.push(stack);
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
stacks[t].push(i, &heap);
|
||||
}
|
||||
}
|
||||
heap.clear();
|
||||
});
|
||||
}
|
||||
}
|
||||
291
src/postings/stacker/memory_arena.rs
Normal file
291
src/postings/stacker/memory_arena.rs
Normal file
@@ -0,0 +1,291 @@
|
||||
//! 32-bits Memory arena for types implementing `Copy`.
|
||||
//! This Memory arena has been implemented to fit the use of tantivy's indexer
|
||||
//! and has *twisted specifications*.
|
||||
//!
|
||||
//! - It works on stable rust.
|
||||
//! - One can get an accurate figure of the memory usage of the arena.
|
||||
//! - Allocation are very cheap.
|
||||
//! - Allocation happening consecutively are very likely to have great locality.
|
||||
//! - Addresses (`Addr`) are 32bits.
|
||||
//! - Dropping the whole `MemoryArena` is cheap.
|
||||
//!
|
||||
//! # Limitations
|
||||
//!
|
||||
//! - Your object shall not implement `Drop`.
|
||||
//! - `Addr` to the `Arena` are 32-bits. The maximum capacity of the arena
|
||||
//! is 4GB. *(Tantivy's indexer uses one arena per indexing thread.)*
|
||||
//! - The arena only works for objects much smaller than `1MB`.
|
||||
//! Allocating more than `1MB` at a time will result in a panic,
|
||||
//! and allocating a lot of large object (> 500KB) will result in a fragmentation.
|
||||
//! - Your objects are store in an unaligned fashion. For this reason,
|
||||
//! the API does not let you access them as references.
|
||||
//!
|
||||
//! Instead, you store and access your data via `.write(...)` and `.read(...)`, which under the hood
|
||||
//! stores your object using `ptr::write_unaligned` and `ptr::read_unaligned`.
|
||||
use std::mem;
|
||||
use std::ptr;
|
||||
|
||||
const NUM_BITS_PAGE_ADDR: usize = 20;
|
||||
const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
|
||||
|
||||
/// Represents a pointer into the `MemoryArena`
|
||||
/// .
|
||||
/// Pointer are 32-bits and are split into
|
||||
/// two parts.
|
||||
///
|
||||
/// The first 12 bits represent the id of a
|
||||
/// page of memory.
|
||||
///
|
||||
/// The last 20 bits are an address within this page of memory.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct Addr(u32);
|
||||
|
||||
impl Addr {
|
||||
/// Creates a null pointer.
|
||||
pub fn null_pointer() -> Addr {
|
||||
Addr(u32::max_value())
|
||||
}
|
||||
|
||||
/// Returns the `Addr` object for `addr + offset`
|
||||
pub fn offset(&self, offset: u32) -> Addr {
|
||||
Addr(self.0.wrapping_add(offset))
|
||||
}
|
||||
|
||||
fn new(page_id: usize, local_addr: usize) -> Addr {
|
||||
Addr((page_id << NUM_BITS_PAGE_ADDR | local_addr) as u32)
|
||||
}
|
||||
|
||||
fn page_id(&self) -> usize {
|
||||
(self.0 as usize) >> NUM_BITS_PAGE_ADDR
|
||||
}
|
||||
|
||||
fn page_local_addr(&self) -> usize {
|
||||
(self.0 as usize) & (PAGE_SIZE - 1)
|
||||
}
|
||||
|
||||
/// Returns true if and only if the `Addr` is null.
|
||||
pub fn is_null(&self) -> bool {
|
||||
self.0 == u32::max_value()
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait required for an object to be `storable`.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// Most of the time you should not implement this trait,
|
||||
/// and only use the `MemoryArena` with object implementing `Copy`.
|
||||
///
|
||||
/// `ArenaStorable` is used in `tantivy` to force
|
||||
/// a `Copy` object and a `slice` of data to be stored contiguously.
|
||||
pub trait ArenaStorable {
|
||||
fn num_bytes(&self) -> usize;
|
||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr);
|
||||
}
|
||||
|
||||
impl<V> ArenaStorable for V
|
||||
where
|
||||
V: Copy,
|
||||
{
|
||||
fn num_bytes(&self) -> usize {
|
||||
mem::size_of::<V>()
|
||||
}
|
||||
|
||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
|
||||
let dst_ptr = arena.get_mut_ptr(addr) as *mut V;
|
||||
ptr::write_unaligned(dst_ptr, self);
|
||||
}
|
||||
}
|
||||
|
||||
/// The `MemoryArena`
|
||||
pub struct MemoryArena {
|
||||
pages: Vec<Page>,
|
||||
}
|
||||
|
||||
impl MemoryArena {
|
||||
/// Creates a new memory arena.
|
||||
pub fn new() -> MemoryArena {
|
||||
let first_page = Page::new(0);
|
||||
MemoryArena {
|
||||
pages: vec![first_page],
|
||||
}
|
||||
}
|
||||
|
||||
fn add_page(&mut self) -> &mut Page {
|
||||
let new_page_id = self.pages.len();
|
||||
self.pages.push(Page::new(new_page_id));
|
||||
&mut self.pages[new_page_id]
|
||||
}
|
||||
|
||||
/// Returns an estimate in number of bytes
|
||||
/// of resident memory consumed by the `MemoryArena`.
|
||||
///
|
||||
/// Internally, it counts a number of `1MB` pages
|
||||
/// and therefore delivers an upperbound.
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.pages.len() * PAGE_SIZE
|
||||
}
|
||||
|
||||
/// Writes a slice at the given address, assuming the
|
||||
/// memory was allocated beforehands.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic or corrupt the heap if he space was not
|
||||
/// properly allocated beforehands.
|
||||
pub fn write_bytes<B: AsRef<[u8]>>(&mut self, addr: Addr, data: B) {
|
||||
let bytes = data.as_ref();
|
||||
self.pages[addr.page_id()]
|
||||
.get_mut_slice(addr.page_local_addr(), bytes.len())
|
||||
.copy_from_slice(bytes);
|
||||
}
|
||||
|
||||
/// Returns the `len` bytes starting at `addr`
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the memory has not been allocated beforehands.
|
||||
pub fn read_slice(&self, addr: Addr, len: usize) -> &[u8] {
|
||||
self.pages[addr.page_id()].get_slice(addr.page_local_addr(), len)
|
||||
}
|
||||
|
||||
unsafe fn get_mut_ptr(&mut self, addr: Addr) -> *mut u8 {
|
||||
self.pages[addr.page_id()].get_mut_ptr(addr.page_local_addr())
|
||||
}
|
||||
|
||||
/// Stores an item's data in the heap
|
||||
///
|
||||
/// It allocates the `Item` beforehands.
|
||||
pub fn store<Item: ArenaStorable>(&mut self, val: Item) -> Addr {
|
||||
let num_bytes = val.num_bytes();
|
||||
let addr = self.allocate_space(num_bytes);
|
||||
unsafe {
|
||||
self.write(addr, val);
|
||||
};
|
||||
addr
|
||||
}
|
||||
|
||||
pub unsafe fn write<Item: ArenaStorable>(&mut self, addr: Addr, val: Item) {
|
||||
val.write_into(self, addr)
|
||||
}
|
||||
|
||||
/// Read an item in the heap at the given `address`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If the address is erroneous
|
||||
pub unsafe fn read<Item: Copy>(&self, addr: Addr) -> Item {
|
||||
let ptr = self.pages[addr.page_id()].get_ptr(addr.page_local_addr());
|
||||
ptr::read_unaligned(ptr as *const Item)
|
||||
}
|
||||
|
||||
/// Allocates `len` bytes and returns the allocated address.
|
||||
pub fn allocate_space(&mut self, len: usize) -> Addr {
|
||||
let page_id = self.pages.len() - 1;
|
||||
if let Some(addr) = self.pages[page_id].allocate_space(len) {
|
||||
return addr;
|
||||
}
|
||||
self.add_page().allocate_space(len).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
struct Page {
|
||||
page_id: usize,
|
||||
len: usize,
|
||||
data: Box<[u8]>,
|
||||
}
|
||||
|
||||
impl Page {
|
||||
fn new(page_id: usize) -> Page {
|
||||
let mut data: Vec<u8> = Vec::with_capacity(PAGE_SIZE);
|
||||
unsafe {
|
||||
data.set_len(PAGE_SIZE);
|
||||
} // avoid initializing page
|
||||
Page {
|
||||
page_id,
|
||||
len: 0,
|
||||
data: data.into_boxed_slice(),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn is_available(&self, len: usize) -> bool {
|
||||
len + self.len <= PAGE_SIZE
|
||||
}
|
||||
|
||||
fn get_mut_slice(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
|
||||
&mut self.data[local_addr..][..len]
|
||||
}
|
||||
|
||||
fn get_slice(&self, local_addr: usize, len: usize) -> &[u8] {
|
||||
&self.data[local_addr..][..len]
|
||||
}
|
||||
|
||||
fn allocate_space(&mut self, len: usize) -> Option<Addr> {
|
||||
if self.is_available(len) {
|
||||
let addr = Addr::new(self.page_id, self.len);
|
||||
self.len += len;
|
||||
Some(addr)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
|
||||
self.data.as_ptr().offset(addr as isize)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
|
||||
self.data.as_mut_ptr().offset(addr as isize)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::MemoryArena;
|
||||
|
||||
#[test]
|
||||
fn test_arena_allocate_slice() {
|
||||
let mut arena = MemoryArena::new();
|
||||
let a = b"hello";
|
||||
let b = b"happy tax payer";
|
||||
|
||||
let addr_a = arena.allocate_space(a.len());
|
||||
arena.write_bytes(addr_a, a);
|
||||
|
||||
let addr_b = arena.allocate_space(b.len());
|
||||
arena.write_bytes(addr_b, b);
|
||||
|
||||
assert_eq!(arena.read_slice(addr_a, a.len()), a);
|
||||
assert_eq!(arena.read_slice(addr_b, b.len()), b);
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
struct MyTest {
|
||||
pub a: usize,
|
||||
pub b: u8,
|
||||
pub c: u32,
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_store_object() {
|
||||
let mut arena = MemoryArena::new();
|
||||
let a = MyTest {
|
||||
a: 143,
|
||||
b: 21,
|
||||
c: 32,
|
||||
};
|
||||
let b = MyTest {
|
||||
a: 113,
|
||||
b: 221,
|
||||
c: 12,
|
||||
};
|
||||
let addr_a = arena.store(a);
|
||||
let addr_b = arena.store(b);
|
||||
assert_eq!(unsafe { arena.read::<MyTest>(addr_a) }, a);
|
||||
assert_eq!(unsafe { arena.read::<MyTest>(addr_b) }, b);
|
||||
}
|
||||
}
|
||||
9
src/postings/stacker/mod.rs
Normal file
9
src/postings/stacker/mod.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
mod expull;
|
||||
mod memory_arena;
|
||||
mod murmurhash2;
|
||||
mod term_hashmap;
|
||||
|
||||
pub use self::expull::ExpUnrolledLinkedList;
|
||||
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
|
||||
use self::murmurhash2::murmurhash2;
|
||||
pub use self::term_hashmap::{compute_table_size, TermHashMap};
|
||||
86
src/postings/stacker/murmurhash2.rs
Normal file
86
src/postings/stacker/murmurhash2.rs
Normal file
@@ -0,0 +1,86 @@
|
||||
use std::ptr;
|
||||
const SEED: u32 = 3_242_157_231u32;
|
||||
const M: u32 = 0x5bd1_e995;
|
||||
|
||||
#[inline(always)]
|
||||
pub fn murmurhash2(key: &[u8]) -> u32 {
|
||||
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
|
||||
let len = key.len() as u32;
|
||||
let mut h: u32 = SEED ^ len;
|
||||
|
||||
let num_blocks = len >> 2;
|
||||
for _ in 0..num_blocks {
|
||||
let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
|
||||
k = k.wrapping_mul(M);
|
||||
k ^= k >> 24;
|
||||
k = k.wrapping_mul(M);
|
||||
h = h.wrapping_mul(M);
|
||||
h ^= k;
|
||||
key_ptr = key_ptr.wrapping_offset(1);
|
||||
}
|
||||
|
||||
// Handle the last few bytes of the input array
|
||||
let remaining: &[u8] = &key[key.len() & !3..];
|
||||
match remaining.len() {
|
||||
3 => {
|
||||
h ^= u32::from(remaining[2]) << 16;
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
2 => {
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
1 => {
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(M);
|
||||
h ^ (h >> 15)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::murmurhash2;
|
||||
use std::collections::HashSet;
|
||||
|
||||
#[test]
|
||||
fn test_murmur() {
|
||||
let s1 = "abcdef";
|
||||
let s2 = "abcdeg";
|
||||
for i in 0..5 {
|
||||
assert_eq!(
|
||||
murmurhash2(&s1[i..5].as_bytes()),
|
||||
murmurhash2(&s2[i..5].as_bytes())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_against_reference_impl() {
|
||||
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
|
||||
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
|
||||
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
|
||||
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
|
||||
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
|
||||
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
|
||||
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_collisions() {
|
||||
let mut set: HashSet<u32> = HashSet::default();
|
||||
for i in 0..10_000 {
|
||||
let s = format!("hash{}", i);
|
||||
let hash = murmurhash2(s.as_bytes());
|
||||
set.insert(hash);
|
||||
}
|
||||
assert_eq!(set.len(), 10_000);
|
||||
}
|
||||
}
|
||||
296
src/postings/stacker/term_hashmap.rs
Normal file
296
src/postings/stacker/term_hashmap.rs
Normal file
@@ -0,0 +1,296 @@
|
||||
use super::murmurhash2;
|
||||
use super::{Addr, ArenaStorable, MemoryArena};
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
use std::slice;
|
||||
|
||||
pub type BucketId = usize;
|
||||
|
||||
struct KeyBytesValue<'a, V> {
|
||||
key: &'a [u8],
|
||||
value: V,
|
||||
}
|
||||
|
||||
impl<'a, V> KeyBytesValue<'a, V> {
|
||||
fn new(key: &'a [u8], value: V) -> KeyBytesValue<'a, V> {
|
||||
KeyBytesValue { key, value }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V> ArenaStorable for KeyBytesValue<'a, V>
|
||||
where
|
||||
V: ArenaStorable,
|
||||
{
|
||||
fn num_bytes(&self) -> usize {
|
||||
0u16.num_bytes() + self.key.len() + self.value.num_bytes()
|
||||
}
|
||||
|
||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
|
||||
arena.write(addr, self.key.len() as u16);
|
||||
arena.write_bytes(addr.offset(2), self.key);
|
||||
arena.write(addr.offset(2 + self.key.len() as u32), self.value);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the actual memory size in bytes
|
||||
/// required to create a table of size $2^num_bits$.
|
||||
pub fn compute_table_size(num_bits: usize) -> usize {
|
||||
(1 << num_bits) * mem::size_of::<KeyValue>()
|
||||
}
|
||||
|
||||
/// `KeyValue` is the item stored in the hash table.
|
||||
/// The key is actually a `BytesRef` object stored in an external heap.
|
||||
/// The `value_addr` also points to an address in the heap.
|
||||
///
|
||||
/// The key and the value are actually stored contiguously.
|
||||
/// For this reason, the (start, stop) information is actually redundant
|
||||
/// and can be simplified in the future
|
||||
#[derive(Copy, Clone)]
|
||||
struct KeyValue {
|
||||
key_value_addr: Addr,
|
||||
hash: u32,
|
||||
}
|
||||
|
||||
impl Default for KeyValue {
|
||||
fn default() -> Self {
|
||||
KeyValue {
|
||||
key_value_addr: Addr::null_pointer(),
|
||||
hash: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl KeyValue {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.key_value_addr.is_null()
|
||||
}
|
||||
}
|
||||
|
||||
/// Customized `HashMap` with string keys
|
||||
///
|
||||
/// This `HashMap` takes String as keys. Keys are
|
||||
/// stored in a user defined heap.
|
||||
///
|
||||
/// The quirky API has the benefit of avoiding
|
||||
/// the computation of the hash of the key twice,
|
||||
/// or copying the key as long as there is no insert.
|
||||
///
|
||||
pub struct TermHashMap {
|
||||
table: Box<[KeyValue]>,
|
||||
pub heap: MemoryArena,
|
||||
mask: usize,
|
||||
occupied: Vec<usize>,
|
||||
}
|
||||
|
||||
struct QuadraticProbing {
|
||||
hash: usize,
|
||||
i: usize,
|
||||
mask: usize,
|
||||
}
|
||||
|
||||
impl QuadraticProbing {
|
||||
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
|
||||
QuadraticProbing { hash, i: 0, mask }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next_probe(&mut self) -> usize {
|
||||
self.i += 1;
|
||||
(self.hash + self.i) & self.mask
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Iter<'a> {
|
||||
hashmap: &'a TermHashMap,
|
||||
inner: slice::Iter<'a, usize>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Iter<'a> {
|
||||
type Item = (&'a [u8], Addr, BucketId);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.inner.next().cloned().map(move |bucket: usize| {
|
||||
let kv = self.hashmap.table[bucket];
|
||||
let (key, offset): (&'a [u8], Addr) =
|
||||
unsafe { self.hashmap.get_key_value(kv.key_value_addr) };
|
||||
(key, offset, bucket as BucketId)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TermHashMap {
|
||||
pub fn new(num_bucket_power_of_2: usize) -> TermHashMap {
|
||||
let heap = MemoryArena::new();
|
||||
let table_size = 1 << num_bucket_power_of_2;
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
|
||||
TermHashMap {
|
||||
table: table.into_boxed_slice(),
|
||||
heap,
|
||||
mask: table_size - 1,
|
||||
occupied: Vec::with_capacity(table_size / 2),
|
||||
}
|
||||
}
|
||||
|
||||
fn probe(&self, hash: u32) -> QuadraticProbing {
|
||||
QuadraticProbing::compute(hash as usize, self.mask)
|
||||
}
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.table.len() * mem::size_of::<KeyValue>()
|
||||
}
|
||||
|
||||
fn is_saturated(&self) -> bool {
|
||||
self.table.len() < self.occupied.len() * 3
|
||||
}
|
||||
|
||||
unsafe fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
|
||||
let key_bytes_len = self.heap.read::<u16>(addr) as usize;
|
||||
let key_addr = addr.offset(2u32);
|
||||
let key_bytes: &[u8] = self.heap.read_slice(key_addr, key_bytes_len);
|
||||
let val_addr: Addr = key_addr.offset(key_bytes.len() as u32);
|
||||
(key_bytes, val_addr)
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
|
||||
self.occupied.push(bucket);
|
||||
self.table[bucket] = KeyValue {
|
||||
key_value_addr,
|
||||
hash,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Iter {
|
||||
Iter {
|
||||
inner: self.occupied.iter(),
|
||||
hashmap: &self,
|
||||
}
|
||||
}
|
||||
|
||||
fn resize(&mut self) {
|
||||
let new_len = self.table.len() * 2;
|
||||
let mask = new_len - 1;
|
||||
self.mask = mask;
|
||||
let new_table = vec![KeyValue::default(); new_len].into_boxed_slice();
|
||||
let old_table = mem::replace(&mut self.table, new_table);
|
||||
for old_pos in self.occupied.iter_mut() {
|
||||
let key_value: KeyValue = old_table[*old_pos];
|
||||
let mut probe = QuadraticProbing::compute(key_value.hash as usize, mask);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
if self.table[bucket].is_empty() {
|
||||
*old_pos = bucket;
|
||||
self.table[bucket] = key_value;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// `update` create a new entry for a given key if it does not exists
|
||||
/// or updates the existing entry.
|
||||
///
|
||||
/// The actual logic for this update is define in the the `updater`
|
||||
/// argument.
|
||||
///
|
||||
/// If the key is not present, `updater` will receive `None` and
|
||||
/// will be in charge of returning a default value.
|
||||
/// If the key already as an associated value, then it will be passed
|
||||
/// `Some(previous_value)`.
|
||||
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
|
||||
where
|
||||
S: AsRef<[u8]>,
|
||||
V: Copy,
|
||||
TMutator: FnMut(Option<V>) -> V,
|
||||
{
|
||||
if self.is_saturated() {
|
||||
self.resize();
|
||||
}
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let hash = murmurhash2::murmurhash2(key.as_ref());
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
let val = updater(None);
|
||||
let key_addr = self.heap.store(KeyBytesValue::new(key_bytes, val));
|
||||
self.set_bucket(hash, key_addr, bucket);
|
||||
return bucket as BucketId;
|
||||
} else if kv.hash == hash {
|
||||
let (key_matches, val_addr) = {
|
||||
let (stored_key, val_addr): (&[u8], Addr) =
|
||||
unsafe { self.get_key_value(kv.key_value_addr) };
|
||||
(stored_key == key_bytes, val_addr)
|
||||
};
|
||||
if key_matches {
|
||||
unsafe {
|
||||
// logic
|
||||
let v = self.heap.read(val_addr);
|
||||
let new_v = updater(Some(v));
|
||||
self.heap.write(val_addr, new_v);
|
||||
};
|
||||
return bucket as BucketId;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn bench_murmurhash2(b: &mut Bencher) {
|
||||
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
||||
b.iter(|| {
|
||||
let mut s = 0;
|
||||
for &key in &keys {
|
||||
s ^= murmurhash2(key.as_bytes());
|
||||
}
|
||||
s
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::TermHashMap;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[test]
|
||||
fn test_hash_map() {
|
||||
let mut hash_map: TermHashMap = TermHashMap::new(18);
|
||||
{
|
||||
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
3u32
|
||||
});
|
||||
}
|
||||
{
|
||||
hash_map.mutate_or_create("abcd", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
4u32
|
||||
});
|
||||
}
|
||||
{
|
||||
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, Some(3u32));
|
||||
5u32
|
||||
});
|
||||
}
|
||||
|
||||
let mut vanilla_hash_map = HashMap::new();
|
||||
let mut iter_values = hash_map.iter();
|
||||
while let Some((key, addr, _)) = iter_values.next() {
|
||||
let val: u32 = unsafe {
|
||||
// test
|
||||
hash_map.heap.read(addr)
|
||||
};
|
||||
vanilla_hash_map.insert(key.to_owned(), val);
|
||||
}
|
||||
assert_eq!(vanilla_hash_map.len(), 2);
|
||||
}
|
||||
}
|
||||
@@ -1,34 +1,25 @@
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use std::io;
|
||||
|
||||
/// `TermInfo` contains all of the information
|
||||
/// associated to terms in the `.term` file.
|
||||
///
|
||||
/// It consists of
|
||||
/// * `doc_freq` : the number of document in the segment
|
||||
/// containing this term. It is also the length of the
|
||||
/// posting list associated to this term
|
||||
/// * `postings_offset` : an offset in the `.idx` file
|
||||
/// addressing the start of the posting list associated
|
||||
/// to this term.
|
||||
/// `TermInfo` wraps the metadata associated to a Term.
|
||||
/// It is segment-local.
|
||||
#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)]
|
||||
pub struct TermInfo {
|
||||
/// Number of documents in the segment containing the term
|
||||
pub doc_freq: u32,
|
||||
/// Offset within the postings (`.idx`) file.
|
||||
/// Start offset within the postings (`.idx`) file.
|
||||
pub postings_offset: u64,
|
||||
/// Offset within the position (`.pos`) file.
|
||||
/// Start offset of the first block within the position (`.pos`) file.
|
||||
pub positions_offset: u64,
|
||||
/// Offset within the position block.
|
||||
/// Start offset within this position block.
|
||||
pub positions_inner_offset: u8,
|
||||
}
|
||||
|
||||
impl FixedSize for TermInfo {
|
||||
/// Size required for the binary serialization of `TermInfo`.
|
||||
/// This is large, but in practise, all `TermInfo` but the first one
|
||||
/// of the block are bitpacked.
|
||||
///
|
||||
/// See `TermInfoStore`.
|
||||
/// Size required for the binary serialization of a `TermInfo` object.
|
||||
/// This is large, but in practise, `TermInfo` are encoded in blocks and
|
||||
/// only the first `TermInfo` of a block is serialized uncompressed.
|
||||
/// The subsequent `TermInfo` are delta encoded and bitpacked.
|
||||
const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES + u8::SIZE_IN_BYTES;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,22 +1,20 @@
|
||||
use query::Query;
|
||||
use query::Weight;
|
||||
use query::Scorer;
|
||||
use core::Searcher;
|
||||
use core::SegmentReader;
|
||||
use docset::DocSet;
|
||||
use query::{Query, Scorer, Weight};
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use DocId;
|
||||
use core::Searcher;
|
||||
|
||||
/// Query that matches all of the documents.
|
||||
///
|
||||
/// All of the document get the score 1f32.
|
||||
#[derive(Debug)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct AllQuery;
|
||||
|
||||
impl Query for AllQuery {
|
||||
fn weight(&self, _: &Searcher, _: bool) -> Result<Box<Weight>> {
|
||||
Ok(box AllWeight)
|
||||
Ok(Box::new(AllWeight))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,29 +23,47 @@ pub struct AllWeight;
|
||||
|
||||
impl Weight for AllWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
Ok(box AllScorer {
|
||||
started: false,
|
||||
Ok(Box::new(AllScorer {
|
||||
state: State::NotStarted,
|
||||
doc: 0u32,
|
||||
max_doc: reader.max_doc(),
|
||||
})
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
enum State {
|
||||
NotStarted,
|
||||
Started,
|
||||
Finished,
|
||||
}
|
||||
|
||||
/// Scorer associated to the `AllQuery` query.
|
||||
pub struct AllScorer {
|
||||
started: bool,
|
||||
state: State,
|
||||
doc: DocId,
|
||||
max_doc: DocId,
|
||||
}
|
||||
|
||||
impl DocSet for AllScorer {
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.started {
|
||||
self.doc += 1u32;
|
||||
} else {
|
||||
self.started = true;
|
||||
match self.state {
|
||||
State::NotStarted => {
|
||||
self.state = State::Started;
|
||||
self.doc = 0;
|
||||
}
|
||||
State::Started => {
|
||||
self.doc += 1u32;
|
||||
}
|
||||
State::Finished => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if self.doc < self.max_doc {
|
||||
return true;
|
||||
} else {
|
||||
self.state = State::Finished;
|
||||
return false;
|
||||
}
|
||||
self.doc < self.max_doc
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
@@ -64,3 +80,46 @@ impl Scorer for AllScorer {
|
||||
1f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::AllQuery;
|
||||
use query::Query;
|
||||
use schema::{SchemaBuilder, TEXT};
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_all_query() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
|
||||
index_writer.add_document(doc!(field=>"aaa"));
|
||||
index_writer.add_document(doc!(field=>"bbb"));
|
||||
index_writer.commit().unwrap();
|
||||
index_writer.add_document(doc!(field=>"ccc"));
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let weight = AllQuery.weight(&searcher, false).unwrap();
|
||||
{
|
||||
let reader = searcher.segment_reader(0);
|
||||
let mut scorer = weight.scorer(reader).unwrap();
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.doc(), 0u32);
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.doc(), 1u32);
|
||||
assert!(!scorer.advance());
|
||||
}
|
||||
{
|
||||
let reader = searcher.segment_reader(1);
|
||||
let mut scorer = weight.scorer(reader).unwrap();
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.doc(), 0u32);
|
||||
assert!(!scorer.advance());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user