Mirror of https://github.com/quickwit-oss/tantivy.git
Synced 2025-12-31 06:22:54 +00:00

Compare commits (3 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | 232ca5c06c |  |
|  | e78af20375 |  |
|  | 30637f7a7f |  |
.travis.yml (154 lines changed)
@@ -1,127 +1,37 @@
|
||||
# Based on the "trust" template v0.1.2
|
||||
# https://github.com/japaric/trust/tree/v0.1.2
|
||||
|
||||
dist: trusty
|
||||
language: rust
|
||||
services: docker
|
||||
sudo: required
|
||||
|
||||
cache: cargo
|
||||
rust:
|
||||
- nightly
|
||||
env:
|
||||
global:
|
||||
- CRATE_NAME=tantivy
|
||||
|
||||
matrix:
|
||||
include:
|
||||
# Android
|
||||
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
|
||||
- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
|
||||
- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
|
||||
- env: TARGET=i686-linux-android DISABLE_TESTS=1
|
||||
- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
|
||||
|
||||
# iOS
|
||||
#- env: TARGET=aarch64-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
#- env: TARGET=armv7-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
#- env: TARGET=armv7s-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
#- env: TARGET=i386-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
- env: TARGET=x86_64-apple-ios DISABLE_TESTS=1
|
||||
os: osx
|
||||
|
||||
# Linux
|
||||
- env: TARGET=aarch64-unknown-linux-gnu
|
||||
# - env: TARGET=arm-unknown-linux-gnueabi
|
||||
# - env: TARGET=armv7-unknown-linux-gnueabihf
|
||||
- env: TARGET=i686-unknown-linux-gnu
|
||||
#- env: TARGET=i686-unknown-linux-musl
|
||||
#- env: TARGET=mips-unknown-linux-gnu
|
||||
#- env: TARGET=mips64-unknown-linux-gnuabi64
|
||||
#- env: TARGET=mips64el-unknown-linux-gnuabi64
|
||||
#- env: TARGET=mipsel-unknown-linux-gnu
|
||||
#- env: TARGET=powerpc-unknown-linux-gnu
|
||||
#- env: TARGET=powerpc64-unknown-linux-gnu
|
||||
#- env: TARGET=powerpc64le-unknown-linux-gnu
|
||||
#- env: TARGET=s390x-unknown-linux-gnu DISABLE_TESTS=1
|
||||
- env: TARGET=x86_64-unknown-linux-gnu
|
||||
- env: TARGET=x86_64-unknown-linux-musl
|
||||
|
||||
# OSX
|
||||
#- env: TARGET=i686-apple-darwin
|
||||
# os: osx
|
||||
- env: TARGET=x86_64-apple-darwin
|
||||
os: osx
|
||||
|
||||
# *BSD
|
||||
#- env: TARGET=i686-unknown-freebsd DISABLE_TESTS=1
|
||||
#- env: TARGET=x86_64-unknown-freebsd DISABLE_TESTS=1
|
||||
#- env: TARGET=x86_64-unknown-netbsd DISABLE_TESTS=1
|
||||
|
||||
# Windows
|
||||
#- env: TARGET=x86_64-pc-windows-gnu
|
||||
|
||||
# Bare metal
|
||||
# These targets don't support std and as such are likely not suitable for
|
||||
# most crates.
|
||||
# - env: TARGET=thumbv6m-none-eabi
|
||||
# - env: TARGET=thumbv7em-none-eabi
|
||||
# - env: TARGET=thumbv7em-none-eabihf
|
||||
# - env: TARGET=thumbv7m-none-eabi
|
||||
|
||||
# Testing other channels
|
||||
#- env: TARGET=x86_64-unknown-linux-gnu
|
||||
# rust: nightly
|
||||
#- env: TARGET=x86_64-apple-darwin
|
||||
# os: osx
|
||||
# rust: nightly
|
||||
|
||||
before_install:
|
||||
- set -e
|
||||
- rustup self update
|
||||
|
||||
install:
|
||||
- sh ci/install.sh
|
||||
- source ~/.cargo/env || true
|
||||
|
||||
- CC=gcc-4.8
|
||||
- CXX=g++-4.8
|
||||
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
|
||||
- secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
addons:
|
||||
apt:
|
||||
sources:
|
||||
- ubuntu-toolchain-r-test
|
||||
- kalakris-cmake
|
||||
packages:
|
||||
- gcc-4.8
|
||||
- g++-4.8
|
||||
- libcurl4-openssl-dev
|
||||
- libelf-dev
|
||||
- libdw-dev
|
||||
- binutils-dev
|
||||
- cmake
|
||||
before_script:
|
||||
- export PATH=$HOME/.cargo/bin:$PATH
|
||||
- cargo install cargo-update || echo "cargo-update already installed"
|
||||
- cargo install cargo-travis || echo "cargo-travis already installed"
|
||||
script:
|
||||
- bash ci/script.sh
|
||||
|
||||
after_script: set +e
|
||||
|
||||
before_deploy:
|
||||
- sh ci/before_deploy.sh
|
||||
#
|
||||
#deploy:
|
||||
# # - Create a `public_repo` GitHub token. Go to: https://github.com/settings/tokens/new
|
||||
# # - Encrypt it: `travis encrypt 0123456789012345678901234567890123456789
|
||||
# # - Paste the output down here
|
||||
# api_key:
|
||||
# secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
# file_glob: true
|
||||
# file: $CRATE_NAME-$TRAVIS_TAG-$TARGET.*
|
||||
# on:
|
||||
# # TODO Here you can pick which targets will generate binary releases
|
||||
# # In this example, there are some targets that are tested using the stable
|
||||
# # and nightly channels. This condition makes sure there is only one release
|
||||
# # for such targets and that's generated using the stable channel
|
||||
# condition: $TRAVIS_RUST_VERSION = stable
|
||||
# tags: true
|
||||
# provider: releases
|
||||
# skip_cleanup: true
|
||||
|
||||
cache: cargo
|
||||
before_cache:
|
||||
# Travis can't cache files that are not readable by "others"
|
||||
- chmod -R a+r $HOME/.cargo
|
||||
|
||||
#branches:
|
||||
# only:
|
||||
# # release tags
|
||||
# - /^v\d+\.\d+\.\d+.*$/
|
||||
# - master
|
||||
|
||||
notifications:
|
||||
email:
|
||||
on_success: never
|
||||
- cargo build
|
||||
- cargo test
|
||||
- cargo test -- --ignored
|
||||
- cargo run --example simple_search
|
||||
- cargo doc
|
||||
after_success:
|
||||
- cargo coveralls --exclude-pattern src/functional_test.rs
|
||||
- cargo doc-upload
|
||||
|
||||
AUTHORS (11 lines changed)
@@ -1,11 +0,0 @@
# This is the list of authors of tantivy for copyright purposes.
Paul Masurel
Laurentiu Nicola
Dru Sellers
Ashley Mannix
Michael J. Curry
Jason Wolfe
# As an employee of Google I am required to add Google LLC
# in the list of authors, but this project is not affiliated to Google
# in any other way.
Google LLC
CHANGELOG.md (37 lines changed)
@@ -1,30 +1,9 @@
Tantivy 0.6
Tantivy 0.5.2
==========================

Special thanks to @drusellers and @jason-wolfe for their contributions
to this release!

- Removed C code. Tantivy is now pure Rust. (@pmasurel)
- BM25 (@pmasurel)
- Approximate field norms encoded over 1 byte. (@pmasurel)
- Compiles on stable rust (@pmasurel)
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
  - Completely uncompressed
  - Internally: One u64 fast field for indexes, one fast field for the bytes themselves (see the sketch after this changelog).
- Add NGram token support (@drusellers)
- Add Stopword Filter support (@drusellers)
- Add a FuzzyTermQuery (@drusellers)
- Add a RegexQuery (@drusellers)
- Various performance improvements (@pmasurel)


Tantivy 0.5.2
===========================
- bugfix #274
- bugfix #280
- bugfix #289

- Removed C code. Tantivy is now pure Rust.
- BM25
- Approximate field norms encoded over 1 byte.

Tantivy 0.5.1
==========================
@@ -102,7 +81,7 @@ Tantivy 0.3
Special thanks to @Kodraus @lnicola @Ameobea @manuel-woelker @celaus
for their contribution to this release.

Thanks also to everyone in tantivy gitter chat
Thanks also to everyone in tantivy gitter chat
for their advice and company :)

https://gitter.im/tantivy-search/tantivy
@@ -110,9 +89,9 @@ https://gitter.im/tantivy-search/tantivy

Warning:

Tantivy 0.3 is NOT backward compatible with tantivy 0.2
Tantivy 0.3 is NOT backward compatible with tantivy 0.2
code and index format.
You should not expect backward compatibility before
You should not expect backward compatibility before
tantivy 1.0.

@@ -138,7 +117,7 @@ Thanks to @KodrAus ! (#108)
the natural ordering.
- Building binary targets for tantivy-cli (Thanks to @KodrAus)
- Misc invisible bug fixes, and code cleanup.
- Use
- Use
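The `&[u8]` fast-field entry above describes the layout as one u64 fast field holding per-document offsets plus one flat, uncompressed buffer holding the bytes themselves. A minimal sketch of that idea (an illustration only, not tantivy's actual code; the struct and its names are hypothetical):

    // Illustration of the two-column layout described in the changelog entry.
    // `offsets` stores, for every doc, where its value starts inside `bytes`;
    // the value of doc `d` is the slice between its offset and the next one.
    struct BytesColumn {
        offsets: Vec<u64>, // length = num_docs + 1, last entry == bytes.len()
        bytes: Vec<u8>,    // all values concatenated, uncompressed
    }

    impl BytesColumn {
        fn get(&self, doc: u32) -> &[u8] {
            let start = self.offsets[doc as usize] as usize;
            let stop = self.offsets[doc as usize + 1] as usize;
            &self.bytes[start..stop]
        }
    }

    fn main() {
        let column = BytesColumn {
            offsets: vec![0, 3, 3, 7],
            bytes: b"foobarb".to_vec(),
        };
        assert_eq!(column.get(0), b"foo");
        assert_eq!(column.get(1), b"");   // empty value is just two equal offsets
        assert_eq!(column.get(2), b"barb");
    }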
Cargo.toml (37 lines changed)
@@ -1,28 +1,23 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.6.0"
|
||||
version = "0.5.1"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
description = """Search engine library"""
|
||||
description = """Tantivy is a search engine library."""
|
||||
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
|
||||
homepage = "https://github.com/tantivy-search/tantivy"
|
||||
repository = "https://github.com/tantivy-search/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "search engine", "information", "retrieval"]
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.9.1"
|
||||
byteorder = "1.0"
|
||||
lazy_static = "0.2.1"
|
||||
tinysegmenter = "0.1.0"
|
||||
regex = "0.2"
|
||||
fst = {version="0.3", default-features=false}
|
||||
fst-regex = { version="0.2" }
|
||||
lz4 = {version="1.20", optional=true}
|
||||
snap = {version="0.2"}
|
||||
atomicwrites = {version="0.2.2", optional=true}
|
||||
tempfile = "2.1"
|
||||
fst = {version="0.2", default-features=false}
|
||||
atomicwrites = {version="0.1", optional=true}
|
||||
log = "0.3.6"
|
||||
combine = "2.2"
|
||||
tempdir = "0.3"
|
||||
@@ -31,7 +26,6 @@ serde_derive = "1.0"
|
||||
serde_json = "1.0"
|
||||
num_cpus = "1.2"
|
||||
itertools = "0.5.9"
|
||||
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
||||
bit-set = "0.4.0"
|
||||
uuid = { version = "0.6", features = ["v4", "serde"] }
|
||||
chan = "0.1"
|
||||
@@ -42,16 +36,17 @@ error-chain = "0.8"
|
||||
owning_ref = "0.3"
|
||||
stable_deref_trait = "1.0.0"
|
||||
rust-stemmers = "0.1.0"
|
||||
downcast = { version="0.9" }
|
||||
downcast = { version="0.9", features = ["nightly"]}
|
||||
matches = "0.1"
|
||||
bitpacking = "0.5"
|
||||
fnv = "1.0.6"
|
||||
snap = "0.2"
|
||||
bitpacking = {path = "../bitpacking"}
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.2"
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.3"
|
||||
tempfile = "2.1"
|
||||
env_logger = "0.4"
|
||||
|
||||
[profile.release]
|
||||
@@ -60,10 +55,12 @@ debug = false
|
||||
lto = true
|
||||
debug-assertions = false
|
||||
|
||||
|
||||
[features]
|
||||
default = ["mmap"]
|
||||
streamdict = []
|
||||
mmap = ["fst/mmap", "atomicwrites"]
|
||||
lz4-compression = ["lz4"]
|
||||
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
@@ -72,5 +69,11 @@ travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
name = "simple_search"
|
||||
required-features = ["mmap"]
|
||||
|
||||
[[example]]
|
||||
name = "custom_tokenizer"
|
||||
|
||||
[[bin]]
|
||||
name = "convert_to_static"
|
||||
path = "./bin/convert_to_static.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "test_static_dir"
|
||||
path = "./bin/test_static_dir.rs"
|
||||
LICENSE (2 lines changed)
@@ -1,4 +1,4 @@
Copyright (c) 2018 by the project authors, as listed in the AUTHORS file.
Copyright (c) 2016 Paul Masurel

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
README.md (71 lines changed)
@@ -4,50 +4,36 @@
|
||||
[](https://coveralls.io/github/tantivy-search/tantivy?branch=master)
|
||||
[](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
|
||||
[](https://ci.appveyor.com/project/fulmicoton/tantivy)
|
||||
|
||||
**Tantivy** is a **full text search engine library** written in rust.
|
||||
|
||||
It is closer to Lucene than to Elastic Search and Solr in the sense it is not
|
||||
an off-the-shelf search engine server, but rather a crate that can be used
|
||||
to build such a search engine.
|
||||
|
||||
Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
It is strongly inspired by Lucene's design.
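The collector doctests and the `examples/simple_search.rs` hunks further down in this compare view show the full API; condensed into one hedged sketch here for orientation (same 0.6-era calls that appear in this diff, trimmed for brevity, not the canonical example):

    #[macro_use]
    extern crate tantivy;
    use tantivy::collector::TopCollector;
    use tantivy::query::QueryParser;
    use tantivy::schema::{SchemaBuilder, STORED, TEXT};
    use tantivy::Index;

    fn main() { example().unwrap(); }

    fn example() -> tantivy::Result<()> {
        // Define a schema with one indexed and stored text field.
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", TEXT | STORED);
        let schema = schema_builder.build();

        // Build an in-memory index and add a couple of documents.
        let index = Index::create_in_ram(schema.clone());
        let mut index_writer = index.writer(50_000_000)?;
        index_writer.add_document(doc!(title => "The Old Man and the Sea",));
        index_writer.add_document(doc!(title => "Of Mice and Men",));
        index_writer.commit()?;

        // Reload searchers after the commit, then parse and run a query.
        index.load_searchers()?;
        let searcher = index.searcher();
        let query_parser = QueryParser::for_index(&index, vec![title]);
        let query = query_parser.parse_query("sea")?;
        let mut top_collector = TopCollector::with_limit(10);
        searcher.search(&*query, &mut top_collector)?;
        for doc_address in top_collector.docs() {
            let retrieved_doc = searcher.doc(&doc_address)?;
            println!("{}", schema.to_json(&retrieved_doc));
        }
        Ok(())
    }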
|
||||
|
||||
# Features
|
||||
|
||||
- Full-text search
|
||||
- Tiny startup time (<10ms), perfect for command line tools
|
||||
- BM25 scoring (the same as lucene)
|
||||
- Basic query language (`+michael +jackson`)
|
||||
- Phrase queries search (`"michael jackson"`)
|
||||
- tf-idf scoring
|
||||
- Basic query language
|
||||
- Phrase queries
|
||||
- Incremental indexing
|
||||
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
|
||||
- Mmap directory
|
||||
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
|
||||
- optional SIMD integer compression
|
||||
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
|
||||
- `&[u8]` fast fields
|
||||
- LZ4 compressed document store
|
||||
- Range queries
|
||||
- Faceted search
|
||||
- Configurable indexing (optional term frequency and position indexing)
- Faceting
- configurable indexing (optional term frequency and position indexing)
|
||||
- Cheesy logo with a horse
|
||||
|
||||
# Non-features
|
||||
Tantivy supports Linux, MacOS and Windows.
|
||||
|
||||
- Distributed search is not, and will not be, in the scope of tantivy.
|
||||
|
||||
|
||||
# Supported OS and compiler
|
||||
|
||||
Tantivy works on stable rust (>= 1.27) and supports Linux, MacOS and Windows.
|
||||
|
||||
# Getting started
|
||||
|
||||
- [tantivy's simple search example](http://fulmicoton.com/tantivy-examples/simple_search.html)
|
||||
- [tantivy's usage example](http://fulmicoton.com/tantivy-examples/simple_search.html)
|
||||
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli).
|
||||
`tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
|
||||
index documents and search via the CLI or a small server with a REST API.
|
||||
It will walk you through getting a wikipedia search engine up and running in a few minutes.
|
||||
- [reference doc]
|
||||
- [For the last released version](https://docs.rs/tantivy/)
|
||||
@@ -57,14 +43,43 @@ It will walk you through getting a wikipedia search engine up and running in a f
|
||||
|
||||
## Development
|
||||
|
||||
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
|
||||
To check out and run tests, you can simply run :
|
||||
Tantivy requires Rust Nightly because it uses the features [`box_syntax`](https://doc.rust-lang.org/stable/unstable-book/language-features/box-syntax.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md),
|
||||
and [simd](https://github.com/rust-lang/rust/issues/27731).
|
||||
|
||||
|
||||
To check out and run tests, you can simply run:
|
||||
|
||||
git clone git@github.com:tantivy-search/tantivy.git
|
||||
cd tantivy
|
||||
cargo build
|
||||
cargo +nightly build
|
||||
|
||||
|
||||
## Note on release build and performance
|
||||
|
||||
If your project depends on `tantivy`, for better performance, make sure to enable
|
||||
`sse3` instructions using `RUSTFLAGS`. (This instruction set is likely to
|
||||
be available on most `x86_64` CPUs you will encounter).
|
||||
|
||||
For instance,
|
||||
|
||||
RUSTFLAGS='-C target-feature=+sse3'
|
||||
|
||||
Or, if you are targeting a specific CPU:
|
||||
|
||||
RUSTFLAGS='-C target-cpu=native' cargo build --release
|
||||
|
||||
Regardless of the flags you pass, by default `tantivy` will contain `SSE3` instructions.
|
||||
If you want to disable those, you can run the following command :
|
||||
|
||||
cargo build --no-default-features
|
||||
|
||||
Alternatively, if you are trying to compile `tantivy` without simd compression,
|
||||
you can disable this functionality. In this case, this submodule is not required
|
||||
and you can compile tantivy by using the `--no-default-features` flag.
|
||||
|
||||
cargo build --no-default-features
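Whether an sse3-enabled build is safe depends on the CPUs you actually deploy to. A small sketch using the standard library's runtime feature detection (stable since Rust 1.27, matching the requirement above) to check the host before relying on such a build; this is an editorial illustration, not part of tantivy:

    fn main() {
        // The macro queries the CPU the program is running on, independently
        // of the `-C target-feature` flags the binary was compiled with.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if is_x86_feature_detected!("sse3") {
                println!("sse3 is available: an sse3-enabled build is safe on this host");
            } else {
                println!("sse3 is NOT available: consider a --no-default-features build");
            }
        }
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
        println!("non-x86 target: the sse3 flag does not apply");
    }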
|
||||
|
||||
|
||||
# Contribute
|
||||
|
||||
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
|
||||
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
|
||||
@@ -6,6 +6,9 @@ environment:
|
||||
matrix:
|
||||
- channel: nightly
|
||||
target: x86_64-pc-windows-msvc
|
||||
- channel: nightly
|
||||
target: x86_64-pc-windows-gnu
|
||||
msys_bits: 64
|
||||
|
||||
install:
|
||||
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
|
||||
|
||||
bin/convert_to_static.rs (new file, 20 lines)
@@ -0,0 +1,20 @@
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
extern crate tantivy;
|
||||
use tantivy::directory::write_static_from_directory;
|
||||
|
||||
fn main() {
|
||||
// Read the directory path and output file path from the command line arguments
|
||||
let mut args = env::args();
|
||||
args.next().unwrap();
|
||||
let directory_path = args.next().expect("Expect 2 args: <directory_path> <outputfile>");
let output_path = args.next().expect("Expect 2 args: <directory_path> <outputfile>");
|
||||
println!("{} => {}", directory_path, output_path);
|
||||
let buffer = write_static_from_directory(&PathBuf::from(directory_path)).unwrap();
|
||||
println!("Read all");
|
||||
let mut output = File::create(output_path).unwrap();
|
||||
output.write_all(&buffer[..]).unwrap();
|
||||
output.flush().unwrap();
|
||||
}
|
||||
bin/test_static_dir.rs (new file, 51 lines)
@@ -0,0 +1,51 @@
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
extern crate tantivy;
|
||||
use tantivy::directory::{StaticDirectory, write_static_from_directory};
|
||||
use tantivy::Index;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::collector::TopCollector;
|
||||
|
||||
|
||||
static DATA: &'static [u8] = include_bytes!("output.bin");
|
||||
|
||||
fn run() -> tantivy::Result<()> {
|
||||
// Open the index embedded in the binary and run a test query against it
|
||||
let directory = StaticDirectory::open(DATA).unwrap();
|
||||
let index = Index::open_directory(directory).unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
|
||||
let schema = index.schema();
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
let query = query_parser.parse_query("sea whale")?;
|
||||
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
|
||||
searcher.search(&*query, &mut top_collector)?;
|
||||
|
||||
let doc_addresses = top_collector.docs();
|
||||
|
||||
// The actual documents still need to be
|
||||
// retrieved from Tantivy's store.
|
||||
//
|
||||
// Since the body field was not configured as stored,
|
||||
// the document returned will only contain
|
||||
// a title.
|
||||
|
||||
for doc_address in doc_addresses {
|
||||
let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
fn main() {
|
||||
run().unwrap();
|
||||
}
|
||||
@@ -1,23 +0,0 @@
|
||||
# This script takes care of packaging the build artifacts that will go in the
|
||||
# release zipfile
|
||||
|
||||
$SRC_DIR = $PWD.Path
|
||||
$STAGE = [System.Guid]::NewGuid().ToString()
|
||||
|
||||
Set-Location $ENV:Temp
|
||||
New-Item -Type Directory -Name $STAGE
|
||||
Set-Location $STAGE
|
||||
|
||||
$ZIP = "$SRC_DIR\$($Env:CRATE_NAME)-$($Env:APPVEYOR_REPO_TAG_NAME)-$($Env:TARGET).zip"
|
||||
|
||||
# TODO Update this to package the right artifacts
|
||||
Copy-Item "$SRC_DIR\target\$($Env:TARGET)\release\hello.exe" '.\'
|
||||
|
||||
7z a "$ZIP" *
|
||||
|
||||
Push-AppveyorArtifact "$ZIP"
|
||||
|
||||
Remove-Item *.* -Force
|
||||
Set-Location ..
|
||||
Remove-Item $STAGE
|
||||
Set-Location $SRC_DIR
|
||||
@@ -1,33 +0,0 @@
|
||||
# This script takes care of building your crate and packaging it for release
|
||||
|
||||
set -ex
|
||||
|
||||
main() {
|
||||
local src=$(pwd) \
|
||||
stage=
|
||||
|
||||
case $TRAVIS_OS_NAME in
|
||||
linux)
|
||||
stage=$(mktemp -d)
|
||||
;;
|
||||
osx)
|
||||
stage=$(mktemp -d -t tmp)
|
||||
;;
|
||||
esac
|
||||
|
||||
test -f Cargo.lock || cargo generate-lockfile
|
||||
|
||||
# TODO Update this to build the artifacts that matter to you
|
||||
cross rustc --bin hello --target $TARGET --release -- -C lto
|
||||
|
||||
# TODO Update this to package the right artifacts
|
||||
cp target/$TARGET/release/hello $stage/
|
||||
|
||||
cd $stage
|
||||
tar czf $src/$CRATE_NAME-$TRAVIS_TAG-$TARGET.tar.gz *
|
||||
cd $src
|
||||
|
||||
rm -rf $stage
|
||||
}
|
||||
|
||||
main
|
||||
@@ -1,47 +0,0 @@
|
||||
set -ex
|
||||
|
||||
main() {
|
||||
local target=
|
||||
if [ $TRAVIS_OS_NAME = linux ]; then
|
||||
target=x86_64-unknown-linux-musl
|
||||
sort=sort
|
||||
else
|
||||
target=x86_64-apple-darwin
|
||||
sort=gsort # for `sort --sort-version`, from brew's coreutils.
|
||||
fi
|
||||
|
||||
# Builds for iOS are done on OSX, but require the specific target to be
|
||||
# installed.
|
||||
case $TARGET in
|
||||
aarch64-apple-ios)
|
||||
rustup target install aarch64-apple-ios
|
||||
;;
|
||||
armv7-apple-ios)
|
||||
rustup target install armv7-apple-ios
|
||||
;;
|
||||
armv7s-apple-ios)
|
||||
rustup target install armv7s-apple-ios
|
||||
;;
|
||||
i386-apple-ios)
|
||||
rustup target install i386-apple-ios
|
||||
;;
|
||||
x86_64-apple-ios)
|
||||
rustup target install x86_64-apple-ios
|
||||
;;
|
||||
esac
|
||||
|
||||
# This fetches latest stable release
|
||||
local tag=$(git ls-remote --tags --refs --exit-code https://github.com/japaric/cross \
|
||||
| cut -d/ -f3 \
|
||||
| grep -E '^v[0.1.0-9.]+$' \
|
||||
| $sort --version-sort \
|
||||
| tail -n1)
|
||||
curl -LSfs https://japaric.github.io/trust/install.sh | \
|
||||
sh -s -- \
|
||||
--force \
|
||||
--git japaric/cross \
|
||||
--tag $tag \
|
||||
--target $target
|
||||
}
|
||||
|
||||
main
|
||||
ci/script.sh (23 lines changed)
@@ -1,23 +0,0 @@
|
||||
# This script takes care of testing your crate
|
||||
|
||||
set -ex
|
||||
|
||||
main() {
|
||||
cross build --target $TARGET
|
||||
cross build --target $TARGET --release
|
||||
|
||||
if [ ! -z $DISABLE_TESTS ]; then
|
||||
return
|
||||
fi
|
||||
|
||||
cross test --target $TARGET
|
||||
# cross test --target $TARGET --release
|
||||
|
||||
# cross run --target $TARGET
|
||||
# cross run --target $TARGET --release
|
||||
}
|
||||
|
||||
# we don't run the "test phase" when doing deploys
|
||||
if [ -z $TRAVIS_TAG ]; then
|
||||
main
|
||||
fi
|
||||
@@ -1,226 +0,0 @@
|
||||
extern crate tantivy;
|
||||
extern crate tempdir;
|
||||
|
||||
#[macro_use]
|
||||
extern crate serde_json;
|
||||
|
||||
use std::path::Path;
|
||||
use tantivy::collector::TopCollector;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::tokenizer::NgramTokenizer;
|
||||
use tantivy::Index;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn main() {
|
||||
// Let's create a temporary directory for the
|
||||
// sake of this example
|
||||
if let Ok(dir) = TempDir::new("tantivy_token_example_dir") {
|
||||
run_example(dir.path()).unwrap();
|
||||
dir.close().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
//
|
||||
// The Tantivy index requires a very strict schema.
|
||||
// The schema declares which fields are in the index,
|
||||
// and for each field, its type and "the way it should
|
||||
// be indexed".
|
||||
|
||||
// first we need to define a schema ...
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
// Our first field is title.
|
||||
// In this example we want to use NGram searching
|
||||
// we will set that to 3 characters, so any three
|
||||
// char in the title should be findable.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("ngram3")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("title", text_options);
|
||||
|
||||
// Our second field is body.
|
||||
// We want full-text search for it, but we do not
|
||||
// need to be able to retrieve it
|
||||
// for our application.
|
||||
//
|
||||
// We can make our index lighter by
// omitting the `STORED` flag.
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// # Indexing documents
|
||||
//
|
||||
// Let's create a brand new index.
|
||||
//
|
||||
// This will actually just save a meta.json
|
||||
// with our schema in the directory.
|
||||
let index = Index::create_in_dir(index_path, schema.clone())?;
|
||||
|
||||
// here we are registering our custom tokenizer
|
||||
// this will store tokens of 3 characters each
|
||||
index
|
||||
.tokenizers()
|
||||
.register("ngram3", NgramTokenizer::new(3, 3, false));
|
||||
|
||||
// To insert document we need an index writer.
|
||||
// There must be only one writer at a time.
|
||||
// This single `IndexWriter` is already
|
||||
// multithreaded.
|
||||
//
|
||||
// Here we use a buffer of 50MB per thread. Using a bigger
|
||||
// heap for the indexer can increase its throughput.
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
// Let's index our documents!
|
||||
// We first need a handle on the title and the body field.
|
||||
|
||||
// ### Create a document "manually".
|
||||
//
|
||||
// We can create a document manually, by setting the fields
|
||||
// one by one in a Document object.
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
|
||||
let mut old_man_doc = Document::default();
|
||||
old_man_doc.add_text(title, "The Old Man and the Sea");
|
||||
old_man_doc.add_text(
|
||||
body,
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish.",
|
||||
);
|
||||
|
||||
// ... and add it to the `IndexWriter`.
|
||||
index_writer.add_document(old_man_doc);
|
||||
|
||||
// ### Create a document directly from json.
|
||||
//
|
||||
// Alternatively, we can use our schema to parse a
|
||||
// document object directly from json.
|
||||
// The document is a string, but we use the `json` macro
|
||||
// from `serde_json` for the convenience of multi-line support.
|
||||
let json = json!({
|
||||
"title": "Of Mice and Men",
|
||||
"body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
});
|
||||
let mice_and_men_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(mice_and_men_doc);
|
||||
|
||||
// Multi-valued field are allowed, they are
|
||||
// expressed in JSON by an array.
|
||||
// The following document has two titles.
|
||||
let json = json!({
|
||||
"title": ["Frankenstein", "The Modern Prometheus"],
|
||||
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
});
|
||||
let frankenstein_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(frankenstein_doc);
|
||||
|
||||
// This is an example, so we will only index 3 documents
|
||||
// here. You can check out tantivy's tutorial to index
|
||||
// the English wikipedia. Tantivy's indexing is rather fast.
|
||||
// Indexing 5 million articles of the English wikipedia takes
|
||||
// around 4 minutes on my computer!
|
||||
|
||||
// ### Committing
|
||||
//
|
||||
// At this point our documents are not searchable.
|
||||
//
|
||||
//
|
||||
// We need to call .commit() explicitly to force the
|
||||
// index_writer to finish processing the documents in the queue,
|
||||
// flush the current index to the disk, and advertise
|
||||
// the existence of new documents.
|
||||
//
|
||||
// This call is blocking.
|
||||
index_writer.commit()?;
|
||||
|
||||
// If `.commit()` returns correctly, then all of the
|
||||
// documents that have been added are guaranteed to be
|
||||
// persistently indexed.
|
||||
//
|
||||
// In the scenario of a crash or a power failure,
|
||||
// tantivy behaves as if it had rolled back to its last
|
||||
// commit.
|
||||
|
||||
// # Searching
|
||||
//
|
||||
// Let's search our index. Start by reloading
|
||||
// searchers in the index. This should be done
|
||||
// after every commit().
|
||||
index.load_searchers()?;
|
||||
|
||||
// Afterwards create one (or more) searchers.
|
||||
//
|
||||
// You should create a searcher
|
||||
// every time you start a "search query".
|
||||
let searcher = index.searcher();
|
||||
|
||||
// The query parser can interpret human queries.
|
||||
// Here, if the user does not specify which
|
||||
// field they want to search, tantivy will search
|
||||
// in both title and body.
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
|
||||
// here we want to get a hit on the 'ken' in Frankenstein
|
||||
let query = query_parser.parse_query("ken")?;
|
||||
|
||||
// A query defines a set of documents, as
|
||||
// well as the way they should be scored.
|
||||
//
|
||||
// A query created by the query parser is scored according
|
||||
// to a metric called Tf-Idf, and will consider
|
||||
// any document matching at least one of our terms.
|
||||
|
||||
// ### Collectors
|
||||
//
|
||||
// We are not interested in all of the documents but
|
||||
// only in the top 10. Keeping track of our top 10 best documents
|
||||
// is the role of the TopCollector.
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
|
||||
// We can now perform our query.
|
||||
searcher.search(&*query, &mut top_collector)?;
|
||||
|
||||
// Our top collector now contains the 10
|
||||
// most relevant doc ids...
|
||||
let doc_addresses = top_collector.docs();
|
||||
|
||||
// The actual documents still need to be
|
||||
// retrieved from Tantivy's store.
|
||||
//
|
||||
// Since the body field was not configured as stored,
|
||||
// the document returned will only contain
|
||||
// a title.
|
||||
|
||||
for doc_address in doc_addresses {
|
||||
let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
|
||||
// Wait for indexing and merging threads to shut down.
|
||||
// Usually this isn't needed, but in `main` we try to
|
||||
// delete the temporary directory and that fails on
|
||||
// Windows if the files are still open.
|
||||
index_writer.wait_merging_threads()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -5,11 +5,11 @@ extern crate tempdir;
|
||||
extern crate serde_json;
|
||||
|
||||
use std::path::Path;
|
||||
use tempdir::TempDir;
|
||||
use tantivy::Index;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::collector::TopCollector;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn main() {
|
||||
// Let's create a temporary directory for the
|
||||
@@ -64,7 +64,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
//
|
||||
// This will actually just save a meta.json
|
||||
// with our schema in the directory.
|
||||
let index = Index::create_in_dir(index_path, schema.clone())?;
|
||||
let index = Index::create(index_path, schema.clone())?;
|
||||
|
||||
// To insert document we need an index writer.
|
||||
// There must be only one writer at a time.
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use collector::Collector;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use collector::Collector;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
/// Collector that does nothing.
|
||||
/// This is used in the chain Collector and will hopefully
|
||||
@@ -25,57 +25,6 @@ impl Collector for DoNothingCollector {
|
||||
/// Zero-cost abstraction used to collect on multiple collectors.
|
||||
/// This contraption is only usable if the type of your collectors
|
||||
/// is known at compile time.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, chain};
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct ChainedCollector<Left: Collector, Right: Collector> {
|
||||
left: Left,
|
||||
right: Right,
|
||||
|
||||
@@ -1,59 +1,12 @@
|
||||
use super::Collector;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use Result;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
|
||||
/// `CountCollector` collector only counts how many
|
||||
/// documents match the query.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::CountCollector;
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut count_collector).unwrap();
|
||||
///
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Default)]
|
||||
pub struct CountCollector {
|
||||
count: usize,
|
||||
|
||||
@@ -1,25 +1,27 @@
|
||||
use std::mem;
|
||||
use collector::Collector;
|
||||
use docset::SkipResult;
|
||||
use fastfield::FacetReader;
|
||||
use schema::Facet;
|
||||
use schema::Field;
|
||||
use std::cell::UnsafeCell;
|
||||
use std::collections::btree_map;
|
||||
use schema::Facet;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::Bound;
|
||||
use std::iter::Peekable;
|
||||
use std::mem;
|
||||
use std::{u64, usize};
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermStreamer;
|
||||
use termdict::TermStreamerBuilder;
|
||||
use std::collections::BTreeSet;
|
||||
use termdict::TermMerger;
|
||||
use docset::SkipResult;
|
||||
use std::{usize, u64};
|
||||
use std::iter::Peekable;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
struct Hit<'a> {
|
||||
count: u64,
|
||||
@@ -342,19 +344,16 @@ impl FacetCollector {
|
||||
pub fn harvest(mut self) -> FacetCounts {
|
||||
self.finalize_segment();
|
||||
|
||||
let collapsed_facet_ords: Vec<&[u64]> = self
|
||||
.segment_counters
|
||||
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_ords[..])
|
||||
.collect();
|
||||
let collapsed_facet_counts: Vec<&[u64]> = self
|
||||
.segment_counters
|
||||
let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_counts[..])
|
||||
.collect();
|
||||
|
||||
let facet_streams = self
|
||||
.segment_counters
|
||||
let facet_streams = self.segment_counters
|
||||
.iter()
|
||||
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
|
||||
.collect::<Vec<_>>();
|
||||
@@ -382,10 +381,8 @@ impl FacetCollector {
|
||||
})
|
||||
.sum();
|
||||
if count > 0u64 {
|
||||
let bytes: Vec<u8> = facet_merger.key().to_owned();
|
||||
// may create a corrupted facet if the term dictionary is corrupted
|
||||
let facet = unsafe { Facet::from_encoded(bytes) };
|
||||
facet_counts.insert(facet, count);
|
||||
let bytes = facet_merger.key().to_owned();
|
||||
facet_counts.insert(Facet::from_encoded(bytes), count);
|
||||
}
|
||||
}
|
||||
FacetCounts { facet_counts }
|
||||
@@ -405,8 +402,7 @@ impl Collector for FacetCollector {
|
||||
|
||||
fn collect(&mut self, doc: DocId, _: Score) {
|
||||
let facet_reader: &mut FacetReader = unsafe {
|
||||
&mut *self
|
||||
.ff_reader
|
||||
&mut *self.ff_reader
|
||||
.as_ref()
|
||||
.expect("collect() was called before set_segment. This should never happen.")
|
||||
.get()
|
||||
@@ -436,20 +432,9 @@ pub struct FacetCounts {
|
||||
facet_counts: BTreeMap<Facet, u64>,
|
||||
}
|
||||
|
||||
pub struct FacetChildIterator<'a> {
|
||||
underlying: btree_map::Range<'a, Facet, u64>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for FacetChildIterator<'a> {
|
||||
type Item = (&'a Facet, u64);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.underlying.next().map(|(facet, count)| (facet, *count))
|
||||
}
|
||||
}
|
||||
|
||||
impl FacetCounts {
|
||||
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator
|
||||
#[allow(needless_lifetimes)] //< compiler fails if we remove the lifetime
|
||||
pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator<Item = (&'a Facet, u64)>
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
@@ -458,13 +443,15 @@ impl FacetCounts {
|
||||
let right_bound = if facet.is_root() {
|
||||
Bound::Unbounded
|
||||
} else {
|
||||
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
|
||||
let mut facet_after_bytes = facet.encoded_bytes().to_owned();
|
||||
facet_after_bytes.push(1u8);
|
||||
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
|
||||
let facet_after = Facet::from_encoded(facet_after_bytes);
|
||||
Bound::Excluded(facet_after)
|
||||
};
|
||||
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
|
||||
FacetChildIterator { underlying }
|
||||
|
||||
self.facet_counts
|
||||
.range((left_bound, right_bound))
|
||||
.map(|(facet, count)| (facet, *count))
|
||||
}
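The right-bound trick above (appending a `1u8` to the encoded facet bytes to obtain an exclusive upper bound covering all of its descendants) generalizes to any `BTreeMap` keyed by encoded byte strings. A standalone sketch of that pattern, assuming a `0` separator byte between path segments (the separator value is an assumption for the example, not read from this diff; this is not the tantivy code):

    use std::collections::BTreeMap;
    use std::ops::Bound;

    // Iterate over the strict descendants of an encoded facet in a BTreeMap.
    // Assumption: path segments are separated by a 0 byte, so `prefix` followed
    // by a 1 byte sorts just after every descendant of `prefix` and works as an
    // exclusive right bound.
    fn descendants<'a>(
        counts: &'a BTreeMap<Vec<u8>, u64>,
        prefix: &[u8],
    ) -> impl Iterator<Item = (&'a Vec<u8>, u64)> {
        let left = Bound::Excluded(prefix.to_owned());
        let mut upper = prefix.to_owned();
        upper.push(1u8);
        counts
            .range((left, Bound::Excluded(upper)))
            .map(|(facet, count)| (facet, *count))
    }

    fn main() {
        let mut counts = BTreeMap::new();
        counts.insert(b"country".to_vec(), 10u64);
        counts.insert(b"country\0france".to_vec(), 4);
        counts.insert(b"country\0japan".to_vec(), 6);
        counts.insert(b"size".to_vec(), 3);
        // Only the two children of /country fall in range; "size" and the
        // parent entry itself are excluded by the bounds.
        assert_eq!(descendants(&counts, b"country").count(), 2);
    }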
|
||||
|
||||
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
|
||||
@@ -496,13 +483,14 @@ impl FacetCounts {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{FacetCollector, FacetCounts};
|
||||
use test::Bencher;
|
||||
use core::Index;
|
||||
use query::AllQuery;
|
||||
use rand::{thread_rng, Rng};
|
||||
use schema::Field;
|
||||
use schema::{Document, Facet, SchemaBuilder};
|
||||
use query::AllQuery;
|
||||
use super::{FacetCollector, FacetCounts};
|
||||
use std::iter;
|
||||
use schema::Field;
|
||||
use rand::{thread_rng, Rng};
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_drilldown() {
|
||||
@@ -511,7 +499,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let num_facets: usize = 3 * 4 * 5;
|
||||
let facets: Vec<Facet> = (0..num_facets)
|
||||
.map(|mut n| {
|
||||
@@ -557,10 +545,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(
|
||||
expected = "Tried to add a facet which is a descendant of \
|
||||
an already added facet."
|
||||
)]
|
||||
#[should_panic(expected = "Tried to add a facet which is a descendant of \
|
||||
an already added facet.")]
|
||||
fn test_misused_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||
facet_collector.add_facet(Facet::from("/country"));
|
||||
@@ -591,7 +577,7 @@ mod tests {
|
||||
.collect();
|
||||
thread_rng().shuffle(&mut docs[..]);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
@@ -618,19 +604,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use collector::FacetCollector;
|
||||
use query::AllQuery;
|
||||
use rand::{thread_rng, Rng};
|
||||
use schema::Facet;
|
||||
use schema::SchemaBuilder;
|
||||
use test::Bencher;
|
||||
use Index;
|
||||
|
||||
#[bench]
|
||||
fn bench_facet_collector(b: &mut Bencher) {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
@@ -648,7 +621,7 @@ mod bench {
|
||||
// 40425 docs
|
||||
thread_rng().shuffle(&mut docs[..]);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
|
||||
@@ -2,11 +2,11 @@
|
||||
Defines how the documents matching a search query should be processed.
|
||||
*/
|
||||
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use DocId;
|
||||
use Score;
|
||||
use Result;
|
||||
|
||||
mod count_collector;
|
||||
pub use self::count_collector::CountCollector;
|
||||
@@ -21,7 +21,7 @@ mod facet_collector;
|
||||
pub use self::facet_collector::FacetCollector;
|
||||
|
||||
mod chained_collector;
|
||||
pub use self::chained_collector::{chain, ChainedCollector};
|
||||
pub use self::chained_collector::chain;
|
||||
|
||||
/// Collectors are in charge of collecting and retaining relevant
|
||||
/// information from the document found and scored by the query.
|
||||
@@ -89,13 +89,13 @@ impl<'a, C: Collector> Collector for &'a mut C {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use core::SegmentReader;
|
||||
use fastfield::BytesFastFieldReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
use test::Bencher;
|
||||
use DocId;
|
||||
use Score;
|
||||
use core::SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
|
||||
/// Stores all of the doc ids.
|
||||
/// This collector is only used for tests.
|
||||
@@ -186,52 +186,6 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// Collects in order all of the fast field bytes for all of the
|
||||
/// docs in the `DocSet`
|
||||
///
|
||||
/// This collector is mainly useful for tests.
|
||||
pub struct BytesFastFieldTestCollector {
|
||||
vals: Vec<u8>,
|
||||
field: Field,
|
||||
ff_reader: Option<BytesFastFieldReader>,
|
||||
}
|
||||
|
||||
impl BytesFastFieldTestCollector {
|
||||
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
|
||||
BytesFastFieldTestCollector {
|
||||
vals: Vec::new(),
|
||||
field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn vals(self) -> Vec<u8> {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for BytesFastFieldTestCollector {
|
||||
fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let val = self.ff_reader.as_ref().unwrap().get_val(doc);
|
||||
self.vals.extend(val);
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use collector::{Collector, CountCollector};
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
|
||||
@@ -1,66 +1,14 @@
|
||||
use super::Collector;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use Result;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
|
||||
/// Multicollector makes it possible to collect on more than one collector.
|
||||
/// It should only be used for use cases where the Collector type is unknown
|
||||
/// at compile time.
|
||||
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, MultiCollector};
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors =
|
||||
/// MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct MultiCollector<'a> {
|
||||
collectors: Vec<&'a mut Collector>,
|
||||
}
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use super::Collector;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use DocAddress;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use DocAddress;
|
||||
use Result;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::cmp::Ordering;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
// Rust heap is a max-heap and we need a min heap.
|
||||
#[derive(Clone, Copy)]
|
||||
@@ -43,61 +43,7 @@ impl Eq for GlobalScoredDoc {}
|
||||
/// with the best scores.
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result, DocId, Score};
|
||||
/// use tantivy::collector::TopCollector;
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut top_collector).unwrap();
|
||||
///
|
||||
/// let score_docs: Vec<(Score, DocId)> = top_collector
|
||||
/// .score_docs()
|
||||
/// .into_iter()
|
||||
/// .map(|(score, doc_address)| (score, doc_address.doc()))
|
||||
/// .collect();
|
||||
///
|
||||
/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
/// The theoretical complexity is `O(n log K)`.
|
||||
pub struct TopCollector {
|
||||
limit: usize,
|
||||
heap: BinaryHeap<GlobalScoredDoc>,
|
||||
@@ -161,13 +107,11 @@ impl Collector for TopCollector {
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
let limit_doc: GlobalScoredDoc = *self
|
||||
.heap
|
||||
let limit_doc: GlobalScoredDoc = *self.heap
|
||||
.peek()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
if limit_doc.score < score {
|
||||
let mut mut_head = self
|
||||
.heap
|
||||
let mut mut_head = self.heap
|
||||
.peek_mut()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
mut_head.score = score;
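The `peek()`/`peek_mut()` logic above is the classic bounded top-K pattern: keep a min-heap of the K best hits, and let an incoming document displace only the current worst one. A standalone sketch of the same technique with `std::collections::BinaryHeap` and `Reverse` (integer scores are used purely to keep the example `Ord`; tantivy's `Score` is an f32 handled by a custom ordering, and this is not the tantivy code):

    use std::cmp::Reverse;
    use std::collections::BinaryHeap;

    // Keep the `limit` highest-scoring (score, doc) pairs out of a stream in
    // O(n log limit). `Reverse` turns std's max-heap into a min-heap, so the
    // worst retained hit sits at the top and can be compared and replaced
    // cheaply, mirroring the peek()/peek_mut() logic above.
    fn top_k(hits: impl Iterator<Item = (u64, u32)>, limit: usize) -> Vec<(u64, u32)> {
        let mut heap: BinaryHeap<Reverse<(u64, u32)>> = BinaryHeap::with_capacity(limit);
        for (score, doc) in hits {
            if heap.len() < limit {
                heap.push(Reverse((score, doc)));
            } else if let Some(mut worst) = heap.peek_mut() {
                if (worst.0).0 < score {
                    *worst = Reverse((score, doc));
                }
            }
        }
        let mut result: Vec<(u64, u32)> = heap.into_iter().map(|Reverse(hit)| hit).collect();
        result.sort_by(|a, b| b.cmp(a)); // best first
        result
    }

    fn main() {
        let hits = vec![(3, 0), (7, 1), (1, 2), (9, 3), (5, 4)].into_iter();
        assert_eq!(top_k(hits, 2), vec![(9, 3), (7, 1)]);
    }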
|
||||
@@ -191,9 +135,9 @@ impl Collector for TopCollector {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::Collector;
|
||||
use DocId;
|
||||
use Score;
|
||||
use collector::Collector;
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity() {
|
||||
@@ -242,5 +186,4 @@ mod tests {
|
||||
fn test_top_0() {
|
||||
TopCollector::with_limit(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use common::serialize::BinarySerializable;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::io;
|
||||
use common::serialize::BinarySerializable;
|
||||
use std::mem;
|
||||
use std::ops::Deref;
|
||||
use std::ptr;
|
||||
@@ -46,7 +46,7 @@ impl BitPacker {
|
||||
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
if self.mini_buffer_written > 0 {
|
||||
let num_bytes = (self.mini_buffer_written + 7) / 8;
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
|
||||
output.write_all(&arr[..num_bytes])?;
|
||||
self.mini_buffer_written = 0;
|
||||
}
|
||||
@@ -98,14 +98,30 @@ where
|
||||
let addr_in_bits = idx * num_bits;
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
"The fast field data should have been padded with 7 bytes."
);
|
||||
let val_unshifted_unmasked: u64 =
|
||||
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
if cfg!(feature = "simdcompression") {
|
||||
// for simdcompression,
|
||||
// the bitpacker is only used for fastfields,
|
||||
// and we expect them to be always padded.
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
"The fast field data should have been padded with 7 bytes."
);
|
||||
let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
} else {
|
||||
let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
|
||||
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }
|
||||
} else {
|
||||
let mut buffer = [0u8; 8];
|
||||
for i in addr..data.len() {
|
||||
buffer[i - addr] = data[i];
|
||||
}
|
||||
unsafe { ptr::read_unaligned(buffer[..].as_ptr() as *const u64) }
|
||||
};
|
||||
let val_shifted = val_unshifted_unmasked >> (bit_shift as u64);
|
||||
val_shifted & mask
|
||||
}
|
||||
}
|
||||
|
||||
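The read path above can be summarized without `unsafe`. A minimal sketch, assuming the 7-byte padding the `debug_assert` checks and restricting `num_bits` to at most 56 so a value never spans more than the 8 bytes loaded:

```rust
/// Values of `num_bits` bits each are laid out back to back. To read value
/// `idx`, locate the byte holding its first bit, load 8 little-endian bytes,
/// then shift and mask.
fn get_packed(data: &[u8], idx: usize, num_bits: usize) -> u64 {
    assert!(num_bits > 0 && num_bits <= 56);
    let mask = (1u64 << num_bits) - 1;
    let addr_in_bits = idx * num_bits;
    let addr = addr_in_bits >> 3;     // byte containing the first bit
    let bit_shift = addr_in_bits & 7; // bit offset inside that byte
    let mut word = [0u8; 8];
    word.copy_from_slice(&data[addr..addr + 8]); // relies on the padding
    (u64::from_le_bytes(word) >> bit_shift) & mask
}
```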
/// Reads a range of values from the fast field.
|
||||
@@ -125,8 +141,7 @@ where
|
||||
for output_val in output.iter_mut() {
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
let val_unshifted_unmasked: u64 =
|
||||
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
*output_val = val_shifted & mask;
|
||||
addr_in_bits += num_bits;
|
||||
|
||||
@@ -202,14 +202,15 @@ impl BitSet {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
extern crate test;
use super::BitSet;
use super::TinySet;
use docset::DocSet;
use query::BitSetDocSet;
use std::collections::BTreeSet;
use std::collections::HashSet;
use tests;
use tests::generate_nonunique_unsorted;
|
||||
#[test]
|
||||
fn test_tiny_set() {
|
||||
@@ -352,14 +353,6 @@ mod tests {
|
||||
assert!(!bitset.contains(el));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use test;
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_pop(b: &mut test::Bencher) {
|
||||
@@ -392,4 +385,5 @@ mod bench {
|
||||
fn bench_bitset_initialize(b: &mut test::Bencher) {
|
||||
b.iter(|| BitSet::with_max_value(1_000_000));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use common::BinarySerializable;
use common::CountingWriter;
use common::VInt;
use directory::ReadOnlySource;
use directory::WritePtr;
use schema::Field;
use std::collections::HashMap;
use std::io::Write;
use std::io::{self, Read};
|
||||
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
|
||||
pub struct FileAddr {
|
||||
@@ -30,7 +30,10 @@ impl BinarySerializable for FileAddr {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let field = Field::deserialize(reader)?;
|
||||
let idx = VInt::deserialize(reader)?.0 as usize;
|
||||
Ok(FileAddr { field, idx })
}
|
||||
}
|
||||
|
||||
@@ -72,8 +75,7 @@ impl<W: Write> CompositeWrite<W> {
|
||||
let footer_offset = self.write.written_bytes();
|
||||
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
|
||||
|
||||
let mut offset_fields: Vec<_> = self.offsets
.iter()
|
||||
.map(|(file_addr, offset)| (*offset, *file_addr))
|
||||
.collect();
|
||||
@@ -164,7 +166,7 @@ impl CompositeFile {
|
||||
/// to a given `Field` and stored in a `CompositeFile`.
|
||||
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<ReadOnlySource> {
|
||||
self.offsets_index
|
||||
.get(&FileAddr { field, idx })
.map(|&(from, to)| self.data.slice(from, to))
|
||||
}
|
||||
}
|
||||
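A hypothetical, stripped-down picture of what the lookup above relies on: a composite file is one shared data blob plus a footer map from `(field, idx)` to a byte range, so opening a sub-file is a map lookup followed by a slice. Names and types below are illustrative only:

```rust
use std::collections::HashMap;

struct TinyComposite {
    data: Vec<u8>,
    offsets: HashMap<(u32, usize), (usize, usize)>, // (field, idx) -> (from, to)
}

impl TinyComposite {
    /// Mirrors `open_read_with_idx`: look up the range, return the slice.
    fn open_read_with_idx(&self, field: u32, idx: usize) -> Option<&[u8]> {
        self.offsets
            .get(&(field, idx))
            .map(|&(from, to)| &self.data[from..to])
    }
}
```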
@@ -172,12 +174,12 @@ impl CompositeFile {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::{CompositeFile, CompositeWrite};
use common::BinarySerializable;
use common::VInt;
use directory::{Directory, RAMDirectory};
use schema::Field;
use std::io::Write;
use std::path::Path;
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::io;
use std::io::Write;
|
||||
pub struct CountingWriter<W> {
|
||||
underlying: W,
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
pub mod bitpacker;
mod bitset;
mod composite_file;
mod counting_writer;
mod serialize;
mod vint;

pub use self::bitset::BitSet;
pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::counting_writer::CountingWriter;
pub use self::serialize::{BinarySerializable, FixedSize};
pub use self::vint::VInt;
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
use std::io;
|
||||
@@ -104,8 +104,8 @@ pub fn u64_to_i64(val: u64) -> i64 {
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
|
||||
pub use super::serialize::test::fixed_size_test;
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
|
||||
fn test_i64_converter_helper(val: i64) {
|
||||
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt};
use common::Endianness;
use common::VInt;
use std::fmt;
use std::io;
use std::io::Read;
use std::io::Write;
|
||||
/// Trait for a simple binary serialization.
|
||||
pub trait BinarySerializable: fmt::Debug + Sized {
|
||||
@@ -135,8 +135,8 @@ impl BinarySerializable for String {
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
|
||||
use super::*;
use common::VInt;
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::BinarySerializable;
use std::io;
use std::io::Read;
use std::io::Write;
|
||||
/// Wrapper over a `u64` that serializes as a variable int.
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
|
||||
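The `VInt` wrapper above is a variable-length integer. A hedged sketch of the usual scheme such a type implements (7 payload bits per byte, continuation flag in the high bit; tantivy's exact stop-bit convention may differ):

```rust
/// Write `val` as a varint: low 7 bits per byte, high bit set on every
/// byte except the last.
fn write_vint(mut val: u64, out: &mut Vec<u8>) {
    loop {
        let byte = (val & 0x7f) as u8;
        val >>= 7;
        if val == 0 {
            out.push(byte);
            return;
        }
        out.push(byte | 0x80);
    }
}

/// Read a varint back, returning (value, bytes consumed).
/// Assumes well-formed input (at most 10 bytes for a u64).
fn read_vint(bytes: &[u8]) -> (u64, usize) {
    let mut val = 0u64;
    for (i, &b) in bytes.iter().enumerate() {
        val |= u64::from(b & 0x7f) << (7 * i);
        if b & 0x80 == 0 {
            return (val, i + 1);
        }
    }
    panic!("truncated vint");
}
```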
@@ -8,8 +8,10 @@ const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
|
||||
|
||||
pub use self::stream::CompressedIntStream;
|
||||
|
||||
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
|
||||
|
||||
/// Returns the size in bytes of a compressed block, given `num_bits`.
|
||||
pub fn compressed_block_size(num_bits: u8) -> usize {
|
||||
1 + (num_bits as usize) * COMPRESSION_BLOCK_SIZE / 8
|
||||
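Worked example of the formula above, assuming the usual 128-integer block (`BitPacker4x`'s block length): packing at `num_bits = 7` gives `1 + 7 * 128 / 8 = 113` bytes, i.e. one header byte recording `num_bits` followed by 112 bytes of packed payload.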
@@ -33,23 +35,19 @@ impl BlockEncoder {
|
||||
pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> &[u8] {
|
||||
let num_bits = self.bitpacker.num_bits_sorted(offset, block);
|
||||
self.output[0] = num_bits;
|
||||
let written_size = 1 + self.bitpacker
    .compress_sorted(offset, block, &mut self.output[1..], num_bits);
&self.output[..written_size]
|
||||
}
|
||||
|
||||
pub fn compress_block_unsorted(&mut self, block: &[u32]) -> &[u8] {
|
||||
let num_bits = self.bitpacker.num_bits(block);
|
||||
self.output[0] = num_bits;
|
||||
let written_size = 1 + self.bitpacker.compress(block, &mut self.output[1..], num_bits);
&self.output[..written_size]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct BlockDecoder {
|
||||
bitpacker: BitPacker4x,
|
||||
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
|
||||
@@ -70,24 +68,17 @@ impl BlockDecoder {
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
|
||||
let num_bits = compressed_data[0];
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
1 + self.bitpacker
    .decompress_sorted(offset, &compressed_data[1..], &mut self.output, num_bits)
}
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
|
||||
let num_bits = compressed_data[0];
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
1 + self.bitpacker.decompress(&compressed_data[1..], &mut self.output, num_bits)
}
|
||||
|
||||
#[inline]
|
||||
@@ -183,6 +174,8 @@ impl VIntDecoder for BlockDecoder {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use tests;
|
||||
use test::Bencher;
|
||||
|
||||
#[test]
|
||||
fn test_encode_sorted_block() {
|
||||
@@ -271,34 +264,11 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::*;
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use rand::XorShiftRng;
|
||||
use test::Bencher;
|
||||
|
||||
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
(0..u32::max_value())
|
||||
.filter(|_| rng.next_f32() < ratio)
|
||||
.take(n)
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
generate_array_with_seed(n, ratio, 4)
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_compress(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
b.iter(|| {
|
||||
encoder.compress_block_sorted(&data, 0u32);
|
||||
});
|
||||
@@ -307,7 +277,7 @@ mod bench {
|
||||
#[bench]
|
||||
fn bench_uncompress(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
let compressed = encoder.compress_block_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| {
|
||||
@@ -334,7 +304,7 @@ mod bench {
|
||||
#[bench]
|
||||
fn bench_compress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
b.iter(|| {
|
||||
encoder.compress_vint_sorted(&data, 0u32);
|
||||
});
|
||||
@@ -343,11 +313,12 @@ mod bench {
|
||||
#[bench]
|
||||
fn bench_uncompress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let compressed = encoder.compress_vint_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| {
|
||||
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use compression::compressed_block_size;
use compression::BlockDecoder;
use compression::COMPRESSION_BLOCK_SIZE;
use directory::{ReadOnlySource, SourceRead};
|
||||
|
||||
/// Reads a stream of compressed ints.
|
||||
@@ -13,7 +13,7 @@ pub struct CompressedIntStream {
|
||||
buffer: SourceRead,
|
||||
|
||||
block_decoder: BlockDecoder,
|
||||
cached_addr: usize, // address of the currently decoded block
cached_next_addr: usize, // address following the currently decoded block
|
||||
|
||||
addr: usize, // address of the block associated to the current position
|
||||
@@ -42,9 +42,7 @@ impl CompressedIntStream {
|
||||
// no need to read.
|
||||
self.cached_next_addr
|
||||
} else {
|
||||
let next_addr = addr + self.block_decoder
    .uncompress_block_unsorted(self.buffer.slice_from(addr));
self.cached_addr = addr;
|
||||
self.cached_next_addr = next_addr;
|
||||
next_addr
|
||||
@@ -103,8 +101,8 @@ pub mod tests {
|
||||
|
||||
use super::CompressedIntStream;
|
||||
use compression::compressed_block_size;
use compression::BlockEncoder;
use compression::COMPRESSION_BLOCK_SIZE;
use directory::ReadOnlySource;
|
||||
|
||||
fn create_stream_buffer() -> ReadOnlySource {
|
||||
|
||||
@@ -1,33 +1,34 @@
|
||||
use core::SegmentId;
|
||||
use Result;
|
||||
use error::{ErrorKind, ResultExt};
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use schema::Schema;
|
||||
use std::sync::Arc;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use Result;
|
||||
use core::SegmentId;
|
||||
|
||||
use super::pool::LeasedItem;
|
||||
use super::pool::Pool;
|
||||
use super::segment::create_segment;
|
||||
use super::segment::Segment;
|
||||
use core::searcher::Searcher;
|
||||
use core::IndexMeta;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use core::META_FILEPATH;
|
||||
use directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use indexer::index_writer::open_index_writer;
|
||||
use indexer::index_writer::HEAP_SIZE_MIN;
|
||||
use indexer::segment_updater::save_new_metas;
|
||||
use indexer::DirectoryLock;
|
||||
use core::searcher::Searcher;
|
||||
use std::convert::From;
|
||||
use num_cpus;
|
||||
use super::segment::Segment;
|
||||
use core::SegmentReader;
|
||||
use super::pool::Pool;
|
||||
use core::SegmentMeta;
|
||||
use super::pool::LeasedItem;
|
||||
use std::path::Path;
|
||||
use tokenizer::TokenizerManager;
|
||||
use core::IndexMeta;
|
||||
use indexer::DirectoryLock;
|
||||
use IndexWriter;
|
||||
use directory::ManagedDirectory;
|
||||
use core::META_FILEPATH;
|
||||
use super::segment::create_segment;
|
||||
use indexer::segment_updater::save_new_metas;
|
||||
use tokenizer::TokenizerManager;
|
||||
|
||||
const NUM_SEARCHERS: usize = 12;
|
||||
|
||||
@@ -52,17 +53,28 @@ impl Index {
|
||||
/// This should only be used for unit tests.
|
||||
pub fn create_in_ram(schema: Schema) -> Index {
|
||||
let ram_directory = RAMDirectory::create();
|
||||
Index::create(ram_directory, schema).expect("Creating a RAMDirectory should never fail")
|
||||
// unwrap is ok here
|
||||
let directory = ManagedDirectory::new(ram_directory).expect(
|
||||
"Creating a managed directory from a brand new RAM directory \
|
||||
should never fail.",
|
||||
);
|
||||
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
|
||||
}
|
||||
|
||||
/// Creates a new index in a given filepath.
|
||||
/// The index will use the `MMapDirectory`.
|
||||
///
|
||||
/// If a previous index was in this directory, then its meta file will be destroyed.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
|
||||
#[cfg(feature="mmap")]
|
||||
pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
Index::create(mmap_directory, schema)
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
/// Accessor for the tokenizer manager.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
&self.tokenizers
|
||||
}
|
||||
|
||||
/// Creates a new index in a temp directory.
|
||||
@@ -73,25 +85,14 @@ impl Index {
|
||||
///
|
||||
/// The temp directory is only used for testing the `MmapDirectory`.
|
||||
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
|
||||
#[cfg(feature = "mmap")]
#[cfg(test)]
|
||||
pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::create_from_tempdir()?;
|
||||
Index::create(mmap_directory, schema)
|
||||
}
|
||||
|
||||
/// Creates a new index given an implementation of the trait `Directory`
|
||||
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(dir)?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
/// Create a new index from a directory.
|
||||
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
/// Creates a new index given a directory and an `IndexMeta`.
|
||||
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
|
||||
let schema = metas.schema.clone();
|
||||
@@ -105,25 +106,29 @@ impl Index {
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
/// Accessor for the tokenizer manager.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
&self.tokenizers
|
||||
/// Create a new index from a directory.
|
||||
pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
/// Opens a new directory from an index path.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
#[cfg(feature="mmap")]
|
||||
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
Index::open(mmap_directory)
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
/// Open the index using the provided directory
|
||||
pub fn open<D: Directory>(directory: D) -> Result<Index> {
|
||||
pub fn open_directory<TDirectory: Directory>(directory: TDirectory) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
|
||||
/// Reads the index meta file from the directory.
|
||||
pub fn load_metas(&self) -> Result<IndexMeta> {
|
||||
load_metas(self.directory())
|
||||
@@ -137,13 +142,9 @@ impl Index {
|
||||
/// `IndexWriter` on the system is accessing the index directory,
|
||||
/// it is safe to manually delete the lockfile.
|
||||
///
|
||||
/// - `num_threads` defines the number of indexing workers that
/// should work at the same time.
|
||||
///
|
||||
/// - `overall_heap_size_in_bytes` sets the amount of memory
|
||||
/// allocated for all indexing thread.
|
||||
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
|
||||
///
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
@@ -151,35 +152,21 @@ impl Index {
|
||||
pub fn writer_with_num_threads(
|
||||
&self,
|
||||
num_threads: usize,
|
||||
overall_heap_size_in_bytes: usize,
|
||||
heap_size_in_bytes: usize,
|
||||
) -> Result<IndexWriter> {
|
||||
let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
open_index_writer(
|
||||
self,
|
||||
num_threads,
|
||||
heap_size_in_bytes_per_thread,
|
||||
directory_lock,
|
||||
)
|
||||
open_index_writer(self, num_threads, heap_size_in_bytes, directory_lock)
|
||||
}
|
||||
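A usage sketch of the constructor above (hypothetical values, paths assumed to be the crate-root re-exports), following the documented behavior where the overall budget is split evenly across threads:

```rust
use tantivy::{Index, IndexWriter, Result};

/// With 4 threads and a 400 MB overall budget, each indexing thread gets
/// 400_000_000 / 4 = 100 MB under the documented splitting behavior.
fn open_writer(index: &Index) -> Result<IndexWriter> {
    index.writer_with_num_threads(4, 400_000_000)
}
```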
|
||||
/// Creates a multithreaded writer
|
||||
///
|
||||
/// Tantivy will automatically define the number of threads to use.
|
||||
/// `overall_heap_size_in_bytes` is the total target memory usage that will be split
|
||||
/// between a given number of threads.
|
||||
/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
|
||||
///
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> Result<IndexWriter> {
|
||||
let mut num_threads = num_cpus::get();
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
num_threads = (overall_heap_size_in_bytes / HEAP_SIZE_MIN).max(1);
|
||||
}
|
||||
self.writer_with_num_threads(num_threads, overall_heap_size_in_bytes)
|
||||
pub fn writer(&self, heap_size_in_bytes: usize) -> Result<IndexWriter> {
|
||||
self.writer_with_num_threads(num_cpus::get(), heap_size_in_bytes)
|
||||
}
|
||||
|
||||
/// Accessor to the index schema
|
||||
@@ -191,8 +178,7 @@ impl Index {
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
|
||||
Ok(self.searchable_segment_metas()?
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect())
|
||||
@@ -227,8 +213,7 @@ impl Index {
|
||||
|
||||
/// Returns the list of segment ids that are searchable.
|
||||
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
|
||||
Ok(self.searchable_segment_metas()?
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect())
|
||||
@@ -246,9 +231,8 @@ impl Index {
|
||||
.iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect::<Result<_>>()?;
|
||||
let schema = self.schema();
|
||||
let searchers = (0..NUM_SEARCHERS)
|
||||
.map(|_| Searcher::new(schema.clone(), segment_readers.clone()))
|
||||
.map(|_| Searcher::from(segment_readers.clone()))
|
||||
.collect();
|
||||
self.searcher_pool.publish_new_generation(searchers);
|
||||
Ok(())
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use core::SegmentMeta;
use schema::Schema;
use serde_json;
use std::fmt;
|
||||
/// Meta information about the `Index`.
|
||||
///
|
||||
@@ -45,9 +45,9 @@ impl fmt::Debug for IndexMeta {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::IndexMeta;
use schema::{SchemaBuilder, TEXT};
use serde_json;
|
||||
#[test]
|
||||
fn test_serialize_metas() {
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use common::BinarySerializable;
|
||||
use compression::CompressedIntStream;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use postings::FreqReadingOption;
|
||||
use postings::TermInfo;
|
||||
use termdict::{TermDictionary, TermDictionaryImpl};
|
||||
use postings::{BlockSegmentPostings, SegmentPostings};
|
||||
use schema::FieldType;
|
||||
use postings::TermInfo;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use termdict::TermDictionary;
|
||||
use compression::CompressedIntStream;
|
||||
use postings::FreqReadingOption;
|
||||
use common::BinarySerializable;
|
||||
use schema::FieldType;
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
@@ -23,16 +23,16 @@ use termdict::TermDictionary;
|
||||
/// `InvertedIndexReader` are created by calling
|
||||
/// the `SegmentReader`'s [`.inverted_index(...)`] method
|
||||
pub struct InvertedIndexReader {
|
||||
termdict: TermDictionary,
|
||||
termdict: TermDictionaryImpl,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
record_option: IndexRecordOption,
|
||||
total_num_tokens: u64,
|
||||
total_num_tokens: u64
|
||||
}
|
||||
|
||||
impl InvertedIndexReader {
|
||||
pub(crate) fn new(
|
||||
termdict: TermDictionary,
|
||||
termdict: TermDictionaryImpl,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
record_option: IndexRecordOption,
|
||||
@@ -45,7 +45,7 @@ impl InvertedIndexReader {
|
||||
postings_source: postings_source.slice_from(8),
|
||||
positions_source,
|
||||
record_option,
|
||||
total_num_tokens,
|
||||
total_num_tokens
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,11 +56,11 @@ impl InvertedIndexReader {
|
||||
.get_index_record_option()
|
||||
.unwrap_or(IndexRecordOption::Basic);
|
||||
InvertedIndexReader {
|
||||
termdict: TermDictionary::empty(field_type),
|
||||
termdict: TermDictionaryImpl::empty(field_type),
|
||||
postings_source: ReadOnlySource::empty(),
|
||||
positions_source: ReadOnlySource::empty(),
|
||||
record_option,
|
||||
total_num_tokens: 0u64,
|
||||
total_num_tokens: 0u64
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,7 +70,7 @@ impl InvertedIndexReader {
|
||||
}
|
||||
|
||||
/// Return the term dictionary datastructure.
|
||||
pub fn terms(&self) -> &TermDictionary {
|
||||
pub fn terms(&self) -> &TermDictionaryImpl {
|
||||
&self.termdict
|
||||
}
|
||||
|
||||
@@ -149,6 +149,8 @@ impl InvertedIndexReader {
|
||||
self.total_num_tokens
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encountered and indexed.
|
||||
///
|
||||
@@ -164,15 +166,12 @@ impl InvertedIndexReader {
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
pub(crate) fn read_postings_no_deletes(
|
||||
&self,
|
||||
term: &Term,
|
||||
option: IndexRecordOption,
|
||||
) -> Option<SegmentPostings> {
|
||||
pub(crate) fn read_postings_no_deletes(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
self.get_term_info(term)
|
||||
@@ -180,3 +179,6 @@ impl InvertedIndexReader {
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,24 +1,24 @@
|
||||
pub mod index;
mod index_meta;
mod inverted_index_reader;
mod pool;
pub mod searcher;
mod segment;
mod segment_component;
mod segment_id;
mod segment_meta;
mod segment_reader;

pub use self::index::Index;
pub use self::index_meta::IndexMeta;
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::Searcher;
pub use self::segment::Segment;
pub use self::segment::SerializableSegment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_meta::SegmentMeta;
pub use self::segment_reader::SegmentReader;
|
||||
use std::path::PathBuf;
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use crossbeam::sync::MsQueue;
use std::mem;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::sync::Arc;
|
||||
|
||||
pub struct GenerationItem<T> {
|
||||
@@ -87,8 +87,7 @@ impl<T> Deref for LeasedItem<T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &T {
|
||||
&self.gen_item
.as_ref()
|
||||
.expect("Unwrapping a leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
@@ -97,8 +96,7 @@ impl<T> Deref for LeasedItem<T> {
|
||||
|
||||
impl<T> DerefMut for LeasedItem<T> {
|
||||
fn deref_mut(&mut self) -> &mut T {
|
||||
&mut self.gen_item
.as_mut()
|
||||
.expect("Unwrapping a mut leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
@@ -116,8 +114,8 @@ impl<T> Drop for LeasedItem<T> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::Pool;
use std::iter;
|
||||
#[test]
|
||||
fn test_pool() {
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
use collector::Collector;
|
||||
use core::InvertedIndexReader;
|
||||
use core::SegmentReader;
|
||||
use query::Query;
|
||||
use schema::Document;
|
||||
use schema::Schema;
|
||||
use schema::{Field, Term};
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use termdict::TermMerger;
|
||||
use DocAddress;
|
||||
use Result;
|
||||
use core::SegmentReader;
|
||||
use schema::Document;
|
||||
use collector::Collector;
|
||||
use query::Query;
|
||||
use DocAddress;
|
||||
use schema::{Field, Term};
|
||||
use termdict::{TermDictionary, TermMerger};
|
||||
use std::sync::Arc;
|
||||
use std::fmt;
|
||||
use core::InvertedIndexReader;
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
///
|
||||
@@ -17,18 +16,10 @@ use Result;
|
||||
/// the destruction of the `Searcher`.
|
||||
///
|
||||
pub struct Searcher {
|
||||
schema: Schema,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
}
|
||||
|
||||
impl Searcher {
|
||||
/// Creates a new `Searcher`
|
||||
pub(crate) fn new(schema: Schema, segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
Searcher {
|
||||
schema,
|
||||
segment_readers,
|
||||
}
|
||||
}
|
||||
/// Fetches a document from tantivy's store given a `DocAddress`.
|
||||
///
|
||||
/// The searcher uses the segment ordinal to route the
|
||||
@@ -39,11 +30,6 @@ impl Searcher {
|
||||
segment_reader.doc(doc_id)
|
||||
}
|
||||
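A tantivy-free sketch of the routing described above: a doc address is (segment ordinal, doc id within that segment), so fetching a stored document is an index into the list of segment readers followed by a per-segment lookup. The types below are illustrative placeholders:

```rust
struct FakeSegment {
    docs: Vec<String>,
}

/// Route to the segment named by the ordinal, then look up the doc inside it.
fn fetch(segments: &[FakeSegment], segment_ord: usize, doc_id: usize) -> Option<&String> {
    segments.get(segment_ord)?.docs.get(doc_id)
}
```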
|
||||
/// Access the schema associated to the index of this searcher.
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
/// Returns the overall number of documents in the index.
|
||||
pub fn num_docs(&self) -> u64 {
|
||||
self.segment_readers
|
||||
@@ -78,8 +64,7 @@ impl Searcher {
|
||||
|
||||
/// Return the field searcher associated to a `Field`.
|
||||
pub fn field(&self, field: Field) -> FieldSearcher {
|
||||
let inv_index_readers = self.segment_readers
.iter()
|
||||
.map(|segment_reader| segment_reader.inverted_index(field))
|
||||
.collect::<Vec<_>>();
|
||||
@@ -99,8 +84,7 @@ impl FieldSearcher {
|
||||
/// Returns a Stream over all of the sorted unique terms of
|
||||
/// for the given field.
|
||||
pub fn terms(&self) -> TermMerger {
|
||||
let term_streamers: Vec<_> = self.inv_index_readers
.iter()
|
||||
.map(|inverted_index| inverted_index.terms().stream())
|
||||
.collect();
|
||||
@@ -108,10 +92,15 @@ impl FieldSearcher {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<SegmentReader>> for Searcher {
|
||||
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
Searcher { segment_readers }
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Searcher {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let segment_ids = self.segment_readers
.iter()
|
||||
.map(|segment_reader| segment_reader.segment_id())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
use super::SegmentComponent;
|
||||
use core::Index;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use directory::error::{OpenReadError, OpenWriteError};
|
||||
use directory::Directory;
|
||||
use directory::{FileProtection, ReadOnlySource, WritePtr};
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use Result;
|
||||
use std::path::PathBuf;
|
||||
use schema::Schema;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use core::SegmentId;
|
||||
use directory::{FileProtection, ReadOnlySource, WritePtr};
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use super::SegmentComponent;
|
||||
use core::Index;
|
||||
use std::result;
|
||||
use Result;
|
||||
use directory::Directory;
|
||||
use core::SegmentMeta;
|
||||
use directory::error::{OpenReadError, OpenWriteError};
|
||||
|
||||
/// A segment is a piece of the index.
|
||||
#[derive(Clone)]
|
||||
@@ -111,8 +111,8 @@ mod tests {
|
||||
|
||||
use core::SegmentComponent;
|
||||
use directory::Directory;
|
||||
use schema::SchemaBuilder;
use std::collections::HashSet;
use Index;
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use std::slice;
|
||||
|
||||
/// Enum describing each component of a tantivy segment.
|
||||
/// Each component is stored in its own file,
|
||||
/// using the pattern `segment_uuid`.`component_extension`,
|
||||
@@ -28,7 +26,7 @@ pub enum SegmentComponent {
|
||||
|
||||
impl SegmentComponent {
|
||||
/// Iterates through the components.
|
||||
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
|
||||
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
|
||||
SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::cmp::{Ord, Ordering};
use std::fmt;
use uuid::Uuid;
|
||||
#[cfg(test)]
|
||||
use std::sync::atomic;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::SegmentComponent;
use core::SegmentId;
use std::collections::HashSet;
use std::path::PathBuf;
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
|
||||
@@ -1,30 +1,31 @@
|
||||
use common::CompositeFile;
|
||||
use common::HasLen;
|
||||
use core::InvertedIndexReader;
|
||||
use Result;
|
||||
use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use core::SegmentComponent;
|
||||
use std::sync::RwLock;
|
||||
use common::HasLen;
|
||||
use core::SegmentMeta;
|
||||
use error::ErrorKind;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fastfield::FacetReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
|
||||
use fieldnorm::FieldNormReader;
|
||||
use schema::Cardinality;
|
||||
use fastfield::DeleteBitSet;
|
||||
use store::StoreReader;
|
||||
use schema::Document;
|
||||
use DocId;
|
||||
use std::sync::Arc;
|
||||
use std::collections::HashMap;
|
||||
use common::CompositeFile;
|
||||
use std::fmt;
|
||||
use core::InvertedIndexReader;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use error::ErrorKind;
|
||||
use termdict::TermDictionaryImpl;
|
||||
use fastfield::FacetReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Schema;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use store::StoreReader;
|
||||
use termdict::TermDictionary;
|
||||
use DocId;
|
||||
use Result;
|
||||
use fastfield::{FastValue, MultiValueIntFastFieldReader};
|
||||
use schema::Cardinality;
|
||||
use fieldnorm::FieldNormReader;
|
||||
|
||||
/// Entry point to access all of the datastructures of the `Segment`
|
||||
///
|
||||
@@ -75,11 +76,6 @@ impl SegmentReader {
|
||||
self.segment_meta.num_docs()
|
||||
}
|
||||
|
||||
/// Returns the schema of the index this segment belongs to.
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
/// Return the number of documents that have been
|
||||
/// deleted in the segment.
|
||||
pub fn num_deleted_docs(&self) -> DocId {
|
||||
@@ -109,25 +105,12 @@ impl SegmentReader {
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
|
||||
{
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
|
||||
Ok(FastFieldReader::open(ff_source))
|
||||
} else {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
{
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
@@ -140,34 +123,21 @@ impl SegmentReader {
|
||||
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
|
||||
{
|
||||
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
|
||||
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
|
||||
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
|
||||
} else {
|
||||
{
|
||||
let idx_reader = self.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let vals_reader = self.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
|
||||
pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
match field_entry.field_type() {
|
||||
&FieldType::Bytes => {}
|
||||
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
|
||||
}
|
||||
let idx_reader = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let values = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||
Ok(BytesFastFieldReader::open(idx_reader, values))
|
||||
}
|
||||
|
||||
/// Accessor to the `FacetReader` associated to a given `Field`.
|
||||
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
@@ -187,7 +157,7 @@ impl SegmentReader {
|
||||
field_entry.name()
|
||||
))
|
||||
})?;
|
||||
let termdict = TermDictionary::from_source(termdict_source);
|
||||
let termdict = TermDictionaryImpl::from_source(termdict_source);
|
||||
let facet_reader = FacetReader::new(term_ords_reader, termdict);
|
||||
Ok(facet_reader)
|
||||
}
|
||||
@@ -201,14 +171,12 @@ impl SegmentReader {
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
|
||||
if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) {
FieldNormReader::open(fieldnorm_source)
|
||||
} else {
|
||||
let field_name = self.schema.get_field_name(field);
|
||||
let err_msg = format!(
    "Field norm not found for field {:?}. Was it marked as indexed during indexing?",
    field_name
);
panic!(err_msg);
|
||||
}
|
||||
}
|
||||
@@ -243,12 +211,13 @@ impl SegmentReader {
|
||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
||||
|
||||
let delete_bitset_opt = if segment.meta().has_deletes() {
    let delete_data = segment.open_read(SegmentComponent::DELETE)?;
    Some(DeleteBitSet::open(delete_data))
} else {
    None
};
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
@@ -274,8 +243,7 @@ impl SegmentReader {
|
||||
/// term dictionary associated to a specific field,
|
||||
/// and opening the posting list associated to any term.
|
||||
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
|
||||
if let Some(inv_idx_reader) = self.inv_idx_reader_cache
.read()
|
||||
.expect("Lock poisoned. This should never happen")
|
||||
.get(&field)
|
||||
@@ -304,18 +272,16 @@ impl SegmentReader {
|
||||
|
||||
let postings_source = postings_source_opt.unwrap();
|
||||
|
||||
let termdict_source = self.termdict_composite
.open_read(field)
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
|
||||
|
||||
let positions_source = self.positions_composite
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
|
||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||
TermDictionary::from_source(termdict_source),
|
||||
TermDictionaryImpl::from_source(termdict_source),
|
||||
postings_source,
|
||||
positions_source,
|
||||
record_option,
|
||||
@@ -357,11 +323,6 @@ impl SegmentReader {
|
||||
.map(|delete_set| delete_set.is_deleted(doc))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Returns an iterator that will iterate over the alive document ids
|
||||
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
|
||||
SegmentReaderAliveDocsIterator::new(&self)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentReader {
|
||||
@@ -369,90 +330,3 @@ impl fmt::Debug for SegmentReader {
|
||||
write!(f, "SegmentReader({:?})", self.segment_id)
|
||||
}
|
||||
}
|
||||
|
||||
/// Implements the iterator trait to allow easy iteration
|
||||
/// over non-deleted ("alive") DocIds in a SegmentReader
|
||||
pub struct SegmentReaderAliveDocsIterator<'a> {
|
||||
reader: &'a SegmentReader,
|
||||
max_doc: DocId,
|
||||
current: DocId,
|
||||
}
|
||||
|
||||
impl<'a> SegmentReaderAliveDocsIterator<'a> {
|
||||
pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
|
||||
SegmentReaderAliveDocsIterator {
|
||||
reader,
|
||||
max_doc: reader.max_doc(),
|
||||
current: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
|
||||
type Item = DocId;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// TODO: Use TinySet (like in BitSetDocSet) to speed this process up
|
||||
if self.current >= self.max_doc {
|
||||
return None;
|
||||
}
|
||||
|
||||
// find the next alive doc id
|
||||
while self.reader.is_deleted(self.current) {
|
||||
self.current += 1;
|
||||
|
||||
if self.current >= self.max_doc {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// capture the current alive DocId
|
||||
let result = Some(self.current);
|
||||
|
||||
// move down the chain
|
||||
self.current += 1;
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use core::Index;
|
||||
use schema::{SchemaBuilder, Term, STORED, TEXT};
|
||||
use DocId;
|
||||
|
||||
#[test]
|
||||
fn test_alive_docs_iterator() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
schema_builder.add_text_field("name", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let name = schema.get_field("name").unwrap();
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(name => "tantivy"));
|
||||
index_writer.add_document(doc!(name => "horse"));
|
||||
index_writer.add_document(doc!(name => "jockey"));
|
||||
index_writer.add_document(doc!(name => "cap"));
|
||||
|
||||
// we should now have one segment with two docs
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
|
||||
{
|
||||
let mut index_writer2 = index.writer(50_000_000).unwrap();
|
||||
index_writer2.delete_term(Term::from_field_text(name, "horse"));
|
||||
index_writer2.delete_term(Term::from_field_text(name, "cap"));
|
||||
|
||||
// ok, now we should have a deleted doc
|
||||
index_writer2.commit().unwrap();
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let docs: Vec<DocId> = searcher.segment_reader(0).doc_ids_alive().collect();
|
||||
assert_eq!(vec![0u32, 2u32], docs);
|
||||
}
|
||||
}
|
||||
|
||||
4
src/datastruct/mod.rs
Normal file
4
src/datastruct/mod.rs
Normal file
@@ -0,0 +1,4 @@
|
||||
mod skip;
|
||||
pub mod stacker;
|
||||
|
||||
pub use self::skip::{SkipList, SkipListBuilder};
|
||||
@@ -1,10 +1,10 @@
|
||||
#![allow(dead_code)]
|
||||
|
||||
mod skiplist;
mod skiplist_builder;

pub use self::skiplist::SkipList;
pub use self::skiplist_builder::SkipListBuilder;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
@@ -1,6 +1,6 @@
|
||||
use common::{BinarySerializable, VInt};
use std::cmp::max;
use std::marker::PhantomData;
|
||||
static EMPTY: [u8; 0] = [];
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use common::{is_power_of_2, BinarySerializable, VInt};
use std::io;
use std::io::Write;
use std::marker::PhantomData;
|
||||
struct LayerBuilder<T: BinarySerializable> {
|
||||
period_mask: usize,
|
||||
@@ -72,8 +72,7 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
let mut skip_pointer = self.data_layer.insert(key, dest)?;
|
||||
loop {
|
||||
skip_pointer = match skip_pointer {
|
||||
Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset)?,
|
||||
None => {
|
||||
return Ok(());
|
||||
161
src/datastruct/stacker/expull.rs
Normal file
161
src/datastruct/stacker/expull.rs
Normal file
@@ -0,0 +1,161 @@
|
||||
use std::mem;
|
||||
use super::heap::{Heap, HeapAllocable};
|
||||
|
||||
#[inline]
|
||||
pub fn is_power_of_2(val: u32) -> bool {
|
||||
val & (val - 1) == 0
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn jump_needed(val: u32) -> bool {
|
||||
val > 3 && is_power_of_2(val)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ExpUnrolledLinkedList {
|
||||
len: u32,
|
||||
end: u32,
|
||||
val0: u32,
|
||||
val1: u32,
|
||||
val2: u32,
|
||||
next: u32, // inline of the first block
|
||||
}
|
||||
|
||||
impl ExpUnrolledLinkedList {
|
||||
pub fn iter<'a>(&self, addr: u32, heap: &'a Heap) -> ExpUnrolledLinkedListIterator<'a> {
|
||||
ExpUnrolledLinkedListIterator {
|
||||
heap,
|
||||
addr: addr + 2u32 * (mem::size_of::<u32>() as u32),
|
||||
len: self.len,
|
||||
consumed: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push(&mut self, val: u32, heap: &Heap) {
|
||||
self.len += 1;
|
||||
if jump_needed(self.len) {
|
||||
// we need to allocate another block.
|
||||
// ... As we want to grow block exponentially
|
||||
// the next block as a size of (length so far),
|
||||
// and we need to add 1u32 to store the pointer
|
||||
// to the next element.
|
||||
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
|
||||
let new_block_addr: u32 = heap.allocate_space(new_block_size);
|
||||
heap.set(self.end, &new_block_addr);
|
||||
self.end = new_block_addr;
|
||||
}
|
||||
heap.set(self.end, &val);
|
||||
self.end += mem::size_of::<u32>() as u32;
|
||||
}
|
||||
}
|
||||
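Reading the code above: the first three values live inline in `val0`–`val2`; when the 4th value is pushed, `jump_needed(4)` fires and a block of `(4 + 1) * 4 = 20` bytes is allocated (four `u32` value slots plus one slot reserved for the pointer to the following block). Subsequent jumps happen at lengths 8, 16, 32, …, so block sizes grow exponentially while the amortized cost per push stays constant.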
|
||||
impl HeapAllocable for u32 {
|
||||
fn with_addr(_addr: u32) -> u32 {
|
||||
0u32
|
||||
}
|
||||
}
|
||||
|
||||
impl HeapAllocable for ExpUnrolledLinkedList {
|
||||
fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
|
||||
let last_addr = addr + mem::size_of::<u32>() as u32 * 2u32;
|
||||
ExpUnrolledLinkedList {
|
||||
len: 0u32,
|
||||
end: last_addr,
|
||||
val0: 0u32,
|
||||
val1: 0u32,
|
||||
val2: 0u32,
|
||||
next: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExpUnrolledLinkedListIterator<'a> {
|
||||
heap: &'a Heap,
|
||||
addr: u32,
|
||||
len: u32,
|
||||
consumed: u32,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.consumed == self.len {
|
||||
None
|
||||
} else {
|
||||
let addr: u32;
|
||||
self.consumed += 1;
|
||||
if jump_needed(self.consumed) {
|
||||
addr = *self.heap.get_mut_ref(self.addr);
|
||||
} else {
|
||||
addr = self.addr;
|
||||
}
|
||||
self.addr = addr + mem::size_of::<u32>() as u32;
|
||||
Some(*self.heap.get_mut_ref(addr))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use super::super::heap::Heap;
|
||||
use test::Bencher;
|
||||
|
||||
const NUM_STACK: usize = 10_000;
|
||||
const STACK_SIZE: u32 = 1000;
|
||||
|
||||
#[test]
|
||||
fn test_stack() {
|
||||
let heap = Heap::with_capacity(1_000_000);
|
||||
let (addr, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
|
||||
stack.push(1u32, &heap);
|
||||
stack.push(2u32, &heap);
|
||||
stack.push(4u32, &heap);
|
||||
stack.push(8u32, &heap);
|
||||
{
|
||||
let mut it = stack.iter(addr, &heap);
|
||||
assert_eq!(it.next().unwrap(), 1u32);
|
||||
assert_eq!(it.next().unwrap(), 2u32);
|
||||
assert_eq!(it.next().unwrap(), 4u32);
|
||||
assert_eq!(it.next().unwrap(), 8u32);
|
||||
assert!(it.next().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_push_vec(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut vecs = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
vecs.push(Vec::new());
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
vecs[t].push(i);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_push_stack(bench: &mut Bencher) {
|
||||
let heap = Heap::with_capacity(64_000_000);
|
||||
bench.iter(|| {
|
||||
let mut stacks = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
|
||||
stacks.push(stack);
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
stacks[t].push(i, &heap);
|
||||
}
|
||||
}
|
||||
heap.clear();
|
||||
});
|
||||
}
|
||||
}
|
||||
309
src/datastruct/stacker/hashmap.rs
Normal file
309
src/datastruct/stacker/hashmap.rs
Normal file
@@ -0,0 +1,309 @@
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
use postings::UnorderedTermId;
|
||||
use super::heap::{BytesRef, Heap, HeapAllocable};
|
||||
|
||||
mod murmurhash2 {
|
||||
|
||||
const SEED: u32 = 3_242_157_231u32;
|
||||
|
||||
#[inline(always)]
|
||||
pub fn murmurhash2(key: &[u8]) -> u32 {
|
||||
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
|
||||
let m: u32 = 0x5bd1_e995;
|
||||
let r = 24;
|
||||
let len = key.len() as u32;
|
||||
|
||||
let mut h: u32 = SEED ^ len;
|
||||
let num_blocks = len >> 2;
|
||||
for _ in 0..num_blocks {
|
||||
let mut k: u32 = unsafe { *key_ptr };
|
||||
k = k.wrapping_mul(m);
|
||||
k ^= k >> r;
|
||||
k = k.wrapping_mul(m);
|
||||
h = h.wrapping_mul(m);
|
||||
h ^= k;
|
||||
key_ptr = key_ptr.wrapping_offset(1);
|
||||
}
|
||||
|
||||
// Handle the last few bytes of the input array
|
||||
let remaining = len & 3;
|
||||
let key_ptr_u8: *const u8 = key_ptr as *const u8;
|
||||
match remaining {
|
||||
3 => {
|
||||
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(2)) } << 16;
|
||||
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8;
|
||||
h ^= unsafe { u32::from(*key_ptr_u8) };
|
||||
h = h.wrapping_mul(m);
|
||||
}
|
||||
2 => {
|
||||
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8;
|
||||
h ^= unsafe { u32::from(*key_ptr_u8) };
|
||||
h = h.wrapping_mul(m);
|
||||
}
|
||||
1 => {
|
||||
h ^= unsafe { u32::from(*key_ptr_u8) };
|
||||
h = h.wrapping_mul(m);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(m);
|
||||
h ^ (h >> 15)
|
||||
}
|
||||
}
|
||||
|
||||
/// Split the thread memory budget into
|
||||
/// - the heap size
|
||||
/// - the hash table "table" itself.
|
||||
///
|
||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||
pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
|
||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||
let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
|
||||
let table_num_bits: usize = (1..)
|
||||
.into_iter()
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.last()
|
||||
.expect(&format!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
));
|
||||
let table_size = compute_table_size(table_num_bits);
|
||||
let heap_size = per_thread_memory_budget - table_size;
|
||||
(heap_size, table_num_bits)
|
||||
}
|
||||
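Worked example, assuming `size_of::<KeyValue>() == 8` (a 4-byte `BytesRef` plus a 4-byte hash): with a 30 MB per-thread budget, `table_size_limit` is 10 MB, the largest table that fits is `2^20 * 8 ≈ 8` MB (so `table_num_bits = 20`), and the remaining roughly 22 MB go to the heap.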

/// `KeyValue` is the item stored in the hash table.
/// The key is actually a `BytesRef` object stored in an external heap.
/// The `value_addr` also points to an address in the heap.
///
/// The key and the value are actually stored contiguously.
/// For this reason, the (start, stop) information is actually redundant
/// and can be simplified in the future
#[derive(Copy, Clone, Default)]
struct KeyValue {
key_value_addr: BytesRef,
hash: u32,
}

impl KeyValue {
fn is_empty(&self) -> bool {
self.key_value_addr.is_null()
}
}

/// Customized `HashMap` with string keys
///
/// This `HashMap` takes String as keys. Keys are
/// stored in a user defined heap.
///
/// The quirky API has the benefit of avoiding
/// the computation of the hash of the key twice,
/// or copying the key as long as there is no insert.
///
pub struct TermHashMap<'a> {
table: Box<[KeyValue]>,
heap: &'a Heap,
mask: usize,
occupied: Vec<usize>,
}

struct QuadraticProbing {
hash: usize,
i: usize,
mask: usize,
}

impl QuadraticProbing {
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
QuadraticProbing {
hash,
i: 0,
mask,
}
}

#[inline]
fn next_probe(&mut self) -> usize {
self.i += 1;
(self.hash + self.i * self.i) & self.mask
}
}

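A hand trace of the probe sequence (editorial sketch, not part of the diff): `next_probe` returns `(hash + i * i) & mask` for i = 1, 2, 3, ..., so for a table of 8 buckets (mask = 7) and hash = 3 the visited buckets are 4, 7, 4, 3, ...

// Illustrative only; replays QuadraticProbing by hand.
fn probe_demo() {
    let mut probe = QuadraticProbing::compute(3, 7);
    assert_eq!(probe.next_probe(), 4); // (3 + 1) & 7
    assert_eq!(probe.next_probe(), 7); // (3 + 4) & 7
    assert_eq!(probe.next_probe(), 4); // (3 + 9) & 7 = 12 & 7
}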
impl<'a> TermHashMap<'a> {
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
let table_size = 1 << num_bucket_power_of_2;
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
TermHashMap {
table: table.into_boxed_slice(),
heap,
mask: table_size - 1,
occupied: Vec::with_capacity(table_size / 2),
}
}

fn probe(&self, hash: u32) -> QuadraticProbing {
QuadraticProbing::compute(hash as usize, self.mask)
}

pub fn is_saturated(&self) -> bool {
self.table.len() < self.occupied.len() * 3
}

#[inline(never)]
fn get_key_value(&self, bytes_ref: BytesRef) -> (&[u8], u32) {
let key_bytes: &[u8] = self.heap.get_slice(bytes_ref);
let expull_addr: u32 = bytes_ref.addr() + 2 + key_bytes.len() as u32;
(key_bytes, expull_addr)
}

pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
self.occupied.push(bucket);
self.table[bucket] = KeyValue {
key_value_addr, hash
};
}

pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32, UnorderedTermId)> + 'b {
self.occupied.iter().cloned().map(move |bucket: usize| {
let kv = self.table[bucket];
let (key, offset) = self.get_key_value(kv.key_value_addr);
(key, offset, bucket as UnorderedTermId)
})
}

pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
&mut self,
key: S,
) -> (UnorderedTermId, &mut V) {
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
let key_bytes_ref = self.heap.allocate_and_set(key_bytes);
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
self.set_bucket(hash, key_bytes_ref, bucket);
return (bucket as UnorderedTermId, val);
} else if kv.hash == hash {
let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
if stored_key == key_bytes {
return (
bucket as UnorderedTermId,
self.heap.get_mut_ref(expull_addr),
);
}
}
}
}
}

#[cfg(test)]
mod tests {

use super::*;
use super::super::heap::{Heap, HeapAllocable};
use super::murmurhash2::murmurhash2;
use test::Bencher;
use std::collections::HashSet;
use super::split_memory;

struct TestValue {
val: u32,
_addr: u32,
}

impl HeapAllocable for TestValue {
fn with_addr(addr: u32) -> TestValue {
TestValue {
val: 0u32,
_addr: addr,
}
}
}

#[test]
fn test_hashmap_size() {
assert_eq!(split_memory(100_000), (67232, 12));
assert_eq!(split_memory(1_000_000), (737856, 15));
assert_eq!(split_memory(10_000_000), (7902848, 18));
}

#[test]
fn test_hash_map() {
let heap = Heap::with_capacity(2_000_000);
let mut hash_map: TermHashMap = TermHashMap::new(18, &heap);
{
let v: &mut TestValue = hash_map.get_or_create("abc").1;
assert_eq!(v.val, 0u32);
v.val = 3u32;
}
{
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
assert_eq!(v.val, 0u32);
v.val = 4u32;
}
{
let v: &mut TestValue = hash_map.get_or_create("abc").1;
assert_eq!(v.val, 3u32);
}
{
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
assert_eq!(v.val, 4u32);
}
let mut iter_values = hash_map.iter();
{
let (_, addr, _) = iter_values.next().unwrap();
let val: &TestValue = heap.get_ref(addr);
assert_eq!(val.val, 3u32);
}
{
let (_, addr, _) = iter_values.next().unwrap();
let val: &TestValue = heap.get_ref(addr);
assert_eq!(val.val, 4u32);
}
assert!(iter_values.next().is_none());
}

#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}

#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
assert_eq!(set.len(), 10_000);
}

#[bench]
fn bench_murmurhash_2(b: &mut Bencher) {
let keys: Vec<&'static str> =
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
keys.iter()
.map(|&s| s.as_bytes())
.map(murmurhash2::murmurhash2)
.map(|h| h as u64)
.last()
.unwrap()
});
}

}
233 src/datastruct/stacker/heap.rs Normal file
@@ -0,0 +1,233 @@
use std::cell::UnsafeCell;
use std::mem;
use std::ptr;
use byteorder::{ByteOrder, NativeEndian};

/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
///
/// The slice will encode the length of the `&[u8]` slice
/// on 16-bits, and then the data is encoded.
#[derive(Copy, Clone)]
pub struct BytesRef(u32);

impl BytesRef {
pub fn is_null(&self) -> bool {
self.0 == u32::max_value()
}

pub fn addr(&self) -> u32 {
self.0
}
}

impl Default for BytesRef {
fn default() -> BytesRef {
BytesRef(u32::max_value())
}
}

/// Object that can be allocated in tantivy's custom `Heap`.
pub trait HeapAllocable {
fn with_addr(addr: u32) -> Self;
}

/// Tantivy's custom `Heap`.
pub struct Heap {
inner: UnsafeCell<InnerHeap>,
}

#[cfg_attr(feature = "cargo-clippy", allow(mut_from_ref))]
impl Heap {
/// Creates a new heap with a given capacity
pub fn with_capacity(num_bytes: usize) -> Heap {
Heap {
inner: UnsafeCell::new(InnerHeap::with_capacity(num_bytes)),
}
}

fn inner(&self) -> &mut InnerHeap {
unsafe { &mut *self.inner.get() }
}

/// Clears the heap. All the underlying data is lost.
///
/// This heap does not support deallocation.
/// This method is the only way to free memory.
pub fn clear(&self) {
self.inner().clear();
}

/// Return amount of free space, in bytes.
pub fn num_free_bytes(&self) -> u32 {
self.inner().num_free_bytes()
}

/// Allocate a given amount of space and returns an address
/// in the Heap.
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
self.inner().allocate_space(num_bytes)
}

/// Allocate an object in the heap
pub fn allocate_object<V: HeapAllocable>(&self) -> (u32, &mut V) {
let addr = self.inner().allocate_space(mem::size_of::<V>());
let v: V = V::with_addr(addr);
self.inner().set(addr, &v);
(addr, self.inner().get_mut_ref(addr))
}

/// Stores a `&[u8]` in the heap and returns the destination BytesRef.
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
self.inner().allocate_and_set(data)
}

/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
/// given as argument
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
self.inner().get_slice(bytes_ref)
}

/// Stores an item's data in the heap, at the given `address`.
pub fn set<Item>(&self, addr: u32, val: &Item) {
self.inner().set(addr, val);
}

/// Returns a mutable reference to an `Item` stored at a given `addr`.
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
}

/// Returns a mutable reference to an `Item` at a given `addr`.
#[cfg(test)]
pub fn get_ref<Item>(&self, addr: u32) -> &mut Item {
self.get_mut_ref(addr)
}
}

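A sketch of the byte layout behind `allocate_and_set` / `get_slice` above (editorial note, not part of the diff). The 2-byte length prefix is also why `TermHashMap::get_key_value` adds `2 + key.len()` to the `BytesRef` address to reach the value stored right behind the key.

// Layout of one entry starting at BytesRef::addr():
//   addr + 0 .. addr + 2        : length of the slice, as a native-endian u16
//   addr + 2 .. addr + 2 + len  : the bytes themselves
//   addr + 2 + len ..           : (for TermHashMap) the value object allocated just after
//
// Minimal illustrative use of the public API above:
fn heap_layout_demo() {
    let heap = Heap::with_capacity(1_000);
    let bytes_ref = heap.allocate_and_set(b"abc");
    assert_eq!(heap.get_slice(bytes_ref), &b"abc"[..]);
}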
struct InnerHeap {
buffer: Vec<u8>,
buffer_len: u32,
used: u32,
next_heap: Option<Box<InnerHeap>>,
}

impl InnerHeap {
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
let buffer: Vec<u8> = vec![0u8; num_bytes];
InnerHeap {
buffer,
buffer_len: num_bytes as u32,
next_heap: None,
used: 0u32,
}
}

pub fn clear(&mut self) {
self.used = 0u32;
self.next_heap = None;
}

// Returns the number of free bytes. If the buffer
// has reached its capacity and overflowed to another buffer, return 0.
pub fn num_free_bytes(&self) -> u32 {
if self.next_heap.is_some() {
0u32
} else {
self.buffer_len - self.used
}
}

pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
let addr = self.used;
self.used += num_bytes as u32;
if self.used <= self.buffer_len {
addr
} else {
if self.next_heap.is_none() {
info!(
r#"Exceeded heap size. The segment will be committed right
after indexing this document."#,
);
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
}
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
}
}

fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
let start = bytes_ref.0;
if start >= self.buffer_len {
self.next_heap
.as_ref()
.unwrap()
.get_slice(BytesRef(start - self.buffer_len))
} else {
let start = start as usize;
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
&self.buffer[start + 2..start + 2 + len]
}
}

fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
if start >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
} else {
&mut self.buffer[start as usize..stop as usize]
}
}

fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
assert!(data.len() < u16::max_value() as usize);
let total_len = 2 + data.len();
let start = self.allocate_space(total_len);
let total_buff = self.get_mut_slice(start, start + total_len as u32);
NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16);
total_buff[2..].clone_from_slice(data);
BytesRef(start)
}

fn get_mut(&mut self, addr: u32) -> *mut u8 {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut(addr - self.buffer_len)
} else {
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
}
}

fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_ref(addr - self.buffer_len)
} else {
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
unsafe { &mut *v_ptr }
}
}

pub fn set<Item>(&mut self, addr: u32, val: &Item) {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.set(addr - self.buffer_len, val);
} else {
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
unsafe {
let dest_ptr: *mut u8 = self.get_mut(addr);
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
}
}
}
}
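A short note on the overflow path above (editorial, not part of the diff): once `used` passes `buffer_len`, a second `InnerHeap` of the same capacity is chained and every address handed out afterwards is simply offset by `buffer_len`, which is how each accessor routes an address.

// Address space after one overflow (illustrative):
//   0 .. buffer_len              -> served by self.buffer
//   buffer_len .. 2 * buffer_len -> served by next_heap, at (addr - buffer_len)
// num_free_bytes() reports 0 as soon as next_heap exists.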
43 src/datastruct/stacker/mod.rs Normal file
@@ -0,0 +1,43 @@
pub(crate) mod hashmap;
mod heap;
mod expull;

pub use self::heap::{Heap, HeapAllocable};
pub use self::expull::ExpUnrolledLinkedList;
pub use self::hashmap::TermHashMap;

#[test]
fn test_unrolled_linked_list() {
use std::collections;
let heap = Heap::with_capacity(30_000_000);
{
heap.clear();
let mut ks: Vec<usize> = (1..5).map(|k| k * 100).collect();
ks.push(2);
ks.push(3);
for k in (1..5).map(|k| k * 100) {
let mut hashmap: TermHashMap = TermHashMap::new(10, &heap);
for j in 0..k {
for i in 0..500 {
let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string()).1;
v.push(i * j, &heap);
}
}
let mut map_addr: collections::HashMap<Vec<u8>, u32> = collections::HashMap::new();
for (key, addr, _) in hashmap.iter() {
map_addr.insert(Vec::from(key), addr);
}

for i in 0..500 {
let key: String = i.to_string();
let addr: u32 = *map_addr.get(key.as_bytes()).unwrap();
let exp_pull: &ExpUnrolledLinkedList = heap.get_ref(addr);
let mut it = exp_pull.iter(addr, &heap);
for j in 0..k {
assert_eq!(it.next().unwrap(), i * j);
}
assert!(!it.next().is_some());
}
}
}
}
@@ -1,11 +1,11 @@
use std::marker::Send;
use std::fmt;
use std::path::Path;
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use std::fmt;
use std::io;
use std::marker::Send;
use std::marker::Sync;
use std::path::Path;
use std::result;
use std::io;
use std::marker::Sync;

/// Write-once read many (WORM) abstraction for where
/// tantivy's data should be stored.

@@ -1,7 +1,7 @@
use std::error::Error as StdError;
use std::fmt;
use std::io;
use std::path::PathBuf;
use std::io;
use std::fmt;

/// General IO error with an optional path to the offending file.
#[derive(Debug)]

@@ -1,18 +1,18 @@
use core::MANAGED_FILEPATH;
use std::path::{Path, PathBuf};
use serde_json;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use error::{ErrorKind, Result, ResultExt};
use serde_json;
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt;
use std::io;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::result;
use std::sync::RwLockWriteGuard;
use std::sync::{Arc, RwLock};
use std::io;
use Directory;
use std::sync::{Arc, RwLock};
use std::collections::HashSet;
use std::sync::RwLockWriteGuard;
use std::io::Write;
use core::MANAGED_FILEPATH;
use std::collections::HashMap;
use std::fmt;
use error::{ErrorKind, Result, ResultExt};

/// Wrapper of directories that keeps track of files created by Tantivy.
///
@@ -86,7 +86,7 @@ impl ManagedDirectory {
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
Ok(ManagedDirectory {
directory: Box::new(directory),
directory: box directory,
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
protected_files: HashMap::default(),
@@ -94,7 +94,7 @@ impl ManagedDirectory {
})
}
Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
directory: Box::new(directory),
directory: box directory,
meta_informations: Arc::default(),
}),
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
@@ -117,8 +117,7 @@ impl ManagedDirectory {
let mut files_to_delete = vec![];
{
// releasing the lock as .delete() will use it too.
let meta_informations_rlock = self
.meta_informations
let meta_informations_rlock = self.meta_informations
.read()
.expect("Managed directory rlock poisoned in garbage collect.");

@@ -171,8 +170,7 @@ impl ManagedDirectory {
if !deleted_files.is_empty() {
// update the list of managed files by removing
// the file that were removed.
let mut meta_informations_wlock = self
.meta_informations
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed directory wlock poisoned (2).");
{
@@ -195,8 +193,7 @@ impl ManagedDirectory {
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
let pathbuf = path.to_owned();
{
let mut meta_informations_wlock = self
.meta_informations
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned on protect");
*meta_informations_wlock
@@ -218,8 +215,7 @@ impl ManagedDirectory {
/// will not lead to garbage files that will
/// never get removed.
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
let mut meta_wlock = self
.meta_informations
let mut meta_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned");
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
@@ -252,8 +248,7 @@ impl Directory for ManagedDirectory {

fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
{
let metas_rlock = self
.meta_informations
let metas_rlock = self.meta_informations
.read()
.expect("poisoned lock in managed directory meta");
if let Some(counter) = metas_rlock.protected_files.get(path) {
@@ -270,7 +265,7 @@ impl Directory for ManagedDirectory {
}

fn box_clone(&self) -> Box<Directory> {
Box::new(self.clone())
box self.clone()
}
}

@@ -287,10 +282,10 @@ impl Clone for ManagedDirectory {
mod tests {

use super::*;
#[cfg(feature = "mmap")]
#[cfg(feature="mmap")]
use directory::MmapDirectory;
use std::io::Write;
use std::path::Path;
use std::io::Write;
use tempdir::TempDir;

lazy_static! {
@@ -299,7 +294,7 @@ mod tests {
}

#[test]
#[cfg(feature = "mmap")]
#[cfg(feature="mmap")]
fn test_managed_directory() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
@@ -348,7 +343,7 @@ mod tests {
}

#[test]
#[cfg(feature = "mmap ")]
#[cfg(feature="mmap ")]
fn test_managed_directory_gc_while_mmapped() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
@@ -378,7 +373,7 @@ mod tests {
}

#[test]
#[cfg(feature = "mmap")]
#[cfg(feature="mmap")]
fn test_managed_directory_protect() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());

@@ -1,17 +1,17 @@
use atomicwrites;
use common::make_io_err;
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use directory::shared_vec_slice::SharedVecSlice;
use directory::Directory;
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use directory::ReadOnlySource;
use directory::shared_vec_slice::SharedVecSlice;
use directory::WritePtr;
use fst::raw::MmapReadOnly;
use std::collections::hash_map::Entry as HashMapEntry;
use std::collections::HashMap;
use std::convert::From;
use std::fmt;
use std::fs::OpenOptions;
use std::fs::{self, File};
use std::fs::OpenOptions;
use std::io::{self, Seek, SeekFrom};
use std::io::{BufWriter, Read, Write};
use std::path::{Path, PathBuf};
@@ -32,8 +32,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
}
})?;

let meta_data = file
.metadata()
let meta_data = file.metadata()
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
if meta_data.len() == 0 {
// if the file size is 0, it will not be possible
@@ -41,11 +40,9 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
// instead.
return Ok(None);
}
unsafe {
MmapReadOnly::open(&file)
.map(Some)
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
}
MmapReadOnly::open(&file)
.map(Some)
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
}

#[derive(Default, Clone, Debug, Serialize, Deserialize)]
@@ -310,8 +307,7 @@ impl Directory for MmapDirectory {
// when the last reference is gone.
mmap_cache.cache.remove(&full_path);
match fs::remove_file(&full_path) {
Ok(_) => self
.sync_directory()
Ok(_) => self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {

@@ -4,29 +4,32 @@ WORM directory abstraction.

*/

#[cfg(feature = "mmap")]
#[cfg(feature="mmap")]
mod mmap_directory;

mod directory;
mod managed_directory;
mod ram_directory;
mod directory;
mod read_only_source;
mod shared_vec_slice;
mod managed_directory;
mod static_directory;

/// Errors specific to the directory module.
pub mod error;

use std::io::{BufWriter, Seek, Write};

pub use self::static_directory::StaticDirectory;
pub use self::static_directory::write_static_from_directory;
pub use self::read_only_source::ReadOnlySource;
pub use self::directory::Directory;
pub use self::ram_directory::RAMDirectory;
pub use self::read_only_source::ReadOnlySource;

#[cfg(feature = "mmap")]
#[cfg(feature="mmap")]
pub use self::mmap_directory::MmapDirectory;

pub(crate) use self::managed_directory::{FileProtection, ManagedDirectory};
pub(crate) use self::read_only_source::SourceRead;
pub(crate) use self::managed_directory::{FileProtection, ManagedDirectory};

/// Synonym of Seek + Write
pub trait SeekableWrite: Seek + Write {}
@@ -42,8 +45,8 @@ pub type WritePtr = BufWriter<Box<SeekableWrite>>;
mod tests {

use super::*;
use std::io::{Seek, SeekFrom, Write};
use std::path::Path;
use std::io::{Seek, SeekFrom, Write};

lazy_static! {
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
@@ -56,7 +59,7 @@ mod tests {
}

#[test]
#[cfg(feature = "mmap")]
#[cfg(feature="mmap")]
fn test_mmap_directory() {
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
test_directory(&mut mmap_directory);

@@ -1,14 +1,14 @@
use super::shared_vec_slice::SharedVecSlice;
use common::make_io_err;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::WritePtr;
use directory::{Directory, ReadOnlySource};
use std::collections::HashMap;
use std::fmt;
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::result;
use std::sync::{Arc, RwLock};
use common::make_io_err;
use directory::{Directory, ReadOnlySource};
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::WritePtr;
use super::shared_vec_slice::SharedVecSlice;

/// Writer associated with the `RAMDirectory`
///
@@ -170,8 +170,7 @@ impl Directory for RAMDirectory {
let path_buf = PathBuf::from(path);
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());

let exists = self
.fs
let exists = self.fs
.write(path_buf.clone(), &Vec::new())
.map_err(|err| IOError::with_path(path.to_owned(), err))?;


@@ -1,11 +1,13 @@
#[cfg(feature="mmap")]
use fst::raw::MmapReadOnly;
use std::ops::Deref;
use super::shared_vec_slice::SharedVecSlice;
use common::HasLen;
#[cfg(feature = "mmap")]
use fst::raw::MmapReadOnly;
use stable_deref_trait::{CloneStableDeref, StableDeref};
use std::io::{self, Read};
use std::ops::Deref;
use std::slice;
use std::io::{self, Read};
use stable_deref_trait::{CloneStableDeref, StableDeref};

const EMPTY_SLICE: [u8; 0] = [];

/// Read object that represents files in tantivy.
///
@@ -15,10 +17,12 @@ use std::slice;
/// hold by this object should never be altered or destroyed.
pub enum ReadOnlySource {
/// Mmap source of data
#[cfg(feature = "mmap")]
#[cfg(feature="mmap")]
Mmap(MmapReadOnly),
/// Wrapping a `Vec<u8>`
Anonymous(SharedVecSlice),
/// Wrapping a static slice
Static(&'static [u8])
}

unsafe impl StableDeref for ReadOnlySource {}
@@ -35,15 +39,16 @@ impl Deref for ReadOnlySource {
impl ReadOnlySource {
/// Creates an empty ReadOnlySource
pub fn empty() -> ReadOnlySource {
ReadOnlySource::Anonymous(SharedVecSlice::empty())
ReadOnlySource::Static(&EMPTY_SLICE)
}

/// Returns the data underlying the ReadOnlySource object.
pub fn as_slice(&self) -> &[u8] {
match *self {
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
#[cfg(feature="mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
ReadOnlySource::Static(data) => data,
}
}

@@ -66,14 +71,9 @@ impl ReadOnlySource {
/// 1KB slice is remaining, the whole `500MBs`
/// are retained in memory.
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
assert!(
from_offset <= to_offset,
"Requested negative slice [{}..{}]",
from_offset,
to_offset
);
assert!(from_offset <= to_offset, "Requested negative slice [{}..{}]", from_offset, to_offset);
match *self {
#[cfg(feature = "mmap")]
#[cfg(feature="mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => {
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
ReadOnlySource::Mmap(sliced_mmap)
@@ -81,6 +81,9 @@ impl ReadOnlySource {
ReadOnlySource::Anonymous(ref shared_vec) => {
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
}
ReadOnlySource::Static(data) => {
ReadOnlySource::Static(&data[from_offset..to_offset])
}
}
}

@@ -121,6 +124,12 @@ impl From<Vec<u8>> for ReadOnlySource {
}
}

impl From<&'static [u8]> for ReadOnlySource {
fn from(data: &'static [u8]) -> ReadOnlySource {
ReadOnlySource::Static(data)
}
}

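The new `Static` variant and the `From<&'static [u8]>` impl above let a `ReadOnlySource` wrap bytes baked into the binary. A minimal illustrative sketch (editorial, not part of the diff):

fn static_source_demo() {
    static DATA: [u8; 5] = [1, 2, 3, 4, 5];
    let source = ReadOnlySource::from(&DATA[..]);
    assert_eq!(source.as_slice(), &DATA[..]);
    // slice() keeps a window over the same static data.
    assert_eq!(source.slice(1, 4).as_slice(), &[2u8, 3, 4]);
}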
/// Acts as an owning cursor over the data backed by a `ReadOnlySource`
pub(crate) struct SourceRead {
_data_owner: ReadOnlySource,
@@ -135,11 +144,13 @@ impl SourceRead {

pub fn slice_from(&self, start: usize) -> &[u8] {
&self.cursor[start..]

}

pub fn get(&self, idx: usize) -> u8 {
self.cursor[idx]
}

}

impl AsRef<[u8]> for SourceRead {

123 src/directory/static_directory.rs Normal file
@@ -0,0 +1,123 @@
use std::collections::HashMap;
use Directory;
use std::path::PathBuf;
use directory::ReadOnlySource;
use std::io::BufWriter;
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use std::path::Path;
use std::fmt::{Formatter, Debug, self};
use Result as TantivyResult;
use directory::SeekableWrite;
use std::io;
use std::fs;
use common::Endianness;
use common::BinarySerializable;
use common::VInt;
use byteorder::ByteOrder;
use std::str;
use std::fs::File;
use std::io::{Read, Write};
use std::ffi::OsString;

#[derive(Clone)]
pub struct StaticDirectory {
files: HashMap<PathBuf, &'static [u8]>,
}

impl Debug for StaticDirectory {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
write!(f, "StaticDirectory[{} files]", self.files.len())?;
Ok(())
}
}

impl StaticDirectory {
pub fn open(mut data: &'static [u8]) -> TantivyResult<StaticDirectory> {
assert!(data.len() > 8);
let footer_len_offset = data.len() - 8;
let body_len = Endianness::read_u64(&data[footer_len_offset..]) as usize;
let mut body = &data[..body_len];
let mut footer = &data[body_len..footer_len_offset];
let num_files = VInt::deserialize(&mut footer)?.0 as usize;
let mut files = HashMap::new();
for _ in 0..num_files {
let filename_len = VInt::deserialize(&mut footer)?.0 as usize;
let filename = &footer[..filename_len];
footer = &footer[filename_len..];
let data_len = VInt::deserialize(&mut footer)?.0 as usize;
let file_data = &body[..data_len];
body = &body[data_len..];
let filename_str = str::from_utf8(filename).expect("Invalid UTF8");
let filename = PathBuf::from(filename_str);
println!("{:?} {:?}", filename, data_len);
files.insert(filename, file_data);
}
Ok(StaticDirectory {
files
})
}
}

impl Directory for StaticDirectory {
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
if let Some(static_data) = self.files.get(path) {
Ok(ReadOnlySource::from(*static_data))
} else {
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
}
}

fn delete(&self, path: &Path) -> Result<(), DeleteError> {
unimplemented!("Static directory is read-only !")
}

fn exists(&self, path: &Path) -> bool {
self.files.contains_key(path)
}

fn open_write(&mut self, path: &Path) -> Result<BufWriter<Box<SeekableWrite>>, OpenWriteError> {
unimplemented!("Static directory is read-only !")
}

fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
if let Some(static_data) = self.files.get(path) {
Ok(static_data.to_vec())
} else {
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
}
}

fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
unimplemented!("Static directory is read-only !")
}

fn box_clone(&self) -> Box<Directory> {
box self.clone()
}
}

pub fn write_static_from_directory(directory_path: &Path) -> TantivyResult<Vec<u8>> {
assert!(directory_path.is_dir());
let mut file_data: Vec<(OsString, usize)> = Vec::new();
let mut write: Vec<u8> = Vec::new();
for entry in fs::read_dir(directory_path)? {
let entry = entry?;
let path = entry.path();
if path.is_file() {
info!("Appending {}", path.to_string_lossy());
let mut open_file = File::open(&path)?;
let file_len = open_file.read_to_end(&mut write)?;
file_data.push((entry.file_name(), file_len));
}
}
// write footer
let body_len = write.len();
VInt(file_data.len() as u64).serialize(&mut write)?;
for (filename, filelen) in file_data {
VInt(filename.len() as u64).serialize(&mut write)?;
write.write_all(filename.to_string_lossy().as_bytes())?;
VInt(filelen as u64).serialize(&mut write)?;
}
(body_len as u64).serialize(&mut write)?;
Ok(write)
}
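For reference, the byte layout produced by `write_static_from_directory` and parsed back by `StaticDirectory::open` (reconstructed from the code above; editorial sketch, the round-trip paths are hypothetical):

// [ file 1 bytes ][ file 2 bytes ] ... [ file N bytes ]                  <- body
// [ VInt(N) ]
//   [ VInt(name_i.len()) ][ name_i bytes ][ VInt(file_i.len()) ]  x N    <- footer
// [ body_len as u64 ]                                                    <- last 8 bytes
//
// Illustrative round trip:
// let bytes: Vec<u8> = write_static_from_directory(Path::new("my_index"))?;
// // ship `bytes` inside the binary, e.g. via include_bytes!, then:
// let dir = StaticDirectory::open(include_bytes!("my_index.bin"))?;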
@@ -1,8 +1,8 @@
use common::BitSet;
use DocId;
use std::borrow::Borrow;
use std::borrow::BorrowMut;
use std::cmp::Ordering;
use DocId;
use common::BitSet;

/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
#[derive(PartialEq, Eq, Debug)]

18 src/error.rs
@@ -2,13 +2,13 @@

use std::io;

use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use fastfield::FastFieldNotAvailableError;
use query;
use schema;
use serde_json;
use std::path::PathBuf;
use std::sync::PoisonError;
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use query;
use schema;
use fastfield::FastFieldNotAvailableError;
use serde_json;

error_chain!(
errors {
@@ -48,10 +48,10 @@ error_chain!(
description("an error occurred in a thread")
display("an error occurred in a thread: '{}'", err)
}
/// An Error appeared related to the schema.
SchemaError(message: String) {
description("the schema is not matching expectations.")
display("Schema error: '{}'", message)
/// An Error appeared related to the lack of a field.
SchemaError(field: String) {
description("a schema field is missing")
display("a schema field is missing: '{}'", field)
}
/// Tried to access a fastfield reader for a field not configured accordingly.
FastFieldError(err: FastFieldNotAvailableError) {

@@ -1,38 +0,0 @@
mod reader;
mod writer;

pub use self::reader::BytesFastFieldReader;
pub use self::writer::BytesFastFieldWriter;

#[cfg(test)]
mod tests {
use schema::SchemaBuilder;
use Index;

#[test]
fn test_bytes() {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_bytes_field("bytesfield");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(field=>vec![0u8, 1, 2, 3]));
index_writer.add_document(doc!(field=>vec![]));
index_writer.add_document(doc!(field=>vec![255u8]));
index_writer.add_document(doc!(field=>vec![1u8, 3, 5, 7, 9]));
index_writer.add_document(doc!(field=>vec![0u8; 1000]));
assert!(index_writer.commit().is_ok());

index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let bytes_reader = reader.bytes_fast_field_reader(field).unwrap();

assert_eq!(bytes_reader.get_val(0), &[0u8, 1, 2, 3]);
assert!(bytes_reader.get_val(1).is_empty());
assert_eq!(bytes_reader.get_val(2), &[255u8]);
assert_eq!(bytes_reader.get_val(3), &[1u8, 3, 5, 7, 9]);
let long = vec![0u8; 1000];
assert_eq!(bytes_reader.get_val(4), long.as_slice());
}
}
@@ -1,37 +0,0 @@
use owning_ref::OwningRef;

use directory::ReadOnlySource;
use fastfield::FastFieldReader;
use DocId;

/// Reader for byte array fast fields
///
/// The reader is implemented as a `u64` fast field and a separate collection of bytes.
///
/// The `vals_reader` will access the concatenated list of all values for all documents.
///
/// The `idx_reader` associates, for each document, the index of its first value.
///
/// Reading the value for a document is done by reading the start index for it,
/// and the start index for the next document, and keeping the bytes in between.
pub struct BytesFastFieldReader {
idx_reader: FastFieldReader<u64>,
values: OwningRef<ReadOnlySource, [u8]>,
}

impl BytesFastFieldReader {
pub(crate) fn open(
idx_reader: FastFieldReader<u64>,
values_source: ReadOnlySource,
) -> BytesFastFieldReader {
let values = OwningRef::new(values_source).map(|source| &source[..]);
BytesFastFieldReader { idx_reader, values }
}

/// Returns the bytes associated to the given `doc`
pub fn get_val(&self, doc: DocId) -> &[u8] {
let start = self.idx_reader.get(doc) as usize;
let stop = self.idx_reader.get(doc + 1) as usize;
&self.values[start..stop]
}
}
@@ -1,96 +0,0 @@
use std::io;

use fastfield::serializer::FastFieldSerializer;
use schema::{Document, Field, Value};
use DocId;

/// Writer for byte array (as in, any number of bytes per document) fast fields
///
/// This `BytesFastFieldWriter` is only useful for advanced users.
/// The normal way to get your associated bytes in your index
/// is to
/// - declare your field with fast set to `Cardinality::SingleValue`
/// in your schema
/// - add your document simply by calling `.add_document(...)` with associating bytes to the field.
///
/// The `BytesFastFieldWriter` can be acquired from the
/// fast field writer by calling
/// [`.get_bytes_writer(...)`](./struct.FastFieldsWriter.html#method.get_bytes_writer).
///
/// Once acquired, writing is done by calling `.add_document_val(&[u8])`
/// once per document, even if there are no bytes associated to it.
pub struct BytesFastFieldWriter {
field: Field,
vals: Vec<u8>,
doc_index: Vec<u64>,
}

impl BytesFastFieldWriter {
/// Creates a new `BytesFastFieldWriter`
pub fn new(field: Field) -> Self {
BytesFastFieldWriter {
field,
vals: Vec::new(),
doc_index: Vec::new(),
}
}

/// Access the field associated to the `BytesFastFieldWriter`
pub fn field(&self) -> Field {
self.field
}

/// Finalize the current document.
pub(crate) fn next_doc(&mut self) {
self.doc_index.push(self.vals.len() as u64);
}

/// Shift to the next document and add all of the
/// matching field values present in the document.
pub fn add_document(&mut self, doc: &Document) {
self.next_doc();
for field_value in doc.field_values() {
if field_value.field() == self.field {
if let &Value::Bytes(ref bytes) = field_value.value() {
self.vals.extend_from_slice(bytes);
} else {
panic!(
"Bytes field contained non-Bytes Value!. Field {:?} = {:?}",
self.field, field_value
);
}
}
}
}

/// Register the bytes associated to a document.
///
/// The method returns the `DocId` of the document that was
/// just written.
pub fn add_document_val(&mut self, val: &[u8]) -> DocId {
let doc = self.doc_index.len() as DocId;
self.next_doc();
self.vals.extend_from_slice(val);
doc
}

/// Serializes the fast field values by pushing them to the `FastFieldSerializer`.
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
{
// writing the offset index
let mut doc_index_serializer =
serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
for &offset in &self.doc_index {
doc_index_serializer.add_val(offset)?;
}
doc_index_serializer.add_val(self.vals.len() as u64)?;
doc_index_serializer.close_field()?;
}
{
// writing the values themselves
let mut value_serializer = serializer.new_bytes_fast_field_with_idx(self.field, 1)?;
value_serializer.write_all(&self.vals)?;
}
Ok(())
}
}
@@ -1,10 +1,10 @@
use bit_set::BitSet;
use common::HasLen;
use directory::ReadOnlySource;
use directory::WritePtr;
use std::io;
use std::io::Write;
use std::io;
use directory::ReadOnlySource;
use DocId;
use common::HasLen;

/// Write a delete `BitSet`
///
@@ -41,8 +41,7 @@ pub struct DeleteBitSet {
impl DeleteBitSet {
/// Opens a delete bitset given its data source.
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
let num_deleted: usize = data
.as_slice()
let num_deleted: usize = data.as_slice()
.iter()
.map(|b| b.count_ones() as usize)
.sum();
@@ -63,8 +62,10 @@ impl DeleteBitSet {
b & (1u8 << shift) != 0
}
}

}


impl HasLen for DeleteBitSet {
fn len(&self) -> usize {
self.len
@@ -73,10 +74,10 @@ impl HasLen for DeleteBitSet {

#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
use bit_set::BitSet;
use directory::*;
use std::path::PathBuf;
use super::*;

fn test_delete_bitset_helper(bitset: &BitSet) {
let test_path = PathBuf::from("test");

@@ -1,5 +1,5 @@
use schema::FieldEntry;
use std::result;
use schema::FieldEntry;

/// `FastFieldNotAvailableError` is returned when the
/// user requested for a fast field reader, and the field was not

@@ -1,8 +1,8 @@
use super::MultiValueIntFastFieldReader;
use schema::Facet;
use termdict::TermDictionary;
use termdict::TermOrdinal;
use DocId;
use termdict::TermOrdinal;
use schema::Facet;
use termdict::{TermDictionary, TermDictionaryImpl};

/// The facet reader makes it possible to access the list of
/// facets associated to a given document in a specific
@@ -19,7 +19,7 @@ use DocId;
/// only makes sense for a given segment.
pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionary,
term_dict: TermDictionaryImpl,
}

impl FacetReader {
@@ -28,11 +28,11 @@ impl FacetReader {
/// A facet reader just wraps :
/// - a `MultiValueIntFastFieldReader` that makes it possible to
/// access the list of facet ords for a given document.
/// - a `TermDictionary` that helps associating a facet to
/// - a `TermDictionaryImpl` that helps associating a facet to
/// an ordinal and vice versa.
pub fn new(
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionary,
term_dict: TermDictionaryImpl,
) -> FacetReader {
FacetReader {
term_ords,
@@ -50,14 +50,13 @@ impl FacetReader {
}

/// Accessor for the facet term dictionary.
pub fn facet_dict(&self) -> &TermDictionary {
pub fn facet_dict(&self) -> &TermDictionaryImpl {
&self.term_dict
}

/// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
let found_term = self
.term_dict
let found_term = self.term_dict
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
assert!(found_term, "Term ordinal {} no found.", facet_ord);
}

@@ -23,28 +23,26 @@ values stored.
Read access performance is comparable to that of an array lookup.
*/

pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::delete::write_delete_bitset;
pub use self::delete::DeleteBitSet;
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
pub use self::reader::FastFieldReader;
pub use self::serializer::FastFieldSerializer;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use common;
use schema::Cardinality;
use schema::FieldType;
use schema::Value;
pub use self::delete::DeleteBitSet;
pub use self::delete::write_delete_bitset;
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::MultiValueIntFastFieldReader;
pub use self::reader::FastFieldReader;
pub use self::serializer::FastFieldSerializer;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};

mod bytes;
mod delete;
mod reader;
mod writer;
mod serializer;
mod error;
mod delete;
mod facet_reader;
mod multivalued;
mod reader;
mod serializer;
mod writer;

/// Trait for types that are allowed for fast fields: (u64 or i64).
pub trait FastValue: Default + Clone + Copy {
@@ -123,27 +121,31 @@ fn value_to_u64(value: &Value) -> u64 {
#[cfg(test)]
mod tests {

use super::*;
use common::CompositeFile;
use directory::{Directory, RAMDirectory, WritePtr};
use fastfield::FastFieldReader;
use rand::Rng;
use rand::SeedableRng;
use rand::XorShiftRng;
use schema::Document;
use schema::Field;
use schema::FAST;
use schema::{Schema, SchemaBuilder};
use schema::Document;
use schema::FAST;
use schema::Field;
use std::collections::HashMap;
use std::path::Path;
use super::*;
use test;
use test::Bencher;

lazy_static! {
pub static ref SCHEMA: Schema = {
static ref SCHEMA: Schema = {
let mut schema_builder = SchemaBuilder::default();
schema_builder.add_u64_field("field", FAST);
schema_builder.build()
};
pub static ref FIELD: Field = { SCHEMA.get_field("field").unwrap() };
static ref FIELD: Field = {
SCHEMA.get_field("field").unwrap()
};
}

#[test]
@@ -367,7 +369,7 @@ mod tests {
}
}

pub fn generate_permutation() -> Vec<u64> {
fn generate_permutation() -> Vec<u64> {
let seed: &[u32; 4] = &[1, 2, 3, 4];
let mut rng = XorShiftRng::from_seed(*seed);
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
@@ -407,27 +409,13 @@ mod tests {
}
}

}

#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::tests::FIELD;
use super::tests::{generate_permutation, SCHEMA};
use super::*;
use common::CompositeFile;
use directory::{Directory, RAMDirectory, WritePtr};
use fastfield::FastFieldReader;
use std::collections::HashMap;
use std::path::Path;
use test::{self, Bencher};

#[bench]
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n / 7).map(|v| v * 7) {
for i in Iterator::step_by(0u32..n, 7) {
a ^= permutation[i as usize];
}
a
@@ -473,7 +461,7 @@ mod bench {
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n / 7).map(|val| val * 7) {
for i in Iterator::step_by(0u32..n, 7) {
a ^= fast_field_reader.get(i);
}
a
@@ -514,5 +502,4 @@ mod bench {
});
}
}

}

@@ -1,15 +1,15 @@
mod reader;
mod writer;
mod reader;

pub use self::reader::MultiValueIntFastFieldReader;
pub use self::writer::MultiValueIntFastFieldWriter;
pub use self::reader::MultiValueIntFastFieldReader;

#[cfg(test)]
mod tests {

use schema::SchemaBuilder;
use schema::Cardinality;
use schema::IntOptions;
use schema::SchemaBuilder;
use Index;

#[test]

@@ -1,5 +1,5 @@
use fastfield::{FastFieldReader, FastValue};
use DocId;
use fastfield::{FastFieldReader, FastValue};

/// Reader for a multivalued `u64` fast field.
///
@@ -26,20 +26,13 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
}
}

/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
fn range(&self, doc: DocId) -> (u64, u64) {
let start = self.idx_reader.get(doc);
let stop = self.idx_reader.get(doc + 1);
(start, stop)
}

/// Returns the array of values associated to the given `doc`.
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let (start, stop) = self.range(doc);
let start = self.idx_reader.get(doc) as u32;
let stop = self.idx_reader.get(doc + 1) as u32;
let len = (stop - start) as usize;
vals.resize(len, Item::default());
self.vals_reader.get_range(start as u32, &mut vals[..]);
self.vals_reader.get_range(start, &mut vals[..]);
}
}


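The reader above resolves a document's values through two fast fields: `idx_reader` stores, for each doc, the start offset into the flat `vals_reader`. A hand-worked example (editorial sketch with illustrative numbers, not part of the diff):

// Suppose three documents hold the multi-values [7, 9], [], and [4]:
//
//   vals_reader (flat)  : [7, 9, 4]
//   idx_reader (offsets): [0, 2, 2, 3]   // one extra entry past the last doc
//
// get_vals(1, &mut out): start = idx[1] = 2, stop = idx[2] = 2, so out is resized to 0;
// get_vals(0, &mut out): start = 0, stop = 2, so vals[0..2] = [7, 9] is copied out.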
@@ -1,35 +1,12 @@
use fastfield::FastFieldSerializer;
use fastfield::serializer::FastSingleFieldSerializer;
use fastfield::value_to_u64;
use fastfield::FastFieldSerializer;
use itertools::Itertools;
use std::collections::HashMap;
use postings::UnorderedTermId;
use schema::{Document, Field};
use std::collections::HashMap;
use std::io;
use termdict::TermOrdinal;
use DocId;
use itertools::Itertools;

/// Writer for multi-valued (as in, more than one value per document)
/// int fast field.
///
/// This `Writer` is only useful for advanced users.
/// The normal way to get your multivalued int in your index
/// is to
/// - declare your field with fast set to `Cardinality::MultiValues`
/// in your schema
/// - add your document simply by calling `.add_document(...)`.
///
/// The `MultiValueIntFastFieldWriter` can be acquired from the
/// fastfield writer, by calling [`.get_multivalue_writer(...)`](./struct.FastFieldsWriter.html#method.get_multivalue_writer).
///
/// Once acquired, writing is done by calling calls to
/// `.add_document_vals(&[u64])` once per document.
///
/// The serializer makes it possible to remap all of the values
/// that were pushed to the writer using a mapping.
/// This makes it possible to push unordered term ids,
/// during indexing and remap them to their respective
/// term ids when the segment is getting serialized.
pub struct MultiValueIntFastFieldWriter {
field: Field,
vals: Vec<u64>,
@@ -39,7 +16,7 @@ pub struct MultiValueIntFastFieldWriter {

impl MultiValueIntFastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub(crate) fn new(field: Field, is_facet: bool) -> Self {
pub fn new(field: Field, is_facet: bool) -> Self {
MultiValueIntFastFieldWriter {
field,
vals: Vec::new(),
@@ -48,26 +25,24 @@ impl MultiValueIntFastFieldWriter {
}
}

/// Access the field associated to the `MultiValueIntFastFieldWriter`
pub fn field(&self) -> Field {
self.field
}

/// Finalize the current document.
pub(crate) fn next_doc(&mut self) {
pub fn next_doc(&mut self) {
self.doc_index.push(self.vals.len() as u64);
}

/// Pushes a new value to the current document.
pub(crate) fn add_val(&mut self, val: UnorderedTermId) {
/// Records a new value.
///
/// The n-th value being recorded is implicitly
/// associated to the document with the `DocId` n.
|
||||
/// (Well, `n-1` actually because of 0-indexing)
|
||||
pub fn add_val(&mut self, val: UnorderedTermId) {
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
/// Shift to the next document and adds
|
||||
/// all of the matching field values present in the document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
self.next_doc();
|
||||
// facets are indexed in the `SegmentWriter` as we encode their unordered id.
|
||||
if !self.is_facet {
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field() == self.field {
|
||||
@@ -77,17 +52,6 @@ impl MultiValueIntFastFieldWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// Register all of the values associated to a document.
|
||||
///
|
||||
/// The method returns the `DocId` of the document that was
|
||||
/// just written.
|
||||
pub fn add_document_vals(&mut self, vals: &[UnorderedTermId]) -> DocId {
|
||||
let doc = self.doc_index.len() as DocId;
|
||||
self.next_doc();
|
||||
self.vals.extend_from_slice(vals);
|
||||
doc
|
||||
}
|
||||
|
||||
/// Serializes fast field values by pushing them to the `FastFieldSerializer`.
|
||||
///
|
||||
/// HashMap makes it possible to remap them before serializing.
|
||||
@@ -102,7 +66,7 @@ impl MultiValueIntFastFieldWriter {
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
mapping_opt: Option<&HashMap<UnorderedTermId, TermOrdinal>>,
|
||||
mapping_opt: Option<&HashMap<UnorderedTermId, usize>>,
|
||||
) -> io::Result<()> {
|
||||
{
|
||||
// writing the offset index
|
||||
@@ -126,13 +90,13 @@ impl MultiValueIntFastFieldWriter {
|
||||
1,
|
||||
)?;
|
||||
for val in &self.vals {
|
||||
let remapped_val = *mapping.get(val).expect("Missing term ordinal");
|
||||
let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64;
|
||||
value_serializer.add_val(remapped_val)?;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let val_min_max = self.vals.iter().cloned().minmax();
|
||||
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0u64));
|
||||
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
for &val in &self.vals {
|
||||
|
||||
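The remapping step can be sketched independently of the serializer. The plain `u64` ids and the `HashMap<u64, u64>` below are simplifications of `UnorderedTermId` and `TermOrdinal`, and the function is illustrative, not tantivy's code.

```rust
use std::collections::HashMap;

// Values recorded during indexing are unordered term ids; at serialization
// time each one is looked up in the mapping to obtain its sorted term ordinal.
fn remap_vals(vals: &[u64], mapping: Option<&HashMap<u64, u64>>) -> Vec<u64> {
    match mapping {
        Some(mapping) => vals
            .iter()
            .map(|val| *mapping.get(val).expect("Missing term ordinal"))
            .collect(),
        // Without a mapping (e.g. plain integer fast fields), values are kept as-is.
        None => vals.to_vec(),
    }
}

fn main() {
    // Unordered ids 10 and 42 map to term ordinals 1 and 0 respectively.
    let mapping: HashMap<u64, u64> = vec![(10, 1), (42, 0)].into_iter().collect();
    assert_eq!(remap_vals(&[42, 10, 42], Some(&mapping)), vec![0, 1, 0]);
    assert_eq!(remap_vals(&[3, 5], None), vec![3, 5]);
}
```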
@@ -1,19 +1,19 @@
|
||||
use super::FastValue;
|
||||
use common::bitpacker::BitUnpacker;
|
||||
use common::compute_num_bits;
|
||||
use common::BinarySerializable;
|
||||
use common::bitpacker::BitUnpacker;
|
||||
use common::CompositeFile;
|
||||
use directory::ReadOnlySource;
|
||||
use common::compute_num_bits;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
use fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use owning_ref::OwningRef;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::FAST;
|
||||
use schema::SchemaBuilder;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::mem;
|
||||
use std::path::Path;
|
||||
use DocId;
|
||||
use super::FastValue;
|
||||
|
||||
/// Trait for accessing a fastfield.
|
||||
///
|
||||
@@ -67,20 +67,12 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// associated with the `DocId` going from
|
||||
/// `start` to `start + output.len()`.
|
||||
///
|
||||
/// Regardless of the type of `Item`, this method works by
|
||||
/// - transmuting the output array
|
||||
/// - extracting the `Item`s as if they were `u64`
|
||||
/// - possibly converting the `u64` value to the right type.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
///
|
||||
// TODO change start to `u64`.
|
||||
// For multifastfield, start is an index in a second fastfield, not a `DocId`
|
||||
pub fn get_range(&self, start: u32, output: &mut [Item]) {
|
||||
let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; // ok: Item is either `u64` or `i64`
|
||||
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
|
||||
self.bit_unpacker.get_range(start, output_u64);
|
||||
for out in output_u64.iter_mut() {
|
||||
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
|
||||
|
||||
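The decoding step of `get_range` boils down to adding `min_value` back to each stored delta. The sketch below elides the bit-unpacking and stands a plain `Vec<u64>` in for the packed data; it is not the real reader.

```rust
// The fast field stores `val - min_value`; reading back a range therefore
// fetches the packed deltas and adds `min_value` to each slot.
struct FastFieldSketch {
    min_value: u64,
    deltas: Vec<u64>, // what the bitpacked data conceptually contains
}

impl FastFieldSketch {
    fn get_range(&self, start: u32, output: &mut [u64]) {
        let start = start as usize;
        output.copy_from_slice(&self.deltas[start..start + output.len()]);
        for out in output.iter_mut() {
            *out += self.min_value;
        }
    }
}

fn main() {
    let reader = FastFieldSketch { min_value: 100, deltas: vec![0, 3, 7, 2] };
    let mut out = [0u64; 3];
    reader.get_range(1, &mut out);
    assert_eq!(out, [103, 107, 102]);
}
```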
@@ -1,10 +1,10 @@
|
||||
use common::bitpacker::BitPacker;
|
||||
use common::compute_num_bits;
|
||||
use common::BinarySerializable;
|
||||
use common::CompositeWrite;
|
||||
use common::CountingWriter;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use common::bitpacker::BitPacker;
|
||||
use common::compute_num_bits;
|
||||
use common::CountingWriter;
|
||||
use common::CompositeWrite;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// `FastFieldSerializer` is in charge of serializing
|
||||
@@ -61,16 +61,6 @@ impl FastFieldSerializer {
|
||||
FastSingleFieldSerializer::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
/// Start serializing a new [u8] fast field
|
||||
pub fn new_bytes_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> io::Result<FastBytesFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
FastBytesFieldSerializer::open(field_write)
|
||||
}
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently saved on disk.
|
||||
@@ -87,20 +77,11 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// the minimum number of bits required to encode
|
||||
/// values.
|
||||
fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
min_value.serialize(write)?;
|
||||
let amplitude = max_value - min_value;
|
||||
amplitude.serialize(write)?;
|
||||
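On the writing side, the bit width per value follows from the amplitude `max_value - min_value`. The `compute_num_bits` below is an assumed stand-in for the helper in `common`, so the exact widths are illustrative only.

```rust
// Values are stored as `val - min_value`, using just enough bits to represent
// the amplitude `max_value - min_value`.
fn compute_num_bits(amplitude: u64) -> u8 {
    (64 - amplitude.leading_zeros()) as u8
}

fn main() {
    let (min_value, max_value) = (1_000u64, 1_130u64);
    let amplitude = max_value - min_value; // 130
    let num_bits = compute_num_bits(amplitude);
    assert_eq!(num_bits, 8); // 130 fits in 8 bits
    // Each value then costs `num_bits` bits instead of 64.
    let val = 1_042u64;
    let encoded = val - min_value;
    assert!(encoded < (1u64 << num_bits));
}
```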
@@ -126,21 +107,3 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
self.bit_packer.close(&mut self.write)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastBytesFieldSerializer<'a, W: Write + 'a> {
|
||||
write: &'a mut W,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
|
||||
fn open(write: &'a mut W) -> io::Result<FastBytesFieldSerializer<'a, W>> {
|
||||
Ok(FastBytesFieldSerializer { write })
|
||||
}
|
||||
|
||||
pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> {
|
||||
self.write.write_all(vals)
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) -> io::Result<()> {
|
||||
self.write.flush()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,19 +1,18 @@
|
||||
use super::multivalued::MultiValueIntFastFieldWriter;
|
||||
use common;
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use fastfield::{BytesFastFieldWriter, FastFieldSerializer};
|
||||
use postings::UnorderedTermId;
|
||||
use schema::{Cardinality, Document, Field, FieldType, Schema};
|
||||
use std::collections::HashMap;
|
||||
use schema::{Cardinality, Document, Field, Schema};
|
||||
use fastfield::FastFieldSerializer;
|
||||
use std::io;
|
||||
use termdict::TermOrdinal;
|
||||
use schema::FieldType;
|
||||
use common;
|
||||
use common::VInt;
|
||||
use std::collections::HashMap;
|
||||
use postings::UnorderedTermId;
|
||||
use super::multivalued::MultiValueIntFastFieldWriter;
|
||||
use common::BinarySerializable;
|
||||
|
||||
/// The fastfieldswriter regroup all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
single_value_writers: Vec<IntFastFieldWriter>,
|
||||
multi_values_writers: Vec<MultiValueIntFastFieldWriter>,
|
||||
bytes_value_writers: Vec<BytesFastFieldWriter>,
|
||||
}
|
||||
|
||||
impl FastFieldsWriter {
|
||||
@@ -21,7 +20,6 @@ impl FastFieldsWriter {
|
||||
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
|
||||
let mut single_value_writers = Vec::new();
|
||||
let mut multi_values_writers = Vec::new();
|
||||
let mut bytes_value_writers = Vec::new();
|
||||
|
||||
for (field_id, field_entry) in schema.fields().iter().enumerate() {
|
||||
let field = Field(field_id as u32);
|
||||
@@ -49,17 +47,12 @@ impl FastFieldsWriter {
|
||||
let fast_field_writer = MultiValueIntFastFieldWriter::new(field, true);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
let fast_field_writer = BytesFastFieldWriter::new(field);
|
||||
bytes_value_writers.push(fast_field_writer);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
FastFieldsWriter {
|
||||
single_value_writers,
|
||||
multi_values_writers,
|
||||
bytes_value_writers,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -75,36 +68,24 @@ impl FastFieldsWriter {
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
/// configured as a multivalued fastfield in the schema.
|
||||
pub fn get_multivalue_writer(
|
||||
pub(crate) fn get_multivalue_writer(
|
||||
&mut self,
|
||||
field: Field,
|
||||
) -> Option<&mut MultiValueIntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
// TODO expose for users
|
||||
self.multi_values_writers
|
||||
.iter_mut()
|
||||
.find(|multivalue_writer| multivalue_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Returns the bytes fast field writer for the given field.
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
/// configured as a bytes fastfield in the schema.
|
||||
pub fn get_bytes_writer(&mut self, field: Field) -> Option<&mut BytesFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.bytes_value_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Indexes all of the fastfields of a new document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
for field_writer in &mut self.single_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.multi_values_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.bytes_value_writers {
|
||||
field_writer.next_doc();
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
}
|
||||
@@ -114,7 +95,7 @@ impl FastFieldsWriter {
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
mapping: &HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>,
|
||||
mapping: &HashMap<Field, HashMap<UnorderedTermId, usize>>,
|
||||
) -> io::Result<()> {
|
||||
for field_writer in &self.single_value_writers {
|
||||
field_writer.serialize(serializer)?;
|
||||
@@ -123,9 +104,6 @@ impl FastFieldsWriter {
|
||||
let field = field_writer.field();
|
||||
field_writer.serialize(serializer, mapping.get(&field))?;
|
||||
}
|
||||
for field_writer in &self.bytes_value_writers {
|
||||
field_writer.serialize(serializer)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
|
||||
#[inline(always)]
|
||||
pub fn id_to_fieldnorm(id: u8) -> u32 {
|
||||
FIELD_NORMS_TABLE[id as usize]
|
||||
}
|
||||
|
||||
|
||||
#[inline(always)]
|
||||
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
FIELD_NORMS_TABLE
|
||||
@@ -10,34 +12,45 @@ pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
.unwrap_or_else(|idx| idx - 1) as u8
|
||||
}
|
||||
|
||||
|
||||
pub const FIELD_NORMS_TABLE: [u32; 256] = [
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
|
||||
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54, 56, 60,
|
||||
64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144, 152, 168, 184, 200, 216, 232,
|
||||
248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984, 1048,
|
||||
1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120,
|
||||
4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336, 14360, 15384,
|
||||
16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984, 45080, 49176,
|
||||
53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480,
|
||||
163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472, 393240, 426008,
|
||||
458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064, 1048600,
|
||||
1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320, 2621464,
|
||||
2883608, 3145752, 3407896, 3670040, 3932184, 4194328, 4718616, 5242904, 5767192, 6291480,
|
||||
6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512, 14680088,
|
||||
15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152, 31457304,
|
||||
33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584, 67108888,
|
||||
75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968,
|
||||
167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912,
|
||||
335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800,
|
||||
671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576,
|
||||
1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54,
|
||||
56, 60, 64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144,
|
||||
152, 168, 184, 200, 216, 232, 248, 264, 280, 312, 344, 376, 408, 440, 472, 504,
|
||||
536, 600, 664, 728, 792, 856, 920, 984,
|
||||
1048, 1176, 1304, 1432, 1560, 1688, 1816, 1944,
|
||||
2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120,
|
||||
4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240,
|
||||
10264, 11288, 12312, 13336, 14360, 15384,
|
||||
16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744,
|
||||
32792, 36888, 40984, 45080, 49176, 53272, 57368, 61464,
|
||||
65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480,
|
||||
163864, 180248, 196632, 213016, 229400, 245784, 262168,
|
||||
294936, 327704, 360472, 393240, 426008, 458776,
|
||||
491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528,
|
||||
983064, 1048600, 1179672, 1310744, 1441816, 1572888, 1703960, 1835032,
|
||||
1966104, 2097176, 2359320, 2621464, 2883608, 3145752, 3407896, 3670040, 3932184,
|
||||
4194328, 4718616, 5242904, 5767192, 6291480, 6815768, 7340056, 7864344, 8388632, 9437208,
|
||||
10485784, 11534360, 12582936, 13631512, 14680088, 15728664, 16777240, 18874392, 20971544,
|
||||
23068696, 25165848, 27263000, 29360152, 31457304, 33554456, 37748760, 41943064,
|
||||
46137368, 50331672, 54525976, 58720280, 62914584, 67108888, 75497496, 83886104,
|
||||
92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968, 167772184,
|
||||
184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912, 335544344,
|
||||
369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800, 671088664,
|
||||
738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576, 1342177304,
|
||||
1476395032, 1610612760, 1744830488, 1879048216, 2013265944
|
||||
];
|
||||
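A toy version of this codec, with an 8-entry table standing in for the 256-entry `FIELD_NORMS_TABLE`, shows the round-down behaviour of the binary search; it is a sketch, not the shipped code.

```rust
// Decoding is a plain table lookup; encoding binary-searches the table and
// rounds *down* to the closest representable value.
const TOY_TABLE: [u32; 8] = [0, 1, 2, 4, 8, 16, 32, 64];

fn id_to_fieldnorm(id: u8) -> u32 {
    TOY_TABLE[id as usize]
}

fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
    TOY_TABLE
        .binary_search(&fieldnorm)
        .unwrap_or_else(|idx| idx - 1) as u8
}

fn main() {
    assert_eq!(fieldnorm_to_id(8), 4); // exact hit
    assert_eq!(fieldnorm_to_id(9), 4); // rounded down to 8
    assert_eq!(id_to_fieldnorm(fieldnorm_to_id(9)), 8);
    // The approximation never overestimates the fieldnorm.
    for norm in 0u32..100 {
        assert!(id_to_fieldnorm(fieldnorm_to_id(norm)) <= norm);
    }
}
```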
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::{fieldnorm_to_id, id_to_fieldnorm, FIELD_NORMS_TABLE};
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_decode_code() {
|
||||
assert_eq!(fieldnorm_to_id(0), 0);
|
||||
@@ -90,4 +103,4 @@ mod tests {
|
||||
assert_eq!(FIELD_NORMS_TABLE[i], decode_fieldnorm_byte(i as u8));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -17,12 +17,13 @@
|
||||
//!
|
||||
//! This trick is used by the [BM25 similarity]().
|
||||
mod code;
|
||||
mod reader;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
mod reader;
|
||||
|
||||
pub use self::reader::FieldNormReader;
|
||||
pub use self::serializer::FieldNormsSerializer;
|
||||
pub use self::writer::FieldNormsWriter;
|
||||
pub use self::serializer::FieldNormsSerializer;
|
||||
|
||||
use self::code::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use super::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
use super::{id_to_fieldnorm, fieldnorm_to_id};
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
|
||||
|
||||
/// Reads the fieldnorm associated to a document.
|
||||
/// The fieldnorm represents the length associated to
|
||||
/// a given Field of a given document.
|
||||
@@ -20,13 +21,16 @@ use DocId;
|
||||
/// precompute computationally expensive functions of the fieldnorm
|
||||
/// in a very short array.
|
||||
pub struct FieldNormReader {
|
||||
data: ReadOnlySource,
|
||||
data: ReadOnlySource
|
||||
}
|
||||
|
||||
impl FieldNormReader {
|
||||
|
||||
/// Opens a field norm reader given its data source.
|
||||
pub fn open(data: ReadOnlySource) -> Self {
|
||||
FieldNormReader { data }
|
||||
FieldNormReader {
|
||||
data
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the `fieldnorm` associated to a doc id.
|
||||
@@ -67,13 +71,12 @@ impl FieldNormReader {
|
||||
#[cfg(test)]
|
||||
impl From<Vec<u32>> for FieldNormReader {
|
||||
fn from(field_norms: Vec<u32>) -> FieldNormReader {
|
||||
let field_norms_id = field_norms
|
||||
.into_iter()
|
||||
let field_norms_id = field_norms.into_iter()
|
||||
.map(FieldNormReader::fieldnorm_to_id)
|
||||
.collect::<Vec<u8>>();
|
||||
let field_norms_data = ReadOnlySource::from(field_norms_id);
|
||||
FieldNormReader {
|
||||
data: field_norms_data,
|
||||
data: field_norms_data
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,24 +1,26 @@
|
||||
use common::CompositeWrite;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use std::io;
|
||||
use common::CompositeWrite;
|
||||
use schema::Field;
|
||||
use std::io::Write;
|
||||
|
||||
/// The fieldnorms serializer is in charge of
|
||||
/// the serialization of field norms for all fields.
|
||||
|
||||
pub struct FieldNormsSerializer {
|
||||
composite_write: CompositeWrite,
|
||||
}
|
||||
|
||||
impl FieldNormsSerializer {
|
||||
|
||||
/// Constructor
|
||||
pub fn from_write(write: WritePtr) -> io::Result<FieldNormsSerializer> {
|
||||
// just making room for the pointer to header.
|
||||
let composite_write = CompositeWrite::wrap(write);
|
||||
Ok(FieldNormsSerializer { composite_write })
|
||||
Ok(FieldNormsSerializer {
|
||||
composite_write
|
||||
})
|
||||
}
|
||||
|
||||
/// Serialize the given field
|
||||
|
||||
pub fn serialize_field(&mut self, field: Field, fieldnorms_data: &[u8]) -> io::Result<()> {
|
||||
let write = self.composite_write.for_field(field);
|
||||
write.write_all(fieldnorms_data)?;
|
||||
@@ -26,9 +28,10 @@ impl FieldNormsSerializer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clean up / flush / close
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.composite_write.close()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -1,36 +1,30 @@
|
||||
use DocId;
|
||||
|
||||
use super::fieldnorm_to_id;
|
||||
use super::FieldNormsSerializer;
|
||||
use schema::Field;
|
||||
use schema::Schema;
|
||||
use super::FieldNormsSerializer;
|
||||
use std::io;
|
||||
use schema::Schema;
|
||||
use super::fieldnorm_to_id;
|
||||
|
||||
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
|
||||
/// of each document for each field with field norms.
|
||||
///
|
||||
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
|
||||
/// byte per document per field.
|
||||
pub struct FieldNormsWriter {
|
||||
fields: Vec<Field>,
|
||||
fieldnorms_buffer: Vec<Vec<u8>>,
|
||||
fieldnorms_buffer: Vec<Vec<u8>>
|
||||
}
|
||||
|
||||
impl FieldNormsWriter {
|
||||
/// Returns the fields that should have field norms computed
|
||||
/// according to the given schema.
|
||||
pub(crate) fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
|
||||
|
||||
pub fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
|
||||
schema
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| field_entry.is_indexed())
|
||||
.filter(|&(_, field_entry)| {
|
||||
field_entry.is_indexed()
|
||||
})
|
||||
.map(|(field, _)| Field(field as u32))
|
||||
.collect::<Vec<Field>>()
|
||||
}
|
||||
|
||||
/// Initialize with state for tracking the field norm fields
|
||||
/// specified in the schema.
|
||||
pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
|
||||
let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
|
||||
let max_field = fields
|
||||
@@ -41,40 +35,26 @@ impl FieldNormsWriter {
|
||||
.unwrap_or(0);
|
||||
FieldNormsWriter {
|
||||
fields,
|
||||
fieldnorms_buffer: (0..max_field).map(|_| Vec::new()).collect::<Vec<_>>(),
|
||||
fieldnorms_buffer: (0..max_field)
|
||||
.map(|_| Vec::new())
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
}
|
||||
|
||||
/// Ensure that all documents in 0..max_doc have a byte associated with them
|
||||
/// in each of the fieldnorm vectors.
|
||||
///
|
||||
/// Will extend with 0-bytes for documents that have not been seen.
|
||||
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
|
||||
for &field in self.fields.iter() {
|
||||
self.fieldnorms_buffer[field.0 as usize].resize(max_doc as usize, 0u8);
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the fieldnorm byte for the given document for the given field.
|
||||
///
|
||||
/// Will internally convert the u32 `fieldnorm` value to the appropriate byte
|
||||
/// to approximate the field norm in less space.
|
||||
///
|
||||
/// * doc - the document id
|
||||
/// * field - the field being set
|
||||
/// * fieldnorm - the number of terms present in document `doc` in field `field`
|
||||
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
|
||||
let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.0 as usize];
|
||||
assert!(
|
||||
fieldnorm_buffer.len() <= doc as usize,
|
||||
"Cannot register a given fieldnorm twice"
|
||||
);
|
||||
assert!(fieldnorm_buffer.len() <= doc as usize, "Cannot register a given fieldnorm twice");
|
||||
// we fill the intermediate `DocId`s as having a fieldnorm of 0.
|
||||
fieldnorm_buffer.resize(doc as usize + 1, 0u8);
|
||||
fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
|
||||
}
|
||||
|
||||
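The gap-filling behaviour of `record` can be sketched with a single per-field byte buffer. The saturating cast below is only a stand-in for `fieldnorm_to_id`, and the struct name is made up.

```rust
// One byte per document; documents skipped since the last call get 0.
struct FieldNormBufferSketch {
    bytes: Vec<u8>,
}

impl FieldNormBufferSketch {
    fn record(&mut self, doc: u32, fieldnorm: u32) {
        assert!(
            self.bytes.len() <= doc as usize,
            "Cannot register a given fieldnorm twice"
        );
        self.bytes.resize(doc as usize + 1, 0u8);
        self.bytes[doc as usize] = fieldnorm.min(255) as u8;
    }
}

fn main() {
    let mut buffer = FieldNormBufferSketch { bytes: Vec::new() };
    buffer.record(0, 3);
    buffer.record(3, 7); // docs 1 and 2 were never recorded
    assert_eq!(buffer.bytes, vec![3, 0, 0, 7]);
}
```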
/// Serialize the seen fieldnorm values to the serializer for all fields.
|
||||
pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
|
||||
for &field in self.fields.iter() {
|
||||
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.0 as usize][..];
|
||||
@@ -82,4 +62,4 @@ impl FieldNormsWriter {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,10 @@
|
||||
use rand::thread_rng;
|
||||
use std::collections::HashSet;
|
||||
use rand::thread_rng;
|
||||
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
use schema::*;
|
||||
use Index;
|
||||
use Searcher;
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
|
||||
fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
assert!(searcher.segment_readers().len() < 20);
|
||||
@@ -13,7 +13,7 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
#[cfg(feature = "mmap")]
|
||||
#[cfg(feature="mmap")]
|
||||
fn test_indexing() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::operation::DeleteOperation;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
// The DeleteQueue is conceptually similar to a multiple
|
||||
// consumer single producer broadcast channel.
|
||||
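A stripped-down sketch of that broadcast-cursor idea, ignoring the block chaining the real `DeleteQueue` uses to garbage-collect old operations; all names below are illustrative.

```rust
use std::sync::{Arc, RwLock};

// Operations are appended to a shared log; each cursor remembers how far it
// has read and sees every operation pushed after it was created.
#[derive(Clone)]
struct DeleteQueueSketch {
    log: Arc<RwLock<Vec<String>>>,
}

struct CursorSketch {
    log: Arc<RwLock<Vec<String>>>,
    next: usize,
}

impl DeleteQueueSketch {
    fn new() -> Self {
        DeleteQueueSketch { log: Arc::new(RwLock::new(Vec::new())) }
    }
    fn push(&self, op: &str) {
        self.log.write().expect("poisoned").push(op.to_string());
    }
    fn cursor(&self) -> CursorSketch {
        // Past delete operations are not accessible to the new cursor.
        let next = self.log.read().expect("poisoned").len();
        CursorSketch { log: Arc::clone(&self.log), next }
    }
}

impl CursorSketch {
    fn next_op(&mut self) -> Option<String> {
        let log = self.log.read().expect("poisoned");
        let op = log.get(self.next).cloned();
        if op.is_some() {
            self.next += 1;
        }
        op
    }
}

fn main() {
    let queue = DeleteQueueSketch::new();
    queue.push("delete term A"); // happened before the cursor: not visible
    let mut cursor = queue.cursor();
    queue.push("delete term B");
    assert_eq!(cursor.next_op(), Some("delete term B".to_string()));
    assert_eq!(cursor.next_op(), None);
}
```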
@@ -52,8 +52,7 @@ impl DeleteQueue {
|
||||
//
|
||||
// Past delete operations are not accessible.
|
||||
pub fn cursor(&self) -> DeleteCursor {
|
||||
let last_block = self
|
||||
.inner
|
||||
let last_block = self.inner
|
||||
.read()
|
||||
.expect("Read lock poisoned when opening delete queue cursor")
|
||||
.last_block
|
||||
@@ -93,8 +92,7 @@ impl DeleteQueue {
|
||||
// be some unflushed operations.
|
||||
//
|
||||
fn flush(&self) -> Option<Arc<Block>> {
|
||||
let mut self_wlock = self
|
||||
.inner
|
||||
let mut self_wlock = self.inner
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on delete queue writer");
|
||||
|
||||
@@ -134,8 +132,7 @@ impl From<DeleteQueue> for NextBlock {
|
||||
impl NextBlock {
|
||||
fn next_block(&self) -> Option<Arc<Block>> {
|
||||
{
|
||||
let next_read_lock = self
|
||||
.0
|
||||
let next_read_lock = self.0
|
||||
.read()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
|
||||
@@ -144,8 +141,7 @@ impl NextBlock {
|
||||
}
|
||||
let next_block;
|
||||
{
|
||||
let mut next_write_lock = self
|
||||
.0
|
||||
let mut next_write_lock = self.0
|
||||
.write()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
match *next_write_lock {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use core::LOCKFILE_FILEPATH;
|
||||
use directory::error::OpenWriteError;
|
||||
use Directory;
|
||||
use directory::error::OpenWriteError;
|
||||
use core::LOCKFILE_FILEPATH;
|
||||
|
||||
/// The directory lock is a mechanism used to
|
||||
/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
use super::operation::AddOperation;
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use bit_set::BitSet;
|
||||
use chan;
|
||||
use core::Index;
|
||||
@@ -9,35 +6,39 @@ use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use indexer::stamper::Stamper;
|
||||
use datastruct::stacker::Heap;
|
||||
use directory::FileProtection;
|
||||
use docset::DocSet;
|
||||
use error::{Error, ErrorKind, Result, ResultExt};
|
||||
use fastfield::write_delete_bitset;
|
||||
use futures::sync::oneshot::Receiver;
|
||||
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||
use futures::Canceled;
|
||||
use datastruct::stacker::hashmap::split_memory;
|
||||
use futures::Future;
|
||||
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
||||
use indexer::operation::DeleteOperation;
|
||||
use indexer::stamper::Stamper;
|
||||
use indexer::DirectoryLock;
|
||||
use indexer::MergePolicy;
|
||||
use indexer::operation::DeleteOperation;
|
||||
use indexer::SegmentEntry;
|
||||
use indexer::SegmentWriter;
|
||||
use postings::compute_table_size;
|
||||
use schema::Document;
|
||||
use docset::DocSet;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Document;
|
||||
use schema::Term;
|
||||
use std::mem;
|
||||
use std::mem::swap;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use indexer::DirectoryLock;
|
||||
use super::operation::AddOperation;
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use std::thread;
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
// in the heap goes below MARGIN_IN_BYTES.
|
||||
pub const MARGIN_IN_BYTES: usize = 1_000_000;
|
||||
pub const MARGIN_IN_BYTES: u32 = 1_000_000u32;
|
||||
|
||||
// We impose the memory per thread to be at least 3 MB.
|
||||
pub const HEAP_SIZE_MIN: usize = ((MARGIN_IN_BYTES as u32) * 3u32) as usize;
|
||||
pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
|
||||
pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32;
|
||||
|
||||
// Add document will block if the number of docs waiting in the queue to be indexed
|
||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||
@@ -46,24 +47,6 @@ const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||
type DocumentSender = chan::Sender<AddOperation>;
|
||||
type DocumentReceiver = chan::Receiver<AddOperation>;
|
||||
|
||||
/// Split the thread memory budget into
|
||||
/// - the heap size
|
||||
/// - the hash table "table" itself.
|
||||
///
|
||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
|
||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||
(1..)
|
||||
.into_iter()
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.last()
|
||||
.expect(&format!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
))
|
||||
.min(19) // we cap it at 512K
|
||||
}
|
||||
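As a rough illustration of the sizing rule, the sketch below assumes a hypothetical `BYTES_PER_BUCKET` cost per hash-table slot, so the concrete values it asserts differ from the `test_hashmap_size` expectations further down, which depend on the real `compute_table_size`.

```rust
// Pick the largest number of bits whose hash table still fits in a third of
// the per-thread budget, capped at 19 (512K buckets).
const BYTES_PER_BUCKET: usize = 8; // assumed cost per bucket, for illustration

fn compute_table_size(num_bits: usize) -> usize {
    (1 << num_bits) * BYTES_PER_BUCKET
}

fn initial_table_size(per_thread_memory_budget: usize) -> usize {
    let table_size_limit = per_thread_memory_budget / 3;
    (1..)
        .take_while(|&num_bits| compute_table_size(num_bits) < table_size_limit)
        .last()
        .expect("Per thread memory is too small")
        .min(19) // cap at 512K buckets
}

fn main() {
    // With an 8-byte bucket, a 3 MB budget allows a 2^16-bucket table.
    assert_eq!(initial_table_size(3_000_000), 16);
    // Very large budgets are still capped at 19 bits.
    assert_eq!(initial_table_size(1_000_000_000), 19);
}
```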
|
||||
/// `IndexWriter` is the user entry-point to add document to an index.
|
||||
///
|
||||
/// It manages a small number of indexing threads, as well as a shared
|
||||
@@ -98,6 +81,10 @@ pub struct IndexWriter {
|
||||
committed_opstamp: u64,
|
||||
}
|
||||
|
||||
// IndexWriter cannot be sent to another thread.
|
||||
impl !Send for IndexWriter {}
|
||||
impl !Sync for IndexWriter {}
|
||||
|
||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||
///
|
||||
/// The lockfile should be deleted on drop, but it is possible
|
||||
@@ -118,16 +105,11 @@ pub fn open_index_writer(
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
directory_lock: DirectoryLock,
|
||||
) -> Result<IndexWriter> {
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
let err_msg = format!(
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_LIMIT as usize {
|
||||
panic!(format!(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_MIN
|
||||
);
|
||||
bail!(ErrorKind::InvalidArgument(err_msg));
|
||||
}
|
||||
if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX {
|
||||
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
||||
bail!(ErrorKind::InvalidArgument(err_msg));
|
||||
HEAP_SIZE_LIMIT
|
||||
));
|
||||
}
|
||||
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
@@ -218,6 +200,7 @@ pub fn advance_deletes(
|
||||
target_opstamp: u64,
|
||||
) -> Result<Option<FileProtection>> {
|
||||
let mut file_protect: Option<FileProtection> = None;
|
||||
|
||||
{
|
||||
if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
|
||||
// We are already up-to-date here.
|
||||
@@ -258,33 +241,48 @@ pub fn advance_deletes(
|
||||
}
|
||||
}
|
||||
segment_entry.set_meta(segment.meta().clone());
|
||||
|
||||
Ok(file_protect)
|
||||
}
|
||||
|
||||
fn index_documents(
|
||||
memory_budget: usize,
|
||||
heap: &mut Heap,
|
||||
table_size: usize,
|
||||
segment: &Segment,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> Result<bool> {
|
||||
heap.clear();
|
||||
let schema = segment.schema();
|
||||
let segment_id = segment.id();
|
||||
let table_size = initial_table_size(memory_budget);
|
||||
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(heap, table_size, segment.clone(), &schema)?;
|
||||
for doc in document_iterator {
|
||||
segment_writer.add_document(doc, &schema)?;
|
||||
|
||||
let mem_usage = segment_writer.mem_usage();
|
||||
|
||||
if mem_usage >= memory_budget - MARGIN_IN_BYTES {
|
||||
// There are two possible conditions for closing the segment.
|
||||
// One is the memory arena dedicated to the segment is
|
||||
// getting full.
|
||||
if segment_writer.is_buffer_full() {
|
||||
info!(
|
||||
"Buffer limit reached, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
// The second is the term dictionary hash table
|
||||
// is reaching saturation.
|
||||
//
|
||||
// Tantivy does not resize its hashtable. When it reaches
|
||||
// capacity, we just stop indexing new documents.
|
||||
if segment_writer.is_term_saturated() {
|
||||
info!(
|
||||
"Term dic saturated, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !segment_updater.is_alive() {
|
||||
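The two flush conditions above can be summarized with a stand-in writer; the per-document cost and all names below are made up for illustration and do not reflect tantivy's real accounting.

```rust
// A segment is closed either when the memory arena is nearly full or when the
// term-dictionary hash table saturates.
const MARGIN_IN_BYTES: usize = 1_000_000;

struct SegmentWriterSketch {
    heap_used: usize,
    term_table_saturated: bool,
}

fn index_batch(docs: &[&str], memory_budget: usize) -> usize {
    let mut writer = SegmentWriterSketch { heap_used: 0, term_table_saturated: false };
    let mut indexed = 0;
    for doc in docs {
        // Stand-in for `segment_writer.add_document(...)`.
        writer.heap_used += doc.len() * 100;
        indexed += 1;
        // Condition 1: the memory arena dedicated to the segment is nearly full.
        if writer.heap_used >= memory_budget - MARGIN_IN_BYTES {
            break;
        }
        // Condition 2: the term dictionary hash table is saturated
        // (it is never resized; the segment is simply flushed).
        if writer.term_table_saturated {
            break;
        }
    }
    indexed
}

fn main() {
    let docs = vec!["lorem ipsum"; 10_000];
    // With a 3 MB budget and ~1.1 KB per doc, the segment closes early.
    let indexed = index_batch(&docs, 3_000_000);
    assert!(indexed < docs.len());
}
```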
@@ -342,8 +340,7 @@ impl IndexWriter {
|
||||
}
|
||||
drop(self.workers_join_handle);
|
||||
|
||||
let result = self
|
||||
.segment_updater
|
||||
let result = self.segment_updater
|
||||
.wait_merging_thread()
|
||||
.chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into()));
|
||||
|
||||
@@ -377,12 +374,14 @@ impl IndexWriter {
|
||||
fn add_indexing_worker(&mut self) -> Result<()> {
|
||||
let document_receiver_clone = self.document_receiver.clone();
|
||||
let mut segment_updater = self.segment_updater.clone();
|
||||
let (heap_size, table_size) = split_memory(self.heap_size_in_bytes_per_thread);
|
||||
info!("heap size {}, table_size {}", heap_size, table_size);
|
||||
let mut heap = Heap::with_capacity(heap_size);
|
||||
|
||||
let generation = self.generation;
|
||||
|
||||
let mut delete_cursor = self.delete_queue.cursor();
|
||||
|
||||
let mem_budget = self.heap_size_in_bytes_per_thread;
|
||||
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
||||
.name(format!(
|
||||
"indexing thread {} for gen {}",
|
||||
@@ -410,7 +409,8 @@ impl IndexWriter {
|
||||
}
|
||||
let segment = segment_updater.new_segment();
|
||||
index_documents(
|
||||
mem_budget,
|
||||
&mut heap,
|
||||
table_size,
|
||||
&segment,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
@@ -448,7 +448,10 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
/// Merges a given list of segments
|
||||
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Receiver<SegmentMeta> {
|
||||
pub fn merge(
|
||||
&mut self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
self.segment_updater.start_merge(segment_ids)
|
||||
}
|
||||
|
||||
@@ -488,8 +491,7 @@ impl IndexWriter {
|
||||
let document_receiver = self.document_receiver.clone();
|
||||
|
||||
// take the directory lock to create a new index_writer.
|
||||
let directory_lock = self
|
||||
._directory_lock
|
||||
let directory_lock = self._directory_lock
|
||||
.take()
|
||||
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
|
||||
|
||||
@@ -645,13 +647,12 @@ impl IndexWriter {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::initial_table_size;
|
||||
use env_logger;
|
||||
use error::*;
|
||||
use indexer::NoMergePolicy;
|
||||
use schema::{self, Document};
|
||||
use Index;
|
||||
use Term;
|
||||
use error::*;
|
||||
use env_logger;
|
||||
|
||||
#[test]
|
||||
fn test_lockfile_stops_duplicates() {
|
||||
@@ -674,7 +675,7 @@ mod tests {
|
||||
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
|
||||
level_log_size: 0.75 }"
|
||||
);
|
||||
let merge_policy = Box::new(NoMergePolicy::default());
|
||||
let merge_policy = box NoMergePolicy::default();
|
||||
index_writer.set_merge_policy(merge_policy);
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
@@ -708,7 +709,7 @@ mod tests {
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(3, 40_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
index_writer.rollback().unwrap();
|
||||
|
||||
@@ -741,7 +742,7 @@ mod tests {
|
||||
};
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer(12_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
let mut doc = Document::default();
|
||||
@@ -775,7 +776,7 @@ mod tests {
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer(12_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
@@ -810,7 +811,7 @@ mod tests {
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
@@ -840,12 +841,4 @@ mod tests {
|
||||
assert_eq!(num_docs_containing("b"), 100);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(initial_table_size(100_000), 12);
|
||||
assert_eq!(initial_table_size(1_000_000), 15);
|
||||
assert_eq!(initial_table_size(10_000_000), 18);
|
||||
assert_eq!(initial_table_size(1_000_000_000), 19);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -82,7 +82,7 @@ impl MergePolicy for LogMergePolicy {
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
Box::new(self.clone())
|
||||
box self.clone()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,8 +99,8 @@ impl Default for LogMergePolicy {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use core::{SegmentId, SegmentMeta};
|
||||
use indexer::merge_policy::MergePolicy;
|
||||
use core::{SegmentId, SegmentMeta};
|
||||
|
||||
fn test_merge_policy() -> LogMergePolicy {
|
||||
let mut log_merge_policy = LogMergePolicy::default();
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use std::fmt::Debug;
|
||||
use std::marker;
|
||||
use std::fmt::Debug;
|
||||
|
||||
/// Set of segment suggested for a merge.
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -37,7 +37,7 @@ impl MergePolicy for NoMergePolicy {
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
Box::new(NoMergePolicy)
|
||||
box NoMergePolicy
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,7 +69,7 @@ pub mod tests {
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
Box::new(MergeWheneverPossible)
|
||||
box MergeWheneverPossible
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
@@ -1,29 +1,29 @@
|
||||
pub mod delete_queue;
|
||||
mod directory_lock;
|
||||
mod doc_opstamp_mapping;
|
||||
pub mod index_writer;
|
||||
mod log_merge_policy;
|
||||
pub mod merge_policy;
|
||||
pub mod merger;
|
||||
pub mod operation;
|
||||
mod prepared_commit;
|
||||
mod segment_entry;
|
||||
mod segment_manager;
|
||||
mod segment_register;
|
||||
pub mod segment_serializer;
|
||||
pub mod segment_updater;
|
||||
pub mod merger;
|
||||
pub mod merge_policy;
|
||||
mod log_merge_policy;
|
||||
mod segment_register;
|
||||
mod segment_writer;
|
||||
mod segment_manager;
|
||||
pub mod delete_queue;
|
||||
pub mod segment_updater;
|
||||
mod directory_lock;
|
||||
mod segment_entry;
|
||||
mod doc_opstamp_mapping;
|
||||
pub mod operation;
|
||||
mod stamper;
|
||||
mod prepared_commit;
|
||||
|
||||
pub(crate) use self::directory_lock::DirectoryLock;
|
||||
pub use self::prepared_commit::PreparedCommit;
|
||||
pub use self::segment_entry::{SegmentEntry, SegmentState};
|
||||
pub use self::segment_serializer::SegmentSerializer;
|
||||
pub use self::segment_writer::SegmentWriter;
|
||||
pub use self::index_writer::IndexWriter;
|
||||
pub use self::log_merge_policy::LogMergePolicy;
|
||||
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||
pub use self::prepared_commit::PreparedCommit;
|
||||
pub use self::segment_entry::{SegmentEntry, SegmentState};
|
||||
pub use self::segment_manager::SegmentManager;
|
||||
pub use self::segment_serializer::SegmentSerializer;
|
||||
pub use self::segment_writer::SegmentWriter;
|
||||
pub(crate) use self::directory_lock::DirectoryLock;
|
||||
|
||||
/// Alias for the default merge policy, which is the `LogMergePolicy`.
|
||||
pub type DefaultMergePolicy = LogMergePolicy;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use super::IndexWriter;
|
||||
use Result;
|
||||
use super::IndexWriter;
|
||||
|
||||
/// A prepared commit
|
||||
pub struct PreparedCommit<'a> {
|
||||
@@ -13,7 +13,7 @@ impl<'a> PreparedCommit<'a> {
|
||||
PreparedCommit {
|
||||
index_writer,
|
||||
payload: None,
|
||||
opstamp,
|
||||
opstamp
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use bit_set::BitSet;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use bit_set::BitSet;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use core::SegmentId;
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use super::segment_register::SegmentRegister;
|
||||
use core::SegmentId;
|
||||
use std::sync::RwLock;
|
||||
use core::SegmentMeta;
|
||||
use core::{LOCKFILE_FILEPATH, META_FILEPATH};
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use core::SegmentId;
|
||||
use indexer::SegmentEntry;
|
||||
use std::collections::hash_set::HashSet;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::RwLock;
|
||||
use std::collections::hash_set::HashSet;
|
||||
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
|
||||
#[derive(Default)]
|
||||
struct SegmentRegisters {
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::segment_entry::SegmentEntry;
|
||||
use std::collections::HashMap;
|
||||
use core::SegmentMeta;
|
||||
use std::fmt;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use indexer::segment_entry::SegmentEntry;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
|
||||
/// The segment register keeps track
|
||||
/// of the list of segments, their size as well
|
||||
@@ -59,8 +59,7 @@ impl SegmentRegister {
|
||||
}
|
||||
|
||||
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
|
||||
let mut segment_ids: Vec<SegmentMeta> = self
|
||||
.segment_states
|
||||
let mut segment_ids: Vec<SegmentMeta> = self.segment_states
|
||||
.values()
|
||||
.map(|segment_entry| segment_entry.meta().clone())
|
||||
.collect();
|
||||
@@ -114,11 +113,11 @@ impl SegmentRegister {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use indexer::SegmentState;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use indexer::delete_queue::*;
|
||||
use indexer::SegmentState;
|
||||
use super::*;
|
||||
|
||||
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
|
||||
segment_register
|
||||
|
||||
@@ -3,9 +3,9 @@ use Result;
|
||||
use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use fastfield::FastFieldSerializer;
|
||||
use store::StoreWriter;
|
||||
use fieldnorm::FieldNormsSerializer;
|
||||
use postings::InvertedIndexSerializer;
|
||||
use store::StoreWriter;
|
||||
|
||||
/// Segment serializer is in charge of laying out on disk
|
||||
/// the data accumulated and sorted by the `SegmentWriter`.
|
||||
@@ -47,7 +47,7 @@ impl SegmentSerializer {
|
||||
}
|
||||
|
||||
/// Accessor to the field norm serializer.
|
||||
pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
|
||||
pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
|
||||
&mut self.fieldnorms_serializer
|
||||
}
|
||||
|
||||
|
||||
@@ -1,40 +1,40 @@
|
||||
use super::segment_manager::{get_mergeable_segments, SegmentManager};
|
||||
use core::Index;
|
||||
use core::IndexMeta;
|
||||
use core::META_FILEPATH;
|
||||
use core::Segment;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SerializableSegment;
|
||||
use core::META_FILEPATH;
|
||||
use directory::Directory;
|
||||
use directory::FileProtection;
|
||||
use error::{Error, ErrorKind, Result};
|
||||
use futures::oneshot;
|
||||
use futures::sync::oneshot::Receiver;
|
||||
use futures::Future;
|
||||
use futures_cpupool::CpuFuture;
|
||||
use futures_cpupool::CpuPool;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::index_writer::advance_deletes;
|
||||
use indexer::merger::IndexMerger;
|
||||
use indexer::stamper::Stamper;
|
||||
use error::{Error, ErrorKind, Result};
|
||||
use futures_cpupool::CpuPool;
|
||||
use futures::Future;
|
||||
use futures::Canceled;
|
||||
use futures::oneshot;
|
||||
use directory::FileProtection;
|
||||
use indexer::{DefaultMergePolicy, MergePolicy};
|
||||
use indexer::index_writer::advance_deletes;
|
||||
use indexer::MergeCandidate;
|
||||
use indexer::merger::IndexMerger;
|
||||
use indexer::SegmentEntry;
|
||||
use indexer::SegmentSerializer;
|
||||
use indexer::{DefaultMergePolicy, MergePolicy};
|
||||
use schema::Schema;
|
||||
use futures_cpupool::CpuFuture;
|
||||
use serde_json;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use schema::Schema;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize};
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize};
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::RwLock;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use super::segment_manager::{get_mergeable_segments, SegmentManager};
|
||||
|
||||
/// Save the index meta file.
|
||||
/// This operation is atomic:
|
||||
@@ -171,7 +171,7 @@ impl SegmentUpdater {
|
||||
pool: CpuPool::new(1),
|
||||
index,
|
||||
segment_manager,
|
||||
merge_policy: RwLock::new(Box::new(DefaultMergePolicy::default())),
|
||||
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
|
||||
merging_thread_id: AtomicUsize::default(),
|
||||
merging_threads: RwLock::new(HashMap::new()),
|
||||
generation: AtomicUsize::default(),
|
||||
@@ -283,7 +283,10 @@ impl SegmentUpdater {
|
||||
}).wait()
|
||||
}
|
||||
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Receiver<SegmentMeta> {
|
||||
pub fn start_merge(
|
||||
&self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
self.0.segment_manager.start_merge(segment_ids);
|
||||
let segment_updater_clone = self.clone();
|
||||
|
||||
@@ -358,9 +361,7 @@ impl SegmentUpdater {
|
||||
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
|
||||
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
|
||||
for MergeCandidate(segment_metas) in merge_candidates {
|
||||
if let Err(e) = self.start_merge(&segment_metas).fuse().poll() {
|
||||
error!("The merge task failed quickly after starting: {:?}", e);
|
||||
}
|
||||
self.start_merge(&segment_metas);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -479,9 +480,9 @@ impl SegmentUpdater {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use indexer::merge_policy::tests::MergeWheneverPossible;
|
||||
use schema::*;
|
||||
use Index;
|
||||
use schema::*;
|
||||
use indexer::merge_policy::tests::MergeWheneverPossible;
|
||||
|
||||
#[test]
|
||||
fn test_delete_during_merge() {
|
||||
@@ -493,7 +494,7 @@ mod tests {
|
||||
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
|
||||
index_writer.set_merge_policy(box MergeWheneverPossible);
|
||||
|
||||
{
|
||||
for _ in 0..100 {
|
||||
|
||||
@@ -1,30 +1,33 @@
|
||||
use super::operation::AddOperation;
|
||||
use Result;
|
||||
use DocId;
|
||||
use std::io;
|
||||
use std::str;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use core::Segment;
|
||||
use core::SerializableSegment;
|
||||
use fastfield::FastFieldsWriter;
|
||||
use fieldnorm::FieldNormsWriter;
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use postings::MultiFieldPostingsWriter;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use schema::Value;
|
||||
use std::io;
|
||||
use std::str;
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use datastruct::stacker::Heap;
|
||||
use indexer::index_writer::MARGIN_IN_BYTES;
|
||||
use super::operation::AddOperation;
|
||||
use postings::MultiFieldPostingsWriter;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::FacetTokenizer;
|
||||
use tokenizer::{TokenStream, Tokenizer};
|
||||
use DocId;
|
||||
use Result;
|
||||
use schema::Value;
|
||||
use fieldnorm::FieldNormsWriter;
|
||||
|
||||
/// A `SegmentWriter` is in charge of creating a segment index from
|
||||
/// documents.
|
||||
///
|
||||
/// It creates the postings lists in anonymous memory.
|
||||
/// The segment is laid out on disk when it gets `finalized`.
|
||||
pub struct SegmentWriter {
|
||||
pub struct SegmentWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
max_doc: DocId,
|
||||
multifield_postings: MultiFieldPostingsWriter,
|
||||
multifield_postings: MultiFieldPostingsWriter<'a>,
|
||||
segment_serializer: SegmentSerializer,
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FieldNormsWriter,
|
||||
@@ -32,7 +35,8 @@ pub struct SegmentWriter {
|
||||
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
|
||||
}
|
||||
|
||||
impl SegmentWriter {
|
||||
|
||||
impl<'a> SegmentWriter<'a> {
|
||||
/// Creates a new `SegmentWriter`
|
||||
///
|
||||
/// The arguments are defined as follows
|
||||
@@ -43,12 +47,13 @@ impl SegmentWriter {
|
||||
/// - segment: The segment being written
|
||||
/// - schema
|
||||
pub fn for_segment(
|
||||
heap: &'a Heap,
|
||||
table_bits: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema,
|
||||
) -> Result<SegmentWriter> {
|
||||
) -> Result<SegmentWriter<'a>> {
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits);
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
|
||||
let tokenizers = schema
|
||||
.fields()
|
||||
.iter()
|
||||
@@ -64,6 +69,7 @@ impl SegmentWriter {
|
||||
})
|
||||
.collect();
|
||||
Ok(SegmentWriter {
|
||||
heap,
|
||||
max_doc: 0,
|
||||
multifield_postings,
|
||||
fieldnorms_writer: FieldNormsWriter::for_schema(schema),
|
||||
@@ -89,8 +95,22 @@ impl SegmentWriter {
|
||||
Ok(self.doc_opstamps)
|
||||
}
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.multifield_postings.mem_usage()
|
||||
/// Returns true iff the segment writer's buffer has reached capacity.
|
||||
///
|
||||
/// The limit is defined as `the user-defined heap size - an arbitrary margin (MARGIN_IN_BYTES)`
|
||||
/// The `Segment` is `finalize`d when the buffer gets full.
|
||||
///
|
||||
/// Because we cannot cut through a document, the margin is there to ensure that we rarely
|
||||
/// exceed the heap size.
|
||||
pub fn is_buffer_full(&self) -> bool {
|
||||
self.heap.num_free_bytes() <= MARGIN_IN_BYTES
|
||||
}
|
||||
|
||||
/// Return true if the term dictionary hashmap is reaching capacity.
|
||||
/// It is one of the conditions that triggers a `SegmentWriter` to
|
||||
/// be finalized.
|
||||
pub(crate) fn is_term_saturated(&self) -> bool {
|
||||
self.multifield_postings.is_term_saturated()
|
||||
}
|
||||
|
||||
/// Indexes a new document
|
||||
@@ -119,7 +139,8 @@ impl SegmentWriter {
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let mut term = Term::for_field(field); // we set the Term
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
term.set_field(field);
|
||||
for facet_bytes in facets {
|
||||
let mut unordered_term_id_opt = None;
|
||||
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
|
||||
@@ -158,7 +179,8 @@ impl SegmentWriter {
|
||||
} else {
|
||||
0
|
||||
};
|
||||
self.fieldnorms_writer.record(doc_id, field, num_tokens);
|
||||
self.fieldnorms_writer
|
||||
.record(doc_id, field, num_tokens);
|
||||
}
|
||||
FieldType::U64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
@@ -182,9 +204,6 @@ impl SegmentWriter {
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
// Do nothing. Bytes only supports fast fields.
|
||||
}
|
||||
}
|
||||
}
|
||||
doc.filter_fields(|field| schema.get_field_entry(field).is_stored());
|
||||
@@ -229,7 +248,7 @@ fn write(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl SerializableSegment for SegmentWriter {
|
||||
impl<'a> SerializableSegment for SegmentWriter<'a> {
|
||||
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
|
||||
let max_doc = self.max_doc;
|
||||
write(
|
||||
|
||||
@@ -1,66 +1,15 @@
|
||||
// AtomicU64 has not landed in stable Rust.
|
||||
// For the moment let's just use AtomicUsize on
|
||||
// x86/64-bit platforms, and a mutex on other platforms.
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
#[cfg(target = "x86_64")]
|
||||
mod archicture_impl {
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicUsize>);
|
||||
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(AtomicUsize::new(first_opstamp as usize)))
|
||||
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicU64>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(target = "x86_64"))]
|
||||
mod archicture_impl {
|
||||
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<Mutex<u64>>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(Mutex::new(first_opstamp)))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
let mut guard = self.0.lock().expect("Failed to lock the stamper");
|
||||
let previous_val = *guard;
|
||||
*guard = previous_val + 1;
|
||||
previous_val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub use self::archicture_impl::Stamper;
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::Stamper;
|
||||
|
||||
#[test]
|
||||
fn test_stamper() {
|
||||
let stamper = Stamper::new(7u64);
|
||||
assert_eq!(stamper.stamp(), 7u64);
|
||||
assert_eq!(stamper.stamp(), 8u64);
|
||||
|
||||
let stamper_clone = stamper.clone();
|
||||
assert_eq!(stamper.stamp(), 9u64);
|
||||
|
||||
assert_eq!(stamper.stamp(), 10u64);
|
||||
assert_eq!(stamper_clone.stamp(), 11u64);
|
||||
pub fn stamp(&self) -> u64 {
|
||||
self.0.fetch_add(1, Ordering::SeqCst) as u64
|
||||
}
|
||||
}
|
||||
|
||||
94
src/lib.rs
@@ -1,7 +1,14 @@
|
||||
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(module_inception))]
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(inline_always))]
|
||||
#![cfg_attr(all(feature = "unstable", test), feature(test))]
|
||||
#![feature(box_syntax)]
|
||||
#![feature(optin_builtin_traits)]
|
||||
#![feature(conservative_impl_trait)]
|
||||
#![feature(collections_range)]
|
||||
#![feature(integer_atomics)]
|
||||
#![feature(drain_filter)]
|
||||
#![cfg_attr(test, feature(test))]
|
||||
#![cfg_attr(test, feature(iterator_step_by))]
|
||||
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
|
||||
#![allow(unknown_lints)]
|
||||
#![allow(new_without_default)]
|
||||
@@ -55,7 +62,7 @@
|
||||
//!
|
||||
//! // Indexing documents
|
||||
//!
|
||||
//! let index = Index::create_in_dir(index_path, schema.clone())?;
|
||||
//! let index = Index::create(index_path, schema.clone())?;
|
||||
//!
|
||||
//! // Here we use a buffer of 100MB that will be split
|
||||
//! // between indexing threads.
|
||||
@@ -116,40 +123,36 @@ extern crate lazy_static;
|
||||
#[macro_use]
|
||||
extern crate serde_derive;
|
||||
|
||||
#[cfg_attr(test, macro_use)]
|
||||
extern crate serde_json;
|
||||
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
|
||||
#[macro_use]
|
||||
extern crate error_chain;
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
#[cfg(feature="mmap")]
|
||||
extern crate atomicwrites;
|
||||
extern crate base64;
|
||||
extern crate bit_set;
|
||||
extern crate bitpacking;
|
||||
extern crate byteorder;
|
||||
extern crate chan;
|
||||
extern crate combine;
|
||||
extern crate crossbeam;
|
||||
extern crate fnv;
|
||||
extern crate fst;
|
||||
extern crate fst_regex;
|
||||
extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
extern crate itertools;
|
||||
extern crate levenshtein_automata;
|
||||
extern crate snap;
|
||||
extern crate num_cpus;
|
||||
extern crate owning_ref;
|
||||
extern crate regex;
|
||||
extern crate rust_stemmers;
|
||||
extern crate serde;
|
||||
extern crate serde_json;
|
||||
extern crate stable_deref_trait;
|
||||
extern crate tempdir;
|
||||
#[cfg(test)]
|
||||
extern crate tempfile;
|
||||
extern crate uuid;
|
||||
extern crate bitpacking;
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
@@ -163,8 +166,7 @@ extern crate winapi;
|
||||
|
||||
#[cfg(test)]
|
||||
extern crate rand;
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
#[cfg(test)]
|
||||
extern crate test;
|
||||
|
||||
extern crate tinysegmenter;
|
||||
@@ -183,35 +185,36 @@ pub use error::{Error, ErrorKind, ResultExt};
|
||||
/// Tantivy result.
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
mod common;
|
||||
mod compression;
|
||||
mod core;
|
||||
mod compression;
|
||||
mod indexer;
|
||||
mod common;
|
||||
|
||||
#[allow(unused_doc_comments)]
|
||||
#[allow(unused_doc_comment)]
|
||||
mod error;
|
||||
pub mod tokenizer;
|
||||
mod datastruct;
|
||||
|
||||
pub mod collector;
|
||||
pub mod termdict;
|
||||
pub mod store;
|
||||
pub mod query;
|
||||
pub mod directory;
|
||||
pub mod collector;
|
||||
pub mod postings;
|
||||
pub mod schema;
|
||||
pub mod fastfield;
|
||||
pub mod fieldnorm;
|
||||
pub mod postings;
|
||||
pub mod query;
|
||||
pub mod schema;
|
||||
pub mod store;
|
||||
pub mod termdict;
|
||||
|
||||
mod docset;
|
||||
pub use self::docset::{DocSet, SkipResult};
|
||||
|
||||
pub use core::SegmentComponent;
|
||||
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use core::{InvertedIndexReader, SegmentReader};
|
||||
pub use directory::Directory;
|
||||
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use indexer::IndexWriter;
|
||||
pub use postings::Postings;
|
||||
pub use schema::{Document, Term};
|
||||
pub use core::{InvertedIndexReader, SegmentReader};
|
||||
pub use postings::Postings;
|
||||
pub use core::SegmentComponent;
|
||||
|
||||
pub use common::{i64_to_u64, u64_to_i64};
|
||||
|
||||
@@ -227,10 +230,10 @@ pub fn version() -> &'static str {
|
||||
|
||||
/// Defines tantivy's merging strategy
|
||||
pub mod merge_policy {
|
||||
pub use indexer::DefaultMergePolicy;
|
||||
pub use indexer::LogMergePolicy;
|
||||
pub use indexer::MergePolicy;
|
||||
pub use indexer::LogMergePolicy;
|
||||
pub use indexer::NoMergePolicy;
|
||||
pub use indexer::DefaultMergePolicy;
|
||||
}
|
||||
|
||||
/// A `u32` identifying a document within a segment.
|
||||
@@ -279,29 +282,33 @@ pub struct DocAddress(pub SegmentLocalId, pub DocId);
|
||||
mod tests {
|
||||
|
||||
use collector::tests::TestCollector;
|
||||
use core::SegmentReader;
|
||||
use docset::DocSet;
|
||||
use query::BooleanQuery;
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use schema::*;
|
||||
use Index;
|
||||
use core::SegmentReader;
|
||||
use query::BooleanQuery;
|
||||
use schema::*;
|
||||
use docset::DocSet;
|
||||
use IndexWriter;
|
||||
use Postings;
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
|
||||
pub fn assert_nearly_equals(expected: f32, val: f32) {
|
||||
assert!(
|
||||
nearly_equals(val, expected),
|
||||
"Got {}, expected {}.",
|
||||
val,
|
||||
expected
|
||||
);
|
||||
assert!(nearly_equals(val, expected), "Got {}, expected {}.", val, expected);
|
||||
}
|
||||
|
||||
pub fn nearly_equals(a: f32, b: f32) -> bool {
|
||||
(a - b).abs() < 0.0005 * (a + b).abs()
|
||||
}
|
||||
|
||||
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
(0..u32::max_value())
|
||||
.filter(|_| rng.next_f32() < ratio)
|
||||
.take(n)
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, 4];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
@@ -311,6 +318,10 @@ mod tests {
|
||||
.collect::<Vec<u32>>()
|
||||
}
|
||||
|
||||
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
generate_array_with_seed(n, ratio, 4)
|
||||
}
|
||||
|
||||
pub fn sample_with_seed(n: u32, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
@@ -322,7 +333,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "mmap")]
|
||||
#[cfg(feature="mmap")]
|
||||
fn test_indexing() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
@@ -448,6 +459,7 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn advance_undeleted(docset: &mut DocSet, reader: &SegmentReader) -> bool {
|
||||
while docset.advance() {
|
||||
if !reader.is_deleted(docset.doc()) {
|
||||
|
||||
@@ -6,24 +6,23 @@ Postings module (also called inverted index)
|
||||
///
|
||||
/// Postings, also called inverted lists, is the key datastructure
|
||||
/// to full-text search.
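An illustrative aside (not from the commits shown): an inverted list, as the doc comment above says, maps each term to the ids of the documents that contain it, so a query only has to visit the documents listed under its terms. A toy sketch in plain Rust, unrelated to tantivy's actual types:

use std::collections::HashMap;

fn main() {
    // Three tiny "documents"; their position is used as the document id.
    let docs = ["old man", "old tree", "young man"];
    let mut postings: HashMap<&str, Vec<u32>> = HashMap::new();
    for (doc_id, text) in docs.iter().enumerate() {
        for term in text.split_whitespace() {
            postings.entry(term).or_default().push(doc_id as u32);
        }
    }
    // The inverted list of "old" is the sorted list of doc ids 0 and 1.
    assert_eq!(postings["old"], vec![0, 1]);
    assert_eq!(postings["man"], vec![0, 2]);
}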
|
||||
|
||||
mod postings;
|
||||
mod postings_writer;
|
||||
mod recorder;
|
||||
mod segment_postings;
|
||||
mod serializer;
|
||||
mod stacker;
|
||||
mod postings_writer;
|
||||
mod term_info;
|
||||
mod segment_postings;
|
||||
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
use self::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
|
||||
pub use self::postings::Postings;
|
||||
pub use self::term_info::TermInfo;
|
||||
pub use self::postings::Postings;
|
||||
|
||||
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
|
||||
|
||||
pub(crate) use self::stacker::compute_table_size;
|
||||
|
||||
pub use common::HasLen;
|
||||
|
||||
pub(crate) type UnorderedTermId = u64;
|
||||
@@ -39,21 +38,25 @@ pub(crate) enum FreqReadingOption {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use core::Index;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentReader;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fieldnorm::FieldNormReader;
|
||||
use indexer::operation::AddOperation;
|
||||
use indexer::SegmentWriter;
|
||||
use query::Scorer;
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use schema::Field;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
|
||||
use std::iter;
|
||||
use DocId;
|
||||
use Score;
|
||||
use query::Intersection;
|
||||
use query::Scorer;
|
||||
use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
|
||||
use core::SegmentComponent;
|
||||
use indexer::SegmentWriter;
|
||||
use core::SegmentReader;
|
||||
use core::Index;
|
||||
use schema::IndexRecordOption;
|
||||
use std::iter;
|
||||
use datastruct::stacker::Heap;
|
||||
use schema::Field;
|
||||
use test::{self, Bencher};
|
||||
use indexer::operation::AddOperation;
|
||||
use tests;
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use fieldnorm::FieldNormReader;
|
||||
|
||||
#[test]
|
||||
pub fn test_position_write() {
|
||||
@@ -124,6 +127,7 @@ pub mod tests {
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
}
|
||||
{
|
||||
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
@@ -162,9 +166,10 @@ pub mod tests {
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let segment = index.new_segment();
|
||||
|
||||
let heap = Heap::with_capacity(10_000_000);
|
||||
{
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(18, segment.clone(), &schema).unwrap();
|
||||
SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
// checking that position works if the field has two values
|
||||
@@ -201,14 +206,13 @@ pub mod tests {
|
||||
{
|
||||
let segment_reader = SegmentReader::open(&segment).unwrap();
|
||||
{
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field);
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field) ;
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
|
||||
for i in 2..1000 {
|
||||
assert_eq!(
|
||||
fieldnorm_reader.fieldnorm_id(i),
|
||||
FieldNormReader::fieldnorm_to_id(i + 1)
|
||||
);
|
||||
FieldNormReader::fieldnorm_to_id(i + 1) );
|
||||
}
|
||||
}
|
||||
{
|
||||
@@ -445,7 +449,7 @@ pub mod tests {
|
||||
// delete everything else
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer.delete_term(term_1);
|
||||
index_writer.delete_term(term_1);
|
||||
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
@@ -475,23 +479,23 @@ pub mod tests {
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
pub static ref TERM_A: Term = {
|
||||
static ref TERM_A: Term = {
|
||||
let field = Field(0);
|
||||
Term::from_field_text(field, "a")
|
||||
};
|
||||
pub static ref TERM_B: Term = {
|
||||
static ref TERM_B: Term = {
|
||||
let field = Field(0);
|
||||
Term::from_field_text(field, "b")
|
||||
};
|
||||
pub static ref TERM_C: Term = {
|
||||
static ref TERM_C: Term = {
|
||||
let field = Field(0);
|
||||
Term::from_field_text(field, "c")
|
||||
};
|
||||
pub static ref TERM_D: Term = {
|
||||
static ref TERM_D: Term = {
|
||||
let field = Field(0);
|
||||
Term::from_field_text(field, "d")
|
||||
};
|
||||
pub static ref INDEX: Index = {
|
||||
static ref INDEX: Index = {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", STRING);
|
||||
let schema = schema_builder.build();
|
||||
@@ -503,7 +507,7 @@ pub mod tests {
|
||||
let posting_list_size = 1_000_000;
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
for _ in 0..posting_list_size {
|
||||
for _ in 0 .. posting_list_size {
|
||||
let mut doc = Document::default();
|
||||
if rng.gen_weighted_bool(15) {
|
||||
doc.add_text(text_field, "a");
|
||||
@@ -526,85 +530,6 @@ pub mod tests {
|
||||
};
|
||||
}
|
||||
|
||||
/// Wraps a given docset, and forwards all calls but the
|
||||
/// `.skip_next(...)`. This is useful to test that a specialized
|
||||
/// implementation of `.skip_next(...)` is consistent
|
||||
/// with the default implementation.
|
||||
pub(crate) struct UnoptimizedDocSet<TDocSet: DocSet>(TDocSet);
|
||||
|
||||
impl<TDocSet: DocSet> UnoptimizedDocSet<TDocSet> {
|
||||
pub fn wrap(docset: TDocSet) -> UnoptimizedDocSet<TDocSet> {
|
||||
UnoptimizedDocSet(docset)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for UnoptimizedDocSet<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.0.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.0.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.0.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> Scorer for UnoptimizedDocSet<TScorer> {
|
||||
fn score(&mut self) -> Score {
|
||||
self.0.score()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn test_skip_against_unoptimized<F: Fn() -> Box<DocSet>>(
|
||||
postings_factory: F,
|
||||
targets: Vec<u32>,
|
||||
) {
|
||||
for target in targets {
|
||||
let mut postings_opt = postings_factory();
|
||||
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
|
||||
let skip_result_opt = postings_opt.skip_next(target);
|
||||
let skip_result_unopt = postings_unopt.skip_next(target);
|
||||
assert_eq!(
|
||||
skip_result_unopt, skip_result_opt,
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
match skip_result_opt {
|
||||
SkipResult::Reached => assert_eq!(postings_opt.doc(), target),
|
||||
SkipResult::OverStep => assert!(postings_opt.doc() > target),
|
||||
SkipResult::End => {
|
||||
return;
|
||||
}
|
||||
}
|
||||
while postings_opt.advance() {
|
||||
assert!(postings_unopt.advance());
|
||||
assert_eq!(
|
||||
postings_opt.doc(),
|
||||
postings_unopt.doc(),
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
}
|
||||
assert!(!postings_unopt.advance());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::tests::*;
|
||||
use docset::SkipResult;
|
||||
use query::Intersection;
|
||||
use schema::IndexRecordOption;
|
||||
use test::{self, Bencher};
|
||||
use tests;
|
||||
use DocSet;
|
||||
|
||||
#[bench]
|
||||
fn bench_segment_postings(b: &mut Bencher) {
|
||||
let searcher = INDEX.searcher();
|
||||
@@ -721,4 +646,71 @@ mod bench {
|
||||
s
|
||||
});
|
||||
}
|
||||
|
||||
/// Wraps a given docset, and forwards all calls but the
|
||||
/// `.skip_next(...)`. This is useful to test that a specialized
|
||||
/// implementation of `.skip_next(...)` is consistent
|
||||
/// with the default implementation.
|
||||
pub(crate) struct UnoptimizedDocSet<TDocSet: DocSet>(TDocSet);
|
||||
|
||||
impl<TDocSet: DocSet> UnoptimizedDocSet<TDocSet> {
|
||||
pub fn wrap(docset: TDocSet) -> UnoptimizedDocSet<TDocSet> {
|
||||
UnoptimizedDocSet(docset)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for UnoptimizedDocSet<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.0.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.0.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.0.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> Scorer for UnoptimizedDocSet<TScorer> {
|
||||
fn score(&mut self) -> Score {
|
||||
self.0.score()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn test_skip_against_unoptimized<F: Fn() -> Box<DocSet>>(
|
||||
postings_factory: F,
|
||||
targets: Vec<u32>,
|
||||
) {
|
||||
for target in targets {
|
||||
let mut postings_opt = postings_factory();
|
||||
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
|
||||
let skip_result_opt = postings_opt.skip_next(target);
|
||||
let skip_result_unopt = postings_unopt.skip_next(target);
|
||||
assert_eq!(
|
||||
skip_result_unopt, skip_result_opt,
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
match skip_result_opt {
|
||||
SkipResult::Reached => assert_eq!(postings_opt.doc(), target),
|
||||
SkipResult::OverStep => assert!(postings_opt.doc() > target),
|
||||
SkipResult::End => {
|
||||
return;
|
||||
}
|
||||
}
|
||||
while postings_opt.advance() {
|
||||
assert!(postings_unopt.advance());
|
||||
assert_eq!(
|
||||
postings_opt.doc(),
|
||||
postings_unopt.doc(),
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
}
|
||||
assert!(!postings_unopt.advance());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,104 +1,90 @@
|
||||
use super::stacker::{Addr, MemoryArena, TermHashMap};
|
||||
|
||||
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
use postings::UnorderedTermId;
|
||||
use DocId;
|
||||
use schema::Term;
|
||||
use postings::{FieldSerializer, InvertedIndexSerializer};
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Field, FieldEntry, FieldType, Schema, Term};
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::collections::HashMap;
|
||||
use postings::Recorder;
|
||||
use Result;
|
||||
use schema::{Field, Schema};
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::DerefMut;
|
||||
use termdict::TermOrdinal;
|
||||
use datastruct::stacker::{Heap, TermHashMap};
|
||||
use postings::{NothingRecorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use tokenizer::Token;
|
||||
use tokenizer::TokenStream;
|
||||
use DocId;
|
||||
use Result;
|
||||
use schema::IndexRecordOption;
|
||||
use postings::UnorderedTermId;
|
||||
|
||||
fn posting_from_field_entry<'a>(field_entry: &FieldEntry) -> Box<PostingsWriter> {
|
||||
fn posting_from_field_entry<'a>(
|
||||
field_entry: &FieldEntry,
|
||||
heap: &'a Heap,
|
||||
) -> Box<PostingsWriter + 'a> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|indexing_options| match indexing_options.index_option() {
|
||||
IndexRecordOption::Basic => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
// FieldType::Bytes cannot actually be indexed.
|
||||
// TODO fix during the indexer refactoring described in #276
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MultiFieldPostingsWriter {
|
||||
heap: MemoryArena,
|
||||
schema: Schema,
|
||||
term_index: TermHashMap,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter>>,
|
||||
pub struct MultiFieldPostingsWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
term_index: TermHashMap<'a>,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
|
||||
}
|
||||
|
||||
impl MultiFieldPostingsWriter {
|
||||
impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
/// Create a new `MultiFieldPostingsWriter` given
|
||||
/// a schema and a heap.
|
||||
pub fn new(schema: &Schema, table_bits: usize) -> MultiFieldPostingsWriter {
|
||||
let term_index = TermHashMap::new(table_bits);
|
||||
pub fn new(schema: &Schema, table_bits: usize, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
|
||||
let term_index = TermHashMap::new(table_bits, heap);
|
||||
let per_field_postings_writers: Vec<_> = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field_entry| posting_from_field_entry(field_entry))
|
||||
.map(|field_entry| posting_from_field_entry(field_entry, heap))
|
||||
.collect();
|
||||
|
||||
MultiFieldPostingsWriter {
|
||||
heap: MemoryArena::new(),
|
||||
schema: schema.clone(),
|
||||
heap,
|
||||
term_index,
|
||||
per_field_postings_writers,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.term_index.mem_usage() + self.heap.mem_usage()
|
||||
}
|
||||
|
||||
pub fn index_text(&mut self, doc: DocId, field: Field, token_stream: &mut TokenStream) -> u32 {
|
||||
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
|
||||
postings_writer.index_text(
|
||||
&mut self.term_index,
|
||||
doc,
|
||||
field,
|
||||
token_stream,
|
||||
&mut self.heap,
|
||||
)
|
||||
postings_writer.index_text(&mut self.term_index, doc, field, token_stream, self.heap)
|
||||
}
|
||||
|
||||
pub fn subscribe(&mut self, doc: DocId, term: &Term) -> UnorderedTermId {
|
||||
let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
|
||||
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, &mut self.heap)
|
||||
postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, self.heap)
|
||||
}
|
||||
|
||||
/// Serialize the inverted index.
|
||||
/// It pushes all term, one field at a time, towards the
|
||||
/// postings serializer.
|
||||
#[allow(needless_range_loop)]
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self
|
||||
.term_index
|
||||
.iter()
|
||||
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
|
||||
.collect();
|
||||
) -> Result<HashMap<Field, HashMap<UnorderedTermId, usize>>> {
|
||||
let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
|
||||
term_offsets.sort_by_key(|&(k, _, _)| k);
|
||||
|
||||
let mut offsets: Vec<(Field, usize)> = vec![];
|
||||
@@ -108,10 +94,8 @@ impl MultiFieldPostingsWriter {
|
||||
.map(|(key, _, _)| Term::wrap(key).field())
|
||||
.enumerate();
|
||||
|
||||
let mut unordered_term_mappings: HashMap<
|
||||
Field,
|
||||
HashMap<UnorderedTermId, TermOrdinal>,
|
||||
> = HashMap::new();
|
||||
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, usize>> =
|
||||
HashMap::new();
|
||||
|
||||
let mut prev_field = Field(u32::max_value());
|
||||
for (offset, field) in term_offsets_it {
|
||||
@@ -126,46 +110,40 @@ impl MultiFieldPostingsWriter {
|
||||
let (field, start) = offsets[i];
|
||||
let (_, stop) = offsets[i + 1];
|
||||
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
|
||||
match field_entry.field_type() {
|
||||
&FieldType::Str(_) | &FieldType::HierarchicalFacet => {
|
||||
// populating the (unordered term ord) -> (ordered term ord) mapping
|
||||
// for the field.
|
||||
let mut unordered_term_ids = term_offsets[start..stop]
|
||||
.iter()
|
||||
.map(|&(_, _, bucket)| bucket);
|
||||
let mut mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
|
||||
.enumerate()
|
||||
.map(|(term_ord, unord_term_id)| {
|
||||
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
|
||||
})
|
||||
.collect();
|
||||
unordered_term_mappings.insert(field, mapping);
|
||||
}
|
||||
&FieldType::U64(_) | &FieldType::I64(_) => {}
|
||||
&FieldType::Bytes => {}
|
||||
// populating the unordered term ord -> ordered term ord mapping
|
||||
// for the field.
|
||||
let mut mapping = HashMap::new();
|
||||
for (term_ord, term_unord_id) in term_offsets[start..stop]
|
||||
.iter()
|
||||
.map(|&(_, _, bucket)| bucket)
|
||||
.enumerate()
|
||||
{
|
||||
mapping.insert(term_unord_id, term_ord);
|
||||
}
|
||||
unordered_term_mappings.insert(field, mapping);
|
||||
|
||||
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
|
||||
let mut field_serializer =
|
||||
serializer.new_field(field, postings_writer.total_num_tokens())?;
|
||||
let mut field_serializer = serializer.new_field(field, postings_writer.total_num_tokens())?;
|
||||
postings_writer.serialize(
|
||||
&term_offsets[start..stop],
|
||||
&mut field_serializer,
|
||||
&self.term_index.heap,
|
||||
&self.heap,
|
||||
self.heap,
|
||||
)?;
|
||||
field_serializer.close()?;
|
||||
}
|
||||
Ok(unordered_term_mappings)
|
||||
}
|
||||
|
||||
/// Return true iff the term dictionary is saturated.
|
||||
pub fn is_term_saturated(&self) -> bool {
|
||||
self.term_index.is_saturated()
|
||||
}
|
||||
}
|
||||
|
||||
/// The `PostingsWriter` is in charge of receiving documents
|
||||
/// and building a `Segment` in anonymous memory.
|
||||
///
|
||||
/// `PostingsWriter` writes in a `MemoryArena`.
|
||||
/// `PostingsWriter` writes in a `Heap`.
|
||||
pub trait PostingsWriter {
|
||||
/// Record that a document contains a term at a given position.
|
||||
///
|
||||
@@ -180,17 +158,16 @@ pub trait PostingsWriter {
|
||||
doc: DocId,
|
||||
pos: u32,
|
||||
term: &Term,
|
||||
heap: &mut MemoryArena,
|
||||
heap: &Heap,
|
||||
) -> UnorderedTermId;
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
|
||||
term_addrs: &[(&[u8], u32, UnorderedTermId)],
|
||||
serializer: &mut FieldSerializer,
|
||||
term_heap: &MemoryArena,
|
||||
heap: &MemoryArena,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()>;
|
||||
|
||||
/// Tokenize a text and subscribe all of its token.
|
||||
@@ -200,9 +177,10 @@ pub trait PostingsWriter {
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
token_stream: &mut TokenStream,
|
||||
heap: &mut MemoryArena,
|
||||
heap: &Heap,
|
||||
) -> u32 {
|
||||
let mut term = Term::for_field(field);
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
term.set_field(field);
|
||||
let num_tokens = {
|
||||
let mut sink = |token: &Token| {
|
||||
term.set_text(token.text.as_str());
|
||||
@@ -218,67 +196,61 @@ pub trait PostingsWriter {
|
||||
|
||||
/// The `SpecializedPostingsWriter` is just here to remove dynamic
|
||||
/// dispatch to the recorder information.
|
||||
pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
|
||||
pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
|
||||
heap: &'a Heap,
|
||||
total_num_tokens: u64,
|
||||
_recorder_type: PhantomData<Rec>,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
|
||||
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
/// constructor
|
||||
pub fn new() -> SpecializedPostingsWriter<Rec> {
|
||||
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
|
||||
SpecializedPostingsWriter {
|
||||
heap,
|
||||
total_num_tokens: 0u64,
|
||||
_recorder_type: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
|
||||
pub fn new_boxed() -> Box<PostingsWriter> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new())
|
||||
pub fn new_boxed(heap: &'a Heap) -> Box<PostingsWriter + 'a> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new(heap))
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
term_index: &mut TermHashMap,
|
||||
doc: DocId,
|
||||
position: u32,
|
||||
term: &Term,
|
||||
heap: &mut MemoryArena,
|
||||
heap: &Heap,
|
||||
) -> UnorderedTermId {
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
self.total_num_tokens += 1;
|
||||
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
|
||||
if opt_recorder.is_some() {
|
||||
let mut recorder = opt_recorder.unwrap();
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
recorder.close_doc(heap);
|
||||
recorder.new_doc(doc, heap);
|
||||
}
|
||||
recorder.record_position(position, heap);
|
||||
recorder
|
||||
} else {
|
||||
let mut recorder = Rec::new(heap);
|
||||
recorder.new_doc(doc, heap);
|
||||
recorder.record_position(position, heap);
|
||||
recorder
|
||||
let (term_ord, recorder): (UnorderedTermId, &mut Rec) = term_index.get_or_create(term);
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
if current_doc != u32::max_value() {
|
||||
recorder.close_doc(heap);
|
||||
}
|
||||
}) as UnorderedTermId
|
||||
recorder.new_doc(doc, heap);
|
||||
}
|
||||
self.total_num_tokens += 1;
|
||||
recorder.record_position(position, heap);
|
||||
term_ord
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
|
||||
term_addrs: &[(&[u8], u32, UnorderedTermId)],
|
||||
serializer: &mut FieldSerializer,
|
||||
termdict_heap: &MemoryArena,
|
||||
heap: &MemoryArena,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
for &(term_bytes, addr, _) in term_addrs {
|
||||
let recorder: Rec = unsafe { termdict_heap.read(addr) };
|
||||
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
|
||||
serializer.new_term(&term_bytes[4..])?;
|
||||
recorder.serialize(serializer, heap)?;
|
||||
recorder.serialize(addr, serializer, heap)?;
|
||||
serializer.close_term()?;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
|
||||
use postings::FieldSerializer;
|
||||
use std::{self, io};
|
||||
use DocId;
|
||||
use std::{self, io};
|
||||
use postings::FieldSerializer;
|
||||
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
|
||||
|
||||
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
|
||||
const POSITION_END: u32 = std::u32::MAX;
|
||||
@@ -15,53 +15,62 @@ const POSITION_END: u32 = std::u32::MAX;
|
||||
/// * the document id
|
||||
/// * the term frequency
|
||||
/// * the term positions
|
||||
pub trait Recorder: Copy {
|
||||
///
|
||||
fn new(heap: &mut MemoryArena) -> Self;
|
||||
pub trait Recorder: HeapAllocable {
|
||||
/// Returns the current document
|
||||
fn current_doc(&self) -> u32;
|
||||
/// Starts recording information about a new document
|
||||
/// This method shall only be called if the term is within the document.
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena);
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap);
|
||||
/// Record the position of a term. For each document,
|
||||
/// this method will be called `term_freq` times.
|
||||
fn record_position(&mut self, position: u32, heap: &mut MemoryArena);
|
||||
fn record_position(&mut self, position: u32, heap: &Heap);
|
||||
/// Close the document. It will help record the term frequency.
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena);
|
||||
fn close_doc(&mut self, heap: &Heap);
|
||||
/// Pushes the postings information to the serializer.
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>;
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()>;
|
||||
}
|
||||
|
||||
/// Only records the doc ids
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct NothingRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
impl HeapAllocable for NothingRecorder {
|
||||
fn with_addr(addr: u32) -> NothingRecorder {
|
||||
NothingRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
current_doc: u32::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
|
||||
fn record_position(&mut self, _position: u32, _heap: &Heap) {}
|
||||
|
||||
fn close_doc(&mut self, _heap: &mut MemoryArena) {}
|
||||
fn close_doc(&mut self, _heap: &Heap) {}
|
||||
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
for doc in self.stack.iter(heap) {
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
for doc in self.stack.iter(self_addr, heap) {
|
||||
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -69,47 +78,52 @@ impl Recorder for NothingRecorder {
|
||||
}
|
||||
|
||||
/// Recorder encoding document ids, and term frequencies
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
current_tf: u32,
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
impl HeapAllocable for TermFrequencyRecorder {
|
||||
fn with_addr(addr: u32) -> TermFrequencyRecorder {
|
||||
TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
current_doc: u32::max_value(),
|
||||
current_tf: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
|
||||
fn record_position(&mut self, _position: u32, _heap: &Heap) {
|
||||
self.current_tf += 1;
|
||||
}
|
||||
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
||||
fn close_doc(&mut self, heap: &Heap) {
|
||||
debug_assert!(self.current_tf > 0);
|
||||
self.stack.push(self.current_tf, heap);
|
||||
self.current_tf = 0;
|
||||
}
|
||||
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
// the last document has not been closed...
|
||||
// its term freq is self.current_tf.
|
||||
let mut doc_iter = self
|
||||
.stack
|
||||
.iter(heap)
|
||||
let mut doc_iter = self.stack
|
||||
.iter(self_addr, heap)
|
||||
.chain(Some(self.current_tf).into_iter());
|
||||
|
||||
while let Some(doc) = doc_iter.next() {
|
||||
@@ -123,40 +137,46 @@ impl Recorder for TermFrequencyRecorder {
|
||||
}
|
||||
|
||||
/// Recorder encoding term frequencies as well as positions.
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct TFAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList,
|
||||
current_doc: DocId,
|
||||
}
|
||||
|
||||
impl Recorder for TFAndPositionRecorder {
|
||||
fn new(heap: &mut MemoryArena) -> Self {
|
||||
impl HeapAllocable for TFAndPositionRecorder {
|
||||
fn with_addr(addr: u32) -> TFAndPositionRecorder {
|
||||
TFAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(heap),
|
||||
stack: ExpUnrolledLinkedList::with_addr(addr),
|
||||
current_doc: u32::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TFAndPositionRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||
fn new_doc(&mut self, doc: DocId, heap: &Heap) {
|
||||
self.current_doc = doc;
|
||||
self.stack.push(doc, heap);
|
||||
}
|
||||
|
||||
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
|
||||
fn record_position(&mut self, position: u32, heap: &Heap) {
|
||||
self.stack.push(position, heap);
|
||||
}
|
||||
|
||||
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
||||
fn close_doc(&mut self, heap: &Heap) {
|
||||
self.stack.push(POSITION_END, heap);
|
||||
}
|
||||
|
||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
let mut doc_positions = Vec::with_capacity(100);
|
||||
let mut positions_iter = self.stack.iter(heap);
|
||||
let mut positions_iter = self.stack.iter(self_addr, heap);
|
||||
while let Some(doc) = positions_iter.next() {
|
||||
let mut prev_position = 0;
|
||||
doc_positions.clear();
|
||||
|
||||
@@ -2,15 +2,15 @@ use compression::{BlockDecoder, CompressedIntStream, VIntDecoder, COMPRESSION_BL
|
||||
use DocId;
|
||||
|
||||
use common::BitSet;
|
||||
use common::CountingWriter;
|
||||
use common::HasLen;
|
||||
use compression::compressed_block_size;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use postings::Postings;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fst::Streamer;
|
||||
use postings::serializer::PostingsSerializer;
|
||||
use compression::compressed_block_size;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use postings::FreqReadingOption;
|
||||
use postings::Postings;
|
||||
use postings::serializer::PostingsSerializer;
|
||||
use common::CountingWriter;
|
||||
|
||||
struct PositionComputer {
|
||||
// store the amount of position int
|
||||
@@ -84,13 +84,9 @@ impl SegmentPostings {
|
||||
for &doc in docs {
|
||||
postings_serializer.write_doc(doc, 1u32).unwrap();
|
||||
}
|
||||
postings_serializer
|
||||
.close_term()
|
||||
.expect("In memory Serialization should never fail.");
|
||||
postings_serializer.close_term().expect("In memory Serialization should never fail.");
|
||||
}
|
||||
let (buffer, _) = counting_writer
|
||||
.finish()
|
||||
.expect("Serializing in a buffer should never fail.");
|
||||
let (buffer , _) = counting_writer.finish().expect("Serializing in a buffer should never fail.");
|
||||
let data = ReadOnlySource::from(buffer);
|
||||
let block_segment_postings = BlockSegmentPostings::from_data(
|
||||
docs.len(),
|
||||
@@ -102,6 +98,7 @@ impl SegmentPostings {
|
||||
}
|
||||
|
||||
impl SegmentPostings {
|
||||
|
||||
/// Reads a Segment postings from an &[u8]
|
||||
///
|
||||
/// * `len` - number of document in the posting lists.
|
||||
@@ -128,7 +125,7 @@ fn exponential_search(target: u32, mut start: usize, arr: &[u32]) -> (usize, usi
|
||||
loop {
|
||||
let new = start + jump;
|
||||
if new >= end {
|
||||
return (start, end);
|
||||
return (start, end)
|
||||
}
|
||||
if arr[new] > target {
|
||||
return (start, new);
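An illustrative aside (not from the commits shown): only a fragment of exponential_search is visible in this hunk. The idea is to gallop forward from start, probing positions at exponentially growing offsets until the probed value exceeds target or the slice ends, which bounds the window that a finer search then scans. The sketch below assumes a doubling jump and a fixed start, which may differ from tantivy's actual implementation.

// Sketch of an exponential (galloping) search over a sorted slice.
fn exponential_search(target: u32, start: usize, arr: &[u32]) -> (usize, usize) {
    let end = arr.len();
    let mut jump = 1;
    loop {
        let new = start + jump;
        if new >= end {
            return (start, end);
        }
        if arr[new] > target {
            return (start, new);
        }
        jump *= 2; // assumed growth step
    }
}

fn main() {
    let arr: Vec<u32> = (0..100u32).map(|i| i * 3).collect();
    let (lo, hi) = exponential_search(40, 0, &arr);
    // If `target` is present it can only live inside arr[lo..hi].
    assert_eq!((lo, hi), (0, 16)); // arr[16] == 48 is the first probe greater than 40
}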
|
||||
@@ -166,8 +163,7 @@ impl DocSet for SegmentPostings {
|
||||
if self.position_computer.is_some() {
|
||||
let freqs_skipped = &self.block_cursor.freqs()[self.cur..];
|
||||
let sum_freq: u32 = freqs_skipped.iter().sum();
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
self.position_computer.as_mut()
|
||||
.unwrap()
|
||||
.add_skip(sum_freq as usize);
|
||||
}
|
||||
@@ -202,8 +198,7 @@ impl DocSet for SegmentPostings {
|
||||
if self.position_computer.is_some() {
|
||||
let freqs_skipped = &self.block_cursor.freqs()[self.cur..start];
|
||||
let sum_freqs: u32 = freqs_skipped.iter().sum();
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
self.position_computer.as_mut()
|
||||
.unwrap()
|
||||
.add_skip(sum_freqs as usize);
|
||||
}
|
||||
@@ -216,6 +211,7 @@ impl DocSet for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
@@ -266,6 +262,7 @@ impl DocSet for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl HasLen for SegmentPostings {
|
||||
fn len(&self) -> usize {
|
||||
self.block_cursor.doc_freq()
|
||||
@@ -279,11 +276,16 @@ impl Postings for SegmentPostings {
|
||||
|
||||
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
if self.position_computer.is_some() {
|
||||
output.resize(self.term_freq() as usize, 0u32);
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.positions_with_offset(offset, &mut output[..])
|
||||
let prev_capacity = output.capacity();
|
||||
let term_freq = self.term_freq() as usize;
|
||||
if term_freq > prev_capacity {
|
||||
let additional_len = term_freq - output.len();
|
||||
output.reserve(additional_len);
|
||||
}
|
||||
unsafe {
|
||||
output.set_len(term_freq);
|
||||
self.position_computer.as_mut().unwrap().positions_with_offset(offset, &mut output[..])
|
||||
}
|
||||
} else {
|
||||
output.clear();
|
||||
}
|
||||
@@ -399,8 +401,7 @@ impl BlockSegmentPostings {
|
||||
/// Returns false iff there was no remaining blocks.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.num_bitpacked_blocks > 0 {
|
||||
let num_consumed_bytes = self
|
||||
.doc_decoder
|
||||
let num_consumed_bytes = self.doc_decoder
|
||||
.uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
match self.freq_reading_option {
|
||||
@@ -410,8 +411,7 @@ impl BlockSegmentPostings {
|
||||
self.remaining_data.advance(num_bytes_to_skip);
|
||||
}
|
||||
FreqReadingOption::ReadFreq => {
|
||||
let num_consumed_bytes = self
|
||||
.freq_decoder
|
||||
let num_consumed_bytes = self.freq_decoder
|
||||
.uncompress_block_unsorted(self.remaining_data.as_ref());
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
@@ -473,16 +473,16 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::BlockSegmentPostings;
|
||||
use super::SegmentPostings;
|
||||
use common::HasLen;
|
||||
use core::Index;
|
||||
use docset::DocSet;
|
||||
use super::SegmentPostings;
|
||||
use schema::SchemaBuilder;
|
||||
use core::Index;
|
||||
use schema::INT_INDEXED;
|
||||
use schema::Term;
|
||||
use fst::Streamer;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Term;
|
||||
use schema::INT_INDEXED;
|
||||
use common::HasLen;
|
||||
use super::BlockSegmentPostings;
|
||||
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
@@ -570,3 +570,4 @@ mod tests {
|
||||
assert_eq!(block_segments.docs(), &[1, 3, 5]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,16 +1,20 @@
|
||||
use super::TermInfo;
|
||||
use common::BinarySerializable;
|
||||
use common::{CompositeWrite, CountingWriter};
|
||||
use compression::VIntEncoder;
|
||||
use compression::{BlockEncoder, COMPRESSION_BLOCK_SIZE};
|
||||
use core::Segment;
|
||||
use directory::WritePtr;
|
||||
use schema::Schema;
|
||||
use schema::{Field, FieldEntry, FieldType};
|
||||
use std::io::{self, Write};
|
||||
use termdict::{TermDictionaryBuilder, TermOrdinal};
|
||||
use DocId;
|
||||
use Result;
|
||||
use termdict::TermDictionaryBuilderImpl;
|
||||
use super::TermInfo;
|
||||
use schema::Field;
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use directory::WritePtr;
|
||||
use compression::{BlockEncoder, COMPRESSION_BLOCK_SIZE};
|
||||
use DocId;
|
||||
use core::Segment;
|
||||
use std::io::{self, Write};
|
||||
use compression::VIntEncoder;
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
use common::CompositeWrite;
|
||||
use termdict::TermDictionaryBuilder;
|
||||
|
||||
/// `PostingsSerializer` is in charge of serializing
|
||||
/// postings on disk, in the
|
||||
@@ -81,11 +85,7 @@ impl InvertedIndexSerializer {
|
||||
/// a given field.
|
||||
///
|
||||
/// Loads the indexing options for the given field.
|
||||
pub fn new_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
total_num_tokens: u64,
|
||||
) -> io::Result<FieldSerializer> {
|
||||
pub fn new_field(&mut self, field: Field, total_num_tokens: u64) -> io::Result<FieldSerializer> {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
let term_dictionary_write = self.terms_write.for_field(field);
|
||||
let postings_write = self.postings_write.for_field(field);
|
||||
@@ -111,12 +111,11 @@ impl InvertedIndexSerializer {
|
||||
/// The field serializer is in charge of
|
||||
/// the serialization of a specific field.
|
||||
pub struct FieldSerializer<'a> {
|
||||
term_dictionary_builder: TermDictionaryBuilder<&'a mut CountingWriter<WritePtr>>,
|
||||
term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter<WritePtr>>,
|
||||
postings_serializer: PostingsSerializer<&'a mut CountingWriter<WritePtr>>,
|
||||
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
|
||||
current_term_info: TermInfo,
|
||||
term_open: bool,
|
||||
num_terms: TermOrdinal,
|
||||
}
|
||||
|
||||
impl<'a> FieldSerializer<'a> {
|
||||
@@ -126,6 +125,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
postings_write: &'a mut CountingWriter<WritePtr>,
|
||||
positions_write: &'a mut CountingWriter<WritePtr>,
|
||||
) -> io::Result<FieldSerializer<'a>> {
|
||||
|
||||
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
|
||||
FieldType::Str(ref text_options) => {
|
||||
if let Some(text_indexing_options) = text_options.get_indexing_options() {
|
||||
@@ -141,7 +141,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
_ => (false, false),
|
||||
};
|
||||
let term_dictionary_builder =
|
||||
TermDictionaryBuilder::new(term_dictionary_write, field_type)?;
|
||||
TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?;
|
||||
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled);
|
||||
let positions_serializer_opt = if position_enabled {
|
||||
Some(PositionSerializer::new(positions_write))
|
||||
@@ -155,13 +155,11 @@ impl<'a> FieldSerializer<'a> {
|
||||
positions_serializer_opt,
|
||||
current_term_info: TermInfo::default(),
|
||||
term_open: false,
|
||||
num_terms: TermOrdinal::default(),
|
||||
})
|
||||
}
|
||||
|
||||
fn current_term_info(&self) -> TermInfo {
|
||||
let (filepos, offset) = self
|
||||
.positions_serializer_opt
|
||||
let (filepos, offset) = self.positions_serializer_opt
|
||||
.as_ref()
|
||||
.map(|positions_serializer| positions_serializer.addr())
|
||||
.unwrap_or((0u64, 0u8));
|
||||
@@ -177,7 +175,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
/// * term - the term. It needs to come after the previous term according
|
||||
/// to the lexicographical order.
|
||||
/// * doc_freq - return the number of document containing the term.
|
||||
pub fn new_term(&mut self, term: &[u8]) -> io::Result<TermOrdinal> {
|
||||
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
|
||||
assert!(
|
||||
!self.term_open,
|
||||
"Called new_term, while the previous term was not closed."
|
||||
@@ -185,10 +183,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
self.term_open = true;
|
||||
self.postings_serializer.clear();
|
||||
self.current_term_info = self.current_term_info();
|
||||
self.term_dictionary_builder.insert_key(term)?;
|
||||
let term_ordinal = self.num_terms;
|
||||
self.num_terms += 1;
|
||||
Ok(term_ordinal)
|
||||
self.term_dictionary_builder.insert_key(term)
|
||||
}
|
||||
|
||||
/// Serialize the information that a document contains the current term,
|
||||
@@ -273,8 +268,7 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
|
||||
{
|
||||
// encode the doc ids
|
||||
let block_encoded: &[u8] = self
|
||||
.block_encoder
|
||||
let block_encoded: &[u8] = self.block_encoder
|
||||
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
@@ -300,16 +294,14 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
// In that case, the remaining part is encoded
|
||||
// using variable int encoding.
|
||||
{
|
||||
let block_encoded = self
|
||||
.block_encoder
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.doc_ids.clear();
|
||||
}
|
||||
// ... Idem for term frequencies
|
||||
if self.termfreq_enabled {
|
||||
let block_encoded = self
|
||||
.block_encoder
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_unsorted(&self.term_freqs[..]);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.term_freqs.clear();
|
||||
|
||||
@@ -1,218 +0,0 @@
|
||||
use super::{Addr, MemoryArena};
|
||||
|
||||
use common::is_power_of_2;
|
||||
use std::mem;
|
||||
|
||||
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
|
||||
|
||||
const FIRST_BLOCK: u32 = 4u32;
|
||||
|
||||
#[inline]
|
||||
pub fn jump_needed(len: u32) -> Option<usize> {
|
||||
match len {
|
||||
0...3 => None,
|
||||
4...MAX_BLOCK_LEN => {
|
||||
if is_power_of_2(len as usize) {
|
||||
Some(len as usize)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
n => {
|
||||
if n % MAX_BLOCK_LEN == 0 {
|
||||
Some(MAX_BLOCK_LEN as usize)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An exponential unrolled link.
|
||||
///
|
||||
/// The use case is as follows. Tantivy's indexer conceptually acts like a
|
||||
/// `HashMap<Term, Vec<u32>>`. As we come across a given term in document
|
||||
/// `D`, we lookup the term in the map and append the document id to its vector.
|
||||
///
|
||||
/// The vector is then only read when it is serialized.
|
||||
///
|
||||
/// The `ExpUnrolledLinkedList` offers a more efficient solution to this
|
||||
/// problem.
|
||||
///
|
||||
/// It combines the idea of the unrolled linked list and tries to address the
|
||||
/// problem of selecting an adequate block size using a strategy similar to
|
||||
/// that of the `Vec` amortized resize strategy.
|
||||
///
|
||||
/// Data is stored in a linked list of blocks. The first block has a size of `4`
|
||||
/// and each block has a length of twice that of the previous block up to
|
||||
/// `MAX_BLOCK_LEN = 32768`.
|
||||
///
|
||||
/// This strategy is a good trade off to handle numerous very rare terms
|
||||
/// and avoid wasting half of the memory for very frequent terms.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct ExpUnrolledLinkedList {
|
||||
len: u32,
|
||||
head: Addr,
|
||||
tail: Addr,
|
||||
}
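An illustrative aside (not from the deleted file itself): the doc comment above describes blocks of 4, 8, 16, ... elements, doubling up to MAX_BLOCK_LEN = 32768 and then staying at that size. A stand-alone sketch of that growth schedule, independent of MemoryArena and of the real jump_needed, is:

// Block sizes needed to hold `total_len` elements under the schedule described above.
fn block_sizes(total_len: u32) -> Vec<u32> {
    const MAX_BLOCK_LEN: u32 = 1 << 15; // 32_768, as in the deleted file
    let mut sizes = vec![4u32];
    let mut capacity = 4u32;
    while capacity < total_len {
        let next = (sizes.last().unwrap() * 2).min(MAX_BLOCK_LEN);
        sizes.push(next);
        capacity += next;
    }
    sizes
}

fn main() {
    // 4 + 8 + 16 + 32 = 60 elements fit in the first four blocks.
    assert_eq!(block_sizes(60), vec![4, 8, 16, 32]);
    // Once the cap is reached, every further block has the maximum length.
    assert_eq!(*block_sizes(200_000).last().unwrap(), 32_768);
}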
|
||||
|
||||
impl ExpUnrolledLinkedList {
|
||||
pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList {
|
||||
let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
|
||||
ExpUnrolledLinkedList {
|
||||
len: 0u32,
|
||||
head: addr,
|
||||
tail: addr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> {
|
||||
ExpUnrolledLinkedListIterator {
|
||||
heap,
|
||||
addr: self.head,
|
||||
len: self.len,
|
||||
consumed: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Appends a new element to the current stack.
|
||||
///
|
||||
/// If the current block end is reached, a new block is allocated.
|
||||
pub fn push(&mut self, val: u32, heap: &mut MemoryArena) {
|
||||
self.len += 1;
|
||||
if let Some(new_block_len) = jump_needed(self.len) {
|
||||
// We need to allocate another block.
|
||||
// We also allocate an extra `u32` to store the pointer
|
||||
// to the future next block.
|
||||
let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
|
||||
let new_block_addr: Addr = heap.allocate_space(new_block_size);
|
||||
unsafe {
|
||||
// logic
|
||||
heap.write(self.tail, new_block_addr)
|
||||
};
|
||||
self.tail = new_block_addr;
|
||||
}
|
||||
unsafe {
|
||||
// logic
|
||||
heap.write(self.tail, val);
|
||||
self.tail = self.tail.offset(mem::size_of::<u32>() as u32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExpUnrolledLinkedListIterator<'a> {
|
||||
heap: &'a MemoryArena,
|
||||
addr: Addr,
|
||||
len: u32,
|
||||
consumed: u32,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.consumed == self.len {
|
||||
None
|
||||
} else {
|
||||
self.consumed += 1;
|
||||
let addr: Addr = if jump_needed(self.consumed).is_some() {
|
||||
unsafe {
|
||||
// logic
|
||||
self.heap.read(self.addr)
|
||||
}
|
||||
} else {
|
||||
self.addr
|
||||
};
|
||||
self.addr = addr.offset(mem::size_of::<u32>() as u32);
|
||||
Some(unsafe {
|
||||
// logic
|
||||
self.heap.read(addr)
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::super::MemoryArena;
|
||||
use super::jump_needed;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_stack() {
|
||||
let mut heap = MemoryArena::new();
|
||||
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
|
||||
stack.push(1u32, &mut heap);
|
||||
stack.push(2u32, &mut heap);
|
||||
stack.push(4u32, &mut heap);
|
||||
stack.push(8u32, &mut heap);
|
||||
{
|
||||
let mut it = stack.iter(&heap);
|
||||
assert_eq!(it.next().unwrap(), 1u32);
|
||||
assert_eq!(it.next().unwrap(), 2u32);
|
||||
assert_eq!(it.next().unwrap(), 4u32);
|
||||
assert_eq!(it.next().unwrap(), 8u32);
|
||||
assert!(it.next().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_jump_if_needed() {
|
||||
let mut block_len = 4u32;
|
||||
let mut i = 0;
|
||||
while i < 10_000_000 {
|
||||
assert!(jump_needed(i + block_len - 1).is_none());
|
||||
assert!(jump_needed(i + block_len + 1).is_none());
|
||||
assert!(jump_needed(i + block_len).is_some());
|
||||
let new_block_len = jump_needed(i + block_len).unwrap();
|
||||
i += block_len;
|
||||
block_len = new_block_len as u32;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::ExpUnrolledLinkedList;
|
||||
use tantivy_memory_arena::MemoryArena;
|
||||
use test::Bencher;
|
||||
|
||||
const NUM_STACK: usize = 10_000;
|
||||
const STACK_SIZE: u32 = 1000;
|
||||
|
||||
#[bench]
|
||||
fn bench_push_vec(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
let mut vecs = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
vecs.push(Vec::new());
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
vecs[t].push(i);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_push_stack(bench: &mut Bencher) {
|
||||
let heap = MemoryArena::new();
|
||||
bench.iter(|| {
|
||||
let mut stacks = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
|
||||
stacks.push(stack);
|
||||
}
|
||||
for s in 0..NUM_STACK {
|
||||
for i in 0u32..STACK_SIZE {
|
||||
let t = s * 392017 % NUM_STACK;
|
||||
stacks[t].push(i, &heap);
|
||||
}
|
||||
}
|
||||
heap.clear();
|
||||
});
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff.