Compare commits

...

149 Commits
0.2.0 ... 0.3.1

Author SHA1 Message Date
Paul Masurel
80f1e26c3b Tantivy 0.3.1 2017-04-23 15:52:07 +08:00
Paul Masurel
3e68b61d8f issue/122 Adds a garbage collect method 2017-04-23 15:51:06 +08:00
Paul Masurel
44c684af5c NOBUG Fixes winapi version 2017-04-08 19:01:31 +09:00
Paul Masurel
60279a03b6 RELEASE Tantivy 0.3. See Changelog 2017-04-08 18:53:40 +09:00
Paul Masurel
dc43135fe0 NOBUG Remove .info 2017-04-08 18:49:37 +09:00
Paul Masurel
ce022e5f06 issue/54 Clone segment reader rather than reload.
Closes #54.
2017-04-08 17:52:33 +09:00
Paul Masurel
0be977d9eb Merge pull request #114 from tantivy-search/issue/96
Closes Issue/96
2017-04-08 17:49:48 +09:00
Paul Masurel
a4ba20eea3 issue/96 code clean up, adding comments. 2017-04-08 17:30:25 +09:00
Paul Masurel
4bef6c99ee issue/96 Cleaning up some lock management 2017-04-05 10:12:39 +09:00
Paul Masurel
a84871468b issue/96 Rename FileError -> OpenReadError 2017-04-05 10:01:49 +09:00
Paul Masurel
e0a39fb273 issue/96 Added unit test, documentation and various tiny improvements. 2017-04-04 22:43:35 +09:00
Paul Masurel
35203378ef Considering merge options after calling end_merge 2017-04-03 17:26:21 +09:00
Paul Masurel
b5bf9bb13c issue/96 Looping over wait_merging_thread. 2017-04-03 08:39:18 +09:00
Paul Masurel
ea3349644c issue/96 Fixed unit test condition to something reasonable 2017-04-02 21:58:38 +09:00
Paul Masurel
d4f2e475ff issue/96 removed faulty assert 2017-04-02 19:21:20 +09:00
Paul Masurel
17631ed866 issue/96 Added functionality to protect files from deletion
Hopefully fixed the race condition happening when merging files.
2017-04-02 18:48:20 +09:00
Paul Masurel
9eb2d3e8c5 issue/96 avoid removing the bitset from segment_entry. 2017-04-02 16:26:28 +09:00
Paul Masurel
afd08a7bbc issue/96 Changed datastruct for the delete queue. 2017-04-01 21:01:10 +09:00
Paul Masurel
4fc7bc5f09 Added helper to create Vec with a given size 2017-03-31 18:54:23 +09:00
Paul Masurel
602b9d235f Merge pull request #113 from kaedroho/patch-1
Mark "cpp" folder as linguist-vendored in .gitattributes
2017-03-31 09:05:57 +09:00
Karl Hobley
b22c6b86c7 Mark "cpp" folder as linguist-vendored in .gitattributes
This repo is currently being detected as a C project because of some vendored libraries in the "cpp" folder.

According to https://github.com/github/linguist#using-gitattributes you can use ``.gitattributes`` to tell GitHub not to count this folder when detecting the language.
2017-03-30 13:43:03 +01:00
Paul Masurel
f0dc0de4b7 Added helper to create Vec with a given size 2017-03-29 11:26:24 +09:00
Paul Masurel
456dd3a60d issue/96 merge 2017-03-28 16:49:48 +09:00
Paul Masurel
d768a10077 master merged in feature branch 2017-03-27 09:27:23 +09:00
Paul Masurel
ddb2b8d807 test passing.
SegmentWriter create SegmentEntry which contain a delete_bitset
2017-03-26 18:32:53 +09:00
Paul Masurel
45806951b1 added quotation mark 2017-03-25 22:48:07 +09:00
Paul Masurel
84a060552d issue/109 trying to get proper logging in appveyor 2017-03-25 22:34:40 +09:00
Paul Masurel
68a956c6e7 issue/109 Showing debug! if test fails 2017-03-25 21:54:17 +09:00
Paul Masurel
f50f557cfc issue/109 Remove futures from most of segment_updater API. 2017-03-25 19:36:03 +09:00
Paul Masurel
daa19b770a (hopefully) bugfix race condition on wait merging thread. 2017-03-24 18:20:58 +09:00
Paul Masurel
e75402be80 Merge pull request #108 from KodrAus/ci/appveyor
Add appveyor config
2017-03-24 15:49:50 +09:00
Ashley Mannix
51cab39186 drop to vs2015 image 2017-03-24 16:37:30 +10:00
Ashley Mannix
c8e12b6847 try set mingw path 2017-03-24 16:22:32 +10:00
Ashley Mannix
b44a9cb89d add appveyor config 2017-03-24 16:11:51 +10:00
Paul Masurel
e650fab927 Merge pull request #106 from tantivy-search/wip/delay-test-deletes
Fix delete tests on Windows
2017-03-22 09:26:36 +09:00
Paul Masurel
b12a97abe4 Add unit test for when deleting fails
Test that when delete fails, we still keep
the file as managed.

Remove the error log for windows, as failing
to delete is expected.
2017-03-22 08:57:09 +09:00
Laurentiu Nicola
2b5a4bbde2 Don't delete twice on not(windows) 2017-03-21 07:48:58 +02:00
Laurentiu Nicola
2d169c4454 Delay deleting the files in the test suite to make it work on Windows 2017-03-21 07:37:28 +02:00
Paul Masurel
66d6e4e246 Merge pull request #103 from tantivy-search/lnicola-fix-sync-directory
Make directory syncing work on Windows (resubmit)
2017-03-21 10:55:03 +09:00
Paul Masurel
a061ba091d Merge pull request #105 from tantivy-search/wip/simdcomp-build
Avoid using make for building simdcomp
2017-03-21 10:00:49 +09:00
Laurentiu Nicola
92ce9b906b Avoid using make for building simdcomp 2017-03-21 00:25:04 +02:00
Laurentiu Nicola
1e0ac31e11 Clarify comment and use qualified import for the flag 2017-03-20 23:12:48 +02:00
Paul Masurel
ebcea0128c Getting the FLAG from the winapi module. 2017-03-19 11:09:15 +09:00
Paul Masurel
30075176cb blop 2017-03-19 10:52:54 +09:00
Laurentiu Nicola
7c114b602d Make directory syncing work on Windows 2017-03-19 02:17:13 +02:00
Paul Masurel
50659147d1 NOBUG updated simple_search.html 2017-03-14 12:04:21 +09:00
Paul Masurel
da10fe3b4d Various fixes. 2017-03-13 22:01:55 +09:00
Paul Masurel
4db56c6bd8 Merge pull request #101 from tantivy-search/issue/99
Improvements to simple_search.rs: fixes #100 and improves #99
2017-03-13 13:26:39 +09:00
Claus Matzinger
292dd6dcb6 fixup 2017-03-13 00:24:54 -04:00
Claus Matzinger
37e71f7c63 fixes #100 and improves #99 2017-03-12 22:59:38 -04:00
Paul Masurel
5932278e00 test passing 2017-03-13 10:00:19 +09:00
Paul Masurel
202dda98ba baby step 3 2017-03-12 19:00:57 +09:00
Paul Masurel
7c971b5d3b baby step 2 2017-03-11 16:14:20 +09:00
Paul Masurel
77c61ddab2 Baby step1 2017-03-11 14:20:46 +09:00
Paul Masurel
b7f026bab9 Merger returns a SegmentMeta 2017-03-10 09:05:51 +09:00
Paul Masurel
cc2f78184b Added unit test for #96 2017-03-10 09:05:51 +09:00
Paul Masurel
673423f762 Merge pull request #98 from KodrAus/feat/no-cpp
Convert simd wrapper to C
2017-03-09 13:11:08 +09:00
Paul Masurel
7532c4a440 Removed double ; 2017-03-09 10:57:30 +09:00
Ashley Mannix
324b56a60c fix warnings 2017-03-09 06:54:48 +10:00
Paul Masurel
ac3890f93c NOBUG Marked the functional test as ignore 2017-03-08 19:08:29 +09:00
Ashley Mannix
69b3de43f6 convert simd wrapper to c 2017-03-08 14:02:48 +10:00
Paul Masurel
3d1196d53e NOBUG added doc link. 2017-03-07 10:14:00 +09:00
Paul Masurel
a397537ed8 NOBUG added rustdoc 2017-03-07 10:10:43 +09:00
Paul Masurel
ebca904767 NOBUG added rustdoc 2017-03-07 09:58:51 +09:00
Paul Masurel
3a472914ce Fix .write -> .write_all 2017-03-06 16:28:30 +09:00
Paul Masurel
c59507444f issue/77 ManagedDirectory working
Closes #77
2017-03-06 12:18:36 +09:00
Paul Masurel
4b7afa2ae7 issue/77 Added managed directory 2017-03-03 22:41:37 +09:00
Paul Masurel
590a8582c9 The reference doc should not point to the schema page. 2017-02-28 21:17:19 +09:00
Paul Masurel
ab3440f925 NOBUG Bypass github cache for coveralls badge 2017-02-27 12:39:59 +09:00
Paul Masurel
ec5fb2eaa9 NOBUG cleanup 2017-02-27 09:52:28 +09:00
Paul Masurel
15b60d72cc NOBUG add_document does not return result 2017-02-27 09:36:41 +09:00
Paul Masurel
7a07144c68 Bugfix related with deletes, rollback and the index opstamp. 2017-02-27 01:42:25 +09:00
Paul Masurel
8bcfdb8e80 NOBUG misc ... 2017-02-26 21:35:18 +09:00
Paul Masurel
a7f10f055d NOBUG hiding doc, filling doc 2017-02-26 00:11:32 +09:00
Paul Masurel
597dac9cb6 NOBUG Adding doc. 2017-02-25 23:39:02 +09:00
Paul Masurel
6a002bcc76 NOBUG 2017-02-25 21:20:55 +09:00
Paul Masurel
3a86fc00a2 Closes #64 - Improve Index creation API / documentation 2017-02-25 20:40:39 +09:00
Paul Masurel
ca1617d3cd Fixes #91 2017-02-25 20:32:26 +09:00
Paul Masurel
e4a102d859 Merge branch 'issue/43'
Conflicts:
	src/directory/mmap_directory.rs
2017-02-25 19:36:21 +09:00
Paul Masurel
1d9924ee90 Closes #43. 2017-02-25 19:32:36 +09:00
Paul Masurel
f326a2dafe TODO hunt 2017-02-25 15:28:56 +09:00
Paul Masurel
78228ece73 Closes #92. ByteOrder of u32 terms. 2017-02-24 23:41:46 +09:00
Paul Masurel
503d0295cb issue/43 TODO hunt 2017-02-23 09:54:54 +09:00
Paul Masurel
eb39db44fc issue/43 Avoid keeping segments with 0 documents. 2017-02-23 09:20:30 +09:00
Paul Masurel
7f78d1f4ca Fixes #82 Renamed and commented the function to create Term from &[u8] 2017-02-23 08:33:59 +09:00
Paul Masurel
df9090cb0b NOBUG TODO hunt, and cleanups 2017-02-22 22:18:33 +09:00
Paul Masurel
4a8eb3cb05 issue/43 Added unit test for deletes including merging. 2017-02-22 21:38:37 +09:00
Paul Masurel
a74b41d7ed NOBUG run benchmark over exactly 100 K elements 2017-02-21 11:43:55 +09:00
Paul Masurel
06017bd422 NOBUG made the cleanup limit adaptive in MmapCache 2017-02-21 00:37:45 +09:00
Paul Masurel
17beaab8bf Merge branch 'issue/72' 2017-02-21 00:25:24 +09:00
Paul Masurel
062e38a2ab Fixes #72 - Cache directory uses weak ref. Introduced CacheInfo object. 2017-02-21 00:24:33 +09:00
Paul Masurel
8c2b20c496 NOBUG Trying to fix coverall conf. 2017-02-20 17:47:16 +09:00
Paul Masurel
c677eb9f13 issue/43 Removed notify 2017-02-19 22:41:45 +09:00
Paul Masurel
0f332d1fd3 issue/43 Removed doc freq from recorders. 2017-02-19 22:39:31 +09:00
Paul Masurel
1b45539f32 issue/43 Added support for delete in merged index 2017-02-19 22:39:31 +09:00
Paul Masurel
7315000fd4 issue/43 Merging ok for postings / fastfields. 2017-02-19 22:39:31 +09:00
Paul Masurel
e3d2fca844 issue/43 Isolated segment_entry / doc_opstamp_mapping 2017-02-19 22:39:31 +09:00
Paul Masurel
1c03d98a11 issue/43 added delete_queue right in the segment updater 2017-02-19 22:39:31 +09:00
Paul Masurel
8b68f22be1 issue/43 made the delete queue shareable 2017-02-19 22:39:31 +09:00
Paul Masurel
d007cf3435 issue/43 simplification. removed the notion of delete cursor. 2017-02-19 22:39:04 +09:00
Paul Masurel
72afbb28c7 issue/43 test passing 2017-02-19 22:39:04 +09:00
Paul Masurel
2fc3a505bc issue/43 refactoring segment meta 2017-02-19 22:39:04 +09:00
Paul Masurel
e337c35721 issue/43 SegmentMeta refactoring 2017-02-19 22:39:04 +09:00
Paul Masurel
0c318339b0 issue/43 Path logic in segment. 2017-02-19 22:39:04 +09:00
Paul Masurel
64fee11bc0 issue/43 Clean up 2017-02-19 22:39:04 +09:00
Paul Masurel
e12fc4bb09 issue/43 deletes
merge not working
only updating uncommitted
2017-02-19 22:39:04 +09:00
Paul Masurel
0820992141 issue/43 docstamp -> opstamp 2017-02-19 22:38:39 +09:00
Paul Masurel
09782858da issue/43 Segment have a commit opstamp 2017-02-19 22:38:39 +09:00
Paul Masurel
ca977fb17b issue/43 Refactoring of SegmentUpdater 2017-02-19 22:38:39 +09:00
Paul Masurel
e8ecb68f00 issue/43 switching for futures 2017-02-19 22:38:39 +09:00
Paul Masurel
0ec492dcf2 issue/43 refactoring in order to remove the segment updater non sense for simpler futures 2017-02-19 22:38:39 +09:00
Paul Masurel
20eb586660 issue/43 Rename SegmentUpdater 2017-02-19 22:38:39 +09:00
Paul Masurel
6530d43d6a issue/43 Small fixes. 2017-02-19 22:38:39 +09:00
Paul Masurel
926e71a573 issue/43 unit test running. segment updater uses futures. 2017-02-19 22:38:38 +09:00
Paul Masurel
bacaabf857 issue/43 fixed on unit test. need big refactoring of segment updater 2017-02-19 22:38:38 +09:00
Paul Masurel
d6e7157173 issue/43 Test broken... moved segment manager to the segment updater / segment writer 2017-02-19 22:38:15 +09:00
Paul Masurel
093dcbd253 issue/43 Isolated SegmentMeta 2017-02-19 22:38:15 +09:00
Paul Masurel
fba44b78b6 issue/43 Added delete doc file 2017-02-19 22:38:15 +09:00
Paul Masurel
01cf303dec issue/43 segment writer 2017-02-19 22:38:14 +09:00
Paul Masurel
d5c161e196 issue/43 Computing deleted doc bitset 2017-02-19 22:38:14 +09:00
Paul Masurel
183d5221b5 issue/43 DeleteQueue. 2017-02-19 22:38:14 +09:00
Paul Masurel
5a06f45403 issue/43 small progress 2017-02-19 22:36:57 +09:00
Paul Masurel
395cbf3913 issue/43 Change the delete queue datastruct for something cleaner/functional 2017-02-19 22:36:57 +09:00
Paul Masurel
fe2ddb8844 issue43 Added DeleteQueue. 2017-02-19 22:36:57 +09:00
Paul Masurel
3129701e92 issue/71 Added list of supported OSes 2017-02-19 14:14:15 +09:00
Paul Masurel
56ba698def Merge pull request #76 from Ameobea/master
Updated dependency versions and implementations
2017-02-17 18:20:44 +09:00
Casey Primozic
e0ba699c16 Updated dependency versions and implementations
- Updated `byteorder` error usage (now returns straight `Error`s)
 - Updated `Uuid` implementation (`to_simple_string` now `.simple().to_string()`)
2017-02-17 01:26:13 -06:00
Paul Masurel
b6423f9a76 Merge pull request #73 from manuel-woelker/pr-subtree
Use git subtree mechanism for simdcomp to simplify build (cf. #24)
2017-01-27 14:52:36 +09:00
Manuel Woelker
a667394a49 update README and build after simdcomp subtree refactor 2017-01-26 21:14:05 +01:00
Manuel Woelker
9f02b090dd Merge commit 'f07ccd6e4fbc5bbfeb94d40e0f14bc527a7d5439' as 'cpp/simdcomp' 2017-01-26 20:28:23 +01:00
Manuel Woelker
f07ccd6e4f Squashed 'cpp/simdcomp/' content from commit 0dca286
git-subtree-dir: cpp/simdcomp
git-subtree-split: 0dca28668f1fb6d343dc3c62fa7750a00f1d7201
2017-01-26 20:28:23 +01:00
Manuel Woelker
f19f8757de remove git submodule to replace via git subtree 2017-01-26 20:17:44 +01:00
Paul Masurel
f729edb529 NOBUG added badges / categories for crates.io 2017-01-21 09:35:44 +09:00
Paul Masurel
73ef201c44 Merge branch 'master' of github.com:tantivy-search/tantivy 2017-01-11 21:09:05 +09:00
Paul Masurel
3b69e790e9 NOBUG expose a version public api. Handy to check if the compilation was made with simd or not. 2017-01-11 21:06:41 +09:00
Paul Masurel
1b0b3051c2 NOBUG Pinned some version, removed import warning. 2017-01-09 15:30:50 +09:00
Paul Masurel
43c1da1a92 Merge branch 'issue/67' 2016-12-20 16:52:33 +01:00
Paul Masurel
e1cb5e299d NOBUG split field_type into 2 2016-12-20 16:51:34 +01:00
Paul Masurel
14ebed392b Merge pull request #68 from tantivy-search/issue/67
Issue/67
2016-12-20 11:27:05 +01:00
Paul Masurel
d3d34be167 issue/67 Added an advance interface to the term iterator 2016-12-20 11:25:52 +01:00
Paul Masurel
98cdc83428 Issue #67 Removing afterwards. 2016-12-18 11:57:28 +01:00
Paul Masurel
4d7d201f21 Issue #67 - Removed segment ord array from term iteration.
This was probably an early optimization.
2016-12-17 09:44:51 +01:00
Paul Masurel
ca5f3e1d46 issue/67 First stab. Iterator working. 2016-12-17 00:58:12 +01:00
Paul Masurel
1559733b03 Merge pull request #63 from vandenoever/readme
fix for build instructions
2016-12-12 10:17:27 +09:00
Paul Masurel
44b5f1868c Merge branch 'master' into readme 2016-12-12 10:17:19 +09:00
Paul Masurel
4cedfd903d NOBUG Added ga beacon to README 2016-12-12 10:07:30 +09:00
Paul Masurel
c0049e8487 NOBUG fixed doc urls. 2016-12-11 21:43:14 +09:00
Paul Masurel
e88adbff5c Bumped tantivy's version in Cargo.toml 2016-12-11 17:51:45 +09:00
Jos van den Oever
e497e04f70 fix for build instructions
And clarification that nightly is required.
2016-12-10 18:08:15 +01:00
123 changed files with 100720 additions and 2155 deletions

1
.gitattributes vendored Normal file

@@ -0,0 +1 @@
cpp/* linguist-vendored

1
.gitignore vendored

@@ -5,3 +5,4 @@ target/release
Cargo.lock
benchmark
.DS_Store
cpp/simdcomp/bitpackingbenchmark

3
.gitmodules vendored

@@ -1,3 +0,0 @@
[submodule "cpp/simdcomp"]
path = cpp/simdcomp
url = git@github.com:lemire/simdcomp.git

.travis.yml

@@ -1,11 +1,6 @@
language: rust
rust:
- nightly
git:
submodules: false
before_install:
- sed -i 's/git@github.com:/https:\/\/github.com\//' .gitmodules
- git submodule update --init --recursive
env:
global:
- CC=gcc-4.8
@@ -37,4 +32,4 @@ after_success:
- bash ./script/build-doc.sh
- travis-cargo doc-upload
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then travis-cargo coveralls --no-sudo --verify; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./kcov/build/src/kcov --verify --coveralls-id=$TRAVIS_JOB_ID --exclude-pattern=/.cargo target/kcov target/debug/tantivy-*; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./kcov/build/src/kcov --verify --coveralls-id=$TRAVIS_JOB_ID --include-path=`pwd`/src --exclude-path=`pwd`/cpp --exclude-pattern=/.cargo target/kcov target/debug/tantivy-*; fi

47
CHANGELOG.md Normal file

@@ -0,0 +1,47 @@
Tantivy 0.3
==========================
Special thanks to @KodrAus @lnicola @Ameobea @manuel-woelker @celaus
for their contribution to this release.
Thanks also to everyone in tantivy gitter chat
for their advice and company :)
https://gitter.im/tantivy-search/tantivy
Warning:
Tantivy 0.3 is NOT backward compatible with tantivy 0.2
code and index format.
You should not expect backward compatibility before
tantivy 1.0.
New Features
------------
- Delete. You can now delete documents from an index.
- Support for windows (Thanks to @lnicola)
Various Bugfixes & small improvements
----------------------------------------
- Added CI for Windows (https://ci.appveyor.com/project/fulmicoton/tantivy)
Thanks to @KodrAus ! (#108)
- Various dependency version updates (Thanks to @Ameobea) #76
- Fixed several race conditions in `Index.wait_merge_threads`
- Fixed #72. Mmap were never released.
- Fixed #80. Fast field used to take an amplitude of 32 bits after a merge. (Ouch!)
- Fixed #92. u32 terms are now encoded using big endian in the fst,
in order to make their enumeration consistent with
the natural ordering (see the byte-order sketch after this list).
- Building binary targets for tantivy-cli (Thanks to @KodrAus)
- Misc invisible bug fixes, and code cleanup.
- Use
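To see why the #92 fix uses big endian: an fst compares keys as raw bytes, so u32 terms enumerate in natural numeric order only if their byte encoding sorts the same way memcmp does. Big-endian encoding has that property; little-endian does not. A minimal standalone C sketch of the idea (an illustration, not tantivy code):

```C
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Big-endian encoding puts the most significant byte first, so
   byte-wise comparison (what an fst does) matches numeric order. */
static void be_encode(uint32_t v, uint8_t out[4]) {
    out[0] = (uint8_t)(v >> 24);
    out[1] = (uint8_t)(v >> 16);
    out[2] = (uint8_t)(v >> 8);
    out[3] = (uint8_t)(v);
}

int main(void) {
    uint8_t one[4], big[4];
    be_encode(1, one);    /* 00 00 00 01 */
    be_encode(256, big);  /* 00 00 01 00 */
    /* memcmp orders 1 before 256, matching numeric order. A little-endian
       encoding would store 1 as 01 00 00 00 and 256 as 00 01 00 00, and
       byte-wise comparison would wrongly order 256 before 1. */
    printf("1 %s 256 in byte order\n", memcmp(one, big, 4) < 0 ? "<" : ">=");
    return 0;
}
```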

Cargo.toml

@@ -1,43 +1,49 @@
[package]
name = "tantivy"
version = "0.1.1"
version = "0.3.1"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
build = "build.rs"
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Tantivy is a search engine library."""
documentation = "http://fulmicoton.com/tantivy/tantivy/index.html"
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/tantivy-search/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
[dependencies]
byteorder = "0.4"
memmap = "0.2"
lazy_static = "0.1"
regex = "0.1"
fst = "0.1"
atomicwrites = "0.0.14"
tempfile = "2.0"
byteorder = "1.0"
memmap = "0.4"
lazy_static = "0.2.1"
regex = "0.2"
fst = "0.1.37"
atomicwrites = "0.1.3"
tempfile = "2.1"
rustc-serialize = "0.3"
log = "0.3"
combine = "2.0.*"
log = "0.3.6"
combine = "2.2"
tempdir = "0.3"
bincode = "0.4"
libc = {version = "0.2.6", optional=true}
num_cpus = "0.2"
itertools = "0.4"
lz4 = "1.13"
bincode = "0.5"
libc = {version = "0.2.20", optional=true}
num_cpus = "1.2"
itertools = "0.5.9"
lz4 = "1.20"
bit-set = "0.4.0"
time = "0.1"
uuid = "0.1"
uuid = { version = "0.4", features = ["v4", "rustc-serialize"] }
chan = "0.1"
version = "2"
crossbeam = "0.2"
futures = "0.1.9"
futures-cpupool = "0.1.2"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
[dev-dependencies]
rand = "0.3"
env_logger = "0.4"
[build-dependencies]
gcc = {version = "0.3", optional=true}
@@ -52,3 +58,7 @@ debug-assertions = false
[features]
default = ["simdcompression"]
simdcompression = ["libc", "gcc"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

README.md

@@ -1,45 +1,52 @@
![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)
[![Build Status](https://travis-ci.org/tantivy-search/tantivy.svg?branch=master)](https://travis-ci.org/tantivy-search/tantivy)
[![Coverage Status](https://coveralls.io/repos/github/tantivy-search/tantivy/badge.svg?branch=master)](https://coveralls.io/github/tantivy-search/tantivy?branch=master)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Coverage Status](https://coveralls.io/repos/github/tantivy-search/tantivy/badge.svg?branch=master&refresh1)](https://coveralls.io/github/tantivy-search/tantivy?branch=master)
[![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy)
![beacon for google analytics](https://ga-beacon.appspot.com/UA-88834340-1/tantivy/README)
**Tantivy** is a **full text search engine library** written in rust.
It is strongly inspired by Lucene's design.
# Features
- configurable indexing (optional term frequency and position indexing)
- tf-idf scoring
- Basic query language
- Phrase queries
- Incremental indexing
- Multithreaded indexing (indexing English Wikipedia takes 4 minutes on my desktop)
- mmap based
- SIMD integer compression
- optional SIMD integer compression
- u32 fast fields (equivalent of doc values in Lucene)
- LZ4 compressed document store
- Cheesy logo with a horse
Tantivy supports Linux, MacOS and Windows.
# Getting started
- [tantivy's usage example](http://fulmicoton.com/tantivy-examples/simple_search.html)
- [tantivy-cli and its tutorial](https://github.com/fulmicoton/tantivy-cli).
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli).
It will walk you through getting a wikipedia search engine up and running in a few minutes.
- [reference doc](http://fulmicoton.com/tantivy/tantivy/index.html).
- [reference doc]
- [For the last released version](https://docs.rs/tantivy/)
- [For the last master branch](https://tantivy-search.github.io/tantivy/tantivy/index.html)
# Compiling
Tantivy requires Rust Nightly because it uses the features [`box_syntax`](https://doc.rust-lang.org/stable/book/box-syntax-and-patterns.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), and [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md).
By default, `tantivy` uses a git submodule called `simdcomp`.
After cloning the repository, you will need to initialize and update
the submodules. The project can then be built using `cargo`.
git clone git@github.com:fulmicoton/tantivy.git
git submodule init
git submodule update
git clone git@github.com:tantivy-search/tantivy.git
cd tantivy
cargo build

24
appveyor.yml Normal file

@@ -0,0 +1,24 @@
# Appveyor configuration template for Rust using rustup for Rust installation
# https://github.com/starkat99/appveyor-rust
os: Visual Studio 2015
environment:
matrix:
- channel: nightly
target: x86_64-pc-windows-msvc
- channel: nightly
target: x86_64-pc-windows-gnu
msys_bits: 64
install:
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
- rustup-init -yv --default-toolchain %channel% --default-host %target%
- set PATH=%PATH%;%USERPROFILE%\.cargo\bin
- if defined msys_bits set PATH=%PATH%;C:\msys64\mingw%msys_bits%\bin
- rustc -vV
- cargo -vV
build: false
test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --verbose

build.rs

@@ -1,40 +1,50 @@
#[cfg(feature= "simdcompression")]
#[cfg(feature = "simdcompression")]
mod build {
extern crate gcc;
use std::process::Command;
pub fn build() {
Command::new("make")
.current_dir("cpp/simdcomp")
.output()
.unwrap_or_else(|e| { panic!("Failed to make simdcomp: {}", e) });
gcc::Config::new()
.cpp(true)
.flag("-std=c++11")
.flag("-O3")
.flag("-mssse3")
.include("./cpp/simdcomp/include")
.object("cpp/simdcomp/avxbitpacking.o")
.object("cpp/simdcomp/simdintegratedbitpacking.o")
.object("cpp/simdcomp/simdbitpacking.o")
.object("cpp/simdcomp/simdpackedsearch.o")
.object("cpp/simdcomp/simdcomputil.o")
.object("cpp/simdcomp/simdpackedselect.o")
.object("cpp/simdcomp/simdfor.o")
.file("cpp/simdcomp_wrapper.cpp")
.compile("libsimdcomp.a");
println!("cargo:rustc-flags=-l dylib=stdc++");
let mut config = gcc::Config::new();
config.include("./cpp/simdcomp/include")
.file("cpp/simdcomp/src/avxbitpacking.c")
.file("cpp/simdcomp/src/simdintegratedbitpacking.c")
.file("cpp/simdcomp/src/simdbitpacking.c")
.file("cpp/simdcomp/src/simdpackedsearch.c")
.file("cpp/simdcomp/src/simdcomputil.c")
.file("cpp/simdcomp/src/simdpackedselect.c")
.file("cpp/simdcomp/src/simdfor.c")
.file("cpp/simdcomp_wrapper.c");
if !cfg!(debug_assertions) {
config.opt_level(3);
if cfg!(target_env = "msvc") {
config.define("NDEBUG", None)
.flag("/Gm-")
.flag("/GS-")
.flag("/Gy")
.flag("/Oi")
.flag("/GL");
} else {
config.flag("-msse4.1")
.flag("-march=native");
}
}
config.compile("libsimdcomp.a");
// Workaround for linking static libraries built with /GL
// https://github.com/rust-lang/rust/issues/26003
if !cfg!(debug_assertions) && cfg!(target_env = "msvc") {
println!("cargo:rustc-link-lib=dylib=simdcomp");
}
}
}
#[cfg(not(feature= "simdcompression"))]
#[cfg(not(feature = "simdcompression"))]
mod build {
pub fn build() {
}
pub fn build() {}
}
fn main() {
build::build();
}

1
cpp/simdcomp vendored

Submodule cpp/simdcomp deleted from 0dca28668f

9
cpp/simdcomp/.gitignore vendored Normal file

@@ -0,0 +1,9 @@
Makefile.in
lib*
unit*
*.o
src/*.lo
src/*.o
src/.deps
src/.dirstamp
src/.libs

11
cpp/simdcomp/.travis.yml Normal file

@@ -0,0 +1,11 @@
language: c
sudo: false
compiler:
- gcc
- clang
branches:
only:
- master
script: make && ./unit

9
cpp/simdcomp/CHANGELOG Normal file

@@ -0,0 +1,9 @@
Upcoming
- added missing include
- improved portability (MSVC)
- implemented C89 compatibility
Version 0.0.3 (19 May 2014)
- improved documentation
Version 0.0.2 (6 February 2014)
- added go demo
Version 0.0.1 (5 February 2014)

27
cpp/simdcomp/LICENSE Normal file

@@ -0,0 +1,27 @@
Copyright (c) 2014--, The authors
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
* Neither the name of the {organization} nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

137
cpp/simdcomp/README.md Normal file

@@ -0,0 +1,137 @@
The SIMDComp library
====================
[![Build Status](https://travis-ci.org/lemire/simdcomp.png)](https://travis-ci.org/lemire/simdcomp)
A simple C library for compressing lists of integers using binary packing and SIMD instructions.
The assumption is either that you have a list of 32-bit integers where most of them are small, or a list of 32-bit integers where differences between successive integers are small. No software is able to reliably compress an array of 32-bit random numbers.
This library can decode at least 4 billion compressed integers per second on most
desktop or laptop processors. That is, it can decompress data at a rate of 15 GB/s.
This is significantly faster than generic codecs like gzip, LZO, Snappy or LZ4.
On a Skylake Intel processor, it can decode integers at a rate of 0.3 cycles per integer,
which easily translates into more than 8 billion decoded integers per second.
Contributors: Daniel Lemire, Nathan Kurz, Christoph Rupp, Anatol Belski, Nick White and others
What is it for?
-------------
This is a low-level library for fast integer compression. By design it does not define a compressed
format. It is up to the (sophisticated) user to create a compressed format.
Requirements
-------------
- Your processor should support SSE4.1 (It is supported by most Intel and AMD processors released since 2008.)
- It is possible to build the core part of the code if your processor supports SSE2 (Pentium 4 or better)
- C99 compliant compiler (GCC is assumed)
- A Linux-like distribution is assumed by the makefile
For a plain C version that does not use SIMD instructions, see https://github.com/lemire/LittleIntPacker
Usage
-------
Compression works over blocks of 128 integers.
For a complete working example, see example.c (you can build it and
run it with "make example; ./example").
1) Lists of integers in random order.
```C
const uint32_t b = maxbits(datain);// computes bit width
simdpackwithoutmask(datain, buffer, b);//compressed to buffer, compressing 128 32-bit integers down to b*32 bytes
simdunpack(buffer, backbuffer, b);//uncompressed to backbuffer
```
While 128 32-bit integers are read, only b 128-bit words are written. Thus, the compression ratio is 32/b.
2) Sorted lists of integers.
We use differential coding: we store the difference between successive integers. For this purpose, we need an initial value (called offset).
```C
uint32_t offset = 0;
uint32_t b1 = simdmaxbitsd1(offset,datain); // bit width
simdpackwithoutmaskd1(offset, datain, buffer, b1);//compressing 128 32-bit integers down to b1*32 bytes
simdunpackd1(offset, buffer, backbuffer, b1);//uncompressed
```
General example for arrays of arbitrary length:
```C
int compress_decompress_demo() {
size_t k, N = 9999;
__m128i * endofbuf;
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint8_t * buffer;
uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
uint32_t b;
for (k = 0; k < N; ++k){ /* start with k=0, not k=1! */
datain[k] = k;
}
b = maxbits_length(datain, N);
buffer = malloc(simdpack_compressedbytes(N,b)); // allocate just enough memory
endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b);
/* compressed data is stored between buffer and endofbuf using (endofbuf-buffer)*sizeof(__m128i) bytes */
/* would be safe to do : buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); */
simdunpack_length((const __m128i *)buffer, N, backbuffer, b);
for (k = 0; k < N; ++k){
if(datain[k] != backbuffer[k]) {
printf("bug\n");
return -1;
}
}
return 0;
}
```
3) Frame-of-Reference
We also have frame-of-reference (FOR) functions (see simdfor.h header). They work like the bit packing
routines, but do not use differential coding so they allow faster search in some cases, at the expense
of compression.
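The README shows no snippet for the FOR functions, so here is a hedged sketch in the same style as the two examples above; the signatures are taken from simdfor.h and simdcomputil.h (both appear later in this diff), and picking the block minimum as the reference value is an assumption about typical usage rather than a library requirement:
```C
uint32_t datain[128];    /* 128 input values from your application */
uint32_t backbuffer[128];
__m128i buffer[32];      /* worst case: 32 bits per integer */
uint32_t i, b = 0;
uint32_t ref = simdmin(datain);  /* frame of reference: the block minimum */
for (i = 0; i < 128; i++) {      /* bit width needed for (value - ref) */
    uint32_t w = bits(datain[i] - ref);
    if (w > b) b = w;
}
simdpackFOR(ref, datain, buffer, b);        /* pack relative to ref */
simdunpackFOR(ref, buffer, backbuffer, b);  /* decode: backbuffer == datain */
```
Since no differential decoding is involved, packed blocks can be searched without unpacking everything first, which is where the faster search mentioned above comes from.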
Setup
---------
make
make test
and if you are daring:
make install
Go
--------
If you are a go user, there is a "go" folder where you will find a simple demo.
Other libraries
----------------
* Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte
* Fast integer compression in C using StreamVByte https://github.com/lemire/streamvbyte
* FastPFOR is a C++ research library well suited to compress unsorted arrays: https://github.com/lemire/FastPFor
* SIMDCompressionAndIntersection is a C++ research library well suited for sorted arrays (differential coding)
and computing intersections: https://github.com/lemire/SIMDCompressionAndIntersection
* TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor
* Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch
References
------------
* Daniel Lemire, Leonid Boytsov, Nathan Kurz, SIMD Compression and the Intersection of Sorted Integers, Software Practice & Experience 46 (6) 2016. http://arxiv.org/abs/1401.6399
* Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization, Software Practice & Experience 45 (1), 2015. http://arxiv.org/abs/1209.2137 http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract
* Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on Web Algorithms 2015, 2015. http://arxiv.org/abs/1503.07387
* Wayne Xin Zhao, Xudong Zhang, Daniel Lemire, Dongdong Shan, Jian-Yun Nie, Hongfei Yan, Ji-Rong Wen, A General SIMD-based Approach to Accelerating Compression Algorithms, ACM Transactions on Information Systems 33 (3), 2015. http://arxiv.org/abs/1502.01916
* T. D. Wu, Bitpacking techniques for indexing genomes: I. Hash tables, Algorithms for Molecular Biology 11 (5), 2016. http://almob.biomedcentral.com/articles/10.1186/s13015-016-0069-5


@@ -0,0 +1,235 @@
/**
* This code is released under a BSD License.
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "simdcomp.h"
#ifdef _MSC_VER
# include <windows.h>
__int64 freq;
typedef __int64 time_snap_t;
static time_snap_t time_snap(void)
{
__int64 now;
QueryPerformanceCounter((LARGE_INTEGER *)&now);
return (__int64)((now*1000000)/freq);
}
# define TIME_SNAP_FMT "%I64d"
#else
# define time_snap clock
# define TIME_SNAP_FMT "%lu"
typedef clock_t time_snap_t;
#endif
void benchmarkSelect() {
uint32_t buffer[128];
uint32_t backbuffer[128];
uint32_t initial = 33;
uint32_t b;
time_snap_t S1, S2, S3;
int i;
printf("benchmarking select \n");
/* this test creates delta encoded buffers with different bit widths, then
 * retrieves each value with the fast select function */
for (b = 0; b <= 32; b++) {
uint32_t prev = initial;
uint32_t out[128];
/* initialize the buffer */
for (i = 0; i < 128; i++) {
buffer[i] = ((uint32_t)(1655765 * i )) ;
if(b < 32) buffer[i] %= (1<<b);
}
for (i = 0; i < 128; i++) {
buffer[i] = buffer[i] + prev;
prev = buffer[i];
}
for (i = 1; i < 128; i++) {
if(buffer[i] < buffer[i-1] )
buffer[i] = buffer[i-1];
}
assert(simdmaxbitsd1(initial, buffer)<=b);
for (i = 0; i < 128; i++) {
out[i] = 0; /* memset would do too */
}
/* delta-encode to 'b' bits */
simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
S1 = time_snap();
for (i = 0; i < 128 * 10; i++) {
uint32_t valretrieved = simdselectd1(initial, (__m128i *)out, b, (uint32_t)i % 128);
assert(valretrieved == buffer[i%128]);
}
S2 = time_snap();
for (i = 0; i < 128 * 10; i++) {
simdunpackd1(initial, (__m128i *)out, backbuffer, b);
assert(backbuffer[i % 128] == buffer[i % 128]);
}
S3 = time_snap();
printf("bit width = %d, fast select function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " \n", b, (S2-S1), (S3-S2));
}
}
int uint32_cmp(const void *a, const void *b)
{
const uint32_t *ia = (const uint32_t *)a;
const uint32_t *ib = (const uint32_t *)b;
if(*ia < *ib)
return -1;
else if (*ia > *ib)
return 1;
return 0;
}
/* adapted from wikipedia */
int binary_search(uint32_t * A, uint32_t key, int imin, int imax)
{
int imid;
imax --;
while(imin + 1 < imax) {
imid = imin + ((imax - imin) / 2);
if (A[imid] > key) {
imax = imid;
} else if (A[imid] < key) {
imin = imid;
} else {
return imid;
}
}
return imax;
}
/* adapted from wikipedia */
int lower_bound(uint32_t * A, uint32_t key, int imin, int imax)
{
int imid;
imax --;
while(imin + 1 < imax) {
imid = imin + ((imax - imin) / 2);
if (A[imid] >= key) {
imax = imid;
} else if (A[imid] < key) {
imin = imid;
}
}
if(A[imin] >= key) return imin;
return imax;
}
void benchmarkSearch() {
uint32_t buffer[128];
uint32_t backbuffer[128];
uint32_t out[128];
uint32_t result, initial = 0;
uint32_t b, i;
time_snap_t S1, S2, S3, S4;
printf("benchmarking search \n");
/* this test creates delta encoded buffers with different bits, then
* performs lower bound searches for each key */
for (b = 0; b <= 32; b++) {
uint32_t prev = initial;
/* initialize the buffer */
for (i = 0; i < 128; i++) {
buffer[i] = ((uint32_t)rand()) ;
if(b < 32) buffer[i] %= (1<<b);
}
qsort(buffer,128, sizeof(uint32_t), uint32_cmp);
for (i = 0; i < 128; i++) {
buffer[i] = buffer[i] + prev;
prev = buffer[i];
}
for (i = 1; i < 128; i++) {
if(buffer[i] < buffer[i-1] )
buffer[i] = buffer[i-1];
}
assert(simdmaxbitsd1(initial, buffer)<=b);
for (i = 0; i < 128; i++) {
out[i] = 0; /* memset would do too */
}
/* delta-encode to 'b' bits */
simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
simdunpackd1(initial, (__m128i *)out, backbuffer, b);
for (i = 0; i < 128; i++) {
assert(buffer[i] == backbuffer[i]);
}
S1 = time_snap();
for (i = 0; i < 128 * 10; i++) {
int pos;
uint32_t pseudorandomkey = buffer[i%128];
__m128i vecinitial = _mm_set1_epi32(initial);
pos = simdsearchd1(&vecinitial, (__m128i *)out, b,
pseudorandomkey, &result);
if((result < pseudorandomkey) || (buffer[pos] != result)) {
printf("bug A.\n");
} else if (pos > 0) {
if(buffer[pos-1] >= pseudorandomkey)
printf("bug B.\n");
}
}
S2 = time_snap();
for (i = 0; i < 128 * 10; i++) {
int pos;
uint32_t pseudorandomkey = buffer[i%128];
simdunpackd1(initial, (__m128i *)out, backbuffer, b);
pos = lower_bound(backbuffer, pseudorandomkey, 0, 128);
result = backbuffer[pos];
if((result < pseudorandomkey) || (buffer[pos] != result)) {
printf("bug C.\n");
} else if (pos > 0) {
if(buffer[pos-1] >= pseudorandomkey)
printf("bug D.\n");
}
}
S3 = time_snap();
for (i = 0; i < 128 * 10; i++) {
int pos;
uint32_t pseudorandomkey = buffer[i%128];
pos = simdsearchwithlengthd1(initial, (__m128i *)out, b, 128,
pseudorandomkey, &result);
if((result < pseudorandomkey) || (buffer[pos] != result)) {
printf("bug A.\n");
} else if (pos > 0) {
if(buffer[pos-1] >= pseudorandomkey)
printf("bug B.\n");
}
}
S4 = time_snap();
printf("bit width = %d, fast search function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " , fast with length time = " TIME_SNAP_FMT " \n", b, (S2-S1), (S3-S2), (S4-S3) );
}
}
int main() {
#ifdef _MSC_VER
QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
#endif
benchmarkSearch();
benchmarkSelect();
return 0;
}


@@ -0,0 +1,205 @@
#include <stdio.h>
#include "simdcomp.h"
#define RDTSC_START(cycles) \
do { \
register unsigned cyc_high, cyc_low; \
__asm volatile( \
"cpuid\n\t" \
"rdtsc\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
: "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
#define RDTSC_FINAL(cycles) \
do { \
register unsigned cyc_high, cyc_low; \
__asm volatile( \
"rdtscp\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
"cpuid\n\t" \
: "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
uint32_t * get_random_array_from_bit_width(uint32_t length, uint32_t bit) {
uint32_t * answer = malloc(sizeof(uint32_t) * length);
uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1);
uint32_t i;
for(i = 0; i < length; ++i) {
answer[i] = rand() & mask;
}
return answer;
}
uint32_t * get_random_array_from_bit_width_d1(uint32_t length, uint32_t bit) {
uint32_t * answer = malloc(sizeof(uint32_t) * length);
uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1);
uint32_t i;
answer[0] = rand() & mask;
for(i = 1; i < length; ++i) {
answer[i] = answer[i-1] + (rand() & mask);
}
return answer;
}
void demo128() {
const uint32_t length = 128;
uint32_t bit;
printf("# --- %s\n", __func__);
printf("# compressing %d integers\n",length);
printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
for(bit = 1; bit <= 32; ++bit) {
uint32_t i;
uint32_t * data = get_random_array_from_bit_width(length, bit);
__m128i * buffer = malloc(length * sizeof(uint32_t));
uint32_t * backdata = malloc(length * sizeof(uint32_t));
uint32_t repeat = 500;
uint64_t min_diff;
printf("%d\t",bit);
min_diff = (uint64_t)-1;
for (i = 0; i < repeat; i++) {
uint64_t cycles_start, cycles_final, cycles_diff;
__asm volatile("" ::: /* pretend to clobber */ "memory");
RDTSC_START(cycles_start);
simdpackwithoutmask(data,buffer, bit);
RDTSC_FINAL(cycles_final);
cycles_diff = (cycles_final - cycles_start);
if (cycles_diff < min_diff) min_diff = cycles_diff;
}
printf("%.2f\t",min_diff*1.0/length);
min_diff = (uint64_t)-1;
for (i = 0; i < repeat; i++) {
uint64_t cycles_start, cycles_final, cycles_diff;
__asm volatile("" ::: /* pretend to clobber */ "memory");
RDTSC_START(cycles_start);
simdunpack(buffer, backdata,bit);
RDTSC_FINAL(cycles_final);
cycles_diff = (cycles_final - cycles_start);
if (cycles_diff < min_diff) min_diff = cycles_diff;
}
printf("%.2f\t",min_diff*1.0/length);
free(data);
free(buffer);
free(backdata);
printf("\n");
}
printf("\n\n"); /* two blank lines are required by gnuplot */
}
void demo128_d1() {
const uint32_t length = 128;
uint32_t bit;
printf("# --- %s\n", __func__);
printf("# compressing %d integers\n",length);
printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
for(bit = 1; bit <= 32; ++bit) {
uint32_t i;
uint32_t * data = get_random_array_from_bit_width_d1(length, bit);
__m128i * buffer = malloc(length * sizeof(uint32_t));
uint32_t * backdata = malloc(length * sizeof(uint32_t));
uint32_t repeat = 500;
uint64_t min_diff;
printf("%d\t",bit);
min_diff = (uint64_t)-1;
for (i = 0; i < repeat; i++) {
uint64_t cycles_start, cycles_final, cycles_diff;
__asm volatile("" ::: /* pretend to clobber */ "memory");
RDTSC_START(cycles_start);
simdpackwithoutmaskd1(0,data,buffer, bit);
RDTSC_FINAL(cycles_final);
cycles_diff = (cycles_final - cycles_start);
if (cycles_diff < min_diff) min_diff = cycles_diff;
}
printf("%.2f\t",min_diff*1.0/length);
min_diff = (uint64_t)-1;
for (i = 0; i < repeat; i++) {
uint64_t cycles_start, cycles_final, cycles_diff;
__asm volatile("" ::: /* pretend to clobber */ "memory");
RDTSC_START(cycles_start);
simdunpackd1(0,buffer, backdata,bit);
RDTSC_FINAL(cycles_final);
cycles_diff = (cycles_final - cycles_start);
if (cycles_diff < min_diff) min_diff = cycles_diff;
}
printf("%.2f\t",min_diff*1.0/length);
free(data);
free(buffer);
free(backdata);
printf("\n");
}
printf("\n\n"); /* two blank lines are required by gnuplot */
}
#ifdef __AVX2__
void demo256() {
const uint32_t length = 256;
uint32_t bit;
printf("# --- %s\n", __func__);
printf("# compressing %d integers\n",length);
printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
for(bit = 1; bit <= 32; ++bit) {
uint32_t i;
uint32_t * data = get_random_array_from_bit_width(length, bit);
__m256i * buffer = malloc(length * sizeof(uint32_t));
uint32_t * backdata = malloc(length * sizeof(uint32_t));
uint32_t repeat = 500;
uint64_t min_diff;
printf("%d\t",bit);
min_diff = (uint64_t)-1;
for (i = 0; i < repeat; i++) {
uint64_t cycles_start, cycles_final, cycles_diff;
__asm volatile("" ::: /* pretend to clobber */ "memory");
RDTSC_START(cycles_start);
avxpackwithoutmask(data,buffer, bit);
RDTSC_FINAL(cycles_final);
cycles_diff = (cycles_final - cycles_start);
if (cycles_diff < min_diff) min_diff = cycles_diff;
}
printf("%.2f\t",min_diff*1.0/length);
min_diff = (uint64_t)-1;
for (i = 0; i < repeat; i++) {
uint64_t cycles_start, cycles_final, cycles_diff;
__asm volatile("" ::: /* pretend to clobber */ "memory");
RDTSC_START(cycles_start);
avxunpack(buffer, backdata,bit);
RDTSC_FINAL(cycles_final);
cycles_diff = (cycles_final - cycles_start);
if (cycles_diff < min_diff) min_diff = cycles_diff;
}
printf("%.2f\t",min_diff*1.0/length);
free(data);
free(buffer);
free(backdata);
printf("\n");
}
printf("\n\n"); /* two blank lines are required by gnuplot */
}
#endif /* avx 2 */
int main() {
demo128();
demo128_d1();
#ifdef __AVX2__
demo256();
#endif
return 0;
}

195
cpp/simdcomp/example.c Normal file

@@ -0,0 +1,195 @@
/* Type "make example" to build this example program. */
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include "simdcomp.h"
/**
We provide several different code examples.
**/
/* very simple test to illustrate a simple application */
int compress_decompress_demo() {
size_t k, N = 9999;
__m128i * endofbuf;
int howmanybytes;
float compratio;
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint8_t * buffer;
uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
uint32_t b;
printf("== simple test\n");
for (k = 0; k < N; ++k) { /* start with k=0, not k=1! */
datain[k] = k;
}
b = maxbits_length(datain, N);
buffer = malloc(simdpack_compressedbytes(N,b));
endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b);
howmanybytes = (endofbuf-(__m128i *)buffer)*sizeof(__m128i); /* number of compressed bytes */
compratio = N*sizeof(uint32_t) * 1.0 / howmanybytes;
/* endofbuf points to the end of the compressed data */
buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); /* optional but safe. */
printf("Compressed %d integers down to %d bytes (comp. ratio = %f).\n",(int)N,howmanybytes,compratio);
/* in actual applications b must be stored and retrieved: caller is responsible for that. */
simdunpack_length((const __m128i *)buffer, N, backbuffer, b); /* will return a pointer to endofbuf */
for (k = 0; k < N; ++k) {
if(datain[k] != backbuffer[k]) {
printf("bug at %lu \n",(unsigned long)k);
return -1;
}
}
printf("Code works!\n");
free(datain);
free(buffer);
free(backbuffer);
return 0;
}
/* compresses data from datain to buffer, returns how many bytes written
used below in simple_demo */
size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) {
uint32_t offset;
uint8_t * initout;
size_t k;
if(length/SIMDBlockSize*SIMDBlockSize != length) {
printf("Data length should be a multiple of %i \n",SIMDBlockSize);
}
offset = 0;
initout = buffer;
for(k = 0; k < length / SIMDBlockSize; ++k) {
uint32_t b = simdmaxbitsd1(offset,
datain + k * SIMDBlockSize);
*buffer++ = b;
simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, (__m128i *) buffer,
b);
offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
buffer += b * sizeof(__m128i);
}
return buffer - initout;
}
/* Another illustration ... */
void simple_demo() {
size_t REPEAT = 10, gap;
size_t N = 1000 * SIMDBlockSize;/* SIMDBlockSize is 128 */
uint32_t * datain = malloc(N * sizeof(uint32_t));
size_t compsize;
clock_t start, end;
uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); /* output buffer */
uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
printf("== simple demo\n");
for (gap = 1; gap <= 243; gap *= 3) {
size_t k, repeat;
uint32_t offset = 0;
uint32_t bogus = 0;
double numberofseconds;
printf("\n");
printf(" gap = %lu \n", (unsigned long) gap);
datain[0] = 0;
for (k = 1; k < N; ++k)
datain[k] = datain[k-1] + ( rand() % (gap + 1) );
compsize = compress(datain,N,buffer);
printf("compression ratio = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 ));
start = clock();
for(repeat = 0; repeat < REPEAT; ++repeat) {
uint8_t * decbuffer = buffer;
for (k = 0; k * SIMDBlockSize < N; ++k) {
uint8_t b = *decbuffer++;
simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b);
/* do something here with backbuffer */
bogus += backbuffer[3];
decbuffer += b * sizeof(__m128i);
offset = backbuffer[SIMDBlockSize - 1];
}
}
end = clock();
numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
start = clock();
for(repeat = 0; repeat < REPEAT; ++repeat) {
uint8_t * decbuffer = buffer;
for (k = 0; k * SIMDBlockSize < N; ++k) {
memcpy(backbuffer,decbuffer+k*SIMDBlockSize,SIMDBlockSize*sizeof(uint32_t));
bogus += backbuffer[3] - backbuffer[100];
}
}
end = clock();
numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
printf("memcpy speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
printf("ignore me %i \n",bogus);
printf("All tests are in CPU cache. Avoid out-of-cache decoding in applications.\n");
}
free(buffer);
free(datain);
free(backbuffer);
}
/* Used below in more_sophisticated_demo ... */
size_t varying_bit_width_compress(uint32_t * datain, size_t length, uint8_t * buffer) {
uint8_t * initout;
size_t k;
if(length/SIMDBlockSize*SIMDBlockSize != length) {
printf("Data length should be a multiple of %i \n",SIMDBlockSize);
}
initout = buffer;
for(k = 0; k < length / SIMDBlockSize; ++k) {
uint32_t b = maxbits(datain);
*buffer++ = b;
simdpackwithoutmask(datain, (__m128i *)buffer, b);
datain += SIMDBlockSize;
buffer += b * sizeof(__m128i);
}
return buffer - initout;
}
/* Here we compress the data in blocks of 128 integers with varying bit width */
int varying_bit_width_demo() {
size_t nn = 128 * 2;
uint32_t * datainn = malloc(nn * sizeof(uint32_t));
uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize);
uint8_t * initbuffern = buffern;
uint32_t * backbuffern = malloc(nn * sizeof(uint32_t));
size_t k, compsize;
printf("== varying bit-width demo\n");
for(k=0; k<nn; ++k) {
datainn[k] = rand() % (k + 1);
}
compsize = varying_bit_width_compress(datainn,nn,buffern);
printf("encoded size: %u (original size: %u)\n", (unsigned)compsize,
(unsigned)(nn * sizeof(uint32_t)));
for (k = 0; k * SIMDBlockSize < nn; ++k) {
uint32_t b = *buffern;
buffern++;
simdunpack((const __m128i *)buffern, backbuffern + k * SIMDBlockSize, b);
buffern += b * sizeof(__m128i);
}
for (k = 0; k < nn; ++k) {
if(backbuffern[k] != datainn[k]) {
printf("bug\n");
return -1;
}
}
printf("Code works!\n");
free(datainn);
free(initbuffern);
free(backbuffern);
return 0;
}
int main() {
if(compress_decompress_demo() != 0) return -1;
if(varying_bit_width_demo() != 0) return -1;
simple_demo();
return 0;
}

13
cpp/simdcomp/go/README.md Normal file

@@ -0,0 +1,13 @@
Simple Go demo
==============
Setup
======
Start by installing the simdcomp library (make && make install).
Then type:
go run test.go

71
cpp/simdcomp/go/test.go Normal file

@@ -0,0 +1,71 @@
/////////
// This particular file is in the public domain.
// Author: Daniel Lemire
////////
package main
/*
#cgo LDFLAGS: -lsimdcomp
#include <simdcomp.h>
*/
import "C"
import "fmt"
//////////
// For this demo, we pack and unpack blocks of 128 integers
/////////
func main() {
// I am going to use C types. Alternative might be to use unsafe.Pointer calls, see http://bit.ly/1ndw3W3
// this is our original data
var data [128]C.uint32_t
for i := C.uint32_t(0); i < C.uint32_t(128); i++ {
data[i] = i
}
////////////
// We first pack without differential coding
///////////
// computing how many bits per int. is needed
b := C.maxbits(&data[0])
ratio := 32.0/float64(b)
fmt.Println("Bit width ", b)
fmt.Println(fmt.Sprintf("Compression ratio %f ", ratio))
// we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits)
out := make([] C.__m128i,b)
C.simdpackwithoutmask( &data[0],&out[0],b);
var recovereddata [128]C.uint32_t
C.simdunpack(&out[0],&recovereddata[0],b)
for i := 0; i < 128; i++ {
if data[i] != recovereddata[i] {
fmt.Println("Bug ")
return
}
}
///////////
// Next, we use differential coding
//////////
offset := C.uint32_t(0) // if you pack data from K to K + 128, offset should be the value at K-1. When K = 0, choose a default
b1 := C.simdmaxbitsd1(offset,&data[0])
ratio1 := 32.0/float64(b1)
fmt.Println("Bit width ", b1)
fmt.Println(fmt.Sprintf("Compression ratio %f ", ratio1))
// we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits)
out = make([] C.__m128i,b1)
C.simdpackwithoutmaskd1(offset, &data[0],&out[0],b1);
C.simdunpackd1(offset,&out[0],&recovereddata[0],b1)
for i := 0; i < 128; i++ {
if data[i] != recovereddata[i] {
fmt.Println("Bug ")
return
}
}
fmt.Println("test succesful.")
}

cpp/simdcomp/include/avxbitpacking.h

@@ -0,0 +1,40 @@
/**
* This code is released under a BSD License.
*/
#ifndef INCLUDE_AVXBITPACKING_H_
#define INCLUDE_AVXBITPACKING_H_
#ifdef __AVX2__
#include "portability.h"
/* AVX2 is required */
#include <immintrin.h>
/* for memset */
#include <string.h>
#include "simdcomputil.h"
enum{ AVXBlockSize = 256};
/* max integer logarithm over a range of AVXBlockSize integers (256 integers) */
uint32_t avxmaxbits(const uint32_t * begin);
/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */
void avxpack(const uint32_t * in,__m256i * out, const uint32_t bit);
/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */
void avxpackwithoutmask(const uint32_t * in,__m256i * out, const uint32_t bit);
/* reads "bit" 256-bit vectors from "in", writes 256 values to "out" */
void avxunpack(const __m256i * in,uint32_t * out, const uint32_t bit);
#endif /* __AVX2__ */
#endif /* INCLUDE_AVXBITPACKING_H_ */

cpp/simdcomp/include/portability.h

@@ -0,0 +1,81 @@
/**
* This code is released under a BSD License.
*/
#ifndef SIMDBITCOMPAT_H_
#define SIMDBITCOMPAT_H_
#include <iso646.h> /* mostly for Microsoft compilers */
#include <string.h>
#if SIMDCOMP_DEBUG
# define SIMDCOMP_ALWAYS_INLINE inline
# define SIMDCOMP_NEVER_INLINE
# define SIMDCOMP_PURE
#else
# if defined(__GNUC__)
# if __GNUC__ >= 3
# define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline))
# define SIMDCOMP_NEVER_INLINE __attribute__((noinline))
# define SIMDCOMP_PURE __attribute__((pure))
# else
# define SIMDCOMP_ALWAYS_INLINE inline
# define SIMDCOMP_NEVER_INLINE
# define SIMDCOMP_PURE
# endif
# elif defined(_MSC_VER)
# define SIMDCOMP_ALWAYS_INLINE __forceinline
# define SIMDCOMP_NEVER_INLINE
# define SIMDCOMP_PURE
# else
# if __has_attribute(always_inline)
# define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline))
# else
# define SIMDCOMP_ALWAYS_INLINE inline
# endif
# if __has_attribute(noinline)
# define SIMDCOMP_NEVER_INLINE __attribute__((noinline))
# else
# define SIMDCOMP_NEVER_INLINE
# endif
# if __has_attribute(pure)
# define SIMDCOMP_PURE __attribute__((pure))
# else
# define SIMDCOMP_PURE
# endif
# endif
#endif
#if defined(_MSC_VER) && _MSC_VER < 1600
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
typedef signed char int8_t;
#else
#include <stdint.h> /* part of Visual Studio 2010 and better, others likely anyway */
#endif
#if defined(_MSC_VER)
#define SIMDCOMP_ALIGNED(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define SIMDCOMP_ALIGNED(x) __attribute__ ((aligned(x)))
#endif
#endif
#if defined(_MSC_VER)
# include <intrin.h>
/* 64-bit needs extending */
# define SIMDCOMP_CTZ(result, mask) do { \
unsigned long index; \
if (!_BitScanForward(&(index), (mask))) { \
(result) = 32U; \
} else { \
(result) = (uint32_t)(index); \
} \
} while (0)
#else
# define SIMDCOMP_CTZ(result, mask) \
result = __builtin_ctz(mask)
#endif
#endif /* SIMDBITCOMPAT_H_ */
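As a quick illustration of the SIMDCOMP_CTZ portability macro defined above (a hypothetical caller, not code from the library):
```C
uint32_t result;
SIMDCOMP_CTZ(result, 0x50u);  /* 0x50 = 01010000b: four trailing zero bits */
/* result == 4 on both the MSVC (_BitScanForward) and GCC (__builtin_ctz) paths */
```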

cpp/simdcomp/include/simdbitpacking.h

@@ -0,0 +1,72 @@
/**
* This code is released under a BSD License.
*/
#ifndef SIMDBITPACKING_H_
#define SIMDBITPACKING_H_
#include "portability.h"
/* SSE2 is required */
#include <emmintrin.h>
/* for memset */
#include <string.h>
#include "simdcomputil.h"
/***
* Please see example.c for various examples on how to make good use
* of these functions.
*/
/* reads 128 values from "in", writes "bit" 128-bit vectors to "out".
* The input values are masked so that only the least significant "bit" bits are used. */
void simdpack(const uint32_t * in,__m128i * out, const uint32_t bit);
/* reads 128 values from "in", writes "bit" 128-bit vectors to "out".
* The input values are assumed to be less than 1<<bit. */
void simdpackwithoutmask(const uint32_t * in,__m128i * out, const uint32_t bit);
/* reads "bit" 128-bit vectors from "in", writes 128 values to "out" */
void simdunpack(const __m128i * in,uint32_t * out, const uint32_t bit);
/* how many compressed bytes are needed to compress length integers using a bit width of bit with
the simdpack_length function. */
int simdpack_compressedbytes(int length, const uint32_t bit);
/* like simdpack, but supports an undetermined number of inputs.
* This is useful if you need to unpack an array of integers that is not divisible by 128 integers.
* Returns a pointer to the (advanced) compressed array. Compressed data is stored in the memory location between
the provided (out) pointer and the returned pointer. */
__m128i * simdpack_length(const uint32_t * in, size_t length, __m128i * out, const uint32_t bit);
/* like simdunpack, but supports an undetermined number of inputs.
* This is useful if you need to unpack an array of integers that is not divisible by 128 integers.
* Returns a pointer to the (advanced) compressed array. The read compressed data is between the provided
(in) pointer and the returned pointer. */
const __m128i * simdunpack_length(const __m128i * in, size_t length, uint32_t * out, const uint32_t bit);
/* like simdpack, but supports an undetermined small number of inputs. This is useful if you need to pack less
than 128 integers.
* Note that this function is much slower.
* Returns a pointer to the (advanced) compressed array. Compressed data is stored in the memory location
between the provided (out) pointer and the returned pointer. */
__m128i * simdpack_shortlength(const uint32_t * in, int length, __m128i * out, const uint32_t bit);
/* like simdunpack, but supports an undetermined small number of inputs. This is useful if you need to unpack less
than 128 integers.
* Note that this function is much slower.
* Returns a pointer to the (advanced) compressed array. The read compressed data is between the provided (in)
pointer and the returned pointer. */
const __m128i * simdunpack_shortlength(const __m128i * in, int length, uint32_t * out, const uint32_t bit);
/* given a block of 128 packed values, this function sets the value at index "index" to "value" */
void simdfastset(__m128i * in128, uint32_t b, uint32_t value, size_t index);
#endif /* SIMDBITPACKING_H_ */
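A minimal round-trip sketch of the API above, modeled on the example.c these comments point to (an illustrative driver, not part of the diff):

#include <stdio.h>
#include "simdbitpacking.h"

int main(void) {
    uint32_t data[SIMDBlockSize], back[SIMDBlockSize];
    __m128i buffer[32];              /* bit width <= 32, so at most 32 vectors */
    uint32_t i, b;
    for (i = 0; i < SIMDBlockSize; ++i)
        data[i] = i * 7;             /* any 128 small integers */
    b = maxbits(data);               /* smallest width that fits every value */
    simdpackwithoutmask(data, buffer, b);  /* 128 values -> b 128-bit vectors */
    simdunpack(buffer, back, b);           /* b vectors -> 128 values */
    for (i = 0; i < SIMDBlockSize; ++i)
        if (back[i] != data[i]) return 1;
    printf("packed at %u bits per integer\n", b);
    return 0;
}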

View File

@@ -0,0 +1,22 @@
/**
* This code is released under a BSD License.
*/
#ifndef SIMDCOMP_H_
#define SIMDCOMP_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "simdbitpacking.h"
#include "simdcomputil.h"
#include "simdfor.h"
#include "simdintegratedbitpacking.h"
#include "avxbitpacking.h"
#ifdef __cplusplus
} // extern "C"
#endif
#endif

View File

@@ -0,0 +1,54 @@
/**
* This code is released under a BSD License.
*/
#ifndef SIMDCOMPUTIL_H_
#define SIMDCOMPUTIL_H_
#include "portability.h"
/* SSE2 is required */
#include <emmintrin.h>
/* returns the integer logarithm of v (bit width) */
uint32_t bits(const uint32_t v);
/* max integer logarithm over a range of SIMDBlockSize integers (128 integers) */
uint32_t maxbits(const uint32_t * begin);
/* same as maxbits, but we specify the number of integers */
uint32_t maxbits_length(const uint32_t * in,uint32_t length);
enum{ SIMDBlockSize = 128};
/* computes (quickly) the minimal value of 128 values */
uint32_t simdmin(const uint32_t * in);
/* computes (quickly) the minimal value of the specified number of values */
uint32_t simdmin_length(const uint32_t * in, uint32_t length);
#ifdef __SSE4_1__
/* computes (quickly) the minimal and maximal value of the specified number of values */
void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax);
/* computes (quickly) the minimal and maximal value of the 128 values */
void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax);
#endif
/* like maxbits over 128 integers (SIMDBlockSize) with provided initial value
and using differential coding */
uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in);
/* like simdmaxbitsd1, but calculates maxbits over |length| integers
with provided initial value. |length| can be any arbitrary value. */
uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in,
uint32_t length);
#endif /* SIMDCOMPUTIL_H_ */
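A quick illustration of the "integer logarithm" convention of bits() above (a sketch, not part of the library):

#include <stdio.h>
#include "simdcomputil.h"

int main(void) {
    /* bits(v) = number of bits needed to represent v */
    printf("%u %u %u %u\n", bits(0), bits(1), bits(255), bits(256)); /* prints: 0 1 8 9 */
    return 0;
}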

View File

@@ -0,0 +1,72 @@
/**
* This code is released under a BSD License.
*/
#ifndef INCLUDE_SIMDFOR_H_
#define INCLUDE_SIMDFOR_H_
#include "portability.h"
/* SSE2 is required */
#include <emmintrin.h>
#include "simdcomputil.h"
#include "simdbitpacking.h"
#ifdef __cplusplus
extern "C" {
#endif
/* reads 128 values from "in", writes "bit" 128-bit vectors to "out" */
void simdpackFOR(uint32_t initvalue, const uint32_t * in,__m128i * out, const uint32_t bit);
/* reads "bit" 128-bit vectors from "in", writes 128 values to "out" */
void simdunpackFOR(uint32_t initvalue, const __m128i * in,uint32_t * out, const uint32_t bit);
/* how many compressed bytes are needed to compressed length integers using a bit width of bit with
the simdpackFOR_length function. */
int simdpackFOR_compressedbytes(int length, const uint32_t bit);
/* like simdpackFOR, but supports an undetermined number of inputs.
This is useful if you need to pack less than 128 integers. Note that this function is much slower.
Compressed data is stored in the memory location between
the provided (out) pointer and the returned pointer. */
__m128i * simdpackFOR_length(uint32_t initvalue, const uint32_t * in, int length, __m128i * out, const uint32_t bit);
/* like simdunpackFOR, but supports an undetermined number of inputs.
This is useful if you need to unpack less than 128 integers. Note that this function is much slower.
The read compressed data is between the provided
(in) pointer and the returned pointer. */
const __m128i * simdunpackFOR_length(uint32_t initvalue, const __m128i * in, int length, uint32_t * out, const uint32_t bit);
/* returns the value stored at the specified "slot". */
uint32_t simdselectFOR(uint32_t initvalue, const __m128i *in, uint32_t bit,
int slot);
/* given a block of 128 packed values, this function sets the value at index "index" to "value" */
void simdfastsetFOR(uint32_t initvalue, __m128i * in, uint32_t bit, uint32_t value, size_t index);
/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for the first encoded uint32 value
* which is >= |key|, and returns its position. It is assumed that the values
* stored are in sorted order.
* The encoded key is stored in "*presult".
* The first length decoded integers, ignoring others. If no value is larger or equal to the key,
* length is returned. Length should be no larger than 128.
*
* If no value is larger or equal to the key,
* length is returned */
int simdsearchwithlengthFOR(uint32_t initvalue, const __m128i *in, uint32_t bit,
int length, uint32_t key, uint32_t *presult);
#ifdef __cplusplus
} // extern "C"
#endif
#endif /* INCLUDE_SIMDFOR_H_ */
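The intended frame-of-reference workflow, distilled from the testFOR unit test further down in this diff (an illustrative sketch; simdmaxmin_length requires SSE4.1):

#include <stdio.h>
#include "simdfor.h"

int main(void) {
#ifdef __SSE4_1__
    uint32_t data[SIMDBlockSize], back[SIMDBlockSize];
    __m128i buffer[32];
    uint32_t i, lo, hi, b;
    for (i = 0; i < SIMDBlockSize; ++i)
        data[i] = 1000 + 3 * i;      /* values confined to a narrow range */
    simdmaxmin_length(data, SIMDBlockSize, &lo, &hi);
    b = bits(hi - lo);               /* FOR: only the spread is encoded */
    simdpackFOR(lo, data, buffer, b);
    simdunpackFOR(lo, buffer, back, b);
    for (i = 0; i < SIMDBlockSize; ++i)
        if (back[i] != data[i]) return 1;
    printf("FOR needs %u bits per value for range %u..%u\n", b, lo, hi);
#endif
    return 0;
}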

View File

@@ -0,0 +1,98 @@
/**
* This code is released under a BSD License.
*/
#ifndef SIMD_INTEGRATED_BITPACKING_H
#define SIMD_INTEGRATED_BITPACKING_H
#include "portability.h"
/* SSE2 is required */
#include <emmintrin.h>
#include "simdcomputil.h"
#include "simdbitpacking.h"
#ifdef __cplusplus
extern "C" {
#endif
/* reads 128 values from "in", writes "bit" 128-bit vectors to "out"
integer values should be in sorted order (for best results).
The differences are masked so that only the least significant "bit" bits are used. */
void simdpackd1(uint32_t initvalue, const uint32_t * in,__m128i * out, const uint32_t bit);
/* reads 128 values from "in", writes "bit" 128-bit vectors to "out"
integer values should be in sorted order (for best results).
The difference values are assumed to be less than 1<<bit. */
void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in,__m128i * out, const uint32_t bit);
/* reads "bit" 128-bit vectors from "in", writes 128 values to "out" */
void simdunpackd1(uint32_t initvalue, const __m128i * in,uint32_t * out, const uint32_t bit);
/* searches "bit" 128-bit vectors from "in" (= 128 encoded integers) for the first encoded uint32 value
* which is >= |key|, and returns its position. It is assumed that the values
* stored are in sorted order.
* The encoded key is stored in "*presult". If no value is larger or equal to the key,
* 128 is returned. The pointer initOffset is a pointer to the last four value decoded
* (when starting out, this can be a zero vector or initialized with _mm_set1_epi32(init)),
* and the vector gets updated.
**/
int
simdsearchd1(__m128i * initOffset, const __m128i *in, uint32_t bit,
uint32_t key, uint32_t *presult);
/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for the first encoded uint32 value
* which is >= |key|, and returns its position. It is assumed that the values
* stored are in sorted order.
* The encoded key is stored in "*presult".
* The first length decoded integers, ignoring others. If no value is larger or equal to the key,
* length is returned. Length should be no larger than 128.
*
* If no value is larger or equal to the key,
* length is returned */
int simdsearchwithlengthd1(uint32_t initvalue, const __m128i *in, uint32_t bit,
int length, uint32_t key, uint32_t *presult);
/* returns the value stored at the specified "slot". */
uint32_t simdselectd1(uint32_t initvalue, const __m128i *in, uint32_t bit,
int slot);
/* given a block of 128 packed values, this function sets the value at index "index" to "value",
* you must somehow know the previous value.
* Because of differential coding, all following values are incremented by the offset between this new
* value and the old value...
* This function is useful if you want to modify the last value.
*/
void simdfastsetd1fromprevious( __m128i * in, uint32_t bit, uint32_t previousvalue, uint32_t value, size_t index);
/* given a block of 128 packed values, this function sets the value at index "index" to "value",
* This function computes the previous value if needed.
* Because of differential coding, all following values are incremented by the offset between this new
* value and the old value...
* This function is useful if you want to modify the last value.
*/
void simdfastsetd1(uint32_t initvalue, __m128i * in, uint32_t bit, uint32_t value, size_t index);
/* Simply scans the data.
 * The pointer initOffset is a pointer to the last four values decoded
* (when starting out, this can be a zero vector or initialized with _mm_set1_epi32(init);),
* and the vector gets updated.
* */
void
simdscand1(__m128i * initOffset, const __m128i *in, uint32_t bit);
#ifdef __cplusplus
} // extern "C"
#endif
#endif
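For sorted input, the d1 family above packs deltas rather than values; here is a round-trip sketch mirroring the test() function later in this diff (illustrative, not part of the library):

#include <stdio.h>
#include "simdintegratedbitpacking.h"

int main(void) {
    uint32_t data[SIMDBlockSize], back[SIMDBlockSize];
    __m128i buffer[32];
    uint32_t i, b, offset = 0;       /* offset = last value of the previous block */
    for (i = 0; i < SIMDBlockSize; ++i)
        data[i] = 5 * i;             /* sorted, e.g. ascending doc ids */
    b = simdmaxbitsd1(offset, data); /* width of the largest delta, not the largest value */
    simdpackwithoutmaskd1(offset, data, buffer, b);
    simdunpackd1(offset, buffer, back, b);
    for (i = 0; i < SIMDBlockSize; ++i)
        if (back[i] != data[i]) return 1;
    printf("d1 needs %u bits per delta\n", b);
    return 0;
}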

79
cpp/simdcomp/makefile Normal file
View File

@@ -0,0 +1,79 @@
# minimalist makefile
.SUFFIXES:
#
.SUFFIXES: .cpp .o .c .h
ifeq ($(DEBUG),1)
CFLAGS = -fPIC -std=c89 -ggdb -msse4.1 -march=native -Wall -Wextra -Wshadow -fsanitize=undefined -fno-omit-frame-pointer -fsanitize=address
else
CFLAGS = -fPIC -std=c89 -O3 -msse4.1 -march=native -Wall -Wextra -Wshadow
endif # debug
LDFLAGS = -shared
LIBNAME=libsimdcomp.so.0.0.3
all: unit unit_chars bitpackingbenchmark $(LIBNAME)
test: unit unit_chars
./unit
./unit_chars
install: $(LIBNAME)
cp $(LIBNAME) /usr/local/lib
ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so
ldconfig
cp $(HEADERS) /usr/local/include
HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h ./include/simdfor.h ./include/avxbitpacking.h
uninstall:
for h in $(HEADERS) ; do rm /usr/local/$$h; done
rm /usr/local/lib/$(LIBNAME)
rm /usr/local/lib/libsimdcomp.so
ldconfig
OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o \
simdpackedsearch.o simdpackedselect.o simdfor.o avxbitpacking.o
$(LIBNAME): $(OBJECTS)
$(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS)
avxbitpacking.o: ./src/avxbitpacking.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/avxbitpacking.c -Iinclude
simdfor.o: ./src/simdfor.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/simdfor.c -Iinclude
simdcomputil.o: ./src/simdcomputil.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude
simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude
simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude
simdpackedsearch.o: ./src/simdpackedsearch.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/simdpackedsearch.c -Iinclude
simdpackedselect.o: ./src/simdpackedselect.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/simdpackedselect.c -Iinclude
example: ./example.c $(HEADERS) $(OBJECTS)
$(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS)
unit: ./tests/unit.c $(HEADERS) $(OBJECTS)
$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude $(OBJECTS)
bitpackingbenchmark: ./benchmarks/bitpackingbenchmark.c $(HEADERS) $(OBJECTS)
$(CC) $(CFLAGS) -o bitpackingbenchmark ./benchmarks/bitpackingbenchmark.c -Iinclude $(OBJECTS)
benchmark: ./benchmarks/benchmark.c $(HEADERS) $(OBJECTS)
$(CC) $(CFLAGS) -o benchmark ./benchmarks/benchmark.c -Iinclude $(OBJECTS)
dynunit: ./tests/unit.c $(HEADERS) $(LIBNAME)
$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude -lsimdcomp
unit_chars: ./tests/unit_chars.c $(HEADERS) $(OBJECTS)
$(CC) $(CFLAGS) -o unit_chars ./tests/unit_chars.c -Iinclude $(OBJECTS)
clean:
rm -f unit *.o $(LIBNAME) example benchmark bitpackingbenchmark dynunit unit_chars

104
cpp/simdcomp/makefile.vc Normal file
View File

@@ -0,0 +1,104 @@
!IFNDEF MACHINE
!IF "$(PROCESSOR_ARCHITECTURE)"=="AMD64"
MACHINE=x64
!ELSE
MACHINE=x86
!ENDIF
!ENDIF
!IFNDEF DEBUG
DEBUG=no
!ENDIF
!IFNDEF CC
CC=cl.exe
!ENDIF
!IFNDEF AR
AR=lib.exe
!ENDIF
!IFNDEF LINK
LINK=link.exe
!ENDIF
!IFNDEF PGO
PGO=no
!ENDIF
!IFNDEF PGI
PGI=no
!ENDIF
INC = /Iinclude
!IF "$(DEBUG)"=="yes"
CFLAGS = /nologo /MDd /LDd /Od /Zi /D_DEBUG /RTC1 /W3 /GS /Gm
ARFLAGS = /nologo
LDFLAGS = /nologo /debug /nodefaultlib:msvcrt
!ELSE
CFLAGS = /nologo /MD /O2 /Zi /DNDEBUG /W3 /Gm- /GS /Gy /Oi /GL /MP
ARFLAGS = /nologo /LTCG
LDFLAGS = /nologo /LTCG /DYNAMICBASE /incremental:no /debug /opt:ref,icf
!ENDIF
!IF "$(PGI)"=="yes"
LDFLAGS = $(LDFLAGS) /ltcg:pgi
!ENDIF
!IF "$(PGO)"=="yes"
LDFLAGS = $(LDFLAGS) /ltcg:pgo
!ENDIF
LIB_OBJS = simdbitpacking.obj simdintegratedbitpacking.obj simdcomputil.obj \
simdpackedsearch.obj simdpackedselect.obj simdfor.obj
all: lib dll dynunit unit_chars example benchmark
# need some good use case scenario to train the instrumented build
@if "$(PGI)"=="yes" echo Running PGO training
@if "$(PGI)"=="yes" benchmark.exe >nul 2>&1
@if "$(PGI)"=="yes" example.exe >nul 2>&1
$(LIB_OBJS):
$(CC) $(INC) $(CFLAGS) /c src/simdbitpacking.c src/simdintegratedbitpacking.c src/simdcomputil.c \
src/simdpackedsearch.c src/simdpackedselect.c src/simdfor.c
lib: $(LIB_OBJS)
$(AR) $(ARFLAGS) /OUT:simdcomp_a.lib $(LIB_OBJS)
dll: $(LIB_OBJS)
$(LINK) /DLL $(LDFLAGS) /OUT:simdcomp.dll /IMPLIB:simdcomp.lib /DEF:simdcomp.def $(LIB_OBJS)
unit: lib
$(CC) $(INC) $(CFLAGS) /c src/unit.c
$(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp_a.lib
dynunit: dll
$(CC) $(INC) $(CFLAGS) /c src/unit.c
$(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp.lib
unit_chars: lib
$(CC) $(INC) $(CFLAGS) /c src/unit_chars.c
$(LINK) $(LDFLAGS) /OUT:unit_chars.exe unit_chars.obj simdcomp.lib
example: lib
$(CC) $(INC) $(CFLAGS) /c example.c
$(LINK) $(LDFLAGS) /OUT:example.exe example.obj simdcomp.lib
benchmark: lib
$(CC) $(INC) $(CFLAGS) /c src/benchmark.c
$(LINK) $(LDFLAGS) /OUT:benchmark.exe benchmark.obj simdcomp.lib
clean:
del /Q *.obj
del /Q *.lib
del /Q *.exe
del /Q *.dll
del /Q *.pgc
del /Q *.pgd
del /Q *.pdb

16
cpp/simdcomp/package.json Normal file
View File

@@ -0,0 +1,16 @@
{
"name": "simdcomp",
"version": "0.0.3",
"repo": "lemire/simdcomp",
"description": "A simple C library for compressing lists of integers",
"license": "BSD-3-Clause",
"src": [
"src/simdbitpacking.c",
"src/simdcomputil.c",
"src/simdintegratedbitpacking.c",
"include/simdbitpacking.h",
"include/simdcomp.h",
"include/simdcomputil.h",
"include/simdintegratedbitpacking.h"
]
}

View File

@@ -0,0 +1,182 @@
#!/usr/bin/env python3
import sys
def howmany(bit):
""" how many values are we going to pack? """
return 256
def howmanywords(bit):
return (howmany(bit) * bit + 255)//256
def howmanybytes(bit):
return howmanywords(bit) * 32 # a 256-bit word is 32 bytes
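# note: packing 256 values at width "bit" occupies exactly "bit" 256-bit
# words (which is what the ceiling above evaluates to), i.e. 32*bit bytes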
print("""
/** code generated by avxpacking.py starts here **/
""")
print("""typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed);""")
print("""typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout);""")
def plurial(number):
if(number != 1):
return "s"
else :
return ""
print("")
print("static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {");
print(" (void)compressed;");
print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
print("}");
print("")
for bit in range(1,33):
print("")
print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
print("static void avxpackblock{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
print(" const __m256i * in = (const __m256i *) pin;");
print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
if(howmanywords(bit) == 1):
print(" __m256i w0;")
else:
print(" __m256i w0, w1;")
if( (bit & (bit-1)) != 0) : print(" __m256i tmp; /* used to store inputs at word boundary */")
oldword = 0
for j in range(howmany(bit)//8):
firstword = j * bit // 32
if(firstword > oldword):
print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
oldword = firstword
secondword = (j * bit + bit - 1)//32
firstshift = (j*bit) % 32
if( firstword == secondword):
if(firstshift == 0):
print(" w{0} = _mm256_lddqu_si256 (in + {1});".format(firstword%2,j))
else:
print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(_mm256_lddqu_si256 (in + {1}) , {2}));".format(firstword%2,j,firstshift))
else:
print(" tmp = _mm256_lddqu_si256 (in + {0});".format(j))
print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
secondshift = 32-firstshift
print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
print("}");
print("")
print("")
print("static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {");
print(" (void)compressed;");
print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
print("}");
print("")
for bit in range(1,33):
print("")
print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
print("static void avxpackblockmask{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
if(howmanywords(bit) == 1):
print(" __m256i w0;")
else:
print(" __m256i w0, w1;")
print(" const __m256i * in = (const __m256i *) pin;");
if(bit < 32): print(" const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
def maskfnc(x):
if(bit == 32): return x
return " _mm256_and_si256 ( mask, {0}) ".format(x)
if( (bit & (bit-1)) != 0) : print(" __m256i tmp; /* used to store inputs at word boundary */")
oldword = 0
for j in range(howmany(bit)//8):
firstword = j * bit // 32
if(firstword > oldword):
print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
oldword = firstword
secondword = (j * bit + bit - 1)//32
firstshift = (j*bit) % 32
loadstr = maskfnc(" _mm256_lddqu_si256 (in + {0}) ".format(j))
if( firstword == secondword):
if(firstshift == 0):
print(" w{0} = {1};".format(firstword%2,loadstr))
else:
print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift))
else:
print(" tmp = {0};".format(loadstr))
print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
secondshift = 32-firstshift
print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
print("}");
print("")
print("static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {");
print(" (void) compressed;");
print(" memset(pout,0,{0});".format(howmany(0)));
print("}");
print("")
for bit in range(1,33):
print("")
print("/* we packed {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
print("static void avxunpackblock{0}(const __m256i * compressed, uint32_t * pout) {{".format(bit));
print(" /* we are going to access {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
if(howmanywords(bit) == 1):
print(" __m256i w0;")
else:
print(" __m256i w0, w1;")
print(" __m256i * out = (__m256i *) pout;");
if(bit < 32): print(" const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
maskstr = " _mm256_and_si256 ( mask, {0}) "
if (bit == 32) : maskstr = " {0} " # no need
oldword = 0
print(" w0 = _mm256_lddqu_si256 (compressed);")
for j in range(howmany(bit)//8):
firstword = j * bit // 32
secondword = (j * bit + bit - 1)//32
if(secondword > oldword):
print(" w{0} = _mm256_lddqu_si256 (compressed + {1});".format(secondword%2,secondword))
oldword = secondword
firstshift = (j*bit) % 32
firstshiftstr = "_mm256_srli_epi32( w{0} , "+str(firstshift)+") "
if(firstshift == 0):
firstshiftstr =" w{0} " # no need
wfirst = firstshiftstr.format(firstword%2)
if( firstword == secondword):
if(firstshift + bit != 32):
wfirst = maskstr.format(wfirst)
print(" _mm256_storeu_si256(out + {0}, {1});".format(j,wfirst))
else:
secondshift = (32-firstshift)
wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift)
wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst,wsecond)
wfirstorsecond = maskstr.format(wfirstorsecond)
print(" _mm256_storeu_si256(out + {0},\n {1});".format(j,wfirstorsecond))
print("}");
print("")
print("static avxpackblockfnc avxfuncPackArr[] = {")
for bit in range(0,32):
print("&avxpackblock{0},".format(bit))
print("&avxpackblock32")
print("};")
print("static avxpackblockfnc avxfuncPackMaskArr[] = {")
for bit in range(0,32):
print("&avxpackblockmask{0},".format(bit))
print("&avxpackblockmask32")
print("};")
print("static avxunpackblockfnc avxfuncUnpackArr[] = {")
for bit in range(0,32):
print("&avxunpackblock{0},".format(bit))
print("&avxunpackblock32")
print("};")
print("/** code generated by avxpacking.py ends here **/")

152
cpp/simdcomp/scripts/simdfor.py Executable file
View File

@@ -0,0 +1,152 @@
#!/usr/bin/env python3
from math import ceil
print("""
/**
* Blablabla
*
*/
""");
def mask(bit):
return str((1 << bit) - 1)
for length in [32]:
print("""
static __m128i iunpackFOR0(__m128i initOffset, const __m128i * _in , uint32_t * _out) {
__m128i *out = (__m128i*)(_out);
int i;
(void) _in;
for (i = 0; i < 8; ++i) {
_mm_store_si128(out++, initOffset);
_mm_store_si128(out++, initOffset);
_mm_store_si128(out++, initOffset);
_mm_store_si128(out++, initOffset);
}
return initOffset;
}
""")
print("""
static void ipackFOR0(__m128i initOffset , const uint32_t * _in , __m128i * out ) {
(void) initOffset;
(void) _in;
(void) out;
}
""")
for bit in range(1,33):
offsetVar = " initOffset";
print("""
static void ipackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const uint32_t * _in, __m128i * out) {
const __m128i *in = (const __m128i*)(_in);
__m128i OutReg;
""");
if (bit != 32):
print(" __m128i CurrIn = _mm_load_si128(in);");
print(" __m128i InReg = _mm_sub_epi32(CurrIn, initOffset);");
else:
print(" __m128i InReg = _mm_load_si128(in);");
print(" (void) initOffset;");
inwordpointer = 0
valuecounter = 0
for k in range(ceil((length * bit) / 32)):
if(valuecounter == length): break
for x in range(inwordpointer,32,bit):
if(x!=0) :
print(" OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, " + str(x) + "));");
else:
print(" OutReg = InReg; ");
if((x+bit>=32) ):
while(inwordpointer<32):
inwordpointer += bit
print(" _mm_store_si128(out, OutReg);");
print("");
if(valuecounter + 1 < length):
print(" ++out;")
inwordpointer -= 32;
if(inwordpointer>0):
print(" OutReg = _mm_srli_epi32(InReg, " + str(bit) + " - " + str(inwordpointer) + ");");
if(valuecounter + 1 < length):
print(" ++in;")
if (bit != 32):
print(" CurrIn = _mm_load_si128(in);");
print(" InReg = _mm_sub_epi32(CurrIn, initOffset);");
else:
print(" InReg = _mm_load_si128(in);");
print("");
valuecounter = valuecounter + 1
if(valuecounter == length): break
assert(valuecounter == length)
print("\n}\n\n""")
for bit in range(1,32):
offsetVar = " initOffset";
print("""\n
static __m128i iunpackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const __m128i* in, uint32_t * _out) {
""");
print(""" __m128i* out = (__m128i*)(_out);
__m128i InReg = _mm_load_si128(in);
__m128i OutReg;
__m128i tmp;
const __m128i mask = _mm_set1_epi32((1U<<"""+str(bit)+""")-1);
""");
MainText = "";
MainText += "\n";
inwordpointer = 0
valuecounter = 0
for k in range(ceil((length * bit) / 32)):
for x in range(inwordpointer,32,bit):
if(valuecounter == length): break
if (x > 0):
MainText += " tmp = _mm_srli_epi32(InReg," + str(x) +");\n";
else:
MainText += " tmp = InReg;\n";
if(x+bit<32):
MainText += " OutReg = _mm_and_si128(tmp, mask);\n";
else:
MainText += " OutReg = tmp;\n";
if((x+bit>=32) ):
while(inwordpointer<32):
inwordpointer += bit
if(valuecounter + 1 < length):
MainText += " ++in;"
MainText += " InReg = _mm_load_si128(in);\n";
inwordpointer -= 32;
if(inwordpointer>0):
MainText += " OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, " + str(bit) + "-" + str(inwordpointer) + "), mask));\n\n";
if (bit != 32):
MainText += " OutReg = _mm_add_epi32(OutReg, initOffset);\n";
MainText += " _mm_store_si128(out++, OutReg);\n\n";
MainText += "";
valuecounter = valuecounter + 1
if(valuecounter == length): break
assert(valuecounter == length)
print(MainText)
print(" return initOffset;");
print("\n}\n\n")
print("""
static __m128i iunpackFOR32(__m128i initvalue , const __m128i* in, uint32_t * _out) {
__m128i * mout = (__m128i *)_out;
__m128i invec;
size_t k;
for(k = 0; k < 128/4; ++k) {
invec = _mm_load_si128(in++);
_mm_store_si128(mout++, invec);
}
return invec;
}
""")

40
cpp/simdcomp/simdcomp.def Normal file
View File

@@ -0,0 +1,40 @@
EXPORTS
simdpack
simdpackwithoutmask
simdunpack
bits
maxbits
maxbits_length
simdmin
simdmin_length
simdmaxmin
simdmaxmin_length
simdmaxbitsd1
simdmaxbitsd1_length
simdpackd1
simdpackwithoutmaskd1
simdunpackd1
simdsearchd1
simdsearchwithlengthd1
simdselectd1
simdpackFOR
simdselectFOR
simdsearchwithlengthFOR
simdunpackFOR
simdmin_length
simdmaxmin
simdmaxmin_length
simdpack_length
simdpackFOR_length
simdunpackFOR_length
simdpack_shortlength
simdfastsetFOR
simdfastset
simdfastsetd1
simdunpack_length
simdunpack_shortlength
simdsearchwithlengthFOR
simdscand1
simdfastsetd1fromprevious
simdfastsetd1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,234 @@
/**
* This code is released under a BSD License.
*/
#include "simdcomputil.h"
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
#include <assert.h>
#define Delta(curr, prev) \
_mm_sub_epi32(curr, \
_mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12)))
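/* Delta(curr, prev) computes four differences at once: lane i of curr minus
   lane i-1, where lane 0 borrows the top lane of prev. In other words it
   turns lanes (A,B,C,D) into (A - prev3, B - A, C - B, D - C). */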
/* returns the integer logarithm of v (bit width) */
uint32_t bits(const uint32_t v) {
#ifdef _MSC_VER
unsigned long answer;
if (v == 0) {
return 0;
}
_BitScanReverse(&answer, v);
return answer + 1;
#else
return v == 0 ? 0 : 32 - __builtin_clz(v); /* assume GCC-like compiler if not microsoft */
#endif
}
static uint32_t maxbitas32int(const __m128i accumulator) {
const __m128i _tmp1 = _mm_or_si128(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) or (C,D,0,0) = (A|C, B|D, C, D) */
const __m128i _tmp2 = _mm_or_si128(_mm_srli_si128(_tmp1, 4), _tmp1); /* lane 0 now holds A|B|C|D */
uint32_t ans = _mm_cvtsi128_si32(_tmp2);
return bits(ans);
}
SIMDCOMP_PURE uint32_t maxbits(const uint32_t * begin) {
const __m128i* pin = (const __m128i*)(begin);
__m128i accumulator = _mm_loadu_si128(pin);
uint32_t k = 1;
for(; 4*k < SIMDBlockSize; ++k) {
__m128i newvec = _mm_loadu_si128(pin+k);
accumulator = _mm_or_si128(accumulator,newvec);
}
return maxbitas32int(accumulator);
}
static uint32_t orasint(const __m128i accumulator) {
const __m128i _tmp1 = _mm_or_si128(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) or (C,D,0,0) = (A|C, B|D, C, D) */
const __m128i _tmp2 = _mm_or_si128(_mm_srli_si128(_tmp1, 4), _tmp1); /* lane 0 now holds A|B|C|D */
return _mm_cvtsi128_si32(_tmp2);
}
#ifdef __SSE4_1__
static uint32_t minasint(const __m128i accumulator) {
const __m128i _tmp1 = _mm_min_epu32(_mm_srli_si128(accumulator, 8), accumulator); /* lane-wise min against (C,D,0,0); lanes 0,1 = (min(A,C), min(B,D)) */
const __m128i _tmp2 = _mm_min_epu32(_mm_srli_si128(_tmp1, 4), _tmp1); /* lane 0 now holds min(A,B,C,D) */
return _mm_cvtsi128_si32(_tmp2);
}
static uint32_t maxasint(const __m128i accumulator) {
const __m128i _tmp1 = _mm_max_epu32(_mm_srli_si128(accumulator, 8), accumulator); /* lane-wise max against (C,D,0,0); lanes 0,1 = (max(A,C), max(B,D)) */
const __m128i _tmp2 = _mm_max_epu32(_mm_srli_si128(_tmp1, 4), _tmp1); /* lane 0 now holds max(A,B,C,D) */
return _mm_cvtsi128_si32(_tmp2);
}
uint32_t simdmin(const uint32_t * in) {
const __m128i* pin = (const __m128i*)(in);
__m128i accumulator = _mm_loadu_si128(pin);
uint32_t k = 1;
for(; 4*k < SIMDBlockSize; ++k) {
__m128i newvec = _mm_loadu_si128(pin+k);
accumulator = _mm_min_epu32(accumulator,newvec);
}
return minasint(accumulator);
}
void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax) {
const __m128i* pin = (const __m128i*)(in);
__m128i minaccumulator = _mm_loadu_si128(pin);
__m128i maxaccumulator = minaccumulator;
uint32_t k = 1;
for(; 4*k < SIMDBlockSize; ++k) {
__m128i newvec = _mm_loadu_si128(pin+k);
minaccumulator = _mm_min_epu32(minaccumulator,newvec);
maxaccumulator = _mm_max_epu32(maxaccumulator,newvec);
}
*getmin = minasint(minaccumulator);
*getmax = maxasint(maxaccumulator);
}
uint32_t simdmin_length(const uint32_t * in, uint32_t length) {
uint32_t currentmin = 0xFFFFFFFF;
uint32_t lengthdividedby4 = length / 4;
uint32_t offset = lengthdividedby4 * 4;
uint32_t k;
if (lengthdividedby4 > 0) {
const __m128i* pin = (const __m128i*)(in);
__m128i accumulator = _mm_loadu_si128(pin);
k = 1;
for(; 4*k < lengthdividedby4 * 4; ++k) {
__m128i newvec = _mm_loadu_si128(pin+k);
accumulator = _mm_min_epu32(accumulator,newvec);
}
currentmin = minasint(accumulator);
}
for (k = offset; k < length; ++k)
if (in[k] < currentmin)
currentmin = in[k];
return currentmin;
}
void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax) {
uint32_t lengthdividedby4 = length / 4;
uint32_t offset = lengthdividedby4 * 4;
uint32_t k;
*getmin = 0xFFFFFFFF;
*getmax = 0;
if (lengthdividedby4 > 0) {
const __m128i* pin = (const __m128i*)(in);
__m128i minaccumulator = _mm_loadu_si128(pin);
__m128i maxaccumulator = minaccumulator;
k = 1;
for(; 4*k < lengthdividedby4 * 4; ++k) {
__m128i newvec = _mm_loadu_si128(pin+k);
minaccumulator = _mm_min_epu32(minaccumulator,newvec);
maxaccumulator = _mm_max_epu32(maxaccumulator,newvec);
}
*getmin = minasint(minaccumulator);
*getmax = maxasint(maxaccumulator);
}
for (k = offset; k < length; ++k) {
if (in[k] < *getmin)
*getmin = in[k];
if (in[k] > *getmax)
*getmax = in[k];
}
}
#endif
SIMDCOMP_PURE uint32_t maxbits_length(const uint32_t * in,uint32_t length) {
uint32_t k;
uint32_t lengthdividedby4 = length / 4;
uint32_t offset = lengthdividedby4 * 4;
uint32_t bigxor = 0;
if(lengthdividedby4 > 0) {
const __m128i* pin = (const __m128i*)(in);
__m128i accumulator = _mm_loadu_si128(pin);
k = 1;
for(; 4*k < 4*lengthdividedby4; ++k) {
__m128i newvec = _mm_loadu_si128(pin+k);
accumulator = _mm_or_si128(accumulator,newvec);
}
bigxor = orasint(accumulator);
}
for(k = offset; k < length; ++k)
bigxor |= in[k];
return bits(bigxor);
}
/* maxbits over 128 integers (SIMDBlockSize) with provided initial value */
uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) {
__m128i initoffset = _mm_set1_epi32 (initvalue);
const __m128i* pin = (const __m128i*)(in);
__m128i newvec = _mm_loadu_si128(pin);
__m128i accumulator = Delta(newvec , initoffset);
__m128i oldvec = newvec;
uint32_t k = 1;
for(; 4*k < SIMDBlockSize; ++k) {
newvec = _mm_loadu_si128(pin+k);
accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec));
oldvec = newvec;
}
initoffset = oldvec;
return maxbitas32int(accumulator);
}
/* maxbit over |length| integers with provided initial value */
uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in,
uint32_t length) {
__m128i newvec;
__m128i oldvec;
__m128i initoffset;
__m128i accumulator;
const __m128i *pin;
uint32_t tmparray[4];
uint32_t k = 1;
uint32_t acc;
assert(length > 0);
pin = (const __m128i *)(in);
initoffset = _mm_set1_epi32(initvalue);
switch (length) {
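/* for short inputs, replicate the last real value into the unused lanes so
   that their deltas come out as zero and cannot inflate the bit width */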
case 1:
newvec = _mm_set1_epi32(in[0]);
break;
case 2:
newvec = _mm_setr_epi32(in[0], in[1], in[1], in[1]);
break;
case 3:
newvec = _mm_setr_epi32(in[0], in[1], in[2], in[2]);
break;
default:
newvec = _mm_loadu_si128(pin);
break;
}
accumulator = Delta(newvec, initoffset);
oldvec = newvec;
/* process 4 integers and build an accumulator */
while (k * 4 + 4 <= length) {
newvec = _mm_loadu_si128(pin + k);
accumulator = _mm_or_si128(accumulator, Delta(newvec, oldvec));
oldvec = newvec;
k++;
}
/* extract the accumulator as an integer */
_mm_storeu_si128((__m128i *)(tmparray), accumulator);
acc = tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3];
/* now process the remaining integers */
for (k *= 4; k < length; k++)
acc |= in[k] - (k == 0 ? initvalue : in[k - 1]);
/* return the number of bits */
return bits(acc);
}

14501
cpp/simdcomp/src/simdfor.c Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

900
cpp/simdcomp/tests/unit.c Normal file
View File

@@ -0,0 +1,900 @@
/**
* This code is released under a BSD License.
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include "simdcomp.h"
int testshortpack() {
int bit;
size_t i;
size_t length;
__m128i * bb;
srand(0);
printf("testshortpack\n");
for (bit = 0; bit < 32; ++bit) {
const size_t N = 128;
uint32_t * data = malloc(N * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t));
uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
for (i = 0; i < N; ++i) {
data[i] = rand() & ((1 << bit) - 1);
}
for (length = 0; length <= N; ++length) {
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
bb = simdpack_shortlength(data, length, (__m128i *) buffer,
bit);
if((bb - (__m128i *) buffer) * sizeof(__m128i) != (unsigned) simdpack_compressedbytes(length,bit)) {
printf("bug\n");
return -1;
}
simdunpack_shortlength((__m128i *) buffer, length,
backdata, bit);
for (i = 0; i < length; ++i) {
if (data[i] != backdata[i]) {
printf("bug\n");
return -1;
}
}
}
free(data);
free(backdata);
free(buffer);
}
return 0;
}
int testlongpack() {
int bit;
size_t i;
size_t length;
__m128i * bb;
srand(0);
printf("testlongpack\n");
for (bit = 0; bit < 32; ++bit) {
const size_t N = 2048;
uint32_t * data = malloc(N * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t));
uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
for (i = 0; i < N; ++i) {
data[i] = rand() & ((1 << bit) - 1);
}
for (length = 0; length <= N; ++length) {
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
bb = simdpack_length(data, length, (__m128i *) buffer,
bit);
if((bb - (__m128i *) buffer) * sizeof(__m128i) != (unsigned) simdpack_compressedbytes(length,bit)) {
printf("bug\n");
return -1;
}
simdunpack_length((__m128i *) buffer, length,
backdata, bit);
for (i = 0; i < length; ++i) {
if (data[i] != backdata[i]) {
printf("bug\n");
return -1;
}
}
}
free(data);
free(backdata);
free(buffer);
}
return 0;
}
int testset() {
int bit;
size_t i;
const size_t N = 128;
uint32_t * data = malloc(N * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t));
uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
srand(0);
for (bit = 0; bit < 32; ++bit) {
printf("simple set %d \n",bit);
for (i = 0; i < N; ++i) {
data[i] = rand() & ((1 << bit) - 1);
}
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
simdpack(data, (__m128i *) buffer, bit);
simdunpack((__m128i *) buffer, backdata, bit);
for (i = 0; i < N; ++i) {
if (data[i] != backdata[i]) {
printf("bug\n");
return -1;
}
}
for(i = N ; i > 0; i--) {
simdfastset((__m128i *) buffer, bit, data[N - i], i - 1);
}
simdunpack((__m128i *) buffer, backdata, bit);
for (i = 0; i < N; ++i) {
if (data[i] != backdata[N - i - 1]) {
printf("bug\n");
return -1;
}
}
simdpack(data, (__m128i *) buffer, bit);
for(i = 1 ; i <= N; i++) {
simdfastset((__m128i *) buffer, bit, data[i - 1], i - 1);
}
simdunpack((__m128i *) buffer, backdata, bit);
for (i = 0; i < N; ++i) {
if (data[i] != backdata[i]) {
printf("bug\n");
return -1;
}
}
}
free(data);
free(backdata);
free(buffer);
return 0;
}
#ifdef __SSE4_1__
int testsetd1() {
int bit;
size_t i;
uint32_t newvalue;
const size_t N = 128;
uint32_t * data = malloc(N * sizeof(uint32_t));
uint32_t * datazeroes = malloc(N * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t));
uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
srand(0);
for (bit = 0; bit < 32; ++bit) {
printf("simple set d1 %d \n",bit);
data[0] = rand() & ((1 << bit) - 1);
datazeroes[0] = 0;
for (i = 1; i < N; ++i) {
data[i] = data[i - 1] + (rand() & ((1 << bit) - 1));
datazeroes[i] = 0;
}
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
simdpackd1(0,datazeroes, (__m128i *) buffer, bit);
for(i = 1 ; i <= N; i++) {
simdfastsetd1(0,(__m128i *) buffer, bit, data[i - 1], i - 1);
newvalue = simdselectd1(0, (const __m128i *) buffer, bit,i - 1);
if( newvalue != data[i-1] ) {
printf("bad set-select\n");
return -1;
}
}
simdunpackd1(0,(__m128i *) buffer, backdata, bit);
for (i = 0; i < N; ++i) {
if (data[i] != backdata[i])
return -1;
}
}
free(data);
free(backdata);
free(buffer);
free(datazeroes);
return 0;
}
#endif
int testsetFOR() {
int bit;
size_t i;
uint32_t newvalue;
const size_t N = 128;
uint32_t * data = malloc(N * sizeof(uint32_t));
uint32_t * datazeroes = malloc(N * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t));
uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
srand(0);
for (bit = 0; bit < 32; ++bit) {
printf("simple set FOR %d \n",bit);
for (i = 0; i < N; ++i) {
data[i] = (rand() & ((1 << bit) - 1));
datazeroes[i] = 0;
}
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
simdpackFOR(0,datazeroes, (__m128i *) buffer, bit);
for(i = 1 ; i <= N; i++) {
simdfastsetFOR(0,(__m128i *) buffer, bit, data[i - 1], i - 1);
newvalue = simdselectFOR(0, (const __m128i *) buffer, bit,i - 1);
if( newvalue != data[i-1] ) {
printf("bad set-select\n");
return -1;
}
}
simdunpackFOR(0,(__m128i *) buffer, backdata, bit);
for (i = 0; i < N; ++i) {
if (data[i] != backdata[i])
return -1;
}
}
free(data);
free(backdata);
free(buffer);
free(datazeroes);
return 0;
}
int testshortFORpack() {
int bit;
size_t i;
__m128i * rb;
size_t length;
uint32_t offset = 7;
srand(0);
for (bit = 0; bit < 32; ++bit) {
const size_t N = 128;
uint32_t * data = malloc(N * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t));
uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
for (i = 0; i < N; ++i) {
data[i] = (rand() & ((1 << bit) - 1)) + offset;
}
for (length = 0; length <= N; ++length) {
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
rb = simdpackFOR_length(offset,data, length, (__m128i *) buffer,
bit);
if(((rb - (__m128i *) buffer)*sizeof(__m128i)) != (unsigned) simdpackFOR_compressedbytes(length,bit)) {
return -1;
}
simdunpackFOR_length(offset,(__m128i *) buffer, length,
backdata, bit);
for (i = 0; i < length; ++i) {
if (data[i] != backdata[i])
return -1;
}
}
free(data);
free(backdata);
free(buffer);
}
return 0;
}
#ifdef __AVX2__
int testbabyavx() {
int bit;
int trial;
unsigned int i,j;
const size_t N = AVXBlockSize;
srand(0);
printf("testbabyavx\n");
printf("bit = ");
for (bit = 0; bit < 32; ++bit) {
printf(" %d ",bit);
fflush(stdout);
for(trial = 0; trial < 100; ++trial) {
uint32_t * data = malloc(N * sizeof(uint32_t)+ 64 * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t) );
__m256i * buffer = malloc((2 * N + 1024) * sizeof(uint32_t) + 32);
for (i = 0; i < N; ++i) {
data[i] = rand() & ((uint32_t)(1 << bit) - 1);
}
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
if(avxmaxbits(data) != maxbits_length(data,N)) {
printf("avxmaxbits is buggy\n");
return -1;
}
avxpackwithoutmask(data, buffer, bit);
avxunpack(buffer, backdata, bit);
for (i = 0; i < AVXBlockSize; ++i) {
if (data[i] != backdata[i]) {
printf("bug\n");
for (j = 0; j < N; ++j) {
if (data[j] != backdata[j]) {
printf("data[%d]=%d v.s. backdata[%d]=%d\n",j,data[j],j,backdata[j]);
} else {
printf("data[%d]=%d\n",j,data[j]);
}
}
return -1;
}
}
free(data);
free(backdata);
free(buffer);
}
}
printf("\n");
return 0;
}
int testavx2() {
int N = 5000 * AVXBlockSize, gap;
__m256i * buffer = malloc(AVXBlockSize * sizeof(uint32_t));
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint32_t * backbuffer = malloc(AVXBlockSize * sizeof(uint32_t));
for (gap = 1; gap <= 387420489; gap *= 3) {
int k;
printf(" gap = %u \n", gap);
for (k = 0; k < N; ++k)
datain[k] = k * gap;
for (k = 0; k * AVXBlockSize < N; ++k) {
/*
First part works for general arrays (sorted or unsorted)
*/
int j;
/* we compute the bit width */
const uint32_t b = avxmaxbits(datain + k * AVXBlockSize);
if(avxmaxbits(datain + k * AVXBlockSize) != maxbits_length(datain + k * AVXBlockSize,AVXBlockSize)) {
printf("avxmaxbits is buggy %d %d \n",
avxmaxbits(datain + k * AVXBlockSize),
maxbits_length(datain + k * AVXBlockSize,AVXBlockSize));
return -1;
}
printf("bit width = %d\n",b);
/* we read 256 integers at "datain + k * AVXBlockSize" and
write b 256-bit vectors at "buffer" */
avxpackwithoutmask(datain + k * AVXBlockSize, buffer, b);
/* we read back b 256-bit vectors at "buffer" and write 256 integers at backbuffer */
avxunpack(buffer, backbuffer, b);/* uncompressed */
for (j = 0; j < AVXBlockSize; ++j) {
if (backbuffer[j] != datain[k * AVXBlockSize + j]) {
int i;
printf("bug in avxpack\n");
for(i = 0; i < AVXBlockSize; ++i) {
printf("data[%d]=%d got back %d %s\n",i,
datain[k * AVXBlockSize + i],backbuffer[i],
datain[k * AVXBlockSize + i]!=backbuffer[i]?"bug":"");
}
return -2;
}
}
}
}
free(buffer);
free(datain);
free(backbuffer);
printf("Code looks good.\n");
return 0;
}
#endif /* avx2 */
int test() {
int N = 5000 * SIMDBlockSize, gap;
__m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
for (gap = 1; gap <= 387420489; gap *= 3) {
int k;
printf(" gap = %u \n", gap);
for (k = 0; k < N; ++k)
datain[k] = k * gap;
for (k = 0; k * SIMDBlockSize < N; ++k) {
/*
First part works for general arrays (sorted or unsorted)
*/
int j;
/* we compute the bit width */
const uint32_t b = maxbits(datain + k * SIMDBlockSize);
/* we read 128 integers at "datain + k * SIMDBlockSize" and
write b 128-bit vectors at "buffer" */
simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b);
/* we read back b 128-bit vectors at "buffer" and write 128 integers at backbuffer */
simdunpack(buffer, backbuffer, b);/* uncompressed */
for (j = 0; j < SIMDBlockSize; ++j) {
if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
printf("bug in simdpack\n");
return -2;
}
}
{
/*
next part assumes that the data is sorted (uses differential coding)
*/
uint32_t offset = 0;
/* we compute the bit width */
const uint32_t b1 = simdmaxbitsd1(offset,
datain + k * SIMDBlockSize);
/* we read 128 integers at "datain + k * SIMDBlockSize" and
write b1 128-bit vectors at "buffer" */
simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer,
b1);
/* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
simdunpackd1(offset, buffer, backbuffer, b1);
for (j = 0; j < SIMDBlockSize; ++j) {
if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
printf("bug in simdpack d1\n");
return -3;
}
}
offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
}
}
}
free(buffer);
free(datain);
free(backbuffer);
printf("Code looks good.\n");
return 0;
}
#ifdef __SSE4_1__
int testFOR() {
int N = 5000 * SIMDBlockSize, gap;
__m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
uint32_t tmax, tmin, tb;
for (gap = 1; gap <= 387420489; gap *= 2) {
int k;
printf(" gap = %u \n", gap);
for (k = 0; k < N; ++k)
datain[k] = k * gap;
for (k = 0; k * SIMDBlockSize < N; ++k) {
int j;
simdmaxmin_length(datain + k * SIMDBlockSize,SIMDBlockSize,&tmin,&tmax);
/* we compute the bit width */
tb = bits(tmax - tmin);
/* we read 128 integers at "datain + k * SIMDBlockSize" and
write b 128-bit vectors at "buffer" */
simdpackFOR(tmin,datain + k * SIMDBlockSize, buffer, tb);
for (j = 0; j < SIMDBlockSize; ++j) {
uint32_t selectedvalue = simdselectFOR(tmin,buffer,tb,j);
if (selectedvalue != datain[k * SIMDBlockSize + j]) {
printf("bug in simdselectFOR\n");
return -3;
}
}
/* we read back tb 128-bit vectors at "buffer" and write 128 integers at backbuffer */
simdunpackFOR(tmin,buffer, backbuffer, tb);/* uncompressed */
for (j = 0; j < SIMDBlockSize; ++j) {
if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
printf("bug in simdpackFOR\n");
return -2;
}
}
}
}
free(buffer);
free(datain);
free(backbuffer);
printf("Code looks good.\n");
return 0;
}
#endif
#define MAX 300
int test_simdmaxbitsd1_length() {
uint32_t result, buffer[MAX + 1];
int i, j;
memset(&buffer[0], 0xff, sizeof(buffer));
/* this test creates buffers of different length; each buffer is
* initialized to result in the following deltas:
* length 1: 2
* length 2: 1 2
* length 3: 1 1 2
* length 4: 1 1 1 2
* length 5: 1 1 1 1 2
* etc. Each sequence's "maxbits" is 2. */
for (i = 0; i < MAX; i++) {
for (j = 0; j < i; j++)
buffer[j] = j + 1;
buffer[i] = i + 2;
result = simdmaxbitsd1_length(0, &buffer[0], i + 1);
if (result != 2) {
printf("simdmaxbitsd1_length: unexpected result %u in loop %d\n",
result, i);
return -1;
}
}
printf("simdmaxbitsd1_length: ok\n");
return 0;
}
int uint32_cmp(const void *a, const void *b)
{
const uint32_t *ia = (const uint32_t *)a;
const uint32_t *ib = (const uint32_t *)b;
if(*ia < *ib)
return -1;
else if (*ia > *ib)
return 1;
return 0;
}
#ifdef __SSE4_1__
int test_simdpackedsearch() {
uint32_t buffer[128];
uint32_t result = 0;
int b, i;
uint32_t init = 0;
__m128i initial = _mm_set1_epi32(init);
/* initialize the buffer */
for (i = 0; i < 128; i++)
buffer[i] = (uint32_t)(i + 1);
/* this test creates delta encoded buffers with different bits, then
* performs lower bound searches for each key */
for (b = 1; b <= 32; b++) {
uint32_t out[128];
/* delta-encode to 'b' bits */
simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b);
initial = _mm_setzero_si128();
printf("simdsearchd1: %d bits\n", b);
/* now perform the searches */
initial = _mm_set1_epi32(init);
assert(simdsearchd1(&initial, (__m128i *)out, b, 0, &result) == 0);
assert(result > 0);
for (i = 1; i <= 128; i++) {
initial = _mm_set1_epi32(init);
assert(simdsearchd1(&initial, (__m128i *)out, b,
(uint32_t)i, &result) == i - 1);
assert(result == (unsigned)i);
}
initial = _mm_set1_epi32(init);
assert(simdsearchd1(&initial, (__m128i *)out, b, 200, &result)
== 128);
assert(result > 200);
}
printf("simdsearchd1: ok\n");
return 0;
}
int test_simdpackedsearchFOR() {
uint32_t buffer[128];
uint32_t result = 0;
int b;
uint32_t i;
uint32_t maxv, tmin, tmax, tb;
uint32_t out[128];
/* this test creates FOR encoded buffers with different bits, then
 * performs lower bound searches for each key */
for (b = 1; b <= 32; b++) {
/* initialize the buffer */
maxv = (b == 32)
? 0xFFFFFFFF
: ((1U<<b) - 1);
for (i = 0; i < 128; i++)
buffer[i] = maxv * (i + 1) / 128;
simdmaxmin_length(buffer,SIMDBlockSize,&tmin,&tmax);
/* we compute the bit width */
tb = bits(tmax - tmin);
/* FOR-encode to 'tb' bits */
simdpackFOR(tmin, buffer, (__m128i *)out, tb);
printf("simdsearchd1: %d bits\n", b);
/* now perform the searches */
for (i = 0; i < 128; i++) {
assert(buffer[i] == simdselectFOR(tmin, (__m128i *)out, tb,i));
}
for (i = 0; i < 128; i++) {
int x = simdsearchwithlengthFOR(tmin, (__m128i *)out, tb,
128,buffer[i], &result) ;
assert(simdselectFOR(tmin, (__m128i *)out, tb,x) == buffer[x]);
assert(simdselectFOR(tmin, (__m128i *)out, tb,x) == result);
assert(buffer[x] == result);
assert(result == buffer[i]);
assert(buffer[x] == buffer[i]);
}
}
printf("simdsearchFOR: ok\n");
return 0;
}
int test_simdpackedsearch_advanced() {
uint32_t buffer[128];
uint32_t backbuffer[128];
uint32_t out[128];
uint32_t result = 0;
uint32_t b, i;
uint32_t init = 0;
__m128i initial = _mm_set1_epi32(init);
/* this test creates delta encoded buffers with different bits, then
* performs lower bound searches for each key */
for (b = 0; b <= 32; b++) {
uint32_t prev = init;
/* initialize the buffer */
for (i = 0; i < 128; i++) {
buffer[i] = ((uint32_t)(1431655765 * i + 0xFFFFFFFF)) ;
if(b < 32) buffer[i] %= (1<<b);
}
qsort(buffer,128, sizeof(uint32_t), uint32_cmp);
for (i = 0; i < 128; i++) {
buffer[i] = buffer[i] + prev;
prev = buffer[i];
}
for (i = 1; i < 128; i++) {
if(buffer[i] < buffer[i-1] )
buffer[i] = buffer[i-1];
}
assert(simdmaxbitsd1(init, buffer)<=b);
for (i = 0; i < 128; i++) {
out[i] = 0; /* memset would do too */
}
/* delta-encode to 'b' bits */
simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b);
simdunpackd1(init, (__m128i *)out, backbuffer, b);
for (i = 0; i < 128; i++) {
assert(buffer[i] == backbuffer[i]);
}
printf("advanced simdsearchd1: %d bits\n", b);
for (i = 0; i < 128; i++) {
int pos;
initial = _mm_set1_epi32(init);
pos = simdsearchd1(&initial, (__m128i *)out, b,
buffer[i], &result);
assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
buffer[i], &result));
assert(buffer[pos] == buffer[i]);
if(pos > 0)
assert(buffer[pos - 1] < buffer[i]);
assert(result == buffer[i]);
}
for (i = 0; i < 128; i++) {
int pos;
if(buffer[i] == 0) continue;
initial = _mm_set1_epi32(init);
pos = simdsearchd1(&initial, (__m128i *)out, b,
buffer[i] - 1, &result);
assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
buffer[i] - 1, &result));
assert(buffer[pos] >= buffer[i] - 1);
if(pos > 0)
assert(buffer[pos - 1] < buffer[i] - 1);
assert(result == buffer[pos]);
}
for (i = 0; i < 128; i++) {
int pos;
if (buffer[i] + 1 == 0)
continue;
initial = _mm_set1_epi32(init);
pos = simdsearchd1(&initial, (__m128i *) out, b,
buffer[i] + 1, &result);
assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
buffer[i] + 1, &result));
if(pos == 128) {
assert(buffer[i] == buffer[127]);
} else {
assert(buffer[pos] >= buffer[i] + 1);
if (pos > 0)
assert(buffer[pos - 1] < buffer[i] + 1);
assert(result == buffer[pos]);
}
}
}
printf("advanced simdsearchd1: ok\n");
return 0;
}
int test_simdpackedselect() {
uint32_t buffer[128];
uint32_t initial = 33;
int b, i;
/* initialize the buffer */
for (i = 0; i < 128; i++)
buffer[i] = (uint32_t)(initial + i);
/* this test creates delta encoded buffers with different bits, then
 * retrieves each value with simdselectd1 */
for (b = 1; b <= 32; b++) {
uint32_t out[128];
/* delta-encode to 'b' bits */
simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
printf("simdselectd1: %d bits\n", b);
/* now perform the searches */
for (i = 0; i < 128; i++) {
assert(simdselectd1(initial, (__m128i *)out, b, (uint32_t)i)
== initial + i);
}
}
printf("simdselectd1: ok\n");
return 0;
}
int test_simdpackedselect_advanced() {
uint32_t buffer[128];
uint32_t initial = 33;
uint32_t b;
int i;
/* this test creates delta encoded buffers with different bits, then
 * retrieves each value with simdselectd1 */
for (b = 0; b <= 32; b++) {
uint32_t prev = initial;
uint32_t out[128];
/* initialize the buffer */
for (i = 0; i < 128; i++) {
buffer[i] = ((uint32_t)(165576 * i)) ;
if(b < 32) buffer[i] %= (1<<b);
}
for (i = 0; i < 128; i++) {
buffer[i] = buffer[i] + prev;
prev = buffer[i];
}
for (i = 1; i < 128; i++) {
if(buffer[i] < buffer[i-1] )
buffer[i] = buffer[i-1];
}
assert(simdmaxbitsd1(initial, buffer)<=b);
for (i = 0; i < 128; i++) {
out[i] = 0; /* memset would do too */
}
/* delta-encode to 'b' bits */
simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
printf("simdselectd1: %d bits\n", b);
/* now perform the searches */
for (i = 0; i < 128; i++) {
uint32_t valretrieved = simdselectd1(initial, (__m128i *)out, b, (uint32_t)i);
assert(valretrieved == buffer[i]);
}
}
printf("advanced simdselectd1: ok\n");
return 0;
}
#endif
int main() {
int r;
r = testsetFOR();
if (r) {
printf("test failure 1\n");
return r;
}
#ifdef __SSE4_1__
r = testsetd1();
if (r) {
printf("test failure 2\n");
return r;
}
#endif
r = testset();
if (r) {
printf("test failure 3\n");
return r;
}
r = testshortFORpack();
if (r) {
printf("test failure 4\n");
return r;
}
r = testshortpack();
if (r) {
printf("test failure 5\n");
return r;
}
r = testlongpack();
if (r) {
printf("test failure 6\n");
return r;
}
#ifdef __SSE4_1__
r = test_simdpackedsearchFOR();
if (r) {
printf("test failure 7\n");
return r;
}
r = testFOR();
if (r) {
printf("test failure 8\n");
return r;
}
#endif
#ifdef __AVX2__
r= testbabyavx();
if (r) {
printf("test failure baby avx\n");
return r;
}
r = testavx2();
if (r) {
printf("test failure 9 avx\n");
return r;
}
#endif
r = test();
if (r) {
printf("test failure 9\n");
return r;
}
r = test_simdmaxbitsd1_length();
if (r) {
printf("test failure 10\n");
return r;
}
#ifdef __SSE4_1__
r = test_simdpackedsearch();
if (r) {
printf("test failure 11\n");
return r;
}
r = test_simdpackedsearch_advanced();
if (r) {
printf("test failure 12\n");
return r;
}
r = test_simdpackedselect();
if (r) {
printf("test failure 13\n");
return r;
}
r = test_simdpackedselect_advanced();
if (r) {
printf("test failure 14\n");
return r;
}
#endif
printf("All tests OK!\n");
return 0;
}

View File

@@ -0,0 +1,102 @@
/**
* This code is released under a BSD License.
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "simdcomp.h"
#define get_random_char() ((uint8_t)(rand() % 256))
int main() {
int N = 5000 * SIMDBlockSize, gap;
__m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
srand(time(NULL));
for (gap = 1; gap <= 387420489; gap *= 3) {
int k;
printf(" gap = %u \n", gap);
/* simulate some random character string, don't care about endianness */
for (k = 0; k < N; ++k) {
uint8_t _tmp[4];
_tmp[0] = get_random_char();
_tmp[1] = get_random_char();
_tmp[2] = get_random_char();
_tmp[3] = get_random_char();
memmove(&datain[k], _tmp, 4);
}
for (k = 0; k * SIMDBlockSize < N; ++k) {
/*
First part works for general arrays (sorted or unsorted)
*/
int j;
/* we compute the bit width */
const uint32_t b = maxbits(datain + k * SIMDBlockSize);
/* we read 128 integers at "datain + k * SIMDBlockSize" and
write b 128-bit vectors at "buffer" */
simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b);
/* we read back b 128-bit vectors at "buffer" and write 128 integers at backbuffer */
simdunpack(buffer, backbuffer, b);/* uncompressed */
for (j = 0; j < SIMDBlockSize; ++j) {
uint8_t chars_back[4];
uint8_t chars_in[4];
memmove(chars_back, &backbuffer[j], 4);
memmove(chars_in, &datain[k * SIMDBlockSize + j], 4);
if (chars_in[0] != chars_back[0]
|| chars_in[1] != chars_back[1]
|| chars_in[2] != chars_back[2]
|| chars_in[3] != chars_back[3]) {
printf("bug in simdpack\n");
return -2;
}
}
{
/*
next part assumes that the data is sorted (uses differential coding)
*/
uint32_t offset = 0;
/* we compute the bit width */
const uint32_t b1 = simdmaxbitsd1(offset,
datain + k * SIMDBlockSize);
/* we read 128 integers at "datain + k * SIMDBlockSize" and
write b1 128-bit vectors at "buffer" */
simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer,
b1);
/* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
simdunpackd1(offset, buffer, backbuffer, b1);
for (j = 0; j < SIMDBlockSize; ++j) {
uint8_t chars_back[4];
uint8_t chars_in[4];
memmove(chars_back, &backbuffer[j], 4);
memmove(chars_in, &datain[k * SIMDBlockSize + j], 4);
if (chars_in[0] != chars_back[0]
|| chars_in[1] != chars_back[1]
|| chars_in[2] != chars_back[2]
|| chars_in[3] != chars_back[3]) {
printf("bug in simdpack\n");
return -3;
}
}
offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
}
}
}
free(buffer);
free(datain);
free(backbuffer);
printf("Code looks good.\n");
return 0;
}

42
cpp/simdcomp_wrapper.c vendored Normal file
View File

@@ -0,0 +1,42 @@
#include "simdcomp.h"
#include "simdcomputil.h"
// assumes datain has a size of 128 uint32
// and that buffer is large enough to host the data.
size_t compress_sorted(
const uint32_t* datain,
uint8_t* output,
const uint32_t offset) {
const uint32_t b = simdmaxbitsd1(offset, datain);
*output++ = b;
simdpackwithoutmaskd1(offset, datain, (__m128i *) output, b);
return 1 + b * sizeof(__m128i);
}
// assumes datain has a size of 128 uint32
// and that buffer is large enough to host the data.
size_t uncompress_sorted(
const uint8_t* compressed_data,
uint32_t* output,
uint32_t offset) {
const uint32_t b = *compressed_data++;
simdunpackd1(offset, (__m128i *)compressed_data, output, b);
return 1 + b * sizeof(__m128i);
}
size_t compress_unsorted(
const uint32_t* datain,
uint8_t* output) {
const uint32_t b = maxbits(datain);
*output++ = b;
simdpackwithoutmask(datain, (__m128i *) output, b);
return 1 + b * sizeof(__m128i);
}
size_t uncompress_unsorted(
const uint8_t* compressed_data,
uint32_t* output) {
const uint32_t b = *compressed_data++;
simdunpack((__m128i *)compressed_data, output, b);
return 1 + b * sizeof(__m128i);
}
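A sketch of how a caller might drive the wrapper entry points above (the buffer sizing and doc-id values are illustrative assumptions, not part of the diff):

#include <stdint.h>
#include <stdio.h>

size_t compress_sorted(const uint32_t* datain, uint8_t* output, uint32_t offset);
size_t uncompress_sorted(const uint8_t* compressed_data, uint32_t* output, uint32_t offset);

int main(void) {
    uint32_t docs[128], back[128];
    uint8_t buf[1 + 32 * 16];        /* 1 byte of bit width + at most 32 vectors */
    size_t written, read;
    int i;
    for (i = 0; i < 128; ++i)
        docs[i] = 3 * i + 11;        /* strictly increasing doc ids */
    written = compress_sorted(docs, buf, 0);
    read = uncompress_sorted(buf, back, 0);
    for (i = 0; i < 128; ++i)
        if (back[i] != docs[i]) return 1;
    printf("%u bytes written, %u read back\n", (unsigned) written, (unsigned) read);
    return 0;
}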

View File

@@ -1,48 +0,0 @@
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include "simdcomp.h"
#include "simdcomputil.h"
extern "C" {
// assumes datain has a size of 128 uint32
// and that buffer is large enough to host the data.
size_t compress_sorted_cpp(
const uint32_t* datain,
uint8_t* output,
const uint32_t offset) {
const uint32_t b = simdmaxbitsd1(offset, datain);
*output++ = b;
simdpackwithoutmaskd1(offset, datain, (__m128i *) output, b);
return 1 + b * sizeof(__m128i);;
}
// assumes datain has a size of 128 uint32
// and that buffer is large enough to host the data.
size_t uncompress_sorted_cpp(
const uint8_t* compressed_data,
uint32_t* output,
uint32_t offset) {
const uint32_t b = *compressed_data++;
simdunpackd1(offset, (__m128i *)compressed_data, output, b);
return 1 + b * sizeof(__m128i);
}
size_t compress_unsorted_cpp(
const uint32_t* datain,
uint8_t* output) {
const uint32_t b = maxbits(datain);
*output++ = b;
simdpackwithoutmask(datain, (__m128i *) output, b);
return 1 + b * sizeof(__m128i);;
}
size_t uncompress_unsorted_cpp(
const uint8_t* compressed_data,
uint32_t* output) {
const uint32_t b = *compressed_data++;
simdunpack((__m128i *)compressed_data, output, b);
return 1 + b * sizeof(__m128i);
}
}


@@ -52,7 +52,7 @@
<div class="pilwrap ">
<a class="pilcrow" href="#section-2">&#182;</a>
</div>
<p>Let's create a temporary directory for the
sake of this example</p>
</div>
@@ -60,7 +60,7 @@ sake of this example</p>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">if</span> <span class="hljs-keyword">let</span> <span class="hljs-literal">Ok</span>(dir) = TempDir::new(<span class="hljs-string">"tantivy_example_dir"</span>) {
run_example(dir.path()).unwrap();
dir.close().unwrap();
}
}
@@ -78,7 +78,7 @@ sake of this example</p>
<h1 id="defining-the-schema">Defining the schema</h1>
<p>The Tantivy index requires a very strict schema.
The schema declares which fields are in the index,
and for each field, its type and “the way it should
be indexed”.</p>
</div>
@@ -111,12 +111,12 @@ be indexed”.</p>
We want full-text search for it, and we want to be able
to retrieve the document after the search.</p>
<p>TEXT | STORED is some syntactic sugar to describe
that.</p>
<p><code>TEXT</code> means the field should be tokenized and indexed,
along with its term frequency and term positions.</p>
<p><code>STORED</code> means that the field will also be saved
in a compressed, row-oriented key-value store.
This store is useful to reconstruct the
documents that were selected during the search phase.</p>
</div>
@@ -139,7 +139,7 @@ to retrieve the body after the search.</p>
</div>
<div class="content"><div class='highlight'><pre> schema_builder.add_text_field(<span class="hljs-string">"body"</span>, TEXT);
<span class="hljs-keyword">let</span> schema = schema_builder.build();</pre></div></div>
</li>
@@ -173,14 +173,12 @@ with our schema in the directory.</p>
There must be only one writer at a time.
This single <code>IndexWriter</code> is already
multithreaded.</p>
<p>Here we use a buffer of 1 GB. Using a bigger
heap for the indexer can increase its throughput.
This buffer will be split between the indexing
threads.</p>
<p>Here we use a buffer of 50MB per thread. Using a bigger
heap for the indexer can increase its throughput.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> index_writer = <span class="hljs-built_in">try!</span>(index.writer(<span class="hljs-number">1_000_000_000</span>));</pre></div></div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> index_writer = <span class="hljs-built_in">try!</span>(index.writer(<span class="hljs-number">50_000_000</span>));</pre></div></div>
</li>
@@ -213,10 +211,12 @@ one by one in a Document object.</p>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> title = schema.get_field(<span class="hljs-string">"title"</span>).unwrap();
<span class="hljs-keyword">let</span> body = schema.get_field(<span class="hljs-string">"body"</span>).unwrap();
<span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> old_man_doc = Document::<span class="hljs-keyword">default</span>();
old_man_doc.add_text(title, <span class="hljs-string">"The Old Man and the Sea"</span>);
old_man_doc.add_text(body, <span class="hljs-string">"He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish."</span>);</pre></div></div>
old_man_doc.add_text(body,
<span class="hljs-string">"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."</span>);</pre></div></div>
</li>
@@ -231,7 +231,7 @@ one by one in a Document object.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-built_in">try!</span>(index_writer.add_document(old_man_doc));</pre></div></div>
<div class="content"><div class='highlight'><pre> index_writer.add_document(old_man_doc);</pre></div></div>
</li>
@@ -248,13 +248,13 @@ a document object directly from json.</p>
</div>
<div class="content"><div class='highlight'><pre>
<div class="content"><div class='highlight'><pre>
<span class="hljs-keyword">let</span> mice_and_men_doc = <span class="hljs-built_in">try!</span>(schema.parse_document(r#<span class="hljs-string">"{
"</span>title<span class="hljs-string">": "</span>Of Mice and Men<span class="hljs-string">",
"</span>body<span class="hljs-string">": "</span>few miles south of Soledad, the Salinas River drops <span class="hljs-keyword">in</span> close to the hillside bank and runs deep and green. The water is warm too, <span class="hljs-keyword">for</span> it has slipped twinkling over the yellow sands <span class="hljs-keyword">in</span> the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying <span class="hljs-keyword">in</span> their lower leaf junctures the debris of the winters flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool<span class="hljs-string">"
}"</span>#));
<span class="hljs-built_in">try!</span>(index_writer.add_document(mice_and_men_doc));</pre></div></div>
index_writer.add_document(mice_and_men_doc);</pre></div></div>
</li>
@@ -275,7 +275,7 @@ The following document has two titles.</p>
"</span>title<span class="hljs-string">": ["</span>Frankenstein<span class="hljs-string">", "</span>The Modern Promotheus<span class="hljs-string">"],
"</span>body<span class="hljs-string">": "</span>You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence <span class="hljs-keyword">in</span> the success of my undertaking.<span class="hljs-string">"
}"</span>#));
<span class="hljs-built_in">try!</span>(index_writer.add_document(frankenstein_doc));</pre></div></div>
index_writer.add_document(frankenstein_doc);</pre></div></div>
</li>
@@ -288,7 +288,7 @@ The following document has two titles.</p>
</div>
<p>This is an example, so we will only index 3 documents
here. You can check out tantivy's tutorial to index
the English wikipedia. Tantivy's indexing is rather fast.
Indexing 5 million articles of the English wikipedia takes
around 4 minutes on my computer!</p>
@@ -343,15 +343,13 @@ commit.</p>
<a class="pilcrow" href="#section-17">&#182;</a>
</div>
<h1 id="searching">Searching</h1>
<p>Let's search our index. We start
by creating a searcher. There can be more
than one searcher at a time.</p>
<p>You should create a searcher
every time you start a “search query”.</p>
<p>Let's search our index. Start by reloading
searchers in the index. This should be done
after every commit().</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> searcher = index.searcher();</pre></div></div>
<div class="content"><div class='highlight'><pre> <span class="hljs-built_in">try!</span>(index.load_searchers());</pre></div></div>
</li>
@@ -362,14 +360,13 @@ every time you start a “search query”.</p>
<div class="pilwrap ">
<a class="pilcrow" href="#section-18">&#182;</a>
</div>
<p>The query parser can interpret human queries.
Here, if the user does not specify which
field they want to search, tantivy will search
in both title and body.</p>
<p>Afterwards create one (or more) searchers.</p>
<p>You should create a searcher
every time you start a “search query”.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> query_parser = QueryParser::new(index.schema(), <span class="hljs-built_in">vec!</span>(title, body));</pre></div></div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> searcher = index.searcher();</pre></div></div>
</li>
@@ -380,6 +377,24 @@ in both title and body.</p>
<div class="pilwrap ">
<a class="pilcrow" href="#section-19">&#182;</a>
</div>
<p>The query parser can interpret human queries.
Here, if the user does not specify which
field they want to search, tantivy will search
in both title and body.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> query_parser = QueryParser::new(index.schema(), <span class="hljs-built_in">vec!</span>[title, body]);</pre></div></div>
</li>
<li id="section-20">
<div class="annotation">
<div class="pilwrap ">
<a class="pilcrow" href="#section-20">&#182;</a>
</div>
<p>QueryParser may fail if the query is not in the right
format. For user facing applications, this can be a problem.
A ticket has been opened regarding this problem.</p>
@@ -391,11 +406,11 @@ A ticket has been opened regarding this problem.</p>
</li>
<li id="section-20">
<li id="section-21">
<div class="annotation">
<div class="pilwrap ">
<a class="pilcrow" href="#section-20">&#182;</a>
<a class="pilcrow" href="#section-21">&#182;</a>
</div>
<p>A query defines a set of documents, as
well as the way they should be scored.</p>
@@ -408,36 +423,20 @@ any document matching at least one of our terms.</p>
</li>
<li id="section-21">
<div class="annotation">
<div class="pilwrap ">
<a class="pilcrow" href="#section-21">&#182;</a>
</div>
<h3 id="collectors">Collectors</h3>
<p>We are not interested in all of the documents but
only in the top 10. Keeping track of our top 10 best documents
is the role of the TopCollector.</p>
</div>
<div class="content"><div class='highlight'><pre>
<span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> top_collector = TopCollector::with_limit(<span class="hljs-number">10</span>);</pre></div></div>
</li>
<li id="section-22">
<div class="annotation">
<div class="pilwrap ">
<a class="pilcrow" href="#section-22">&#182;</a>
</div>
<p>We can now perform our query.</p>
<h3 id="collectors">Collectors</h3>
<p>We are not interested in all of the documents but
only in the top 10. Keeping track of our top 10 best documents
is the role of the TopCollector.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-built_in">try!</span>(searcher.search(&amp;query, &amp;<span class="hljs-keyword">mut</span> top_collector)));</pre></div></div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> top_collector = TopCollector::with_limit(<span class="hljs-number">10</span>);</pre></div></div>
</li>
@@ -448,12 +447,11 @@ is the role of the TopCollector.</p>
<div class="pilwrap ">
<a class="pilcrow" href="#section-23">&#182;</a>
</div>
<p>Our top collector now contains the 10
most relevant doc ids…</p>
<p>We can now perform our query.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> doc_addresses = top_collector.docs();</pre></div></div>
<div class="content"><div class='highlight'><pre> <span class="hljs-built_in">try!</span>(searcher.search(&amp;*query, &amp;<span class="hljs-keyword">mut</span> top_collector));</pre></div></div>
</li>
@@ -464,7 +462,23 @@ most relevant doc ids…</p>
<div class="pilwrap ">
<a class="pilcrow" href="#section-24">&#182;</a>
</div>
<p>The actual documents still need to be
<p>Our top collector now contains the 10
most relevant doc ids…</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> doc_addresses = top_collector.docs();</pre></div></div>
</li>
<li id="section-25">
<div class="annotation">
<div class="pilwrap ">
<a class="pilcrow" href="#section-25">&#182;</a>
</div>
<p>The actual documents still need to be
retrieved from Tantivy's store.</p>
<p>Since the body field was not configured as stored,
the document returned will only contain
@@ -472,10 +486,10 @@ a title.</p>
</div>
<div class="content"><div class='highlight'><pre>
<div class="content"><div class='highlight'><pre>
<span class="hljs-keyword">for</span> doc_address <span class="hljs-keyword">in</span> doc_addresses {
<span class="hljs-keyword">let</span> retrieved_doc = <span class="hljs-built_in">try!</span>(searcher.doc(&amp;doc_address));
<span class="hljs-built_in">println!</span>(<span class="hljs-string">"{}"</span>, schema.to_json(&amp;retrieved_doc));
<span class="hljs-keyword">let</span> retrieved_doc = <span class="hljs-built_in">try!</span>(searcher.doc(&amp;doc_address));
<span class="hljs-built_in">println!</span>(<span class="hljs-string">"{}"</span>, schema.to_json(&amp;retrieved_doc));
}
<span class="hljs-literal">Ok</span>(())


@@ -10,105 +10,105 @@ use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
fn main() {
// Let's create a temporary directory for the
// sake of this example
if let Ok(dir) = TempDir::new("tantivy_example_dir") {
run_example(dir.path()).unwrap();
dir.close().unwrap();
}
}
fn run_example(index_path: &Path) -> tantivy::Result<()> {
// # Defining the schema
//
// The Tantivy index requires a very strict schema.
// The schema declares which fields are in the index,
// and for each field, its type and "the way it should
// be indexed".
// first we need to define a schema ...
let mut schema_builder = SchemaBuilder::default();
// Our first field is title.
// We want full-text search for it, and we want to be able
// to retrieve the document after the search.
//
// TEXT | STORED is some syntactic sugar to describe
// that.
//
// `TEXT` means the field should be tokenized and indexed,
// along with its term frequency and term positions.
//
// `STORED` means that the field will also be saved
// in a compressed, row-oriented key-value store.
// This store is useful to reconstruct the
// documents that were selected during the search phase.
schema_builder.add_text_field("title", TEXT | STORED);
// Our first field is body.
// We want full-text search for it, and we want to be able
// to retrieve the body after the search.
schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
// # Indexing documents
//
// Let's create a brand new index.
//
// This will actually just save a meta.json
// with our schema in the directory.
let index = try!(Index::create(index_path, schema.clone()));
// To insert document we need an index writer.
// There must be only one writer at a time.
// This single `IndexWriter` is already
// multithreaded.
//
// Here we use a buffer of 1 GB. Using a bigger
// Here we use a buffer of 50MB per thread. Using a bigger
// heap for the indexer can increase its throughput.
// This buffer will be split between the indexing
// threads.
let mut index_writer = try!(index.writer(1_000_000_000));
let mut index_writer = try!(index.writer(50_000_000));
// Let's index our documents!
// We first need a handle on the title and the body field.
// ### Create a document "manually".
//
// We can create a document manually, by setting the fields
// one by one in a Document object.
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(body, "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish.");
old_man_doc.add_text(body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.");
// ... and add it to the `IndexWriter`.
try!(index_writer.add_document(old_man_doc));
index_writer.add_document(old_man_doc);
// ### Create a document directly from json.
//
// Alternatively, we can use our schema to parse
// a document object directly from json.
let mice_and_men_doc = try!(schema.parse_document(r#"{
"title": "Of Mice and Men",
"body": "few miles south of Soledad, the Salinas River drops in close to the hillside bank and runs deep and green. The water is warm too, for it has slipped twinkling over the yellow sands in the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying in their lower leaf junctures the debris of the winters flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool"
}"#));
try!(index_writer.add_document(mice_and_men_doc));
index_writer.add_document(mice_and_men_doc);
// Multi-valued field are allowed, they are
// expressed in JSON by an array.
// The following document has two titles.
@@ -116,20 +116,20 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
"title": ["Frankenstein", "The Modern Promotheus"],
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
}"#));
try!(index_writer.add_document(frankenstein_doc));
index_writer.add_document(frankenstein_doc);
// This is an example, so we will only index 3 documents
// here. You can check out tantivy's tutorial to index
// the English wikipedia. Tantivy's indexing is rather fast.
// Indexing 5 million articles of the English wikipedia takes
// around 4 minutes on my computer!
// ### Committing
//
// At this point our documents are not searchable.
//
// We need to call .commit() explicitly to force the
// index_writer to finish processing the documents in the queue,
// flush the current index to the disk, and advertise
@@ -137,22 +137,25 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
//
// This call is blocking.
try!(index_writer.commit());
// If `.commit()` returns correctly, then all of the
// documents that have been added are guaranteed to be
// persistently indexed.
//
// In the scenario of a crash or a power failure,
// tantivy behaves as if has rolled back to its last
// commit.
// # Searching
//
// Let's search our index. We start
// by creating a searcher. There can be more
// than one searcher at a time.
//
// Let's search our index. Start by reloading
// searchers in the index. This should be done
// after every commit().
try!(index.load_searchers());
// Afterwards create one (or more) searchers.
//
// You should create a searcher
// every time you start a "search query".
let searcher = index.searcher();
@@ -161,46 +164,45 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// Here, if the user does not specify which
// field they want to search, tantivy will search
// in both title and body.
let query_parser = QueryParser::new(index.schema(), vec!(title, body));
let query_parser = QueryParser::new(index.schema(), vec![title, body]);
// QueryParser may fail if the query is not in the right
// format. For user facing applications, this can be a problem.
// A ticket has been opened regarding this problem.
let query = try!(query_parser.parse_query("sea whale"));
// A query defines a set of documents, as
// well as the way they should be scored.
//
// A query created by the query parser is scored according
// to a metric called Tf-Idf, and will consider
// any document matching at least one of our terms.
// ### Collectors
//
// We are not interested in all of the documents but
// only in the top 10. Keeping track of our top 10 best documents
// is the role of the TopCollector.
let mut top_collector = TopCollector::with_limit(10);
// We can now perform our query.
try!(searcher.search(&*query, &mut top_collector));
// Our top collector now contains the 10
// most relevant doc ids...
let doc_addresses = top_collector.docs();
// The actual documents still need to be
// retrieved from Tantivy's store.
//
// Since the body field was not configured as stored,
// the document returned will only contain
// a title.
for doc_address in doc_addresses {
let retrieved_doc = try!(searcher.doc(&doc_address));
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())


@@ -1,7 +1,7 @@
use Result;
use collector::Collector;
use SegmentLocalId;
use SegmentReader;
use std::io;
use DocId;
use Score;
@@ -12,7 +12,7 @@ use Score;
pub struct DoNothingCollector;
impl Collector for DoNothingCollector {
#[inline]
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}
#[inline]
@@ -38,7 +38,7 @@ impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
}
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
try!(self.left.set_segment(segment_local_id, segment));
try!(self.right.set_segment(segment_local_id, segment));
Ok(())


@@ -1,7 +1,7 @@
use std::io;
use super::Collector;
use DocId;
use Score;
use Result;
use SegmentReader;
use SegmentLocalId;
@@ -28,7 +28,7 @@ impl Default for CountCollector {
impl Collector for CountCollector {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}


@@ -2,7 +2,7 @@ use SegmentReader;
use SegmentLocalId;
use DocId;
use Score;
use std::io;
use Result;
mod count_collector;
pub use self::count_collector::CountCollector;
@@ -48,14 +48,14 @@ pub use self::chained_collector::chain;
pub trait Collector {
/// `set_segment` is called before beginning to enumerate
/// on this segment.
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>;
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
}
impl<'a, C: Collector> Collector for &'a mut C {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
(*self).set_segment(segment_local_id, segment)
}
/// The query pushes the scored document to the collector via this method.
@@ -73,7 +73,6 @@ pub mod tests {
use DocId;
use Score;
use core::SegmentReader;
use std::io;
use SegmentLocalId;
use fastfield::U32FastFieldReader;
use schema::Field;
@@ -107,7 +106,7 @@ pub mod tests {
impl Collector for TestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.offset += self.segment_max_doc;
self.segment_max_doc = reader.max_doc();
Ok(())
@@ -140,14 +139,14 @@ pub mod tests {
}
}
pub fn vals(&self,) -> &Vec<u32> {
&self.vals
pub fn vals(self,) -> Vec<u32> {
self.vals
}
}
impl Collector for FastFieldTestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
self.ff_reader = Some(try!(reader.get_fast_field_reader(self.field)));
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = reader.get_fast_field_reader(self.field);
Ok(())
}
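For illustration, a minimal custom collector under the new `Result`-based `set_segment` signature, modeled on the `TestCollector` above. The `DocIdCollector` name and struct are hypothetical and not part of this changeset; this is a sketch, not the library's API.

use Result;
use DocId;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::Collector;

/// Collects the global doc id of every matching document.
pub struct DocIdCollector {
    offset: DocId,
    segment_max_doc: DocId,
    docs: Vec<DocId>,
}

impl DocIdCollector {
    pub fn new() -> DocIdCollector {
        DocIdCollector {
            offset: 0,
            segment_max_doc: 0,
            docs: Vec::new(),
        }
    }
}

impl Collector for DocIdCollector {
    fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
        // Shift the offset by the previous segment's max_doc so that
        // doc ids from different segments do not collide.
        self.offset += self.segment_max_doc;
        self.segment_max_doc = reader.max_doc();
        Ok(())
    }

    fn collect(&mut self, doc: DocId, _score: Score) {
        self.docs.push(self.offset + doc);
    }
}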


@@ -1,7 +1,7 @@
use std::io;
use super::Collector;
use DocId;
use Score;
use Result;
use SegmentReader;
use SegmentLocalId;
@@ -25,7 +25,7 @@ impl<'a> MultiCollector<'a> {
impl<'a> Collector for MultiCollector<'a> {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
for collector in &mut self.collectors {
try!(collector.set_segment(segment_local_id, segment));
}


@@ -1,8 +1,8 @@
use std::io;
use super::Collector;
use SegmentReader;
use SegmentLocalId;
use DocAddress;
use Result;
use std::collections::BinaryHeap;
use std::cmp::Ordering;
use DocId;
@@ -105,7 +105,7 @@ impl TopCollector {
impl Collector for TopCollector {
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
self.segment_id = segment_id;
Ok(())
}


@@ -3,16 +3,15 @@ mod timer;
mod vint;
pub mod bitpacker;
pub use self::serialize::BinarySerializable;
pub use self::timer::Timing;
pub use self::timer::TimerTree;
pub use self::timer::OpenTimer;
pub use self::vint::VInt;
use std::io;
/// Create a default io error given a string.
pub fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}
@@ -30,3 +29,14 @@ pub trait HasLen {
}
/// Creates an uninitialized Vec of a given length.
///
/// `allocate_vec` makes an unsafe call to `set_len`,
/// as other solutions are extremely slow in debug mode.
pub fn allocate_vec<T>(capacity: usize) -> Vec<T> {
let mut v = Vec::with_capacity(capacity);
unsafe {
v.set_len(capacity);
}
v
}
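A sketch of the intended usage pattern, assuming a caller like the compression module later in this diff (`decode_block` is a hypothetical helper; `uncompress_sorted` is the module-private wrapper shown further down): the uninitialized Vec serves strictly as an output buffer that is fully written before any element is read.

use common::allocate_vec;

// Hypothetical helper: decodes one block of 128 sorted doc ids.
fn decode_block(compressed: &[u8], offset: u32) -> Vec<u32> {
    // 128 uninitialized u32 slots. Every slot is overwritten by
    // uncompress_sorted before it is read, so no uninitialized
    // value ever escapes.
    let mut output: Vec<u32> = allocate_vec(128);
    uncompress_sorted(compressed, &mut output[..], offset);
    output
}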


@@ -1,4 +1,3 @@
use byteorder::{ReadBytesExt, WriteBytesExt};
use byteorder::LittleEndian as Endianness;
use std::fmt;
@@ -6,20 +5,12 @@ use std::io::Write;
use std::io::Read;
use std::io;
use common::VInt;
use byteorder;
pub trait BinarySerializable : fmt::Debug + Sized {
fn serialize(&self, writer: &mut Write) -> io::Result<usize>;
fn deserialize(reader: &mut Read) -> io::Result<Self>;
}
fn convert_byte_order_error(byteorder_error: byteorder::Error) -> io::Error {
match byteorder_error {
byteorder::Error::UnexpectedEOF => io::Error::new(io::ErrorKind::InvalidData, "Reached EOF unexpectedly"),
byteorder::Error::Io(e) => e,
}
}
impl BinarySerializable for () {
fn serialize(&self, _: &mut Write) -> io::Result<usize> {
Ok(0)
@@ -62,12 +53,10 @@ impl BinarySerializable for u32 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
writer.write_u32::<Endianness>(*self)
.map(|_| 4)
.map_err(convert_byte_order_error)
}
fn deserialize(reader: &mut Read) -> io::Result<u32> {
reader.read_u32::<Endianness>()
.map_err(convert_byte_order_error)
}
}
@@ -76,24 +65,20 @@ impl BinarySerializable for u64 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
writer.write_u64::<Endianness>(*self)
.map(|_| 8)
.map_err(convert_byte_order_error)
}
fn deserialize(reader: &mut Read) -> io::Result<u64> {
reader.read_u64::<Endianness>()
.map_err(convert_byte_order_error)
}
}
impl BinarySerializable for u8 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
// TODO error
try!(writer.write_u8(*self).map_err(convert_byte_order_error));
try!(writer.write_u8(*self));
Ok(1)
}
fn deserialize(reader: &mut Read) -> io::Result<u8> {
reader.read_u8()
.map_err(convert_byte_order_error)
}
}
@@ -123,7 +108,7 @@ mod test {
fn serialize_test<T: BinarySerializable + Eq>(v: T, num_bytes: usize) {
let mut buffer: Vec<u8> = Vec::new();
if num_bytes != 0 {
assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes);
assert_eq!(buffer.len(), num_bytes);
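Composing `BinarySerializable` for a compound type falls out of the primitive impls above; a minimal sketch, assuming the trait exported from `common` (the `DocSpan` type is hypothetical, not part of this changeset).

use std::io;
use std::io::{Read, Write};
use common::BinarySerializable;

#[derive(Debug, PartialEq, Eq)]
struct DocSpan {
    start: u32,
    len: u32,
}

impl BinarySerializable for DocSpan {
    fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
        // Two little-endian u32s, 8 bytes total.
        let mut written = try!(self.start.serialize(writer));
        written += try!(self.len.serialize(writer));
        Ok(written)
    }

    fn deserialize(reader: &mut Read) -> io::Result<DocSpan> {
        let start = try!(u32::deserialize(reader));
        let len = try!(u32::deserialize(reader));
        Ok(DocSpan { start: start, len: len })
    }
}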


@@ -1,44 +1,45 @@
use super::NUM_DOCS_PER_BLOCK;
use libc::size_t;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
extern {
fn compress_sorted_cpp(
data: *const u32,
output: *mut u8,
offset: u32) -> size_t;
mod simdcomp {
use libc::size_t;
fn uncompress_sorted_cpp(
compressed_data: *const u8,
output: *mut u32,
offset: u32) -> size_t;
fn compress_unsorted_cpp(
data: *const u32,
output: *mut u8) -> size_t;
extern {
pub fn compress_sorted(
data: *const u32,
output: *mut u8,
offset: u32) -> size_t;
fn uncompress_unsorted_cpp(
compressed_data: *const u8,
output: *mut u32) -> size_t;
pub fn uncompress_sorted(
compressed_data: *const u8,
output: *mut u32,
offset: u32) -> size_t;
pub fn compress_unsorted(
data: *const u32,
output: *mut u8) -> size_t;
pub fn uncompress_unsorted(
compressed_data: *const u8,
output: *mut u32) -> size_t;
}
}
fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize {
unsafe { compress_sorted_cpp(vals.as_ptr(), output.as_mut_ptr(), offset) }
unsafe { simdcomp::compress_sorted(vals.as_ptr(), output.as_mut_ptr(), offset) }
}
fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
unsafe { uncompress_sorted_cpp(compressed_data.as_ptr(), output.as_mut_ptr(), offset) }
unsafe { simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset) }
}
fn compress_unsorted(vals: &[u32], output: &mut [u8]) -> usize {
unsafe { compress_unsorted_cpp(vals.as_ptr(), output.as_mut_ptr()) }
unsafe { simdcomp::compress_unsorted(vals.as_ptr(), output.as_mut_ptr()) }
}
fn uncompress_unsorted(compressed_data: &[u8], output: &mut [u32]) -> usize {
unsafe { uncompress_unsorted_cpp(compressed_data.as_ptr(), output.as_mut_ptr()) }
unsafe { simdcomp::uncompress_unsorted(compressed_data.as_ptr(), output.as_mut_ptr()) }
}
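A round trip through the safe wrappers above, sketched as a unit test that would have to live inside this module since the wrappers are private (the test name and values are illustrative). The framing is one header byte holding the bit width b, followed by b 128-bit vectors, i.e. 1 + 16 * b bytes per block of 128 integers.

#[test]
fn test_roundtrip_sorted_block() {
    let offset = 10u32;
    // 128 strictly increasing values, all greater than the offset,
    // as required by the differential (d1) coding.
    let vals: Vec<u32> = (0..128u32).map(|i| offset + 1 + i * 3).collect();
    // Worst case is b = 32: 1 + 32 * 16 = 513 bytes
    // (COMPRESSED_BLOCK_MAX_SIZE above).
    let mut compressed = vec![0u8; 128 * 4 + 1];
    let written = compress_sorted(&vals[..], &mut compressed[..], offset);
    let mut decoded = vec![0u32; 128];
    let read = uncompress_sorted(&compressed[..written], &mut decoded[..], offset);
    assert_eq!(written, read);
    assert_eq!(&decoded[..], &vals[..]);
}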


@@ -2,59 +2,53 @@ use Result;
use Error;
use schema::Schema;
use std::sync::Arc;
use std::borrow::BorrowMut;
use std::fmt;
use rustc_serialize::json;
use core::SegmentId;
use directory::{Directory, MmapDirectory, RAMDirectory};
use indexer::IndexWriter;
use indexer::index_writer::open_index_writer;
use core::searcher::Searcher;
use std::convert::From;
use num_cpus;
use super::segment::Segment;
use core::SegmentReader;
use super::pool::Pool;
use core::SegmentMeta;
use super::pool::LeasedItem;
use std::path::Path;
use indexer::SegmentManager;
use core::IndexMeta;
use IndexWriter;
use directory::ManagedDirectory;
use core::META_FILEPATH;
use super::segment::create_segment;
use indexer::segment_updater::save_new_metas;
const NUM_SEARCHERS: usize = 12;
/// Accessor to the index segment manager
///
/// This method is not part of tantivy's public API
pub fn get_segment_manager(index: &Index) -> Arc<SegmentManager> {
index.segment_manager.clone()
}
fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_file = try!(directory.open_read(&META_FILEPATH));
let meta_content = String::from_utf8_lossy(meta_file.as_slice());
json::decode(&meta_content)
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
json::decode(&meta_string)
.map_err(|e| Error::CorruptedFile(META_FILEPATH.clone(), Box::new(e)))
}
/// Tantivy's Search Index
pub struct Index {
segment_manager: Arc<SegmentManager>,
directory: Box<Directory>,
directory: ManagedDirectory,
schema: Schema,
searcher_pool: Arc<Pool<Searcher>>,
docstamp: u64,
}
impl Index {
/// Creates a new index using the `RAMDirectory`.
///
/// The index will be allocated in anonymous memory.
/// This should only be used for unit tests.
pub fn create_in_ram(schema: Schema) -> Index {
let directory = Box::new(RAMDirectory::create());
let ram_directory = RAMDirectory::create();
let directory = ManagedDirectory::new(ram_directory).expect("Creating a managed directory from a brand new RAM directory should never fail.");
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail") // unwrap is ok here
}
@@ -63,9 +57,9 @@ impl Index {
///
/// If a previous index was in this directory, then its meta file will be destroyed.
pub fn create(directory_path: &Path, schema: Schema) -> Result<Index> {
let mut directory = MmapDirectory::open(directory_path)?;
save_new_metas(schema.clone(), 0, &mut directory)?;
Index::from_directory(box directory, schema)
let mmap_directory = MmapDirectory::open(directory_path)?;
let directory = ManagedDirectory::new(mmap_directory)?;
Index::from_directory(directory, schema)
}
/// Creates a new index in a temp directory.
@@ -77,49 +71,55 @@ impl Index {
/// The temp directory is only used for testing the `MmapDirectory`.
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
let directory = Box::new(try!(MmapDirectory::create_from_tempdir()));
let mmap_directory = MmapDirectory::create_from_tempdir()?;
let directory = ManagedDirectory::new(mmap_directory)?;
Index::from_directory(directory, schema)
}
/// Creates a new index given a directory and an `IndexMeta`.
fn create_from_metas(directory: Box<Directory>, metas: IndexMeta) -> Result<Index> {
fn create_from_metas(directory: ManagedDirectory, metas: IndexMeta) -> Result<Index> {
let schema = metas.schema.clone();
let docstamp = metas.docstamp;
let committed_segments = metas.committed_segments;
// TODO log something if uncommitted is not empty.
let index = Index {
segment_manager: Arc::new(SegmentManager::from_segments(committed_segments)),
directory: directory,
schema: schema,
searcher_pool: Arc::new(Pool::new()),
docstamp: docstamp,
};
try!(index.load_searchers());
Ok(index)
}
/// Opens a new directory from a directory.
pub fn from_directory(directory: Box<Directory>, schema: Schema) -> Result<Index> {
/// Create a new index from a directory.
pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
Index::create_from_metas(directory, IndexMeta::with_schema(schema))
}
/// Opens an index from a directory path.
pub fn open(directory_path: &Path) -> Result<Index> {
let directory = try!(MmapDirectory::open(directory_path));
let metas = try!(load_metas(&directory)); //< TODO does the directory already exists?
Index::create_from_metas(directory.box_clone(), metas)
let mmap_directory = MmapDirectory::open(directory_path)?;
let directory = ManagedDirectory::new(mmap_directory)?;
let metas = try!(load_metas(&directory));
Index::create_from_metas(directory, metas)
}
/// Returns the index docstamp.
/// Returns the index opstamp.
///
/// The docstamp is the number of documents that have been added
/// The opstamp is the number of documents that have been added
/// from the beginning of time, and until the moment of the last commit.
pub fn docstamp(&self) -> u64 {
self.docstamp
pub fn opstamp(&self) -> u64 {
load_metas(self.directory()).unwrap().opstamp
}
/// Creates a multithreaded writer.
/// Each writer produces an independent segment.
/// Open a new index writer. Attempts to acquire a lockfile.
///
/// The lockfile should be deleted on drop, but it is possible
/// that due to a panic or other error, a stale lockfile will be
/// left in the index directory. If you are sure that no other
/// `IndexWriter` on the system is accessing the index directory,
/// it is safe to manually delete the lockfile.
///
/// num_threads specifies the number of indexing workers that
/// should work at the same time.
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
@@ -129,12 +129,13 @@ impl Index {
num_threads: usize,
heap_size_in_bytes: usize)
-> Result<IndexWriter> {
IndexWriter::open(self, num_threads, heap_size_in_bytes)
open_index_writer(self, num_threads, heap_size_in_bytes)
}
/// Creates a multithreaded writer
/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
@@ -151,47 +152,47 @@ impl Index {
}
/// Returns the list of segments that are searchable
pub fn searchable_segments(&self) -> Vec<Segment> {
self.searchable_segment_ids()
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
Ok(self
.searchable_segment_metas()?
.into_iter()
.map(|segment_id| self.segment(segment_id))
.collect()
.map(|segment_meta| self.segment(segment_meta))
.collect())
}
/// Remove all of the file associated with the segment.
///
/// This method cannot fail. If a problem occurs,
/// some files may end up never being removed.
/// The error will only be logged.
pub fn delete_segment(&self, segment_id: SegmentId) {
self.segment(segment_id).delete();
}
/// Return a segment object given a `segment_id`
///
/// The segment may or may not exist.
pub fn segment(&self, segment_id: SegmentId) -> Segment {
create_segment(self.clone(), segment_id)
}
/// Return a reference to the index directory.
pub fn directory(&self) -> &Directory {
&*self.directory
}
/// Return a mutable reference to the index directory.
pub fn directory_mut(&mut self) -> &mut Directory {
&mut *self.directory
}
/// Returns the list of segment ids that are searchable.
fn searchable_segment_ids(&self) -> Vec<SegmentId> {
self.segment_manager.committed_segments()
#[doc(hidden)]
pub fn segment(&self, segment_meta: SegmentMeta) -> Segment {
create_segment(self.clone(), segment_meta)
}
/// Creates a new segment.
pub fn new_segment(&self) -> Segment {
self.segment(SegmentId::generate_random())
let segment_meta = SegmentMeta::new(SegmentId::generate_random());
create_segment(self.clone(), segment_meta)
}
/// Return a reference to the index directory.
pub fn directory(&self) -> &ManagedDirectory {
&self.directory
}
/// Return a mutable reference to the index directory.
pub fn directory_mut(&mut self) -> &mut ManagedDirectory {
&mut self.directory
}
/// Reads the meta.json and returns the list of
/// `SegmentMeta` from the last commit.
pub fn searchable_segment_metas(&self) -> Result<Vec<SegmentMeta>> {
Ok(load_metas(self.directory())?.segments)
}
/// Returns the list of segment ids that are searchable.
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
Ok(self.searchable_segment_metas()?
.iter()
.map(|segment_meta| segment_meta.id())
.collect())
}
/// Creates a new generation of searchers after
@@ -200,16 +201,14 @@ impl Index {
/// This needs to be called when a new segment has been
/// published or after a merge.
pub fn load_searchers(&self) -> Result<()> {
let searchable_segments = self.searchable_segments();
let mut searchers = Vec::new();
for _ in 0..NUM_SEARCHERS {
let searchable_segments_clone = searchable_segments.clone();
let segment_readers: Vec<SegmentReader> = try!(searchable_segments_clone.into_iter()
let searchable_segments = self.searchable_segments()?;
let segment_readers: Vec<SegmentReader> = try!(searchable_segments
.into_iter()
.map(SegmentReader::open)
.collect());
let searcher = Searcher::from(segment_readers);
searchers.push(searcher);
}
let searchers = (0..NUM_SEARCHERS)
.map(|_| Searcher::from(segment_readers.clone()))
.collect();
self.searcher_pool.publish_new_generation(searchers);
Ok(())
}
@@ -239,12 +238,9 @@ impl fmt::Debug for Index {
impl Clone for Index {
fn clone(&self) -> Index {
Index {
segment_manager: self.segment_manager.clone(),
directory: self.directory.box_clone(),
directory: self.directory.clone(),
schema: self.schema.clone(),
searcher_pool: self.searcher_pool.clone(),
docstamp: self.docstamp,
}
}
}
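The net effect for callers is that segment metadata now always comes from meta.json rather than from in-memory state. A sketch under those assumptions (`inspect` and `index_path` are hypothetical names, and the index is assumed to already exist on disk):

use std::path::Path;
use tantivy::Index;

fn inspect(index_path: &Path) -> tantivy::Result<()> {
    let index = try!(Index::open(index_path));
    println!("opstamp: {}", index.opstamp());
    for segment_meta in try!(index.searchable_segment_metas()) {
        println!("segment {:?}: {} docs, {} deleted",
                 segment_meta.id(),
                 segment_meta.num_docs(),
                 segment_meta.num_deleted_docs());
    }
    Ok(())
}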


@@ -1,7 +1,5 @@
use schema::Schema;
use core::SegmentId;
use core::SegmentMeta;
/// Meta information about the `Index`.
///
@@ -13,35 +11,17 @@ use core::SegmentId;
///
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
pub struct IndexMeta {
pub committed_segments: Vec<SegmentMeta>,
pub uncommitted_segments: Vec<SegmentMeta>,
pub segments: Vec<SegmentMeta>,
pub schema: Schema,
pub docstamp: u64,
pub opstamp: u64,
}
impl IndexMeta {
pub fn with_schema(schema: Schema) -> IndexMeta {
IndexMeta {
committed_segments: Vec::new(),
uncommitted_segments: Vec::new(),
segments: vec!(),
schema: schema,
docstamp: 0u64,
opstamp: 0u64,
}
}
}
#[derive(Clone, Debug, RustcDecodable,RustcEncodable)]
pub struct SegmentMeta {
pub segment_id: SegmentId,
pub num_docs: u32,
}
#[cfg(test)]
impl SegmentMeta {
pub fn new(segment_id: SegmentId, num_docs: u32) -> SegmentMeta {
SegmentMeta {
segment_id: segment_id,
num_docs: num_docs,
}
}
}


@@ -1,5 +1,4 @@
pub mod searcher;
pub mod index;
mod segment_reader;
mod segment_id;
@@ -7,20 +6,38 @@ mod segment_component;
mod segment;
mod index_meta;
mod pool;
mod segment_meta;
mod term_iterator;
use std::path::PathBuf;
pub use self::searcher::Searcher;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::SegmentReader;
pub use self::segment::Segment;
pub use self::segment::SegmentInfo;
pub use self::segment::SerializableSegment;
pub use self::index::Index;
pub use self::index_meta::{IndexMeta, SegmentMeta};
pub use self::segment_meta::SegmentMeta;
pub use self::index_meta::IndexMeta;
pub use self::term_iterator::TermIterator;
use std::path::PathBuf;
lazy_static! {
/// The meta file contains all the information about the list of segments and the schema
/// of the index.
pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");
/// The managed file contains a list of files that were created by tantivy
/// and will therefore be garbage collected when they are deemed useless by tantivy.
///
/// Removing this file is safe, but will prevent the garbage collection of all of the files that
/// are currently in the directory.
pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json");
/// Only one process should be able to write tantivy's index at a time.
/// This file, when present, is in charge of preventing other processes from opening an IndexWriter.
///
/// If the process is killed and this file remains, it is safe to remove it manually.
pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
}


@@ -7,6 +7,8 @@ use query::Query;
use DocId;
use DocAddress;
use schema::Term;
use core::TermIterator;
use std::fmt;
/// Holds a list of `SegmentReader`s ready for search.
@@ -14,13 +16,13 @@ use schema::Term;
/// It guarantees that the `Segment` will not be removed before
/// the destruction of the `Searcher`.
///
#[derive(Debug)]
pub struct Searcher {
segment_readers: Vec<SegmentReader>,
}
impl Searcher {
/// Fetches a document from tantivy's store given a `DocAddress`.
///
/// The searcher uses the segment ordinal to route the
@@ -47,9 +49,21 @@ impl Searcher {
.map(|segment_reader| segment_reader.doc_freq(term))
.fold(0u32, |acc, val| acc + val)
}
/// Returns an iterator over all of the sorted unique terms of
/// the searcher.
///
/// This includes all of the fields from all of the segment_readers.
/// See [TermIterator](struct.TermIterator.html).
///
/// # Warning
/// This API is very likely to change in the future.
pub fn terms<'a>(&'a self) -> TermIterator<'a> {
TermIterator::from(self.segment_readers())
}
/// Return the list of segment readers
pub fn segment_readers(&self,) -> &Vec<SegmentReader> {
pub fn segment_readers(&self,) -> &[SegmentReader] {
&self.segment_readers
}
@@ -70,4 +84,14 @@ impl From<Vec<SegmentReader>> for Searcher {
segment_readers: segment_readers,
}
}
}
impl fmt::Debug for Searcher {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let segment_ids = self.segment_readers
.iter()
.map(|segment_reader| segment_reader.segment_id())
.collect::<Vec<_>>();
write!(f, "Searcher({:?})", segment_ids)
}
}


@@ -1,97 +1,89 @@
use Result;
use std::path::PathBuf;
use schema::Schema;
use DocId;
use std::fmt;
use core::SegmentId;
use directory::{ReadOnlySource, WritePtr};
use directory::{ReadOnlySource, WritePtr, FileProtection};
use indexer::segment_serializer::SegmentSerializer;
use super::SegmentComponent;
use core::Index;
use std::result;
use directory::error::{FileError, OpenWriteError};
use directory::Directory;
use core::SegmentMeta;
use directory::error::{OpenReadError, OpenWriteError};
/// A segment is a piece of the index.
#[derive(Clone)]
pub struct Segment {
index: Index,
segment_id: SegmentId,
meta: SegmentMeta,
}
impl fmt::Debug for Segment {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Segment({:?})", self.segment_id.uuid_string())
write!(f, "Segment({:?})", self.id().uuid_string())
}
}
/// Creates a new segment given an `Index` and a `SegmentMeta`
///
/// The function exists so that segment creation stays private to `tantivy`.
pub fn create_segment(index: Index, segment_id: SegmentId) -> Segment {
pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
Segment {
index: index,
segment_id: segment_id,
meta: meta,
}
}
impl Segment {
/// Returns our index's schema.
pub fn schema(&self,) -> Schema {
self.index.schema()
}
/// Returns the segment meta-information
pub fn meta(&self) -> &SegmentMeta {
&self.meta
}
#[doc(hidden)]
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
self.meta.set_delete_meta(num_deleted_docs, opstamp);
}
/// Returns the segment's id.
pub fn id(&self,) -> SegmentId {
self.segment_id
self.meta.id()
}
/// Returns the relative path of a component of our segment.
///
/// It just joins the segment id with the extension
/// associated to a segment component.
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
self.segment_id.relative_path(component)
self.meta.relative_path(component)
}
/// Deletes all of the document of the segment.
/// This is called when there is a merge or a rollback.
/// Protects a specific component file from being deleted.
///
/// # Disclaimer
/// If deletion of a file fails (e.g. a file
/// was read-only.), the method does not
/// fail and just logs an error
pub fn delete(&self,) {
for component in SegmentComponent::values() {
let rel_path = self.relative_path(component);
if let Err(err) = self.index.directory().delete(&rel_path) {
match err {
FileError::FileDoesNotExist(_) => {
// this is normal behavior.
// the position file for instance may not exists.
}
FileError::IOError(err) => {
error!("Failed to remove {:?} : {:?}", self.segment_id, err);
}
}
}
}
/// Returns a FileProtection object. The file is guaranteed
/// to not be garbage collected as long as this `FileProtection` object
/// lives.
pub fn protect_from_delete(&self, component: SegmentComponent) -> FileProtection {
let path = self.relative_path(component);
self.index.directory().protect_file_from_delete(&path)
}
/// Open one of the component file for read.
pub fn open_read(&self, component: SegmentComponent) -> result::Result<ReadOnlySource, FileError> {
/// Open one of the component file for a *regular* read.
pub fn open_read(&self, component: SegmentComponent) -> result::Result<ReadOnlySource, OpenReadError> {
let path = self.relative_path(component);
let source = try!(self.index.directory().open_read(&path));
Ok(source)
}
/// Open one of the component file for write.
/// Open one of the component file for *regular* write.
pub fn open_write(&mut self, component: SegmentComponent) -> result::Result<WritePtr, OpenWriteError> {
let path = self.relative_path(component);
let write = try!(self.index.directory_mut().open_write(&path));
@@ -108,7 +100,34 @@ pub trait SerializableSegment {
fn write(&self, serializer: SegmentSerializer) -> Result<u32>;
}
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
pub struct SegmentInfo {
pub max_doc: DocId,
#[cfg(test)]
mod tests {
use core::SegmentComponent;
use directory::Directory;
use std::collections::HashSet;
use schema::SchemaBuilder;
use Index;
#[test]
fn test_segment_protect_component() {
let mut index = Index::create_in_ram(SchemaBuilder::new().build());
let segment = index.new_segment();
let path = segment.relative_path(SegmentComponent::POSTINGS);
let directory = index.directory_mut();
directory.atomic_write(&*path, &vec!(0u8)).unwrap();
let living_files = HashSet::new();
{
let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
assert!(directory.exists(&*path));
directory.garbage_collect(living_files.clone());
assert!(directory.exists(&*path));
}
directory.garbage_collect(living_files);
assert!(!directory.exists(&*path));
}
}


@@ -1,41 +1,27 @@
use std::vec::IntoIter;
#[derive(Copy, Clone)]
pub enum SegmentComponent {
INFO,
POSTINGS,
POSITIONS,
FASTFIELDS,
FIELDNORMS,
TERMS,
STORE,
DELETE
}
impl SegmentComponent {
pub fn values() -> IntoIter<SegmentComponent> {
vec!(
SegmentComponent::INFO,
pub fn iterator() -> impl Iterator<Item=&'static SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,
SegmentComponent::FASTFIELDS,
SegmentComponent::FIELDNORMS,
SegmentComponent::TERMS,
SegmentComponent::STORE,
).into_iter()
SegmentComponent::DELETE
];
SEGMENT_COMPONENTS.into_iter()
}
pub fn path_suffix(&self)-> &'static str {
match *self {
SegmentComponent::POSITIONS => ".pos",
SegmentComponent::INFO => ".info",
SegmentComponent::POSTINGS => ".idx",
SegmentComponent::TERMS => ".term",
SegmentComponent::STORE => ".store",
SegmentComponent::FASTFIELDS => ".fast",
SegmentComponent::FIELDNORMS => ".fieldnorm",
}
}
}
}


@@ -1,14 +1,19 @@
use uuid::Uuid;
use std::fmt;
use rustc_serialize::{Encoder, Decoder, Encodable, Decodable};
use core::SegmentComponent;
use std::path::PathBuf;
use std::cmp::{Ordering, Ord};
#[cfg(test)]
use std::sync::atomic;
/// Tantivy SegmentId.
///
/// Tantivy's segments are identified
/// by a UUID which is used to prefix the filenames
/// of all of the files associated with the segment.
///
/// In unit tests, for reproducibility, SegmentIds are
/// simply generated in an auto-increment fashion.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub struct SegmentId(Uuid);
@@ -23,12 +28,12 @@ lazy_static! {
// During tests, we generate the segment ids in an auto-increment manner
// for consistency of segment ids between runs.
//
// The order of test execution is not guaranteed, but the order
// of segments within a single test is guaranteed.
#[cfg(test)]
fn create_uuid() -> Uuid {
let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*EMPTY_ARR)
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*EMPTY_ARR).unwrap()
}
#[cfg(not(test))]
@@ -37,21 +42,26 @@ fn create_uuid() -> Uuid {
}
impl SegmentId {
#[doc(hidden)]
pub fn generate_random() -> SegmentId {
SegmentId(create_uuid())
}
/// Returns a shorter identifier of the segment.
///
/// We are using UUID4, so only 6 bits are fixed,
/// and the rest is random.
///
/// Picking the first 8 chars is ok to identify
/// segments in a display message.
pub fn short_uuid_string(&self,) -> String {
(&self.0.to_simple_string()[..8]).to_string()
(&self.0.simple().to_string()[..8]).to_string()
}
/// Returns a segment uuid string.
pub fn uuid_string(&self,) -> String {
self.0.to_simple_string()
}
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
let filename = self.uuid_string() + component.path_suffix();
PathBuf::from(filename)
self.0.simple().to_string()
}
}
@@ -69,7 +79,7 @@ impl Decodable for SegmentId {
impl fmt::Debug for SegmentId {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "SegmentId({:?})", self.uuid_string())
write!(f, "Seg({:?})", self.short_uuid_string())
}
}

src/core/segment_meta.rs Normal file

@@ -0,0 +1,121 @@
use core::SegmentId;
use super::SegmentComponent;
use std::path::PathBuf;
use std::collections::HashSet;
#[derive(Clone, Debug, RustcDecodable,RustcEncodable)]
struct DeleteMeta {
num_deleted_docs: u32,
opstamp: u64,
}
/// SegmentMeta contains simple meta information about a segment.
///
/// For instance the number of docs it contains,
/// how many are deleted, etc.
#[derive(Clone, Debug, RustcDecodable,RustcEncodable)]
pub struct SegmentMeta {
segment_id: SegmentId,
max_doc: u32,
deletes: Option<DeleteMeta>,
}
impl SegmentMeta {
/// Creates a new segment meta for
/// a segment with no deletes and no documents.
pub fn new(segment_id: SegmentId) -> SegmentMeta {
SegmentMeta {
segment_id: segment_id,
max_doc: 0,
deletes: None,
}
}
/// Returns the segment id.
pub fn id(&self) -> SegmentId {
self.segment_id
}
/// Returns the number of deleted documents.
pub fn num_deleted_docs(&self) -> u32 {
self.deletes
.as_ref()
.map(|delete_meta| delete_meta.num_deleted_docs)
.unwrap_or(0u32)
}
/// Returns the list of files that
/// are required for the segment meta.
///
/// This is useful because tantivy removes files
/// by deleting all files that were created by tantivy
/// and are no longer used by any segment.
pub fn list_files(&self) -> HashSet<PathBuf> {
SegmentComponent::iterator()
.map(|component| {
self.relative_path(*component)
})
.collect::<HashSet<PathBuf>>()
}
/// Returns the relative path of a component of our segment.
///
/// It just joins the segment id with the extension
/// associated to a segment component.
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
let mut path = self.id().uuid_string();
path.push_str(&*match component {
SegmentComponent::POSITIONS => ".pos".to_string(),
SegmentComponent::POSTINGS => ".idx".to_string(),
SegmentComponent::TERMS => ".term".to_string(),
SegmentComponent::STORE => ".store".to_string(),
SegmentComponent::FASTFIELDS => ".fast".to_string(),
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
SegmentComponent::DELETE => {format!(".{}.del", self.delete_opstamp().unwrap_or(0))},
});
PathBuf::from(path)
}
/// Returns the highest doc id + 1.
///
/// If there are no deletes, then num_docs = max_doc
/// and all the doc ids contained in this segment
/// are exactly (0..max_doc).
pub fn max_doc(&self) -> u32 {
self.max_doc
}
/// Return the number of documents in the segment.
pub fn num_docs(&self) -> u32 {
self.max_doc() - self.num_deleted_docs()
}
/// Returns the opstamp of the last delete operation
/// taken in account in this segment.
pub fn delete_opstamp(&self) -> Option<u64> {
self.deletes
.as_ref()
.map(|delete_meta| delete_meta.opstamp)
}
/// Returns true iff the segment meta contains
/// delete information.
pub fn has_deletes(&self) -> bool {
self.deletes.is_some()
}
#[doc(hidden)]
pub fn set_max_doc(&mut self, max_doc: u32) {
self.max_doc = max_doc;
}
#[doc(hidden)]
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
self.deletes = Some(DeleteMeta {
num_deleted_docs: num_deleted_docs,
opstamp: opstamp,
});
}
}
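For illustration, how the new meta drives delete-file naming: the delete component's filename embeds the opstamp of the last delete operation applied to the segment. A sketch of a unit test that could sit in this module; all values are illustrative.

#[cfg(test)]
mod tests {
    use super::SegmentMeta;
    use core::SegmentComponent;
    use core::SegmentId;

    #[test]
    fn test_delete_component_path() {
        let mut meta = SegmentMeta::new(SegmentId::generate_random());
        meta.set_max_doc(1000);
        meta.set_delete_meta(3, 42); // 3 docs deleted as of opstamp 42
        assert_eq!(meta.max_doc(), 1000);
        assert_eq!(meta.num_docs(), 997);
        let del_path = meta.relative_path(SegmentComponent::DELETE);
        assert!(del_path.to_string_lossy().ends_with(".42.del"));
    }
}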


@@ -3,17 +3,18 @@ use core::Segment;
use core::SegmentId;
use core::SegmentComponent;
use schema::Term;
use common::HasLen;
use core::SegmentMeta;
use fastfield::delete::DeleteBitSet;
use store::StoreReader;
use schema::Document;
use directory::ReadOnlySource;
use DocId;
use std::io;
use std::str;
use postings::TermInfo;
use datastruct::FstMap;
use std::sync::Arc;
use std::fmt;
use rustc_serialize::json;
use core::SegmentInfo;
use schema::Field;
use postings::SegmentPostingsOption;
use postings::SegmentPostings;
@@ -22,8 +23,6 @@ use schema::Schema;
use schema::FieldType;
use postings::FreqHandler;
use schema::TextIndexingOptions;
use error::Error;
/// Entry point to access all of the datastructures of the `Segment`
///
@@ -36,14 +35,16 @@ use error::Error;
/// The segment reader has a very low memory footprint,
/// as close to all of the memory data is mmapped.
///
#[derive(Clone)]
pub struct SegmentReader {
    segment_id: SegmentId,
    segment_meta: SegmentMeta,
    term_infos: Arc<FstMap<TermInfo>>,
    postings_data: ReadOnlySource,
    store_reader: StoreReader,
    fast_fields_reader: Arc<U32FastFieldsReader>,
    fieldnorms_reader: Arc<U32FastFieldsReader>,
    delete_bitset: DeleteBitSet,
    positions_data: ReadOnlySource,
    schema: Schema,
}
@@ -54,7 +55,7 @@ impl SegmentReader {
/// Today, `tantivy` does not handle deletes, so it happens
/// to also be the number of documents in the index.
pub fn max_doc(&self) -> DocId {
self.segment_info.max_doc
self.segment_meta.max_doc()
}
/// Returns the number of documents.
@@ -63,20 +64,39 @@ impl SegmentReader {
/// Today, `tantivy` does not handle deletes so max doc and
/// num_docs are the same.
pub fn num_docs(&self) -> DocId {
self.segment_info.max_doc
self.segment_meta.num_docs()
}
/// Return the number of documents that have been
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
self.delete_bitset.len() as DocId
}
/// Accessor to a segment's fast field reader given a field.
///
/// Returns the u32 fast value reader if the field
/// is a u32 field indexed as "fast".
///
/// Returns None if the field is not a u32 field
/// indexed with the fast option.
///
/// # Panics
/// May panic if the index is corrupted.
pub fn get_fast_field_reader(&self, field: Field) -> Option<U32FastFieldReader> {
    let field_entry = self.schema.get_field_entry(field);
    match field_entry.field_type() {
        &FieldType::Str(_) => {
            warn!("Field <{}> is not a fast field. It is a text field, and fast text fields are not supported yet.", field_entry.name());
            None
        },
        &FieldType::U32(ref u32_options) => {
            if u32_options.is_fast() {
                self.fast_fields_reader.get_field(field)
            }
            else {
                warn!("Field <{}> is not defined as a fast field.", field_entry.name());
                None
            }
        },
    }
}
@@ -88,7 +108,7 @@ impl SegmentReader {
///
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> io::Result<U32FastFieldReader> {
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U32FastFieldReader> {
self.fieldnorms_reader.get_field(field)
}
@@ -107,21 +127,7 @@ impl SegmentReader {
/// Open a new segment for reading.
pub fn open(segment: Segment) -> Result<SegmentReader> {
let segment_info_reader = try!(segment.open_read(SegmentComponent::INFO));
let segment_info_data = try!(
str::from_utf8(&*segment_info_reader)
.map_err(|err| {
let segment_info_filepath = segment.relative_path(SegmentComponent::INFO);
Error::CorruptedFile(segment_info_filepath, Box::new(err))
})
);
let segment_info: SegmentInfo = try!(
json::decode(&segment_info_data)
.map_err(|err| {
let file_path = segment.relative_path(SegmentComponent::INFO);
Error::CorruptedFile(file_path, Box::new(err))
})
);
let source = try!(segment.open_read(SegmentComponent::TERMS));
let term_infos = try!(FstMap::from_source(source));
let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE)));
@@ -137,15 +143,25 @@ impl SegmentReader {
.open_read(SegmentComponent::POSITIONS)
.unwrap_or_else(|_| ReadOnlySource::empty());
let delete_bitset =
if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
DeleteBitSet::open(delete_data)
}
else {
DeleteBitSet::empty()
};
let schema = segment.schema();
Ok(SegmentReader {
    segment_meta: segment.meta().clone(),
    postings_data: postings_shared_mmap,
    term_infos: Arc::new(term_infos),
    segment_id: segment.id(),
    store_reader: store_reader,
    fast_fields_reader: Arc::new(fast_fields_reader),
    fieldnorms_reader: Arc::new(fieldnorms_reader),
    delete_bitset: delete_bitset,
    positions_data: positions_data,
    schema: schema,
})
@@ -214,10 +230,15 @@ impl SegmentReader {
FreqHandler::new_without_freq()
}
};
Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, freq_handler))
Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, &self.delete_bitset, freq_handler))
}
/// Returns the posting list associated with a term.
///
/// If the term is not found, return None.
/// Even when it is not None, because of deletes, the posting
/// object returned by this method may contain no documents.
pub fn read_postings_all_info(&self, term: &Term) -> Option<SegmentPostings> {
let field_entry = self.schema.get_field_entry(term.field());
let segment_posting_option = match *field_entry.field_type() {
@@ -237,6 +258,24 @@ impl SegmentReader {
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
self.term_infos.get(term.as_slice())
}
/// Returns the segment id
pub fn segment_id(&self) -> SegmentId {
self.segment_id
}
/// Returns the bitset representing
/// the documents that have been deleted.
pub fn delete_bitset(&self) -> &DeleteBitSet {
&self.delete_bitset
}
/// Returns true iff the `doc` is marked
/// as deleted.
pub fn is_deleted(&self, doc: DocId) -> bool {
self.delete_bitset.is_deleted(doc)
}
}
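// Editor's sketch (not part of the diff): how the delete-aware accessors
// above combine when scanning a segment. `reader` is a hypothetical
// `SegmentReader`.
//
// for doc in 0..reader.max_doc() {
//     if reader.is_deleted(doc) {
//         continue; // the doc id exists but is masked by the delete bitset
//     }
//     // ... collect or score `doc` ...
// }
// assert_eq!(reader.num_docs(), reader.max_doc() - reader.num_deleted_docs());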

src/core/term_iterator.rs (new file, 185 lines)

@@ -0,0 +1,185 @@
use fst::Streamer;
use std::mem;
use std::collections::BinaryHeap;
use fst::map::Keys;
use schema::Field;
use schema::Term;
use core::SegmentReader;
use std::cmp::Ordering;
#[derive(PartialEq, Eq, Debug)]
struct HeapItem {
term: Term,
segment_ord: usize,
}
impl PartialOrd for HeapItem {
fn partial_cmp(&self, other: &HeapItem) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for HeapItem {
fn cmp(&self, other: &HeapItem) -> Ordering {
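// Reversed on purpose: `BinaryHeap` is a max-heap, so flipping the
// comparison makes `pop()` yield the smallest (term, segment_ord)
// pair first, which is what the k-way merge below requires.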
(&other.term, &other.segment_ord).cmp(&(&self.term, &self.segment_ord))
}
}
/// Given a list of sorted term streams,
/// returns an iterator over sorted unique terms.
///
/// The item yielded is actually a pair made of
/// - the term
/// - a slice with the ordinals of the segments containing
///   the term.
pub struct TermIterator<'a> {
key_streams: Vec<Keys<'a>>,
heap: BinaryHeap<HeapItem>,
// Buffer hosting the list of segment ordinals containing
// the current term.
current_term: Term,
current_segment_ords: Vec<usize>,
}
impl<'a> TermIterator<'a> {
fn new(key_streams: Vec<Keys<'a>>) -> TermIterator<'a> {
let key_streams_len = key_streams.len();
TermIterator {
key_streams: key_streams,
heap: BinaryHeap::new(),
current_term: Term::from_field_text(Field(0), ""),
current_segment_ords: (0..key_streams_len).collect(),
}
}
/// Advances the term iterator to the next term.
/// Returns `true` if there is indeed another term,
/// `false` if there is none.
pub fn advance(&mut self) -> bool {
self.advance_segments();
if let Some(mut head) = self.heap.pop() {
mem::swap(&mut self.current_term, &mut head.term);
self.current_segment_ords.push(head.segment_ord);
loop {
match self.heap.peek() {
Some(&ref next_heap_it) if next_heap_it.term == self.current_term => {}
_ => { break; }
}
let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
self.current_segment_ords.push(next_heap_it.segment_ord);
}
true
}
else {
false
}
}
/// Returns the current term.
///
/// This method may only be called
/// if `advance()` has been called before
/// and returned `true`.
pub fn term(&self) -> &Term {
&self.current_term
}
/// Returns the sorted list of segment ordinals
/// that include the current term.
///
/// This method may only be called
/// if `advance()` has been called before
/// and returned `true`.
pub fn segment_ords(&self) -> &[usize] {
&self.current_segment_ords[..]
}
fn advance_segments(&mut self) {
for segment_ord in self.current_segment_ords.drain(..) {
if let Some(term) = self.key_streams[segment_ord].next() {
self.heap.push(HeapItem {
term: Term::from_bytes(term),
segment_ord: segment_ord,
});
}
}
}
}
impl<'a, 'f> Streamer<'a> for TermIterator<'f> {
type Item = &'a Term;
fn next(&'a mut self) -> Option<Self::Item> {
if self.advance() {
Some(&self.current_term)
}
else {
None
}
}
}
impl<'a> From<&'a [SegmentReader]> for TermIterator<'a> {
fn from(segment_readers: &'a [SegmentReader]) -> TermIterator<'a> {
TermIterator::new(
segment_readers
.iter()
.map(|reader| reader.term_infos().keys())
.collect()
)
}
}
#[cfg(test)]
mod tests {
use super::*;
use schema::{SchemaBuilder, Document, TEXT};
use core::Index;
#[test]
fn test_term_iterator() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
{
let mut doc = Document::default();
doc.add_text(text_field, "a b d f");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
{
{
let mut doc = Document::default();
doc.add_text(text_field, "a b c d f");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
{
{
let mut doc = Document::default();
doc.add_text(text_field, "e f");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut term_it = searcher.terms();
let mut terms = String::new();
while let Some(term) = term_it.next() {
unsafe {
terms.push_str(term.text());
}
}
assert_eq!(terms, "abcdef");
}
}


@@ -4,7 +4,6 @@ use std::io;
use std::io::Write;
use fst;
use fst::raw::Fst;
use fst::Streamer;
use directory::ReadOnlySource;
use common::BinarySerializable;
@@ -21,7 +20,7 @@ pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
}
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
pub fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error));
Ok(FstMapBuilder {
@@ -31,7 +30,28 @@ impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
})
}
/// Horribly unsafe, nobody should ever do that... except me :)
///
/// If used, it must be used by systematically alternating calls
/// to insert_key and insert_value.
///
/// TODO see if I can bend Rust's type system to enforce that
/// in a nice way.
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
try!(self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error));
Ok(())
}
/// Horribly unsafe, nobody should ever do that... except me :)
pub fn insert_value(&mut self, value: &V) -> io::Result<()> {
try!(value.serialize(&mut self.data));
Ok(())
}
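// Editor's sketch (not part of the diff): the alternation the doc comment
// above prescribes, e.g. when streaming already-sorted (key, value) pairs
// out of a merge. `sorted_pairs` is hypothetical.
//
// for (key, value) in sorted_pairs {
//     builder.insert_key(key)?;     // registers key -> current data offset
//     builder.insert_value(value)?; // serializes the value at that offset
// }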
#[cfg(test)]
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> {
try!(self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error));
@@ -66,27 +86,10 @@ fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
}))
}
pub struct FstKeyIter<'a, V: 'static + BinarySerializable> {
streamer: fst::map::Stream<'a>,
__phantom__: PhantomData<V>
}
impl<'a, V: 'static + BinarySerializable> FstKeyIter<'a, V> {
pub fn next(&mut self) -> Option<(&[u8])> {
self.streamer
.next()
.map(|(k, _)| k)
}
}
impl<V: BinarySerializable> FstMap<V> {
pub fn keys(&self,) -> fst::map::Keys {
    self.fst_index.keys()
}
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
@@ -123,6 +126,7 @@ mod tests {
use super::*;
use directory::{RAMDirectory, Directory};
use std::path::PathBuf;
use fst::Streamer;
#[test]
fn test_fstmap() {
@@ -143,7 +147,6 @@ mod tests {
assert_eq!(keys.next().unwrap(), "abc".as_bytes());
assert_eq!(keys.next().unwrap(), "abcd".as_bytes());
assert_eq!(keys.next(), None);
}
}


@@ -4,5 +4,4 @@ pub mod stacker;
pub use self::fstmap::FstMapBuilder;
pub use self::fstmap::FstMap;
pub use self::fstmap::FstKeyIter;
pub use self::skip::{SkipListBuilder, SkipList};


@@ -36,7 +36,7 @@ impl<T: BinarySerializable> LayerBuilder<T> {
fn insert(&mut self, doc_id: DocId, value: &T) -> io::Result<Option<(DocId, u32)>> {
self.remaining -= 1;
self.len += 1;
let offset = self.written_size() as u32; // TODO not sure if we want after or here
let offset = self.written_size() as u32;
try!(doc_id.serialize(&mut self.buffer));
try!(value.serialize(&mut self.buffer));
Ok(if self.remaining == 0 {


@@ -1,5 +1,6 @@
use std::cell::UnsafeCell;
use std::mem;
use common::allocate_vec;
use std::ptr;
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
@@ -41,7 +42,6 @@ impl Heap {
self.inner().clear();
}
/// Return the heap capacity.
pub fn capacity(&self,) -> u32 {
self.inner().capacity()
@@ -91,6 +91,10 @@ impl Heap {
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
}
pub fn get_ref<Item>(&self, addr: u32) -> &Item {
self.inner().get_mut_ref(addr)
}
}
@@ -101,22 +105,11 @@ struct InnerHeap {
next_heap: Option<Box<InnerHeap>>,
}
/// initializing a long Vec<u8> is crazy slow in
/// debug mode.
/// We use this unsafe trick to make unit tests
/// way faster.
fn allocate_fast(num_bytes: usize) -> Vec<u8> {
let mut buffer = Vec::with_capacity(num_bytes);
unsafe {
buffer.set_len(num_bytes);
}
buffer
}
impl InnerHeap {
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
let buffer: Vec<u8> = allocate_fast(num_bytes);
let buffer: Vec<u8> = allocate_vec(num_bytes);
InnerHeap {
buffer: buffer,
buffer_len: num_bytes as u32,


@@ -1,13 +1,14 @@
use std::marker::Send;
use std::fmt;
use std::path::Path;
use directory::error::{FileError, OpenWriteError};
use directory::error::{OpenReadError, DeleteError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use std::result;
use std::io;
use std::marker::Sync;
/// Write-once read many (WORM) abstraction for where tantivy's index should be stored.
/// Write-once read many (WORM) abstraction for where
/// tantivy's data should be stored.
///
/// There are currently two implementations of `Directory`
///
@@ -25,16 +26,16 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
///
/// Specifically, subsequent writes or flushes should
/// have no effect on the returned `ReadOnlySource` object.
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, FileError>;
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError>;
/// Removes a file
///
/// Removing a file will not affect an eventual
/// existing ReadOnlySource pointing to it.
///
/// Removing a nonexistent file yields a
/// `DeleteError::FileDoesNotExist`.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError>;
/// Returns true iff the file exists
fn exists(&self, path: &Path) -> bool;
@@ -60,6 +61,12 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
/// The file may not previously exist.
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError>;
/// Reads the full content of a file that has been written using
/// atomic_write.
///
/// This should only be used for small files.
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError>;
/// Atomically replace the content of a file with data.
///
/// This call ensures that reads can never *observe*
@@ -70,6 +77,7 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
/// Clones the directory and boxes the clone
fn box_clone(&self) -> Box<Directory>;
}


@@ -27,9 +27,9 @@ impl From<io::Error> for OpenWriteError {
}
}
/// Error that may occur when accessing a file (read, or delete)
/// Error that may occur when trying to read a file
#[derive(Debug)]
pub enum FileError {
pub enum OpenReadError {
/// The file does not exist.
FileDoesNotExist(PathBuf),
/// Any kind of IO error that happens when
@@ -37,8 +37,16 @@ pub enum FileError {
IOError(io::Error),
}
impl From<io::Error> for FileError {
fn from(err: io::Error) -> FileError {
FileError::IOError(err)
}
/// Error that may occur when trying to delete a file
#[derive(Debug)]
pub enum DeleteError {
/// The file does not exist.
FileDoesNotExist(PathBuf),
/// Any kind of IO error that happens when
/// interacting with the underlying IO device.
IOError(io::Error),
/// The file may not be deleted because it is
/// protected.
FileProtected(PathBuf),
}


@@ -0,0 +1,405 @@
use std::path::{Path, PathBuf};
use directory::error::{OpenReadError, DeleteError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use std::result;
use std::io;
use Directory;
use std::sync::{Arc, RwLock};
use std::collections::HashSet;
use std::io::Write;
use rustc_serialize::json;
use core::MANAGED_FILEPATH;
use std::collections::HashMap;
use std::fmt;
use Result;
use Error;
/// Wrapper of directories that keeps track of files created by Tantivy.
///
/// A managed directory is just a wrapper of a directory
/// that keeps a (persisted) list of the files that
/// have been created (and not deleted) by tantivy so far.
///
/// Thanks to this list, it implements a `garbage_collect` method
/// that removes the files that were created by tantivy and are not
/// useful anymore.
#[derive(Debug)]
pub struct ManagedDirectory {
directory: Box<Directory>,
meta_informations: Arc<RwLock<MetaInformation>>,
}
#[derive(Debug, Default)]
struct MetaInformation {
managed_paths: HashSet<PathBuf>,
protected_files: HashMap<PathBuf, usize>,
}
/// A `FileProtection` prevents the garbage collection of a file.
///
/// See `ManagedDirectory.protect_file_from_delete`.
pub struct FileProtection {
directory: ManagedDirectory,
path: PathBuf,
}
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
let mut meta_informations_wlock = directory.meta_informations
.write()
.expect("Managed file lock poisoned");
if let Some(counter_ref_mut) = meta_informations_wlock
.protected_files
.get_mut(path) {
(*counter_ref_mut) -= 1;
}
}
impl fmt::Debug for FileProtection {
fn fmt(&self, formatter: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
write!(formatter, "FileProtectionFor({:?})", self.path)
}
}
impl Drop for FileProtection {
fn drop(&mut self) {
unprotect_file_from_delete(&self.directory, &*self.path);
}
}
impl ManagedDirectory {
/// Wraps a directory as managed directory.
pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = json::decode(&managed_files_json)
.map_err(|e| Error::CorruptedFile(MANAGED_FILEPATH.clone(), Box::new(e)))?;
Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::new(RwLock::new(
MetaInformation {
managed_paths: managed_files,
protected_files: HashMap::default()
})),
})
}
Err(OpenReadError::FileDoesNotExist(_)) => {
Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::default(),
})
}
Err(OpenReadError::IOError(e)) => {
Err(From::from(e))
}
}
}
/// Garbage collect unused files.
///
/// Removes the files that were created by `tantivy` and are not
/// used by any segment anymore.
///
/// * `living_files` - List of files that are still used by the index.
///
/// This method neither panics nor returns errors.
/// If a file cannot be deleted (for permission reasons, for instance),
/// the error is simply logged, and the file remains in the list of managed
/// files.
pub fn garbage_collect(&mut self, living_files: HashSet<PathBuf>) {
let mut files_to_delete = vec!();
{ // releasing the lock as .delete() will use it too.
let meta_informations_rlock = self.meta_informations
.read()
.expect("Managed directory rlock poisoned in garbage collect.");
for managed_path in &meta_informations_rlock.managed_paths {
if !living_files.contains(managed_path) {
files_to_delete.push(managed_path.clone());
}
}
}
let mut deleted_files = vec!();
{
for file_to_delete in files_to_delete {
match self.delete(&file_to_delete) {
Ok(_) => {
info!("Deleted {:?}", file_to_delete);
deleted_files.push(file_to_delete);
}
Err(file_error) => {
match file_error {
DeleteError::FileDoesNotExist(_) => {
deleted_files.push(file_to_delete);
}
DeleteError::IOError(_) => {
if !cfg!(target_os = "windows") {
error!("Failed to delete {:?}", file_to_delete);
}
}
DeleteError::FileProtected(_) => {
// this is expected.
}
}
}
}
}
}
if !deleted_files.is_empty() {
// update the list of managed files by removing
// the files that were removed.
{
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed directory wlock poisoned (2).");
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
for delete_file in &deleted_files {
managed_paths_write.remove(delete_file);
}
}
if let Err(_) = self.save_managed_paths() {
error!("Failed to save the list of managed files.");
}
}
}
/// Protects a file from being garbage collected.
///
/// The method returns a `FileProtection` object.
/// The file will not be garbage collected as long as the
/// `FileProtection` object is kept alive.
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
let pathbuf = path.to_owned();
{
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned on protect");
*meta_informations_wlock
.protected_files
.entry(pathbuf.clone())
.or_insert(0) += 1;
}
FileProtection {
directory: self.clone(),
path: pathbuf.clone(),
}
}
/// Saves the file containing the list of existing files
/// that were created by tantivy.
fn save_managed_paths(&mut self,) -> io::Result<()> {
let managed_paths;
{
let meta_informations_rlock = self.meta_informations
.read()
.expect("Managed file lock poisoned");
managed_paths = meta_informations_rlock.managed_paths.clone();
}
let mut w = vec!();
try!(write!(&mut w, "{}\n", json::as_pretty_json(&managed_paths)));
self.directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
Ok(())
}
/// Registers a file as managed
///
/// This method must be called before the file is
/// actually created to ensure that a failure between
/// registering the filepath and creating the file
/// will not lead to garbage files that will
/// never get removed.
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
let has_changed = {
let mut meta_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned");
meta_wlock.managed_paths.insert(filepath.to_owned())
};
if has_changed {
self.save_managed_paths()?;
}
Ok(())
}
}
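// Editor's sketch (not part of the diff): the intended interplay between
// protection and garbage collection, mirroring `test_managed_directory_protect`
// below.
//
// let protection = managed_directory.protect_file_from_delete(path);
// managed_directory.garbage_collect(living_files.clone()); // `path` survives
// drop(protection); // the reference count drops back to 0...
// managed_directory.garbage_collect(living_files); // ...so `path` may now go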
impl Directory for ManagedDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
self.directory.open_read(path)
}
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
self.register_file_as_managed(path)?;
self.directory.open_write(path)
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
self.register_file_as_managed(path)?;
self.directory.atomic_write(path, data)
}
fn atomic_read(&self, path: &Path) -> result::Result<Vec<u8>, OpenReadError> {
self.directory.atomic_read(path)
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
{
let metas_rlock = self.meta_informations
.read()
.expect("poisoned lock in managed directory meta");
if let Some(counter) = metas_rlock.protected_files.get(path) {
if *counter > 0 {
return Err(DeleteError::FileProtected(path.to_owned()))
}
}
}
self.directory.delete(path)
}
fn exists(&self, path: &Path) -> bool {
self.directory.exists(path)
}
fn box_clone(&self) -> Box<Directory> {
box self.clone()
}
}
impl Clone for ManagedDirectory {
fn clone(&self) -> ManagedDirectory {
ManagedDirectory {
directory: self.directory.box_clone(),
meta_informations: self.meta_informations.clone(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use directory::MmapDirectory;
use std::path::Path;
use std::io::Write;
use tempdir::TempDir;
lazy_static! {
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
}
#[test]
fn test_managed_directory() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();
}
{
managed_directory.atomic_write(*TEST_PATH2, &vec!(0u8,1u8)).unwrap();
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> = [TEST_PATH1.to_owned()]
.into_iter()
.cloned()
.collect();
managed_directory.garbage_collect(living_files);
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
}
#[test]
fn test_managed_directory_gc_while_mmapped() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(living_files.clone());
if cfg!(target_os = "windows") {
// On Windows, gc should try and fail to delete the file, as it is mmapped.
assert!(managed_directory.exists(*TEST_PATH1));
// unmap should happen here.
drop(_mmap_read);
// The file should still be in the list of managed files and
// will eventually be deleted once the mmap is released.
managed_directory.garbage_collect(living_files);
assert!(!managed_directory.exists(*TEST_PATH1));
}
else {
assert!(!managed_directory.exists(*TEST_PATH1));
}
}
#[test]
fn test_managed_directory_protect() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
{
let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
managed_directory.garbage_collect(living_files.clone());
assert!(managed_directory.exists(*TEST_PATH1));
}
managed_directory.garbage_collect(living_files.clone());
assert!(!managed_directory.exists(*TEST_PATH1));
}
}


@@ -1,27 +1,158 @@
use std::path::{Path, PathBuf};
use tempdir::TempDir;
use std::collections::HashMap;
use std::collections::hash_map::Entry as HashMapEntry;
use fst::raw::MmapReadOnly;
use std::fs::File;
use atomicwrites;
use std::sync::RwLock;
use std::fmt;
use std::io::Write;
use std::io;
use std::io::{Seek, SeekFrom};
use directory::Directory;
use directory::ReadOnlySource;
use directory::WritePtr;
use std::io::BufWriter;
use std::fs::OpenOptions;
use directory::error::{OpenWriteError, FileError, OpenDirectoryError};
use std::result;
use common::make_io_err;
use std::sync::Arc;
use std::fs;
use directory::Directory;
use directory::error::{OpenWriteError, OpenReadError, DeleteError, OpenDirectoryError};
use directory::ReadOnlySource;
use directory::shared_vec_slice::SharedVecSlice;
use directory::WritePtr;
use fst::raw::MmapReadOnly;
use memmap::{Mmap, Protection};
use std::collections::hash_map::Entry as HashMapEntry;
use std::collections::HashMap;
use std::convert::From;
use std::fmt;
use std::fs::{self, File};
use std::fs::OpenOptions;
use std::io::{self, Seek, SeekFrom};
use std::io::{BufWriter, Read, Write};
use std::mem;
use std::path::{Path, PathBuf};
use std::result;
use std::sync::Arc;
use std::sync::RwLock;
use std::sync::Weak;
use tempdir::TempDir;
fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
let convert_file_error = |err: io::Error| {
if err.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.clone())
}
else {
OpenReadError::IOError(err)
}
};
let file = File::open(&full_path).map_err(convert_file_error)?;
let meta_data = file
.metadata()
.map_err(|e| OpenReadError::IOError(e))?;
if meta_data.len() == 0 {
// if the file size is 0, it will not be possible
// to mmap the file, so we return None and the caller
// falls back to an empty anonymous source instead.
return Ok(None)
}
match Mmap::open(&file, Protection::Read) {
Ok(mmap) => {
Ok(Some(Arc::new(mmap)))
}
Err(e) => {
Err(OpenReadError::IOError(e))
}
}
}
#[derive(Default,Clone,Debug,RustcDecodable,RustcEncodable)]
pub struct CacheCounters {
// Number of times the cache prevented a call to `mmap`
pub hit: usize,
// Number of times tantivy had to call `mmap`
// because no entry was in the cache.
pub miss_empty: usize,
// Number of times tantivy had to call `mmap`
// because the cached weak reference had expired.
pub miss_weak: usize,
}
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
pub struct CacheInfo {
pub counters: CacheCounters,
pub mmapped: Vec<PathBuf>,
}
struct MmapCache {
counters: CacheCounters,
cache: HashMap<PathBuf, Weak<Mmap>>,
purge_weak_limit: usize,
}
const STARTING_PURGE_WEAK_LIMIT: usize = 1_000;
impl Default for MmapCache {
fn default() -> MmapCache {
MmapCache {
counters: CacheCounters::default(),
cache: HashMap::new(),
purge_weak_limit: STARTING_PURGE_WEAK_LIMIT,
}
}
}
impl MmapCache {
fn cleanup(&mut self) {
let previous_cache_size = self.cache.len();
let mut new_cache = HashMap::new();
mem::swap(&mut new_cache, &mut self.cache);
self.cache = new_cache
.into_iter()
.filter(|&(_, ref weak_ref)| weak_ref.upgrade().is_some())
.collect();
if self.cache.len() == previous_cache_size {
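// Nothing was reclaimed: every entry is still alive. Double the
// threshold so that we do not rescan the whole map on every call.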
self.purge_weak_limit *= 2;
}
}
fn get_info(&mut self) -> CacheInfo {
self.cleanup();
let paths: Vec<PathBuf> = self.cache.keys()
.cloned()
.collect();
CacheInfo {
counters: self.counters.clone(),
mmapped: paths,
}
}
fn get_mmap(&mut self, full_path: PathBuf) -> Result<Option<Arc<Mmap>>, OpenReadError> {
// If we exceed this limit, we go through the weak
// references and remove those that are obsolete.
if self.cache.len() > self.purge_weak_limit {
self.cleanup();
}
Ok(match self.cache.entry(full_path.clone()) {
HashMapEntry::Occupied(mut occupied_entry) => {
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
self.counters.hit += 1;
Some(mmap_arc.clone())
}
else {
// The entry exists but the weak ref has been destroyed.
self.counters.miss_weak += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
occupied_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
}
else {
None
}
}
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss_empty += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
vacant_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
}
else {
None
}
}
})
}
}
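// Editor's note (not part of the diff): the cache holds `Weak<Mmap>` rather
// than `Arc<Mmap>`, so a cached entry never keeps a mapping alive by itself.
// A minimal illustration of the pattern with std types only:
//
// use std::sync::{Arc, Weak};
// let strong: Arc<Vec<u8>> = Arc::new(vec![1, 2, 3]);
// let weak: Weak<Vec<u8>> = Arc::downgrade(&strong);
// assert!(weak.upgrade().is_some()); // a reader still holds the mmap
// drop(strong);                      // last reader goes away -> munmap
// assert!(weak.upgrade().is_none()); // cleanup() will sweep this entry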
/// Directory storing data in files, read via mmap.
///
@@ -30,8 +161,9 @@ use directory::shared_vec_slice::SharedVecSlice;
#[derive(Clone)]
pub struct MmapDirectory {
root_path: PathBuf,
mmap_cache: Arc<RwLock<HashMap<PathBuf, MmapReadOnly>>>,
mmap_cache: Arc<RwLock<MmapCache>>,
_temp_directory: Arc<Option<TempDir>>,
}
impl fmt::Debug for MmapDirectory {
@@ -40,8 +172,6 @@ impl fmt::Debug for MmapDirectory {
}
}
impl MmapDirectory {
/// Creates a new MmapDirectory in a temporary directory.
@@ -53,13 +183,12 @@ impl MmapDirectory {
let tempdir_path = PathBuf::from(tempdir.path());
let directory = MmapDirectory {
root_path: PathBuf::from(tempdir_path),
mmap_cache: Arc::new(RwLock::new(HashMap::new())),
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(Some(tempdir))
};
Ok(directory)
}
/// Opens a MmapDirectory in a directory.
///
/// Returns an error if the `directory_path` does not
@@ -74,7 +203,7 @@ impl MmapDirectory {
else {
Ok(MmapDirectory {
root_path: PathBuf::from(directory_path),
mmap_cache: Arc::new(RwLock::new(HashMap::new())),
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(None)
})
}
@@ -89,11 +218,40 @@ impl MmapDirectory {
/// Sync the root directory.
/// On certain file systems, this is required
/// to persistently create a file.
fn sync_directory(&self,) -> Result<(), io::Error> {
let fd = try!(File::open(&self.root_path));
fn sync_directory(&self) -> Result<(), io::Error> {
let mut open_opts = OpenOptions::new();
// Linux needs read to be set, otherwise returns EINVAL
// write must not be set, or it fails with EISDIR
open_opts.read(true);
// On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
// and calling sync_all() only works if write access is requested.
#[cfg(windows)]
{
use std::os::windows::fs::OpenOptionsExt;
use winapi::winbase;
open_opts.write(true)
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
}
let fd = try!(open_opts.open(&self.root_path));
try!(fd.sync_all());
Ok(())
}
/// Returns some statistical information
/// about the Mmap cache.
///
/// The `MmapDirectory` embeds an `MmapCache`
/// to avoid multiplying the `mmap` system calls.
pub fn get_cache_info(&mut self) -> CacheInfo {
self.mmap_cache
.write()
.expect("Mmap cache lock is poisoned.")
.get_info()
}
}
@@ -128,47 +286,21 @@ impl Seek for SafeFileWriter {
impl Directory for MmapDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, FileError> {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = try!(
self.mmap_cache
.write()
.map_err(|_| {
make_io_err(format!("Failed to acquired write lock on mmap cache while reading {:?}", path))
})
);
let mmap = match mmap_cache.entry(full_path.clone()) {
HashMapEntry::Occupied(e) => {
e.get().clone()
}
HashMapEntry::Vacant(vacant_entry) => {
let file = try!(
File::open(&full_path).map_err(|err| {
if err.kind() == io::ErrorKind::NotFound {
FileError::FileDoesNotExist(full_path.clone())
}
else {
FileError::IOError(err)
}
})
);
if try!(file.metadata()).len() == 0 {
// if the file size is 0, it will not be possible
// to mmap the file, so we return an anonymous mmap_cache
// instead.
return Ok(ReadOnlySource::Anonymous(SharedVecSlice::empty()))
}
let new_mmap = try!(MmapReadOnly::open(&file));
vacant_entry.insert(new_mmap.clone());
new_mmap
}
};
Ok(ReadOnlySource::Mmap(mmap))
let mut mmap_cache = self.mmap_cache
.write()
.map_err(|_| OpenReadError::IOError(
make_io_err(format!("Failed to acquired write lock on mmap cache while reading {:?}", path))
))?;
Ok(mmap_cache.get_mmap(full_path)?
.map(MmapReadOnly::from)
.map(ReadOnlySource::Mmap)
.unwrap_or(ReadOnlySource::Anonymous(SharedVecSlice::empty()))
)
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
@@ -202,22 +334,32 @@ impl Directory for MmapDirectory {
Ok(BufWriter::new(Box::new(writer)))
}
fn delete(&self, path: &Path) -> result::Result<(), FileError> {
debug!("Delete {:?}", path);
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
debug!("Deleting file {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = try!(self.mmap_cache
    .write()
    .map_err(|_|
        DeleteError::IOError(make_io_err(format!("Failed to acquire write lock on mmap cache while deleting {:?}", path))))
);
// Removing the entry in the MMap cache.
// The munmap will happen on Drop,
// when the last reference is gone.
mmap_cache.remove(&full_path);
try!(fs::remove_file(&full_path));
try!(self.sync_directory());
Ok(())
mmap_cache.cache.remove(&full_path);
match fs::remove_file(&full_path) {
Ok(_) => {
self.sync_directory()
.map_err(|e| DeleteError::IOError(e))
}
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
Err(DeleteError::FileDoesNotExist(path.to_owned()))
}
else {
Err(DeleteError::IOError(e))
}
}
}
}
fn exists(&self, path: &Path) -> bool {
@@ -225,6 +367,27 @@ impl Directory for MmapDirectory {
full_path.exists()
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
let full_path = self.resolve_path(path);
let mut buffer = Vec::new();
match File::open(&full_path) {
Ok(mut file) => {
file.read_to_end(&mut buffer)
.map_err(|e| OpenReadError::IOError(e))?;
Ok(buffer)
}
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
}
else {
Err(OpenReadError::IOError(e))
}
}
}
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
debug!("Atomic Write {:?}", path);
let full_path = self.resolve_path(path);
@@ -238,5 +401,100 @@ impl Directory for MmapDirectory {
fn box_clone(&self,) -> Box<Directory> {
Box::new(self.clone())
}
}
#[cfg(test)]
mod tests {
// There are more tests in directory/mod.rs
// The following tests are specific to the MmapDirectory
use super::*;
#[test]
fn test_open_empty() {
// Empty files are actually an edge case because they
// cannot be mmapped.
//
// In that case the directory returns a SharedVecSlice.
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let path = PathBuf::from("test");
{
let mut w = mmap_directory.open_write(&path).unwrap();
w.flush().unwrap();
}
let readonlymap = mmap_directory.open_read(&path).unwrap();
assert_eq!(readonlymap.len(), 0);
}
#[test]
fn test_cache() {
let content = "abc".as_bytes();
// here we test if the cache releases
// mmaps correctly.
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let paths: Vec<PathBuf> = (0..10)
.map(|i| PathBuf::from(&*format!("file_{}", i)))
.collect();
{
for path in &paths {
let mut w = mmap_directory.open_write(path).unwrap();
w.write(content).unwrap();
w.flush().unwrap();
}
}
{
for path in &paths {
{
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 1);
}
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
}
}
assert_eq!(mmap_directory.get_cache_info().counters.miss_empty, 10);
{
// test weak miss
// the first pass creates the weak refs.
for path in &paths {
let _r = mmap_directory.open_read(path).unwrap();
}
// ... the second hits the weak refs.
for path in &paths {
let _r = mmap_directory.open_read(path).unwrap();
}
let cache_info = mmap_directory.get_cache_info();
assert_eq!(cache_info.counters.miss_empty, 20);
assert_eq!(cache_info.counters.miss_weak, 10);
}
{
let mut saved_readmmaps = vec!();
// Keeps reference alive
for (i, path) in paths.iter().enumerate() {
let r = mmap_directory.open_read(path).unwrap();
saved_readmmaps.push(r);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
}
let cache_info = mmap_directory.get_cache_info();
println!("{:?}", cache_info);
assert_eq!(cache_info.counters.miss_empty, 30);
assert_eq!(cache_info.counters.miss_weak, 10);
assert_eq!(cache_info.mmapped.len(), 10);
for saved_readmmap in saved_readmmaps {
assert_eq!(saved_readmmap.as_slice(), content);
}
}
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
}
}


@@ -3,6 +3,7 @@ mod ram_directory;
mod directory;
mod read_only_source;
mod shared_vec_slice;
mod managed_directory;
/// Errors specific to the directory module.
pub mod error;
@@ -14,6 +15,7 @@ pub use self::read_only_source::ReadOnlySource;
pub use self::directory::Directory;
pub use self::ram_directory::RAMDirectory;
pub use self::mmap_directory::MmapDirectory;
pub use self::managed_directory::{ManagedDirectory, FileProtection};
/// Synonym of Seek + Write
pub trait SeekableWrite: Seek + Write {}
@@ -58,31 +60,37 @@ mod tests {
fn test_simple(directory: &mut Directory) {
    {
        {
            let mut write_file = directory.open_write(*TEST_PATH).unwrap();
            assert!(directory.exists(*TEST_PATH));
            write_file.write_all(&[4]).unwrap();
            write_file.write_all(&[3]).unwrap();
            write_file.write_all(&[7,3,5]).unwrap();
            write_file.flush().unwrap();
        }
        let read_file = directory.open_read(*TEST_PATH).unwrap();
        let data: &[u8] = &*read_file;
        assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
    }
    assert!(directory.delete(*TEST_PATH).is_ok());
    assert!(!directory.exists(*TEST_PATH));
}
fn test_seek(directory: &mut Directory) {
    {
        {
            let mut write_file = directory.open_write(*TEST_PATH).unwrap();
            write_file.write_all(&[4, 3, 7,3,5]).unwrap();
            write_file.seek(SeekFrom::Start(0)).unwrap();
            write_file.write_all(&[3,1]).unwrap();
            write_file.flush().unwrap();
        }
        let read_file = directory.open_read(*TEST_PATH).unwrap();
        let data: &[u8] = &*read_file;
        assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
    }
    assert!(directory.delete(*TEST_PATH).is_ok());
}
@@ -111,19 +119,32 @@ mod tests {
}
}
fn test_directory_delete(directory: &mut Directory) {
    assert!(directory.open_read(*TEST_PATH).is_err());
    let mut write_file = directory.open_write(*TEST_PATH).unwrap();
    write_file.write_all(&[1, 2, 3, 4]).unwrap();
    write_file.flush().unwrap();
    {
        let read_handle = directory.open_read(*TEST_PATH).unwrap();
        {
            assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
            // Mapped files can't be deleted on Windows
            if !cfg!(windows) {
                assert!(directory.delete(*TEST_PATH).is_ok());
                assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
            }
            assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
        }
    }
    if cfg!(windows) {
        assert!(directory.delete(*TEST_PATH).is_ok());
    }
    assert!(directory.open_read(*TEST_PATH).is_err());
    assert!(directory.delete(*TEST_PATH).is_err());
}
fn test_directory(directory: &mut Directory) {
@@ -131,7 +152,7 @@ mod tests {
test_seek(directory);
test_rewrite_forbidden(directory);
test_write_create_the_file(directory);
test_delete(directory);
test_directory_delete(directory);
}
}


@@ -6,7 +6,7 @@ use std::result;
use std::sync::{Arc, RwLock};
use common::make_io_err;
use directory::{Directory, ReadOnlySource};
use directory::error::{OpenWriteError, FileError};
use directory::error::{OpenWriteError, OpenReadError, DeleteError};
use directory::WritePtr;
use super::shared_vec_slice::SharedVecSlice;
@@ -55,7 +55,7 @@ impl Seek for VecWriter {
impl Write for VecWriter {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.is_flushed = false;
try!(self.data.write(buf));
try!(self.data.write_all(buf));
Ok(buf.len())
}
@@ -87,29 +87,29 @@ impl InnerDirectory {
Ok(prev_value.is_some())
}
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, FileError> {
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
self.0
.read()
.map_err(|_| {
let io_err = make_io_err(format!("Failed to acquire read lock for the directory, when trying to read {:?}", path));
FileError::IOError(io_err)
OpenReadError::IOError(io_err)
})
.and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| FileError::FileDoesNotExist(PathBuf::from(path)))
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(|data| {
ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))
})
})
}
fn delete(&self, path: &Path) -> result::Result<(), FileError> {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
self.0
.write()
.map_err(|_| {
let io_err = make_io_err(format!("Failed to acquire write lock for the directory, when trying to delete {:?}", path));
FileError::IOError(io_err)
DeleteError::IOError(io_err)
})
.and_then(|mut writable_map| {
match writable_map.remove(path) {
@@ -117,7 +117,7 @@ impl InnerDirectory {
Ok(())
},
None => {
Err(FileError::FileDoesNotExist(PathBuf::from(path)))
Err(DeleteError::FileDoesNotExist(PathBuf::from(path)))
}
}
})
@@ -160,7 +160,7 @@ impl RAMDirectory {
}
impl Directory for RAMDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, FileError> {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
self.fs.open_read(path)
}
@@ -176,7 +176,7 @@ impl Directory for RAMDirectory {
}
}
fn delete(&self, path: &Path) -> result::Result<(), FileError> {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
self.fs.delete(path)
}
@@ -185,6 +185,12 @@ impl Directory for RAMDirectory {
self.fs.exists(path)
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
let read = self.open_read(path)?;
Ok(read.as_slice()
.to_owned())
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
let path_buf = PathBuf::from(path);
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());


@@ -3,17 +3,16 @@
/// Definition of Tantivy's error and result.
use std::io;
use std::result;
use std::path::PathBuf;
use std::error;
use std::sync::PoisonError;
use directory::error::{FileError, OpenWriteError, OpenDirectoryError};
use directory::error::{OpenReadError, OpenWriteError, OpenDirectoryError};
use query;
use schema;
/// Tantivy result.
pub type Result<T> = result::Result<T, Error>;
/// Generic tantivy error.
@@ -32,11 +31,14 @@ pub enum Error {
/// The data within is corrupted.
///
/// For instance, it contains invalid JSON.
CorruptedFile(PathBuf, Box<error::Error + Send>),
CorruptedFile(PathBuf, Box<error::Error + Send + Sync>),
/// Invalid argument was passed by the user.
InvalidArgument(String),
/// An Error happened in one of the thread
ErrorInThread(String), // TODO investigate better solution
ErrorInThread(String),
/// An error related to a missing field.
SchemaError(String),
}
impl From<io::Error> for Error {
@@ -57,11 +59,11 @@ impl<Guard> From<PoisonError<Guard>> for Error {
}
}
impl From<FileError> for Error {
fn from(error: FileError) -> Error {
impl From<OpenReadError> for Error {
fn from(error: OpenReadError) -> Error {
match error {
FileError::FileDoesNotExist(filepath) => Error::PathDoesNotExist(filepath),
FileError::IOError(io_error) => Error::IOError(io_error),
OpenReadError::FileDoesNotExist(filepath) => Error::PathDoesNotExist(filepath),
OpenReadError::IOError(io_error) => Error::IOError(io_error),
}
}
}

src/fastfield/delete.rs (new file, 127 lines)

@@ -0,0 +1,127 @@
use bit_set::BitSet;
use directory::WritePtr;
use std::io::Write;
use std::io;
use directory::ReadOnlySource;
use DocId;
use common::HasLen;
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> {
let max_doc = delete_bitset.capacity();
let mut byte = 0u8;
let mut shift = 0u8;
for doc in 0..max_doc {
if delete_bitset.contains(doc) {
byte |= 1 << shift;
}
if shift == 7 {
writer.write_all(&[byte])?;
shift = 0;
byte = 0;
}
else {
shift += 1;
}
}
if max_doc % 8 > 0 {
writer.write_all(&[byte])?;
}
writer.flush()
}
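// Editor's worked example (not part of the diff): with max_doc = 10 and
// deleted docs {1, 9}, the writer emits two bytes:
//   byte 0 covers docs 0..8  -> bit 1 set       -> 0b0000_0010
//   byte 1 covers docs 8..10 -> bit (9 - 8) set -> 0b0000_0010
// The trailing partial byte is written because 10 % 8 > 0.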
#[derive(Clone)]
pub struct DeleteBitSet {
data: ReadOnlySource,
len: usize,
}
impl DeleteBitSet {
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
let num_deleted: usize = data
.as_slice()
.iter()
.map(|b| b.count_ones() as usize)
.sum();
DeleteBitSet {
data: data,
len: num_deleted,
}
}
pub fn empty() -> DeleteBitSet {
DeleteBitSet {
data: ReadOnlySource::empty(),
len: 0,
}
}
pub fn has_deletes(&self) -> bool {
self.len() > 0
}
pub fn is_deleted(&self, doc: DocId) -> bool {
if self.len == 0 {
false
}
else {
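// The bit layout mirrors `write_delete_bitset` above: doc `d`
// lives in byte d / 8, at bit position d % 8 (LSB first).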
let byte_offset = doc / 8u32;
let b: u8 = (*self.data)[byte_offset as usize];
let shift = (doc & 7u32) as u8;
b & (1u8 << shift) != 0
}
}
}
impl HasLen for DeleteBitSet {
fn len(&self) -> usize {
self.len
}
}
#[cfg(test)]
mod tests {
use std::path::PathBuf;
use bit_set::BitSet;
use directory::*;
use super::*;
fn test_delete_bitset_helper(bitset: &BitSet) {
let test_path = PathBuf::from("test");
let mut directory = RAMDirectory::create();
{
let mut writer = directory.open_write(&*test_path).unwrap();
write_delete_bitset(bitset, &mut writer).unwrap();
}
{
let source = directory.open_read(&test_path).unwrap();
let delete_bitset = DeleteBitSet::open(source);
let n = bitset.capacity();
for doc in 0..n {
assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId));
}
assert_eq!(delete_bitset.len(), bitset.len());
}
}
#[test]
fn test_delete_bitset() {
{
let mut bitset = BitSet::with_capacity(10);
bitset.insert(1);
bitset.insert(9);
test_delete_bitset_helper(&bitset);
}
{
let mut bitset = BitSet::with_capacity(8);
bitset.insert(1);
bitset.insert(2);
bitset.insert(3);
bitset.insert(5);
bitset.insert(7);
test_delete_bitset_helper(&bitset);
}
}
}


@@ -13,6 +13,7 @@
mod reader;
mod writer;
mod serializer;
pub mod delete;
pub use self::writer::{U32FastFieldsWriter, U32FastFieldWriter};
pub use self::reader::{U32FastFieldsReader, U32FastFieldReader};
@@ -53,9 +54,6 @@ mod tests {
#[test]
pub fn test_fastfield() {
let test_fastfield = U32FastFieldReader::from(vec!(100,200,300));
println!("{}", test_fastfield.get(0));
println!("{}", test_fastfield.get(1));
println!("{}", test_fastfield.get(2));
assert_eq!(test_fastfield.get(0), 100);
assert_eq!(test_fastfield.get(1), 200);
assert_eq!(test_fastfield.get(2), 300);


@@ -32,7 +32,7 @@ pub struct U32FastFieldReader {
impl U32FastFieldReader {
pub fn empty() -> U32FastFieldReader {
U32FastFieldReader::open(U32_FAST_FIELD_EMPTY.clone()).expect("should always work.")
U32FastFieldReader::open(U32_FAST_FIELD_EMPTY.clone())
}
pub fn min_val(&self,) -> u32 {
@@ -43,14 +43,18 @@ impl U32FastFieldReader {
self.max_val
}
/// Opens a new fast field reader given a read only source.
///
/// # Panics
/// Panics if the data is corrupted.
pub fn open(data: ReadOnlySource) -> U32FastFieldReader {
    let min_val;
    let amplitude;
    let max_val;
    {
        let mut cursor = data.as_slice();
        min_val = u32::deserialize(&mut cursor).unwrap();
        amplitude = u32::deserialize(&mut cursor).unwrap();
        max_val = min_val + amplitude;
    }
    let num_bits = compute_num_bits(amplitude);
@@ -58,12 +62,12 @@ impl U32FastFieldReader {
        let data_arr = &(data.deref()[8..]);
        BitUnpacker::new(data_arr, num_bits as usize)
    };
    U32FastFieldReader {
        _data: data,
        bit_unpacker: bit_unpacker,
        min_val: min_val,
        max_val: max_val,
    }
}
pub fn get(&self, doc: DocId) -> u32 {
@@ -132,17 +136,20 @@ impl U32FastFieldsReader {
})
}
/// Returns the u32 fast value reader if the field
/// is a u32 field indexed as "fast".
///
/// Returns None if the field is not a u32 field
/// indexed with the fast option.
///
/// # Panics
/// May panic if the index is corrupted.
pub fn get_field(&self, field: Field) -> Option<U32FastFieldReader> {
    self.field_offsets
        .get(&field)
        .map(|&(start, stop)| {
            let field_source = self.source.slice(start as usize, stop as usize);
            U32FastFieldReader::open(field_source)
        })
}
}


@@ -98,7 +98,6 @@ impl U32FastFieldWriter {
}
},
None => {
// TODO make default value configurable
0u32
}
}

src/functional_test.rs (new file, 62 lines)

@@ -0,0 +1,62 @@
use std::collections::HashSet;
use rand::thread_rng;
use schema::*;
use Index;
use Searcher;
use rand::distributions::{IndependentSample, Range};
fn check_index_content(searcher: &Searcher, vals: &HashSet<u32>) {
assert!(searcher.segment_readers().len() < 20);
assert_eq!(searcher.num_docs() as usize, vals.len());
}
#[test]
#[ignore]
fn test_indexing() {
let mut schema_builder = SchemaBuilder::default();
let id_field = schema_builder.add_u32_field("id", U32_INDEXED);
let multiples_field = schema_builder.add_u32_field("multiples", U32_INDEXED);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap();
let universe = Range::new(0u32, 20u32);
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
let mut committed_docs: HashSet<u32> = HashSet::new();
let mut uncommitted_docs: HashSet<u32> = HashSet::new();
for _ in 0..200 {
let random_val = universe.ind_sample(&mut rng);
if random_val == 0 {
index_writer.commit().expect("Commit failed");
committed_docs.extend(&uncommitted_docs);
uncommitted_docs.clear();
index.load_searchers().unwrap();
let searcher = index.searcher();
// check that everything is correct.
check_index_content(&searcher, &committed_docs);
}
else {
if committed_docs.remove(&random_val) ||
uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u32(id_field, random_val);
index_writer.delete_term(doc_id_term);
}
else {
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
doc.add_u32(id_field, random_val);
for i in 1u32..10u32 {
doc.add_u32(multiples_field, random_val * i);
}
index_writer.add_document(doc);
}
}
}
}

src/indexer/delete_queue.rs (new file, 325 lines)

@@ -0,0 +1,325 @@
use super::operation::DeleteOperation;
use std::sync::{Arc, RwLock};
use std::mem;
use std::ops::DerefMut;
// The DeleteQueue is conceptually similar to a multiple
// consumer, single producer broadcast channel.
//
// All consumers will receive all messages.
//
// Consumers of the delete queue hold a `DeleteCursor`,
// which points to a specific place in the `DeleteQueue`.
//
// New consumers can be created in two ways
// - calling `delete_queue.cursor()` returns a cursor, that
//   will include all future delete operations (and no past operations).
// - cloning an existing cursor returns a new cursor, that
//   is at the exact same position, and can now advance independently
//   from the original cursor.
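//
// Editor's sketch (not part of the diff): the broadcast semantics described
// above, assuming some `DeleteOperation` value `delete_op` is at hand.
//
// let queue = DeleteQueue::new();
// let mut cursor_a = queue.cursor();   // will only see future operations
// queue.push(delete_op);               // broadcast to every cursor
// let mut cursor_b = cursor_a.clone(); // same position, advances independently
// assert!(cursor_a.get().is_some());   // cursor_a sees the operation...
// assert!(cursor_b.get().is_some());   // ...and so does its clone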
#[derive(Default)]
struct InnerDeleteQueue {
writer: Vec<DeleteOperation>,
last_block: Option<Arc<Block>>,
}
#[derive(Clone, Default)]
pub struct DeleteQueue {
inner: Arc<RwLock<InnerDeleteQueue>>,
}
impl DeleteQueue {
// Creates a new delete queue.
pub fn new() -> DeleteQueue {
let delete_queue = DeleteQueue {
inner: Arc::default(),
};
let next_block = NextBlock::from(delete_queue.clone());
{
let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
delete_queue_wlock.last_block = Some(
Arc::new(Block {
operations: Arc::default(),
next: next_block,
})
);
}
delete_queue
}
// Creates a new cursor that makes it possible to
// consume future delete operations.
//
// Past delete operations are not accessible.
pub fn cursor(&self) -> DeleteCursor {
let last_block = self.inner
.read()
.expect("Read lock poisoned when opening delete queue cursor")
.last_block
.clone()
.expect("Failed to unwrap last_block. This should never happen
as the Option<> is only here to make
initialization possible");
let operations_len = last_block.operations.len();
DeleteCursor {
block: last_block,
pos: operations_len,
}
}
// Appends a new delete operation.
pub fn push(&self, delete_operation: DeleteOperation) {
self.inner
.write()
.expect("Failed to acquire write lock on delete queue writer")
.writer
.push(delete_operation);
}
// The DeleteQueue is a linked list of blocks of
// delete operations.
//
// Writing happens by simply appending to a vec.
// `.flush()` takes the pending delete operations vec,
// creates a new read-only block from it,
// and appends it to the linked list.
//
// `.flush()` happens when, for instance,
// a consumer reaches the last read-only operations.
// It then asks the delete queue whether there happen to
// be some unflushed operations.
//
fn flush(&self) -> Option<Arc<Block>> {
let mut self_wlock = self
.inner
.write()
.expect("Failed to acquire write lock on delete queue writer");
let delete_operations;
{
let writer: &mut Vec<DeleteOperation> = &mut self_wlock.writer;
if writer.is_empty() {
return None;
}
delete_operations = mem::replace(writer, vec!());
}
let next_block = NextBlock::from(self.clone());
{
self_wlock.last_block = Some(
Arc::new(Block {
operations: Arc::new(delete_operations),
next: next_block,
})
);
}
self_wlock.last_block.clone()
}
}
enum InnerNextBlock {
Writer(DeleteQueue),
Closed(Arc<Block>),
}
struct NextBlock(RwLock<InnerNextBlock>);
impl From<DeleteQueue> for NextBlock {
fn from(delete_queue: DeleteQueue) -> NextBlock {
NextBlock(RwLock::new(InnerNextBlock::Writer(delete_queue)))
}
}
impl NextBlock {
fn next_block(&self) -> Option<Arc<Block>> {
{
let next_read_lock = self.0
.read()
.expect("Failed to acquire write lock in delete queue");
match *next_read_lock {
InnerNextBlock::Closed(ref block) => {
return Some(block.clone());
}
_ => {}
}
}
let next_block;
{
let mut next_write_lock = self.0
.write()
.expect("Failed to acquire write lock in delete queue");
match *next_write_lock {
InnerNextBlock::Closed(ref block) => {
return Some(block.clone());
}
InnerNextBlock::Writer(ref writer) => {
match writer.flush() {
Some(flushed_next_block) => {
next_block = flushed_next_block;
}
None => {
return None;
}
}
}
}
*next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone()); // TODO fix
return Some(next_block)
}
}
}
struct Block {
operations: Arc<Vec<DeleteOperation>>,
next: NextBlock,
}
#[derive(Clone)]
pub struct DeleteCursor {
block: Arc<Block>,
pos: usize,
}
impl DeleteCursor {
/// Skips operations and positions the cursor so that
/// - either all of the delete operations currently in the
/// queue are consumed and the next `.get()` will return `None`,
/// - or the next `.get()` will return the first operation with an
/// `opstamp >= target_opstamp`.
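/// For example (illustrative): with operations at opstamps `[1, 2, 5]`,
/// calling `skip_to(3)` positions the cursor so that the next `.get()`
/// returns the operation with opstamp `5`.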
pub fn skip_to(&mut self, target_opstamp: u64) {
// TODO Can be optimized as we work with blocks.
loop {
if let Some(operation) = self.get() {
if operation.opstamp >= target_opstamp {
break;
}
}
else {
break;
}
self.advance();
}
}
/// If the current block has been entirely
/// consumed, try to load the next one.
///
/// Return `true`, if after this attempt,
/// the cursor is on a block that has not
/// been entirely consumed.
/// Return `false`, if we have reached the end of the queue.
fn load_block_if_required(&mut self) -> bool {
if self.pos >= self.block.operations.len() {
// we have consumed our operations entirely.
// let's ask the writer if it has more for us.
match self.block.next.next_block() {
Some(block) => {
self.block = block;
self.pos = 0;
true
}
None => {
false
}
}
}
else {
true
}
}
/// Advance to the next delete operation.
/// Returns true iff there is such an operation.
pub fn advance(&mut self) -> bool {
if self.load_block_if_required() {
self.pos += 1;
true
}
else {
false
}
}
/// Get the current delete operation.
/// Calling `.get` does not advance the cursor.
pub fn get(&mut self) -> Option<&DeleteOperation> {
if self.load_block_if_required() {
Some(&self.block.operations[self.pos])
}
else {
None
}
}
}
#[cfg(test)]
mod tests {
use super::{DeleteQueue, DeleteOperation};
use schema::{Term, Field};
#[test]
fn test_deletequeue() {
let delete_queue = DeleteQueue::new();
let make_op = |i: usize| {
let field = Field(1u8);
DeleteOperation {
opstamp: i as u64,
term: Term::from_field_u32(field, i as u32)
}
};
delete_queue.push(make_op(1));
delete_queue.push(make_op(2));
let snapshot = delete_queue.cursor();
{
let mut operations_it = snapshot.clone();
assert_eq!(operations_it.get().unwrap().opstamp, 1);
operations_it.advance();
assert_eq!(operations_it.get().unwrap().opstamp, 2);
operations_it.advance();
assert!(operations_it.get().is_none());
operations_it.advance();
let mut snapshot2 = delete_queue.cursor();
assert!(snapshot2.get().is_none());
delete_queue.push(make_op(3));
assert_eq!(snapshot2.get().unwrap().opstamp, 3);
assert_eq!(operations_it.get().unwrap().opstamp, 3);
assert_eq!(operations_it.get().unwrap().opstamp, 3);
operations_it.advance();
assert!(operations_it.get().is_none());
operations_it.advance();
}
{
let mut operations_it = snapshot.clone();
assert_eq!(operations_it.get().unwrap().opstamp, 1);
operations_it.advance();
assert_eq!(operations_it.get().unwrap().opstamp, 2);
operations_it.advance();
assert_eq!(operations_it.get().unwrap().opstamp, 3);
operations_it.advance();
assert!(operations_it.get().is_none());
}
}
}

src/indexer/directory_lock.rs

@@ -1,8 +1,7 @@
use Directory;
use std::path::Path;
use directory::error::OpenWriteError;
use core::LOCKFILE_FILEPATH;
pub const LOCKFILE_NAME: &'static str = ".tantivy-indexer.lock";
/// The directory lock is a mechanism used to
@@ -16,16 +15,14 @@ pub struct DirectoryLock {
impl DirectoryLock {
pub fn lock(mut directory: Box<Directory>) -> Result<DirectoryLock, OpenWriteError> {
let lockfile_path = Path::new(LOCKFILE_NAME);
try!(directory.open_write(lockfile_path));
try!(directory.open_write(&*LOCKFILE_FILEPATH));
Ok(DirectoryLock { directory: directory })
}
}
impl Drop for DirectoryLock {
fn drop(&mut self) {
let lockfile_path = Path::new(LOCKFILE_NAME);
if let Err(e) = self.directory.delete(lockfile_path) {
if let Err(e) = self.directory.delete(&*LOCKFILE_FILEPATH) {
error!("Failed to remove the lock file. {:?}", e);
}
}

src/indexer/doc_opstamp_mapping.rs Normal file

@@ -0,0 +1,93 @@
use std::sync::Arc;
use DocId;
// The doc-to-opstamp mapping is used to identify which
// documents should be deleted.
//
// Since the docset matching the query of a delete operation
// is not computed right when the delete operation is received,
// we need a way to evaluate, for each document, whether it
// was added before or after the delete operation.
// This anteriority is evaluated by comparing the opstamp
// of the document to that of the delete operation.
//
// The doc-to-opstamp mapping is precisely an array,
// indexed by doc id, storing the opstamp of each document.
//
// This mapping is (for the moment) strictly increasing
// because of the way document ids are allocated.
#[derive(Clone)]
pub enum DocToOpstampMapping {
WithMap(Arc<Vec<u64>>),
None
}
impl From<Vec<u64>> for DocToOpstampMapping {
fn from(opstamps: Vec<u64>) -> DocToOpstampMapping {
DocToOpstampMapping::WithMap(Arc::new(opstamps))
}
}
impl DocToOpstampMapping {
/// Given a target opstamp, returns the limit doc id L
/// such that, for every doc id D,
/// D >= L iff opstamp(D) >= `target_opstamp`.
///
/// The edge case where `target_opstamp` equals some document's
/// opstamp never happens in practice.
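/// For example (an illustrative walk-through): if the doc opstamps are
/// `[1, 12, 17, 23]`, then `compute_doc_limit(13)` returns `2`, since
/// exactly the docs `2` and `3` carry opstamps `>= 13`.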
pub fn compute_doc_limit(&self, target_opstamp: u64) -> DocId {
match *self {
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
match doc_opstamps.binary_search(&target_opstamp) {
Ok(doc_id) => doc_id as DocId,
Err(doc_id) => doc_id as DocId,
}
}
DocToOpstampMapping::None => DocId::max_value(),
}
}
}
#[cfg(test)]
mod tests {
use super::DocToOpstampMapping;
#[test]
fn test_doc_to_opstamp_mapping_none() {
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1), u32::max_value());
}
#[test]
fn test_doc_to_opstamp_mapping_complex() {
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!());
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 0);
}
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!(1u64));
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 1);
}
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!(1u64, 12u64, 17u64, 23u64));
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
for i in 2u64..13u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 1);
}
for i in 13u64..18u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 2);
}
for i in 18u64..24u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 3);
}
for i in 24u64..30u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 4);
}
}
}
}

src/indexer/index_writer.rs

@@ -1,30 +1,38 @@
use schema::Schema;
use schema::Document;
use indexer::SegmentSerializer;
use core::SerializableSegment;
use bit_set::BitSet;
use chan;
use core::Index;
use core::Segment;
use std::thread::JoinHandle;
use indexer::{MergePolicy, DefaultMergePolicy};
use indexer::SegmentWriter;
use super::directory_lock::DirectoryLock;
use std::clone::Clone;
use std::io;
use std::thread;
use std::mem;
use indexer::merger::IndexMerger;
use core::SegmentComponent;
use core::SegmentId;
use datastruct::stacker::Heap;
use std::mem::swap;
use std::sync::{Arc, Mutex};
use chan;
use core::SegmentMeta;
use super::segment_updater::{SegmentUpdater, SegmentUpdate, SegmentUpdateSender};
use std::time::Duration;
use super::super::core::index::get_segment_manager;
use super::segment_manager::CommitState;
use Result;
use core::SegmentReader;
use indexer::stamper::Stamper;
use datastruct::stacker::Heap;
use directory::FileProtection;
use Error;
use Directory;
use fastfield::delete::write_delete_bitset;
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
use futures::Canceled;
use futures::Future;
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
use indexer::MergePolicy;
use indexer::operation::DeleteOperation;
use indexer::SegmentEntry;
use indexer::SegmentWriter;
use postings::DocSet;
use postings::SegmentPostingsOption;
use Result;
use schema::Document;
use schema::Schema;
use schema::Term;
use std::mem;
use std::mem::swap;
use std::thread::JoinHandle;
use super::directory_lock::DirectoryLock;
use super::operation::AddOperation;
use super::segment_updater::SegmentUpdater;
use std::thread;
// Size of the margin for the heap. A segment is closed when the remaining memory
// in the heap goes below MARGIN_IN_BYTES.
@@ -36,12 +44,8 @@ pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32;
// `add_document` will block if the number of docs waiting in the queue to be indexed reaches PIPELINE_MAX_SIZE_IN_DOCS
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
type DocumentSender = chan::Sender<Document>;
type DocumentReceiver = chan::Receiver<Document>;
type DocumentSender = chan::Sender<AddOperation>;
type DocumentReceiver = chan::Receiver<AddOperation>;
/// `IndexWriter` is the user entry-point to add documents to an index.
///
@@ -55,9 +59,8 @@ pub struct IndexWriter {
// lifetime of the lock with that of the IndexWriter.
_directory_lock: DirectoryLock,
_merge_policy: Arc<Mutex<Box<MergePolicy>>>,
index: Index,
heap_size_in_bytes_per_thread: usize,
workers_join_handle: Vec<JoinHandle<Result<()>>>,
@@ -65,15 +68,18 @@ pub struct IndexWriter {
document_receiver: DocumentReceiver,
document_sender: DocumentSender,
segment_update_sender: SegmentUpdateSender,
segment_update_thread: JoinHandle<()>,
segment_updater: SegmentUpdater,
worker_id: usize,
num_threads: usize,
uncommitted_docstamp: u64,
committed_docstamp: u64,
generation: usize,
delete_queue: DeleteQueue,
stamper: Stamper,
committed_opstamp: u64,
}
// IndexWriter cannot be sent to another thread.
@@ -81,15 +87,185 @@ impl !Send for IndexWriter {}
impl !Sync for IndexWriter {}
/// Open a new index writer. Attempts to acquire a lockfile.
///
/// The lockfile should be deleted on drop, but it is possible
/// that due to a panic or other error, a stale lockfile will be
/// left in the index directory. If you are sure that no other
/// `IndexWriter` on the system is accessing the index directory,
/// it is safe to manually delete the lockfile.
///
/// num_threads specifies the number of indexing workers that
/// should work at the same time.
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn open_index_writer(
index: &Index,
num_threads: usize,
heap_size_in_bytes_per_thread: usize) -> Result<IndexWriter> {
if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize {
panic!("The heap size per thread needs to be greater than {}.",
HEAP_SIZE_LIMIT);
}
let directory_lock = try!(DirectoryLock::lock(index.directory().box_clone()));
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new();
let stamper = Stamper::new(index.opstamp());
let segment_updater = SegmentUpdater::new(index.clone(),
stamper.clone(),
delete_queue.cursor())?;
let mut index_writer = IndexWriter {
_directory_lock: directory_lock,
heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread,
index: index.clone(),
document_receiver: document_receiver,
document_sender: document_sender,
segment_updater: segment_updater,
workers_join_handle: vec!(),
num_threads: num_threads,
delete_queue: delete_queue,
committed_opstamp: index.opstamp(),
stamper: stamper,
generation: 0,
worker_id: 0,
};
try!(index_writer.start_workers());
Ok(index_writer)
}
pub fn compute_deleted_bitset(
delete_bitset: &mut BitSet,
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: DocToOpstampMapping,
target_opstamp: u64) -> Result<bool> {
let mut might_have_changed = false;
loop {
if let Some(delete_op) = delete_cursor.get() {
if delete_op.opstamp > target_opstamp {
break;
}
else {
// A delete operation should only affect
// documents that were inserted before it.
//
// The limit doc identifies the first document
// that is not affected by the delete operation.
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) {
while docset.advance() {
let deleted_doc = docset.doc();
if deleted_doc < limit_doc {
delete_bitset.insert(deleted_doc as usize);
might_have_changed = true;
}
}
}
}
}
else {
break;
}
delete_cursor.advance();
}
Ok(might_have_changed)
}
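// An illustrative walk-through of the function above: with doc opstamps
// [0, 2, 4] and a delete operation stamped 3 on term T,
// `compute_doc_limit(3)` returns 2, so only docs 0 and 1 (added before
// the delete) can be inserted in the bitset, and only if they contain T.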
// TODO skip delete operations before the
// last delete opstamp
/// Advances deletes for the given segment up
/// to the target opstamp.
pub fn advance_deletes(
mut segment: Segment,
segment_entry: &mut SegmentEntry,
target_opstamp: u64) -> Result<Option<FileProtection>> {
let mut file_protect: Option<FileProtection> = None;
{
if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
// We are already up-to-date here.
if target_opstamp == previous_opstamp {
return Ok(file_protect);
}
}
let segment_reader = SegmentReader::open(segment.clone())?;
let max_doc = segment_reader.max_doc();
let mut delete_bitset: BitSet =
match segment_entry.delete_bitset() {
Some(ref previous_delete_bitset) =>
(*previous_delete_bitset).clone(),
None =>
BitSet::with_capacity(max_doc as usize)
};
let delete_cursor = segment_entry.delete_cursor();
compute_deleted_bitset(
&mut delete_bitset,
&segment_reader,
delete_cursor,
DocToOpstampMapping::None,
target_opstamp)?;
for doc in 0u32..max_doc {
if segment_reader.is_deleted(doc) {
delete_bitset.insert(doc as usize);
}
}
let num_deleted_docs = delete_bitset.len();
if num_deleted_docs > 0 {
segment.set_delete_meta(num_deleted_docs as u32, target_opstamp);
file_protect = Some(segment.protect_from_delete(SegmentComponent::DELETE));
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
write_delete_bitset(&delete_bitset, &mut delete_file)?;
}
}
segment_entry.set_meta(segment.meta().clone());
Ok(file_protect)
}
fn index_documents(heap: &mut Heap,
segment: Segment,
schema: &Schema,
document_iterator: &mut Iterator<Item = Document>,
segment_update_sender: &mut SegmentUpdateSender)
-> Result<()> {
generation: usize,
document_iterator: &mut Iterator<Item=AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor)
-> Result<bool> {
heap.clear();
let segment_id = segment.id();
let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment, &schema));
let mut segment_writer = SegmentWriter::for_segment(heap, segment.clone(), &schema)?;
for doc in document_iterator {
try!(segment_writer.add_document(&doc, &schema));
if segment_writer.is_buffer_full() {
@@ -99,31 +275,55 @@ fn index_documents(heap: &mut Heap,
}
}
let num_docs = segment_writer.max_doc();
let segment_meta = SegmentMeta {
segment_id: segment_id,
num_docs: num_docs,
};
// this is ensured by the call to peek before starting
// the worker thread.
assert!(num_docs > 0);
try!(segment_writer.finalize());
segment_update_sender.send(SegmentUpdate::AddSegment(segment_meta));
Ok(())
let doc_opstamps: Vec<u64> = segment_writer.finalize()?;
}
let mut segment_meta = SegmentMeta::new(segment_id);
segment_meta.set_max_doc(num_docs);
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
doc_to_opstamps,
last_docstamp,
)?;
let segment_entry = SegmentEntry::new(
segment_meta,
delete_cursor,
{ if may_have_deletes { Some(deleted_bitset) }
else { None } }
);
Ok(
segment_updater
.add_segment(generation, segment_entry)
)
}
impl IndexWriter {
/// Blocks until the merging threads have terminated.
pub fn wait_merging_threads(mut self) -> Result<()> {
self.segment_update_sender.send(SegmentUpdate::Terminate);
// this will stop the indexing thread,
// dropping the last reference to the segment_update_sender.
// dropping the last reference to the segment_updater.
drop(self.document_sender);
let mut v = Vec::new();
mem::swap(&mut v, &mut self.workers_join_handle);
for join_handle in v {
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec!());
for join_handle in former_workers_handles {
try!(join_handle.join()
.expect("Indexing Worker thread panicked")
.map_err(|e| {
@@ -131,122 +331,84 @@ impl IndexWriter {
}));
}
drop(self.workers_join_handle);
self.segment_update_thread
.join()
.map_err(|err| {
error!("Error in the merging thread {:?}", err);
Error::ErrorInThread(format!("{:?}", err))
})
let result = self.segment_updater
.wait_merging_thread()
.map_err(|_|
Error::ErrorInThread("Failed to join merging thread.".to_string())
);
if let &Err(ref e) = &result {
error!("Some merging thread failed {:?}", e);
}
result
}
/// Spawns a new worker thread for indexing.
/// The thread consumes documents from the pipeline.
///
fn add_indexing_worker(&mut self) -> Result<()> {
let index = self.index.clone();
let schema = self.index.schema();
let document_receiver_clone = self.document_receiver.clone();
let mut segment_update_sender = self.segment_update_sender.clone();
let mut segment_updater = self.segment_updater.clone();
let mut heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread);
let join_handle: JoinHandle<Result<()>> = try!(thread::Builder::new()
.name(format!("indexing_thread_{}", self.worker_id))
let generation = self.generation;
let mut delete_cursor = self.delete_queue.cursor();
let join_handle: JoinHandle<Result<()>> =
thread::Builder::new()
.name(format!("indexing thread {} for gen {}", self.worker_id, generation))
.spawn(move || {
loop {
let segment = index.new_segment();
let mut document_iterator = document_receiver_clone.clone()
.into_iter()
.peekable();
// the peeking here is to avoid
// creating a new segment's files
// if no documents are available.
if document_iterator.peek().is_some() {
try!(index_documents(&mut heap,
segment,
&schema,
&mut document_iterator,
&mut segment_update_sender));
} else {
//
// this is a valid guarantee as the
// peeked document now belongs to
// our local iterator.
if let Some(operation) = document_iterator.peek() {
delete_cursor.skip_to(operation.opstamp);
}
else {
// No more documents.
// Happens when there is a commit, or if the `IndexWriter`
// was dropped.
return Ok(());
return Ok(())
}
let segment = segment_updater.new_segment();
index_documents(&mut heap,
segment,
&schema,
generation,
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone())?;
}
}));
})?;
self.worker_id += 1;
self.workers_join_handle.push(join_handle);
Ok(())
}
/// Open a new index writer. Attempts to acquire a lockfile.
///
/// The lockfile should be deleted on drop, but it is possible
/// that due to a panic or other error, a stale lockfile will be
/// left in the index directory. If you are sure that no other
/// `IndexWriter` on the system is accessing the index directory,
/// it is safe to manually delete the lockfile.
///
/// num_threads specifies the number of indexing workers that
/// should work at the same time.
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn open(index: &Index,
num_threads: usize,
heap_size_in_bytes_per_thread: usize)
-> Result<IndexWriter> {
if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize {
panic!(format!("The heap size per thread needs to be at least {}.",
HEAP_SIZE_LIMIT));
}
let directory_lock = try!(DirectoryLock::lock(index.directory().box_clone()));
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
let merge_policy: Arc<Mutex<Box<MergePolicy>>> = Arc::new(Mutex::new(box DefaultMergePolicy::default()));
let (segment_update_sender, segment_update_thread) = SegmentUpdater::start_updater(index.clone(), merge_policy.clone());
let mut index_writer = IndexWriter {
_directory_lock: directory_lock,
_merge_policy: merge_policy,
heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread,
index: index.clone(),
document_receiver: document_receiver,
document_sender: document_sender,
segment_update_sender: segment_update_sender,
segment_update_thread: segment_update_thread,
workers_join_handle: Vec::new(),
num_threads: num_threads,
committed_docstamp: index.docstamp(),
uncommitted_docstamp: index.docstamp(),
worker_id: 0,
};
try!(index_writer.start_workers());
Ok(index_writer)
}
/// Returns a clone of the index_writer merge policy.
/// Accessor to the merge policy.
pub fn get_merge_policy(&self) -> Box<MergePolicy> {
self._merge_policy.lock().unwrap().box_clone()
self.segment_updater.get_merge_policy()
}
/// Set the merge policy.
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
*self._merge_policy.lock().unwrap() = merge_policy;
self.segment_updater.set_merge_policy(merge_policy);
}
fn start_workers(&mut self) -> Result<()> {
@@ -256,56 +418,15 @@ impl IndexWriter {
Ok(())
}
/// Detects and removes the files that
/// are not used by the index anymore.
pub fn garbage_collect_files(&mut self) -> Result<()> {
self.segment_updater.garbage_collect_files()
}
/// Merges a given list of segments
pub fn merge(&mut self, segments: &[Segment]) -> Result<()> {
if segments.len() < 2 {
// no segments or one segment? nothing to do.
return Ok(());
}
let segment_manager = get_segment_manager(&self.index);
{
// let's check that all these segments are in the same
// committed/uncommitted state.
let first_commit_state = segment_manager.is_committed(segments[0].id());
for segment in segments {
let commit_state = segment_manager.is_committed(segment.id());
if commit_state == CommitState::Missing {
return Err(Error::InvalidArgument(format!("Segment {:?} is not in the index",
segments[0].id())));
}
if commit_state != first_commit_state {
return Err(Error::InvalidArgument(String::from("You may not merge segments \
that are heterogenously in \
committed and uncommited.")));
}
}
}
let schema = self.index.schema();
// An IndexMerger is like a "view" of our merged segments.
let merger = try!(IndexMerger::open(schema, segments));
let mut merged_segment = self.index.new_segment();
// ... we just serialize this index merger in our new segment
// to merge the two segments.
let segment_serializer = try!(SegmentSerializer::for_segment(&mut merged_segment));
let num_docs = try!(merger.write(segment_serializer));
let merged_segment_ids: Vec<SegmentId> =
segments.iter().map(|segment| segment.id()).collect();
let segment_meta = SegmentMeta {
segment_id: merged_segment.id(),
num_docs: num_docs,
};
segment_manager.end_merge(&merged_segment_ids, &segment_meta);
try!(self.index.load_searchers());
Ok(())
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> impl Future<Item=SegmentMeta, Error=Canceled> {
self.segment_updater.start_merge(segment_ids)
}
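// An illustrative sketch of driving the returned future to completion
// (using the blocking `wait()` of futures 0.1, as the tests below do):
//
//   let segment_ids = index.searchable_segment_ids().unwrap();
//   let segment_meta = index_writer.merge(&segment_ids)
//       .wait()
//       .expect("merge failed");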
/// Closes the current document channel send.
@@ -331,55 +452,28 @@ impl IndexWriter {
/// After calling rollback, the index is in the same
/// state as it was after the last commit.
///
/// The docstamp at the last commit is returned.
pub fn rollback(&mut self) -> Result<u64> {
/// The opstamp at the last commit is returned.
pub fn rollback(mut self) -> Result<IndexWriter> {
info!("Rolling back to opstamp {}", self.committed_opstamp);
self.segment_update_sender.send(SegmentUpdate::CancelGeneration);
// we cannot drop segment ready receiver yet
// as it would block the workers.
let document_receiver = self.recreate_document_channel();
self.segment_updater.kill();
// Drains the document receiver pipeline:
// Workers don't need to index the pending documents.
for _ in document_receiver {}
let mut former_workers_join_handle = Vec::new();
swap(&mut former_workers_join_handle,
&mut self.workers_join_handle);
// wait for all the worker to finish their work
// (it should be fast since we consumed all pending documents)
for worker_handle in former_workers_join_handle {
// we stop one worker at a time ...
try!(try!(worker_handle.join()
.map_err(|e| Error::ErrorInThread(format!("{:?}", e)))));
// ... and recreate a new one right away
// to work on the next generation.
try!(self.add_indexing_worker());
}
// All of our indexing workers for the rollbacked generation have
// been terminated.
// Our document receiver pipe was drained.
// No new document have been added in the meanwhile because `IndexWriter`
// is not shared by different threads.
//
// We can now open a new generation and reaccept segments
// from now on.
self.segment_update_sender.send(SegmentUpdate::NewGeneration);
let rollbacked_segments = get_segment_manager(&self.index).rollback();
for segment_id in rollbacked_segments {
// TODO all delete must happen after saving
// meta.json
self.index.delete_segment(segment_id);
}
let receiver_clone = self.document_receiver.clone();
let index = self.index.clone();
let num_threads = self.num_threads;
let heap_size_in_bytes_per_thread = self.heap_size_in_bytes_per_thread;
drop(self);
for _ in receiver_clone {}
// reset the docstamp
self.uncommitted_docstamp = self.committed_docstamp;
Ok(self.committed_docstamp)
let index_writer = open_index_writer(
&index,
num_threads,
heap_size_in_bytes_per_thread)?;
Ok(index_writer)
}
@@ -394,29 +488,11 @@ impl IndexWriter {
/// long as the hard disk is spared), it will be possible
/// to resume indexing from this point.
///
/// Commit returns the `docstamp` of the last document
/// Commit returns the `opstamp` of the last document
/// that made it in the commit.
///
pub fn commit(&mut self) -> Result<u64> {
// this will drop the current document channel
// and recreate a new one.
self.recreate_document_channel();
// Docstamp of the last document in this commit.
self.committed_docstamp = self.uncommitted_docstamp;
let mut former_workers_join_handle = Vec::new();
swap(&mut former_workers_join_handle,
&mut self.workers_join_handle);
for worker_handle in former_workers_join_handle {
let indexing_worker_result = try!(worker_handle.join()
.map_err(|e| Error::ErrorInThread(format!("{:?}", e))));
try!(indexing_worker_result);
// add a new worker for the next generation.
try!(self.add_indexing_worker());
}
// here, because we join all of the worker threads,
// all of the segment updates for this commit have been
// sent.
@@ -427,44 +503,96 @@ impl IndexWriter {
// This will move uncommitted segments to the state of
// committed segments.
self.segment_update_sender.send(SegmentUpdate::Commit(self.committed_docstamp));
self.committed_opstamp = self.stamper.stamp();
info!("committing {}", self.committed_opstamp);
// wait for the segment update thread to have processed the info
let segment_manager = get_segment_manager(&self.index);
while segment_manager.docstamp() != self.committed_docstamp {
thread::sleep(Duration::from_millis(100));
// this will drop the current document channel
// and recreate a new one.
self.recreate_document_channel();
let mut former_workers_join_handle = Vec::new();
swap(&mut former_workers_join_handle,
&mut self.workers_join_handle);
for worker_handle in former_workers_join_handle {
let indexing_worker_result = try!(worker_handle.join()
.map_err(|e| Error::ErrorInThread(format!("{:?}", e))));
try!(indexing_worker_result);
// add a new worker for the next generation.
try!(self.add_indexing_worker());
}
Ok(self.committed_docstamp)
// wait for the segment update thread to have processed the info
self.segment_updater
.commit(self.committed_opstamp)?;
Ok(self.committed_opstamp)
}
/// Delete all documents containing a given term.
///
/// A delete operation only affects documents that
/// were added in previous commits, as well as documents
/// added earlier in the same commit.
///
/// Like adds, the deletion itself will be visible
/// only after calling `commit()`.
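/// An illustrative sketch (`doc_a`, `doc_b` and `my_term` are hypothetical):
///
///     index_writer.add_document(doc_a);   // opstamp 1: deleted if it matches
///     index_writer.delete_term(my_term);  // opstamp 2
///     index_writer.add_document(doc_b);   // opstamp 3: never affected
///     index_writer.commit();              // the deletion becomes visible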
pub fn delete_term(&mut self, term: Term) -> u64 {
let opstamp = self.stamper.stamp();
let delete_operation = DeleteOperation {
opstamp: opstamp,
term: term,
};
self.delete_queue.push(delete_operation);
opstamp
}
/// Returns the opstamp of the last successful commit.
///
/// This is, for instance, the opstamp the index will
/// roll back to if there is a failure like a power surge.
///
/// This is also the opstamp of the commit that is currently
/// available for searchers.
pub fn commit_opstamp(&self) -> u64 {
self.committed_opstamp
}
/// Adds a document.
///
/// If the indexing pipeline is full, this call may block.
///
/// The docstamp is an increasing `u64` that can
/// The opstamp is an increasing `u64` that can
/// be used by the client to align commits with its own
/// document queue.
///
/// Currently, add, delete, and commit operations all draw
/// their opstamp from the same increasing counter.
pub fn add_document(&mut self, doc: Document) -> io::Result<u64> {
self.document_sender.send(doc);
self.uncommitted_docstamp += 1;
Ok(self.uncommitted_docstamp)
pub fn add_document(&mut self, document: Document) -> u64 {
let opstamp = self.stamper.stamp();
let add_operation = AddOperation {
opstamp: opstamp,
document: document,
};
self.document_sender.send(add_operation);
opstamp
}
}
#[cfg(test)]
mod tests {
use indexer::NoMergePolicy;
use schema::{self, Document};
use Index;
use Term;
use Error;
use indexer::NoMergePolicy;
use env_logger;
#[test]
fn test_lockfile_stops_duplicates() {
@@ -506,7 +634,6 @@ mod tests {
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let num_docs_containing = |s: &str| {
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, s);
@@ -519,32 +646,38 @@ mod tests {
{
let mut doc = Document::default();
doc.add_text(text_field, "a");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
assert_eq!(index_writer.rollback().unwrap(), 0u64);
index_writer = index_writer.rollback().unwrap();
assert_eq!(index_writer.commit_opstamp(), 0u64);
assert_eq!(num_docs_containing("a"), 0);
{
let mut doc = Document::default();
doc.add_text(text_field, "b");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let mut doc = Document::default();
doc.add_text(text_field, "c");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
assert_eq!(index_writer.commit().unwrap(), 2u64);
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 1);
assert_eq!(num_docs_containing("c"), 1);
}
index.load_searchers().unwrap();
index.searcher();
}
#[test]
fn test_with_merges() {
let _ = env_logger::init();
let mut schema_builder = schema::SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
@@ -556,23 +689,26 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
// create 10 segments with 100 tiny docs
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
let mut doc = Document::default();
doc.add_text(text_field, "a");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
index_writer.commit().expect("commit failed");
for _doc in 0..100 {
let mut doc = Document::default();
doc.add_text(text_field, "a");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
// this should create 8 segments and trigger a merge.
index_writer.commit().expect("commit failed");
index_writer.wait_merging_threads().expect("waiting merging thread failed");
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 200);
assert_eq!(index.searchable_segments().len(), 1);
assert!(index.searchable_segments().unwrap().len() < 8);
}
}

src/indexer/log_merge_policy.rs

@@ -48,13 +48,12 @@ impl LogMergePolicy {
impl MergePolicy for LogMergePolicy {
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
if segments.is_empty() {
return Vec::new();
}
let mut size_sorted_tuples = segments.iter()
.map(|x| x.num_docs)
.map(|x| x.num_docs())
.enumerate()
.collect::<Vec<(usize, u32)>>();
@@ -75,16 +74,15 @@ impl MergePolicy for LogMergePolicy {
levels.last_mut().unwrap().push(ind);
}
let result = levels.iter()
levels
.iter()
.filter(|level| level.len() >= self.min_merge_size)
.map(|ind_vec| {
MergeCandidate(ind_vec.iter()
.map(|&ind| segments[ind].segment_id)
.map(|&ind| segments[ind].id())
.collect())
})
.collect();
result
.collect()
}
fn box_clone(&self) -> Box<MergePolicy> {
@@ -122,11 +120,17 @@ mod tests {
assert!(result_list.is_empty());
}
fn seg_meta(num_docs: u32) -> SegmentMeta {
let mut segment_metas = SegmentMeta::new(SegmentId::generate_random());
segment_metas.set_max_doc(num_docs);
segment_metas
}
#[test]
fn test_log_merge_policy_pair() {
let test_input = vec![SegmentMeta::new(SegmentId::generate_random(), 10),
SegmentMeta::new(SegmentId::generate_random(), 10),
SegmentMeta::new(SegmentId::generate_random(), 10)];
let test_input = vec![seg_meta(10),
seg_meta(10),
seg_meta(10)];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 1);
}
@@ -134,12 +138,12 @@ mod tests {
#[test]
fn test_log_merge_policy_levels() {
// multiple levels all get merged correctly
let test_input = vec![SegmentMeta::new(SegmentId::generate_random(), 10),
SegmentMeta::new(SegmentId::generate_random(), 10),
SegmentMeta::new(SegmentId::generate_random(), 10),
SegmentMeta::new(SegmentId::generate_random(), 1000),
SegmentMeta::new(SegmentId::generate_random(), 1000),
SegmentMeta::new(SegmentId::generate_random(), 1000)];
let test_input = vec![seg_meta(10),
seg_meta(10),
seg_meta(10),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000)];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
}
@@ -147,24 +151,24 @@ mod tests {
#[test]
fn test_log_merge_policy_within_levels() {
// multiple levels all get merged correctly
let test_input = vec![SegmentMeta::new(SegmentId::generate_random(), 10),
SegmentMeta::new(SegmentId::generate_random(), 11),
SegmentMeta::new(SegmentId::generate_random(), 12),
SegmentMeta::new(SegmentId::generate_random(), 1000),
SegmentMeta::new(SegmentId::generate_random(), 1000),
SegmentMeta::new(SegmentId::generate_random(), 1000)];
let test_input = vec![seg_meta(10),
seg_meta(11),
seg_meta(12),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000)];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
}
#[test]
fn test_log_merge_policy_small_segments() {
// multiple levels all get merged correctly
let test_input = vec![SegmentMeta::new(SegmentId::generate_random(), 1),
SegmentMeta::new(SegmentId::generate_random(), 1),
SegmentMeta::new(SegmentId::generate_random(), 1),
SegmentMeta::new(SegmentId::generate_random(), 2),
SegmentMeta::new(SegmentId::generate_random(), 2),
SegmentMeta::new(SegmentId::generate_random(), 2)];
let test_input = vec![seg_meta(1),
seg_meta(1),
seg_meta(1),
seg_meta(2),
seg_meta(2),
seg_meta(2)];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 1);
}

src/indexer/merge_policy.rs

@@ -13,7 +13,7 @@ pub struct MergeCandidate(pub Vec<SegmentId>);
///
/// Every time the list of segments changes, the segment updater
/// asks the merge policy if some segments should be merged.
pub trait MergePolicy: marker::Send + Debug {
pub trait MergePolicy: marker::Send + marker::Sync + Debug {
/// Given the list of segment metas, returns the list of merge candidates.
///
/// This call happens on the segment updater thread, and will block
@@ -43,3 +43,38 @@ impl MergePolicy for NoMergePolicy {
}
}
#[cfg(test)]
pub mod tests {
use super::*;
use core::SegmentId;
use core::SegmentMeta;
/// Merge policy useful for test purposes.
///
/// Every time there is more than one segment,
/// it will suggest merging them.
#[derive(Debug)]
pub struct MergeWheneverPossible;
impl MergePolicy for MergeWheneverPossible {
fn compute_merge_candidates(&self, segment_metas: &[SegmentMeta]) -> Vec<MergeCandidate> {
let segment_ids = segment_metas
.iter()
.map(|segment_meta| segment_meta.id())
.collect::<Vec<SegmentId>>();
if segment_ids.len() > 1 {
vec!(MergeCandidate(segment_ids))
}
else {
vec!()
}
}
fn box_clone(&self) -> Box<MergePolicy> {
box MergeWheneverPossible
}
}
}

src/indexer/merger.rs

@@ -1,151 +1,42 @@
use Result;
use {Error, Result};
use core::SegmentReader;
use core::Segment;
use DocId;
use core::SerializableSegment;
use schema::FieldValue;
use indexer::SegmentSerializer;
use postings::PostingsSerializer;
use postings::TermInfo;
use fastfield::U32FastFieldReader;
use itertools::Itertools;
use postings::Postings;
use postings::DocSet;
use std::collections::BinaryHeap;
use datastruct::FstKeyIter;
use schema::{Term, Schema, Field};
use core::TermIterator;
use fastfield::delete::DeleteBitSet;
use schema::{Schema, Field};
use fastfield::FastFieldSerializer;
use store::StoreWriter;
use postings::ChainedPostings;
use postings::HasLen;
use postings::OffsetPostings;
use core::SegmentInfo;
use std::cmp::{min, max, Ordering};
use std::iter;
struct PostingsMerger<'a> {
doc_offsets: Vec<DocId>,
heap: BinaryHeap<HeapItem>,
term_streams: Vec<FstKeyIter<'a, TermInfo>>,
readers: &'a [SegmentReader],
}
#[derive(PartialEq, Eq, Debug)]
struct HeapItem {
term: Term,
segment_ord: usize,
}
impl PartialOrd for HeapItem {
fn partial_cmp(&self, other: &HeapItem) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for HeapItem {
fn cmp(&self, other: &HeapItem) -> Ordering {
(&other.term, &other.segment_ord).cmp(&(&self.term, &self.segment_ord))
}
}
impl<'a> PostingsMerger<'a> {
fn new(readers: &'a [SegmentReader]) -> PostingsMerger<'a> {
let mut doc_offsets: Vec<DocId> = Vec::new();
let mut max_doc = 0;
for reader in readers {
doc_offsets.push(max_doc);
max_doc += reader.max_doc();
};
let term_streams = readers
.iter()
.map(|reader| reader.term_infos().keys())
.collect();
let mut postings_merger = PostingsMerger {
heap: BinaryHeap::new(),
term_streams: term_streams,
doc_offsets: doc_offsets,
readers: readers,
};
for segment_ord in 0..readers.len() {
postings_merger.push_next_segment_el(segment_ord);
}
postings_merger
}
// pushes the term_reader associated with the given segment ordinal
// into the heap.
fn push_next_segment_el(&mut self, segment_ord: usize) {
if let Some(term) = self.term_streams[segment_ord].next() {
let it = HeapItem {
term: Term::from(term),
segment_ord: segment_ord,
};
self.heap.push(it);
}
}
fn append_segment(&mut self,
heap_item: &HeapItem,
segment_postings_list: &mut Vec<OffsetPostings<'a>>) {
{
let offset = self.doc_offsets[heap_item.segment_ord];
let reader = &self.readers[heap_item.segment_ord];
if let Some(segment_postings) = reader.read_postings_all_info(&heap_item.term) {
let offset_postings = OffsetPostings::new(segment_postings, offset);
segment_postings_list.push(offset_postings);
}
}
self.push_next_segment_el(heap_item.segment_ord);
}
}
impl<'a> Iterator for PostingsMerger<'a> {
type Item = (Term, ChainedPostings<'a>);
fn next(&mut self,) -> Option<(Term, ChainedPostings<'a>)> {
// TODO remove the Vec<u8> allocations
match self.heap.pop() {
Some(heap_it) => {
let mut segment_postings_list = Vec::new();
self.append_segment(&heap_it, &mut segment_postings_list);
loop {
match self.heap.peek() {
Some(&ref next_heap_it) if next_heap_it.term == heap_it.term => {},
_ => { break; }
}
let next_heap_it = self.heap.pop().expect("This is only reached if an element was peeked beforehand.");
self.append_segment(&next_heap_it, &mut segment_postings_list);
}
let chained_posting = ChainedPostings::from(segment_postings_list);
Some((heap_it.term, chained_posting))
},
None => None
}
}
}
use std::cmp::{min, max};
use common::allocate_vec;
pub struct IndexMerger {
schema: Schema,
readers: Vec<SegmentReader>,
segment_info: SegmentInfo,
max_doc: u32,
}
struct DeltaPositionComputer {
buffer: Vec<u32>
buffer: Vec<u32>,
}
impl DeltaPositionComputer {
fn new() -> DeltaPositionComputer {
DeltaPositionComputer {
buffer: iter::repeat(0u32).take(512).collect::<Vec<u32>>(),
DeltaPositionComputer {
buffer: allocate_vec(512)
}
}
fn compute_delta_positions(&mut self, positions: &[u32],) -> &[u32] {
fn compute_delta_positions(&mut self, positions: &[u32]) -> &[u32] {
if positions.len() > self.buffer.len() {
self.buffer.resize(positions.len(), 0u32);
}
@@ -159,91 +50,211 @@ impl DeltaPositionComputer {
}
fn compute_min_max_val(u32_reader: &U32FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet) -> Option<(u32, u32)> {
if max_doc == 0 {
None
}
else if !delete_bitset.has_deletes() {
// no deleted documents,
// we can use the previous min_val, max_val.
Some((u32_reader.min_val(), u32_reader.max_val()))
}
else {
// some deleted documents,
// we need to recompute the max / min
(0..max_doc)
.filter(|doc_id| !delete_bitset.is_deleted(*doc_id))
.map(|doc_id| u32_reader.get(doc_id))
.minmax()
.into_option()
}
}
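// Illustrative example: for stored values [3, 9, 5] with doc 1 deleted,
// the recomputation above yields (3, 5) rather than the stored (3, 9).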
fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Option<U32FastFieldReader> {
segment_reader.get_fieldnorms_reader(field)
}
fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Option<U32FastFieldReader> {
segment_reader.get_fast_field_reader(field)
}
impl IndexMerger {
pub fn open(schema: Schema, segments: &[Segment]) -> Result<IndexMerger> {
let mut readers = Vec::new();
let mut max_doc = 0;
let mut readers = vec!();
let mut max_doc: u32 = 0u32;
for segment in segments {
let reader = try!(SegmentReader::open(segment.clone()));
max_doc += reader.max_doc();
readers.push(reader);
if segment.meta().num_docs() > 0 {
let reader = SegmentReader::open(segment.clone())?;
max_doc += reader.num_docs();
readers.push(reader);
}
}
Ok(IndexMerger {
schema: schema,
readers: readers,
segment_info: SegmentInfo {
max_doc: max_doc
},
max_doc: max_doc,
})
}
fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
// TODO make sure that works even if the field is never here.
for field in self.schema.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u8)) {
let mut u32_readers = Vec::new();
let mut min_val = u32::min_value();
let mut max_val = 0;
for reader in &self.readers {
let u32_reader = try!(reader.get_fieldnorms_reader(field));
min_val = min(min_val, u32_reader.min_val());
max_val = max(max_val, u32_reader.max_val());
u32_readers.push((reader.max_doc(), u32_reader));
}
try!(fast_field_serializer.new_u32_fast_field(field, min_val, max_val));
for (max_doc, u32_reader) in u32_readers {
for doc_id in 0..max_doc {
let val = u32_reader.get(doc_id);
try!(fast_field_serializer.add_val(val));
}
}
try!(fast_field_serializer.close_field());
}
Ok(())
fn write_fieldnorms(&self,
fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
let fieldnorm_fastfields: Vec<Field> = self.schema
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u8))
.collect();
self.generic_write_fast_field(fieldnorm_fastfields, &extract_fieldnorm_reader, fast_field_serializer)
}
fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
for field in self.schema.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_u32_fast())
.map(|(field_id, _)| Field(field_id as u8)) {
let mut u32_readers = Vec::new();
let mut min_val = u32::min_value();
let mut max_val = 0;
let fast_fields: Vec<Field> = self.schema
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_u32_fast())
.map(|(field_id, _)| Field(field_id as u8))
.collect();
self.generic_write_fast_field(fast_fields, &extract_fast_field_reader, fast_field_serializer)
}
// used both to merge field norms and regular u32 fast fields.
fn generic_write_fast_field(&self,
fields: Vec<Field>,
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U32FastFieldReader>,
fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
for field in fields {
let mut u32_readers = vec!();
let mut min_val = u32::max_value();
let mut max_val = u32::min_value();
for reader in &self.readers {
let u32_reader = try!(reader.get_fast_field_reader(field));
min_val = min(min_val, u32_reader.min_val());
max_val = max(max_val, u32_reader.max_val());
u32_readers.push((reader.max_doc(), u32_reader));
match field_reader_extractor(reader, field) {
Some(u32_reader) => {
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u32_reader, reader.max_doc(), reader.delete_bitset()) {
// the segment has some non-deleted documents
min_val = min(min_val, seg_min_val);
max_val = max(max_val, seg_max_val);
u32_readers.push((reader.max_doc(), u32_reader, reader.delete_bitset()));
}
}
None => {
let error_msg = format!("Failed to find a u32_reader for field {:?}", field);
error!("{}", error_msg);
return Err(Error::SchemaError(error_msg))
}
}
}
if u32_readers.is_empty() {
// we have actually zero documents.
min_val = 0;
max_val = 0;
}
assert!(min_val <= max_val);
try!(fast_field_serializer.new_u32_fast_field(field, min_val, max_val));
for (max_doc, u32_reader) in u32_readers {
for (max_doc, u32_reader, delete_bitset) in u32_readers {
for doc_id in 0..max_doc {
let val = u32_reader.get(doc_id);
try!(fast_field_serializer.add_val(val));
if !delete_bitset.is_deleted(doc_id) {
let val = u32_reader.get(doc_id);
try!(fast_field_serializer.add_val(val));
}
}
}
try!(fast_field_serializer.close_field());
}
Ok(())
}
fn write_postings(&self, postings_serializer: &mut PostingsSerializer) -> Result<()> {
let postings_merger = PostingsMerger::new(&self.readers);
fn write_postings(
&self,
postings_serializer: &mut PostingsSerializer) -> Result<()> {
let mut merged_terms = TermIterator::from(&self.readers[..]);
let mut delta_position_computer = DeltaPositionComputer::new();
for (term, mut merged_doc_ids) in postings_merger {
try!(postings_serializer.new_term(&term, merged_doc_ids.len() as DocId));
while merged_doc_ids.advance() {
let delta_positions: &[u32] = delta_position_computer.compute_delta_positions(merged_doc_ids.positions());
try!(postings_serializer.write_doc(merged_doc_ids.doc(), merged_doc_ids.term_freq(), delta_positions));
let mut max_doc = 0;
// map from segment doc ids to the resulting merged segment doc id.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = Vec::with_capacity(self.readers.len());
for reader in &self.readers {
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
for doc_id in 0..reader.max_doc() {
if reader.is_deleted(doc_id) {
segment_local_map.push(None);
}
else {
segment_local_map.push(Some(max_doc));
max_doc += 1u32;
}
}
try!(postings_serializer.close_term());
merged_doc_id_map.push(segment_local_map);
}
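// Illustrative example: with two segments of max_doc 3 and 2, where doc 1
// of segment 0 is deleted, the maps computed above are
//   segment 0: [Some(0), None, Some(1)]
//   segment 1: [Some(2), Some(3)]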
while merged_terms.advance() {
// Create the total list of doc ids
// by stacking the doc ids from the different segments.
//
// In the new segment, the doc ids from the different
// segments are stacked so that:
// - Segment 0's doc ids become doc ids [0, seg0.max_doc)
// - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg1.max_doc)
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, seg0.max_doc + seg1.max_doc + seg2.max_doc)
// ...
let term = merged_terms.term();
let mut term_written = false;
let segment_postings = merged_terms
.segment_ords()
.iter()
.cloned()
.flat_map(|segment_ord| {
self.readers[segment_ord]
.read_postings_all_info(&term)
.map(|segment_postings| (segment_ord, segment_postings))
})
.collect::<Vec<_>>();
// We can remove the term if all documents which
// contained it have been deleted.
if segment_postings.len() > 0 {
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
while segment_postings.advance() {
if let Some(remapped_doc_id) = old_to_new_doc_id[segment_postings.doc() as usize] {
if !term_written {
// we make sure to only write the term if
// there is at least one document containing it.
postings_serializer.new_term(&term)?;
term_written = true;
}
let delta_positions: &[u32] =
delta_position_computer.compute_delta_positions(segment_postings.positions());
try!(postings_serializer.write_doc(
remapped_doc_id,
segment_postings.term_freq(),
delta_positions));
}
}
}
if term_written {
try!(postings_serializer.close_term());
}
}
}
Ok(())
}
@@ -251,7 +262,15 @@ impl IndexMerger {
fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> {
for reader in &self.readers {
let store_reader = reader.get_store_reader();
try!(store_writer.stack_reader(store_reader));
for doc_id in 0..reader.max_doc() {
if !reader.is_deleted(doc_id) {
let doc = try!(store_reader.get(doc_id));
let field_values: Vec<&FieldValue> = doc.field_values()
.iter()
.collect();
try!(store_writer.store(&field_values));
}
}
}
Ok(())
}
@@ -263,9 +282,8 @@ impl SerializableSegment for IndexMerger {
try!(self.write_fieldnorms(serializer.get_fieldnorms_serializer()));
try!(self.write_fast_fields(serializer.get_fast_field_serializer()));
try!(self.write_storable_fields(serializer.get_store_writer()));
try!(serializer.write_segment_info(&self.segment_info));
try!(serializer.close());
Ok(self.segment_info.max_doc)
Ok(self.max_doc)
}
}
@@ -274,17 +292,24 @@ mod tests {
use schema;
use schema::Document;
use schema::Term;
use query::TermQuery;
use schema::{Field, FieldValue};
use core::Index;
use Searcher;
use DocAddress;
use collector::tests::FastFieldTestCollector;
use collector::tests::TestCollector;
use query::BooleanQuery;
use postings::SegmentPostingsOption;
use schema::TextIndexingOptions;
use futures::Future;
#[test]
fn test_index_merger() {
fn test_index_merger_no_deletes() {
let mut schema_builder = schema::SchemaBuilder::default();
let text_fieldtype = schema::TextOptions::default().set_indexing_options(TextIndexingOptions::TokenizedWithFreq).set_stored();
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::U32Options::default().set_fast();
let score_field = schema_builder.add_u32_field("score", score_fieldtype);
@@ -298,21 +323,21 @@ mod tests {
let mut doc = Document::default();
doc.add_text(text_field, "af b");
doc.add_u32(score_field, 3);
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let mut doc = Document::default();
doc.add_text(text_field, "a b c");
doc.add_u32(score_field, 5);
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let mut doc = Document::default();
doc.add_text(text_field, "a b c d");
doc.add_u32(score_field, 7);
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
index_writer.commit().expect("committed");
}
{
@@ -321,23 +346,27 @@ mod tests {
let mut doc = Document::default();
doc.add_text(text_field, "af b");
doc.add_u32(score_field, 11);
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let mut doc = Document::default();
doc.add_text(text_field, "a b c g");
doc.add_u32(score_field, 13);
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
index_writer.commit().expect("Commit failed");
}
}
{
let segments = index.searchable_segments();
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.merge(&segments).unwrap();
index_writer.merge(&segment_ids)
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
}
{
index.load_searchers().unwrap();
let searcher = index.searcher();
let get_doc_ids = |terms: Vec<Term>| {
let mut collector = TestCollector::default();
@@ -346,22 +375,14 @@ mod tests {
collector.docs()
};
{
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(text_field, "a"))),
vec!(1, 2, 4,)
);
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(text_field, "af"))),
vec!(0, 3,)
);
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(text_field, "g"))),
vec!(4,)
);
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(text_field, "b"))),
vec!(0, 1, 2, 3, 4,)
);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec!(1, 2, 4,));
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec!(0, 3,));
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
vec!(4,));
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec!(0, 1, 2, 3, 4,));
}
{
let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
@@ -388,13 +409,192 @@ mod tests {
let query = BooleanQuery::new_multiterms_query(terms);
let mut collector = FastFieldTestCollector::for_field(score_field);
assert!(searcher.search(&query, &mut collector).is_ok());
collector.vals().clone()
collector.vals()
};
assert_eq!(
get_fast_vals(vec!(Term::from_field_text(text_field, "a"))),
vec!(5, 7, 13,)
);
assert_eq!(get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
vec!(5, 7, 13,));
}
}
}
fn search_term(searcher: &Searcher, term: Term) -> Vec<u32> {
let mut collector = FastFieldTestCollector::for_field(Field(1));
let term_query = TermQuery::new(term, SegmentPostingsOption::NoFreq);
searcher.search(&term_query, &mut collector).unwrap();
collector.vals()
}
#[test]
fn test_index_merger_with_deletes() {
let mut schema_builder = schema::SchemaBuilder::default();
let text_fieldtype = schema::TextOptions
::default()
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::U32Options::default().set_fast();
let score_field = schema_builder.add_u32_field("score", score_fieldtype);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{ // a first commit
index_writer.add_document(
doc!(
text_field => "a b d",
score_field => 1
));
index_writer.add_document(
doc!(
text_field => "b c",
score_field => 2
));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.add_document(
doc!(
text_field => "c d",
score_field => 3
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!(1));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!(1));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(1, 3));
}
{ // a second commit
index_writer.add_document(
doc!(
text_field => "a d e",
score_field => 4_000
));
index_writer.add_document(
doc!(
text_field => "e f",
score_field => 5_000
));
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.delete_term(Term::from_field_text(text_field, "f"));
index_writer.add_document(
doc!(
text_field => "f g",
score_field => 6_000
));
index_writer.add_document(
doc!(
text_field => "g h",
score_field => 7_000
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.segment_readers().len(), 2);
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 1);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_val(), 1);
assert_eq!(score_field_reader.max_val(), 3);
let score_field_reader = searcher.segment_reader(1).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_val(), 4000);
assert_eq!(score_field_reader.max_val(), 7000);
}
{ // merging the segments
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
index_writer.merge(&segment_ids)
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_val(), 3);
assert_eq!(score_field_reader.max_val(), 7000);
}
{
// test a commit with only deletes
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_val(), 3);
assert_eq!(score_field_reader.max_val(), 7000);
}
{ // Test merging a single segment in order to remove deletes.
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
index_writer.merge(&segment_ids)
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 2);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_val(), 6000);
assert_eq!(score_field_reader.max_val(), 7000);
}
{ // Test removing all docs
index_writer.delete_term(Term::from_field_text(text_field, "g"));
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
index_writer.merge(&segment_ids)
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 0);
}
}
}


@@ -1,5 +1,4 @@
mod index_writer;
pub mod index_writer;
pub mod segment_serializer;
pub mod merger;
mod merge_policy;
@@ -7,9 +6,15 @@ mod log_merge_policy;
mod segment_register;
mod segment_writer;
mod segment_manager;
pub mod delete_queue;
pub mod segment_updater;
mod directory_lock;
mod segment_entry;
mod doc_opstamp_mapping;
pub mod operation;
mod stamper;
pub use self::segment_entry::{SegmentEntry, SegmentState};
pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_writer::SegmentWriter;
pub use self::index_writer::IndexWriter;
@@ -17,6 +22,5 @@ pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_policy::{NoMergePolicy, MergeCandidate, MergePolicy};
pub use self::segment_manager::SegmentManager;
/// Alias for the default merge policy, which is the LogMergePolicy.
pub type DefaultMergePolicy = LogMergePolicy;
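A hedged aside (not part of the diff): the merge policy is swappable at runtime through `IndexWriter::set_merge_policy`, which the tests further down use with a custom policy. A minimal sketch, assuming an `index` built as in the tests below:

// Disable background merges entirely; `NoMergePolicy` is re-exported above.
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.set_merge_policy(box NoMergePolicy);
// Or restore the default behaviour explicitly:
index_writer.set_merge_policy(box DefaultMergePolicy::default());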

src/indexer/operation.rs Normal file

@@ -0,0 +1,17 @@
use schema::Document;
use schema::Term;
/// Timestamped Delete operation.
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct DeleteOperation {
pub opstamp: u64,
pub term: Term,
}
/// Timestamped Add operation.
#[derive(Eq, PartialEq, Debug)]
pub struct AddOperation {
pub opstamp: u64,
pub document: Document,
}
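A small illustrative sketch (not from the diff) of the ordering contract these opstamps encode: a delete only affects documents that were added with a strictly smaller opstamp.

// Hypothetical opstamps, in the order the operations were issued.
let add_a: u64 = 0;      // doc "a" added
let delete_op: u64 = 1;  // delete_term(...) issued
let add_b: u64 = 2;      // doc "b" added
// The delete applies to "a" (0 < 1) but not to "b" (2 > 1).
assert!(add_a < delete_op && delete_op < add_b);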


@@ -0,0 +1,129 @@
use core::SegmentMeta;
use bit_set::BitSet;
use indexer::delete_queue::DeleteCursor;
use core::SegmentId;
use std::fmt;
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum SegmentState {
Ready,
InMerge,
}
impl SegmentState {
pub fn letter_code(&self,) -> char {
match *self {
SegmentState::InMerge => 'M',
SegmentState::Ready => 'R',
}
}
}
/// A segment entry describes the state of
/// a given segment, at a given instant.
///
/// In addition to the segment meta,
/// it contains a few transient states:
/// - `state` expresses whether the segment is already in the
/// middle of a merge
/// - `delete_bitset` is a bitset flagging the
/// documents that were deleted during the commit
/// itself.
/// - the delete cursor is the position in the delete queue.
/// Deletes happening before the cursor are reflected either
/// in the .del file or in the delete_bitset.
#[derive(Clone)]
pub struct SegmentEntry {
meta: SegmentMeta,
state: SegmentState,
delete_bitset: Option<BitSet>,
delete_cursor: DeleteCursor,
}
impl SegmentEntry {
/// Create a new `SegmentEntry`
pub fn new(segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>) -> SegmentEntry {
SegmentEntry {
meta: segment_meta,
state: SegmentState::Ready,
delete_bitset: delete_bitset,
delete_cursor: delete_cursor,
}
}
/// Return a reference to the segment entry's delete bitset.
///
/// `DocId`s in this bitset are flagged as deleted.
pub fn delete_bitset(&self,) -> Option<&BitSet> {
self.delete_bitset.as_ref()
}
/// Set the `SegmentMeta` for this segment.
pub fn set_meta(&mut self, segment_meta: SegmentMeta) {
self.meta = segment_meta;
}
/// Return a mutable reference to the segment entry's delete cursor
pub fn delete_cursor(&mut self) -> &mut DeleteCursor {
&mut self.delete_cursor
}
/// Return the `SegmentEntry`'s state.
///
/// The state describes whether the segment is available for
/// a merge or not.
pub fn state(&self) -> SegmentState {
self.state
}
/// Returns the segment id.
pub fn segment_id(&self) -> SegmentId {
self.meta.id()
}
/// Accessor to the `SegmentMeta`
pub fn meta(&self) -> &SegmentMeta {
&self.meta
}
/// Mark the `SegmentEntry` as in merge.
///
/// Only segments that are not already
/// in a merge are eligible for a future merge.
pub fn start_merge(&mut self,) {
self.state = SegmentState::InMerge;
}
/// Cancel a merge
///
/// If a merge fails, it is important to switch
/// the segment back to an idle state, so that it
/// may be eligible for future merges.
pub fn cancel_merge(&mut self,) {
self.state = SegmentState::Ready;
}
/// Returns true iff a segment should
/// be considered for a merge.
pub fn is_ready(&self,) -> bool {
self.state == SegmentState::Ready
}
}
impl fmt::Debug for SegmentEntry {
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
write!(formatter, "SegmentEntry({:?}, {:?})", self.meta, self.state)
}
}
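A hypothetical unit test (not part of the diff) sketching the Ready, InMerge, Ready lifecycle exposed above; it assumes a `SegmentMeta` and a `DeleteQueue` can be constructed as in the register tests further down.

#[cfg(test)]
mod tests {
use super::*;
use core::{SegmentId, SegmentMeta};
use indexer::delete_queue::DeleteQueue;
#[test]
fn test_segment_entry_state_lifecycle() {
let delete_queue = DeleteQueue::new();
let meta = SegmentMeta::new(SegmentId::generate_random());
let mut entry = SegmentEntry::new(meta, delete_queue.cursor(), None);
assert!(entry.is_ready());                        // freshly created entries are Ready
entry.start_merge();
assert_eq!(entry.state(), SegmentState::InMerge); // ...and no longer mergeable
entry.cancel_merge();                             // a failed merge puts it back
assert!(entry.is_ready());
}
}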


@@ -1,33 +1,22 @@
use super::segment_register::SegmentRegister;
use std::sync::RwLock;
use core::SegmentMeta;
use core::{META_FILEPATH, LOCKFILE_FILEPATH};
use core::SegmentId;
use indexer::SegmentEntry;
use std::path::PathBuf;
use std::collections::hash_set::HashSet;
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
use std::fmt::{self, Debug, Formatter};
use std::sync::atomic::{AtomicUsize, Ordering};
use indexer::delete_queue::DeleteCursor;
#[derive(Default)]
struct SegmentRegisters {
docstamp: u64,
uncommitted: SegmentRegister,
committed: SegmentRegister,
writing: HashSet<SegmentId>,
}
#[derive(Eq, PartialEq)]
pub enum CommitState {
Committed,
Uncommitted,
Missing,
}
impl Default for SegmentRegisters {
fn default() -> SegmentRegisters {
SegmentRegisters {
docstamp: 0u64,
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::default()
}
}
}
/// The segment manager stores the list of segments
@@ -35,14 +24,9 @@ impl Default for SegmentRegisters {
///
/// It guarantees the atomicity of the
/// changes (merges especially)
#[derive(Default)]
pub struct SegmentManager {
registers: RwLock<SegmentRegisters>,
// generation is an ever increasing counter that
// is incremented whenever we modify
// the segment manager. It can be useful for debugging
// purposes, and it also acts as a "dirty" marker,
// to detect when the `meta.json` should be written.
generation: AtomicUsize,
}
impl Debug for SegmentManager {
@@ -52,47 +36,73 @@ impl Debug for SegmentManager {
}
}
/// Returns the `SegmentMeta`s for (committed segments, uncommitted segments).
/// Both lists are read under a single lock, so the result is a consistent snapshot.
///
/// For instance, a segment will not appear in both the committed and the
/// uncommitted segments.
pub fn get_segment_ready_for_commit(segment_manager: &SegmentManager,) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
pub fn get_mergeable_segments(segment_manager: &SegmentManager,) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = segment_manager.read();
(registers_lock.committed.get_segment_ready_for_commit(),
registers_lock.uncommitted.get_segment_ready_for_commit())
(registers_lock.committed.get_mergeable_segments(),
registers_lock.uncommitted.get_mergeable_segments())
}
impl SegmentManager {
/// Returns whether a segment is committed, uncommitted or missing.
pub fn is_committed(&self, segment_id: SegmentId) -> CommitState {
let lock = self.read();
if lock.uncommitted.contains(segment_id) {
CommitState::Uncommitted
pub fn from_segments(segment_metas: Vec<SegmentMeta>, delete_cursor: DeleteCursor) -> SegmentManager {
SegmentManager {
registers: RwLock::new(SegmentRegisters {
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas, delete_cursor),
writing: HashSet::new(),
}),
}
else if lock.committed.contains(segment_id) {
CommitState::Committed
}
else {
CommitState::Missing
}
}
pub fn docstamp(&self,) -> u64 {
self.read().docstamp
}
pub fn from_segments(segment_metas: Vec<SegmentMeta>) -> SegmentManager {
SegmentManager {
registers: RwLock::new( SegmentRegisters {
docstamp: 0u64, // TODO put the actual value
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::from(segment_metas),
}),
generation: AtomicUsize::default(),
/// Returns all of the segment entries (committed or uncommitted)
pub fn segment_entries(&self,) -> Vec<SegmentEntry> {
let mut segment_entries = self.read()
.uncommitted
.segment_entries();
segment_entries.extend(
self.read()
.committed
.segment_entries()
);
segment_entries
}
/// Returns the overall number of segments in the `SegmentManager`
pub fn num_segments(&self,) -> usize {
let registers_lock = self.read();
registers_lock.committed.len() + registers_lock.uncommitted.len()
}
pub fn list_files(&self) -> HashSet<PathBuf> {
let registers_lock = self.read();
let mut files = HashSet::new();
files.insert(META_FILEPATH.clone());
files.insert(LOCKFILE_FILEPATH.clone());
let segment_metas: Vec<SegmentMeta> =
registers_lock.committed
.get_all_segments()
.into_iter()
.chain(registers_lock.uncommitted
.get_all_segments()
.into_iter())
.chain(registers_lock.writing
.iter()
.cloned()
.map(SegmentMeta::new))
.collect();
for segment_meta in segment_metas {
files.extend(segment_meta.list_files());
}
files
}
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
let registers = self.read();
registers
.committed
.segment_entry(segment_id)
.or_else(|| registers.uncommitted.segment_entry(segment_id))
}
// Lock poisoning should never happen:
@@ -103,36 +113,16 @@ impl SegmentManager {
}
fn write(&self,) -> RwLockWriteGuard<SegmentRegisters> {
self.generation.fetch_add(1, Ordering::Release);
self.registers.write().expect("Failed to acquire write lock on SegmentManager.")
}
pub fn generation(&self,) -> usize {
self.generation.load(Ordering::Acquire)
}
/// Removes all of the uncommitted segments
/// and returns them.
pub fn rollback(&self,) -> Vec<SegmentId> {
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
let mut registers_lock = self.write();
let segment_ids = registers_lock.uncommitted.segment_ids();
registers_lock.committed.clear();
registers_lock.uncommitted.clear();
segment_ids
}
pub fn commit(&self, docstamp: u64) {
let mut registers_lock = self.write();
let segment_entries = registers_lock.uncommitted.segment_entries();
for segment_entry in segment_entries {
registers_lock.committed.add_segment_entry(segment_entry);
}
registers_lock.docstamp = docstamp;
registers_lock.uncommitted.clear();
}
pub fn add_segment(&self, segment_meta: SegmentMeta) {
let mut registers_lock = self.write();
registers_lock.uncommitted.add_segment(segment_meta);
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) {
@@ -147,34 +137,86 @@ impl SegmentManager {
registers_lock.committed.start_merge(segment_id);
}
}
else {
error!("Merge operation sent for segments that are not all uncommited or commited.");
}
}
pub fn end_merge(&self, merged_segment_ids: &[SegmentId], merged_segment_meta: &SegmentMeta) {
pub fn cancel_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId) {
let mut registers_lock = self.write();
if registers_lock.uncommitted.contains_all(merged_segment_ids) {
for segment_id in merged_segment_ids {
registers_lock.uncommitted.remove_segment(segment_id);
// we mark all segments as ready for merge again.
{
let target_segment_register: &mut SegmentRegister;
target_segment_register = {
if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) {
&mut registers_lock.uncommitted
}
else if registers_lock.committed.contains_all(&before_merge_segment_ids) {
&mut registers_lock.committed
}
else {
warn!("couldn't find segment in SegmentManager");
return;
}
};
for segment_id in before_merge_segment_ids {
target_segment_register.cancel_merge(segment_id);
}
registers_lock.uncommitted.add_segment(merged_segment_meta.clone());
}
else if registers_lock.committed.contains_all(merged_segment_ids) {
for segment_id in merged_segment_ids {
registers_lock.committed.remove_segment(segment_id);
}
registers_lock.committed.add_segment(merged_segment_meta.clone());
} else {
warn!("couldn't find segment in SegmentManager");
}
// ... and we make sure the target segment entry
// can be garbage collected.
registers_lock.writing.remove(&after_merge_segment_id);
}
pub fn write_segment(&self, segment_id: SegmentId) {
let mut registers_lock = self.write();
registers_lock.writing.insert(segment_id);
}
pub fn add_segment(&self, segment_entry: SegmentEntry) {
let mut registers_lock = self.write();
registers_lock.writing.remove(&segment_entry.segment_id());
registers_lock.uncommitted.add_segment_entry(segment_entry);
}
pub fn committed_segments(&self,) -> Vec<SegmentId> {
let registers_lock = self.read();
registers_lock.committed.segment_ids()
pub fn end_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentEntry) {
let mut registers_lock = self.write();
registers_lock.writing.remove(&after_merge_segment_entry.segment_id());
let mut target_register: &mut SegmentRegister = {
if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) {
&mut registers_lock.uncommitted
}
else if registers_lock.committed.contains_all(&before_merge_segment_ids) {
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");
return;
}
};
for segment_id in before_merge_segment_ids {
target_register.remove_segment(segment_id);
}
target_register.add_segment_entry(after_merge_segment_entry);
}
pub fn segment_metas(&self,) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
pub fn committed_segment_metas(&self,) -> Vec<SegmentMeta> {
let registers_lock = self.read();
(registers_lock.committed.segment_metas(), registers_lock.uncommitted.segment_metas())
registers_lock.committed.segment_metas()
}
}
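A standalone sketch (simplified types, not part of the diff) of the commit move performed above: under a single write lock, the committed register is replaced wholesale and the uncommitted register is emptied, so readers never observe a half-committed state.

use std::sync::RwLock;
struct Registers { committed: Vec<u64>, uncommitted: Vec<u64> }
fn commit(registers: &RwLock<Registers>, entries: Vec<u64>) {
// One write lock guards the whole transition.
let mut lock = registers.write().unwrap();
lock.committed = entries;   // committed segments are replaced wholesale...
lock.uncommitted.clear();   // ...and the uncommitted register is emptied.
}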


@@ -3,39 +3,8 @@ use std::collections::HashMap;
use core::SegmentMeta;
use std::fmt;
use std::fmt::{Debug, Formatter};
#[derive(Clone, PartialEq, Eq, Debug)]
pub enum SegmentState {
Ready,
InMerge,
}
impl SegmentState {
fn letter_code(&self,) -> char {
match *self {
SegmentState::InMerge => 'M',
SegmentState::Ready => 'R',
}
}
}
#[derive(Clone)]
pub struct SegmentEntry {
meta: SegmentMeta,
state: SegmentState,
}
impl SegmentEntry {
fn start_merge(&mut self,) {
self.state = SegmentState::InMerge;
}
fn is_ready(&self,) -> bool {
self.state == SegmentState::Ready
}
}
use indexer::segment_entry::SegmentEntry;
use indexer::delete_queue::DeleteCursor;
/// The segment register keeps track
/// of the list of segments and their sizes, as well
@@ -45,36 +14,50 @@ impl SegmentEntry {
/// segments that are currently searchable,
/// and by the index merger to identify
/// merge candidates.
#[derive(Default)]
pub struct SegmentRegister {
segment_states: HashMap<SegmentId, SegmentEntry>,
}
impl Debug for SegmentRegister {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
try!(write!(f, "SegmentRegister("));
for (k, v) in &self.segment_states {
try!(write!(f, "{}:{}, ", k.short_uuid_string(), v.state.letter_code()));
try!(write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code()));
}
try!(write!(f, ")"));
Ok(())
}
}
impl SegmentRegister {
pub fn clear(&mut self,) {
self.segment_states.clear();
}
pub fn get_segment_ready_for_commit(&self,) -> Vec<SegmentMeta> {
pub fn len(&self) -> usize {
self.segment_states.len()
}
pub fn get_all_segments(&self,) -> Vec<SegmentMeta> {
self.segment_states
.values()
.filter(|segment_entry| segment_entry.is_ready())
.map(|segment_entry| segment_entry.meta.clone())
.map(|segment_entry| segment_entry.meta().clone())
.collect()
}
pub fn segment_entries(&self,) -> Vec<SegmentEntry>{
pub fn get_mergeable_segments(&self,) -> Vec<SegmentMeta> {
self.segment_states
.values()
.filter(|segment_entry| segment_entry.is_ready())
.map(|segment_entry| segment_entry.meta().clone())
.collect()
}
pub fn segment_entries(&self,) -> Vec<SegmentEntry> {
self.segment_states
.values()
.cloned()
@@ -84,31 +67,18 @@ impl SegmentRegister {
pub fn segment_metas(&self,) -> Vec<SegmentMeta> {
let mut segment_ids: Vec<SegmentMeta> = self.segment_states
.values()
.map(|segment_entry| segment_entry.meta.clone())
.map(|segment_entry| segment_entry.meta().clone())
.collect();
segment_ids.sort_by_key(|meta| meta.segment_id);
segment_ids.sort_by_key(|meta| meta.id());
segment_ids
}
pub fn segment_ids(&self,) -> Vec<SegmentId> {
self.segment_metas()
.into_iter()
.map(|segment_meta| segment_meta.segment_id)
.collect()
}
#[cfg(test)]
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
self.segment_states
.get(&segment_id)
.map(|segment_entry| segment_entry.clone())
}
pub fn contains(&self, segment_id: SegmentId) -> bool {
self.segment_states.contains_key(&segment_id)
}
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
segment_ids
.iter()
@@ -116,21 +86,21 @@ impl SegmentRegister {
}
pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) {
let segment_id = segment_entry.meta.segment_id;
let segment_id = segment_entry.segment_id();
self.segment_states.insert(segment_id, segment_entry);
}
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
self.add_segment_entry(SegmentEntry {
meta: segment_meta.clone(),
state: SegmentState::Ready,
});
}
pub fn remove_segment(&mut self, segment_id: &SegmentId) {
self.segment_states.remove(segment_id);
}
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.cancel_merge();
}
pub fn start_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
@@ -138,63 +108,73 @@ impl SegmentRegister {
.start_merge();
}
}
impl From<Vec<SegmentMeta>> for SegmentRegister {
fn from(segment_metas: Vec<SegmentMeta>) -> SegmentRegister {
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: DeleteCursor) -> SegmentRegister {
let mut segment_states = HashMap::new();
for segment_meta in segment_metas {
let segment_id = segment_meta.segment_id;
let segment_entry = SegmentEntry {
meta: segment_meta,
state: SegmentState::Ready,
};
let segment_id = segment_meta.id();
let segment_entry = SegmentEntry::new(
segment_meta,
delete_cursor.clone(),
None);
segment_states.insert(segment_id, segment_entry);
}
SegmentRegister {
segment_states: segment_states,
segment_states: segment_states
}
}
}
impl Default for SegmentRegister {
fn default() -> SegmentRegister {
SegmentRegister {
segment_states: HashMap::new(),
}
}
}
#[cfg(test)]
mod tests {
use indexer::SegmentState;
use core::SegmentId;
use core::SegmentMeta;
use indexer::delete_queue::*;
use super::*;
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
segment_register
.segment_metas()
.into_iter()
.map(|segment_meta| segment_meta.id())
.collect()
}
#[test]
fn test_segment_register() {
let delete_queue = DeleteQueue::new();
let mut segment_register = SegmentRegister::default();
let segment_id_a = SegmentId::generate_random();
let segment_id_b = SegmentId::generate_random();
let segment_id_merged = SegmentId::generate_random();
let segment_meta_merged = SegmentMeta::new(segment_id_merged, 10 + 20);
segment_register.add_segment(SegmentMeta::new(segment_id_a, 10));
assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state, SegmentState::Ready);
assert_eq!(segment_register.segment_ids(), vec!(segment_id_a));
segment_register.add_segment(SegmentMeta::new(segment_id_b, 20));
assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state, SegmentState::Ready);
{
let segment_meta = SegmentMeta::new(segment_id_a);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::Ready);
assert_eq!(segment_ids(&segment_register), vec!(segment_id_a));
{
let segment_meta = SegmentMeta::new(segment_id_b);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state(), SegmentState::Ready);
segment_register.start_merge(&segment_id_a);
segment_register.start_merge(&segment_id_b);
assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state, SegmentState::InMerge);
assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state, SegmentState::InMerge);
assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::InMerge);
assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state(), SegmentState::InMerge);
segment_register.remove_segment(&segment_id_a);
segment_register.remove_segment(&segment_id_b);
segment_register.add_segment(segment_meta_merged);
assert_eq!(segment_register.segment_ids(), vec!(segment_id_merged));
{
let segment_meta_merged = SegmentMeta::new(segment_id_merged);
let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_ids(&segment_register), vec!(segment_id_merged));
}
}


@@ -1,9 +1,6 @@
use Result;
use std::io::Write;
use rustc_serialize::json;
use core::Segment;
use core::SegmentInfo;
use core::SegmentComponent;
use fastfield::FastFieldSerializer;
use store::StoreWriter;
@@ -13,7 +10,6 @@ use postings::PostingsSerializer;
/// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`.
pub struct SegmentSerializer {
segment: Segment,
store_writer: StoreWriter,
fast_field_serializer: FastFieldSerializer,
fieldnorms_serializer: FastFieldSerializer,
@@ -33,7 +29,6 @@ impl SegmentSerializer {
let postings_serializer = try!(PostingsSerializer::open(segment));
Ok(SegmentSerializer {
segment: segment.clone(),
postings_serializer: postings_serializer,
store_writer: StoreWriter::new(store_write),
fast_field_serializer: fast_field_serializer,
@@ -61,16 +56,6 @@ impl SegmentSerializer {
&mut self.store_writer
}
/// Write the `SegmentInfo`
pub fn write_segment_info(&mut self, segment_info: &SegmentInfo) -> Result<()> {
let mut write = try!(self.segment.open_write(SegmentComponent::INFO));
let json_data = json::encode(segment_info)
.expect("Encoding to segment_info to JSON failed. This should never happen");
try!(write.write_all(json_data.as_bytes()));
try!(write.flush());
Ok(())
}
/// Finalize the segment serialization.
pub fn close(self) -> Result<()> {
try!(self.fast_field_serializer.close());


@@ -1,44 +1,43 @@
#![allow(for_kv_map)]
use chan;
use core::Index;
use std::sync::Mutex;
use core::IndexMeta;
use core::META_FILEPATH;
use core::Segment;
use core::SegmentId;
use core::SegmentMeta;
use std::mem;
use core::SerializableSegment;
use indexer::MergePolicy;
use directory::Directory;
use indexer::stamper::Stamper;
use Error;
use futures_cpupool::CpuPool;
use futures::Future;
use futures::Canceled;
use futures::oneshot;
use directory::FileProtection;
use indexer::{MergePolicy, DefaultMergePolicy};
use indexer::index_writer::advance_deletes;
use indexer::MergeCandidate;
use indexer::merger::IndexMerger;
use indexer::SegmentEntry;
use indexer::SegmentSerializer;
use std::thread;
use schema::Schema;
use directory::Directory;
use std::thread::JoinHandle;
use std::sync::Arc;
use std::collections::HashMap;
use rustc_serialize::json;
use Result;
use core::IndexMeta;
use core::META_FILEPATH;
use futures_cpupool::CpuFuture;
use rustc_serialize::json;
use indexer::delete_queue::DeleteCursor;
use schema::Schema;
use std::borrow::BorrowMut;
use std::collections::HashMap;
use std::io::Write;
use super::segment_manager::{SegmentManager, get_segment_ready_for_commit};
use super::super::core::index::get_segment_manager;
pub type SegmentUpdateSender = chan::Sender<SegmentUpdate>;
pub type SegmentUpdateReceiver = chan::Receiver<SegmentUpdate>;
fn create_metas(segment_manager: &SegmentManager, schema: Schema, docstamp: u64) -> IndexMeta {
let (committed_segments, uncommitted_segments) = segment_manager.segment_metas();
IndexMeta {
committed_segments: committed_segments,
uncommitted_segments: uncommitted_segments,
schema: schema,
docstamp: docstamp,
}
}
use std::mem;
use std::ops::DerefMut;
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, AtomicBool};
use std::sync::atomic::Ordering;
use std::sync::RwLock;
use std::thread;
use std::thread::JoinHandle;
use super::segment_manager::{SegmentManager, get_mergeable_segments};
/// Save the index meta file.
@@ -51,11 +50,10 @@ fn create_metas(segment_manager: &SegmentManager, schema: Schema, docstamp: u64)
///
/// This method is not part of tantivy's public API
pub fn save_new_metas(schema: Schema,
docstamp: u64,
opstamp: u64,
directory: &mut Directory)
-> Result<()> {
let segment_manager = SegmentManager::from_segments(Vec::new());
save_metas(&segment_manager, schema, docstamp, directory)
save_metas(vec!(), schema, opstamp, directory)
}
@@ -69,305 +67,438 @@ pub fn save_new_metas(schema: Schema,
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_metas(segment_manager: &SegmentManager,
pub fn save_metas(segment_metas: Vec<SegmentMeta>,
schema: Schema,
docstamp: u64,
opstamp: u64,
directory: &mut Directory)
-> Result<()> {
let metas = create_metas(segment_manager, schema, docstamp);
let mut w = Vec::new();
let metas = IndexMeta {
segments: segment_metas,
schema: schema,
opstamp: opstamp,
};
let mut w = vec!();
try!(write!(&mut w, "{}\n", json::as_pretty_json(&metas)));
directory.atomic_write(&META_FILEPATH, &w[..])
.map_err(From::from)
let res = directory.atomic_write(&META_FILEPATH, &w[..])?;
debug!("Saved metas {}", json::as_pretty_json(&metas));
Ok(res)
}
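A hedged sketch of the serialization path used above, with a simplified stand-in struct (the real `IndexMeta` also carries the schema):

extern crate rustc_serialize;
use rustc_serialize::json;
#[derive(RustcEncodable)]
struct MetaSketch {
segments: Vec<String>,
opstamp: u64,
}
fn main() {
let metas = MetaSketch { segments: vec!(), opstamp: 0u64 };
// `as_pretty_json` produces the human-readable form written to meta.json.
println!("{}", json::as_pretty_json(&metas));
}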
#[derive(Debug, Clone)]
pub enum SegmentUpdate {
// The segment updater is in charge of processing all
// of the `SegmentUpdate`s.
//
// All this processing happens on a single thread
// consuming a common queue.
#[derive(Clone)]
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
fn perform_merge(segment_ids: &[SegmentId],
segment_updater: &SegmentUpdater,
mut merged_segment: Segment,
target_opstamp: u64) -> Result<SegmentEntry> {
// first we need to apply deletes to our segment.
info!("Start merge: {:?}", segment_ids);
/// New segment added.
/// Created by the indexing worker thread
AddSegment(SegmentMeta),
let ref index = segment_updater.0.index;
let schema = index.schema();
let mut segment_entries = vec!();
let mut file_protections: Vec<FileProtection> = vec!();
for segment_id in segment_ids {
if let Some(mut segment_entry) = segment_updater.0
.segment_manager
.segment_entry(segment_id) {
let segment = index.segment(segment_entry.meta().clone());
if let Some(file_protection) = advance_deletes(segment, &mut segment_entry, target_opstamp)? {
file_protections.push(file_protection);
}
segment_entries.push(segment_entry);
}
else {
error!("Error, had to abort merge as some of the segment is not managed anymore.a");
return Err(Error::InvalidArgument(format!("Segment {:?} requested for merge is not managed.", segment_id)));
}
}
/// A merge is ended.
/// Remove the merged segments and record the new,
/// larger merged segment.
EndMerge(usize, Vec<SegmentId>, SegmentMeta),
let delete_cursor = segment_entries[0].delete_cursor().clone();
let segments: Vec<Segment> = segment_entries
.iter()
.map(|segment_entry| {
index.segment(segment_entry.meta().clone())
})
.collect();
/// Happens when rollback is called.
/// The current generation of segments is cancelled.
CancelGeneration,
// An IndexMerger is like a "view" of our merged segments.
let merger: IndexMerger = IndexMerger::open(schema, &segments[..])?;
// ... we just serialize this index merger in our new segment
// to merge the two segments.
let segment_serializer =
SegmentSerializer::for_segment(&mut merged_segment)
.expect("Creating index serializer failed");
let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed");
let mut segment_meta = SegmentMeta::new(merged_segment.id());
segment_meta.set_max_doc(num_docs);
/// Starts a new generation... This
/// happens at the end of Rollback.
NewGeneration,
/// Just dropping the Segment updater object
/// is safe, but some merge might be happening in
/// the background and the user may want to wait for these
/// threads to terminate.
///
/// When receiving the Terminate signal, the segment updater stops
/// receiving segment updates and just waits for the merging threads
/// to terminate.
Terminate,
/// Commit marks uncommitted segments as committed.
Commit(u64),
let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None);
Ok(after_merge_segment_entry)
}
/// The segment updater is in charge of processing all of the
/// `SegmentUpdate`s.
///
/// All this processing happens on a single thread
/// consuming a common queue.
///
/// The segment updates producers are :
/// - indexing threads are sending new segments
/// - merging threads are sending merge operations
/// - the index writer sends "terminate"
pub struct SegmentUpdater {
index: Index,
is_cancelled_generation: bool,
segment_update_receiver: SegmentUpdateReceiver,
segment_update_sender: SegmentUpdateSender,
segment_manager_arc: Arc<SegmentManager>,
merge_policy: Arc<Mutex<Box<MergePolicy>>>,
merging_thread_id: usize,
merging_threads: HashMap<usize, JoinHandle<(Vec<SegmentId>, SegmentMeta)> >,
struct InnerSegmentUpdater {
pool: CpuPool,
index: Index,
segment_manager: SegmentManager,
merge_policy: RwLock<Box<MergePolicy>>,
merging_thread_id: AtomicUsize,
merging_threads: RwLock<HashMap<usize, JoinHandle<Result<()>>>>,
generation: AtomicUsize,
killed: AtomicBool,
stamper: Stamper,
}
impl SegmentUpdater {
pub fn start_updater(index: Index, merge_policy: Arc<Mutex<Box<MergePolicy>>>) -> (SegmentUpdateSender, JoinHandle<()>) {
let segment_updater = SegmentUpdater::new(index, merge_policy);
(segment_updater.segment_update_sender.clone(), segment_updater.start())
pub fn new(index: Index,
stamper: Stamper,
delete_cursor: DeleteCursor) -> Result<SegmentUpdater> {
let segments = index.searchable_segment_metas()?;
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
Ok(
SegmentUpdater(Arc::new(InnerSegmentUpdater {
pool: CpuPool::new(1),
index: index,
segment_manager: segment_manager,
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(),
killed: AtomicBool::new(false),
stamper: stamper,
}))
)
}
fn new(index: Index, merge_policy: Arc<Mutex<Box<MergePolicy>>>) -> SegmentUpdater {
let segment_manager_arc = get_segment_manager(&index);
let (segment_update_sender, segment_update_receiver): (SegmentUpdateSender, SegmentUpdateReceiver) = chan::async();
SegmentUpdater {
index: index,
is_cancelled_generation: false,
segment_update_sender: segment_update_sender,
segment_update_receiver: segment_update_receiver,
segment_manager_arc: segment_manager_arc,
merge_policy: merge_policy,
merging_thread_id: 0,
merging_threads: HashMap::new(),
}
}
fn new_merging_thread_id(&mut self,) -> usize {
self.merging_thread_id += 1;
self.merging_thread_id
pub fn new_segment(&self) -> Segment {
let new_segment = self.0.index.new_segment();
let segment_id = new_segment.id();
self.0.segment_manager.write_segment(segment_id);
new_segment
}
fn end_merge(
&mut self,
segment_ids: Vec<SegmentId>,
segment_meta: SegmentMeta) {
let segment_manager = self.segment_manager_arc.clone();
segment_manager.end_merge(&segment_ids, &segment_meta);
save_metas(
&*segment_manager,
self.index.schema(),
self.index.docstamp(),
self.index.directory_mut()).expect("Could not save metas.");
for segment_id in segment_ids {
self.index.delete_segment(segment_id);
pub fn get_merge_policy(&self) -> Box<MergePolicy> {
self.0.merge_policy.read().unwrap().box_clone()
}
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
*self.0.merge_policy.write().unwrap() = merge_policy;
}
fn get_merging_thread_id(&self) -> usize {
self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst)
}
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>(&self, f: F) -> CpuFuture<T, Error> {
let me_clone = self.clone();
self.0.pool.spawn_fn(move || {
Ok(f(me_clone))
})
}
pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool {
if generation >= self.0.generation.load(Ordering::Acquire) {
self.run_async(|segment_updater| {
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
}).forget();
true
}
else {
false
}
}
pub fn kill(&mut self,) {
self.0.killed.store(true, Ordering::Release);
}
fn start_merges(&mut self,) {
let merge_candidates = self.consider_merge_options();
for MergeCandidate(segment_ids) in merge_candidates {
let merging_thread_id = self.new_merging_thread_id();
self.segment_manager().start_merge(&segment_ids);
fn is_alive(&self,) -> bool {
!self.0.killed.load(Ordering::Acquire)
}
let index_clone = self.index.clone();
let segment_update_sender_clone = self.segment_update_sender.clone();
let merge_thread_handle = thread::Builder::new()
.name(format!("merge_thread_{:?}", merging_thread_id))
.spawn(move || {
info!("Start merge: {:?}", segment_ids);
let schema = index_clone.schema();
let segments: Vec<Segment> = segment_ids
.iter()
.map(|&segment_id| index_clone.segment(segment_id))
.collect();
// An IndexMerger is like a "view" of our merged segments.
// TODO unwrap
let merger: IndexMerger = IndexMerger::open(schema, &segments[..]).expect("Creating index merger failed");
let mut merged_segment = index_clone.new_segment();
// ... we just serialize this index merger in our new segment
// to merge the two segments.
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect("Creating index serializer failed");
let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed");
let segment_meta = SegmentMeta {
segment_id: merged_segment.id(),
num_docs: num_docs,
};
let segment_update = SegmentUpdate::EndMerge(merging_thread_id, segment_ids.clone(), segment_meta.clone());
segment_update_sender_clone.send(segment_update.clone());
(segment_ids, segment_meta)
})
.expect("Failed to spawn merge thread");
self.merging_threads.insert(merging_thread_id, merge_thread_handle);
/// Apply deletes up to the target opstamp to all segments.
///
/// The method returns copies of the segment entries,
/// updated with the delete information.
fn purge_deletes(&self, target_opstamp: u64) -> Result<Vec<SegmentEntry>> {
let mut segment_entries = self.0.segment_manager.segment_entries();
for segment_entry in &mut segment_entries {
let segment = self.0.index.segment(segment_entry.meta().clone());
advance_deletes(segment, segment_entry, target_opstamp)?;
}
Ok(segment_entries)
}
pub fn save_metas(&self, opstamp: u64) {
if self.is_alive() {
let index = &self.0.index;
let directory = index.directory();
save_metas(
self.0.segment_manager.committed_segment_metas(),
index.schema(),
opstamp,
directory.box_clone().borrow_mut()).expect("Could not save metas.");
}
}
fn consider_merge_options(&self,) -> Vec<MergeCandidate> {
let segment_manager = self.segment_manager();
let (committed_segments, uncommitted_segments) = get_segment_ready_for_commit(segment_manager);
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.
let merge_policy_lock = self.merge_policy.lock().unwrap();
let mut merge_candidates = merge_policy_lock.compute_merge_candidates(&uncommitted_segments);
let committed_merge_candidates = merge_policy_lock.compute_merge_candidates(&committed_segments);
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
merge_candidates
pub fn garbage_collect_files(&self) -> Result<()> {
self.run_async(move |segment_updater| {
segment_updater.garbage_collect_files_exec();
}).wait()
}
fn segment_manager(&self,) -> &SegmentManager {
&*self.segment_manager_arc
}
pub fn start(self,) -> JoinHandle<()> {
thread::Builder::new()
.name("segment_update".to_string())
.spawn(move || {
self.process();
})
.expect("Failed to start segment updater thread.")
fn garbage_collect_files_exec(&self) {
let living_files = self.0.segment_manager.list_files();
let mut index = self.0.index.clone();
index.directory_mut().garbage_collect(living_files);
}
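A simplified sketch (hypothetical helper, not the actual `Directory` API) of the idea behind `garbage_collect`: anything on disk that is not in the living set computed by `list_files` is safe to delete.

use std::collections::HashSet;
use std::path::PathBuf;
fn deletable_files(on_disk: &HashSet<PathBuf>, living: &HashSet<PathBuf>) -> HashSet<PathBuf> {
// Set difference: present on disk, referenced by no segment (and not meta.json).
on_disk.difference(living).cloned().collect()
}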
fn process(mut self,) {
let segment_manager = self.segment_manager_arc.clone();
for segment_update in self.segment_update_receiver.clone() {
if let SegmentUpdate::Terminate = segment_update {
break;
pub fn commit(&self, opstamp: u64) -> Result<()> {
self.run_async(move |segment_updater| {
if segment_updater.is_alive() {
let segment_entries = segment_updater
.purge_deletes(opstamp)
.expect("Failed purge deletes");
segment_updater.0.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp);
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
}
// we check the generation number as if it were a
// dirty bit. If the value differs
// from our generation, then the segment_manager has
// been updated.
let generation_before_update = segment_manager.generation();
self.process_one(segment_update);
if generation_before_update != segment_manager.generation() {
// The segment manager has changed, we need to
// - save meta.json
save_metas(
&*segment_manager,
self.index.schema(),
self.index.docstamp(),
self.index.directory_mut()).expect("Could not save metas.");
}).wait()
}
// - update the searchers
// update the searchers so that they eventually will
// use the new segments.
// TODO eventually have this work through watching meta.json
// so that an external process stays up to date as well.
match self.index.load_searchers() {
Ok(()) => {
}
Err(e) => {
error!("Failure while loading new searchers {:?}", e);
panic!(format!("Failure while loading new searchers {:?}", e));
}
}
// - start merges if required
self.start_merges();
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> impl Future<Item=SegmentMeta, Error=Canceled> {
self.0.segment_manager.start_merge(segment_ids);
let segment_updater_clone = self.clone();
let segment_ids_vec = segment_ids.to_vec();
let merging_thread_id = self.get_merging_thread_id();
let (merging_future_send, merging_future_recv) = oneshot();
if segment_ids.is_empty() {
return merging_future_recv;
}
let mut merging_threads = HashMap::new();
mem::swap(&mut merging_threads, &mut self.merging_threads);
for (_, merging_thread_handle) in merging_threads {
match merging_thread_handle.join() {
Ok((segment_ids, segment_meta)) => {
self.end_merge(segment_ids, segment_meta);
let target_opstamp = self.0.stamper.stamp();
let merging_join_handle = thread::spawn(move || {
// first we need to apply deletes to our segment.
info!("Start merge: {:?}", segment_ids_vec);
let merged_segment = segment_updater_clone.new_segment();
let merged_segment_id = merged_segment.id();
let merge_result = perform_merge(&segment_ids_vec, &segment_updater_clone, merged_segment, target_opstamp);
match merge_result {
Ok(after_merge_segment_entry) => {
let merged_segment_meta = after_merge_segment_entry.meta().clone();
segment_updater_clone
.end_merge(segment_ids_vec, after_merge_segment_entry)
.expect("Segment updater thread is corrupted.");
// the future may fail if the listener of the oneshot future
// has been destroyed.
//
// This is not a problem here, so we just ignore any
// possible error.
let _merging_future_res = merging_future_send.send(merged_segment_meta);
}
Err(e) => {
error!("Error in merging thread {:?}", e);
break;
}
error!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id);
// merging_future_send will be dropped, sending an error to the future.
}
}
segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id);
Ok(())
});
self.0.merging_threads.write().unwrap().insert(merging_thread_id, merging_join_handle);
merging_future_recv
}
fn consider_merge_options(&self) {
let (committed_segments, uncommitted_segments) = get_mergeable_segments(&self.0.segment_manager);
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.
let merge_policy = self.get_merge_policy();
let mut merge_candidates = merge_policy.compute_merge_candidates(&uncommitted_segments);
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
for MergeCandidate(segment_metas) in merge_candidates {
self.start_merge(&segment_metas);
}
}
fn cancel_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId) {
self.0.segment_manager.cancel_merge(&before_merge_segment_ids, after_merge_segment_id);
}
// Process a single segment update.
pub fn process_one(
&mut self,
segment_update: SegmentUpdate) {
info!("Segment update: {:?}", segment_update);
fn end_merge(&self,
before_merge_segment_ids: Vec<SegmentId>,
mut after_merge_segment_entry: SegmentEntry) -> Result<()> {
match segment_update {
SegmentUpdate::AddSegment(segment_meta) => {
if !self.is_cancelled_generation {
self.segment_manager().add_segment(segment_meta);
}
else {
// rollback has been called and this
// segment actually belongs to the
// documents that have been dropped.
//
// Let's just remove its files.
self.index.delete_segment(segment_meta.segment_id);
}
}
SegmentUpdate::EndMerge(merging_thread_id, segment_ids, segment_meta) => {
self.end_merge(
segment_ids,
segment_meta);
self.merging_threads.remove(&merging_thread_id);
}
SegmentUpdate::CancelGeneration => {
// Called during rollback. The segment
// that will arrive will be ignored
// until a NewGeneration update arrives.
self.is_cancelled_generation = true;
}
SegmentUpdate::NewGeneration => {
// After rollback, we can resume
// indexing new documents.
self.is_cancelled_generation = false;
}
SegmentUpdate::Commit(docstamp) => {
self.segment_manager().commit(docstamp);
}
SegmentUpdate::Terminate => {
panic!("We should have left the loop before processing it.");
}
}
}
self.run_async(move |segment_updater| {
debug!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.0.index.opstamp();
if delete_operation.opstamp < committed_opstamp {
let segment = segment_updater.0.index.segment(after_merge_segment_entry.meta().clone());
// TODO check unwrap
advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp).unwrap();
}
}
segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
segment_updater.consider_merge_options();
segment_updater.save_metas(segment_updater.0.index.opstamp());
}).wait()
}
/// Wait for current merging threads.
///
/// Upon termination of the current merging threads,
/// new merge opportunities may appear.
///
/// We keep waiting until the merge policy judges that
/// no opportunity is available.
///
/// Note that it is not required to call this
/// method in your application.
/// Terminating your application without letting
/// the merges terminate is perfectly safe.
///
/// Obsolete files will eventually be cleaned up
/// by the directory garbage collector.
pub fn wait_merging_thread(&self) -> Result<()> {
let mut num_segments: usize;
loop {
num_segments = self.0.segment_manager.num_segments();
let mut new_merging_threads = HashMap::new();
{
let mut merging_threads = self.0.merging_threads.write().unwrap();
mem::swap(&mut new_merging_threads, merging_threads.deref_mut());
}
debug!("wait merging thread {}", new_merging_threads.len());
for (_, merging_thread_handle) in new_merging_threads {
merging_thread_handle
.join()
.map(|_| ())
.map_err(|_| {
Error::ErrorInThread("Merging thread failed.".to_string())
})?
}
// Our merging threads may have queued their completed merges;
// running an empty task flushes the segment updater's queue.
self.run_async(move |_| {}).wait()?;
let new_num_segments = self.0.segment_manager.num_segments();
if new_num_segments >= num_segments {
break;
}
}
Ok(())
}
}
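A self-contained sketch (toy model, not the real threads) of the fixed-point loop in `wait_merging_thread` above: keep joining the current merging threads until a full pass no longer reduces the segment count.

use std::sync::{Arc, Mutex};
use std::thread;
fn main() {
let num_segments = Arc::new(Mutex::new(8usize));
loop {
let before = *num_segments.lock().unwrap();
// Stand-in for joining the currently registered merging threads;
// each pass merges pairs of segments.
let n = num_segments.clone();
thread::spawn(move || {
let mut count = n.lock().unwrap();
if *count > 1 { *count = (*count + 1) / 2; }
}).join().unwrap();
// Stop once a pass no longer shrinks the segment count.
if *num_segments.lock().unwrap() >= before { break; }
}
assert_eq!(*num_segments.lock().unwrap(), 1);
}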
#[cfg(test)]
mod tests {
use Index;
use schema::*;
use indexer::merge_policy::tests::MergeWheneverPossible;
#[test]
fn test_delete_during_merge() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.set_merge_policy(box MergeWheneverPossible);
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
}
assert!(index_writer.commit().is_ok());
}
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"));
}
assert!(index_writer.commit().is_ok());
}
{
index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"));
assert!(index_writer.commit().is_ok());
}
{
let term = Term::from_field_text(text_field, "a");
index_writer.delete_term(term);
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
assert_eq!(index.searcher().num_docs(), 302);
{
index_writer.wait_merging_threads()
.expect( "waiting for merging threads");
}
index.load_searchers().unwrap();
assert_eq!(index.searcher().segment_readers().len(), 1);
assert_eq!(index.searcher().num_docs(), 302);
}
}


@@ -1,10 +1,8 @@
use Result;
use DocId;
use std::io;
use schema::Schema;
use schema::Document;
use schema::Schema;
use schema::Term;
use core::SegmentInfo;
use core::Segment;
use core::SerializableSegment;
use postings::PostingsWriter;
@@ -19,6 +17,8 @@ use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
use indexer::segment_serializer::SegmentSerializer;
use datastruct::stacker::Heap;
use indexer::index_writer::MARGIN_IN_BYTES;
use super::operation::AddOperation;
/// A `SegmentWriter` is in charge of creating a segment index from
/// documents.
@@ -32,6 +32,7 @@ pub struct SegmentWriter<'a> {
segment_serializer: SegmentSerializer,
fast_field_writers: U32FastFieldsWriter,
fieldnorms_writer: U32FastFieldsWriter,
doc_opstamps: Vec<u64>,
}
@@ -70,7 +71,6 @@ fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box
impl<'a> SegmentWriter<'a> {
/// Creates a new `SegmentWriter`
///
/// The arguments are defined as follows
@@ -80,7 +80,9 @@ impl<'a> SegmentWriter<'a> {
/// the flushing behavior as a buffer limit
/// - segment: The segment being written
/// - schema
pub fn for_segment(heap: &'a Heap, mut segment: Segment, schema: &Schema) -> Result<SegmentWriter<'a>> {
pub fn for_segment(heap: &'a Heap,
mut segment: Segment,
schema: &Schema) -> Result<SegmentWriter<'a>> {
let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment));
let mut per_field_postings_writers: Vec<Box<PostingsWriter + 'a>> = Vec::new();
for field_entry in schema.fields() {
@@ -94,6 +96,7 @@ impl<'a> SegmentWriter<'a> {
fieldnorms_writer: create_fieldnorms_writer(schema),
segment_serializer: segment_serializer,
fast_field_writers: U32FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
})
}
@@ -101,18 +104,16 @@ impl<'a> SegmentWriter<'a> {
///
/// Finalize consumes the `SegmentWriter`, so that it cannot
/// be used afterwards.
pub fn finalize(mut self,) -> Result<()> {
let segment_info = self.segment_info();
pub fn finalize(mut self) -> Result<Vec<u64>> {
for per_field_postings_writer in &mut self.per_field_postings_writers {
per_field_postings_writer.close(self.heap);
}
try!(write(&self.per_field_postings_writers,
write(&self.per_field_postings_writers,
&self.fast_field_writers,
&self.fieldnorms_writer,
segment_info,
self.segment_serializer,
self.heap));
Ok(())
self.heap)?;
Ok(self.doc_opstamps)
}
/// Returns true iff the segment writer's buffer has reached capacity.
@@ -129,8 +130,10 @@ impl<'a> SegmentWriter<'a> {
/// Indexes a new document
///
/// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document(&mut self, doc: &Document, schema: &Schema) -> io::Result<()> {
pub fn add_document(&mut self, add_operation: &AddOperation, schema: &Schema) -> io::Result<()> {
let doc_id = self.max_doc;
let doc = &add_operation.document;
self.doc_opstamps.push(add_operation.opstamp);
for (field, field_values) in doc.get_sorted_field_values() {
let field_posting_writer: &mut Box<PostingsWriter> = &mut self.per_field_postings_writers[field.0 as usize];
let field_options = schema.get_field_entry(field);
@@ -165,7 +168,7 @@ impl<'a> SegmentWriter<'a> {
}
}
self.fieldnorms_writer.fill_val_up_to(doc_id);
self.fast_field_writers.add_document(doc);
self.fast_field_writers.add_document(&doc);
let stored_fieldvalues: Vec<&FieldValue> = doc
.field_values()
.iter()
@@ -177,14 +180,6 @@ impl<'a> SegmentWriter<'a> {
Ok(())
}
/// Creates the `SegmentInfo` that will be serialized along
/// with the index in JSON format.
fn segment_info(&self,) -> SegmentInfo {
SegmentInfo {
max_doc: self.max_doc
}
}
/// Max doc is
/// - the number of documents in the segment assuming there are no deletes
@@ -212,26 +207,25 @@ impl<'a> SegmentWriter<'a> {
fn write<'a>(per_field_postings_writers: &[Box<PostingsWriter + 'a>],
fast_field_writers: &U32FastFieldsWriter,
fieldnorms_writer: &U32FastFieldsWriter,
segment_info: SegmentInfo,
mut serializer: SegmentSerializer,
heap: &'a Heap,) -> Result<u32> {
for per_field_postings_writer in per_field_postings_writers.iter() {
heap: &'a Heap,) -> Result<()> {
for per_field_postings_writer in per_field_postings_writers {
try!(per_field_postings_writer.serialize(serializer.get_postings_serializer(), heap));
}
try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
try!(serializer.write_segment_info(&segment_info));
try!(serializer.close());
Ok(segment_info.max_doc)
Ok(())
}
impl<'a> SerializableSegment for SegmentWriter<'a> {
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
let max_doc = self.max_doc;
write(&self.per_field_postings_writers,
&self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_info(),
serializer,
self.heap)
self.heap)?;
Ok(max_doc)
}
}

src/indexer/stamper.rs Normal file

@@ -0,0 +1,17 @@
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
}
pub fn stamp(&self,) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst)
}
}
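A hypothetical unit test (not in the diff) pinning down the two properties the updater relies on: `stamp` returns the previous value, and clones share the same counter.

#[cfg(test)]
mod tests {
use super::Stamper;
#[test]
fn test_stamper_shared_across_clones() {
let stamper = Stamper::new(10u64);
let stamper_clone = stamper.clone();
assert_eq!(stamper.stamp(), 10u64);       // fetch_add returns the old value
assert_eq!(stamper_clone.stamp(), 11u64); // clones see the shared increment
}
}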


@@ -5,10 +5,11 @@
#![feature(box_syntax)]
#![feature(optin_builtin_traits)]
#![feature(conservative_impl_trait)]
#![feature(integer_atomics)]
#![cfg_attr(test, feature(test))]
#![cfg_attr(test, feature(step_by))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![feature(conservative_impl_trait)]
#![warn(missing_docs)]
@@ -27,6 +28,7 @@ extern crate lazy_static;
extern crate log;
#[macro_use]
extern crate version;
extern crate fst;
extern crate byteorder;
extern crate memmap;
@@ -44,13 +46,26 @@ extern crate combine;
extern crate itertools;
extern crate chan;
extern crate crossbeam;
extern crate bit_set;
extern crate futures;
extern crate futures_cpupool;
#[cfg(test)]
extern crate env_logger;
#[cfg(feature="simdcompression")]
extern crate libc;
#[cfg(windows)]
extern crate winapi;
#[cfg(test)] extern crate test;
#[cfg(test)] extern crate rand;
#[cfg(test)]
mod functional_test;
#[macro_use]
mod macros {
macro_rules! get(
@@ -69,6 +84,11 @@ mod macros {
);
}
pub use error::Error;
/// Tantivy result.
pub type Result<T> = std::result::Result<T, Error>;
mod core;
mod compression;
mod fastfield;
@@ -76,13 +96,11 @@ mod store;
mod indexer;
mod common;
mod error;
pub use error::{Result, Error};
mod analyzer;
mod datastruct;
/// Query module
pub mod query;
/// Directory module
@@ -96,13 +114,9 @@ pub mod schema;
pub use directory::Directory;
pub use core::searcher::Searcher;
pub use core::Segment;
pub use core::Index;
pub use core::{Index, Segment, SegmentId, SegmentMeta, Searcher};
pub use indexer::IndexWriter;
pub use schema::Term;
pub use schema::Document;
pub use schema::{Term, Document};
pub use core::SegmentReader;
pub use self::common::TimerTree;
@@ -111,8 +125,20 @@ pub use postings::DocSet;
pub use postings::Postings;
pub use postings::SegmentPostingsOption;
pub use core::TermIterator;
/// Expose the current version of tantivy, as well as
/// whether it was compiled with simd compression.
pub fn version() -> &'static str {
if cfg!(feature="simdcompression") {
concat!(version!(), "-simd")
}
else {
concat!(version!(), "-nosimd")
}
}
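The `cfg!` check is resolved at compile time, so the suffix is fixed per build. A one-line usage sketch (the printed values are assumptions based on the two branches above):

println!("tantivy {}", tantivy::version()); // e.g. "0.3.1-simd" or "0.3.1-nosimd"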
/// Tantivy makes it possible to customize when
/// the indexer should merge its segments.
pub mod merge_policy {
@@ -170,8 +196,10 @@ mod tests {
use Index;
use core::SegmentReader;
use query::BooleanQuery;
use postings::SegmentPostingsOption;
use schema::*;
use DocSet;
use IndexWriter;
use Postings;
#[test]
@@ -185,15 +213,15 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
assert!(index_writer.commit().is_ok());
}
@@ -207,26 +235,27 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
index_writer.add_document(doc!(text_field=>"a b c")).unwrap();
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit().unwrap();
}
{
{
let doc = doc!(text_field=>"a");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a a");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
{
let doc = doc!(text_field=>"c");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
index_writer.commit().unwrap();
}
{
index.load_searchers().unwrap();
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, "a");
assert_eq!(searcher.doc_freq(&term_a), 3);
@@ -249,20 +278,20 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let doc = doc!();
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
{
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap();
@@ -272,6 +301,198 @@ mod tests {
}
}
#[test]
fn test_delete_postings1() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{ // 0
let doc = doc!(text_field=>"a b");
index_writer.add_document(doc);
}
{ // 1
let doc = doc!(text_field=>" a c");
index_writer.add_document(doc);
}
{ // 2
let doc = doc!(text_field=>" b c");
index_writer.add_document(doc);
}
{ // 3
let doc = doc!(text_field=>" b d");
index_writer.add_document(doc);
}
{
index_writer.delete_term(Term::from_field_text(text_field, "c"));
}
{
index_writer.delete_term(Term::from_field_text(text_field, "a"));
}
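// Both deletes are stamped before docs 4 and 5 are added, so by opstamp
// ordering they can only affect docs 0-3.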
{ // 4
let doc = doc!(text_field=>" b c");
index_writer.add_document(doc);
}
{ // 5
let doc = doc!(text_field=>" a");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
{
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none());
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 5);
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
assert_eq!(postings.doc(), 4);
assert!(!postings.advance());
}
}
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{ // 0
let doc = doc!(text_field=>"a b");
index_writer.add_document(doc);
}
{ // 1
index_writer.delete_term(Term::from_field_text(text_field, "c"));
}
index_writer.rollback().unwrap();
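// After the rollback, neither the added document nor the pending delete
// is applied: the assertions below match the previous commit exactly.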
}
{
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none());
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 5);
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
assert_eq!(postings.doc(), 4);
assert!(!postings.advance());
}
}
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"a b");
index_writer.add_document(doc);
}
{
index_writer.delete_term(Term::from_field_text(text_field, "c"));
}
index_writer = index_writer.rollback().unwrap();
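// rollback() hands back a usable writer; the delete and commit below are
// applied on top of the previously committed state.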
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.commit().unwrap();
}
{
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none());
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap();
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
assert_eq!(postings.doc(), 4);
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "c")).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 4);
assert!(!postings.advance());
}
}
}
#[test]
fn test_indexed_u32() {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u32_field("text", U32_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(
doc!(field=>1)
);
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_u32(field, 1u32);
let mut postings = searcher.segment_reader(0).read_postings(&term, SegmentPostingsOption::NoFreq).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert!(!postings.advance());
}
#[test]
fn test_delete_postings2() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap();
let add_document = |index_writer: &mut IndexWriter, val: &'static str| {
let doc = doc!(text_field=>val);
index_writer.add_document(doc);
};
let remove_document = |index_writer: &mut IndexWriter, val: &'static str| {
let delterm = Term::from_field_text(text_field, val);
index_writer.delete_term(delterm);
};
add_document(&mut index_writer, "63");
add_document(&mut index_writer, "70");
add_document(&mut index_writer, "34");
add_document(&mut index_writer, "1");
add_document(&mut index_writer, "38");
add_document(&mut index_writer, "33");
add_document(&mut index_writer, "40");
add_document(&mut index_writer, "17");
remove_document(&mut index_writer, "38");
remove_document(&mut index_writer, "34");
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
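// 8 documents were added and the deletes for "38" and "34" remove one
// each, leaving 6.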
assert_eq!(searcher.num_docs(), 6);
}
#[test]
fn test_termfreq() {
let mut schema_builder = SchemaBuilder::default();
@@ -283,11 +504,12 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"af af af bc bc");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
{
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none());
@@ -311,19 +533,20 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"af af af b");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
{
index.load_searchers().unwrap();
let searcher = index.searcher();
let get_doc_ids = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
@@ -377,15 +600,15 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc).unwrap();
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}

src/postings/chained_postings.rs (deleted)

@@ -1,71 +0,0 @@
use DocId;
use postings::Postings;
use postings::OffsetPostings;
use postings::DocSet;
use postings::HasLen;
/// Creates a posting object that chains two postings
/// together.
///
/// When iterating over the chained postings,
/// it will consume all of the documents of the first postings,
/// and then iterate over the documents of the second postings.
///
/// The chained postings are used when merging segments.
pub struct ChainedPostings<'a> {
chained_postings: Vec<OffsetPostings<'a>>,
posting_id: usize,
len: usize,
}
impl<'a> From<Vec<OffsetPostings<'a>>> for ChainedPostings<'a> {
fn from(chained_postings: Vec<OffsetPostings<'a>>) -> ChainedPostings {
let len: usize = chained_postings
.iter()
.map(|segment_postings| segment_postings.len())
.sum();
ChainedPostings {
chained_postings: chained_postings,
posting_id: 0,
len: len,
}
}
}
impl<'a> DocSet for ChainedPostings<'a> {
fn advance(&mut self,) -> bool {
if self.posting_id == self.chained_postings.len() {
return false;
}
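// Advance the current postings; once exhausted, step through the chain
// until one yields a document or all postings run out.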
while !self.chained_postings[self.posting_id].advance() {
self.posting_id += 1;
if self.posting_id == self.chained_postings.len() {
return false;
}
}
true
}
fn doc(&self,) -> DocId {
self.chained_postings[self.posting_id].doc()
}
}
impl<'a> HasLen for ChainedPostings<'a> {
fn len(&self,) -> usize {
self.len
}
}
impl<'a> Postings for ChainedPostings<'a> {
fn term_freq(&self,) -> u32 {
self.chained_postings[self.posting_id].term_freq()
}
fn positions(&self) -> &[u32] {
self.chained_postings[self.posting_id].positions()
}
}

src/postings/docset.rs

@@ -67,6 +67,7 @@ pub trait DocSet {
}
}
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
fn advance(&mut self) -> bool {
let unboxed: &mut TDocSet = self.borrow_mut();

src/postings/intersection.rs

@@ -2,8 +2,6 @@ use postings::DocSet;
use postings::SkipResult;
use DocId;
// TODO Find a way to specialize `IntersectionDocSet`
/// Creates a `DocSet` that iterates over the intersection of two `DocSet`s.
pub struct IntersectionDocSet<TDocSet: DocSet> {
docsets: Vec<TDocSet>,

Some files were not shown because too many files have changed in this diff.